From 747ce36915b3cd1f3d868d778db42ee811c97a04 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Mon, 18 Mar 2013 22:58:57 +0100 Subject: [PATCH] Specialise the default codec to reuse Lucene41 files in the common case. Closes #2799 --- .../TransportBroadcastOperationAction.java | 2 +- .../PerFieldMappingPostingFormatCodec.java | 8 +- .../BloomFilterPostingsFormat.java | 38 +++-- .../ElasticSearch090PostingsFormat.java | 84 +++++++++++ .../codec/postingsformat/PostingFormats.java | 17 +-- .../PreBuiltPostingsFormatProvider.java | 3 + .../index/mapper/internal/UidFieldMapper.java | 2 +- .../org.apache.lucene.codecs.PostingsFormat | 1 + .../test/unit/index/codec/CodecTests.java | 6 +- .../DefaultPostingsFormatTests.java | 130 ++++++++++++++++++ 10 files changed, 266 insertions(+), 25 deletions(-) create mode 100644 src/main/java/org/elasticsearch/index/codec/postingsformat/ElasticSearch090PostingsFormat.java create mode 100644 src/test/java/org/elasticsearch/test/unit/index/codec/postingformat/DefaultPostingsFormatTests.java diff --git a/src/main/java/org/elasticsearch/action/support/broadcast/TransportBroadcastOperationAction.java b/src/main/java/org/elasticsearch/action/support/broadcast/TransportBroadcastOperationAction.java index 8aff54f80e4..faf33a83cba 100644 --- a/src/main/java/org/elasticsearch/action/support/broadcast/TransportBroadcastOperationAction.java +++ b/src/main/java/org/elasticsearch/action/support/broadcast/TransportBroadcastOperationAction.java @@ -263,7 +263,7 @@ public abstract class TransportBroadcastOperationAction bloomsByFieldName = new HashMap(); + + // for internal use only + FieldsProducer getDelegate() { + return delegateFieldsProducer; + } public BloomFilteredFieldsProducer(SegmentReadState state) throws IOException { @@ -119,15 +125,18 @@ public final class BloomFilterPostingsFormat extends PostingsFormat { // Load the delegate postings format PostingsFormat delegatePostingsFormat = PostingsFormat.forName(bloomIn .readString()); - + this.delegateFieldsProducer = delegatePostingsFormat .fieldsProducer(state); int numBlooms = bloomIn.readInt(); - for (int i = 0; i < numBlooms; i++) { - int fieldNum = bloomIn.readInt(); - BloomFilter bloom = BloomFilter.deserialize(bloomIn); - FieldInfo fieldInfo = state.fieldInfos.fieldInfo(fieldNum); - bloomsByFieldName.put(fieldInfo.name, bloom); + if (state.context.context != IOContext.Context.MERGE) { + // if we merge we don't need to load the bloom filters + for (int i = 0; i < numBlooms; i++) { + int fieldNum = bloomIn.readInt(); + BloomFilter bloom = BloomFilter.deserialize(bloomIn); + FieldInfo fieldInfo = state.fieldInfos.fieldInfo(fieldNum); + bloomsByFieldName.put(fieldInfo.name, bloom); + } } IOUtils.close(bloomIn); success = true; @@ -332,7 +341,7 @@ public final class BloomFilterPostingsFormat extends PostingsFormat { } - class BloomFilteredFieldsConsumer extends FieldsConsumer { + final class BloomFilteredFieldsConsumer extends FieldsConsumer { private FieldsConsumer delegateFieldsConsumer; private Map bloomFilters = new HashMap(); private SegmentWriteState state; @@ -345,6 +354,11 @@ public final class BloomFilterPostingsFormat extends PostingsFormat { // this.delegatePostingsFormat=delegatePostingsFormat; this.state = state; } + + // for internal use only + FieldsConsumer getDelegate() { + return delegateFieldsConsumer; + } @Override public TermsConsumer addField(FieldInfo field) throws IOException { @@ -449,4 +463,8 @@ public final class BloomFilterPostingsFormat extends PostingsFormat { } + public PostingsFormat getDelegate() { + return this.delegatePostingsFormat; + } + } diff --git a/src/main/java/org/elasticsearch/index/codec/postingsformat/ElasticSearch090PostingsFormat.java b/src/main/java/org/elasticsearch/index/codec/postingsformat/ElasticSearch090PostingsFormat.java new file mode 100644 index 00000000000..a5b5f0b1d92 --- /dev/null +++ b/src/main/java/org/elasticsearch/index/codec/postingsformat/ElasticSearch090PostingsFormat.java @@ -0,0 +1,84 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.elasticsearch.index.codec.postingsformat; + +import java.io.IOException; +import java.util.Iterator; + +import org.apache.lucene.codecs.FieldsConsumer; +import org.apache.lucene.codecs.FieldsProducer; +import org.apache.lucene.codecs.PostingsFormat; +import org.apache.lucene.codecs.TermsConsumer; +import org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.Terms; +import org.elasticsearch.index.codec.postingsformat.BloomFilterPostingsFormat.BloomFilteredFieldsConsumer; +import org.elasticsearch.index.codec.postingsformat.BloomFilterPostingsFormat.BloomFilteredFieldsProducer; +import org.elasticsearch.index.mapper.internal.UidFieldMapper; + +/** + * This is the default postings format for ElasticSearch that special cases + * the _uid field to use a bloom filter while all other fields + * will use a {@link Lucene41PostingsFormat}. This format will reuse the underlying + * {@link Lucene41PostingsFormat} and it's files also for the _uid saving up to + * 5 files per segment in the default case. + */ +public final class ElasticSearch090PostingsFormat extends PostingsFormat { + private final BloomFilterPostingsFormat bloomPostings; + + public ElasticSearch090PostingsFormat() { + super("es090"); + bloomPostings = new BloomFilterPostingsFormat(new Lucene41PostingsFormat(), BloomFilter.Factory.DEFAULT); + } + + public PostingsFormat getDefaultWrapped() { + return bloomPostings.getDelegate(); + } + + @Override + public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { + final BloomFilteredFieldsConsumer fieldsConsumer = bloomPostings.fieldsConsumer(state); + return new FieldsConsumer() { + + @Override + public void close() throws IOException { + fieldsConsumer.close(); + } + + @Override + public TermsConsumer addField(FieldInfo field) throws IOException { + if (UidFieldMapper.NAME.equals(field.name)) { + // only go through bloom for the UID field + return fieldsConsumer.addField(field); + } + return fieldsConsumer.getDelegate().addField(field); + } + }; + } + + @Override + public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { + // we can just return the delegate here since we didn't record bloom filters for + // the other fields. + return bloomPostings.fieldsProducer(state); + } + +} diff --git a/src/main/java/org/elasticsearch/index/codec/postingsformat/PostingFormats.java b/src/main/java/org/elasticsearch/index/codec/postingsformat/PostingFormats.java index 957f74112a1..c7d14cc0d54 100644 --- a/src/main/java/org/elasticsearch/index/codec/postingsformat/PostingFormats.java +++ b/src/main/java/org/elasticsearch/index/codec/postingsformat/PostingFormats.java @@ -23,10 +23,6 @@ import com.google.common.collect.ImmutableCollection; import com.google.common.collect.ImmutableMap; import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.bloom.BloomFilteringPostingsFormat; -import org.apache.lucene.codecs.lucene41.Lucene41PostingsFormat; -import org.apache.lucene.codecs.memory.DirectPostingsFormat; -import org.apache.lucene.codecs.memory.MemoryPostingsFormat; -import org.apache.lucene.codecs.pulsing.Pulsing41PostingsFormat; import org.elasticsearch.common.collect.MapBuilder; /** @@ -69,14 +65,15 @@ public class PostingFormats { for (String luceneName : PostingsFormat.availablePostingsFormats()) { buildInPostingFormatsX.put(luceneName, new PreBuiltPostingsFormatProvider.Factory(PostingsFormat.forName(luceneName))); } - buildInPostingFormatsX.put("direct", new PreBuiltPostingsFormatProvider.Factory("direct", new DirectPostingsFormat())); - buildInPostingFormatsX.put("memory", new PreBuiltPostingsFormatProvider.Factory("memory", new MemoryPostingsFormat())); + final ElasticSearch090PostingsFormat defaultFormat = new ElasticSearch090PostingsFormat(); + buildInPostingFormatsX.put("direct", new PreBuiltPostingsFormatProvider.Factory("direct", PostingsFormat.forName("Direct"))); + buildInPostingFormatsX.put("memory", new PreBuiltPostingsFormatProvider.Factory("memory", PostingsFormat.forName("Memory"))); // LUCENE UPGRADE: Need to change this to the relevant ones on a lucene upgrade - buildInPostingFormatsX.put("pulsing", new PreBuiltPostingsFormatProvider.Factory("pulsing", new Pulsing41PostingsFormat())); - buildInPostingFormatsX.put("default", new PreBuiltPostingsFormatProvider.Factory("default", new Lucene41PostingsFormat())); + buildInPostingFormatsX.put("pulsing", new PreBuiltPostingsFormatProvider.Factory("pulsing", PostingsFormat.forName("Pulsing41"))); + buildInPostingFormatsX.put("default", new PreBuiltPostingsFormatProvider.Factory("default", defaultFormat)); - buildInPostingFormatsX.put("bloom_pulsing", new PreBuiltPostingsFormatProvider.Factory("bloom_pulsing", wrapInBloom(new Pulsing41PostingsFormat()))); - buildInPostingFormatsX.put("bloom_default", new PreBuiltPostingsFormatProvider.Factory("bloom_default", wrapInBloom(new Lucene41PostingsFormat()))); + buildInPostingFormatsX.put("bloom_pulsing", new PreBuiltPostingsFormatProvider.Factory("bloom_pulsing", wrapInBloom(PostingsFormat.forName("Pulsing41")))); + buildInPostingFormatsX.put("bloom_default", new PreBuiltPostingsFormatProvider.Factory("bloom_default", wrapInBloom(PostingsFormat.forName("Lucene41")))); builtInPostingFormats = buildInPostingFormatsX.immutableMap(); } diff --git a/src/main/java/org/elasticsearch/index/codec/postingsformat/PreBuiltPostingsFormatProvider.java b/src/main/java/org/elasticsearch/index/codec/postingsformat/PreBuiltPostingsFormatProvider.java index df195047e64..450b0a37e81 100644 --- a/src/main/java/org/elasticsearch/index/codec/postingsformat/PreBuiltPostingsFormatProvider.java +++ b/src/main/java/org/elasticsearch/index/codec/postingsformat/PreBuiltPostingsFormatProvider.java @@ -60,6 +60,9 @@ public class PreBuiltPostingsFormatProvider implements PostingsFormatProvider { } public PreBuiltPostingsFormatProvider(String name, PostingsFormat postingsFormat) { + if (postingsFormat == null) { + throw new IllegalArgumentException("PostingsFormat must not be null"); + } this.name = name; this.postingsFormat = postingsFormat; } diff --git a/src/main/java/org/elasticsearch/index/mapper/internal/UidFieldMapper.java b/src/main/java/org/elasticsearch/index/mapper/internal/UidFieldMapper.java index 711d42dab4e..60611104568 100644 --- a/src/main/java/org/elasticsearch/index/mapper/internal/UidFieldMapper.java +++ b/src/main/java/org/elasticsearch/index/mapper/internal/UidFieldMapper.java @@ -134,7 +134,7 @@ public class UidFieldMapper extends AbstractFieldMapper implements Internal @Override protected String defaultPostingFormat() { - return "bloom_default"; + return "default"; } @Override diff --git a/src/main/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat b/src/main/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat index 3147f905bf8..68ac25d0065 100644 --- a/src/main/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat +++ b/src/main/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat @@ -1 +1,2 @@ org.elasticsearch.index.codec.postingsformat.BloomFilterPostingsFormat +org.elasticsearch.index.codec.postingsformat.ElasticSearch090PostingsFormat diff --git a/src/test/java/org/elasticsearch/test/unit/index/codec/CodecTests.java b/src/test/java/org/elasticsearch/test/unit/index/codec/CodecTests.java index 133f6b7b8bd..7d790241a7b 100644 --- a/src/test/java/org/elasticsearch/test/unit/index/codec/CodecTests.java +++ b/src/test/java/org/elasticsearch/test/unit/index/codec/CodecTests.java @@ -69,8 +69,10 @@ public class CodecTests { public void testResolveDefaultPostingFormats() throws Exception { PostingsFormatService postingsFormatService = createCodecService().postingsFormatService(); assertThat(postingsFormatService.get("default"), instanceOf(PreBuiltPostingsFormatProvider.class)); + assertThat(postingsFormatService.get("default").get(), instanceOf(ElasticSearch090PostingsFormat.class)); + // Should fail when upgrading Lucene with codec changes - assertThat(postingsFormatService.get("default").get(), instanceOf(((PerFieldPostingsFormat) Codec.getDefault().postingsFormat()).getPostingsFormatForField(null).getClass())); + assertThat(((ElasticSearch090PostingsFormat)postingsFormatService.get("default").get()).getDefaultWrapped(), instanceOf(((PerFieldPostingsFormat) Codec.getDefault().postingsFormat()).getPostingsFormatForField(null).getClass())); assertThat(postingsFormatService.get("Lucene41"), instanceOf(PreBuiltPostingsFormatProvider.class)); // Should fail when upgrading Lucene with codec changes assertThat(postingsFormatService.get("Lucene41").get(), instanceOf(((PerFieldPostingsFormat) Codec.getDefault().postingsFormat()).getPostingsFormatForField(null).getClass())); @@ -126,7 +128,7 @@ public class CodecTests { CodecService codecService = createCodecService(indexSettings); DocumentMapper documentMapper = codecService.mapperService().documentMapperParser().parse(mapping); assertThat(documentMapper.mappers().name("field1").mapper().postingsFormatProvider(), instanceOf(PreBuiltPostingsFormatProvider.class)); - assertThat(documentMapper.mappers().name("field1").mapper().postingsFormatProvider().get(), instanceOf(Lucene41PostingsFormat.class)); + assertThat(documentMapper.mappers().name("field1").mapper().postingsFormatProvider().get(), instanceOf(ElasticSearch090PostingsFormat.class)); assertThat(documentMapper.mappers().name("field2").mapper().postingsFormatProvider(), instanceOf(DefaultPostingsFormatProvider.class)); DefaultPostingsFormatProvider provider = (DefaultPostingsFormatProvider) documentMapper.mappers().name("field2").mapper().postingsFormatProvider(); diff --git a/src/test/java/org/elasticsearch/test/unit/index/codec/postingformat/DefaultPostingsFormatTests.java b/src/test/java/org/elasticsearch/test/unit/index/codec/postingformat/DefaultPostingsFormatTests.java new file mode 100644 index 00000000000..f430e9d21f7 --- /dev/null +++ b/src/test/java/org/elasticsearch/test/unit/index/codec/postingformat/DefaultPostingsFormatTests.java @@ -0,0 +1,130 @@ +/* + * Licensed to ElasticSearch and Shay Banon under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. ElasticSearch licenses this + * file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.test.unit.index.codec.postingformat; + +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.instanceOf; +import static org.hamcrest.Matchers.not; + +import java.io.IOException; +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.apache.lucene.analysis.core.WhitespaceAnalyzer; +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.PostingsFormat; +import org.apache.lucene.codecs.lucene42.Lucene42Codec; +import org.apache.lucene.document.Field.Store; +import org.apache.lucene.document.TextField; +import org.apache.lucene.index.AtomicReader; +import org.apache.lucene.index.AtomicReaderContext; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.RAMDirectory; +import org.elasticsearch.common.lucene.Lucene; +import org.elasticsearch.index.codec.postingsformat.BloomFilterPostingsFormat; +import org.elasticsearch.index.codec.postingsformat.ElasticSearch090PostingsFormat; +import org.elasticsearch.index.mapper.internal.UidFieldMapper; +import org.testng.annotations.Test; + +/** + * Simple smoke test for {@link ElasticSearch090PostingsFormat} + */ +public class DefaultPostingsFormatTests { + + private final class TestCodec extends Lucene42Codec { + + @Override + public PostingsFormat getPostingsFormatForField(String field) { + return new ElasticSearch090PostingsFormat(); + } + } + + @Test + public void testUseDefault() throws IOException { + + Codec codec = new TestCodec(); + Directory d = new RAMDirectory(); + IndexWriterConfig config = new IndexWriterConfig(Lucene.VERSION, new WhitespaceAnalyzer(Lucene.VERSION)); + config.setCodec(codec); + IndexWriter writer = new IndexWriter(d, config); + writer.addDocument(Arrays.asList(new TextField("foo", "bar", Store.YES), new TextField(UidFieldMapper.NAME, "1234", Store.YES))); + writer.commit(); + DirectoryReader reader = DirectoryReader.open(writer, false); + List leaves = reader.leaves(); + assertThat(leaves.size(), equalTo(1)); + AtomicReader ar = leaves.get(0).reader(); + Terms terms = ar.terms("foo"); + Terms uidTerms = ar.terms(UidFieldMapper.NAME); + + assertThat(terms.size(), equalTo(1l)); + assertThat(terms, not(instanceOf(BloomFilterPostingsFormat.BloomFilteredFieldsProducer.BloomFilteredTerms.class))); + assertThat(uidTerms, instanceOf(BloomFilterPostingsFormat.BloomFilteredFieldsProducer.BloomFilteredTerms.class)); + + reader.close(); + writer.close(); + d.close(); + } + + @Test + public void testNoUIDField() throws IOException { + + Codec codec = new TestCodec(); + Directory d = new RAMDirectory(); + IndexWriterConfig config = new IndexWriterConfig(Lucene.VERSION, new WhitespaceAnalyzer(Lucene.VERSION)); + config.setCodec(codec); + IndexWriter writer = new IndexWriter(d, config); + for (int i = 0; i < 100; i++) { + writer.addDocument(Arrays.asList(new TextField("foo", "foo bar foo bar", Store.YES), new TextField("some_other_field", "1234", Store.YES))); + } + writer.forceMerge(1); + writer.commit(); + + DirectoryReader reader = DirectoryReader.open(writer, false); + List leaves = reader.leaves(); + assertThat(leaves.size(), equalTo(1)); + AtomicReader ar = leaves.get(0).reader(); + Terms terms = ar.terms("foo"); + Terms some_other_field = ar.terms("some_other_field"); + + assertThat(terms.size(), equalTo(2l)); + assertThat(terms, not(instanceOf(BloomFilterPostingsFormat.BloomFilteredFieldsProducer.BloomFilteredTerms.class))); + assertThat(some_other_field, not(instanceOf(BloomFilterPostingsFormat.BloomFilteredFieldsProducer.BloomFilteredTerms.class))); + TermsEnum iterator = terms.iterator(null); + Set expected = new HashSet(); + expected.add("foo"); + expected.add("bar"); + while(iterator.next() != null) { + expected.remove(iterator.term().utf8ToString()); + } + assertThat(expected.size(), equalTo(0)); + reader.close(); + writer.close(); + d.close(); + } + +}