266 lines
16 KiB
Java
Raw Normal View History

2010-02-08 15:30:06 +02:00
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
2010-02-08 15:30:06 +02:00
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.cluster;
import org.elasticsearch.cluster.action.index.MappingUpdatedAction;
import org.elasticsearch.cluster.action.index.NodeMappingRefreshAction;
2010-02-08 15:30:06 +02:00
import org.elasticsearch.cluster.action.shard.ShardStateAction;
import org.elasticsearch.cluster.metadata.IndexGraveyard;
import org.elasticsearch.cluster.metadata.IndexNameExpressionResolver;
import org.elasticsearch.cluster.metadata.MetaData;
import org.elasticsearch.cluster.metadata.MetaDataCreateIndexService;
import org.elasticsearch.cluster.metadata.MetaDataDeleteIndexService;
import org.elasticsearch.cluster.metadata.MetaDataIndexAliasesService;
import org.elasticsearch.cluster.metadata.MetaDataIndexStateService;
import org.elasticsearch.cluster.metadata.MetaDataIndexTemplateService;
import org.elasticsearch.cluster.metadata.MetaDataMappingService;
import org.elasticsearch.cluster.metadata.MetaDataUpdateSettingsService;
import org.elasticsearch.cluster.metadata.RepositoriesMetaData;
import org.elasticsearch.cluster.routing.DelayedAllocationService;
2010-02-08 15:30:06 +02:00
import org.elasticsearch.cluster.routing.RoutingService;
import org.elasticsearch.cluster.routing.allocation.AllocationService;
import org.elasticsearch.cluster.routing.allocation.allocator.BalancedShardsAllocator;
import org.elasticsearch.cluster.routing.allocation.allocator.ShardsAllocator;
import org.elasticsearch.cluster.routing.allocation.decider.AllocationDecider;
import org.elasticsearch.cluster.routing.allocation.decider.AllocationDeciders;
import org.elasticsearch.cluster.routing.allocation.decider.AwarenessAllocationDecider;
import org.elasticsearch.cluster.routing.allocation.decider.ClusterRebalanceAllocationDecider;
import org.elasticsearch.cluster.routing.allocation.decider.ConcurrentRebalanceAllocationDecider;
import org.elasticsearch.cluster.routing.allocation.decider.DiskThresholdDecider;
import org.elasticsearch.cluster.routing.allocation.decider.EnableAllocationDecider;
import org.elasticsearch.cluster.routing.allocation.decider.FilterAllocationDecider;
import org.elasticsearch.cluster.routing.allocation.decider.MaxRetryAllocationDecider;
import org.elasticsearch.cluster.routing.allocation.decider.NodeVersionAllocationDecider;
import org.elasticsearch.cluster.routing.allocation.decider.RebalanceOnlyWhenActiveAllocationDecider;
import org.elasticsearch.cluster.routing.allocation.decider.ReplicaAfterPrimaryActiveAllocationDecider;
Add ability to split shards (#26931) This change adds a new `_split` API that allows to split indices into a new index with a power of two more shards that the source index. This API works alongside the `_shrink` API but doesn't require any shard relocation before indices can be split. The split operation is conceptually an inverse `_shrink` operation since we initialize the index with a _syntetic_ number of routing shards that are used for the consistent hashing at index time. Compared to indices created with earlier versions this might produce slightly different shard distributions but has no impact on the per-index backwards compatibility. For now, the user is required to prepare an index to be splittable by setting the `index.number_of_routing_shards` at index creation time. The setting allows the user to prepare the index to be splittable in factors of `index.number_of_routing_shards` ie. if the index is created with `index.number_of_routing_shards: 16` and `index.number_of_shards: 2` it can be split into `4, 8, 16` shards. This is an intermediate step until we can make this the default. This also allows us to safely backport this change to 6.x. The `_split` operation is implemented internally as a DeleteByQuery on the lucene level that is executed while the primary shards execute their initial recovery. Subsequent merges that are triggered due to this operation will not be executed immediately. All merges will be deferred unti the shards are started and will then be throttled accordingly. This change is intended for the 6.1 feature release but will not support pre-6.1 indices to be split unless these indices have been shrunk before. In that case these indices can be split backwards into their original number of shards.
2017-11-06 10:37:55 +00:00
import org.elasticsearch.cluster.routing.allocation.decider.ResizeAllocationDecider;
import org.elasticsearch.cluster.routing.allocation.decider.SameShardAllocationDecider;
import org.elasticsearch.cluster.routing.allocation.decider.ShardsLimitAllocationDecider;
import org.elasticsearch.cluster.routing.allocation.decider.SnapshotInProgressAllocationDecider;
import org.elasticsearch.cluster.routing.allocation.decider.RestoreInProgressAllocationDecider;
import org.elasticsearch.cluster.routing.allocation.decider.ThrottlingAllocationDecider;
import org.elasticsearch.cluster.service.ClusterService;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.inject.AbstractModule;
import org.elasticsearch.common.io.stream.NamedWriteable;
import org.elasticsearch.common.io.stream.NamedWriteableRegistry.Entry;
import org.elasticsearch.common.io.stream.Writeable.Reader;
import org.elasticsearch.common.settings.ClusterSettings;
import org.elasticsearch.common.settings.Setting;
import org.elasticsearch.common.settings.Setting.Property;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.xcontent.NamedXContentRegistry;
import org.elasticsearch.gateway.GatewayAllocator;
import org.elasticsearch.ingest.IngestMetadata;
import org.elasticsearch.plugins.ClusterPlugin;
import org.elasticsearch.script.ScriptMetaData;
import org.elasticsearch.tasks.TaskResultsService;
2010-02-08 15:30:06 +02:00
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.function.Function;
import java.util.function.Supplier;
2010-02-08 15:30:06 +02:00
/**
* Configures classes and services that affect the entire cluster.
2010-02-08 15:30:06 +02:00
*/
public class ClusterModule extends AbstractModule {
2010-02-08 15:30:06 +02:00
public static final String BALANCED_ALLOCATOR = "balanced"; // default
public static final Setting<String> SHARDS_ALLOCATOR_TYPE_SETTING =
new Setting<>("cluster.routing.allocation.type", BALANCED_ALLOCATOR, Function.identity(), Property.NodeScope);
2010-02-08 15:30:06 +02:00
private final ClusterService clusterService;
private final IndexNameExpressionResolver indexNameExpressionResolver;
private final AllocationDeciders allocationDeciders;
private final AllocationService allocationService;
// pkg private for tests
final Collection<AllocationDecider> deciderList;
final ShardsAllocator shardsAllocator;
public ClusterModule(Settings settings, ClusterService clusterService, List<ClusterPlugin> clusterPlugins,
ClusterInfoService clusterInfoService) {
this.deciderList = createAllocationDeciders(settings, clusterService.getClusterSettings(), clusterPlugins);
this.allocationDeciders = new AllocationDeciders(settings, deciderList);
this.shardsAllocator = createShardsAllocator(settings, clusterService.getClusterSettings(), clusterPlugins);
this.clusterService = clusterService;
this.indexNameExpressionResolver = new IndexNameExpressionResolver(settings);
this.allocationService = new AllocationService(settings, allocationDeciders, shardsAllocator, clusterInfoService);
2010-02-08 15:30:06 +02:00
}
public static Map<String, Supplier<ClusterState.Custom>> getClusterStateCustomSuppliers(List<ClusterPlugin> clusterPlugins) {
final Map<String, Supplier<ClusterState.Custom>> customSupplier = new HashMap<>();
customSupplier.put(SnapshotDeletionsInProgress.TYPE, SnapshotDeletionsInProgress::new);
customSupplier.put(RestoreInProgress.TYPE, RestoreInProgress::new);
customSupplier.put(SnapshotsInProgress.TYPE, SnapshotsInProgress::new);
for (ClusterPlugin plugin : clusterPlugins) {
Map<String, Supplier<ClusterState.Custom>> initialCustomSupplier = plugin.getInitialClusterStateCustomSupplier();
for (String key : initialCustomSupplier.keySet()) {
if (customSupplier.containsKey(key)) {
throw new IllegalStateException("custom supplier key [" + key + "] is registered more than once");
}
}
customSupplier.putAll(initialCustomSupplier);
}
return Collections.unmodifiableMap(customSupplier);
}
public static List<Entry> getNamedWriteables() {
List<Entry> entries = new ArrayList<>();
// Cluster State
registerClusterCustom(entries, SnapshotsInProgress.TYPE, SnapshotsInProgress::new, SnapshotsInProgress::readDiffFrom);
registerClusterCustom(entries, RestoreInProgress.TYPE, RestoreInProgress::new, RestoreInProgress::readDiffFrom);
registerClusterCustom(entries, SnapshotDeletionsInProgress.TYPE, SnapshotDeletionsInProgress::new,
SnapshotDeletionsInProgress::readDiffFrom);
// Metadata
registerMetaDataCustom(entries, RepositoriesMetaData.TYPE, RepositoriesMetaData::new, RepositoriesMetaData::readDiffFrom);
registerMetaDataCustom(entries, IngestMetadata.TYPE, IngestMetadata::new, IngestMetadata::readDiffFrom);
registerMetaDataCustom(entries, ScriptMetaData.TYPE, ScriptMetaData::new, ScriptMetaData::readDiffFrom);
registerMetaDataCustom(entries, IndexGraveyard.TYPE, IndexGraveyard::new, IndexGraveyard::readDiffFrom);
return entries;
}
public static List<NamedXContentRegistry.Entry> getNamedXWriteables() {
List<NamedXContentRegistry.Entry> entries = new ArrayList<>();
// Metadata
entries.add(new NamedXContentRegistry.Entry(MetaData.Custom.class, new ParseField(RepositoriesMetaData.TYPE),
RepositoriesMetaData::fromXContent));
entries.add(new NamedXContentRegistry.Entry(MetaData.Custom.class, new ParseField(IngestMetadata.TYPE),
IngestMetadata::fromXContent));
entries.add(new NamedXContentRegistry.Entry(MetaData.Custom.class, new ParseField(ScriptMetaData.TYPE),
ScriptMetaData::fromXContent));
entries.add(new NamedXContentRegistry.Entry(MetaData.Custom.class, new ParseField(IndexGraveyard.TYPE),
IndexGraveyard::fromXContent));
return entries;
}
private static <T extends ClusterState.Custom> void registerClusterCustom(List<Entry> entries, String name, Reader<? extends T> reader,
Reader<NamedDiff> diffReader) {
registerCustom(entries, ClusterState.Custom.class, name, reader, diffReader);
}
private static <T extends MetaData.Custom> void registerMetaDataCustom(List<Entry> entries, String name, Reader<? extends T> reader,
Reader<NamedDiff> diffReader) {
registerCustom(entries, MetaData.Custom.class, name, reader, diffReader);
}
private static <T extends NamedWriteable> void registerCustom(List<Entry> entries, Class<T> category, String name,
Reader<? extends T> reader, Reader<NamedDiff> diffReader) {
entries.add(new Entry(category, name, reader));
entries.add(new Entry(NamedDiff.class, name, diffReader));
}
public IndexNameExpressionResolver getIndexNameExpressionResolver() {
return indexNameExpressionResolver;
}
// TODO: this is public so allocation benchmark can access the default deciders...can we do that in another way?
/** Return a new {@link AllocationDecider} instance with builtin deciders as well as those from plugins. */
public static Collection<AllocationDecider> createAllocationDeciders(Settings settings, ClusterSettings clusterSettings,
List<ClusterPlugin> clusterPlugins) {
// collect deciders by class so that we can detect duplicates
Map<Class, AllocationDecider> deciders = new LinkedHashMap<>();
addAllocationDecider(deciders, new MaxRetryAllocationDecider(settings));
Add ability to split shards (#26931) This change adds a new `_split` API that allows to split indices into a new index with a power of two more shards that the source index. This API works alongside the `_shrink` API but doesn't require any shard relocation before indices can be split. The split operation is conceptually an inverse `_shrink` operation since we initialize the index with a _syntetic_ number of routing shards that are used for the consistent hashing at index time. Compared to indices created with earlier versions this might produce slightly different shard distributions but has no impact on the per-index backwards compatibility. For now, the user is required to prepare an index to be splittable by setting the `index.number_of_routing_shards` at index creation time. The setting allows the user to prepare the index to be splittable in factors of `index.number_of_routing_shards` ie. if the index is created with `index.number_of_routing_shards: 16` and `index.number_of_shards: 2` it can be split into `4, 8, 16` shards. This is an intermediate step until we can make this the default. This also allows us to safely backport this change to 6.x. The `_split` operation is implemented internally as a DeleteByQuery on the lucene level that is executed while the primary shards execute their initial recovery. Subsequent merges that are triggered due to this operation will not be executed immediately. All merges will be deferred unti the shards are started and will then be throttled accordingly. This change is intended for the 6.1 feature release but will not support pre-6.1 indices to be split unless these indices have been shrunk before. In that case these indices can be split backwards into their original number of shards.
2017-11-06 10:37:55 +00:00
addAllocationDecider(deciders, new ResizeAllocationDecider(settings));
addAllocationDecider(deciders, new ReplicaAfterPrimaryActiveAllocationDecider(settings));
addAllocationDecider(deciders, new RebalanceOnlyWhenActiveAllocationDecider(settings));
addAllocationDecider(deciders, new ClusterRebalanceAllocationDecider(settings, clusterSettings));
addAllocationDecider(deciders, new ConcurrentRebalanceAllocationDecider(settings, clusterSettings));
addAllocationDecider(deciders, new EnableAllocationDecider(settings, clusterSettings));
addAllocationDecider(deciders, new NodeVersionAllocationDecider(settings));
addAllocationDecider(deciders, new SnapshotInProgressAllocationDecider(settings));
addAllocationDecider(deciders, new RestoreInProgressAllocationDecider(settings));
addAllocationDecider(deciders, new FilterAllocationDecider(settings, clusterSettings));
addAllocationDecider(deciders, new SameShardAllocationDecider(settings, clusterSettings));
addAllocationDecider(deciders, new DiskThresholdDecider(settings, clusterSettings));
addAllocationDecider(deciders, new ThrottlingAllocationDecider(settings, clusterSettings));
addAllocationDecider(deciders, new ShardsLimitAllocationDecider(settings, clusterSettings));
addAllocationDecider(deciders, new AwarenessAllocationDecider(settings, clusterSettings));
clusterPlugins.stream()
.flatMap(p -> p.createAllocationDeciders(settings, clusterSettings).stream())
.forEach(d -> addAllocationDecider(deciders, d));
return deciders.values();
}
/** Add the given allocation decider to the given deciders collection, erroring if the class name is already used. */
private static void addAllocationDecider(Map<Class, AllocationDecider> deciders, AllocationDecider decider) {
if (deciders.put(decider.getClass(), decider) != null) {
throw new IllegalArgumentException("Cannot specify allocation decider [" + decider.getClass().getName() + "] twice");
}
}
private static ShardsAllocator createShardsAllocator(Settings settings, ClusterSettings clusterSettings,
List<ClusterPlugin> clusterPlugins) {
Map<String, Supplier<ShardsAllocator>> allocators = new HashMap<>();
allocators.put(BALANCED_ALLOCATOR, () -> new BalancedShardsAllocator(settings, clusterSettings));
for (ClusterPlugin plugin : clusterPlugins) {
plugin.getShardsAllocators(settings, clusterSettings).forEach((k, v) -> {
if (allocators.put(k, v) != null) {
throw new IllegalArgumentException("ShardsAllocator [" + k + "] already defined");
}
});
}
String allocatorName = SHARDS_ALLOCATOR_TYPE_SETTING.get(settings);
Supplier<ShardsAllocator> allocatorSupplier = allocators.get(allocatorName);
if (allocatorSupplier == null) {
throw new IllegalArgumentException("Unknown ShardsAllocator [" + allocatorName + "]");
}
return Objects.requireNonNull(allocatorSupplier.get(),
"ShardsAllocator factory for [" + allocatorName + "] returned null");
}
public AllocationService getAllocationService() {
return allocationService;
}
2010-02-08 15:30:06 +02:00
@Override
protected void configure() {
bind(GatewayAllocator.class).asEagerSingleton();
bind(AllocationService.class).toInstance(allocationService);
bind(ClusterService.class).toInstance(clusterService);
bind(NodeConnectionsService.class).asEagerSingleton();
bind(MetaDataCreateIndexService.class).asEagerSingleton();
bind(MetaDataDeleteIndexService.class).asEagerSingleton();
bind(MetaDataIndexStateService.class).asEagerSingleton();
bind(MetaDataMappingService.class).asEagerSingleton();
bind(MetaDataIndexAliasesService.class).asEagerSingleton();
bind(MetaDataUpdateSettingsService.class).asEagerSingleton();
2010-11-26 15:45:18 +02:00
bind(MetaDataIndexTemplateService.class).asEagerSingleton();
bind(IndexNameExpressionResolver.class).toInstance(indexNameExpressionResolver);
2010-02-08 15:30:06 +02:00
bind(RoutingService.class).asEagerSingleton();
bind(DelayedAllocationService.class).asEagerSingleton();
2010-02-08 15:30:06 +02:00
bind(ShardStateAction.class).asEagerSingleton();
bind(NodeMappingRefreshAction.class).asEagerSingleton();
bind(MappingUpdatedAction.class).asEagerSingleton();
bind(TaskResultsService.class).asEagerSingleton();
bind(AllocationDeciders.class).toInstance(allocationDeciders);
bind(ShardsAllocator.class).toInstance(shardsAllocator);
2010-02-08 15:30:06 +02:00
}
2015-10-14 05:41:41 -04:00
}