YARN-4559. Make leader elector and zk store share the same curator
client. Contributed by Jian He
This commit is contained in:
parent
2ec438e8f7
commit
890a2ebd1a
|
@ -98,6 +98,9 @@ Release 2.9.0 - UNRELEASED
|
||||||
YARN-4526. Make SystemClock singleton so AppSchedulingInfo could use it.
|
YARN-4526. Make SystemClock singleton so AppSchedulingInfo could use it.
|
||||||
(kasha)
|
(kasha)
|
||||||
|
|
||||||
|
YARN-4559. Make leader elector and zk store share the same curator client.
|
||||||
|
(Jian He via xgong)
|
||||||
|
|
||||||
OPTIMIZATIONS
|
OPTIMIZATIONS
|
||||||
|
|
||||||
BUG FIXES
|
BUG FIXES
|
||||||
|
|
|
@ -271,6 +271,11 @@
|
||||||
<Field name="numRetries" />
|
<Field name="numRetries" />
|
||||||
<Bug pattern="IS2_INCONSISTENT_SYNC" />
|
<Bug pattern="IS2_INCONSISTENT_SYNC" />
|
||||||
</Match>
|
</Match>
|
||||||
|
<Match>
|
||||||
|
<Class name="org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore" />
|
||||||
|
<Field name="resourceManager"/>
|
||||||
|
<Bug pattern="IS2_INCONSISTENT_SYNC" />
|
||||||
|
</Match>
|
||||||
<Match>
|
<Match>
|
||||||
<Class name="org.apache.hadoop.yarn.server.resourcemanager.security.DelegationTokenRenewer"/>
|
<Class name="org.apache.hadoop.yarn.server.resourcemanager.security.DelegationTokenRenewer"/>
|
||||||
<Field name="renewalTimer" />
|
<Field name="renewalTimer" />
|
||||||
|
|
|
@ -19,14 +19,11 @@
|
||||||
package org.apache.hadoop.yarn.server.resourcemanager;
|
package org.apache.hadoop.yarn.server.resourcemanager;
|
||||||
|
|
||||||
import com.google.common.annotations.VisibleForTesting;
|
import com.google.common.annotations.VisibleForTesting;
|
||||||
import com.google.common.base.Preconditions;
|
|
||||||
import org.apache.commons.logging.Log;
|
import org.apache.commons.logging.Log;
|
||||||
import org.apache.commons.logging.LogFactory;
|
import org.apache.commons.logging.LogFactory;
|
||||||
import org.apache.curator.framework.CuratorFramework;
|
import org.apache.curator.framework.CuratorFramework;
|
||||||
import org.apache.curator.framework.CuratorFrameworkFactory;
|
|
||||||
import org.apache.curator.framework.recipes.leader.LeaderLatch;
|
import org.apache.curator.framework.recipes.leader.LeaderLatch;
|
||||||
import org.apache.curator.framework.recipes.leader.LeaderLatchListener;
|
import org.apache.curator.framework.recipes.leader.LeaderLatchListener;
|
||||||
import org.apache.curator.retry.RetryNTimes;
|
|
||||||
import org.apache.hadoop.conf.Configuration;
|
import org.apache.hadoop.conf.Configuration;
|
||||||
import org.apache.hadoop.ha.HAServiceProtocol;
|
import org.apache.hadoop.ha.HAServiceProtocol;
|
||||||
import org.apache.hadoop.service.AbstractService;
|
import org.apache.hadoop.service.AbstractService;
|
||||||
|
@ -44,35 +41,23 @@ public class LeaderElectorService extends AbstractService implements
|
||||||
private RMContext rmContext;
|
private RMContext rmContext;
|
||||||
private String latchPath;
|
private String latchPath;
|
||||||
private String rmId;
|
private String rmId;
|
||||||
|
private ResourceManager rm;
|
||||||
|
|
||||||
public LeaderElectorService(RMContext rmContext) {
|
public LeaderElectorService(RMContext rmContext, ResourceManager rm) {
|
||||||
super(LeaderElectorService.class.getName());
|
super(LeaderElectorService.class.getName());
|
||||||
this.rmContext = rmContext;
|
this.rmContext = rmContext;
|
||||||
|
this.rm = rm;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected void serviceInit(Configuration conf) throws Exception {
|
protected void serviceInit(Configuration conf) throws Exception {
|
||||||
String zkHostPort = conf.get(YarnConfiguration.RM_ZK_ADDRESS);
|
|
||||||
Preconditions.checkNotNull(zkHostPort,
|
|
||||||
YarnConfiguration.RM_ZK_ADDRESS + " is not set");
|
|
||||||
|
|
||||||
rmId = HAUtil.getRMHAId(conf);
|
rmId = HAUtil.getRMHAId(conf);
|
||||||
String clusterId = YarnConfiguration.getClusterId(conf);
|
String clusterId = YarnConfiguration.getClusterId(conf);
|
||||||
|
|
||||||
int zkSessionTimeout = conf.getInt(YarnConfiguration.RM_ZK_TIMEOUT_MS,
|
|
||||||
YarnConfiguration.DEFAULT_RM_ZK_TIMEOUT_MS);
|
|
||||||
int maxRetryNum = conf.getInt(YarnConfiguration.RM_ZK_NUM_RETRIES,
|
|
||||||
YarnConfiguration.DEFAULT_ZK_RM_NUM_RETRIES);
|
|
||||||
|
|
||||||
String zkBasePath = conf.get(
|
String zkBasePath = conf.get(
|
||||||
YarnConfiguration.AUTO_FAILOVER_ZK_BASE_PATH,
|
YarnConfiguration.AUTO_FAILOVER_ZK_BASE_PATH,
|
||||||
YarnConfiguration.DEFAULT_AUTO_FAILOVER_ZK_BASE_PATH);
|
YarnConfiguration.DEFAULT_AUTO_FAILOVER_ZK_BASE_PATH);
|
||||||
latchPath = zkBasePath + "/" + clusterId;
|
latchPath = zkBasePath + "/" + clusterId;
|
||||||
|
curator = rm.getCurator();
|
||||||
curator = CuratorFrameworkFactory.builder().connectString(zkHostPort)
|
|
||||||
.retryPolicy(new RetryNTimes(maxRetryNum, zkSessionTimeout)).build();
|
|
||||||
curator.start();
|
|
||||||
initAndStartLeaderLatch();
|
initAndStartLeaderLatch();
|
||||||
super.serviceInit(conf);
|
super.serviceInit(conf);
|
||||||
}
|
}
|
||||||
|
|
|
@ -21,6 +21,10 @@ package org.apache.hadoop.yarn.server.resourcemanager;
|
||||||
import com.google.common.annotations.VisibleForTesting;
|
import com.google.common.annotations.VisibleForTesting;
|
||||||
import org.apache.commons.logging.Log;
|
import org.apache.commons.logging.Log;
|
||||||
import org.apache.commons.logging.LogFactory;
|
import org.apache.commons.logging.LogFactory;
|
||||||
|
import org.apache.curator.framework.AuthInfo;
|
||||||
|
import org.apache.curator.framework.CuratorFramework;
|
||||||
|
import org.apache.curator.framework.CuratorFrameworkFactory;
|
||||||
|
import org.apache.curator.retry.RetryNTimes;
|
||||||
import org.apache.hadoop.classification.InterfaceAudience.Private;
|
import org.apache.hadoop.classification.InterfaceAudience.Private;
|
||||||
import org.apache.hadoop.conf.Configuration;
|
import org.apache.hadoop.conf.Configuration;
|
||||||
import org.apache.hadoop.ha.HAServiceProtocol;
|
import org.apache.hadoop.ha.HAServiceProtocol;
|
||||||
|
@ -28,7 +32,11 @@ import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
|
||||||
import org.apache.hadoop.http.lib.StaticUserWebFilter;
|
import org.apache.hadoop.http.lib.StaticUserWebFilter;
|
||||||
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
|
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
|
||||||
import org.apache.hadoop.metrics2.source.JvmMetrics;
|
import org.apache.hadoop.metrics2.source.JvmMetrics;
|
||||||
import org.apache.hadoop.security.*;
|
import org.apache.hadoop.security.AuthenticationFilterInitializer;
|
||||||
|
import org.apache.hadoop.security.Groups;
|
||||||
|
import org.apache.hadoop.security.HttpCrossOriginFilterInitializer;
|
||||||
|
import org.apache.hadoop.security.SecurityUtil;
|
||||||
|
import org.apache.hadoop.security.UserGroupInformation;
|
||||||
import org.apache.hadoop.security.authentication.server.KerberosAuthenticationHandler;
|
import org.apache.hadoop.security.authentication.server.KerberosAuthenticationHandler;
|
||||||
import org.apache.hadoop.security.authorize.ProxyUsers;
|
import org.apache.hadoop.security.authorize.ProxyUsers;
|
||||||
import org.apache.hadoop.service.AbstractService;
|
import org.apache.hadoop.service.AbstractService;
|
||||||
|
@ -40,6 +48,7 @@ import org.apache.hadoop.util.JvmPauseMonitor;
|
||||||
import org.apache.hadoop.util.ReflectionUtils;
|
import org.apache.hadoop.util.ReflectionUtils;
|
||||||
import org.apache.hadoop.util.ShutdownHookManager;
|
import org.apache.hadoop.util.ShutdownHookManager;
|
||||||
import org.apache.hadoop.util.StringUtils;
|
import org.apache.hadoop.util.StringUtils;
|
||||||
|
import org.apache.hadoop.util.ZKUtil;
|
||||||
import org.apache.hadoop.yarn.YarnUncaughtExceptionHandler;
|
import org.apache.hadoop.yarn.YarnUncaughtExceptionHandler;
|
||||||
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
|
import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
|
||||||
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
import org.apache.hadoop.yarn.api.records.ApplicationId;
|
||||||
|
@ -58,8 +67,8 @@ import org.apache.hadoop.yarn.server.resourcemanager.amlauncher.ApplicationMaste
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.metrics.SystemMetricsPublisher;
|
import org.apache.hadoop.yarn.server.resourcemanager.metrics.SystemMetricsPublisher;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.monitor.SchedulingEditPolicy;
|
import org.apache.hadoop.yarn.server.resourcemanager.monitor.SchedulingEditPolicy;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.monitor.SchedulingMonitor;
|
import org.apache.hadoop.yarn.server.resourcemanager.monitor.SchedulingMonitor;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager;
|
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMDelegatedNodeLabelsUpdater;
|
import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMDelegatedNodeLabelsUpdater;
|
||||||
|
import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.recovery.NullRMStateStore;
|
import org.apache.hadoop.yarn.server.resourcemanager.recovery.NullRMStateStore;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore;
|
import org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore.RMState;
|
import org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore.RMState;
|
||||||
|
@ -78,7 +87,9 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.ContainerAlloca
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode;
|
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeEvent;
|
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeEvent;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeEventType;
|
import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeEventType;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.*;
|
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.PreemptableResourceScheduler;
|
||||||
|
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics;
|
||||||
|
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEvent;
|
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEvent;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEventType;
|
import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEventType;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.security.DelegationTokenRenewer;
|
import org.apache.hadoop.yarn.server.resourcemanager.security.DelegationTokenRenewer;
|
||||||
|
@ -96,12 +107,15 @@ import org.apache.hadoop.yarn.webapp.WebApp;
|
||||||
import org.apache.hadoop.yarn.webapp.WebApps;
|
import org.apache.hadoop.yarn.webapp.WebApps;
|
||||||
import org.apache.hadoop.yarn.webapp.WebApps.Builder;
|
import org.apache.hadoop.yarn.webapp.WebApps.Builder;
|
||||||
import org.apache.hadoop.yarn.webapp.util.WebAppUtils;
|
import org.apache.hadoop.yarn.webapp.util.WebAppUtils;
|
||||||
|
import org.apache.zookeeper.server.auth.DigestAuthenticationProvider;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.io.PrintStream;
|
import java.io.PrintStream;
|
||||||
import java.net.InetSocketAddress;
|
import java.net.InetSocketAddress;
|
||||||
|
import java.nio.charset.Charset;
|
||||||
import java.security.PrivilegedExceptionAction;
|
import java.security.PrivilegedExceptionAction;
|
||||||
|
import java.security.SecureRandom;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.concurrent.BlockingQueue;
|
import java.util.concurrent.BlockingQueue;
|
||||||
|
@ -158,6 +172,10 @@ public class ResourceManager extends CompositeService implements Recoverable {
|
||||||
protected ResourceTrackerService resourceTracker;
|
protected ResourceTrackerService resourceTracker;
|
||||||
private JvmPauseMonitor pauseMonitor;
|
private JvmPauseMonitor pauseMonitor;
|
||||||
private boolean curatorEnabled = false;
|
private boolean curatorEnabled = false;
|
||||||
|
private CuratorFramework curator;
|
||||||
|
private final String zkRootNodePassword =
|
||||||
|
Long.toString(new SecureRandom().nextLong());
|
||||||
|
private boolean recoveryEnabled;
|
||||||
|
|
||||||
@VisibleForTesting
|
@VisibleForTesting
|
||||||
protected String webAppAddress;
|
protected String webAppAddress;
|
||||||
|
@ -232,7 +250,8 @@ public class ResourceManager extends CompositeService implements Recoverable {
|
||||||
curatorEnabled = conf.getBoolean(YarnConfiguration.CURATOR_LEADER_ELECTOR,
|
curatorEnabled = conf.getBoolean(YarnConfiguration.CURATOR_LEADER_ELECTOR,
|
||||||
YarnConfiguration.DEFAULT_CURATOR_LEADER_ELECTOR_ENABLED);
|
YarnConfiguration.DEFAULT_CURATOR_LEADER_ELECTOR_ENABLED);
|
||||||
if (curatorEnabled) {
|
if (curatorEnabled) {
|
||||||
LeaderElectorService elector = new LeaderElectorService(rmContext);
|
this.curator = createAndStartCurator(conf);
|
||||||
|
LeaderElectorService elector = new LeaderElectorService(rmContext, this);
|
||||||
addService(elector);
|
addService(elector);
|
||||||
rmContext.setLeaderElectorService(elector);
|
rmContext.setLeaderElectorService(elector);
|
||||||
}
|
}
|
||||||
|
@ -276,7 +295,58 @@ public class ResourceManager extends CompositeService implements Recoverable {
|
||||||
|
|
||||||
super.serviceInit(this.conf);
|
super.serviceInit(this.conf);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public CuratorFramework createAndStartCurator(Configuration conf)
|
||||||
|
throws Exception {
|
||||||
|
String zkHostPort = conf.get(YarnConfiguration.RM_ZK_ADDRESS);
|
||||||
|
if (zkHostPort == null) {
|
||||||
|
throw new YarnRuntimeException(
|
||||||
|
YarnConfiguration.RM_ZK_ADDRESS + " is not configured.");
|
||||||
|
}
|
||||||
|
int numRetries = conf.getInt(YarnConfiguration.RM_ZK_NUM_RETRIES,
|
||||||
|
YarnConfiguration.DEFAULT_ZK_RM_NUM_RETRIES);
|
||||||
|
int zkSessionTimeout = conf.getInt(YarnConfiguration.RM_ZK_TIMEOUT_MS,
|
||||||
|
YarnConfiguration.DEFAULT_RM_ZK_TIMEOUT_MS);
|
||||||
|
int zkRetryInterval = conf.getInt(YarnConfiguration.RM_ZK_RETRY_INTERVAL_MS,
|
||||||
|
YarnConfiguration.DEFAULT_RM_ZK_RETRY_INTERVAL_MS);
|
||||||
|
|
||||||
|
// set up zk auths
|
||||||
|
List<ZKUtil.ZKAuthInfo> zkAuths = RMZKUtils.getZKAuths(conf);
|
||||||
|
List<AuthInfo> authInfos = new ArrayList<>();
|
||||||
|
for (ZKUtil.ZKAuthInfo zkAuth : zkAuths) {
|
||||||
|
authInfos.add(new AuthInfo(zkAuth.getScheme(), zkAuth.getAuth()));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (HAUtil.isHAEnabled(conf) && HAUtil.getConfValueForRMInstance(
|
||||||
|
YarnConfiguration.ZK_RM_STATE_STORE_ROOT_NODE_ACL, conf) == null) {
|
||||||
|
String zkRootNodeUsername = HAUtil
|
||||||
|
.getConfValueForRMInstance(YarnConfiguration.RM_ADDRESS,
|
||||||
|
YarnConfiguration.DEFAULT_RM_ADDRESS, conf);
|
||||||
|
byte[] defaultFencingAuth =
|
||||||
|
(zkRootNodeUsername + ":" + zkRootNodePassword)
|
||||||
|
.getBytes(Charset.forName("UTF-8"));
|
||||||
|
authInfos.add(new AuthInfo(new DigestAuthenticationProvider().getScheme(),
|
||||||
|
defaultFencingAuth));
|
||||||
|
}
|
||||||
|
|
||||||
|
CuratorFramework client = CuratorFrameworkFactory.builder()
|
||||||
|
.connectString(zkHostPort)
|
||||||
|
.sessionTimeoutMs(zkSessionTimeout)
|
||||||
|
.retryPolicy(new RetryNTimes(numRetries, zkRetryInterval))
|
||||||
|
.authorization(authInfos).build();
|
||||||
|
client.start();
|
||||||
|
return client;
|
||||||
|
}
|
||||||
|
|
||||||
|
public CuratorFramework getCurator() {
|
||||||
|
return this.curator;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getZkRootNodePassword() {
|
||||||
|
return this.zkRootNodePassword;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
protected QueueACLsManager createQueueACLsManager(ResourceScheduler scheduler,
|
protected QueueACLsManager createQueueACLsManager(ResourceScheduler scheduler,
|
||||||
Configuration conf) {
|
Configuration conf) {
|
||||||
return new QueueACLsManager(scheduler, conf);
|
return new QueueACLsManager(scheduler, conf);
|
||||||
|
@ -412,7 +482,6 @@ public class ResourceManager extends CompositeService implements Recoverable {
|
||||||
private ApplicationMasterLauncher applicationMasterLauncher;
|
private ApplicationMasterLauncher applicationMasterLauncher;
|
||||||
private ContainerAllocationExpirer containerAllocationExpirer;
|
private ContainerAllocationExpirer containerAllocationExpirer;
|
||||||
private ResourceManager rm;
|
private ResourceManager rm;
|
||||||
private boolean recoveryEnabled;
|
|
||||||
private RMActiveServiceContext activeServiceContext;
|
private RMActiveServiceContext activeServiceContext;
|
||||||
|
|
||||||
RMActiveServices(ResourceManager rm) {
|
RMActiveServices(ResourceManager rm) {
|
||||||
|
@ -453,29 +522,26 @@ public class ResourceManager extends CompositeService implements Recoverable {
|
||||||
rmContext.setRMDelegatedNodeLabelsUpdater(delegatedNodeLabelsUpdater);
|
rmContext.setRMDelegatedNodeLabelsUpdater(delegatedNodeLabelsUpdater);
|
||||||
}
|
}
|
||||||
|
|
||||||
boolean isRecoveryEnabled = conf.getBoolean(
|
recoveryEnabled = conf.getBoolean(YarnConfiguration.RECOVERY_ENABLED,
|
||||||
YarnConfiguration.RECOVERY_ENABLED,
|
|
||||||
YarnConfiguration.DEFAULT_RM_RECOVERY_ENABLED);
|
YarnConfiguration.DEFAULT_RM_RECOVERY_ENABLED);
|
||||||
|
|
||||||
RMStateStore rmStore = null;
|
RMStateStore rmStore = null;
|
||||||
if (isRecoveryEnabled) {
|
if (recoveryEnabled) {
|
||||||
recoveryEnabled = true;
|
|
||||||
rmStore = RMStateStoreFactory.getStore(conf);
|
rmStore = RMStateStoreFactory.getStore(conf);
|
||||||
boolean isWorkPreservingRecoveryEnabled =
|
boolean isWorkPreservingRecoveryEnabled =
|
||||||
conf.getBoolean(
|
conf.getBoolean(
|
||||||
YarnConfiguration.RM_WORK_PRESERVING_RECOVERY_ENABLED,
|
YarnConfiguration.RM_WORK_PRESERVING_RECOVERY_ENABLED,
|
||||||
YarnConfiguration.DEFAULT_RM_WORK_PRESERVING_RECOVERY_ENABLED);
|
YarnConfiguration.DEFAULT_RM_WORK_PRESERVING_RECOVERY_ENABLED);
|
||||||
rmContext
|
rmContext
|
||||||
.setWorkPreservingRecoveryEnabled(isWorkPreservingRecoveryEnabled);
|
.setWorkPreservingRecoveryEnabled(isWorkPreservingRecoveryEnabled);
|
||||||
} else {
|
} else {
|
||||||
recoveryEnabled = false;
|
|
||||||
rmStore = new NullRMStateStore();
|
rmStore = new NullRMStateStore();
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
|
rmStore.setResourceManager(rm);
|
||||||
rmStore.init(conf);
|
rmStore.init(conf);
|
||||||
rmStore.setRMDispatcher(rmDispatcher);
|
rmStore.setRMDispatcher(rmDispatcher);
|
||||||
rmStore.setResourceManager(rm);
|
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
// the Exception from stateStore.init() needs to be handled for
|
// the Exception from stateStore.init() needs to be handled for
|
||||||
// HA and we need to give up master status if we got fenced
|
// HA and we need to give up master status if we got fenced
|
||||||
|
@ -1130,6 +1196,9 @@ public class ResourceManager extends CompositeService implements Recoverable {
|
||||||
configurationProvider.close();
|
configurationProvider.close();
|
||||||
}
|
}
|
||||||
super.serviceStop();
|
super.serviceStop();
|
||||||
|
if (curator != null) {
|
||||||
|
curator.close();
|
||||||
|
}
|
||||||
transitionToStandby(false);
|
transitionToStandby(false);
|
||||||
rmContext.setHAServiceState(HAServiceState.STOPPING);
|
rmContext.setHAServiceState(HAServiceState.STOPPING);
|
||||||
}
|
}
|
||||||
|
@ -1177,7 +1246,7 @@ public class ResourceManager extends CompositeService implements Recoverable {
|
||||||
public ClientRMService getClientRMService() {
|
public ClientRMService getClientRMService() {
|
||||||
return this.clientRM;
|
return this.clientRM;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* return the scheduler.
|
* return the scheduler.
|
||||||
* @return the scheduler for the Resource Manager.
|
* @return the scheduler for the Resource Manager.
|
||||||
|
@ -1348,5 +1417,4 @@ public class ResourceManager extends CompositeService implements Recoverable {
|
||||||
out.println(" "
|
out.println(" "
|
||||||
+ "[-remove-application-from-state-store <appId>]" + "\n");
|
+ "[-remove-application-from-state-store <appId>]" + "\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -95,7 +95,7 @@ public abstract class RMStateStore extends AbstractService {
|
||||||
"ReservationSystemRoot";
|
"ReservationSystemRoot";
|
||||||
protected static final String VERSION_NODE = "RMVersionNode";
|
protected static final String VERSION_NODE = "RMVersionNode";
|
||||||
protected static final String EPOCH_NODE = "EpochNode";
|
protected static final String EPOCH_NODE = "EpochNode";
|
||||||
private ResourceManager resourceManager;
|
protected ResourceManager resourceManager;
|
||||||
private final ReadLock readLock;
|
private final ReadLock readLock;
|
||||||
private final WriteLock writeLock;
|
private final WriteLock writeLock;
|
||||||
|
|
||||||
|
|
|
@ -18,26 +18,13 @@
|
||||||
|
|
||||||
package org.apache.hadoop.yarn.server.resourcemanager.recovery;
|
package org.apache.hadoop.yarn.server.resourcemanager.recovery;
|
||||||
|
|
||||||
import java.io.ByteArrayInputStream;
|
import com.google.common.annotations.VisibleForTesting;
|
||||||
import java.io.ByteArrayOutputStream;
|
|
||||||
import java.io.DataInputStream;
|
|
||||||
import java.io.DataOutputStream;
|
|
||||||
import java.nio.charset.Charset;
|
|
||||||
import java.security.NoSuchAlgorithmException;
|
|
||||||
import java.security.SecureRandom;
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
import com.google.common.base.Preconditions;
|
import com.google.common.base.Preconditions;
|
||||||
import org.apache.commons.logging.Log;
|
import org.apache.commons.logging.Log;
|
||||||
import org.apache.commons.logging.LogFactory;
|
import org.apache.commons.logging.LogFactory;
|
||||||
import org.apache.curator.framework.AuthInfo;
|
|
||||||
import org.apache.curator.framework.CuratorFramework;
|
import org.apache.curator.framework.CuratorFramework;
|
||||||
import org.apache.curator.framework.CuratorFrameworkFactory;
|
|
||||||
import org.apache.curator.framework.api.transaction.CuratorTransaction;
|
import org.apache.curator.framework.api.transaction.CuratorTransaction;
|
||||||
import org.apache.curator.framework.api.transaction.CuratorTransactionFinal;
|
import org.apache.curator.framework.api.transaction.CuratorTransactionFinal;
|
||||||
import org.apache.curator.retry.RetryNTimes;
|
|
||||||
import org.apache.hadoop.classification.InterfaceAudience.Private;
|
import org.apache.hadoop.classification.InterfaceAudience.Private;
|
||||||
import org.apache.hadoop.classification.InterfaceStability.Unstable;
|
import org.apache.hadoop.classification.InterfaceStability.Unstable;
|
||||||
import org.apache.hadoop.conf.Configuration;
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
@ -77,7 +64,15 @@ import org.apache.zookeeper.data.Id;
|
||||||
import org.apache.zookeeper.data.Stat;
|
import org.apache.zookeeper.data.Stat;
|
||||||
import org.apache.zookeeper.server.auth.DigestAuthenticationProvider;
|
import org.apache.zookeeper.server.auth.DigestAuthenticationProvider;
|
||||||
|
|
||||||
import com.google.common.annotations.VisibleForTesting;
|
import java.io.ByteArrayInputStream;
|
||||||
|
import java.io.ByteArrayOutputStream;
|
||||||
|
import java.io.DataInputStream;
|
||||||
|
import java.io.DataOutputStream;
|
||||||
|
import java.security.NoSuchAlgorithmException;
|
||||||
|
import java.security.SecureRandom;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* {@link RMStateStore} implementation backed by ZooKeeper.
|
* {@link RMStateStore} implementation backed by ZooKeeper.
|
||||||
|
@ -140,12 +135,6 @@ public class ZKRMStateStore extends RMStateStore {
|
||||||
private static final String RM_DT_MASTER_KEYS_ROOT_ZNODE_NAME =
|
private static final String RM_DT_MASTER_KEYS_ROOT_ZNODE_NAME =
|
||||||
"RMDTMasterKeysRoot";
|
"RMDTMasterKeysRoot";
|
||||||
|
|
||||||
private String zkHostPort = null;
|
|
||||||
private int numRetries;
|
|
||||||
private int zkSessionTimeout;
|
|
||||||
@VisibleForTesting
|
|
||||||
int zkRetryInterval;
|
|
||||||
|
|
||||||
/** Znode paths */
|
/** Znode paths */
|
||||||
private String zkRootNodePath;
|
private String zkRootNodePath;
|
||||||
private String rmAppRoot;
|
private String rmAppRoot;
|
||||||
|
@ -160,17 +149,15 @@ public class ZKRMStateStore extends RMStateStore {
|
||||||
|
|
||||||
/** Fencing related variables */
|
/** Fencing related variables */
|
||||||
private static final String FENCING_LOCK = "RM_ZK_FENCING_LOCK";
|
private static final String FENCING_LOCK = "RM_ZK_FENCING_LOCK";
|
||||||
private boolean useDefaultFencingScheme = false;
|
|
||||||
private String fencingNodePath;
|
private String fencingNodePath;
|
||||||
private Thread verifyActiveStatusThread;
|
private Thread verifyActiveStatusThread;
|
||||||
|
private int zkSessionTimeout;
|
||||||
|
|
||||||
/** ACL and auth info */
|
/** ACL and auth info */
|
||||||
private List<ACL> zkAcl;
|
private List<ACL> zkAcl;
|
||||||
private List<ZKUtil.ZKAuthInfo> zkAuths;
|
|
||||||
@VisibleForTesting
|
@VisibleForTesting
|
||||||
List<ACL> zkRootNodeAcl;
|
List<ACL> zkRootNodeAcl;
|
||||||
private String zkRootNodeUsername;
|
private String zkRootNodeUsername;
|
||||||
private final String zkRootNodePassword = Long.toString(random.nextLong());
|
|
||||||
public static final int CREATE_DELETE_PERMS =
|
public static final int CREATE_DELETE_PERMS =
|
||||||
ZooDefs.Perms.CREATE | ZooDefs.Perms.DELETE;
|
ZooDefs.Perms.CREATE | ZooDefs.Perms.DELETE;
|
||||||
private final String zkRootNodeAuthScheme =
|
private final String zkRootNodeAuthScheme =
|
||||||
|
@ -204,45 +191,25 @@ public class ZKRMStateStore extends RMStateStore {
|
||||||
YarnConfiguration.DEFAULT_RM_ADDRESS, conf);
|
YarnConfiguration.DEFAULT_RM_ADDRESS, conf);
|
||||||
Id rmId = new Id(zkRootNodeAuthScheme,
|
Id rmId = new Id(zkRootNodeAuthScheme,
|
||||||
DigestAuthenticationProvider.generateDigest(
|
DigestAuthenticationProvider.generateDigest(
|
||||||
zkRootNodeUsername + ":" + zkRootNodePassword));
|
zkRootNodeUsername + ":" + resourceManager.getZkRootNodePassword()));
|
||||||
zkRootNodeAcl.add(new ACL(CREATE_DELETE_PERMS, rmId));
|
zkRootNodeAcl.add(new ACL(CREATE_DELETE_PERMS, rmId));
|
||||||
return zkRootNodeAcl;
|
return zkRootNodeAcl;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public synchronized void initInternal(Configuration conf) throws Exception {
|
public synchronized void initInternal(Configuration conf) throws Exception {
|
||||||
zkHostPort = conf.get(YarnConfiguration.RM_ZK_ADDRESS);
|
|
||||||
if (zkHostPort == null) {
|
/* Initialize fencing related paths, acls, and ops */
|
||||||
throw new YarnRuntimeException("No server address specified for " +
|
|
||||||
"zookeeper state store for Resource Manager recovery. " +
|
|
||||||
YarnConfiguration.RM_ZK_ADDRESS + " is not configured.");
|
|
||||||
}
|
|
||||||
numRetries =
|
|
||||||
conf.getInt(YarnConfiguration.RM_ZK_NUM_RETRIES,
|
|
||||||
YarnConfiguration.DEFAULT_ZK_RM_NUM_RETRIES);
|
|
||||||
znodeWorkingPath =
|
znodeWorkingPath =
|
||||||
conf.get(YarnConfiguration.ZK_RM_STATE_STORE_PARENT_PATH,
|
conf.get(YarnConfiguration.ZK_RM_STATE_STORE_PARENT_PATH,
|
||||||
YarnConfiguration.DEFAULT_ZK_RM_STATE_STORE_PARENT_PATH);
|
YarnConfiguration.DEFAULT_ZK_RM_STATE_STORE_PARENT_PATH);
|
||||||
zkSessionTimeout =
|
zkRootNodePath = getNodePath(znodeWorkingPath, ROOT_ZNODE_NAME);
|
||||||
conf.getInt(YarnConfiguration.RM_ZK_TIMEOUT_MS,
|
fencingNodePath = getNodePath(zkRootNodePath, FENCING_LOCK);
|
||||||
YarnConfiguration.DEFAULT_RM_ZK_TIMEOUT_MS);
|
rmAppRoot = getNodePath(zkRootNodePath, RM_APP_ROOT);
|
||||||
|
zkSessionTimeout = conf.getInt(YarnConfiguration.RM_ZK_TIMEOUT_MS,
|
||||||
if (HAUtil.isHAEnabled(conf)) {
|
YarnConfiguration.DEFAULT_RM_ZK_TIMEOUT_MS);
|
||||||
zkRetryInterval = zkSessionTimeout / numRetries;
|
|
||||||
} else {
|
|
||||||
zkRetryInterval =
|
|
||||||
conf.getInt(YarnConfiguration.RM_ZK_RETRY_INTERVAL_MS,
|
|
||||||
YarnConfiguration.DEFAULT_RM_ZK_RETRY_INTERVAL_MS);
|
|
||||||
}
|
|
||||||
|
|
||||||
zkAcl = RMZKUtils.getZKAcls(conf);
|
zkAcl = RMZKUtils.getZKAcls(conf);
|
||||||
zkAuths = RMZKUtils.getZKAuths(conf);
|
|
||||||
|
|
||||||
zkRootNodePath = getNodePath(znodeWorkingPath, ROOT_ZNODE_NAME);
|
|
||||||
rmAppRoot = getNodePath(zkRootNodePath, RM_APP_ROOT);
|
|
||||||
|
|
||||||
/* Initialize fencing related paths, acls, and ops */
|
|
||||||
fencingNodePath = getNodePath(zkRootNodePath, FENCING_LOCK);
|
|
||||||
if (HAUtil.isHAEnabled(conf)) {
|
if (HAUtil.isHAEnabled(conf)) {
|
||||||
String zkRootNodeAclConf = HAUtil.getConfValueForRMInstance
|
String zkRootNodeAclConf = HAUtil.getConfValueForRMInstance
|
||||||
(YarnConfiguration.ZK_RM_STATE_STORE_ROOT_NODE_ACL, conf);
|
(YarnConfiguration.ZK_RM_STATE_STORE_ROOT_NODE_ACL, conf);
|
||||||
|
@ -256,7 +223,6 @@ public class ZKRMStateStore extends RMStateStore {
|
||||||
throw bafe;
|
throw bafe;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
useDefaultFencingScheme = true;
|
|
||||||
zkRootNodeAcl = constructZkRootNodeACL(conf, zkAcl);
|
zkRootNodeAcl = constructZkRootNodeACL(conf, zkAcl);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -272,19 +238,22 @@ public class ZKRMStateStore extends RMStateStore {
|
||||||
amrmTokenSecretManagerRoot =
|
amrmTokenSecretManagerRoot =
|
||||||
getNodePath(zkRootNodePath, AMRMTOKEN_SECRET_MANAGER_ROOT);
|
getNodePath(zkRootNodePath, AMRMTOKEN_SECRET_MANAGER_ROOT);
|
||||||
reservationRoot = getNodePath(zkRootNodePath, RESERVATION_SYSTEM_ROOT);
|
reservationRoot = getNodePath(zkRootNodePath, RESERVATION_SYSTEM_ROOT);
|
||||||
|
curatorFramework = resourceManager.getCurator();
|
||||||
|
if (curatorFramework == null) {
|
||||||
|
curatorFramework = resourceManager.createAndStartCurator(conf);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public synchronized void startInternal() throws Exception {
|
public synchronized void startInternal() throws Exception {
|
||||||
// createConnection for future API calls
|
|
||||||
createConnection();
|
|
||||||
|
|
||||||
// ensure root dirs exist
|
// ensure root dirs exist
|
||||||
createRootDirRecursively(znodeWorkingPath);
|
createRootDirRecursively(znodeWorkingPath);
|
||||||
create(zkRootNodePath);
|
create(zkRootNodePath);
|
||||||
setRootNodeAcls();
|
setRootNodeAcls();
|
||||||
delete(fencingNodePath);
|
delete(fencingNodePath);
|
||||||
if (HAUtil.isHAEnabled(getConfig())) {
|
if (HAUtil.isHAEnabled(getConfig()) && !HAUtil
|
||||||
|
.isAutomaticFailoverEnabled(getConfig())) {
|
||||||
verifyActiveStatusThread = new VerifyActiveStatusThread();
|
verifyActiveStatusThread = new VerifyActiveStatusThread();
|
||||||
verifyActiveStatusThread.start();
|
verifyActiveStatusThread.start();
|
||||||
}
|
}
|
||||||
|
@ -332,7 +301,9 @@ public class ZKRMStateStore extends RMStateStore {
|
||||||
verifyActiveStatusThread.interrupt();
|
verifyActiveStatusThread.interrupt();
|
||||||
verifyActiveStatusThread.join(1000);
|
verifyActiveStatusThread.join(1000);
|
||||||
}
|
}
|
||||||
IOUtils.closeStream(curatorFramework);
|
if (!HAUtil.isHAEnabled(getConfig())) {
|
||||||
|
IOUtils.closeStream(curatorFramework);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -909,34 +880,6 @@ public class ZKRMStateStore extends RMStateStore {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
* ZK operations using curator
|
|
||||||
*/
|
|
||||||
private void createConnection() throws Exception {
|
|
||||||
// Curator connection
|
|
||||||
CuratorFrameworkFactory.Builder builder = CuratorFrameworkFactory.builder();
|
|
||||||
builder = builder.connectString(zkHostPort)
|
|
||||||
.connectionTimeoutMs(zkSessionTimeout)
|
|
||||||
.retryPolicy(new RetryNTimes(numRetries, zkRetryInterval));
|
|
||||||
|
|
||||||
// Set up authorization based on fencing scheme
|
|
||||||
List<AuthInfo> authInfos = new ArrayList<>();
|
|
||||||
for (ZKUtil.ZKAuthInfo zkAuth : zkAuths) {
|
|
||||||
authInfos.add(new AuthInfo(zkAuth.getScheme(), zkAuth.getAuth()));
|
|
||||||
}
|
|
||||||
if (useDefaultFencingScheme) {
|
|
||||||
byte[] defaultFencingAuth =
|
|
||||||
(zkRootNodeUsername + ":" + zkRootNodePassword).getBytes(
|
|
||||||
Charset.forName("UTF-8"));
|
|
||||||
authInfos.add(new AuthInfo(zkRootNodeAuthScheme, defaultFencingAuth));
|
|
||||||
}
|
|
||||||
builder = builder.authorization(authInfos);
|
|
||||||
|
|
||||||
// Connect to ZK
|
|
||||||
curatorFramework = builder.build();
|
|
||||||
curatorFramework.start();
|
|
||||||
}
|
|
||||||
|
|
||||||
@VisibleForTesting
|
@VisibleForTesting
|
||||||
byte[] getData(final String path) throws Exception {
|
byte[] getData(final String path) throws Exception {
|
||||||
return curatorFramework.getData().forPath(path);
|
return curatorFramework.getData().forPath(path);
|
||||||
|
|
|
@ -105,6 +105,7 @@ public class TestZKRMStateStore extends RMStateStoreTestBase {
|
||||||
|
|
||||||
public TestZKRMStateStoreInternal(Configuration conf, String workingZnode)
|
public TestZKRMStateStoreInternal(Configuration conf, String workingZnode)
|
||||||
throws Exception {
|
throws Exception {
|
||||||
|
setResourceManager(new ResourceManager());
|
||||||
init(conf);
|
init(conf);
|
||||||
start();
|
start();
|
||||||
assertTrue(znodeWorkingPath.equals(workingZnode));
|
assertTrue(znodeWorkingPath.equals(workingZnode));
|
||||||
|
|
|
@ -36,6 +36,7 @@ import org.apache.hadoop.yarn.api.records.ApplicationId;
|
||||||
import org.apache.hadoop.yarn.api.records.ContainerId;
|
import org.apache.hadoop.yarn.api.records.ContainerId;
|
||||||
import org.apache.hadoop.yarn.security.AMRMTokenIdentifier;
|
import org.apache.hadoop.yarn.security.AMRMTokenIdentifier;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
|
import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
|
||||||
|
import org.apache.hadoop.yarn.server.resourcemanager.ResourceManager;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData;
|
import org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp;
|
import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.security.AMRMTokenSecretManager;
|
import org.apache.hadoop.yarn.server.resourcemanager.security.AMRMTokenSecretManager;
|
||||||
|
@ -103,6 +104,7 @@ public class TestZKRMStateStorePerf extends RMStateStoreTestBase
|
||||||
conf.set(YarnConfiguration.ZK_RM_STATE_STORE_PARENT_PATH, workingZnode);
|
conf.set(YarnConfiguration.ZK_RM_STATE_STORE_PARENT_PATH, workingZnode);
|
||||||
|
|
||||||
store = new ZKRMStateStore();
|
store = new ZKRMStateStore();
|
||||||
|
store.setResourceManager(new ResourceManager());
|
||||||
store.init(conf);
|
store.init(conf);
|
||||||
store.start();
|
store.start();
|
||||||
when(rmContext.getStateStore()).thenReturn(store);
|
when(rmContext.getStateStore()).thenReturn(store);
|
||||||
|
|
|
@ -23,6 +23,7 @@ import org.apache.commons.logging.LogFactory;
|
||||||
import org.apache.curator.test.TestingServer;
|
import org.apache.curator.test.TestingServer;
|
||||||
import org.apache.hadoop.conf.Configuration;
|
import org.apache.hadoop.conf.Configuration;
|
||||||
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
||||||
|
import org.apache.hadoop.yarn.server.resourcemanager.ResourceManager;
|
||||||
import org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStoreTestBase.TestDispatcher;
|
import org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStoreTestBase.TestDispatcher;
|
||||||
import org.apache.hadoop.util.ZKUtil;
|
import org.apache.hadoop.util.ZKUtil;
|
||||||
|
|
||||||
|
@ -80,6 +81,7 @@ public class TestZKRMStateStoreZKClientConnections {
|
||||||
|
|
||||||
public TestZKRMStateStore(Configuration conf, String workingZnode)
|
public TestZKRMStateStore(Configuration conf, String workingZnode)
|
||||||
throws Exception {
|
throws Exception {
|
||||||
|
setResourceManager(new ResourceManager());
|
||||||
init(conf);
|
init(conf);
|
||||||
start();
|
start();
|
||||||
assertTrue(znodeWorkingPath.equals(workingZnode));
|
assertTrue(znodeWorkingPath.equals(workingZnode));
|
||||||
|
@ -168,24 +170,4 @@ public class TestZKRMStateStoreZKClientConnections {
|
||||||
|
|
||||||
zkClientTester.getRMStateStore(conf);
|
zkClientTester.getRMStateStore(conf);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testZKRetryInterval() throws Exception {
|
|
||||||
TestZKClient zkClientTester = new TestZKClient();
|
|
||||||
YarnConfiguration conf = new YarnConfiguration();
|
|
||||||
|
|
||||||
ZKRMStateStore store =
|
|
||||||
(ZKRMStateStore) zkClientTester.getRMStateStore(conf);
|
|
||||||
assertEquals(YarnConfiguration.DEFAULT_RM_ZK_RETRY_INTERVAL_MS,
|
|
||||||
store.zkRetryInterval);
|
|
||||||
store.stop();
|
|
||||||
|
|
||||||
conf.setBoolean(YarnConfiguration.RM_HA_ENABLED, true);
|
|
||||||
store =
|
|
||||||
(ZKRMStateStore) zkClientTester.getRMStateStore(conf);
|
|
||||||
assertEquals(YarnConfiguration.DEFAULT_RM_ZK_TIMEOUT_MS /
|
|
||||||
YarnConfiguration.DEFAULT_ZK_RM_NUM_RETRIES,
|
|
||||||
store.zkRetryInterval);
|
|
||||||
store.stop();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue