YARN-8151. Yarn RM Epoch should wrap around. Contributed by Young Chen.

This commit is contained in:
Inigo Goiri 2018-05-02 17:23:17 -07:00
parent 87c23ef643
commit e6a80e476d
11 changed files with 51 additions and 7 deletions

View File

@ -188,6 +188,10 @@ public class YarnConfiguration extends Configuration {
public static final String RM_EPOCH = RM_PREFIX + "epoch"; public static final String RM_EPOCH = RM_PREFIX + "epoch";
public static final long DEFAULT_RM_EPOCH = 0L; public static final long DEFAULT_RM_EPOCH = 0L;
/** The epoch range before wrap around. 0 disables wrap around*/
public static final String RM_EPOCH_RANGE = RM_EPOCH + ".range";
public static final long DEFAULT_RM_EPOCH_RANGE = 0;
/** The address of the applications manager interface in the RM.*/ /** The address of the applications manager interface in the RM.*/
public static final String RM_ADDRESS = public static final String RM_ADDRESS =
RM_PREFIX + "address"; RM_PREFIX + "address";

View File

@ -676,6 +676,13 @@
<!--value>yarn-cluster</value--> <!--value>yarn-cluster</value-->
</property> </property>
<property>
<description>The range of values above base epoch that the RM will use before
wrapping around</description>
<name>yarn.resourcemanager.epoch.range</name>
<value>0</value>
</property>
<property> <property>
<description>The list of RM nodes in the cluster when HA is <description>The list of RM nodes in the cluster when HA is
enabled. See description of yarn.resourcemanager.ha enabled. See description of yarn.resourcemanager.ha

View File

@ -205,12 +205,12 @@ public class FileSystemRMStateStore extends RMStateStore {
Epoch epoch = new EpochPBImpl(EpochProto.parseFrom(data)); Epoch epoch = new EpochPBImpl(EpochProto.parseFrom(data));
currentEpoch = epoch.getEpoch(); currentEpoch = epoch.getEpoch();
// increment epoch and store it // increment epoch and store it
byte[] storeData = Epoch.newInstance(currentEpoch + 1).getProto() byte[] storeData = Epoch.newInstance(nextEpoch(currentEpoch)).getProto()
.toByteArray(); .toByteArray();
updateFile(epochNodePath, storeData, false); updateFile(epochNodePath, storeData, false);
} else { } else {
// initialize epoch file with 1 for the next time. // initialize epoch file with 1 for the next time.
byte[] storeData = Epoch.newInstance(currentEpoch + 1).getProto() byte[] storeData = Epoch.newInstance(nextEpoch(currentEpoch)).getProto()
.toByteArray(); .toByteArray();
writeFileWithRetries(epochNodePath, storeData, false); writeFileWithRetries(epochNodePath, storeData, false);
} }

View File

@ -259,7 +259,7 @@ public class LeveldbRMStateStore extends RMStateStore {
if (data != null) { if (data != null) {
currentEpoch = EpochProto.parseFrom(data).getEpoch(); currentEpoch = EpochProto.parseFrom(data).getEpoch();
} }
EpochProto proto = Epoch.newInstance(currentEpoch + 1).getProto(); EpochProto proto = Epoch.newInstance(nextEpoch(currentEpoch)).getProto();
db.put(dbKeyBytes, proto.toByteArray()); db.put(dbKeyBytes, proto.toByteArray());
} catch (DBException e) { } catch (DBException e) {
throw new IOException(e); throw new IOException(e);

View File

@ -59,7 +59,7 @@ public class MemoryRMStateStore extends RMStateStore {
@Override @Override
public synchronized long getAndIncrementEpoch() throws Exception { public synchronized long getAndIncrementEpoch() throws Exception {
long currentEpoch = epoch; long currentEpoch = epoch;
epoch = epoch + 1; epoch = nextEpoch(epoch);
return currentEpoch; return currentEpoch;
} }

View File

@ -104,6 +104,7 @@ public abstract class RMStateStore extends AbstractService {
protected static final String VERSION_NODE = "RMVersionNode"; protected static final String VERSION_NODE = "RMVersionNode";
protected static final String EPOCH_NODE = "EpochNode"; protected static final String EPOCH_NODE = "EpochNode";
protected long baseEpoch; protected long baseEpoch;
private long epochRange;
protected ResourceManager resourceManager; protected ResourceManager resourceManager;
private final ReadLock readLock; private final ReadLock readLock;
private final WriteLock writeLock; private final WriteLock writeLock;
@ -732,6 +733,8 @@ public abstract class RMStateStore extends AbstractService {
// read the base epoch value from conf // read the base epoch value from conf
baseEpoch = conf.getLong(YarnConfiguration.RM_EPOCH, baseEpoch = conf.getLong(YarnConfiguration.RM_EPOCH,
YarnConfiguration.DEFAULT_RM_EPOCH); YarnConfiguration.DEFAULT_RM_EPOCH);
epochRange = conf.getLong(YarnConfiguration.RM_EPOCH_RANGE,
YarnConfiguration.DEFAULT_RM_EPOCH_RANGE);
initInternal(conf); initInternal(conf);
} }
@ -819,6 +822,19 @@ public abstract class RMStateStore extends AbstractService {
*/ */
public abstract long getAndIncrementEpoch() throws Exception; public abstract long getAndIncrementEpoch() throws Exception;
/**
* Compute the next epoch value by incrementing by one.
* Wraps around if the epoch range is exceeded so that
* when federation is enabled epoch collisions can be avoided.
*/
protected long nextEpoch(long epoch){
long epochVal = epoch - baseEpoch + 1;
if (epochRange > 0) {
epochVal %= epochRange;
}
return epochVal + baseEpoch;
}
/** /**
* Blocking API * Blocking API
* The derived class must recover state from the store and return a new * The derived class must recover state from the store and return a new

View File

@ -491,13 +491,13 @@ public class ZKRMStateStore extends RMStateStore {
Epoch epoch = new EpochPBImpl(EpochProto.parseFrom(data)); Epoch epoch = new EpochPBImpl(EpochProto.parseFrom(data));
currentEpoch = epoch.getEpoch(); currentEpoch = epoch.getEpoch();
// increment epoch and store it // increment epoch and store it
byte[] storeData = Epoch.newInstance(currentEpoch + 1).getProto() byte[] storeData = Epoch.newInstance(nextEpoch(currentEpoch)).getProto()
.toByteArray(); .toByteArray();
zkManager.safeSetData(epochNodePath, storeData, -1, zkAcl, zkManager.safeSetData(epochNodePath, storeData, -1, zkAcl,
fencingNodePath); fencingNodePath);
} else { } else {
// initialize epoch node with 1 for the next time. // initialize epoch node with 1 for the next time.
byte[] storeData = Epoch.newInstance(currentEpoch + 1).getProto() byte[] storeData = Epoch.newInstance(nextEpoch(currentEpoch)).getProto()
.toByteArray(); .toByteArray();
zkManager.safeCreate(epochNodePath, storeData, zkAcl, zkManager.safeCreate(epochNodePath, storeData, zkAcl,
CreateMode.PERSISTENT, zkAcl, fencingNodePath); CreateMode.PERSISTENT, zkAcl, fencingNodePath);

View File

@ -94,6 +94,8 @@ public class RMStateStoreTestBase {
protected final long epoch = 10L; protected final long epoch = 10L;
private final long epochRange = 10L;
static class TestDispatcher implements Dispatcher, EventHandler<Event> { static class TestDispatcher implements Dispatcher, EventHandler<Event> {
ApplicationAttemptId attemptId; ApplicationAttemptId attemptId;
@ -141,6 +143,10 @@ public class RMStateStoreTestBase {
boolean attemptExists(RMAppAttempt attempt) throws Exception; boolean attemptExists(RMAppAttempt attempt) throws Exception;
} }
public long getEpochRange() {
return epochRange;
}
void waitNotify(TestDispatcher dispatcher) { void waitNotify(TestDispatcher dispatcher) {
long startTime = System.currentTimeMillis(); long startTime = System.currentTimeMillis();
while(!dispatcher.notified) { while(!dispatcher.notified) {
@ -576,6 +582,14 @@ public class RMStateStoreTestBase {
long thirdTimeEpoch = store.getAndIncrementEpoch(); long thirdTimeEpoch = store.getAndIncrementEpoch();
Assert.assertEquals(epoch + 2, thirdTimeEpoch); Assert.assertEquals(epoch + 2, thirdTimeEpoch);
for (int i = 0; i < epochRange; ++i) {
store.getAndIncrementEpoch();
}
long wrappedEpoch = store.getAndIncrementEpoch();
// Epoch should have wrapped around and then incremented once for a total
// of + 3
Assert.assertEquals(epoch + 3, wrappedEpoch);
} }
public void testAppDeletion(RMStateStoreHelper stateStoreHelper) public void testAppDeletion(RMStateStoreHelper stateStoreHelper)

View File

@ -118,6 +118,7 @@ public class TestFSRMStateStore extends RMStateStoreTestBase {
conf.setLong(YarnConfiguration.FS_RM_STATE_STORE_RETRY_INTERVAL_MS, conf.setLong(YarnConfiguration.FS_RM_STATE_STORE_RETRY_INTERVAL_MS,
900L); 900L);
conf.setLong(YarnConfiguration.RM_EPOCH, epoch); conf.setLong(YarnConfiguration.RM_EPOCH, epoch);
conf.setLong(YarnConfiguration.RM_EPOCH_RANGE, getEpochRange());
if (adminCheckEnable) { if (adminCheckEnable) {
conf.setBoolean( conf.setBoolean(
YarnConfiguration.YARN_INTERMEDIATE_DATA_ENCRYPTION, true); YarnConfiguration.YARN_INTERMEDIATE_DATA_ENCRYPTION, true);

View File

@ -83,6 +83,7 @@ public class TestLeveldbRMStateStore extends RMStateStoreTestBase {
@Test(timeout = 60000) @Test(timeout = 60000)
public void testEpoch() throws Exception { public void testEpoch() throws Exception {
conf.setLong(YarnConfiguration.RM_EPOCH, epoch); conf.setLong(YarnConfiguration.RM_EPOCH, epoch);
conf.setLong(YarnConfiguration.RM_EPOCH_RANGE, getEpochRange());
LeveldbStateStoreTester tester = new LeveldbStateStoreTester(); LeveldbStateStoreTester tester = new LeveldbStateStoreTester();
testEpoch(tester); testEpoch(tester);
} }

View File

@ -210,6 +210,7 @@ public class TestZKRMStateStore extends RMStateStoreTestBase {
curatorTestingServer.getConnectString()); curatorTestingServer.getConnectString());
conf.set(YarnConfiguration.ZK_RM_STATE_STORE_PARENT_PATH, workingZnode); conf.set(YarnConfiguration.ZK_RM_STATE_STORE_PARENT_PATH, workingZnode);
conf.setLong(YarnConfiguration.RM_EPOCH, epoch); conf.setLong(YarnConfiguration.RM_EPOCH, epoch);
conf.setLong(YarnConfiguration.RM_EPOCH_RANGE, getEpochRange());
this.store = new TestZKRMStateStoreInternal(conf, workingZnode); this.store = new TestZKRMStateStoreInternal(conf, workingZnode);
return this.store; return this.store;
} }