YARN-8151. Yarn RM Epoch should wrap around. Contributed by Young Chen.
This commit is contained in:
parent
87c23ef643
commit
e6a80e476d
|
@ -188,6 +188,10 @@ public class YarnConfiguration extends Configuration {
|
||||||
public static final String RM_EPOCH = RM_PREFIX + "epoch";
|
public static final String RM_EPOCH = RM_PREFIX + "epoch";
|
||||||
public static final long DEFAULT_RM_EPOCH = 0L;
|
public static final long DEFAULT_RM_EPOCH = 0L;
|
||||||
|
|
||||||
|
/** The epoch range before wrap around. 0 disables wrap around*/
|
||||||
|
public static final String RM_EPOCH_RANGE = RM_EPOCH + ".range";
|
||||||
|
public static final long DEFAULT_RM_EPOCH_RANGE = 0;
|
||||||
|
|
||||||
/** The address of the applications manager interface in the RM.*/
|
/** The address of the applications manager interface in the RM.*/
|
||||||
public static final String RM_ADDRESS =
|
public static final String RM_ADDRESS =
|
||||||
RM_PREFIX + "address";
|
RM_PREFIX + "address";
|
||||||
|
|
|
@ -676,6 +676,13 @@
|
||||||
<!--value>yarn-cluster</value-->
|
<!--value>yarn-cluster</value-->
|
||||||
</property>
|
</property>
|
||||||
|
|
||||||
|
<property>
|
||||||
|
<description>The range of values above base epoch that the RM will use before
|
||||||
|
wrapping around</description>
|
||||||
|
<name>yarn.resourcemanager.epoch.range</name>
|
||||||
|
<value>0</value>
|
||||||
|
</property>
|
||||||
|
|
||||||
<property>
|
<property>
|
||||||
<description>The list of RM nodes in the cluster when HA is
|
<description>The list of RM nodes in the cluster when HA is
|
||||||
enabled. See description of yarn.resourcemanager.ha
|
enabled. See description of yarn.resourcemanager.ha
|
||||||
|
|
|
@ -205,12 +205,12 @@ public class FileSystemRMStateStore extends RMStateStore {
|
||||||
Epoch epoch = new EpochPBImpl(EpochProto.parseFrom(data));
|
Epoch epoch = new EpochPBImpl(EpochProto.parseFrom(data));
|
||||||
currentEpoch = epoch.getEpoch();
|
currentEpoch = epoch.getEpoch();
|
||||||
// increment epoch and store it
|
// increment epoch and store it
|
||||||
byte[] storeData = Epoch.newInstance(currentEpoch + 1).getProto()
|
byte[] storeData = Epoch.newInstance(nextEpoch(currentEpoch)).getProto()
|
||||||
.toByteArray();
|
.toByteArray();
|
||||||
updateFile(epochNodePath, storeData, false);
|
updateFile(epochNodePath, storeData, false);
|
||||||
} else {
|
} else {
|
||||||
// initialize epoch file with 1 for the next time.
|
// initialize epoch file with 1 for the next time.
|
||||||
byte[] storeData = Epoch.newInstance(currentEpoch + 1).getProto()
|
byte[] storeData = Epoch.newInstance(nextEpoch(currentEpoch)).getProto()
|
||||||
.toByteArray();
|
.toByteArray();
|
||||||
writeFileWithRetries(epochNodePath, storeData, false);
|
writeFileWithRetries(epochNodePath, storeData, false);
|
||||||
}
|
}
|
||||||
|
|
|
@ -259,7 +259,7 @@ public class LeveldbRMStateStore extends RMStateStore {
|
||||||
if (data != null) {
|
if (data != null) {
|
||||||
currentEpoch = EpochProto.parseFrom(data).getEpoch();
|
currentEpoch = EpochProto.parseFrom(data).getEpoch();
|
||||||
}
|
}
|
||||||
EpochProto proto = Epoch.newInstance(currentEpoch + 1).getProto();
|
EpochProto proto = Epoch.newInstance(nextEpoch(currentEpoch)).getProto();
|
||||||
db.put(dbKeyBytes, proto.toByteArray());
|
db.put(dbKeyBytes, proto.toByteArray());
|
||||||
} catch (DBException e) {
|
} catch (DBException e) {
|
||||||
throw new IOException(e);
|
throw new IOException(e);
|
||||||
|
|
|
@ -59,7 +59,7 @@ public class MemoryRMStateStore extends RMStateStore {
|
||||||
@Override
|
@Override
|
||||||
public synchronized long getAndIncrementEpoch() throws Exception {
|
public synchronized long getAndIncrementEpoch() throws Exception {
|
||||||
long currentEpoch = epoch;
|
long currentEpoch = epoch;
|
||||||
epoch = epoch + 1;
|
epoch = nextEpoch(epoch);
|
||||||
return currentEpoch;
|
return currentEpoch;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -104,6 +104,7 @@ public abstract class RMStateStore extends AbstractService {
|
||||||
protected static final String VERSION_NODE = "RMVersionNode";
|
protected static final String VERSION_NODE = "RMVersionNode";
|
||||||
protected static final String EPOCH_NODE = "EpochNode";
|
protected static final String EPOCH_NODE = "EpochNode";
|
||||||
protected long baseEpoch;
|
protected long baseEpoch;
|
||||||
|
private long epochRange;
|
||||||
protected ResourceManager resourceManager;
|
protected ResourceManager resourceManager;
|
||||||
private final ReadLock readLock;
|
private final ReadLock readLock;
|
||||||
private final WriteLock writeLock;
|
private final WriteLock writeLock;
|
||||||
|
@ -732,6 +733,8 @@ public abstract class RMStateStore extends AbstractService {
|
||||||
// read the base epoch value from conf
|
// read the base epoch value from conf
|
||||||
baseEpoch = conf.getLong(YarnConfiguration.RM_EPOCH,
|
baseEpoch = conf.getLong(YarnConfiguration.RM_EPOCH,
|
||||||
YarnConfiguration.DEFAULT_RM_EPOCH);
|
YarnConfiguration.DEFAULT_RM_EPOCH);
|
||||||
|
epochRange = conf.getLong(YarnConfiguration.RM_EPOCH_RANGE,
|
||||||
|
YarnConfiguration.DEFAULT_RM_EPOCH_RANGE);
|
||||||
initInternal(conf);
|
initInternal(conf);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -819,6 +822,19 @@ public abstract class RMStateStore extends AbstractService {
|
||||||
*/
|
*/
|
||||||
public abstract long getAndIncrementEpoch() throws Exception;
|
public abstract long getAndIncrementEpoch() throws Exception;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Compute the next epoch value by incrementing by one.
|
||||||
|
* Wraps around if the epoch range is exceeded so that
|
||||||
|
* when federation is enabled epoch collisions can be avoided.
|
||||||
|
*/
|
||||||
|
protected long nextEpoch(long epoch){
|
||||||
|
long epochVal = epoch - baseEpoch + 1;
|
||||||
|
if (epochRange > 0) {
|
||||||
|
epochVal %= epochRange;
|
||||||
|
}
|
||||||
|
return epochVal + baseEpoch;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Blocking API
|
* Blocking API
|
||||||
* The derived class must recover state from the store and return a new
|
* The derived class must recover state from the store and return a new
|
||||||
|
|
|
@ -491,13 +491,13 @@ public class ZKRMStateStore extends RMStateStore {
|
||||||
Epoch epoch = new EpochPBImpl(EpochProto.parseFrom(data));
|
Epoch epoch = new EpochPBImpl(EpochProto.parseFrom(data));
|
||||||
currentEpoch = epoch.getEpoch();
|
currentEpoch = epoch.getEpoch();
|
||||||
// increment epoch and store it
|
// increment epoch and store it
|
||||||
byte[] storeData = Epoch.newInstance(currentEpoch + 1).getProto()
|
byte[] storeData = Epoch.newInstance(nextEpoch(currentEpoch)).getProto()
|
||||||
.toByteArray();
|
.toByteArray();
|
||||||
zkManager.safeSetData(epochNodePath, storeData, -1, zkAcl,
|
zkManager.safeSetData(epochNodePath, storeData, -1, zkAcl,
|
||||||
fencingNodePath);
|
fencingNodePath);
|
||||||
} else {
|
} else {
|
||||||
// initialize epoch node with 1 for the next time.
|
// initialize epoch node with 1 for the next time.
|
||||||
byte[] storeData = Epoch.newInstance(currentEpoch + 1).getProto()
|
byte[] storeData = Epoch.newInstance(nextEpoch(currentEpoch)).getProto()
|
||||||
.toByteArray();
|
.toByteArray();
|
||||||
zkManager.safeCreate(epochNodePath, storeData, zkAcl,
|
zkManager.safeCreate(epochNodePath, storeData, zkAcl,
|
||||||
CreateMode.PERSISTENT, zkAcl, fencingNodePath);
|
CreateMode.PERSISTENT, zkAcl, fencingNodePath);
|
||||||
|
|
|
@ -94,6 +94,8 @@ public class RMStateStoreTestBase {
|
||||||
|
|
||||||
protected final long epoch = 10L;
|
protected final long epoch = 10L;
|
||||||
|
|
||||||
|
private final long epochRange = 10L;
|
||||||
|
|
||||||
static class TestDispatcher implements Dispatcher, EventHandler<Event> {
|
static class TestDispatcher implements Dispatcher, EventHandler<Event> {
|
||||||
|
|
||||||
ApplicationAttemptId attemptId;
|
ApplicationAttemptId attemptId;
|
||||||
|
@ -141,6 +143,10 @@ public class RMStateStoreTestBase {
|
||||||
boolean attemptExists(RMAppAttempt attempt) throws Exception;
|
boolean attemptExists(RMAppAttempt attempt) throws Exception;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public long getEpochRange() {
|
||||||
|
return epochRange;
|
||||||
|
}
|
||||||
|
|
||||||
void waitNotify(TestDispatcher dispatcher) {
|
void waitNotify(TestDispatcher dispatcher) {
|
||||||
long startTime = System.currentTimeMillis();
|
long startTime = System.currentTimeMillis();
|
||||||
while(!dispatcher.notified) {
|
while(!dispatcher.notified) {
|
||||||
|
@ -576,6 +582,14 @@ public class RMStateStoreTestBase {
|
||||||
|
|
||||||
long thirdTimeEpoch = store.getAndIncrementEpoch();
|
long thirdTimeEpoch = store.getAndIncrementEpoch();
|
||||||
Assert.assertEquals(epoch + 2, thirdTimeEpoch);
|
Assert.assertEquals(epoch + 2, thirdTimeEpoch);
|
||||||
|
|
||||||
|
for (int i = 0; i < epochRange; ++i) {
|
||||||
|
store.getAndIncrementEpoch();
|
||||||
|
}
|
||||||
|
long wrappedEpoch = store.getAndIncrementEpoch();
|
||||||
|
// Epoch should have wrapped around and then incremented once for a total
|
||||||
|
// of + 3
|
||||||
|
Assert.assertEquals(epoch + 3, wrappedEpoch);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testAppDeletion(RMStateStoreHelper stateStoreHelper)
|
public void testAppDeletion(RMStateStoreHelper stateStoreHelper)
|
||||||
|
|
|
@ -118,6 +118,7 @@ public class TestFSRMStateStore extends RMStateStoreTestBase {
|
||||||
conf.setLong(YarnConfiguration.FS_RM_STATE_STORE_RETRY_INTERVAL_MS,
|
conf.setLong(YarnConfiguration.FS_RM_STATE_STORE_RETRY_INTERVAL_MS,
|
||||||
900L);
|
900L);
|
||||||
conf.setLong(YarnConfiguration.RM_EPOCH, epoch);
|
conf.setLong(YarnConfiguration.RM_EPOCH, epoch);
|
||||||
|
conf.setLong(YarnConfiguration.RM_EPOCH_RANGE, getEpochRange());
|
||||||
if (adminCheckEnable) {
|
if (adminCheckEnable) {
|
||||||
conf.setBoolean(
|
conf.setBoolean(
|
||||||
YarnConfiguration.YARN_INTERMEDIATE_DATA_ENCRYPTION, true);
|
YarnConfiguration.YARN_INTERMEDIATE_DATA_ENCRYPTION, true);
|
||||||
|
|
|
@ -83,6 +83,7 @@ public class TestLeveldbRMStateStore extends RMStateStoreTestBase {
|
||||||
@Test(timeout = 60000)
|
@Test(timeout = 60000)
|
||||||
public void testEpoch() throws Exception {
|
public void testEpoch() throws Exception {
|
||||||
conf.setLong(YarnConfiguration.RM_EPOCH, epoch);
|
conf.setLong(YarnConfiguration.RM_EPOCH, epoch);
|
||||||
|
conf.setLong(YarnConfiguration.RM_EPOCH_RANGE, getEpochRange());
|
||||||
LeveldbStateStoreTester tester = new LeveldbStateStoreTester();
|
LeveldbStateStoreTester tester = new LeveldbStateStoreTester();
|
||||||
testEpoch(tester);
|
testEpoch(tester);
|
||||||
}
|
}
|
||||||
|
|
|
@ -210,6 +210,7 @@ public class TestZKRMStateStore extends RMStateStoreTestBase {
|
||||||
curatorTestingServer.getConnectString());
|
curatorTestingServer.getConnectString());
|
||||||
conf.set(YarnConfiguration.ZK_RM_STATE_STORE_PARENT_PATH, workingZnode);
|
conf.set(YarnConfiguration.ZK_RM_STATE_STORE_PARENT_PATH, workingZnode);
|
||||||
conf.setLong(YarnConfiguration.RM_EPOCH, epoch);
|
conf.setLong(YarnConfiguration.RM_EPOCH, epoch);
|
||||||
|
conf.setLong(YarnConfiguration.RM_EPOCH_RANGE, getEpochRange());
|
||||||
this.store = new TestZKRMStateStoreInternal(conf, workingZnode);
|
this.store = new TestZKRMStateStoreInternal(conf, workingZnode);
|
||||||
return this.store;
|
return this.store;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue