HBASE-4820. Distributed log splitting coding enhancement to make it easier to understand, no semantics change. Contributed by Jimmy Xiang.
git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1208801 13f79535-47bb-0310-9956-ffa450edef68
commit 54e463f519
parent b290b43264
@@ -1546,8 +1546,8 @@ public class AssignmentManager extends ZooKeeperListener {
     // In case of reassignment the current state in memory need not be
     // OFFLINE.
     if (!hijack && !state.isClosed() && !state.isOffline()) {
-      this.master.abort("Unexpected state trying to OFFLINE; " + state,
-        new IllegalStateException());
+      String msg = "Unexpected state : " + state + " .. Cannot transit it to OFFLINE.";
+      this.master.abort(msg, new IllegalStateException(msg));
       return -1;
     }
     boolean allowZNodeCreation = false;
@@ -268,9 +268,7 @@ public class MasterFileSystem {
     }

     if (distributedLogSplitting) {
-      for (ServerName serverName : serverNames) {
-        splitLogManager.handleDeadWorker(serverName.toString());
-      }
+      splitLogManager.handleDeadWorkers(serverNames);
       splitTime = EnvironmentEdgeManager.currentTimeMillis();
       try {
         splitLogSize = splitLogManager.splitLogDistributed(logDirs);
@@ -37,6 +37,7 @@ import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hbase.Chore;
+import org.apache.hadoop.hbase.ServerName;
 import org.apache.hadoop.hbase.Stoppable;
 import org.apache.hadoop.hbase.master.SplitLogManager.TaskFinisher.Status;
 import org.apache.hadoop.hbase.monitoring.MonitoredTask;
@@ -243,12 +244,12 @@ public class SplitLogManager extends ZooKeeperListener {
       // recover-lease is done. totalSize will be under in most cases and the
       // metrics that it drives will also be under-reported.
       totalSize += lf.getLen();
-      if (installTask(lf.getPath().toString(), batch) == false) {
+      if (enqueueSplitTask(lf.getPath().toString(), batch) == false) {
         throw new IOException("duplicate log split scheduled for "
           + lf.getPath());
       }
     }
-    waitTasks(batch, status);
+    waitForSplittingCompletion(batch, status);
     if (batch.done != batch.installed) {
       stopTrackingTasks(batch);
       tot_mgr_log_split_batch_err.incrementAndGet();
@@ -278,7 +279,14 @@ public class SplitLogManager extends ZooKeeperListener {
     return totalSize;
   }

-  boolean installTask(String taskname, TaskBatch batch) {
+  /**
+   * Add a task entry to splitlog znode if it is not already there.
+   *
+   * @param taskname the path of the log to be split
+   * @param batch the batch this task belongs to
+   * @return true if a new entry is created, false if it is already there.
+   */
+  boolean enqueueSplitTask(String taskname, TaskBatch batch) {
     tot_mgr_log_split_start.incrementAndGet();
     String path = ZKSplitLog.getEncodedNodeName(watcher, taskname);
     Task oldtask = createTaskIfAbsent(path, batch);
@@ -292,7 +300,7 @@ public class SplitLogManager extends ZooKeeperListener {
     return false;
   }

-  private void waitTasks(TaskBatch batch, MonitoredTask status) {
+  private void waitForSplittingCompletion(TaskBatch batch, MonitoredTask status) {
     synchronized (batch) {
       while ((batch.done + batch.error) != batch.installed) {
         try {
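Taken together, the two renamed methods describe the manager-side flow: each log file is enqueued as a split task, and the caller then blocks until every installed task has either completed or failed. Below is a minimal, self-contained sketch of that enqueue-then-wait pattern; the TaskBatch counters (installed, done, error) mirror the fields referenced in the hunks above, while the thread pool and the splitOneLog() placeholder are illustrative stand-ins for the real ZooKeeper-based scheduling.

import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

public class SplitBatchSketch {
  static class TaskBatch {
    int installed; // tasks handed out
    int done;      // tasks finished successfully
    int error;     // tasks that failed
  }

  private final ExecutorService workers = Executors.newFixedThreadPool(4);

  boolean enqueueSplitTask(String logPath, TaskBatch batch) {
    synchronized (batch) {
      batch.installed++;
    }
    workers.submit(() -> {
      boolean ok = splitOneLog(logPath); // placeholder for the real split work
      synchronized (batch) {
        if (ok) batch.done++; else batch.error++;
        batch.notifyAll(); // wake the waiter below
      }
    });
    return true;
  }

  void waitForSplittingCompletion(TaskBatch batch) throws InterruptedException {
    synchronized (batch) {
      // Same condition as the hunk above: wait until every installed task
      // has either completed or failed.
      while (batch.done + batch.error != batch.installed) {
        batch.wait(100);
      }
    }
  }

  private boolean splitOneLog(String logPath) {
    return true; // stand-in for the actual log splitting
  }

  public static void main(String[] args) throws InterruptedException {
    SplitBatchSketch sketch = new SplitBatchSketch();
    TaskBatch batch = new TaskBatch();
    for (String log : List.of("hlog.1", "hlog.2", "hlog.3")) {
      sketch.enqueueSplitTask(log, batch);
    }
    sketch.waitForSplittingCompletion(batch);
    sketch.workers.shutdown();
  }
}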
@@ -371,7 +379,7 @@ public class SplitLogManager extends ZooKeeperListener {
   }

   private void createNodeFailure(String path) {
-    // TODO the Manger should split the log locally instead of giving up
+    // TODO the Manager should split the log locally instead of giving up
     LOG.warn("failed to create task node" + path);
     setDone(path, FAILURE);
   }
@@ -767,16 +775,30 @@ public class SplitLogManager extends ZooKeeperListener {
      }
    }

-  void handleDeadWorker(String worker_name) {
+  void handleDeadWorker(String workerName) {
     // resubmit the tasks on the TimeoutMonitor thread. Makes it easier
     // to reason about concurrency. Makes it easier to retry.
     synchronized (deadWorkersLock) {
       if (deadWorkers == null) {
         deadWorkers = new HashSet<String>(100);
       }
-      deadWorkers.add(worker_name);
+      deadWorkers.add(workerName);
     }
-    LOG.info("dead splitlog worker " + worker_name);
+    LOG.info("dead splitlog worker " + workerName);
   }
+
+  void handleDeadWorkers(List<ServerName> serverNames) {
+    List<String> workerNames = new ArrayList<String>(serverNames.size());
+    for (ServerName serverName : serverNames) {
+      workerNames.add(serverName.toString());
+    }
+    synchronized (deadWorkersLock) {
+      if (deadWorkers == null) {
+        deadWorkers = new HashSet<String>(100);
+      }
+      deadWorkers.addAll(workerNames);
+    }
+    LOG.info("dead splitlog workers " + workerNames);
+  }

   /**
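handleDeadWorkers() is the bulk counterpart of handleDeadWorker(): both only record the dead workers under deadWorkersLock and leave the actual task resubmission to the TimeoutMonitor thread, which keeps the concurrency easy to reason about. A minimal, self-contained sketch of that bookkeeping follows; the onTimeoutMonitorTick() method, the printed resubmission, and the example server-name strings are assumptions made purely for illustration.

import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class DeadWorkerTrackerSketch {
  private final Object deadWorkersLock = new Object();
  private Set<String> deadWorkers; // lazily created, as in the hunk above

  void handleDeadWorker(String workerName) {
    synchronized (deadWorkersLock) {
      if (deadWorkers == null) {
        deadWorkers = new HashSet<String>(100);
      }
      deadWorkers.add(workerName);
    }
  }

  void handleDeadWorkers(List<String> workerNames) {
    synchronized (deadWorkersLock) {
      if (deadWorkers == null) {
        deadWorkers = new HashSet<String>(100);
      }
      deadWorkers.addAll(workerNames);
    }
  }

  // Stand-in for the TimeoutMonitor chore: drain the set on one thread and
  // resubmit the tasks owned by those workers there.
  void onTimeoutMonitorTick() {
    Set<String> toProcess;
    synchronized (deadWorkersLock) {
      if (deadWorkers == null || deadWorkers.isEmpty()) {
        return;
      }
      toProcess = deadWorkers;
      deadWorkers = null;
    }
    for (String worker : toProcess) {
      System.out.println("resubmitting tasks owned by dead worker " + worker);
    }
  }

  public static void main(String[] args) {
    DeadWorkerTrackerSketch tracker = new DeadWorkerTrackerSketch();
    tracker.handleDeadWorker("rs1,60020,1320000000000");
    tracker.handleDeadWorkers(List.of("rs2,60020,1320000000001"));
    tracker.onTimeoutMonitorTick();
  }
}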
@@ -871,7 +893,7 @@ public class SplitLogManager extends ZooKeeperListener {
       } else {
         Long retry_count = (Long)ctx;
         LOG.warn("create rc =" + KeeperException.Code.get(rc) + " for " +
-            path + " retry=" + retry_count);
+            path + " remaining retries=" + retry_count);
         if (retry_count == 0) {
           tot_mgr_node_create_err.incrementAndGet();
           createNodeFailure(path);
@@ -900,7 +922,7 @@ public class SplitLogManager extends ZooKeeperListener {
       if (rc != 0) {
         Long retry_count = (Long) ctx;
         LOG.warn("getdata rc = " + KeeperException.Code.get(rc) + " " +
-            path + " retry=" + retry_count);
+            path + " remaining retries=" + retry_count);
         if (retry_count == 0) {
           tot_mgr_get_data_err.incrementAndGet();
           getDataSetWatchFailure(path);
@@ -930,7 +952,7 @@ public class SplitLogManager extends ZooKeeperListener {
         tot_mgr_node_delete_err.incrementAndGet();
         Long retry_count = (Long) ctx;
         LOG.warn("delete rc=" + KeeperException.Code.get(rc) + " for " +
-            path + " retry=" + retry_count);
+            path + " remaining retries=" + retry_count);
         if (retry_count == 0) {
           LOG.warn("delete failed " + path);
           deleteNodeFailure(path);
@@ -965,7 +987,7 @@ public class SplitLogManager extends ZooKeeperListener {
       if (rc != 0) {
         Long retry_count = (Long)ctx;
         LOG.warn("rc=" + KeeperException.Code.get(rc) + " for "+ path +
-            " retry=" + retry_count);
+            " remaining retries=" + retry_count);
         if (retry_count == 0) {
           createRescanFailure();
         } else {
@@ -54,8 +54,8 @@ import org.apache.zookeeper.data.Stat;
  * <p>
  * If a worker has successfully moved the task from state UNASSIGNED to
  * OWNED then it owns the task. It keeps heart beating the manager by
- * periodically moving the task from OWNED to OWNED state. On success it
- * moves the task to SUCCESS. On unrecoverable error it moves task state to
+ * periodically moving the task from UNASSIGNED to OWNED state. On success it
+ * moves the task to TASK_DONE. On unrecoverable error it moves task state to
  * ERR. If it cannot continue but wants the master to retry the task then it
  * moves the task state to RESIGNED.
  * <p>
@@ -70,7 +70,7 @@ public class SplitLogWorker extends ZooKeeperListener implements Runnable {

   Thread worker;
   private final String serverName;
-  private final TaskExecutor executor;
+  private final TaskExecutor splitTaskExecutor;
   private long zkretries;

   private Object taskReadyLock = new Object();
@@ -83,10 +83,10 @@ public class SplitLogWorker extends ZooKeeperListener implements Runnable {


   public SplitLogWorker(ZooKeeperWatcher watcher, Configuration conf,
-      String serverName, TaskExecutor executor) {
+      String serverName, TaskExecutor splitTaskExecutor) {
     super(watcher);
     this.serverName = serverName;
-    this.executor = executor;
+    this.splitTaskExecutor = splitTaskExecutor;
     this.zkretries = conf.getLong("hbase.splitlog.zk.retries", 3);
   }

@@ -247,7 +247,7 @@ public class SplitLogWorker extends ZooKeeperListener implements Runnable {
       }

       currentVersion = stat.getVersion();
-      if (ownTask(true) == false) {
+      if (attemptToOwnTask(true) == false) {
         tot_wkr_failed_to_grab_task_lost_race.incrementAndGet();
         return;
       }
@@ -263,12 +263,12 @@ public class SplitLogWorker extends ZooKeeperListener implements Runnable {
       t = System.currentTimeMillis();
       TaskExecutor.Status status;

-      status = executor.exec(ZKSplitLog.getFileName(currentTask),
+      status = splitTaskExecutor.exec(ZKSplitLog.getFileName(currentTask),
           new CancelableProgressable() {

         @Override
         public boolean progress() {
-          if (ownTask(false) == false) {
+          if (attemptToOwnTask(false) == false) {
             LOG.warn("Failed to heartbeat the task" + currentTask);
             return false;
           }
@@ -327,7 +327,7 @@ public class SplitLogWorker extends ZooKeeperListener implements Runnable {
    * <p>
    * @return true if task path is successfully locked
    */
-  private boolean ownTask(boolean isFirstTime) {
+  private boolean attemptToOwnTask(boolean isFirstTime) {
     try {
       Stat stat = this.watcher.getRecoverableZooKeeper().setData(currentTask,
           TaskState.TASK_OWNED.get(serverName), currentVersion);
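attemptToOwnTask() claims and then heartbeats a task with a conditional setData on the task znode: the write succeeds only if the version the worker last saw is still current, so a competing writer shows up as a version mismatch and the worker gives up the task. The sketch below models that check with an in-memory versioned cell instead of ZooKeeper; the VersionedNode class and the example data strings are assumptions made purely for illustration.

public class TaskOwnershipSketch {
  /** In-memory stand-in for a task znode: data plus a version bumped on every write. */
  static class VersionedNode {
    private String data;
    private int version;

    // Mirrors a conditional setData(path, data, expectedVersion): the write
    // only succeeds when the caller's expected version matches the current one.
    synchronized int setData(String newData, int expectedVersion) {
      if (expectedVersion != version) {
        return -1; // version mismatch: someone else has written since we last looked
      }
      data = newData;
      version++;
      return version;
    }
  }

  private final VersionedNode taskNode = new VersionedNode();
  private int currentVersion = 0;            // version this worker saw when it grabbed the task
  private final String serverName = "worker-1"; // illustrative worker id

  /** Grab or heartbeat the task; returns false if ownership was lost.
   *  isFirstTime is kept only to mirror the real signature. */
  boolean attemptToOwnTask(boolean isFirstTime) {
    int newVersion = taskNode.setData("OWNED " + serverName, currentVersion);
    if (newVersion < 0) {
      return false; // another writer touched the node; we no longer own the task
    }
    currentVersion = newVersion;
    return true;
  }

  public static void main(String[] args) {
    TaskOwnershipSketch worker = new TaskOwnershipSketch();
    System.out.println("grab: " + worker.attemptToOwnTask(true));       // true
    System.out.println("heartbeat: " + worker.attemptToOwnTask(false)); // true
    // Simulate the manager (or another worker) rewriting the node.
    worker.taskNode.setData("UNASSIGNED", worker.currentVersion);
    System.out.println("heartbeat after takeover: " + worker.attemptToOwnTask(false)); // false
  }
}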
@@ -405,7 +405,7 @@ public class SplitLogWorker extends ZooKeeperListener implements Runnable {
     String taskpath = currentTask;
     if (taskpath != null && taskpath.equals(path)) {
       // have to compare data. cannot compare version because then there
-      // will be race with ownTask()
+      // will be race with attemptToOwnTask()
       // cannot just check whether the node has been transitioned to
       // UNASSIGNED because by the time this worker sets the data watch
       // the node might have made two transitions - from owned by this
@@ -446,7 +446,7 @@ public class SplitLogWorker extends ZooKeeperListener implements Runnable {

   @Override
   public void nodeDataChanged(String path) {
-    // there will be a self generated dataChanged event every time ownTask()
+    // there will be a self generated dataChanged event every time attemptToOwnTask()
     // heartbeats the task znode by upping its version
     synchronized (grabTaskLock) {
       if (workerInGrabTask) {
@@ -671,14 +671,15 @@ public class HLogSplitter {
     return String.format("%019d", seqid);
   }

-  /*
-   * Parse a single hlog and put the edits in @splitLogsMap
+  /**
+   * Parse a single hlog and put the edits in entryBuffers
    *
-   * @param logfile to split
-   * @param splitLogsMap output parameter: a map with region names as keys and a
-   * list of edits as values
+   * @param in the hlog reader
+   * @param path the path of the log file
+   * @param entryBuffers the buffer to hold the parsed edits
    * @param fs the file system
    * @param conf the configuration
+   * @param skipErrors indicator if CorruptedLogFileException should be thrown instead of IOException
    * @throws IOException
    * @throws CorruptedLogFileException if hlog is corrupted
    */
@@ -275,7 +275,7 @@ public class TestDistributedLogSplitting {
     // slm.splitLogDistributed(logDir);
     FileStatus[] logfiles = fs.listStatus(logDir);
     TaskBatch batch = new TaskBatch();
-    slm.installTask(logfiles[0].getPath().toString(), batch);
+    slm.enqueueSplitTask(logfiles[0].getPath().toString(), batch);
     //waitForCounter but for one of the 2 counters
     long curt = System.currentTimeMillis();
     long endt = curt + 30000;
@@ -372,13 +372,9 @@ public class TestDistributedLogSplitting {
         byte [] qualifier = Bytes.toBytes("c" + Integer.toString(i));
         e.add(new KeyValue(row, family, qualifier,
             System.currentTimeMillis(), value));
-        // LOG.info("Region " + i + ": " + e);
         j++;
         log.append(hris.get(j % n), table, e, System.currentTimeMillis(), htd);
         counts[j % n] += 1;
-        // if ((i % 8096) == 0) {
-        //   log.sync();
-        // }
       }
     }
     log.sync();
@@ -138,7 +138,7 @@ public class TestSplitLogManager {
     zkw.registerListener(listener);
     ZKUtil.watchAndCheckExists(zkw, tasknode);

-    slm.installTask(name, batch);
+    slm.enqueueSplitTask(name, batch);
     assertEquals(1, batch.installed);
     assertTrue(slm.findOrCreateOrphanTask(tasknode).batch == batch);
     assertEquals(1L, tot_mgr_node_create_queued.get());