better logging of recovery

This commit is contained in:
Shay Banon 2012-02-02 00:18:17 +02:00
parent a5838dc403
commit 3698bcc2b9
8 changed files with 85 additions and 14 deletions

View File

@ -197,6 +197,7 @@ public class IndexShardGatewayService extends AbstractIndexShardComponent implem
sb.append(" index : files [").append(recoveryStatus.index().numberOfFiles()).append("] with total_size [").append(new ByteSizeValue(recoveryStatus.index().totalSize())).append("], took[").append(TimeValue.timeValueMillis(recoveryStatus.index().time())).append("]\n"); sb.append(" index : files [").append(recoveryStatus.index().numberOfFiles()).append("] with total_size [").append(new ByteSizeValue(recoveryStatus.index().totalSize())).append("], took[").append(TimeValue.timeValueMillis(recoveryStatus.index().time())).append("]\n");
sb.append(" : recovered_files [").append(recoveryStatus.index().numberOfRecoveredFiles()).append("] with total_size [").append(new ByteSizeValue(recoveryStatus.index().recoveredTotalSize())).append("]\n"); sb.append(" : recovered_files [").append(recoveryStatus.index().numberOfRecoveredFiles()).append("] with total_size [").append(new ByteSizeValue(recoveryStatus.index().recoveredTotalSize())).append("]\n");
sb.append(" : reusing_files [").append(recoveryStatus.index().numberOfReusedFiles()).append("] with total_size [").append(new ByteSizeValue(recoveryStatus.index().reusedTotalSize())).append("]\n"); sb.append(" : reusing_files [").append(recoveryStatus.index().numberOfReusedFiles()).append("] with total_size [").append(new ByteSizeValue(recoveryStatus.index().reusedTotalSize())).append("]\n");
sb.append(" start : took [").append(TimeValue.timeValueMillis(recoveryStatus.start().time())).append("], check_index [").append(timeValueMillis(recoveryStatus.start().checkIndexTime())).append("]\n");
sb.append(" translog : number_of_operations [").append(recoveryStatus.translog().currentTranslogOperations()).append("], took [").append(TimeValue.timeValueMillis(recoveryStatus.translog().time())).append("]"); sb.append(" translog : number_of_operations [").append(recoveryStatus.translog().currentTranslogOperations()).append("], took [").append(TimeValue.timeValueMillis(recoveryStatus.translog().time())).append("]");
logger.debug(sb.toString()); logger.debug(sb.toString());
} }

View File

@ -29,6 +29,7 @@ public class RecoveryStatus {
public static enum Stage { public static enum Stage {
INIT, INIT,
INDEX, INDEX,
START,
TRANSLOG, TRANSLOG,
DONE DONE
} }
@ -43,6 +44,8 @@ public class RecoveryStatus {
private Translog translog = new Translog(); private Translog translog = new Translog();
private Start start = new Start();
public Stage stage() { public Stage stage() {
return this.stage; return this.stage;
} }
@ -72,10 +75,44 @@ public class RecoveryStatus {
return index; return index;
} }
public Start start() {
return this.start;
}
public Translog translog() { public Translog translog() {
return translog; return translog;
} }
public static class Start {
private long startTime;
private long time;
private long checkIndexTime;
public long startTime() {
return this.startTime;
}
public void startTime(long startTime) {
this.startTime = startTime;
}
public long time() {
return this.time;
}
public void time(long time) {
this.time = time;
}
public long checkIndexTime() {
return checkIndexTime;
}
public void checkIndexTime(long checkIndexTime) {
this.checkIndexTime = checkIndexTime;
}
}
public static class Translog { public static class Translog {
private long startTime = 0; private long startTime = 0;
private long time; private long time;

View File

@ -405,8 +405,6 @@ public abstract class BlobStoreIndexShardGateway extends AbstractIndexShardCompo
} }
recoveryStatus.index().startTime(System.currentTimeMillis()); recoveryStatus.index().startTime(System.currentTimeMillis());
recoveryStatus.index().time(System.currentTimeMillis() - recoveryStatus.index().startTime()); recoveryStatus.index().time(System.currentTimeMillis() - recoveryStatus.index().startTime());
recoveryStatus.translog().startTime(System.currentTimeMillis());
recoveryStatus.translog().time(System.currentTimeMillis() - recoveryStatus.index().startTime());
return; return;
} }
@ -417,14 +415,10 @@ public abstract class BlobStoreIndexShardGateway extends AbstractIndexShardCompo
} }
try { try {
recoveryStatus.index().startTime(System.currentTimeMillis()); recoveryStatus.index().startTime(System.currentTimeMillis());
recoveryStatus.updateStage(RecoveryStatus.Stage.INDEX);
recoverIndex(commitPoint, blobs); recoverIndex(commitPoint, blobs);
recoveryStatus.index().time(System.currentTimeMillis() - recoveryStatus.index().startTime()); recoveryStatus.index().time(System.currentTimeMillis() - recoveryStatus.index().startTime());
recoveryStatus.translog().startTime(System.currentTimeMillis());
recoveryStatus.updateStage(RecoveryStatus.Stage.TRANSLOG);
recoverTranslog(commitPoint, blobs); recoverTranslog(commitPoint, blobs);
recoveryStatus.translog().time(System.currentTimeMillis() - recoveryStatus.index().startTime());
return; return;
} catch (Exception e) { } catch (Exception e) {
throw new IndexShardGatewayRecoveryException(shardId, "failed to recover commit_point [" + commitPoint.name() + "]/[" + commitPoint.version() + "]", e); throw new IndexShardGatewayRecoveryException(shardId, "failed to recover commit_point [" + commitPoint.name() + "]/[" + commitPoint.version() + "]", e);
@ -436,12 +430,23 @@ public abstract class BlobStoreIndexShardGateway extends AbstractIndexShardCompo
private void recoverTranslog(CommitPoint commitPoint, ImmutableMap<String, BlobMetaData> blobs) throws IndexShardGatewayRecoveryException { private void recoverTranslog(CommitPoint commitPoint, ImmutableMap<String, BlobMetaData> blobs) throws IndexShardGatewayRecoveryException {
if (commitPoint.translogFiles().isEmpty()) { if (commitPoint.translogFiles().isEmpty()) {
// no translog files, bail // no translog files, bail
recoveryStatus.start().startTime(System.currentTimeMillis());
recoveryStatus.updateStage(RecoveryStatus.Stage.START);
indexShard.start("post recovery from gateway, no translog"); indexShard.start("post recovery from gateway, no translog");
recoveryStatus.start().time(System.currentTimeMillis() - recoveryStatus.start().startTime());
recoveryStatus.start().checkIndexTime(indexShard.checkIndexTook());
return; return;
} }
try { try {
recoveryStatus.start().startTime(System.currentTimeMillis());
recoveryStatus.updateStage(RecoveryStatus.Stage.START);
indexShard.performRecoveryPrepareForTranslog(); indexShard.performRecoveryPrepareForTranslog();
recoveryStatus.start().time(System.currentTimeMillis() - recoveryStatus.start().startTime());
recoveryStatus.start().checkIndexTime(indexShard.checkIndexTook());
recoveryStatus.updateStage(RecoveryStatus.Stage.TRANSLOG);
recoveryStatus.translog().startTime(System.currentTimeMillis());
final AtomicReference<Throwable> failure = new AtomicReference<Throwable>(); final AtomicReference<Throwable> failure = new AtomicReference<Throwable>();
final CountDownLatch latch = new CountDownLatch(1); final CountDownLatch latch = new CountDownLatch(1);
@ -529,12 +534,14 @@ public abstract class BlobStoreIndexShardGateway extends AbstractIndexShardCompo
} }
indexShard.performRecoveryFinalization(true); indexShard.performRecoveryFinalization(true);
recoveryStatus.translog().time(System.currentTimeMillis() - recoveryStatus.translog().startTime());
} catch (Throwable e) { } catch (Throwable e) {
throw new IndexShardGatewayRecoveryException(shardId, "Failed to recover translog", e); throw new IndexShardGatewayRecoveryException(shardId, "Failed to recover translog", e);
} }
} }
private void recoverIndex(CommitPoint commitPoint, ImmutableMap<String, BlobMetaData> blobs) throws Exception { private void recoverIndex(CommitPoint commitPoint, ImmutableMap<String, BlobMetaData> blobs) throws Exception {
recoveryStatus.updateStage(RecoveryStatus.Stage.INDEX);
int numberOfFiles = 0; int numberOfFiles = 0;
long totalSize = 0; long totalSize = 0;
int numberOfReusedFiles = 0; int numberOfReusedFiles = 0;

View File

@ -91,6 +91,7 @@ public class LocalIndexShardGateway extends AbstractIndexShardComponent implemen
@Override @Override
public void recover(boolean indexShouldExists, RecoveryStatus recoveryStatus) throws IndexShardGatewayRecoveryException { public void recover(boolean indexShouldExists, RecoveryStatus recoveryStatus) throws IndexShardGatewayRecoveryException {
recoveryStatus.index().startTime(System.currentTimeMillis()); recoveryStatus.index().startTime(System.currentTimeMillis());
recoveryStatus.updateStage(RecoveryStatus.Stage.INDEX);
long version = -1; long version = -1;
long translogId = -1; long translogId = -1;
try { try {
@ -124,12 +125,14 @@ public class LocalIndexShardGateway extends AbstractIndexShardComponent implemen
// ignore // ignore
} }
recoveryStatus.translog().startTime(System.currentTimeMillis()); recoveryStatus.start().startTime(System.currentTimeMillis());
recoveryStatus.updateStage(RecoveryStatus.Stage.START);
if (translogId == -1) { if (translogId == -1) {
// no translog files, bail // no translog files, bail
indexShard.start("post recovery from gateway, no translog"); indexShard.start("post recovery from gateway, no translog");
// no index, just start the shard and bail // no index, just start the shard and bail
recoveryStatus.translog().time(System.currentTimeMillis() - recoveryStatus.index().startTime()); recoveryStatus.start().time(System.currentTimeMillis() - recoveryStatus.start().startTime());
recoveryStatus.start().checkIndexTime(indexShard.checkIndexTook());
return; return;
} }
@ -163,12 +166,18 @@ public class LocalIndexShardGateway extends AbstractIndexShardComponent implemen
// no translog files, bail // no translog files, bail
indexShard.start("post recovery from gateway, no translog"); indexShard.start("post recovery from gateway, no translog");
// no index, just start the shard and bail // no index, just start the shard and bail
recoveryStatus.translog().time(System.currentTimeMillis() - recoveryStatus.index().startTime()); recoveryStatus.start().time(System.currentTimeMillis() - recoveryStatus.start().startTime());
recoveryStatus.start().checkIndexTime(indexShard.checkIndexTook());
return; return;
} }
// recover from the translog file // recover from the translog file
indexShard.performRecoveryPrepareForTranslog(); indexShard.performRecoveryPrepareForTranslog();
recoveryStatus.start().time(System.currentTimeMillis() - recoveryStatus.start().startTime());
recoveryStatus.start().checkIndexTime(indexShard.checkIndexTook());
recoveryStatus.translog().startTime(System.currentTimeMillis());
recoveryStatus.updateStage(RecoveryStatus.Stage.TRANSLOG);
try { try {
InputStreamStreamInput si = new InputStreamStreamInput(new FileInputStream(recoveringTranslogFile)); InputStreamStreamInput si = new InputStreamStreamInput(new FileInputStream(recoveringTranslogFile));
while (true) { while (true) {
@ -195,7 +204,7 @@ public class LocalIndexShardGateway extends AbstractIndexShardComponent implemen
recoveringTranslogFile.delete(); recoveringTranslogFile.delete();
recoveryStatus.translog().time(System.currentTimeMillis() - recoveryStatus.index().startTime()); recoveryStatus.translog().time(System.currentTimeMillis() - recoveryStatus.translog().startTime());
} }
@Override @Override

View File

@ -115,6 +115,8 @@ public class InternalIndexShard extends AbstractIndexShardComponent implements I
private final boolean checkIndexOnStartup; private final boolean checkIndexOnStartup;
private long checkIndexTook = 0;
private volatile IndexShardState state; private volatile IndexShardState state;
private TimeValue refreshInterval; private TimeValue refreshInterval;
@ -550,6 +552,10 @@ public class InternalIndexShard extends AbstractIndexShardComponent implements I
} }
} }
public long checkIndexTook() {
return this.checkIndexTook;
}
/** /**
* After the store has been recovered, we need to start the engine in order to apply operations * After the store has been recovered, we need to start the engine in order to apply operations
*/ */
@ -830,6 +836,8 @@ public class InternalIndexShard extends AbstractIndexShardComponent implements I
private void checkIndex(boolean throwException) throws IndexShardException { private void checkIndex(boolean throwException) throws IndexShardException {
try { try {
checkIndexTook = 0;
long time = System.currentTimeMillis();
if (!IndexReader.indexExists(store.directory())) { if (!IndexReader.indexExists(store.directory())) {
return; return;
} }
@ -853,6 +861,7 @@ public class InternalIndexShard extends AbstractIndexShardComponent implements I
logger.debug("check index [success]\n{}", new String(os.underlyingBytes(), 0, os.size())); logger.debug("check index [success]\n{}", new String(os.underlyingBytes(), 0, os.size()));
} }
} }
checkIndexTook = System.currentTimeMillis() - time;
} catch (Exception e) { } catch (Exception e) {
logger.warn("failed to check index", e); logger.warn("failed to check index", e);
} }

View File

@ -41,6 +41,8 @@ class RecoveryResponse implements Streamable {
long phase1Time; long phase1Time;
long phase1ThrottlingWaitTime; long phase1ThrottlingWaitTime;
long startTime;
int phase2Operations; int phase2Operations;
long phase2Time; long phase2Time;
@ -78,6 +80,7 @@ class RecoveryResponse implements Streamable {
phase1ExistingTotalSize = in.readVLong(); phase1ExistingTotalSize = in.readVLong();
phase1Time = in.readVLong(); phase1Time = in.readVLong();
phase1ThrottlingWaitTime = in.readVLong(); phase1ThrottlingWaitTime = in.readVLong();
startTime = in.readVLong();
phase2Operations = in.readVInt(); phase2Operations = in.readVInt();
phase2Time = in.readVLong(); phase2Time = in.readVLong();
phase3Operations = in.readVInt(); phase3Operations = in.readVInt();
@ -108,6 +111,7 @@ class RecoveryResponse implements Streamable {
out.writeVLong(phase1ExistingTotalSize); out.writeVLong(phase1ExistingTotalSize);
out.writeVLong(phase1Time); out.writeVLong(phase1Time);
out.writeVLong(phase1ThrottlingWaitTime); out.writeVLong(phase1ThrottlingWaitTime);
out.writeVLong(startTime);
out.writeVInt(phase2Operations); out.writeVInt(phase2Operations);
out.writeVLong(phase2Time); out.writeVLong(phase2Time);
out.writeVInt(phase3Operations); out.writeVInt(phase3Operations);

View File

@ -194,13 +194,16 @@ public class RecoverySource extends AbstractComponent {
if (shard.state() == IndexShardState.CLOSED) { if (shard.state() == IndexShardState.CLOSED) {
throw new IndexShardClosedException(request.shardId()); throw new IndexShardClosedException(request.shardId());
} }
logger.trace("[{}][{}] recovery [phase2] to {}: sending transaction log operations", request.shardId().index().name(), request.shardId().id(), request.targetNode()); logger.trace("[{}][{}] recovery [phase2] to {}: start", request.shardId().index().name(), request.shardId().id(), request.targetNode());
StopWatch stopWatch = new StopWatch().start(); StopWatch stopWatch = new StopWatch().start();
transportService.submitRequest(request.targetNode(), RecoveryTarget.Actions.PREPARE_TRANSLOG, new RecoveryPrepareForTranslogOperationsRequest(request.shardId()), VoidTransportResponseHandler.INSTANCE_SAME).txGet(); transportService.submitRequest(request.targetNode(), RecoveryTarget.Actions.PREPARE_TRANSLOG, new RecoveryPrepareForTranslogOperationsRequest(request.shardId()), VoidTransportResponseHandler.INSTANCE_SAME).txGet();
stopWatch.stop();
response.startTime = stopWatch.totalTime().millis();
logger.trace("[{}][{}] recovery [phase2] to {}: start took [{}]", request.shardId().index().name(), request.shardId().id(), request.targetNode(), stopWatch.totalTime());
logger.trace("[{}][{}] recovery [phase2] to {}: sending transaction log operations", request.shardId().index().name(), request.shardId().id(), request.targetNode());
stopWatch = new StopWatch().start();
int totalOperations = sendSnapshot(snapshot); int totalOperations = sendSnapshot(snapshot);
stopWatch.stop(); stopWatch.stop();
logger.trace("[{}][{}] recovery [phase2] to {}: took [{}]", request.shardId().index().name(), request.shardId().id(), request.targetNode(), stopWatch.totalTime()); logger.trace("[{}][{}] recovery [phase2] to {}: took [{}]", request.shardId().index().name(), request.shardId().id(), request.targetNode(), stopWatch.totalTime());
response.phase2Time = stopWatch.totalTime().millis(); response.phase2Time = stopWatch.totalTime().millis();

View File

@ -195,7 +195,8 @@ public class RecoveryTarget extends AbstractComponent {
.append(", took [").append(timeValueMillis(recoveryStatus.phase1Time)).append("], throttling_wait [").append(timeValueMillis(recoveryStatus.phase1ThrottlingWaitTime)).append(']') .append(", took [").append(timeValueMillis(recoveryStatus.phase1Time)).append("], throttling_wait [").append(timeValueMillis(recoveryStatus.phase1ThrottlingWaitTime)).append(']')
.append("\n"); .append("\n");
sb.append(" : reusing_files [").append(recoveryStatus.phase1ExistingFileNames.size()).append("] with total_size of [").append(new ByteSizeValue(recoveryStatus.phase1ExistingTotalSize)).append("]\n"); sb.append(" : reusing_files [").append(recoveryStatus.phase1ExistingFileNames.size()).append("] with total_size of [").append(new ByteSizeValue(recoveryStatus.phase1ExistingTotalSize)).append("]\n");
sb.append(" phase2: recovered [").append(recoveryStatus.phase2Operations).append("]").append(" transaction log operations") sb.append(" phase2: start took [").append(timeValueMillis(recoveryStatus.startTime)).append("]\n");
sb.append(" : recovered [").append(recoveryStatus.phase2Operations).append("]").append(" transaction log operations")
.append(", took [").append(timeValueMillis(recoveryStatus.phase2Time)).append("]") .append(", took [").append(timeValueMillis(recoveryStatus.phase2Time)).append("]")
.append("\n"); .append("\n");
sb.append(" phase3: recovered [").append(recoveryStatus.phase3Operations).append("]").append(" transaction log operations") sb.append(" phase3: recovered [").append(recoveryStatus.phase3Operations).append("]").append(" transaction log operations")