HBASE-4275 RS should communicate fatal "aborts" back to the master

git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1163345 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Todd Lipcon 2011-08-30 19:27:24 +00:00
parent 16171a1232
commit 48560e089c
6 changed files with 232 additions and 3 deletions

View File

@ -422,6 +422,7 @@ Release 0.91.0 - Unreleased
HBASE-4291 Improve display of regions in transition in UI to be more
readable (todd)
HBASE-4281 Add facility to dump current state of all executors (todd)
HBASE-4275 RS should communicate fatal "aborts" back to the master (todd)
TASKS
HBASE-3559 Move report of split to master OFF the heartbeat channel

View File

@ -61,4 +61,12 @@ public interface HMasterRegionInterface extends VersionedProtocol {
*/
public void regionServerReport(byte [] sn, HServerLoad hsl)
throws IOException;
}
/**
 * Called by a region server to report a fatal error that is causing
 * it to abort.
 * <p>
 * The declaration has no return value and no checked exceptions.
 *
 * @param sn the reporting server's name in serialized form, i.e. the
 *           result of {@link ServerName#getBytes()}
 * @param errorMessage informative text to expose in the master logs and UI
 */
public void reportRSFatalError(byte [] sn, String errorMessage);
}

View File

@ -73,6 +73,7 @@ import org.apache.hadoop.hbase.master.handler.TableDeleteFamilyHandler;
import org.apache.hadoop.hbase.master.handler.TableModifyFamilyHandler;
import org.apache.hadoop.hbase.master.handler.CreateTableHandler;
import org.apache.hadoop.hbase.master.metrics.MasterMetrics;
import org.apache.hadoop.hbase.monitoring.MemoryBoundedLogMessageBuffer;
import org.apache.hadoop.hbase.monitoring.MonitoredTask;
import org.apache.hadoop.hbase.monitoring.TaskMonitor;
import org.apache.hadoop.hbase.regionserver.HRegion;
@ -156,6 +157,11 @@ implements HMasterInterface, HMasterRegionInterface, MasterServices, Server {
private CatalogTracker catalogTracker;
// Cluster status zk tracker and local setter
private ClusterStatusTracker clusterStatusTracker;
// buffer for "fatal error" notices from region servers
// in the cluster. This is only used for assisting
// operations/debugging.
private MemoryBoundedLogMessageBuffer rsFatals;
// This flag is for stopping this Master instance. It's set when we are
// stopping or aborting
@ -223,6 +229,8 @@ implements HMasterInterface, HMasterRegionInterface, MasterServices, Server {
this.isa = this.rpcServer.getListenerAddress();
this.serverName = new ServerName(this.isa.getHostName(),
this.isa.getPort(), System.currentTimeMillis());
this.rsFatals = new MemoryBoundedLogMessageBuffer(
conf.getLong("hbase.master.buffer.for.rs.fatals", 1*1024*1024));
// initialize server principal (if using secure Hadoop)
User.login(conf, "hbase.master.keytab.file",
@ -759,6 +767,15 @@ implements HMasterInterface, HMasterRegionInterface, MasterServices, Server {
}
}
/**
 * Handles a fatal-error notice from a region server: the notice is written
 * to the master log at ERROR level and appended to the {@code rsFatals}
 * buffer.
 *
 * @param sn serialized name of the reporting region server
 * @param errorText description of the failure, as supplied by the region server
 */
@Override
public void reportRSFatalError(byte [] sn, String errorText) {
  final ServerName reporter = new ServerName(sn);
  final StringBuilder notice = new StringBuilder();
  notice.append("Region server ").append(reporter);
  notice.append(" reported a fatal error:\n").append(errorText);
  final String text = notice.toString();
  LOG.error(text);
  rsFatals.add(text);
}
/**
 * @return {@code true} as long as {@code isStopped()} reports {@code false}
 */
public boolean isMasterRunning() {
  final boolean stopped = isStopped();
  return !stopped;
}
@ -1207,6 +1224,10 @@ implements HMasterInterface, HMasterRegionInterface, MasterServices, Server {
/**
 * @return the {@code AssignmentManager} instance held by this master
 */
public AssignmentManager getAssignmentManager() {
  return assignmentManager;
}
/**
 * @return the buffer to which fatal-error notices reported by region
 *         servers are appended
 */
public MemoryBoundedLogMessageBuffer getRegionServerFatalLogBuffer() {
  return this.rsFatals;
}
@Override
public void shutdown() {

View File

@ -0,0 +1,114 @@
/**
* Copyright 2011 The Apache Software Foundation
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.monitoring;
import java.io.PrintWriter;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.LinkedList;
import java.util.List;
import com.google.common.base.Charsets;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
/**
 * An in-memory store of alert messages whose estimated heap footprint is
 * capped. Messages are kept in insertion order in a linked list; whenever
 * the estimated usage goes over the configured limit, the oldest entries
 * are discarded one at a time until the estimate is back within the limit.
 * <p>
 * Every method that touches the message list is {@code synchronized} on
 * this instance.
 */
public class MemoryBoundedLogMessageBuffer {
  /** Upper bound, in estimated bytes, on what the buffer may retain. */
  private final long maxSizeBytes;
  /** Running total of the estimated size of all retained messages. */
  private long usage = 0;
  /** Retained messages, oldest first. */
  private LinkedList<LogMessage> messages;

  /**
   * Creates an empty buffer.
   *
   * @param maxSizeBytes estimated-heap limit for the buffer; must be positive
   * @throws IllegalArgumentException if {@code maxSizeBytes} is not positive
   */
  public MemoryBoundedLogMessageBuffer(long maxSizeBytes) {
    Preconditions.checkArgument(maxSizeBytes > 0);
    this.messages = Lists.newLinkedList();
    this.maxSizeBytes = maxSizeBytes;
  }

  /**
   * Appends a message stamped with the current time, then drops the oldest
   * messages for as long as the estimated usage exceeds the limit.
   *
   * @param messageText the text to record
   */
  public synchronized void add(String messageText) {
    final LogMessage incoming =
        new LogMessage(messageText, System.currentTimeMillis());
    messages.addLast(incoming);
    usage += incoming.estimateHeapUsage();

    // Evict from the head (oldest) until we are back under the cap.
    while (usage > maxSizeBytes) {
      final LogMessage oldest = messages.removeFirst();
      usage -= oldest.estimateHeapUsage();
      assert usage >= 0;
    }
  }

  /**
   * Writes every retained message to {@code out}, oldest first, one per
   * line, each prefixed with its timestamp and a single space.
   *
   * @param out destination for the dump
   */
  public synchronized void dumpTo(PrintWriter out) {
    final SimpleDateFormat stampFormat =
        new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss");

    for (LogMessage entry : messages) {
      final String stamp = stampFormat.format(new Date(entry.timestamp));
      final String text = new String(entry.message, Charsets.UTF_8);
      out.write(stamp);
      out.write(" ");
      out.println(text);
    }
  }

  /**
   * @return a copy of the retained messages, so callers never see the
   *         internal list
   */
  synchronized List<LogMessage> getMessages() {
    final List<LogMessage> snapshot = Lists.newArrayList(messages);
    return snapshot;
  }

  /**
   * @return the current estimate, in bytes, of the heap held by this buffer
   */
  synchronized long estimateHeapUsage() {
    return usage;
  }

  /** A single recorded message together with the time it was added. */
  private static class LogMessage {
    /**
     * Rough, non-scientific per-entry overhead covering this object and its
     * LinkedList node. Precision is not needed because only a small number
     * of alerts is expected.
     */
    private static final long BASE_USAGE = 100;

    /** The message text, held as UTF-8 bytes to save memory. */
    public final byte[] message;
    /** Milliseconds-since-epoch time at which the message was recorded. */
    public final long timestamp;

    public LogMessage(String message, long timestamp) {
      this.message = message.getBytes(Charsets.UTF_8);
      this.timestamp = timestamp;
    }

    /**
     * @return the encoded text length plus the fixed per-entry overhead
     */
    public long estimateHeapUsage() {
      return BASE_USAGE + message.length;
    }
  }
}

View File

@ -1471,16 +1471,29 @@ public class HRegionServer implements HRegionInterface, HBaseRPCErrorHandler,
* the exception that caused the abort, or null
*/
public void abort(String reason, Throwable cause) {
  // Build the message once so the local log and the report sent to the
  // master carry the same text.
  String msg = "ABORTING region server " + this + ": " + reason;
  // Log the abort exactly once; include the stack trace when we have a cause.
  if (cause != null) {
    LOG.fatal(msg, cause);
  } else {
    LOG.fatal(msg);
  }
  this.abortRequested = true;
  this.reservedSpace.clear();
  if (this.metrics != null) {
    LOG.info("Dump of metrics: " + this.metrics);
  }
  // Do our best to report our abort to the master, but this may not work.
  // Anything thrown here is only logged so that it can never prevent the
  // stop(reason) call below.
  try {
    if (cause != null) {
      msg += "\nCause:\n" + StringUtils.stringifyException(cause);
    }
    if (hbaseMaster != null) {
      hbaseMaster.reportRSFatalError(
          this.serverNameFromMasterPOV.getBytes(), msg);
    }
  } catch (Throwable t) {
    LOG.warn("Unable to report fatal error to master", t);
  }
  stop(reason);
}

View File

@ -0,0 +1,72 @@
/**
* Copyright 2011 The Apache Software Foundation
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.monitoring;
import static org.junit.Assert.*;
import java.io.PrintWriter;
import java.io.StringWriter;
import org.junit.Test;
/**
 * Test case for the MemoryBoundedLogMessageBuffer utility.
 * Ensures that it uses no more memory than it's supposed to,
 * and that it properly deals with multibyte encodings.
 */
public class TestMemoryBoundedLogMessageBuffer {
  private static final long TEN_KB = 10 * 1024;
  private static final String JP_TEXT = "こんにちは";
  /**
   * Line terminator that {@link PrintWriter#println} emits on this platform.
   * The dump assertions must use it rather than a literal "\n", otherwise
   * they fail wherever the separator is "\r\n".
   */
  private static final String EOL = System.getProperty("line.separator");

  /**
   * Adds far more messages than fit in a 10 KB buffer and checks that the
   * estimated usage stays under the limit, that old messages were evicted,
   * and that the most recent message is still present in the dump.
   */
  @Test
  public void testBuffer() {
    MemoryBoundedLogMessageBuffer buf =
      new MemoryBoundedLogMessageBuffer(TEN_KB);

    for (int i = 0; i < 1000; i++) {
      buf.add("hello " + i);
    }
    assertTrue("Usage too big: " + buf.estimateHeapUsage(),
        buf.estimateHeapUsage() < TEN_KB);
    assertTrue("Too many retained: " + buf.getMessages().size(),
        buf.getMessages().size() < 100);

    StringWriter sw = new StringWriter();
    buf.dumpTo(new PrintWriter(sw));
    String dump = sw.toString();
    System.out.println(dump);

    // Match whole lines (text + terminator) so "hello 1" cannot be
    // confused with "hello 1xx".
    assertFalse("The early log messages should be evicted",
        dump.contains("hello 1" + EOL));
    assertTrue("The late log messages should be retained",
        dump.contains("hello 999" + EOL));
  }

  /**
   * Checks that non-ASCII text survives the round trip through the
   * buffer's internal byte encoding.
   */
  @Test
  public void testNonAsciiEncoding() {
    MemoryBoundedLogMessageBuffer buf =
      new MemoryBoundedLogMessageBuffer(TEN_KB);

    buf.add(JP_TEXT);
    StringWriter sw = new StringWriter();
    buf.dumpTo(new PrintWriter(sw));
    String dump = sw.toString();
    assertTrue(dump.contains(JP_TEXT));
  }
}