HBASE-4275 RS should communicate fatal "aborts" back to the master
git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1163345 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
16171a1232
commit
48560e089c
|
@ -422,6 +422,7 @@ Release 0.91.0 - Unreleased
|
|||
HBASE-4291 Improve display of regions in transition in UI to be more
|
||||
readable (todd)
|
||||
HBASE-4281 Add facility to dump current state of all executors (todd)
|
||||
HBASE-4275 RS should communicate fatal "aborts" back to the master (todd)
|
||||
|
||||
TASKS
|
||||
HBASE-3559 Move report of split to master OFF the heartbeat channel
|
||||
|
|
|
@ -61,4 +61,12 @@ public interface HMasterRegionInterface extends VersionedProtocol {
|
|||
*/
|
||||
public void regionServerReport(byte [] sn, HServerLoad hsl)
|
||||
throws IOException;
|
||||
}
|
||||
|
||||
/**
 * Called by a region server to report a fatal error that is causing
 * it to abort.
 * @param sn the reporting server's name in serialized form, as produced by
 * {@link ServerName#getBytes()}
 * @param errorMessage informative text to expose in the master logs and UI
 */
|
||||
public void reportRSFatalError(byte [] sn, String errorMessage);
|
||||
}
|
||||
|
|
|
@ -73,6 +73,7 @@ import org.apache.hadoop.hbase.master.handler.TableDeleteFamilyHandler;
|
|||
import org.apache.hadoop.hbase.master.handler.TableModifyFamilyHandler;
|
||||
import org.apache.hadoop.hbase.master.handler.CreateTableHandler;
|
||||
import org.apache.hadoop.hbase.master.metrics.MasterMetrics;
|
||||
import org.apache.hadoop.hbase.monitoring.MemoryBoundedLogMessageBuffer;
|
||||
import org.apache.hadoop.hbase.monitoring.MonitoredTask;
|
||||
import org.apache.hadoop.hbase.monitoring.TaskMonitor;
|
||||
import org.apache.hadoop.hbase.regionserver.HRegion;
|
||||
|
@ -156,6 +157,11 @@ implements HMasterInterface, HMasterRegionInterface, MasterServices, Server {
|
|||
private CatalogTracker catalogTracker;
|
||||
// Cluster status zk tracker and local setter
|
||||
private ClusterStatusTracker clusterStatusTracker;
|
||||
|
||||
// buffer for "fatal error" notices from region servers
|
||||
// in the cluster. This is only used for assisting
|
||||
// operations/debugging.
|
||||
private MemoryBoundedLogMessageBuffer rsFatals;
|
||||
|
||||
// This flag is for stopping this Master instance. Its set when we are
|
||||
// stopping or aborting
|
||||
|
@ -223,6 +229,8 @@ implements HMasterInterface, HMasterRegionInterface, MasterServices, Server {
|
|||
this.isa = this.rpcServer.getListenerAddress();
|
||||
this.serverName = new ServerName(this.isa.getHostName(),
|
||||
this.isa.getPort(), System.currentTimeMillis());
|
||||
this.rsFatals = new MemoryBoundedLogMessageBuffer(
|
||||
conf.getLong("hbase.master.buffer.for.rs.fatals", 1*1024*1024));
|
||||
|
||||
// initialize server principal (if using secure Hadoop)
|
||||
User.login(conf, "hbase.master.keytab.file",
|
||||
|
@ -759,6 +767,15 @@ implements HMasterInterface, HMasterRegionInterface, MasterServices, Server {
|
|||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reportRSFatalError(byte [] sn, String errorText) {
|
||||
ServerName serverName = new ServerName(sn);
|
||||
String msg = "Region server " + serverName + " reported a fatal error:\n"
|
||||
+ errorText;
|
||||
LOG.error(msg);
|
||||
rsFatals.add(msg);
|
||||
}
|
||||
|
||||
public boolean isMasterRunning() {
|
||||
return !isStopped();
|
||||
}
|
||||
|
@ -1207,6 +1224,10 @@ implements HMasterInterface, HMasterRegionInterface, MasterServices, Server {
|
|||
public AssignmentManager getAssignmentManager() {
|
||||
return this.assignmentManager;
|
||||
}
|
||||
|
||||
public MemoryBoundedLogMessageBuffer getRegionServerFatalLogBuffer() {
|
||||
return rsFatals;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void shutdown() {
|
||||
|
|
|
@ -0,0 +1,114 @@
|
|||
/**
|
||||
* Copyright 2011 The Apache Software Foundation
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hbase.monitoring;
|
||||
|
||||
import java.io.PrintWriter;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.Date;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
|
||||
import com.google.common.base.Charsets;
|
||||
import com.google.common.base.Preconditions;
|
||||
import com.google.common.collect.Lists;
|
||||
|
||||
/**
|
||||
* A size-bounded repository of alerts, which are kept
|
||||
* in a linked list. Alerts can be added, and they will
|
||||
* automatically be removed one by one when the specified heap
|
||||
* usage is exhausted.
|
||||
*/
|
||||
public class MemoryBoundedLogMessageBuffer {
|
||||
private final long maxSizeBytes;
|
||||
private long usage = 0;
|
||||
private LinkedList<LogMessage> messages;
|
||||
|
||||
public MemoryBoundedLogMessageBuffer(long maxSizeBytes) {
|
||||
Preconditions.checkArgument(
|
||||
maxSizeBytes > 0);
|
||||
this.maxSizeBytes = maxSizeBytes;
|
||||
this.messages = Lists.newLinkedList();
|
||||
}
|
||||
|
||||
/**
|
||||
* Append the given message to this buffer, automatically evicting
|
||||
* older messages until the desired memory limit is achieved.
|
||||
*/
|
||||
public synchronized void add(String messageText) {
|
||||
LogMessage message = new LogMessage(messageText, System.currentTimeMillis());
|
||||
|
||||
usage += message.estimateHeapUsage();
|
||||
messages.add(message);
|
||||
while (usage > maxSizeBytes) {
|
||||
LogMessage removed = messages.remove();
|
||||
usage -= removed.estimateHeapUsage();
|
||||
assert usage >= 0;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Dump the contents of the buffer to the given stream.
|
||||
*/
|
||||
public synchronized void dumpTo(PrintWriter out) {
|
||||
SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss");
|
||||
|
||||
for (LogMessage msg : messages) {
|
||||
out.write(df.format(new Date(msg.timestamp)));
|
||||
out.write(" ");
|
||||
out.println(new String(msg.message, Charsets.UTF_8));
|
||||
}
|
||||
}
|
||||
|
||||
synchronized List<LogMessage> getMessages() {
|
||||
// defensive copy
|
||||
return Lists.newArrayList(messages);
|
||||
}
|
||||
|
||||
/**
|
||||
* Estimate the number of bytes this buffer is currently
|
||||
* using.
|
||||
*/
|
||||
synchronized long estimateHeapUsage() {
|
||||
return usage;
|
||||
}
|
||||
|
||||
private static class LogMessage {
|
||||
/** the error text, encoded in bytes to save memory */
|
||||
public final byte[] message;
|
||||
public final long timestamp;
|
||||
|
||||
/**
|
||||
* Completely non-scientific estimate of how much one of these
|
||||
* objects takes, along with the LinkedList overhead. This doesn't
|
||||
* need to be exact, since we don't expect a ton of these alerts.
|
||||
*/
|
||||
private static final long BASE_USAGE=100;
|
||||
|
||||
public LogMessage(String message, long timestamp) {
|
||||
this.message = message.getBytes(Charsets.UTF_8);
|
||||
this.timestamp = timestamp;
|
||||
}
|
||||
|
||||
public long estimateHeapUsage() {
|
||||
return message.length + BASE_USAGE;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -1471,16 +1471,29 @@ public class HRegionServer implements HRegionInterface, HBaseRPCErrorHandler,
|
|||
* the exception that caused the abort, or null
|
||||
*/
|
||||
public void abort(String reason, Throwable cause) {
|
||||
String msg = "ABORTING region server " + this + ": " + reason;
|
||||
if (cause != null) {
|
||||
LOG.fatal("ABORTING region server " + this + ": " + reason, cause);
|
||||
LOG.fatal(msg, cause);
|
||||
} else {
|
||||
LOG.fatal("ABORTING region server " + this + ": " + reason);
|
||||
LOG.fatal(msg);
|
||||
}
|
||||
this.abortRequested = true;
|
||||
this.reservedSpace.clear();
|
||||
if (this.metrics != null) {
|
||||
LOG.info("Dump of metrics: " + this.metrics);
|
||||
}
|
||||
// Do our best to report our abort to the master, but this may not work
|
||||
try {
|
||||
if (cause != null) {
|
||||
msg += "\nCause:\n" + StringUtils.stringifyException(cause);
|
||||
}
|
||||
if (hbaseMaster != null) {
|
||||
hbaseMaster.reportRSFatalError(
|
||||
this.serverNameFromMasterPOV.getBytes(), msg);
|
||||
}
|
||||
} catch (Throwable t) {
|
||||
LOG.warn("Unable to report fatal error to master", t);
|
||||
}
|
||||
stop(reason);
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,72 @@
|
|||
/**
|
||||
* Copyright 2011 The Apache Software Foundation
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hbase.monitoring;
|
||||
|
||||
import static org.junit.Assert.*;
|
||||
|
||||
import java.io.PrintWriter;
|
||||
import java.io.StringWriter;
|
||||
|
||||
import org.junit.Test;
|
||||
|
||||
/**
|
||||
* Test case for the MemoryBoundedLogMessageBuffer utility.
|
||||
* Ensures that it uses no more memory than it's supposed to,
|
||||
* and that it properly deals with multibyte encodings.
|
||||
*/
|
||||
public class TestMemoryBoundedLogMessageBuffer {
|
||||
|
||||
private static final long TEN_KB = 10 * 1024;
|
||||
private static final String JP_TEXT = "こんにちは";
|
||||
|
||||
@Test
|
||||
public void testBuffer() {
|
||||
MemoryBoundedLogMessageBuffer buf =
|
||||
new MemoryBoundedLogMessageBuffer(TEN_KB);
|
||||
|
||||
for (int i = 0; i < 1000; i++) {
|
||||
buf.add("hello " + i);
|
||||
}
|
||||
assertTrue("Usage too big: " + buf.estimateHeapUsage(),
|
||||
buf.estimateHeapUsage() < TEN_KB);
|
||||
assertTrue("Too many retained: " + buf.getMessages().size(),
|
||||
buf.getMessages().size() < 100);
|
||||
StringWriter sw = new StringWriter();
|
||||
buf.dumpTo(new PrintWriter(sw));
|
||||
String dump = sw.toString();
|
||||
System.out.println(dump);
|
||||
assertFalse("The early log messages should be evicted",
|
||||
dump.contains("hello 1\n"));
|
||||
assertTrue("The late log messages should be retained",
|
||||
dump.contains("hello 999\n"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNonAsciiEncoding() {
|
||||
MemoryBoundedLogMessageBuffer buf =
|
||||
new MemoryBoundedLogMessageBuffer(TEN_KB);
|
||||
|
||||
buf.add(JP_TEXT);
|
||||
StringWriter sw = new StringWriter();
|
||||
buf.dumpTo(new PrintWriter(sw));
|
||||
String dump = sw.toString();
|
||||
assertTrue(dump.contains(JP_TEXT));
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue