HBASE-3168 Sanity date and time check when a region server joins the cluster (Jeff Whiting and jgray)

git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1033349 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Jonathan Gray 2010-11-10 04:53:20 +00:00
parent b3d1a0749f
commit 66ca4976a8
8 changed files with 195 additions and 20 deletions

View File

@ -1134,6 +1134,8 @@ Release 0.90.0 - Unreleased
(Gary Helmling via Stack)
HBASE-3209 HBASE-3209 : New Compaction Algorithm
(Nicolas Spiegelberg via Stack)
HBASE-3168 Sanity date and time check when a region server joins the
cluster (Jeff Whiting and jgray)
NEW FEATURES

View File

@ -0,0 +1,33 @@
/**
* Copyright 2010 The Apache Software Foundation
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase;
import java.io.IOException;
/**
 * Raised by the master when the clock skew of a region server is too high.
 */
@SuppressWarnings("serial")
public class ClockOutOfSyncException extends IOException {

  /**
   * Creates the exception with the given detail message.
   *
   * @param message detail message, later available through
   * {@link #getMessage()}
   */
  public ClockOutOfSyncException(final String message) {
    super(message);
  }
}

View File

@ -78,7 +78,9 @@ public interface HBaseRPCProtocolVersion extends VersionedProtocol {
* <li>Version 24: HBASE-2473, create table with regions.</li>
* <li>Version 25: Added openRegion and Stoppable/Abortable to API.</li>
* <li>Version 26: New master and Increment, 0.90 version bump.</li>
* <li>Version 27: HBASE-3168, Added serverCurrentTime to regionServerStartup
* in HMasterRegionInterface.</li>
* </ul>
*/
public static final long versionID = 26L;
public static final long versionID = 27L;
}

View File

@ -40,11 +40,13 @@ public interface HMasterRegionInterface extends HBaseRPCProtocolVersion {
/**
* Called when a region server first starts
* @param info server info
* @param serverCurrentTime The current time of the region server in ms
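* @throws IOException if the master rejects the startup, for example when
* the reported time is too far out of sync with the master clock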
* @return Configuration for the regionserver to use: e.g. filesystem,
* hbase rootdir, etc.
*/
public MapWritable regionServerStartup(HServerInfo info) throws IOException;
public MapWritable regionServerStartup(HServerInfo info,
long serverCurrentTime) throws IOException;
/**
* Called to renew lease, tell master what the region server is doing and to

View File

@ -165,6 +165,7 @@ implements HMasterInterface, HMasterRegionInterface, MasterServices, Server {
private volatile boolean balanceSwitch = true;
private Thread catalogJanitorChore;
private LogCleaner logCleaner;
/**
* Initializes the HMaster. The steps are as follows:
@ -518,6 +519,14 @@ implements HMasterInterface, HMasterRegionInterface, MasterServices, Server {
// tables.
this.executorService.startExecutorService(ExecutorType.MASTER_TABLE_OPERATIONS, 1);
// Start log cleaner thread
String n = Thread.currentThread().getName();
this.logCleaner =
new LogCleaner(conf.getInt("hbase.master.cleaner.interval", 60 * 1000),
this, conf, getMasterFileSystem().getFileSystem(),
getMasterFileSystem().getOldLogDir());
Threads.setDaemonThreadRunning(logCleaner, n + ".oldLogCleaner");
// Put up info server.
int port = this.conf.getInt("hbase.master.info.port", 60010);
if (port >= 0) {
@ -546,6 +555,7 @@ implements HMasterInterface, HMasterRegionInterface, MasterServices, Server {
}
if (this.rpcServer != null) this.rpcServer.stop();
// Clean up and close up shop
this.logCleaner.interrupt();
if (this.infoServer != null) {
LOG.info("Stopping infoServer");
try {
@ -579,7 +589,9 @@ implements HMasterInterface, HMasterRegionInterface, MasterServices, Server {
}
}
public MapWritable regionServerStartup(final HServerInfo serverInfo)
@Override
public MapWritable regionServerStartup(final HServerInfo serverInfo,
final long serverCurrentTime)
throws IOException {
// Set the ip into the passed in serverInfo. Its ip is more than likely
// not the ip that the master sees here. See at end of this method where
@ -591,7 +603,7 @@ implements HMasterInterface, HMasterRegionInterface, MasterServices, Server {
serverInfo.setServerAddress(new HServerAddress(rsAddress,
serverInfo.getServerAddress().getPort()));
// Register with server manager
this.serverManager.regionServerStartup(serverInfo);
this.serverManager.regionServerStartup(serverInfo, serverCurrentTime);
// Send back some config info
MapWritable mw = createConfigurationSubset();
mw.put(new Text("hbase.regionserver.address"), new Text(rsAddress));

View File

@ -31,6 +31,7 @@ import java.util.concurrent.ConcurrentHashMap;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.ClockOutOfSyncException;
import org.apache.hadoop.hbase.HMsg;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HServerAddress;
@ -48,7 +49,6 @@ import org.apache.hadoop.hbase.master.handler.MetaServerShutdownHandler;
import org.apache.hadoop.hbase.master.handler.ServerShutdownHandler;
import org.apache.hadoop.hbase.master.metrics.MasterMetrics;
import org.apache.hadoop.hbase.regionserver.Leases.LeaseStillHeldException;
import org.apache.hadoop.hbase.util.Threads;
/**
* The ServerManager class manages info about region servers - HServerInfo,
@ -84,13 +84,13 @@ public class ServerManager {
private final Server master;
private final MasterServices services;
private final LogCleaner logCleaner;
// Reporting to track master metrics.
private final MasterMetrics metrics;
final DeadServer deadservers = new DeadServer();
private final long maxSkew;
/**
* Constructor.
* @param master
@ -105,20 +105,16 @@ public class ServerManager {
this.services = services;
this.metrics = metrics;
Configuration c = master.getConfiguration();
String n = Thread.currentThread().getName();
this.logCleaner =
new LogCleaner(c.getInt("hbase.master.cleaner.interval", 60 * 1000),
master, c, this.services.getMasterFileSystem().getFileSystem(),
this.services.getMasterFileSystem().getOldLogDir());
Threads.setDaemonThreadRunning(logCleaner, n + ".oldLogCleaner");
maxSkew = c.getLong("hbase.master.maxclockskew", 30000);
}
/**
* Let the server manager know a new regionserver has come online
* @param serverInfo
* @param serverCurrentTime The current time of the region server in ms
* @throws IOException
*/
void regionServerStartup(final HServerInfo serverInfo)
void regionServerStartup(final HServerInfo serverInfo, long serverCurrentTime)
throws IOException {
// Test for case where we get a region startup message from a regionserver
// that has been quickly restarted but whose znode expiration handler has
@ -130,6 +126,7 @@ public class ServerManager {
HServerInfo info = new HServerInfo(serverInfo);
checkIsDead(info.getServerName(), "STARTUP");
checkAlreadySameHostPort(info);
checkClockSkew(info, serverCurrentTime);
recordNewServer(info, false, null);
}
@ -167,6 +164,24 @@ public class ServerManager {
return null;
}
/**
 * Checks the clock skew between the region server and the master. The skew
 * is evaluated in both directions: a region server clock that is ahead of
 * the master is rejected just like one that is behind it.
 * @param serverInfo the region server being checked; only its server name is
 * used, to build the rejection message
 * @param serverCurrentTime The current time of the region server in ms
 * @throws ClockOutOfSyncException if the absolute time difference is larger
 * than <code>maxSkew</code> ms
 */
private void checkClockSkew(final HServerInfo serverInfo,
    final long serverCurrentTime)
throws ClockOutOfSyncException {
  // The signed difference is negative when the region server clock runs
  // ahead of the master, so compare its magnitude against the limit.
  long skew = Math.abs(System.currentTimeMillis() - serverCurrentTime);
  if (skew > maxSkew) {
    String message = "Server " + serverInfo.getServerName() + " has been " +
      "rejected; Reported time is too far out of sync with master. " +
      "Time difference of " + skew + "ms > max allowed of " + maxSkew + "ms";
    LOG.warn(message);
    throw new ClockOutOfSyncException(message);
  }
}
/**
* If this server is on the dead list, reject it with a LeaseStillHeldException
* @param serverName Server name formatted as host_port_startcode.
@ -651,11 +666,9 @@ public class ServerManager {
}
/**
* Stop the ServerManager.
* <p>
* Currently just interrupts the ServerMonitor and LogCleaner chores.
* Stop the ServerManager. Currently does nothing.
*/
public void stop() {
this.logCleaner.interrupt();
}
}

View File

@ -53,6 +53,7 @@ import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.Chore;
import org.apache.hadoop.hbase.ClockOutOfSyncException;
import org.apache.hadoop.hbase.DoNotRetryIOException;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HConstants;
@ -71,6 +72,7 @@ import org.apache.hadoop.hbase.Stoppable;
import org.apache.hadoop.hbase.UnknownRowLockException;
import org.apache.hadoop.hbase.UnknownScannerException;
import org.apache.hadoop.hbase.YouAreDeadException;
import org.apache.hadoop.hbase.HConstants.OperationStatusCode;
import org.apache.hadoop.hbase.catalog.CatalogTracker;
import org.apache.hadoop.hbase.catalog.MetaEditor;
import org.apache.hadoop.hbase.catalog.RootLocationEditor;
@ -110,6 +112,7 @@ import org.apache.hadoop.hbase.regionserver.wal.WALObserver;
import org.apache.hadoop.hbase.replication.regionserver.Replication;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.CompressionTest;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
import org.apache.hadoop.hbase.util.FSUtils;
import org.apache.hadoop.hbase.util.InfoServer;
import org.apache.hadoop.hbase.util.Pair;
@ -120,6 +123,7 @@ import org.apache.hadoop.hbase.zookeeper.ZKUtil;
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.ipc.RemoteException;
import org.apache.hadoop.net.DNS;
import org.apache.zookeeper.KeeperException;
@ -1399,7 +1403,7 @@ public class HRegionServer implements HRegionInterface, HBaseRPCErrorHandler,
* Let the master know we're here Run initialization using parameters passed
* us by the master.
*/
private MapWritable reportForDuty() {
private MapWritable reportForDuty() throws IOException {
HServerAddress masterAddress = null;
while (!stopped && (masterAddress = getMaster()) == null) {
sleeper.sleep();
@ -1417,8 +1421,19 @@ public class HRegionServer implements HRegionInterface, HBaseRPCErrorHandler,
this.serverInfo.getServerAddress());
this.serverInfo.setLoad(buildServerLoad());
LOG.info("Telling master at " + masterAddress + " that we are up");
result = this.hbaseMaster.regionServerStartup(this.serverInfo);
result = this.hbaseMaster.regionServerStartup(this.serverInfo,
EnvironmentEdgeManager.currentTimeMillis());
break;
} catch (RemoteException e) {
IOException ioe = e.unwrapRemoteException();
if (ioe instanceof ClockOutOfSyncException) {
LOG.fatal("Master rejected startup because clock is out of sync",
ioe);
// Re-throw IOE will cause RS to abort
throw ioe;
} else {
LOG.warn("remote error telling master we are up", e);
}
} catch (IOException e) {
LOG.warn("error telling master we are up", e);
} catch (KeeperException e) {

View File

@ -0,0 +1,96 @@
/**
* Copyright 2010 The Apache Software Foundation
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.master;
import junit.framework.Assert;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.ClockOutOfSyncException;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HServerAddress;
import org.apache.hadoop.hbase.HServerInfo;
import org.apache.hadoop.hbase.Server;
import org.apache.hadoop.hbase.catalog.CatalogTracker;
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
import org.junit.Test;
/**
 * Checks that {@link ServerManager} accepts a starting region server that
 * reports the current time and rejects one whose reported time lags by twice
 * the allowed clock skew.
 */
public class TestClockSkewDetection {
  private static final Log LOG =
    LogFactory.getLog(TestClockSkewDetection.class);

  @Test
  public void testClockSkewDetection() throws Exception {
    final Configuration conf = HBaseConfiguration.create();
    // Stub Server that only supplies the configuration; every other accessor
    // returns null or does nothing.
    ServerManager sm = new ServerManager(new Server() {
      @Override
      public CatalogTracker getCatalogTracker() {
        return null;
      }

      @Override
      public Configuration getConfiguration() {
        return conf;
      }

      @Override
      public String getServerName() {
        return null;
      }

      @Override
      public ZooKeeperWatcher getZooKeeper() {
        return null;
      }

      @Override
      public void abort(String why, Throwable e) {}

      @Override
      public boolean isStopped() {
        return false;
      }

      @Override
      public void stop(String why) {
      }}, null, null);

    // A server reporting the current time must be accepted.
    LOG.debug("regionServerStartup 1");
    HServerInfo hsi1 = new HServerInfo(new HServerAddress("example.org:1234"),
      System.currentTimeMillis(), -1, "example.com");
    sm.regionServerStartup(hsi1, System.currentTimeMillis());

    // Read the limit from the same key and default that ServerManager uses,
    // so the test stays valid when the test configuration overrides it.
    final long maxSkew = conf.getLong("hbase.master.maxclockskew", 30000);

    try {
      // A server reporting a time twice the limit in the past must be
      // rejected.
      LOG.debug("regionServerStartup 2");
      HServerInfo hsi2 = new HServerInfo(new HServerAddress("example.org:1235"),
        System.currentTimeMillis(), -1, "example.com");
      sm.regionServerStartup(hsi2, System.currentTimeMillis() - maxSkew * 2);
      Assert.fail("HMaster should have thrown an ClockOutOfSyncException "
        + "but didn't.");
    } catch(ClockOutOfSyncException e) {
      //we want an exception
      LOG.info("Recieved expected exception: "+e);
    }
  }
}