HBASE-3168 Sanity date and time check when a region server joins the cluster (Jeff Whiting and jgray)
git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1033349 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
b3d1a0749f
commit
66ca4976a8
|
@ -1134,6 +1134,8 @@ Release 0.90.0 - Unreleased
|
|||
(Gary Helmling via Stack)
|
||||
HBASE-3209 HBASE-3209 : New Compaction Algorithm
|
||||
(Nicolas Spiegelberg via Stack)
|
||||
HBASE-3168 Sanity date and time check when a region server joins the
|
||||
cluster (Jeff Whiting and jgray)
|
||||
|
||||
|
||||
NEW FEATURES
|
||||
|
|
|
@ -0,0 +1,33 @@
|
|||
/**
|
||||
* Copyright 2010 The Apache Software Foundation
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hbase;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
 * This exception is thrown by the master when a region server clock skew is
 * too high.
 */
public class ClockOutOfSyncException extends IOException {
  // Declared explicitly so the serialized form is pinned, instead of
  // suppressing the "serial" warning for the whole class.
  private static final long serialVersionUID = 1L;

  /**
   * @param message description of why the region server's clock was rejected
   */
  public ClockOutOfSyncException(String message) {
    super(message);
  }
}
|
|
@ -78,7 +78,9 @@ public interface HBaseRPCProtocolVersion extends VersionedProtocol {
|
|||
* <li>Version 24: HBASE-2473, create table with regions.</li>
|
||||
* <li>Version 25: Added openRegion and Stoppable/Abortable to API.</li>
|
||||
* <li>Version 26: New master and Increment, 0.90 version bump.</li>
|
||||
* <li>Version 27: HBASE-3168, Added serverCurrentTime to regionServerStartup
|
||||
* in HMasterRegionInterface.</li>
|
||||
* </ul>
|
||||
*/
|
||||
public static final long versionID = 26L;
|
||||
public static final long versionID = 27L;
|
||||
}
|
||||
|
|
|
@ -40,11 +40,13 @@ public interface HMasterRegionInterface extends HBaseRPCProtocolVersion {
|
|||
/**
|
||||
* Called when a region server first starts
|
||||
* @param info server info
|
||||
* @param serverCurrentTime The current time of the region server in ms
|
||||
* @throws IOException e
|
||||
* @return Configuration for the regionserver to use: e.g. filesystem,
|
||||
* hbase rootdir, etc.
|
||||
*/
|
||||
public MapWritable regionServerStartup(HServerInfo info) throws IOException;
|
||||
public MapWritable regionServerStartup(HServerInfo info,
|
||||
long serverCurrentTime) throws IOException;
|
||||
|
||||
/**
|
||||
* Called to renew lease, tell master what the region server is doing and to
|
||||
|
|
|
@ -165,6 +165,7 @@ implements HMasterInterface, HMasterRegionInterface, MasterServices, Server {
|
|||
private volatile boolean balanceSwitch = true;
|
||||
|
||||
private Thread catalogJanitorChore;
|
||||
private LogCleaner logCleaner;
|
||||
|
||||
/**
|
||||
* Initializes the HMaster. The steps are as follows:
|
||||
|
@ -518,6 +519,14 @@ implements HMasterInterface, HMasterRegionInterface, MasterServices, Server {
|
|||
// tables.
|
||||
this.executorService.startExecutorService(ExecutorType.MASTER_TABLE_OPERATIONS, 1);
|
||||
|
||||
// Start log cleaner thread
|
||||
String n = Thread.currentThread().getName();
|
||||
this.logCleaner =
|
||||
new LogCleaner(conf.getInt("hbase.master.cleaner.interval", 60 * 1000),
|
||||
this, conf, getMasterFileSystem().getFileSystem(),
|
||||
getMasterFileSystem().getOldLogDir());
|
||||
Threads.setDaemonThreadRunning(logCleaner, n + ".oldLogCleaner");
|
||||
|
||||
// Put up info server.
|
||||
int port = this.conf.getInt("hbase.master.info.port", 60010);
|
||||
if (port >= 0) {
|
||||
|
@ -546,6 +555,7 @@ implements HMasterInterface, HMasterRegionInterface, MasterServices, Server {
|
|||
}
|
||||
if (this.rpcServer != null) this.rpcServer.stop();
|
||||
// Clean up and close up shop
|
||||
this.logCleaner.interrupt();
|
||||
if (this.infoServer != null) {
|
||||
LOG.info("Stopping infoServer");
|
||||
try {
|
||||
|
@ -579,7 +589,9 @@ implements HMasterInterface, HMasterRegionInterface, MasterServices, Server {
|
|||
}
|
||||
}
|
||||
|
||||
public MapWritable regionServerStartup(final HServerInfo serverInfo)
|
||||
@Override
|
||||
public MapWritable regionServerStartup(final HServerInfo serverInfo,
|
||||
final long serverCurrentTime)
|
||||
throws IOException {
|
||||
// Set the ip into the passed in serverInfo. Its ip is more than likely
|
||||
// not the ip that the master sees here. See at end of this method where
|
||||
|
@ -591,7 +603,7 @@ implements HMasterInterface, HMasterRegionInterface, MasterServices, Server {
|
|||
serverInfo.setServerAddress(new HServerAddress(rsAddress,
|
||||
serverInfo.getServerAddress().getPort()));
|
||||
// Register with server manager
|
||||
this.serverManager.regionServerStartup(serverInfo);
|
||||
this.serverManager.regionServerStartup(serverInfo, serverCurrentTime);
|
||||
// Send back some config info
|
||||
MapWritable mw = createConfigurationSubset();
|
||||
mw.put(new Text("hbase.regionserver.address"), new Text(rsAddress));
|
||||
|
|
|
@ -31,6 +31,7 @@ import java.util.concurrent.ConcurrentHashMap;
|
|||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.hbase.ClockOutOfSyncException;
|
||||
import org.apache.hadoop.hbase.HMsg;
|
||||
import org.apache.hadoop.hbase.HRegionInfo;
|
||||
import org.apache.hadoop.hbase.HServerAddress;
|
||||
|
@ -48,7 +49,6 @@ import org.apache.hadoop.hbase.master.handler.MetaServerShutdownHandler;
|
|||
import org.apache.hadoop.hbase.master.handler.ServerShutdownHandler;
|
||||
import org.apache.hadoop.hbase.master.metrics.MasterMetrics;
|
||||
import org.apache.hadoop.hbase.regionserver.Leases.LeaseStillHeldException;
|
||||
import org.apache.hadoop.hbase.util.Threads;
|
||||
|
||||
/**
|
||||
* The ServerManager class manages info about region servers - HServerInfo,
|
||||
|
@ -84,13 +84,13 @@ public class ServerManager {
|
|||
private final Server master;
|
||||
private final MasterServices services;
|
||||
|
||||
private final LogCleaner logCleaner;
|
||||
|
||||
// Reporting to track master metrics.
|
||||
private final MasterMetrics metrics;
|
||||
|
||||
final DeadServer deadservers = new DeadServer();
|
||||
|
||||
private final long maxSkew;
|
||||
|
||||
/**
|
||||
* Constructor.
|
||||
* @param master
|
||||
|
@ -105,20 +105,16 @@ public class ServerManager {
|
|||
this.services = services;
|
||||
this.metrics = metrics;
|
||||
Configuration c = master.getConfiguration();
|
||||
String n = Thread.currentThread().getName();
|
||||
this.logCleaner =
|
||||
new LogCleaner(c.getInt("hbase.master.cleaner.interval", 60 * 1000),
|
||||
master, c, this.services.getMasterFileSystem().getFileSystem(),
|
||||
this.services.getMasterFileSystem().getOldLogDir());
|
||||
Threads.setDaemonThreadRunning(logCleaner, n + ".oldLogCleaner");
|
||||
maxSkew = c.getLong("hbase.master.maxclockskew", 30000);
|
||||
}
|
||||
|
||||
/**
|
||||
* Let the server manager know a new regionserver has come online
|
||||
* @param serverInfo
|
||||
* @param serverCurrentTime The current time of the region server in ms
|
||||
* @throws IOException
|
||||
*/
|
||||
void regionServerStartup(final HServerInfo serverInfo)
|
||||
void regionServerStartup(final HServerInfo serverInfo, long serverCurrentTime)
|
||||
throws IOException {
|
||||
// Test for case where we get a region startup message from a regionserver
|
||||
// that has been quickly restarted but whose znode expiration handler has
|
||||
|
@ -130,6 +126,7 @@ public class ServerManager {
|
|||
HServerInfo info = new HServerInfo(serverInfo);
|
||||
checkIsDead(info.getServerName(), "STARTUP");
|
||||
checkAlreadySameHostPort(info);
|
||||
checkClockSkew(info, serverCurrentTime);
|
||||
recordNewServer(info, false, null);
|
||||
}
|
||||
|
||||
|
@ -167,6 +164,24 @@ public class ServerManager {
|
|||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if the clock skew between the server and the master. If the clock
|
||||
* skew is too much it will throw an Exception.
|
||||
* @throws ClockOutOfSyncException
|
||||
*/
|
||||
private void checkClockSkew(final HServerInfo serverInfo,
|
||||
final long serverCurrentTime)
|
||||
throws ClockOutOfSyncException {
|
||||
long skew = System.currentTimeMillis() - serverCurrentTime;
|
||||
if (skew > maxSkew) {
|
||||
String message = "Server " + serverInfo.getServerName() + " has been " +
|
||||
"rejected; Reported time is too far out of sync with master. " +
|
||||
"Time difference of " + skew + "ms > max allowed of " + maxSkew + "ms";
|
||||
LOG.warn(message);
|
||||
throw new ClockOutOfSyncException(message);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* If this server is on the dead list, reject it with a LeaseStillHeldException
|
||||
* @param serverName Server name formatted as host_port_startcode.
|
||||
|
@ -651,11 +666,9 @@ public class ServerManager {
|
|||
}
|
||||
|
||||
/**
|
||||
* Stop the ServerManager.
|
||||
* <p>
|
||||
* Currently just interrupts the ServerMonitor and LogCleaner chores.
|
||||
* Stop the ServerManager. Currently does nothing.
|
||||
*/
|
||||
public void stop() {
|
||||
this.logCleaner.interrupt();
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
@ -53,6 +53,7 @@ import org.apache.hadoop.conf.Configuration;
|
|||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.hbase.Chore;
|
||||
import org.apache.hadoop.hbase.ClockOutOfSyncException;
|
||||
import org.apache.hadoop.hbase.DoNotRetryIOException;
|
||||
import org.apache.hadoop.hbase.HBaseConfiguration;
|
||||
import org.apache.hadoop.hbase.HConstants;
|
||||
|
@ -71,6 +72,7 @@ import org.apache.hadoop.hbase.Stoppable;
|
|||
import org.apache.hadoop.hbase.UnknownRowLockException;
|
||||
import org.apache.hadoop.hbase.UnknownScannerException;
|
||||
import org.apache.hadoop.hbase.YouAreDeadException;
|
||||
import org.apache.hadoop.hbase.HConstants.OperationStatusCode;
|
||||
import org.apache.hadoop.hbase.catalog.CatalogTracker;
|
||||
import org.apache.hadoop.hbase.catalog.MetaEditor;
|
||||
import org.apache.hadoop.hbase.catalog.RootLocationEditor;
|
||||
|
@ -110,6 +112,7 @@ import org.apache.hadoop.hbase.regionserver.wal.WALObserver;
|
|||
import org.apache.hadoop.hbase.replication.regionserver.Replication;
|
||||
import org.apache.hadoop.hbase.util.Bytes;
|
||||
import org.apache.hadoop.hbase.util.CompressionTest;
|
||||
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
|
||||
import org.apache.hadoop.hbase.util.FSUtils;
|
||||
import org.apache.hadoop.hbase.util.InfoServer;
|
||||
import org.apache.hadoop.hbase.util.Pair;
|
||||
|
@ -120,6 +123,7 @@ import org.apache.hadoop.hbase.zookeeper.ZKUtil;
|
|||
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
|
||||
import org.apache.hadoop.io.MapWritable;
|
||||
import org.apache.hadoop.io.Writable;
|
||||
import org.apache.hadoop.ipc.RemoteException;
|
||||
import org.apache.hadoop.net.DNS;
|
||||
import org.apache.zookeeper.KeeperException;
|
||||
|
||||
|
@ -1399,7 +1403,7 @@ public class HRegionServer implements HRegionInterface, HBaseRPCErrorHandler,
|
|||
* Let the master know we're here Run initialization using parameters passed
|
||||
* us by the master.
|
||||
*/
|
||||
private MapWritable reportForDuty() {
|
||||
private MapWritable reportForDuty() throws IOException {
|
||||
HServerAddress masterAddress = null;
|
||||
while (!stopped && (masterAddress = getMaster()) == null) {
|
||||
sleeper.sleep();
|
||||
|
@ -1417,8 +1421,19 @@ public class HRegionServer implements HRegionInterface, HBaseRPCErrorHandler,
|
|||
this.serverInfo.getServerAddress());
|
||||
this.serverInfo.setLoad(buildServerLoad());
|
||||
LOG.info("Telling master at " + masterAddress + " that we are up");
|
||||
result = this.hbaseMaster.regionServerStartup(this.serverInfo);
|
||||
result = this.hbaseMaster.regionServerStartup(this.serverInfo,
|
||||
EnvironmentEdgeManager.currentTimeMillis());
|
||||
break;
|
||||
} catch (RemoteException e) {
|
||||
IOException ioe = e.unwrapRemoteException();
|
||||
if (ioe instanceof ClockOutOfSyncException) {
|
||||
LOG.fatal("Master rejected startup because clock is out of sync",
|
||||
ioe);
|
||||
// Re-throw IOE will cause RS to abort
|
||||
throw ioe;
|
||||
} else {
|
||||
LOG.warn("remote error telling master we are up", e);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
LOG.warn("error telling master we are up", e);
|
||||
} catch (KeeperException e) {
|
||||
|
|
|
@ -0,0 +1,96 @@
|
|||
/**
|
||||
* Copyright 2010 The Apache Software Foundation
|
||||
*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hbase.master;
|
||||
|
||||
import junit.framework.Assert;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.hbase.ClockOutOfSyncException;
|
||||
import org.apache.hadoop.hbase.HBaseConfiguration;
|
||||
import org.apache.hadoop.hbase.HServerAddress;
|
||||
import org.apache.hadoop.hbase.HServerInfo;
|
||||
import org.apache.hadoop.hbase.Server;
|
||||
import org.apache.hadoop.hbase.catalog.CatalogTracker;
|
||||
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
|
||||
import org.junit.Test;
|
||||
|
||||
|
||||
public class TestClockSkewDetection {
|
||||
private static final Log LOG =
|
||||
LogFactory.getLog(TestClockSkewDetection.class);
|
||||
|
||||
@Test
|
||||
public void testClockSkewDetection() throws Exception {
|
||||
final Configuration conf = HBaseConfiguration.create();
|
||||
ServerManager sm = new ServerManager(new Server() {
|
||||
@Override
|
||||
public CatalogTracker getCatalogTracker() {
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Configuration getConfiguration() {
|
||||
return conf;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getServerName() {
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public ZooKeeperWatcher getZooKeeper() {
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void abort(String why, Throwable e) {}
|
||||
|
||||
@Override
|
||||
public boolean isStopped() {
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void stop(String why) {
|
||||
}}, null, null);
|
||||
|
||||
LOG.debug("regionServerStartup 1");
|
||||
HServerInfo hsi1 = new HServerInfo(new HServerAddress("example.org:1234"),
|
||||
System.currentTimeMillis(), -1, "example.com");
|
||||
sm.regionServerStartup(hsi1, System.currentTimeMillis());
|
||||
|
||||
long maxSkew = 30000;
|
||||
|
||||
try {
|
||||
LOG.debug("regionServerStartup 2");
|
||||
HServerInfo hsi2 = new HServerInfo(new HServerAddress("example.org:1235"),
|
||||
System.currentTimeMillis(), -1, "example.com");
|
||||
sm.regionServerStartup(hsi2, System.currentTimeMillis() - maxSkew * 2);
|
||||
Assert.assertTrue("HMaster should have thrown an ClockOutOfSyncException "
|
||||
+ "but didn't.", false);
|
||||
} catch(ClockOutOfSyncException e) {
|
||||
//we want an exception
|
||||
LOG.info("Recieved expected exception: "+e);
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue