HBASE-2998 rolling-restart.sh shouldn't rely on zoo.cfg

git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1026470 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael Stack 2010-10-22 19:54:21 +00:00
parent 4b3fc6b14b
commit f36d48b045
15 changed files with 328 additions and 52 deletions

View File

@ -603,6 +603,7 @@ Release 0.21.0 - Unreleased
work w/ new region naming convention from HBASE-2531
HBASE-3140 Rest schema modification throw null pointer exception
(David Worms via Stack)
HBASE-2998 rolling-restart.sh shouldn't rely on zoo.cfg
IMPROVEMENTS

View File

@ -262,7 +262,9 @@ elif [ "$COMMAND" = "zookeeper" ] ; then
HBASE_OPTS="$HBASE_OPTS $HBASE_ZOOKEEPER_OPTS"
fi
elif [ "$COMMAND" = "zkcli" ] ; then
CLASS='org.apache.zookeeper.ZooKeeperMain'
# ZooKeeperMainServerArg returns '-server HOST:PORT' or empty string.
SERVER_ARG=`"$bin"/hbase org.apache.hadoop.hbase.zookeeper.ZooKeeperMainServerArg`
CLASS="org.apache.zookeeper.ZooKeeperMain ${SERVER_ARG}"
elif [ "$COMMAND" = "classpath" ] ; then
echo $CLASSPATH
exit 0

View File

@ -74,7 +74,7 @@ hbase_rotate_log ()
wait_until_done ()
{
p=$1
cnt=${HBASE_SLAVE_TIMEOUT:-60}
cnt=${HBASE_SLAVE_TIMEOUT:-300}
origcnt=$cnt
while kill -0 $p > /dev/null 2>&1; do
if [ $cnt -gt 1 ]; then

View File

@ -264,7 +264,7 @@ public class CatalogTracker {
if (!refresh) {
return current;
}
if (verifyRegionLocation(current, META_REGION)) {
if (verifyRegionLocation(current, this.metaLocation, META_REGION)) {
return current;
}
resetMetaLocation();
@ -278,7 +278,7 @@ public class CatalogTracker {
return null;
}
HRegionInterface newConnection = getCachedConnection(newLocation);
if (verifyRegionLocation(newConnection, META_REGION)) {
if (verifyRegionLocation(newConnection, this.metaLocation, META_REGION)) {
setMetaLocation(newLocation);
return newConnection;
}
@ -402,6 +402,7 @@ public class CatalogTracker {
}
private boolean verifyRegionLocation(HRegionInterface metaServer,
final HServerAddress address,
byte [] regionName)
throws IOException {
if (metaServer == null) {
@ -428,7 +429,8 @@ public class CatalogTracker {
throw e;
}
}
LOG.info("Failed verification of " + Bytes.toString(regionName) + "; " + t);
LOG.info("Failed verification of " + Bytes.toString(regionName) +
" at address=" + address + "; " + t);
return false;
}
@ -452,7 +454,8 @@ public class CatalogTracker {
throw e;
}
return (connection == null)? false:
verifyRegionLocation(connection, HRegionInfo.ROOT_REGIONINFO.getRegionName());
verifyRegionLocation(connection,this.rootRegionTracker.getRootRegionLocation(),
HRegionInfo.ROOT_REGIONINFO.getRegionName());
}
/**
@ -485,9 +488,7 @@ public class CatalogTracker {
LOG.info("-ROOT- is not assigned; continuing");
} else if (hsi.getServerAddress().equals(rootHsa)) {
result.setFirst(true);
LOG.info(hsi.getServerName() + " carrying -ROOT-; deleting " +
"-ROOT- location from meta");
RootLocationEditor.deleteRootLocation(this.zookeeper);
LOG.info(hsi.getServerName() + " carrying -ROOT-; unsetting");
}
HServerAddress metaHsa = getMetaLocation();
if (metaHsa == null) {

View File

@ -170,7 +170,9 @@ public class ExecutorService {
throw new RuntimeException("An executor service with the name " + name +
" is already running (2)!");
}
LOG.debug("Starting executor service: " + name);
LOG.debug("Starting executor service name=" + name +
", corePoolSize=" + hbes.threadPoolExecutor.getCorePoolSize() +
", maxPoolSize=" + hbes.threadPoolExecutor.getMaximumPoolSize());
}
boolean isExecutorServiceRunning(String name) {

View File

@ -37,6 +37,7 @@ import java.util.TreeMap;
import java.util.concurrent.ConcurrentNavigableMap;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.Executors;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@ -45,7 +46,6 @@ import org.apache.hadoop.hbase.Chore;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HServerInfo;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.NotServingRegionException;
import org.apache.hadoop.hbase.Server;
import org.apache.hadoop.hbase.Stoppable;
import org.apache.hadoop.hbase.catalog.CatalogTracker;
@ -67,11 +67,13 @@ import org.apache.hadoop.hbase.zookeeper.ZKAssign;
import org.apache.hadoop.hbase.zookeeper.ZKTableDisable;
import org.apache.hadoop.hbase.zookeeper.ZKUtil;
import org.apache.hadoop.hbase.zookeeper.ZooKeeperListener;
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
import org.apache.hadoop.hbase.zookeeper.ZKUtil.NodeAndData;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.ipc.RemoteException;
import org.apache.zookeeper.AsyncCallback;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.KeeperException.NoNodeException;
import org.apache.zookeeper.data.Stat;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
@ -659,19 +661,27 @@ public class AssignmentManager extends ZooKeeperListener {
}
// Presumption is that only this thread will be updating the state at this
// time; i.e. handlers on backend won't be trying to set it to OPEN, etc.
AtomicInteger counter = new AtomicInteger(0);
CreateUnassignedAsyncCallback cb =
new CreateUnassignedAsyncCallback(this.watcher, destination, counter);
for (RegionState state: states) {
if (!setOfflineInZooKeeper(state)) {
if (!asyncSetOfflineInZooKeeper(state, cb, state)) {
return;
}
}
for (RegionState state: states) {
// Transition RegionState to PENDING_OPEN here in master; means we've
// sent the open. We're a little ahead of ourselves here since we've not
// yet sent out the actual open but putting this state change after the
// call to open risks our writing PENDING_OPEN after state has been moved
// to OPENING by the regionserver.
state.update(RegionState.State.PENDING_OPEN);
// Wait until all unassigned nodes have been put up and watchers set.
int total = regions.size();
for (int oldCounter = 0; true;) {
int count = counter.get();
if (oldCounter != count) {
LOG.info(destination.getServerName() + " unassigned znodes=" + count +
" of total=" + total);
oldCounter = count;
}
if (count == total) break;
Threads.sleep(1);
}
// Move on to open regions.
try {
// Send OPEN RPC. This can fail if the server on other end is is not up.
this.serverManager.sendRegionOpen(destination, regions);
@ -682,6 +692,72 @@ public class AssignmentManager extends ZooKeeperListener {
LOG.debug("Bulk assigning done for " + destination.getServerName());
}
/**
* Callback handler for create unassigned znodes used during bulk assign.
*/
static class CreateUnassignedAsyncCallback implements AsyncCallback.StringCallback {
private final Log LOG = LogFactory.getLog(CreateUnassignedAsyncCallback.class);
private final ZooKeeperWatcher zkw;
private final HServerInfo destination;
private final AtomicInteger counter;
CreateUnassignedAsyncCallback(final ZooKeeperWatcher zkw,
final HServerInfo destination, final AtomicInteger counter) {
this.zkw = zkw;
this.destination = destination;
this.counter = counter;
}
@Override
public void processResult(int rc, String path, Object ctx, String name) {
if (rc != 0) {
// Thisis resultcode. If non-zero, need to resubmit.
LOG.warn("rc != 0 for " + path + " -- retryable connectionloss -- " +
"FIX see http://wiki.apache.org/hadoop/ZooKeeper/FAQ#A2");
this.zkw.abort("Connectionloss writing unassigned at " + path +
", rc=" + rc, null);
return;
}
LOG.debug("rs=" + (RegionState)ctx + ", server=" + this.destination.getServerName());
// Async exists to set a watcher so we'll get triggered when
// unassigned node changes.
this.zkw.getZooKeeper().exists(path, this.zkw,
new ExistsUnassignedAsyncCallback(this.counter), ctx);
}
}
/**
* Callback handler for the exists call that sets watcher on unassigned znodes.
* Used during bulk assign on startup.
*/
static class ExistsUnassignedAsyncCallback implements AsyncCallback.StatCallback {
private final Log LOG = LogFactory.getLog(ExistsUnassignedAsyncCallback.class);
private final AtomicInteger counter;
ExistsUnassignedAsyncCallback(final AtomicInteger counter) {
this.counter = counter;
}
@Override
public void processResult(int rc, String path, Object ctx, Stat stat) {
if (rc != 0) {
// Thisis resultcode. If non-zero, need to resubmit.
LOG.warn("rc != 0 for " + path + " -- retryable connectionloss -- " +
"FIX see http://wiki.apache.org/hadoop/ZooKeeper/FAQ#A2");
return;
}
RegionState state = (RegionState)ctx;
LOG.debug("rs=" + state);
// Transition RegionState to PENDING_OPEN here in master; means we've
// sent the open. We're a little ahead of ourselves here since we've not
// yet sent out the actual open but putting this state change after the
// call to open risks our writing PENDING_OPEN after state has been moved
// to OPENING by the regionserver.
state.update(RegionState.State.PENDING_OPEN);
this.counter.addAndGet(1);
}
}
/**
* @param region
* @return
@ -717,6 +793,10 @@ public class AssignmentManager extends ZooKeeperListener {
*/
private void assign(final RegionState state) {
if (!setOfflineInZooKeeper(state)) return;
if (this.master.isStopped()) {
LOG.debug("Server stopped; skipping assign of " + state);
return;
}
RegionPlan plan = getRegionPlan(state);
if (plan == null) return; // Should get reassigned later when RIT times out.
try {
@ -768,6 +848,31 @@ public class AssignmentManager extends ZooKeeperListener {
return true;
}
/**
* Set region as OFFLINED up in zookeeper asynchronously.
* @param state
* @return True if we succeeded, false otherwise (State was incorrect or failed
* updating zk).
*/
boolean asyncSetOfflineInZooKeeper(final RegionState state,
final AsyncCallback.StringCallback cb, final Object ctx) {
if (!state.isClosed() && !state.isOffline()) {
new RuntimeException("Unexpected state trying to OFFLINE; " + state);
this.master.abort("Unexpected state trying to OFFLINE; " + state,
new IllegalStateException());
return false;
}
state.update(RegionState.State.OFFLINE);
try {
ZKAssign.asyncCreateNodeOffline(master.getZooKeeper(), state.getRegion(),
master.getServerName(), cb, ctx);
} catch (KeeperException e) {
master.abort("Unexpected ZK exception creating/setting node OFFLINE", e);
return false;
}
return true;
}
/**
* @param state
* @return Plan for passed <code>state</code> (If none currently, it creates one or

View File

@ -259,41 +259,31 @@ implements HMasterInterface, HMasterRegionInterface, MasterServices, Server {
this.activeMasterManager = new ActiveMasterManager(zooKeeper, address, this);
this.zooKeeper.registerListener(activeMasterManager);
stallIfBackupMaster(this.conf, this.activeMasterManager);
activeMasterManager.blockUntilBecomingActiveMaster();
this.activeMasterManager.blockUntilBecomingActiveMaster();
// We are either the active master or we were asked to shutdown
if (!this.stopped) {
// We are active master. Finish init and loop until we are closed.
finishInitialization();
loop();
// Once we break out of here, we are being shutdown
// Stop chores
stopChores();
// Wait for all the remaining region servers to report in IFF we were
// running a cluster shutdown AND we were NOT aborting.
if (!this.abort && this.serverManager.isClusterShutdown()) {
this.serverManager.letRegionServersShutdown();
}
stopServiceThreads();
}
// Handle either a backup or active master being stopped
} catch (Throwable t) {
abort("Unhandled exception. Starting shutdown.", t);
} finally {
stopChores();
// Wait for all the remaining region servers to report in IFF we were
// running a cluster shutdown AND we were NOT aborting.
if (!this.abort && this.serverManager.isClusterShutdown()) {
this.serverManager.letRegionServersShutdown();
}
stopServiceThreads();
// Stop services started for both backup and active masters
this.activeMasterManager.stop();
if (this.activeMasterManager != null) this.activeMasterManager.stop();
this.catalogTracker.stop();
this.serverManager.stop();
this.assignmentManager.stop();
HConnectionManager.deleteConnection(this.conf, true);
this.zooKeeper.close();
LOG.info("HMaster main thread exiting");
} catch (Throwable t) {
abort("Unhandled exception. Starting shutdown.", t);
}
LOG.info("HMaster main thread exiting");
}
private void loop() {
@ -543,7 +533,7 @@ implements HMasterInterface, HMasterRegionInterface, MasterServices, Server {
if (LOG.isDebugEnabled()) {
LOG.debug("Stopping service threads");
}
this.rpcServer.stop();
if (this.rpcServer != null) this.rpcServer.stop();
// Clean up and close up shop
if (this.infoServer != null) {
LOG.info("Stopping infoServer");
@ -553,7 +543,7 @@ implements HMasterInterface, HMasterRegionInterface, MasterServices, Server {
ex.printStackTrace();
}
}
this.executorService.shutdown();
if (this.executorService != null) this.executorService.shutdown();
}
private static Thread getAndStartBalancerChore(final HMaster master) {

View File

@ -19,8 +19,6 @@
*/
package org.apache.hadoop.hbase.master.handler;
import java.io.IOException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.HRegionInfo;
@ -104,4 +102,4 @@ public class OpenedRegionHandler extends EventHandler implements TotesHRegionInf
LOG.debug("Opened region " + regionInfo.getRegionNameAsString());
}
}
}
}

View File

@ -67,6 +67,10 @@ public class OpenRegionHandler extends EventHandler {
public void process() throws IOException {
final String name = regionInfo.getRegionNameAsString();
LOG.debug("Processing open of " + name);
if (this.server.isStopped()) {
LOG.info("Server stopping, skipping open of " + name);
return;
}
final String encodedName = regionInfo.getEncodedName();
// TODO: Previously we would check for root region availability (but only that it

View File

@ -26,6 +26,7 @@ import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.executor.RegionTransitionData;
import org.apache.hadoop.hbase.executor.EventHandler.EventType;
import org.apache.zookeeper.AsyncCallback;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.KeeperException.Code;
import org.apache.zookeeper.KeeperException.NoNodeException;
@ -148,6 +149,36 @@ public class ZKAssign {
}
}
/**
* Creates an unassigned node in the OFFLINE state for the specified region.
* <p>
* Runs asynchronously. Depends on no pre-existing znode.
*
* <p>Sets a watcher on the unassigned region node.
*
* @param zkw zk reference
* @param region region to be created as offline
* @param serverName server event originates from
* @param cb
* @param ctx
* @throws KeeperException if unexpected zookeeper exception
* @throws KeeperException.NodeExistsException if node already exists
*/
public static void asyncCreateNodeOffline(ZooKeeperWatcher zkw,
HRegionInfo region, String serverName,
final AsyncCallback.StringCallback cb, final Object ctx)
throws KeeperException {
LOG.debug(zkw.prefix("Async create of unassigned node for " +
region.getEncodedName() + " with OFFLINE state"));
RegionTransitionData data = new RegionTransitionData(
EventType.M_ZK_REGION_OFFLINE, region.getRegionName(), serverName);
synchronized(zkw.getNodes()) {
String node = getNodeName(zkw, region.getEncodedName());
zkw.getNodes().add(node);
ZKUtil.asyncCreate(zkw, node, data.getBytes(), cb, ctx);
}
}
/**
* Forces an existing unassigned node to the OFFLINE state for the specified
* region.

View File

@ -33,7 +33,7 @@ import org.apache.hadoop.hbase.HBaseConfiguration;
public class ZKServerTool {
/**
* Run the tool.
* @param args Command line arguments. First arg is path to zookeepers file.
* @param args Command line arguments.
*/
public static void main(String args[]) {
Configuration conf = HBaseConfiguration.create();
@ -51,4 +51,4 @@ public class ZKServerTool {
}
}
}
}
}

View File

@ -37,6 +37,7 @@ import org.apache.hadoop.hbase.HServerAddress;
import org.apache.hadoop.hbase.HServerInfo;
import org.apache.hadoop.hbase.executor.RegionTransitionData;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.zookeeper.AsyncCallback;
import org.apache.zookeeper.CreateMode;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.Watcher;
@ -809,6 +810,30 @@ public class ZKUtil {
}
}
/**
* Async creates the specified node with the specified data.
*
* <p>Throws an exception if the node already exists.
*
* <p>The node created is persistent and open access.
*
* @param zkw zk reference
* @param znode path of node to create
* @param data data of node to create
* @param cb
* @param ctx
* @return version of node created
* @throws KeeperException if unexpected zookeeper exception
* @throws KeeperException.NodeExistsException if node already exists
*/
public static void asyncCreate(ZooKeeperWatcher zkw,
String znode, byte [] data, final AsyncCallback.StringCallback cb,
final Object ctx)
throws KeeperException, KeeperException.NodeExistsException {
zkw.getZooKeeper().create(znode, data, Ids.OPEN_ACL_UNSAFE,
CreateMode.PERSISTENT, cb, ctx);
}
/**
* Creates the specified node, if the node does not exist. Does not set a
* watch and fails silently if the node already exists.

View File

@ -0,0 +1,68 @@
/**
* Copyright 2010 The Apache Software Foundation
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.zookeeper;
import java.util.Properties;
import java.util.Map.Entry;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
/**
* Tool for reading a ZooKeeper server from HBase XML configuration producing
* the '-server host:port' argument to pass ZooKeeperMain. This program
* emits either '-server HOST:PORT" where HOST is one of the zk ensemble
* members plus zk client port OR it emits '' if no zk servers found (Yes,
* it emits '-server' too).
*/
public class ZooKeeperMainServerArg {
public String parse(final Configuration c) {
// Note that we do not simply grab the property
// HConstants.ZOOKEEPER_QUORUM from the HBaseConfiguration because the
// user may be using a zoo.cfg file.
Properties zkProps = ZKConfig.makeZKProps(c);
String host = null;
String clientPort = null;
for (Entry<Object, Object> entry: zkProps.entrySet()) {
String key = entry.getKey().toString().trim();
String value = entry.getValue().toString().trim();
if (key.startsWith("server.") && host == null) {
String[] parts = value.split(":");
host = parts[0];
} else if (key.endsWith("clientPort")) {
clientPort = value;
}
if (host != null && clientPort != null) break;
}
return host != null && clientPort != null? host + ":" + clientPort: null;
}
/**
* Run the tool.
* @param args Command line arguments. First arg is path to zookeepers file.
*/
public static void main(String args[]) {
Configuration conf = HBaseConfiguration.create();
String hostport = new ZooKeeperMainServerArg().parse(conf);
System.out.println((hostport == null || hostport.length() == 0)? "":
"-server " + hostport);
}
}

View File

@ -47,7 +47,7 @@ import org.apache.zookeeper.ZooKeeper;
* <p>This class also holds and manages the connection to ZooKeeper. Code to
* deal with connection related events and exceptions are handled here.
*/
public class ZooKeeperWatcher implements Watcher {
public class ZooKeeperWatcher implements Watcher, Abortable {
private static final Log LOG = LogFactory.getLog(ZooKeeperWatcher.class);
// Identifiier for this watcher (for logging only). Its made of the prefix
@ -372,4 +372,9 @@ public class ZooKeeperWatcher implements Watcher {
} catch (InterruptedException e) {
}
}
}
@Override
public void abort(String why, Throwable e) {
this.abortable.abort(why, e);
}
}

View File

@ -0,0 +1,44 @@
/**
* Copyright 2010 The Apache Software Foundation
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.zookeeper;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.junit.Test;
public class TestZooKeeperMainServerArg {
private final ZooKeeperMainServerArg parser = new ZooKeeperMainServerArg();
@Test public void test() {
Configuration c = HBaseConfiguration.create();
assertEquals("localhost:" + c.get("hbase.zookeeper.property.clientPort"),
parser.parse(c));
final String port = "1234";
c.set("hbase.zookeeper.property.clientPort", port);
c.set("hbase.zookeeper.quorum", "example.com");
assertEquals("example.com:" + port, parser.parse(c));
c.set("hbase.zookeeper.quorum", "example1.com,example2.com,example3.com");
assertTrue(port, parser.parse(c).matches("example[1-3]\\.com:" + port));
}
}