HBASE-11574. Addendum that should fix a unit test (TestMetaWithReplicas#testChangingReplicaCount) that fails once in a while

This commit is contained in:
Devaraj Das 2015-01-29 00:27:08 -08:00
parent 15a4738470
commit 7c8aa2e963
3 changed files with 40 additions and 24 deletions

View File

@ -109,6 +109,7 @@ import org.apache.hadoop.hbase.monitoring.MonitoredTask;
import org.apache.hadoop.hbase.monitoring.TaskMonitor; import org.apache.hadoop.hbase.monitoring.TaskMonitor;
import org.apache.hadoop.hbase.procedure.MasterProcedureManagerHost; import org.apache.hadoop.hbase.procedure.MasterProcedureManagerHost;
import org.apache.hadoop.hbase.procedure.flush.MasterFlushTableProcedureManager; import org.apache.hadoop.hbase.procedure.flush.MasterFlushTableProcedureManager;
import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.RegionServerInfo; import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.RegionServerInfo;
import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos.SplitLogTask.RecoveryMode; import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos.SplitLogTask.RecoveryMode;
import org.apache.hadoop.hbase.quotas.MasterQuotaManager; import org.apache.hadoop.hbase.quotas.MasterQuotaManager;
@ -124,6 +125,7 @@ import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.CompressionTest; import org.apache.hadoop.hbase.util.CompressionTest;
import org.apache.hadoop.hbase.util.EncryptionTest; import org.apache.hadoop.hbase.util.EncryptionTest;
import org.apache.hadoop.hbase.util.FSUtils; import org.apache.hadoop.hbase.util.FSUtils;
import org.apache.hadoop.hbase.util.HBaseFsckRepair;
import org.apache.hadoop.hbase.util.HFileArchiveUtil; import org.apache.hadoop.hbase.util.HFileArchiveUtil;
import org.apache.hadoop.hbase.util.HasThread; import org.apache.hadoop.hbase.util.HasThread;
import org.apache.hadoop.hbase.util.Pair; import org.apache.hadoop.hbase.util.Pair;
@ -799,7 +801,10 @@ public class HMaster extends HRegionServer implements MasterServices, Server {
int replicaId = zooKeeper.getMetaReplicaIdFromZnode(metaReplicaZnode); int replicaId = zooKeeper.getMetaReplicaIdFromZnode(metaReplicaZnode);
if (replicaId >= numMetaReplicasConfigured) { if (replicaId >= numMetaReplicasConfigured) {
RegionState r = MetaTableLocator.getMetaRegionState(zkw, replicaId); RegionState r = MetaTableLocator.getMetaRegionState(zkw, replicaId);
serverManager.sendRegionClose(r.getServerName(), r.getRegion()); LOG.info("Closing excess replica of meta region " + r.getRegion());
// send a close and wait for a max of 30 seconds
ServerManager.closeRegionSilentlyAndWait(getConnection(), r.getServerName(),
r.getRegion(), 30000);
ZKUtil.deleteNode(zkw, zkw.getZNodeForReplica(replicaId)); ZKUtil.deleteNode(zkw, zkw.getZNodeForReplica(replicaId));
} }
} }

View File

@ -39,6 +39,7 @@ import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.ClockOutOfSyncException; import org.apache.hadoop.hbase.ClockOutOfSyncException;
import org.apache.hadoop.hbase.HRegionInfo; import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.NotServingRegionException;
import org.apache.hadoop.hbase.RegionLoad; import org.apache.hadoop.hbase.RegionLoad;
import org.apache.hadoop.hbase.Server; import org.apache.hadoop.hbase.Server;
import org.apache.hadoop.hbase.ServerLoad; import org.apache.hadoop.hbase.ServerLoad;
@ -767,6 +768,35 @@ public class ServerManager {
return sendRegionClose(server, region, null); return sendRegionClose(server, region, null);
} }
/**
 * Asks a region server to close a region, then polls that server until it no
 * longer reports the region or until the timeout elapses.
 * <p>
 * Both the close request and the follow-up polls go straight to the region
 * server through the admin stub returned by {@code connection.getAdmin(server)}.
 * A failure of the close request itself is only logged ("silently"); the outcome
 * is decided by the polling loop.
 *
 * @param connection connection used to obtain the admin stub for {@code server}
 * @param server region server to contact
 * @param region region to close
 * @param timeout maximum time to wait for the region to close, in milliseconds
 * @throws IOException if the server still reports the region when the timeout
 *         expires; an IOException raised by {@code connection.getAdmin} is not
 *         caught here and propagates as well
 * @throws InterruptedException if the thread is interrupted while sleeping
 *         between polls
 */
public static void closeRegionSilentlyAndWait(ClusterConnection connection,
    ServerName server, HRegionInfo region, long timeout) throws IOException, InterruptedException {
  AdminService.BlockingInterface rs = connection.getAdmin(server);
  try {
    ProtobufUtil.closeRegion(rs, server, region.getRegionName());
  } catch (IOException e) {
    LOG.warn("Exception when closing region: " + region.getRegionNameAsString(), e);
  }
  long expiration = timeout + System.currentTimeMillis();
  // Poll first, then sleep: the region state is checked at least once (even for a
  // non-positive timeout) and once more after the final sleep, so a region that
  // closes during the last wait interval is not misreported as a timeout.
  while (true) {
    try {
      HRegionInfo rsRegion =
        ProtobufUtil.getRegionInfo(rs, region.getRegionName());
      if (rsRegion == null) {
        return;
      }
    } catch (NotServingRegionException nsre) {
      // no need to retry again
      return;
    } catch (IOException ioe) {
      LOG.warn("Exception when retrieving regioninfo from: " + region.getRegionNameAsString(), ioe);
    }
    long remaining = expiration - System.currentTimeMillis();
    if (remaining <= 0) {
      break;
    }
    // Never sleep past the deadline.
    Thread.sleep(Math.min(1000L, remaining));
  }
  throw new IOException("Region " + region + " failed to close within"
      + " timeout " + timeout);
}
/** /**
* Sends an MERGE REGIONS RPC to the specified server to merge the specified * Sends an MERGE REGIONS RPC to the specified server to merge the specified
* regions. * regions.

View File

@ -31,6 +31,7 @@ import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.ZooKeeperConnectionException; import org.apache.hadoop.hbase.ZooKeeperConnectionException;
import org.apache.hadoop.hbase.classification.InterfaceAudience; import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.hbase.client.Admin; import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.ClusterConnection;
import org.apache.hadoop.hbase.client.Connection; import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory; import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.HConnection; import org.apache.hadoop.hbase.client.HConnection;
@ -38,8 +39,7 @@ import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put; import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Table; import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.master.RegionState; import org.apache.hadoop.hbase.master.RegionState;
import org.apache.hadoop.hbase.protobuf.ProtobufUtil; import org.apache.hadoop.hbase.master.ServerManager;
import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.AdminService;
import org.apache.hadoop.hbase.regionserver.HRegion; import org.apache.hadoop.hbase.regionserver.HRegion;
import org.apache.zookeeper.KeeperException; import org.apache.zookeeper.KeeperException;
@ -153,29 +153,10 @@ public class HBaseFsckRepair {
@SuppressWarnings("deprecation") @SuppressWarnings("deprecation")
public static void closeRegionSilentlyAndWait(HConnection connection, public static void closeRegionSilentlyAndWait(HConnection connection,
ServerName server, HRegionInfo region) throws IOException, InterruptedException { ServerName server, HRegionInfo region) throws IOException, InterruptedException {
AdminService.BlockingInterface rs = connection.getAdmin(server);
try {
ProtobufUtil.closeRegion(rs, server, region.getRegionName());
} catch (IOException e) {
LOG.warn("Exception when closing region: " + region.getRegionNameAsString(), e);
}
long timeout = connection.getConfiguration() long timeout = connection.getConfiguration()
.getLong("hbase.hbck.close.timeout", 120000); .getLong("hbase.hbck.close.timeout", 120000);
long expiration = timeout + System.currentTimeMillis(); ServerManager.closeRegionSilentlyAndWait((ClusterConnection)connection, server,
while (System.currentTimeMillis() < expiration) { region, timeout);
try {
HRegionInfo rsRegion =
ProtobufUtil.getRegionInfo(rs, region.getRegionName());
if (rsRegion == null) return;
} catch (IOException ioe) {
if (ioe instanceof NotServingRegionException) // no need to retry again
return;
LOG.warn("Exception when retrieving regioninfo from: " + region.getRegionNameAsString(), ioe);
}
Thread.sleep(1000);
}
throw new IOException("Region " + region + " failed to close within"
+ " timeout " + timeout);
} }
/** /**