HBASE-14754 TestFastFailWithoutTestUtil failing on trunk now in #testPreemptiveFastFailException50Times

This commit is contained in:
stack 2015-11-03 16:20:55 -08:00
parent c575e18ab8
commit e4bf77e2de
1 changed files with 0 additions and 631 deletions

View File

@ -1,631 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.client;
import static org.junit.Assert.*;
import java.io.EOFException;
import java.io.IOException;
import java.io.SyncFailedException;
import java.net.ConnectException;
import java.net.SocketTimeoutException;
import java.nio.channels.ClosedChannelException;
import java.util.concurrent.Callable;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.DoNotRetryIOException;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HRegionLocation;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.exceptions.ConnectionClosingException;
import org.apache.hadoop.hbase.exceptions.PreemptiveFastFailException;
import org.apache.hadoop.hbase.testclassification.ClientTests;
import org.apache.hadoop.hbase.testclassification.SmallTests;
import org.apache.hadoop.hbase.util.Threads;
import org.apache.hadoop.ipc.RemoteException;
import org.junit.Ignore;
import org.junit.Test;
import org.junit.experimental.categories.Category;
@Category({ SmallTests.class, ClientTests.class })
public class TestFastFailWithoutTestUtil {
private static final Log LOG = LogFactory.getLog(TestFastFailWithoutTestUtil.class);
/**
 * Checks that {@link RetryingCallerInterceptorFactory} hands out the interceptor (and the
 * matching context type) that corresponds to the fast-fail setting in the configuration.
 */
@Test
public void testInterceptorFactoryMethods() {
  // Case 1: fast fail explicitly switched on.
  Configuration conf = HBaseConfiguration.create();
  conf.setBoolean(HConstants.HBASE_CLIENT_FAST_FAIL_MODE_ENABLED, true);
  RetryingCallerInterceptorFactory interceptorFactory =
      new RetryingCallerInterceptorFactory(conf);
  RetryingCallerInterceptor interceptorBeforeCast = interceptorFactory.build();
  assertTrue("We should be getting a PreemptiveFastFailInterceptor",
      interceptorBeforeCast instanceof PreemptiveFastFailInterceptor);
  PreemptiveFastFailInterceptor interceptor =
      (PreemptiveFastFailInterceptor) interceptorBeforeCast;
  RetryingCallerInterceptorContext contextBeforeCast = interceptor.createEmptyContext();
  assertTrue(
      "We should be getting a FastFailInterceptorContext since we are interacting with the"
          + " PreemptiveFastFailInterceptor",
      contextBeforeCast instanceof FastFailInterceptorContext);
  FastFailInterceptorContext context = (FastFailInterceptorContext) contextBeforeCast;
  assertNotNull(context);

  // Case 2: a fresh configuration in which the flag is left untouched.
  // NOTE(review): this relies on fast fail being disabled by default -- confirm.
  conf = HBaseConfiguration.create();
  interceptorFactory = new RetryingCallerInterceptorFactory(conf);
  interceptorBeforeCast = interceptorFactory.build();
  assertTrue(
      "We should be getting a NoOpRetryableCallerInterceptor since we disabled PFFE",
      interceptorBeforeCast instanceof NoOpRetryableCallerInterceptor);
  contextBeforeCast = interceptorBeforeCast.createEmptyContext();
  assertTrue(
      "We should be getting a NoOpRetryingInterceptorContext from NoOpRetryableCallerInterceptor",
      contextBeforeCast instanceof NoOpRetryingInterceptorContext);
  // Check the context created in this half of the test, not the stale one from case 1.
  assertNotNull(contextBeforeCast);
}
/**
 * Verifies that {@code clear()} leaves a {@link FastFailInterceptorContext} in its empty state:
 * no communication failure recorded, no attempt made, no failure info, no server, zero tries.
 */
@Test
public void testInterceptorContextClear() {
  PreemptiveFastFailInterceptor interceptor = createPreemptiveInterceptor();
  FastFailInterceptorContext context =
      (FastFailInterceptorContext) interceptor.createEmptyContext();
  context.clear();
  assertFalse(context.getCouldNotCommunicateWithServer().booleanValue());
  // Use the dedicated assertions and the (expected, actual) order so that a failure is
  // reported the right way round.
  assertFalse(context.didTry());
  assertNull(context.getFailureInfo());
  assertNull(context.getServer());
  assertEquals(0, context.getTries());
}
/**
 * Verifies that both {@code prepare} overloads of {@link FastFailInterceptorContext} pick up
 * the server from the location reported by the callable.
 *
 * @throws IOException declared because {@code prepare} is invoked on the context
 */
@Test
public void testInterceptorContextPrepare() throws IOException {
  PreemptiveFastFailInterceptor interceptor = createPreemptiveInterceptor();
  FastFailInterceptorContext context =
      (FastFailInterceptorContext) interceptor.createEmptyContext();
  RetryingCallable<?> callable = new RegionServerCallable<Boolean>(null, null, null) {
    @Override
    public Boolean call(int callTimeout) throws Exception {
      return true;
    }

    @Override
    protected HRegionLocation getLocation() {
      // Reuse the shared test server instead of repeating its host/port literals here.
      return new HRegionLocation(null, getSomeServerName());
    }
  };
  ServerName server = getSomeServerName();

  context.prepare(callable);
  assertEquals(server, context.getServer());

  // The server must be picked up again after a clear, via the overload that takes a try count.
  context.clear();
  context.prepare(callable, 2);
  assertEquals(server, context.getServer());
}
/**
 * Repeats {@link #testInterceptorIntercept()} fifty times in a row. Currently ignored.
 */
@Ignore @Test
public void testInterceptorIntercept50Times() throws IOException,
    InterruptedException {
  int remainingRuns = 50;
  while (remainingRuns > 0) {
    testInterceptorIntercept();
    remainingRuns--;
  }
}
/**
 * Drives a single client through three failed attempts against the same server and checks the
 * state the {@link PreemptiveFastFailInterceptor} records in the context after each one:
 * <ol>
 * <li>first failure: no failure info attached to the context yet;</li>
 * <li>second failure, after the fast-fail threshold has elapsed: failure info is attached and
 * this caller is allowed to retry despite fast-fail mode;</li>
 * <li>third failure, after the cleanup interval has elapsed and the failure map was cleaned:
 * the context looks like a first failure again.</li>
 * </ol>
 * Not annotated with {@code @Test}; it is driven by {@link #testInterceptorIntercept50Times()}.
 *
 * @throws IOException if the interceptor rejects or rethrows during an attempt
 * @throws InterruptedException if one of the sleeps between attempts is interrupted
 */
public void testInterceptorIntercept() throws IOException,
    InterruptedException {
  // Local names deliberately differ from the instance fields CLEANUP_TIMEOUT and
  // FAST_FAIL_THRESHOLD, which hold different values for the multi-threaded test.
  final long cleanupTimeoutMs = 50;
  final long fastFailThresholdMs = 10;
  Configuration conf = HBaseConfiguration.create();
  conf.setBoolean(HConstants.HBASE_CLIENT_FAST_FAIL_MODE_ENABLED, true);
  conf.setLong(HConstants.HBASE_CLIENT_FAST_FAIL_CLEANUP_MS_DURATION_MS,
      cleanupTimeoutMs);
  conf.setLong(HConstants.HBASE_CLIENT_FAST_FAIL_THREASHOLD_MS,
      fastFailThresholdMs);
  PreemptiveFastFailInterceptor interceptor = createPreemptiveInterceptor(conf);
  FastFailInterceptorContext context =
      (FastFailInterceptorContext) interceptor.createEmptyContext();
  RetryingCallable<?> callable = getDummyRetryingCallable(getSomeServerName());

  // Attempt 0: the very first failure against this server.
  int tries = 0;
  context.prepare(callable, tries);
  interceptor.intercept(context);
  interceptor.handleFailure(context, new ConnectException(
      "Failed to connect to server"));
  interceptor.updateFailureInfo(context);
  assertTrue("Interceptor should have updated didTry to true",
      context.didTry());
  assertTrue(
      "The call shouldn't have been successful if there was a ConnectException",
      context.getCouldNotCommunicateWithServer().booleanValue());
  assertNull(
      "Once a failure is identified, the first time the FailureInfo is generated for the server,"
          + " but it is not assigned to the context yet. It would be assigned on the next"
          + " intercept.", context.getFailureInfo());
  assertEquals(tries, context.getTries());
  assertFalse(
      "We are still in the first attempt and so we dont set this variable to true yet.",
      context.isRetryDespiteFastFailMode());

  // Sleep past the threshold so the server is treated as dead on the next attempt.
  Thread.sleep(fastFailThresholdMs + 1);

  // Attempt 1: second failure, now beyond the fast-fail threshold.
  tries++;
  context.prepare(callable, tries);
  interceptor.intercept(context);
  interceptor.handleFailure(context, new ConnectException(
      "Failed to connect to server"));
  interceptor.updateFailureInfo(context);
  assertTrue("didTry should remain true", context.didTry());
  assertTrue(
      "The call shouldn't have been successful if there was a ConnectException",
      context.getCouldNotCommunicateWithServer().booleanValue());
  assertNotNull(
      "The context this time is updated with a failureInfo, since we already gave it a try.",
      context.getFailureInfo());
  assertEquals(tries, context.getTries());
  assertTrue(
      "Since we are alone here we would be given the permission to retryDespiteFailures.",
      context.isRetryDespiteFastFailMode());
  context.clear();

  // Sleep strictly past the cleanup interval, mirroring the "+ 1" used for the threshold
  // above, so that the entry is old enough to be dropped by the cleanup below.
  // NOTE(review): assumes the interceptor compares elapsed time with a strict '>' -- confirm.
  Thread.sleep(cleanupTimeoutMs + 1);

  // Attempt 2: after the cleanup the server should look brand new again.
  tries++;
  context.prepare(callable, tries);
  interceptor.occasionallyCleanupFailureInformation();
  assertNull("The cleanup should have cleared the server",
      interceptor.repeatedFailuresMap.get(context.getServer()));
  interceptor.intercept(context);
  interceptor.handleFailure(context, new ConnectException(
      "Failed to connect to server"));
  interceptor.updateFailureInfo(context);
  assertTrue("didTry should remain true", context.didTry());
  assertTrue(
      "The call shouldn't have been successful if there was a ConnectException",
      context.getCouldNotCommunicateWithServer().booleanValue());
  assertNull("The failureInfo is cleared off from the maps.",
      context.getFailureInfo());
  assertEquals(tries, context.getTries());
  assertFalse(
      "After the cleanup this counts as a first failure again, so retryDespiteFastFailMode"
          + " should not be set yet.",
      context.isRetryDespiteFastFailMode());
  context.clear();
}
/**
 * Builds a do-nothing callable whose reported location points at the given server.
 *
 * @param someServerName the server the returned callable claims to be talking to
 * @param <T> result type of the callable; {@code call} always returns {@code null}
 * @return a callable suitable for feeding into {@code FastFailInterceptorContext.prepare}
 */
private <T> RetryingCallable<T> getDummyRetryingCallable(
    final ServerName someServerName) {
  return new RegionServerCallable<T>(null, null, null) {
    @Override
    public T call(int callTimeout) throws Exception {
      return null;
    }

    @Override
    protected HRegionLocation getLocation() {
      // Honour the argument; previously the instance field serverName was used here and the
      // parameter was silently ignored.
      return new HRegionLocation(null, someServerName);
    }
  };
}
/**
 * Checks which failures the {@link PreemptiveFastFailInterceptor} classifies as "could not
 * communicate with the server": a set of network-style exceptions must set the flag, while
 * unrelated exceptions must either leave it unset or be rethrown unchanged by the interceptor.
 *
 * @throws IOException if the interceptor rethrows an unexpected I/O failure
 */
@Test
public void testExceptionsIdentifiedByInterceptor() throws IOException {
  Throwable[] networkexceptions = new Throwable[] {
      new ConnectException("Mary is unwell"),
      new SocketTimeoutException("Mike is too late"),
      new ClosedChannelException(),
      new SyncFailedException("Dave is not on the same page"),
      new TimeoutException("Mike is late again"),
      new EOFException("This is the end... "),
      new ConnectionClosingException("Its closing") };
  final String INDUCED = "Induced";
  Throwable[] nonNetworkExceptions = new Throwable[] {
      new IOException("Bob died"),
      new RemoteException("Bob's cousin died", null),
      new NoSuchMethodError(INDUCED), new NullPointerException(INDUCED),
      new DoNotRetryIOException(INDUCED), new Error(INDUCED) };
  Configuration conf = HBaseConfiguration.create();
  // Local names deliberately differ from the instance fields of the same purpose.
  final long cleanupTimeoutMs = 0;
  final long fastFailThresholdMs = 1000000;
  conf.setBoolean(HConstants.HBASE_CLIENT_FAST_FAIL_MODE_ENABLED, true);
  conf.setLong(HConstants.HBASE_CLIENT_FAST_FAIL_CLEANUP_MS_DURATION_MS,
      cleanupTimeoutMs);
  conf.setLong(HConstants.HBASE_CLIENT_FAST_FAIL_THREASHOLD_MS,
      fastFailThresholdMs);

  for (Throwable e : networkexceptions) {
    FastFailInterceptorContext context = failOnceAgainstSomeServer(conf, e);
    assertTrue(
        "The call shouldn't have been successful if there was a ConnectException",
        context.getCouldNotCommunicateWithServer().booleanValue());
  }

  for (Throwable e : nonNetworkExceptions) {
    try {
      FastFailInterceptorContext context = failOnceAgainstSomeServer(conf, e);
      assertFalse(
          "A non-network exception (" + e.getClass().getName()
              + ") must not be treated as a failure to communicate with the server",
          context.getCouldNotCommunicateWithServer().booleanValue());
    } catch (AssertionError failedAssertion) {
      // Must come before the generic Error handler: otherwise a failed assertFalse above would
      // be swallowed and re-reported as a misleading "Exception not induced".
      throw failedAssertion;
    } catch (NoSuchMethodError t) {
      assertInduced(t, INDUCED);
    } catch (NullPointerException t) {
      assertInduced(t, INDUCED);
    } catch (DoNotRetryIOException t) {
      assertInduced(t, INDUCED);
    } catch (Error t) {
      assertInduced(t, INDUCED);
    }
  }
}

/**
 * Runs one prepare / intercept / handleFailure / updateFailureInfo cycle against the shared
 * test server using a fresh interceptor, and returns the resulting context for inspection.
 */
private FastFailInterceptorContext failOnceAgainstSomeServer(Configuration conf,
    Throwable failure) throws IOException {
  PreemptiveFastFailInterceptor interceptor = createPreemptiveInterceptor(conf);
  FastFailInterceptorContext context =
      (FastFailInterceptorContext) interceptor.createEmptyContext();
  RetryingCallable<?> callable = getDummyRetryingCallable(getSomeServerName());
  context.prepare(callable, 0);
  interceptor.intercept(context);
  interceptor.handleFailure(context, failure);
  interceptor.updateFailureInfo(context);
  return context;
}

/** Asserts that a rethrown throwable carries the marker text, tolerating a null message. */
private static void assertInduced(Throwable thrown, String marker) {
  String message = thrown.getMessage();
  assertTrue("Exception not induced", message != null && message.contains(marker));
}
/**
 * Creates a {@link PreemptiveFastFailInterceptor} from the supplied configuration.
 * Note that fast-fail mode is switched on in {@code conf} itself, i.e. the argument is mutated.
 *
 * @param conf configuration to build the interceptor from; fast fail is forced to enabled
 * @return the interceptor built by {@link RetryingCallerInterceptorFactory}, cast to its
 *         fast-fail type
 */
protected static PreemptiveFastFailInterceptor createPreemptiveInterceptor(
    Configuration conf) {
  conf.setBoolean(HConstants.HBASE_CLIENT_FAST_FAIL_MODE_ENABLED, true);
  RetryingCallerInterceptor built = new RetryingCallerInterceptorFactory(conf).build();
  return (PreemptiveFastFailInterceptor) built;
}
/**
 * Convenience overload: creates a {@link PreemptiveFastFailInterceptor} from a freshly created
 * HBase configuration with fast-fail mode enabled.
 *
 * @return a new fast-fail interceptor
 */
static PreemptiveFastFailInterceptor createPreemptiveInterceptor() {
  final Configuration fastFailConf = HBaseConfiguration.create();
  fastFailConf.setBoolean(HConstants.HBASE_CLIENT_FAST_FAIL_MODE_ENABLED, true);
  PreemptiveFastFailInterceptor created = createPreemptiveInterceptor(fastFailConf);
  return created;
}
/**
 * Runs the two-thread fast-fail interleaving scenario fifty times back to back on this test
 * instance, bounded by an overall timeout of 120 seconds.
 */
@Test(timeout = 120000)
public void testPreemptiveFastFailException50Times()
    throws InterruptedException, ExecutionException {
  int remainingRuns = 50;
  while (remainingRuns > 0) {
    testPreemptiveFastFailException();
    remainingRuns--;
  }
}
/***
* This test tries to create a thread interleaving of the 2 threads trying to do a
* Retrying operation using a {@link PreemptiveFastFailInterceptor}. The goal here is to make sure
* that the second thread will be attempting the operation while the first thread is in the
* process of making an attempt after it has marked the server in fast fail.
*
* The thread execution is as follows :
* The PreemptiveFastFailInterceptor is extended in this test to achieve a good interleaving
* behavior without using any thread sleeps.
*
* Privileged Thread 1 NonPrivileged Thread 2
*
* Retry 0 : intercept
*
* Retry 0 : handleFailure
* latches[0].countdown
* latches2[0].await
* latches[0].await
* intercept : Retry 0
*
* handleFailure : Retry 0
*
* updateFailureinfo : Retry 0
* latches2[0].countdown
*
* Retry 0 : updateFailureInfo
*
* Retry 1 : intercept
*
* Retry 1 : handleFailure
* latches[1].countdown
* latches2[1].await
*
* latches[1].await
* intercept : Retry 1
* (throws PFFE)
* handleFailure : Retry 1
*
* updateFailureinfo : Retry 1
* latches2[1].countdown
* Retry 1 : updateFailureInfo
*
*
* See getInterceptor() for more details on the interceptor implementation to make sure this
* thread interleaving is achieved.
*
* We need 2 sets of latches of size MAX_RETRIES. We use an AtomicInteger done to make sure that
* we short circuit the Thread 1 after we hit the PFFE on Thread 2
*
*
* @throws InterruptedException
* @throws ExecutionException
*/
private void testPreemptiveFastFailException() throws InterruptedException,
ExecutionException {
// This method is invoked repeatedly on the same instance (see the 50Times driver), so the
// shared counters and every latch are reset before each run.
LOG.debug("Setting up the counters to start the test");
priviRetryCounter.set(0);
nonPriviRetryCounter.set(0);
done.set(0);
for (int i = 0; i <= RETRIES; i++) {
latches[i] = new CountDownLatch(1);
latches2[i] = new CountDownLatch(1);
}
// Both callers share one interceptor instance, so they operate on the same failure state.
PreemptiveFastFailInterceptor interceptor = getInterceptor();
final RpcRetryingCaller<Void> priviCaller = getRpcRetryingCaller(
PAUSE_TIME, RETRIES, interceptor);
final RpcRetryingCaller<Void> nonPriviCaller = getRpcRetryingCaller(
PAUSE_TIME, RETRIES, interceptor);
LOG.debug("Submitting the thread 1");
// Thread 1 (privileged): reports true only if it ends with RetriesExhaustedException.
Future<Boolean> priviFuture = executor.submit(new Callable<Boolean>() {
@Override
public Boolean call() throws Exception {
try {
isPriviThreadLocal.get().set(true);
priviCaller
.callWithRetries(
getRetryingCallable(serverName, exception),
CLEANUP_TIMEOUT);
} catch (RetriesExhaustedException e) {
return true;
} catch (PreemptiveFastFailException e) {
return false;
}
return false;
}
});
LOG.debug("Submitting the thread 2");
// Thread 2 (non-privileged): reports true only if it is rejected with a
// PreemptiveFastFailException.
Future<Boolean> nonPriviFuture = executor.submit(new Callable<Boolean>() {
@Override
public Boolean call() throws Exception {
try {
isPriviThreadLocal.get().set(false);
nonPriviCaller.callWithRetries(
getRetryingCallable(serverName, exception),
CLEANUP_TIMEOUT);
} catch (PreemptiveFastFailException e) {
return true;
}
return false;
}
});
LOG.debug("Waiting for Thread 2 to finish");
try {
// Bounded wait first; the second get() is only reached once the future has completed.
nonPriviFuture.get(30, TimeUnit.SECONDS);
assertTrue(nonPriviFuture.get());
} catch (TimeoutException e) {
// A timeout is not treated as a test failure here: a thread dump is printed and the test
// carries on.
Threads.printThreadInfo(System.out,
"This should not hang but seems to sometimes...FIX! Here is a thread dump!");
}
LOG.debug("Waiting for Thread 1 to finish");
try {
priviFuture.get(30, TimeUnit.SECONDS);
assertTrue(priviFuture.get());
} catch (TimeoutException e) {
// There is something wrong w/ the latching but don't have time to fix. If timesout, just
// let it go for now till someone has time to look. Meantime, here is thread dump.
Threads.printThreadInfo(System.out,
"This should not hang but seems to sometimes...FIX! Here is a thread dump!");
}
// Now that the server in fast fail mode. Lets try to make contact with the
// server with a third thread. And make sure that when there is no
// exception,
// the fast fail gets cleared up.
assertTrue(interceptor.isServerInFailureMap(serverName));
final RpcRetryingCaller<Void> priviCallerNew = getRpcRetryingCaller(
PAUSE_TIME, RETRIES, interceptor);
// The callable is built with a null exception, so its call() returns normally. This get()
// has no timeout of its own; only the @Test timeout on the driver bounds it.
executor.submit(new Callable<Boolean>() {
@Override
public Boolean call() throws Exception {
priviCallerNew.callWithRetries(
getRetryingCallable(serverName, null), CLEANUP_TIMEOUT);
return false;
}
}).get();
assertFalse("The server was supposed to be removed from the map",
interceptor.isServerInFailureMap(serverName));
}
// Pool that runs the test threads of testPreemptiveFastFailException.
// NOTE(review): nothing in this class shuts this executor down -- consider an @After hook.
ExecutorService executor = Executors.newCachedThreadPool();
/**
 * Some timeouts to make the test execution reasonable.
 */
final int PAUSE_TIME = 10;
final int RETRIES = 3;
final int CLEANUP_TIMEOUT = 10000;
final long FAST_FAIL_THRESHOLD = PAUSE_TIME / 1;
/**
 * The latches necessary to make the thread interleaving possible. Both arrays hold
 * RETRIES + 1 entries and are repopulated at the start of every run of
 * testPreemptiveFastFailException.
 */
final CountDownLatch[] latches = new CountDownLatch[RETRIES + 1];
final CountDownLatch[] latches2 = new CountDownLatch[RETRIES + 1];
// Incremented by the non-privileged thread in the interceptor returned by getInterceptor();
// the privileged thread stops waiting on latches2 once this exceeds 1.
final AtomicInteger done = new AtomicInteger(0);
/**
 * Global retry counters that give us an idea about which iteration of the retry we are in
 */
final AtomicInteger priviRetryCounter = new AtomicInteger();
final AtomicInteger nonPriviRetryCounter = new AtomicInteger();
final ServerName serverName = getSomeServerName();
/**
 * The variable which is used as an identifier within the 2 threads. Each thread sets its own
 * value before calling the retrying caller; a thread that never sets it is seen as privileged
 * because the initial value is true.
 */
public final ThreadLocal<AtomicBoolean> isPriviThreadLocal = new ThreadLocal<AtomicBoolean>() {
@Override
public AtomicBoolean initialValue() {
return new AtomicBoolean(true);
}
};
// The failure thrown by the callables of the two competing threads.
final Exception exception = new ConnectionClosingException("The current connection is closed");
/**
 * Builds a {@link PreemptiveFastFailInterceptor} whose hooks are wrapped with latch operations
 * so that the privileged and the non-privileged thread (told apart via isPriviThreadLocal)
 * interleave in a fixed order:
 * the non-privileged thread waits on latches[] before each intercept, the privileged thread
 * releases latches[] after each handleFailure, the privileged thread waits on latches2[] before
 * updateFailureInfo, and the non-privileged thread releases latches2[] after updateFailureInfo.
 *
 * @return the instrumented interceptor, configured with this class's CLEANUP_TIMEOUT and
 *         FAST_FAIL_THRESHOLD
 */
public PreemptiveFastFailInterceptor getInterceptor() {
final Configuration conf = HBaseConfiguration.create();
conf.setBoolean(HConstants.HBASE_CLIENT_FAST_FAIL_MODE_ENABLED, true);
conf.setLong(HConstants.HBASE_CLIENT_FAST_FAIL_CLEANUP_MS_DURATION_MS,
CLEANUP_TIMEOUT);
conf.setLong(HConstants.HBASE_CLIENT_FAST_FAIL_THREASHOLD_MS,
FAST_FAIL_THRESHOLD);
return new PreemptiveFastFailInterceptor(
conf) {
@Override
public void updateFailureInfo(RetryingCallerInterceptorContext context) {
// For the non-privileged thread, remember (before delegating to super) whether the context
// was NOT granted a retry despite fast-fail mode; this feeds the 'done' counter below.
boolean pffe = false;
if (!isPriviThreadLocal.get().get()) {
pffe = !((FastFailInterceptorContext)context).isRetryDespiteFastFailMode();
}
if (isPriviThreadLocal.get().get()) {
try {
// Thread 2 should be done by 2 iterations. We should short circuit Thread 1 because
// Thread 2 would be dead and can't do a countdown.
if (done.get() <= 1) {
// NOTE(review): priviRetryCounter was already advanced in handleFailure, so if
// handleFailure runs before updateFailureInfo for an attempt, this index is
// attempt + 1 -- confirm against the caller's call order.
latches2[priviRetryCounter.get()].await();
}
} catch (InterruptedException e) {
fail();
}
}
super.updateFailureInfo(context);
if (!isPriviThreadLocal.get().get()) {
if (pffe) done.incrementAndGet();
// Release the privileged thread waiting in its own updateFailureInfo. The counter was
// advanced in intercept(), so the index matches the one the privileged side waits on.
latches2[nonPriviRetryCounter.get()].countDown();
}
}
@Override
public void intercept(RetryingCallerInterceptorContext context)
throws PreemptiveFastFailException {
if (!isPriviThreadLocal.get().get()) {
try {
// Hold the non-privileged thread back until the privileged thread has finished
// handleFailure for the same retry; the counter is advanced as a side effect.
latches[nonPriviRetryCounter.getAndIncrement()].await();
} catch (InterruptedException e) {
fail();
}
}
super.intercept(context);
}
@Override
public void handleFailure(RetryingCallerInterceptorContext context,
Throwable t) throws IOException {
super.handleFailure(context, t);
if (isPriviThreadLocal.get().get()) {
// Let the non-privileged thread proceed with its intercept for this retry, and advance
// the privileged retry counter.
latches[priviRetryCounter.getAndIncrement()].countDown();
}
}
};
}
/**
 * Creates a retrying caller wired to the given interceptor.
 *
 * @param pauseTime pause value handed to {@link RpcRetryingCallerImpl}
 * @param retries retry count handed to {@link RpcRetryingCallerImpl}
 * @param interceptor interceptor consulted by the caller
 * @return an anonymous {@link RpcRetryingCallerImpl} subclass whose {@code callWithRetries}
 *         simply delegates to the superclass
 */
public RpcRetryingCaller<Void> getRpcRetryingCaller(int pauseTime,
    int retries, RetryingCallerInterceptor interceptor) {
  return new RpcRetryingCallerImpl<Void>(pauseTime, retries, interceptor, 9) {
    @Override
    public Void callWithRetries(RetryingCallable<Void> callable,
        int callTimeout) throws IOException, RuntimeException {
      // Pure pass-through to the parent implementation.
      return super.callWithRetries(callable, callTimeout);
    }
  };
}
/**
 * Returns the fixed server name that the tests in this class use as their target.
 *
 * @return the result of {@code ServerName.valueOf("localhost", 1234, 987654321)}
 */
protected static ServerName getSomeServerName() {
  final ServerName fixedTestServer = ServerName.valueOf("localhost", 1234, 987654321);
  return fixedTestServer;
}
/**
 * Builds a callable bound to the given server that either fails with the supplied exception on
 * every call or, when no exception is supplied, completes normally.
 *
 * @param serverName server the callable reports as its location
 * @param e exception to throw from {@code call}; {@code null} makes the call succeed
 * @return the stub callable
 */
private RegionServerCallable<Void> getRetryingCallable(
    final ServerName serverName, final Exception e) {
  return new RegionServerCallable<Void>(null, null, null) {
    @Override
    public Void call(int callTimeout) throws Exception {
      if (e == null) {
        return null;
      }
      throw e;
    }

    @Override
    public void prepare(boolean reload) throws IOException {
      // No lookup is performed: the location is simply pinned to the requested server.
      HRegionLocation pinned =
          new HRegionLocation(HRegionInfo.FIRST_META_REGIONINFO, serverName);
      this.location = pinned;
    }

    @Override
    protected HRegionLocation getLocation() {
      return new HRegionLocation(null, serverName);
    }

    @Override
    public void throwable(Throwable t, boolean retrying) {
      // Intentionally empty.
    }

    @Override
    public long sleep(long pause, int tries) {
      final int nextTry = tries + 1;
      return ConnectionUtils.getPauseTime(pause, nextTry);
    }
  };
}
}