HBASE-2421 Put hangs for 10 retries on failed region servers; forward-port from branch

git-svn-id: https://svn.apache.org/repos/asf/hadoop/hbase/trunk@945395 13f79535-47bb-0310-9956-ffa450edef68
Michael Stack 2010-05-17 21:36:02 +00:00
parent 23ee15f04e
commit afd149c7aa
6 changed files with 76 additions and 17 deletions

View File

@@ -325,6 +325,7 @@ Release 0.21.0 - Unreleased
    HBASE-2382 Don't rely on fs.getDefaultReplication() to roll HLogs
               (Nicolas Spiegelberg via Stack)
    HBASE-2415 Disable META splitting in 0.20 (Todd Lipcon via Stack)
+   HBASE-2421 Put hangs for 10 retries on failed region servers
 
 IMPROVEMENTS
    HBASE-1760 Cleanup TODOs in HTable

View File

@@ -192,7 +192,7 @@ public interface HConnection {
    * @throws IOException if a remote or network exception occurs
    * @throws RuntimeException other unspecified error
    */
-  public <T> T getRegionServerForWithoutRetries(ServerCallable<T> callable)
+  public <T> T getRegionServerWithoutRetries(ServerCallable<T> callable)
   throws IOException, RuntimeException;
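
Side note on the renamed method: after this change a failure on the single attempt is translated and rethrown instead of being swallowed. The following is only a minimal caller-side sketch of that fail-fast shape, with invented names (NoRetrySketch, callWithoutRetries); it is not code from this patch.

// Hypothetical sketch; not the HBase implementation.
import java.io.IOException;
import java.util.concurrent.Callable;

final class NoRetrySketch {
  // One attempt only: IOExceptions surface to the caller, anything else is wrapped unchecked.
  static <T> T callWithoutRetries(Callable<T> callable) throws IOException {
    try {
      return callable.call();
    } catch (Throwable t) {
      if (t instanceof IOException) {
        throw (IOException) t;        // e.g. a dead region server fails fast
      }
      throw new RuntimeException(t);
    }
  }

  public static void main(String[] args) throws IOException {
    System.out.println(callWithoutRetries(() -> "multiPut ok"));
  }
}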

View File

@@ -680,7 +680,7 @@ public class HConnectionManager implements HConstants {
         // This block guards against two threads trying to load the meta
         // region at the same time. The first will load the meta region and
         // the second will use the value that the first one found.
-        synchronized(regionLockObject) {
+        synchronized (regionLockObject) {
           // Check the cache again for a hit in case some other thread made the
           // same query while we were waiting on the lock. If not supposed to
           // be using the cache, delete any existing cached location so it won't
@@ -1077,15 +1077,19 @@ public class HConnectionManager implements HConstants {
       return null;
     }
 
-    public <T> T getRegionServerForWithoutRetries(ServerCallable<T> callable)
+    public <T> T getRegionServerWithoutRetries(ServerCallable<T> callable)
     throws IOException, RuntimeException {
       try {
         callable.instantiateServer(false);
         return callable.call();
       } catch (Throwable t) {
-        t = translateException(t);
+        Throwable t2 = translateException(t);
+        if (t2 instanceof IOException) {
+          throw (IOException)t2;
+        } else {
+          throw new RuntimeException(t2);
+        }
       }
-      return null;
     }
 
     @SuppressWarnings({"ConstantConditions"})
@@ -1299,9 +1303,25 @@ public class HConnectionManager implements HConstants {
       }
     }
 
-    @SuppressWarnings({"ConstantConditions"})
+    /**
+     * Process a batch of Puts on the given executor service.
+     *
+     * @param list the puts to make - successful puts will be removed.
+     * @param pool thread pool to execute requests on
+     *
+     * In the case of an exception, we take different actions depending on the
+     * situation:
+     *  - If the exception is a DoNotRetryException, we rethrow it and leave the
+     *    'list' parameter in an indeterminate state.
+     *  - If the 'list' parameter is a singleton, we directly throw the specific
+     *    exception for that put.
+     *  - Otherwise, we throw a generic exception indicating that an error occurred.
+     *    The 'list' parameter is mutated to contain those puts that did not succeed.
+     */
     public void processBatchOfPuts(List<Put> list,
         final byte[] tableName, ExecutorService pool) throws IOException {
+      boolean singletonList = list.size() == 1;
+      Throwable singleRowCause = null;
       for ( int tries = 0 ; tries < numRetries && !list.isEmpty(); ++tries) {
         Collections.sort(list);
         Map<HServerAddress, MultiPut> regionPuts =
@@ -1367,10 +1387,19 @@ public class HConnectionManager implements HConstants {
             LOG.debug("Failed all from " + request.address, e);
             failed.addAll(request.allPuts());
           } catch (ExecutionException e) {
-            System.out.println(e);
             // all go into the failed list.
             LOG.debug("Failed all from " + request.address, e);
             failed.addAll(request.allPuts());
+
+            // Just give up, leaving the batch put list in an untouched/semi-committed state
+            if (e.getCause() instanceof DoNotRetryIOException) {
+              throw (DoNotRetryIOException) e.getCause();
+            }
+
+            if (singletonList) {
+              // be richer for reporting in a 1 row case.
+              singleRowCause = e.getCause();
+            }
           }
         }
         list.clear();
@@ -1391,9 +1420,13 @@ public class HConnectionManager implements HConstants {
         }
       }
       if (!list.isEmpty()) {
+        if (singletonList && singleRowCause != null) {
+          throw new IOException(singleRowCause);
+        }
+
         // ran out of retries and didnt succeed everything!
         throw new RetriesExhaustedException("Still had " + list.size() + " puts left after retrying " +
-            numRetries + " times. Should have detail on which Regions failed the most");
+            numRetries + " times.");
       }
     }
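
To make the javadoc's contract concrete from the caller's side, here is a small self-contained sketch with made-up names (processBatch stands in for processBatchOfPuts, plain strings stand in for Put): for a multi-row batch the list ends up holding only the puts that did not go through, while a single-row batch would instead rethrow that row's specific cause.

// Illustrative stand-in for the batch-put error contract; not the patch's code.
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

final class BatchPutContractSketch {
  // Pretend half the puts succeed (and are removed), then fail the rest.
  static void processBatch(List<String> puts) throws IOException {
    puts.subList(0, puts.size() / 2).clear();
    throw new IOException("retries exhausted; " + puts.size() + " puts left");
  }

  public static void main(String[] args) {
    List<String> puts = new ArrayList<>(List.of("row1", "row2", "row3", "row4"));
    try {
      processBatch(puts);
    } catch (IOException e) {
      // Multi-row case: 'puts' now contains only the rows that still need flushing.
      System.err.println(e.getMessage() + " -> " + puts);
    }
  }
}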
@@ -1404,7 +1437,7 @@ public class HConnectionManager implements HConstants {
       final HConnection connection = this;
       return new Callable<MultiPutResponse>() {
         public MultiPutResponse call() throws IOException {
-          return getRegionServerWithRetries(
+          return getRegionServerWithoutRetries(
               new ServerCallable<MultiPutResponse>(connection, tableName, null) {
                 public MultiPutResponse call() throws IOException {
                   MultiPutResponse resp = server.multiPut(puts);
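
The switch above from getRegionServerWithRetries to getRegionServerWithoutRetries keeps all retrying at the batch level, so a dead region server costs one failed attempt per round of the outer loop instead of a nested retry loop of its own. A toy sketch of that layering, with invented names:

// Single-level retrying, illustrative only.
import java.util.concurrent.Callable;

final class RetryLayeringSketch {
  // The outer batch loop retries; each per-server call is attempted exactly once.
  static <T> T withRetries(Callable<T> oneAttempt, int numRetries) throws Exception {
    Exception last = null;
    for (int tries = 0; tries < numRetries; tries++) {
      try {
        return oneAttempt.call();
      } catch (Exception e) {
        last = e;                     // the real client regroups the batch and tries again
      }
    }
    throw last;
  }

  public static void main(String[] args) throws Exception {
    System.out.println(withRetries(() -> "multiPut response", 10));
  }
}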

View File

@@ -542,8 +542,8 @@ public class HTable implements HTableInterface {
     } finally {
       // the write buffer was adjusted by processBatchOfPuts
       currentWriteBufferSize = 0;
-      for (Put aWriteBuffer : writeBuffer) {
-        currentWriteBufferSize += aWriteBuffer.heapSize();
+      for (Put aPut : writeBuffer) {
+        currentWriteBufferSize += aPut.heapSize();
       }
     }
   }
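
For completeness, the finally block above just rebuilds the buffered-size counter from whatever puts remain after the flush attempt. A tiny standalone sketch of that bookkeeping, with stand-in types (FakePut instead of Put):

// Only the summing pattern mirrors the hunk above; the types are invented.
import java.util.List;

final class WriteBufferSketch {
  record FakePut(long heapSize) { }

  static long recomputeWriteBufferSize(List<FakePut> writeBuffer) {
    long currentWriteBufferSize = 0;
    for (FakePut aPut : writeBuffer) {
      currentWriteBufferSize += aPut.heapSize();   // bytes still waiting to be flushed
    }
    return currentWriteBufferSize;
  }

  public static void main(String[] args) {
    System.out.println(recomputeWriteBufferSize(List.of(new FakePut(128), new FakePut(256))));
  }
}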

View File

@@ -233,7 +233,7 @@ public class HRegionServer implements HConstants, HRegionInterface,
   // Run HDFS shutdown on exit if this is set. We clear this out when
   // doing a restart() to prevent closing of HDFS.
-  private final AtomicBoolean shutdownHDFS = new AtomicBoolean(true);
+  public final AtomicBoolean shutdownHDFS = new AtomicBoolean(true);
 
   private final String machineName;

View File

@@ -20,15 +20,15 @@
 package org.apache.hadoop.hbase;
 
+import org.apache.hadoop.hbase.client.Get;
+import org.apache.hadoop.hbase.client.HBaseAdmin;
 import org.apache.hadoop.hbase.client.HTable;
 import org.apache.hadoop.hbase.client.Put;
-import org.apache.hadoop.hbase.client.Get;
 import org.apache.hadoop.hbase.client.Result;
-import org.apache.hadoop.hbase.client.HBaseAdmin;
 import org.apache.hadoop.hbase.util.Bytes;
 
-import java.util.List;
 import java.util.ArrayList;
+import java.util.List;
 
 public class TestMultiParallelPut extends MultiRegionTable {
 
   private static final byte[] VALUE = Bytes.toBytes("value");
@@ -58,7 +58,14 @@ public class TestMultiParallelPut extends MultiRegionTable {
   List<byte[]> keys = new ArrayList<byte[]>();
 
-  public void testMultiPut() throws Exception {
+  public void testParallelPut() throws Exception {
+    doATest(false);
+  }
+
+  public void testParallelPutWithRSAbort() throws Exception {
+    doATest(true);
+  }
+
+  public void doATest(boolean doAbort) throws Exception {
     HTable table = new HTable(TEST_TABLE);
     table.setAutoFlush(false);
@@ -73,6 +80,19 @@ public class TestMultiParallelPut extends MultiRegionTable {
     table.flushCommits();
 
+    if (doAbort) {
+      cluster.abortRegionServer(0);
+
+      // try putting more keys after the abort.
+      for ( byte [] k : keys ) {
+        Put put = new Put(k);
+        put.add(BYTES_FAMILY, QUALIFIER, VALUE);
+        table.put(put);
+      }
+      table.flushCommits();
+    }
+
     for (byte [] k : keys ) {
       Get get = new Get(k);
       get.addColumn(BYTES_FAMILY, QUALIFIER);
@@ -88,10 +108,15 @@ public class TestMultiParallelPut extends MultiRegionTable {
     HBaseAdmin admin = new HBaseAdmin(conf);
     ClusterStatus cs = admin.getClusterStatus();
-    assertEquals(2, cs.getServers());
+
+    int expectedServerCount = 2;
+    if (doAbort)
+      expectedServerCount = 1;
+
+    assertEquals(expectedServerCount, cs.getServers());
     for ( HServerInfo info : cs.getServerInfo()) {
       System.out.println(info);
       assertTrue( info.getLoad().getNumberOfRegions() > 10);
     }
   }
 }