HADOOP-1403. HBase reliability. Make master and region server more fault tolerant.

git-svn-id: https://svn.apache.org/repos/asf/lucene/hadoop/trunk/src/contrib/hbase@540586 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Jim Kellerman 2007-05-22 13:51:10 +00:00
parent 47bf7fd8bb
commit e3e22fe37e
3 changed files with 460 additions and 345 deletions

View File

@ -8,3 +8,5 @@ Trunk (unreleased changes)
3. HADOOP-1404. HBase command-line shutdown failing (Michael Stack)
4. HADOOP-1397. Replace custom hbase locking with
java.util.concurrent.locks.ReentrantLock (Michael Stack)
5. HADOOP-1403. HBase reliability - make master and region server more fault
tolerant.

View File

@ -55,6 +55,7 @@ public class HMaster implements HConstants, HMasterInterface,
private FileSystem fs;
private Random rand;
private long threadWakeFrequency;
private int numRetries;
private long maxRegionOpenTime;
// The 'msgQueue' is used to assign work to the client processor thread
@ -181,7 +182,7 @@ public class HMaster implements HConstants, HMasterInterface,
server.close(scannerId);
}
} catch (IOException e) {
e.printStackTrace();
LOG.error(e);
}
scannerId = -1L;
}
@ -284,7 +285,7 @@ public class HMaster implements HConstants, HMasterInterface,
}
}
} catch(IOException e) {
e.printStackTrace();
LOG.error(e);
closed = true;
}
LOG.debug("ROOT scanner exiting");
@ -391,7 +392,7 @@ public class HMaster implements HConstants, HMasterInterface,
} while(true);
} catch(IOException e) {
e.printStackTrace();
LOG.error(e);
closed = true;
}
}
@ -480,6 +481,7 @@ public class HMaster implements HConstants, HMasterInterface,
Path rootRegionDir =
HStoreFile.getHRegionDir(dir, HGlobals.rootRegionInfo.regionName);
LOG.info("Root region dir: " + rootRegionDir.toString());
if(! fs.exists(rootRegionDir)) {
LOG.info("bootstrap: creating ROOT and first META regions");
try {
@ -492,11 +494,12 @@ public class HMaster implements HConstants, HMasterInterface,
meta.close();
} catch(IOException e) {
e.printStackTrace();
LOG.error(e);
}
}
this.threadWakeFrequency = conf.getLong(THREAD_WAKE_FREQUENCY, 10 * 1000);
this.numRetries = conf.getInt("hbase.client.retries.number", 2);
this.maxRegionOpenTime = conf.getLong("hbase.hbasemaster.maxregionopen", 30 * 1000);
this.msgQueue = new Vector<PendingOperation>();
this.serverLeases = new Leases(
@ -575,7 +578,7 @@ public class HMaster implements HConstants, HMasterInterface,
} catch(IOException e) {
// Something happened during startup. Shut things down.
this.closed = true;
e.printStackTrace();
LOG.error(e);
}
// Main processing loop
@ -625,7 +628,7 @@ public class HMaster implements HConstants, HMasterInterface,
} catch(Exception iex) {
// Print if ever there is an interrupt (Just for kicks. Remove if it
// ever happens).
iex.printStackTrace();
LOG.warn(iex);
}
try {
// Join the thread till it finishes.
@ -633,7 +636,7 @@ public class HMaster implements HConstants, HMasterInterface,
} catch(Exception iex) {
// Print if ever there is an interrupt (Just for kicks. Remove if it
// ever happens).
iex.printStackTrace();
LOG.warn(iex);
}
try {
// Join until it's finished. TODO: Maybe do in parallel in its own thread
@ -642,7 +645,7 @@ public class HMaster implements HConstants, HMasterInterface,
} catch(InterruptedException iex) {
// Print if ever there is an interrupt (Just for kicks. Remove if it
// ever happens).
iex.printStackTrace();
LOG.warn(iex);
}
if(LOG.isDebugEnabled()) {
@ -1033,12 +1036,22 @@ public class HMaster implements HConstants, HMasterInterface,
DataInputBuffer inbuf = new DataInputBuffer();
try {
while(true) {
LabelledData[] values = null;
while(true) {
HStoreKey key = new HStoreKey();
try {
values = server.next(scannerId, key);
if(values.length == 0) {
} catch(NotServingRegionException e) {
throw e;
} catch(IOException e) {
LOG.error(e);
break;
}
if(values == null || values.length == 0) {
break;
}
@ -1053,8 +1066,14 @@ public class HMaster implements HConstants, HMasterInterface,
// No server
continue;
}
try {
serverName = new String(bytes, UTF8_ENCODING);
} catch(UnsupportedEncodingException e) {
LOG.error(e);
break;
}
if(deadServer.compareTo(serverName) != 0) {
// This isn't the server you're looking for - move along
continue;
@ -1065,7 +1084,15 @@ public class HMaster implements HConstants, HMasterInterface,
// No start code
continue;
}
long startCode = Long.valueOf(new String(bytes, UTF8_ENCODING));
long startCode = -1L;
try {
startCode = Long.valueOf(new String(bytes, UTF8_ENCODING));
} catch(UnsupportedEncodingException e) {
LOG.error(e);
break;
}
if(oldStartCode != startCode) {
// Close but no cigar
@ -1080,8 +1107,15 @@ public class HMaster implements HConstants, HMasterInterface,
}
inbuf.reset(bytes, bytes.length);
HRegionInfo info = new HRegionInfo();
try {
info.readFields(inbuf);
} catch(IOException e) {
LOG.error(e);
break;
}
if(LOG.isDebugEnabled()) {
LOG.debug(serverName + " was serving " + info.regionName);
}
@ -1098,7 +1132,7 @@ public class HMaster implements HConstants, HMasterInterface,
server.close(scannerId);
} catch(IOException e) {
e.printStackTrace();
LOG.error(e);
}
}
@ -1135,26 +1169,52 @@ public class HMaster implements HConstants, HMasterInterface,
// Scan the ROOT region
HRegionInterface server = null;
long scannerId = -1L;
for(int tries = 0; tries < numRetries; tries ++) {
waitForRootRegion(); // Wait until the root region is available
HRegionInterface server = client.getHRegionConnection(rootRegionLocation);
long scannerId =
server.openScanner(HGlobals.rootRegionInfo.regionName, columns, startRow);
server = client.getHRegionConnection(rootRegionLocation);
scannerId = -1L;
try {
scannerId = server.openScanner(HGlobals.rootRegionInfo.regionName, columns, startRow);
scanMetaRegion(server, scannerId, HGlobals.rootRegionInfo.regionName);
break;
} catch(NotServingRegionException e) {
if(tries == numRetries - 1) {
throw e;
}
}
}
// We can not scan every meta region if they have not already been assigned
// and scanned.
for(int tries = 0; tries < numRetries; tries ++) {
try {
metaScanner.waitForMetaScan();
for(Iterator<MetaRegion> i = knownMetaRegions.values().iterator();
i.hasNext(); ) {
server = null;
scannerId = -1L;
MetaRegion r = i.next();
server = client.getHRegionConnection(r.server);
scannerId = server.openScanner(r.regionName, columns, startRow);
scanMetaRegion(server, scannerId, r.regionName);
}
break;
} catch(NotServingRegionException e) {
if(tries == numRetries - 1) {
throw e;
}
}
}
}
}
@ -1183,6 +1243,7 @@ public class HMaster implements HConstants, HMasterInterface,
}
public void process() throws IOException {
for(int tries = 0; tries < numRetries; tries ++) {
// We can not access any meta region if they have not already been assigned
// and scanned.
@ -1214,10 +1275,21 @@ public class HMaster implements HConstants, HMasterInterface,
metaRegionName = r.regionName;
server = client.getHRegionConnection(r.server);
}
try {
long lockid = server.startUpdate(metaRegionName, clientId, regionInfo.regionName);
server.delete(metaRegionName, clientId, lockid, COL_SERVER);
server.delete(metaRegionName, clientId, lockid, COL_STARTCODE);
server.commit(metaRegionName, clientId, lockid);
break;
} catch(NotServingRegionException e) {
if(tries == numRetries - 1) {
throw e;
}
continue;
}
}
if(reassignRegion) {
if(LOG.isDebugEnabled()) {
@ -1261,12 +1333,13 @@ public class HMaster implements HConstants, HMasterInterface,
String.valueOf(info.getStartCode()).getBytes(UTF8_ENCODING));
} catch(UnsupportedEncodingException e) {
e.printStackTrace();
LOG.error(e);
}
}
public void process() throws IOException {
for(int tries = 0; tries < numRetries; tries ++) {
// We can not access any meta region if they have not already been assigned
// and scanned.
@ -1302,10 +1375,19 @@ public class HMaster implements HConstants, HMasterInterface,
if(LOG.isDebugEnabled()) {
LOG.debug("updating row " + regionName + " in table " + metaRegionName);
}
try {
long lockid = server.startUpdate(metaRegionName, clientId, regionName);
server.put(metaRegionName, clientId, lockid, COL_SERVER, serverAddress);
server.put(metaRegionName, clientId, lockid, COL_STARTCODE, startCode);
server.commit(metaRegionName, clientId, lockid);
break;
} catch(NotServingRegionException e) {
if(tries == numRetries - 1) {
throw e;
}
}
}
}
}
@ -1324,6 +1406,8 @@ public class HMaster implements HConstants, HMasterInterface,
}
HRegionInfo newRegion = new HRegionInfo(rand.nextLong(), desc, null, null);
for(int tries = 0; tries < numRetries; tries++) {
try {
// We can not access any meta region if they have not already been assigned
// and scanned.
@ -1381,6 +1465,14 @@ public class HMaster implements HConstants, HMasterInterface,
unassignedRegions.put(regionName, info);
assignAttempts.put(regionName, 0L);
break;
} catch(NotServingRegionException e) {
if(tries == numRetries - 1) {
throw e;
}
}
}
if(LOG.isDebugEnabled()) {
LOG.debug("created table " + desc.getName());
@ -1439,8 +1531,10 @@ public class HMaster implements HConstants, HMasterInterface,
throw new IllegalStateException(MASTER_NOT_RUNNING);
}
// We can not access any meta region if they have not already been assigned
// and scanned.
for(int tries = 0; tries < numRetries; tries++) {
try {
// We can not access any meta region if they have not already been
// assigned and scanned.
metaScanner.waitForMetaScan();
@ -1485,7 +1579,6 @@ public class HMaster implements HConstants, HMasterInterface,
try {
scannerId = server.openScanner(m.regionName, METACOLUMNS, tableName);
DataInputBuffer inbuf = new DataInputBuffer();
byte[] bytes;
while(true) {
@ -1553,17 +1646,13 @@ public class HMaster implements HConstants, HMasterInterface,
unservedRegions.add(info.regionName);
}
} catch(IOException e) {
e.printStackTrace();
} finally {
if(scannerId != -1L) {
try {
server.close(scannerId);
} catch(IOException e) {
e.printStackTrace();
LOG.error(e);
}
}
scannerId = -1L;
@ -1589,12 +1678,13 @@ public class HMaster implements HConstants, HMasterInterface,
LOG.debug("deleted columns in row: " + rowName);
}
} catch(Exception e) {
} catch(IOException e) {
if(lockid != -1L) {
server.abort(m.regionName, clientId, lockid);
}
LOG.error("columns deletion failed in row: " + rowName);
LOG.error(e);
throw e;
}
}
@ -1614,10 +1704,24 @@ public class HMaster implements HConstants, HMasterInterface,
} catch(IOException e) {
LOG.error("failed to delete region " + regionName);
LOG.error(e);
throw e;
}
}
}
}
} catch(NotServingRegionException e) {
if(tries == numRetries - 1) {
throw e;
}
continue;
} catch(IOException e) {
LOG.error(e);
throw e;
}
break;
}
if(LOG.isDebugEnabled()) {
LOG.debug("deleted table: " + tableName);
}

View File

@ -57,6 +57,7 @@ public class HRegionServer
private long threadWakeFrequency;
private int maxLogEntries;
private long msgInterval;
private int numRetries;
// Check to see if regions should be split
@ -132,6 +133,8 @@ public class HRegionServer
(oldRegion.find(META_TABLE_NAME.toString()) == 0) ?
ROOT_TABLE_NAME : META_TABLE_NAME;
for(int tries = 0; tries < numRetries; tries++) {
try {
client.openTable(tableToUpdate);
long lockid = client.startUpdate(oldRegion);
client.delete(lockid, COL_REGIONINFO);
@ -174,6 +177,15 @@ public class HRegionServer
} finally {
lock.writeLock().unlock();
}
} catch(NotServingRegionException e) {
if(tries == numRetries - 1) {
throw e;
}
continue;
}
break;
}
}
}
}
@ -241,7 +253,7 @@ public class HRegionServer
cur.optionallyFlush();
} catch(IOException iex) {
iex.printStackTrace();
LOG.error(iex);
}
}
}
@ -503,9 +515,6 @@ public class HRegionServer
} catch(InterruptedException iex) {
}
}
if (LOG.isDebugEnabled()) {
LOG.debug("Wake");
}
}
continue;
}
@ -617,7 +626,7 @@ public class HRegionServer
join();
} catch(IOException e) {
e.printStackTrace();
LOG.error(e);
}
if(LOG.isDebugEnabled()) {
LOG.debug("main thread exiting");
@ -747,7 +756,7 @@ public class HRegionServer
throw new IOException("Impossible state during msg processing. Instruction: " + msg);
}
} catch(IOException e) {
e.printStackTrace();
LOG.error(e);
}
}
if(LOG.isDebugEnabled()) {
@ -944,7 +953,7 @@ public class HRegionServer
localRegion.abort(localLockId);
} catch(IOException iex) {
iex.printStackTrace();
LOG.error(iex);
}
}
}
@ -1071,7 +1080,7 @@ public class HRegionServer
leases.createLease(scannerName, scannerName, new ScannerListener(scannerName));
} catch(IOException e) {
e.printStackTrace();
LOG.error(e);
throw e;
}
return scannerId;