HDFS-2718. Optimize OP_ADD in edits loading. Contributed by Konstantin Shvachko.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-0.23@1239769 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Konstantin Shvachko 2012-02-02 19:12:42 +00:00
parent 12f83b380a
commit 4c78384ecb
8 changed files with 193 additions and 57 deletions

View File

@ -1544,6 +1544,8 @@ Release 0.22.1 - Unreleased
OPTIMIZATIONS OPTIMIZATIONS
HDFS-2718. Optimize OP_ADD in edits loading. (shv)
BUG FIXES BUG FIXES
Release 0.22.0 - 2011-11-29 Release 0.22.0 - 2011-11-29

View File

@ -158,9 +158,6 @@ public class BlockInfoUnderConstruction extends BlockInfo {
BlockInfo convertToCompleteBlock() throws IOException { BlockInfo convertToCompleteBlock() throws IOException {
assert getBlockUCState() != BlockUCState.COMPLETE : assert getBlockUCState() != BlockUCState.COMPLETE :
"Trying to convert a COMPLETE block"; "Trying to convert a COMPLETE block";
if(getBlockUCState() != BlockUCState.COMMITTED)
throw new IOException(
"Cannot complete block: block has not been COMMITTED by the client");
return new BlockInfo(this); return new BlockInfo(this);
} }

View File

@ -439,15 +439,23 @@ public class BlockManager {
*/ */
private BlockInfo completeBlock(final INodeFile fileINode, private BlockInfo completeBlock(final INodeFile fileINode,
final int blkIndex) throws IOException { final int blkIndex) throws IOException {
return completeBlock(fileINode, blkIndex, false);
}
public BlockInfo completeBlock(final INodeFile fileINode,
final int blkIndex, final boolean force) throws IOException {
if(blkIndex < 0) if(blkIndex < 0)
return null; return null;
BlockInfo curBlock = fileINode.getBlocks()[blkIndex]; BlockInfo curBlock = fileINode.getBlocks()[blkIndex];
if(curBlock.isComplete()) if(curBlock.isComplete())
return curBlock; return curBlock;
BlockInfoUnderConstruction ucBlock = (BlockInfoUnderConstruction)curBlock; BlockInfoUnderConstruction ucBlock = (BlockInfoUnderConstruction)curBlock;
if(ucBlock.numNodes() < minReplication) if(!force && ucBlock.numNodes() < minReplication)
throw new IOException("Cannot complete block: " + throw new IOException("Cannot complete block: " +
"block does not satisfy minimal replication requirement."); "block does not satisfy minimal replication requirement.");
if(!force && ucBlock.getBlockUCState() != BlockUCState.COMMITTED)
throw new IOException(
"Cannot complete block: block has not been COMMITTED by the client");
BlockInfo completeBlock = ucBlock.convertToCompleteBlock(); BlockInfo completeBlock = ucBlock.convertToCompleteBlock();
// replace penultimate block in file // replace penultimate block in file
fileINode.setBlock(blkIndex, completeBlock); fileINode.setBlock(blkIndex, completeBlock);

View File

@ -287,22 +287,28 @@ public class FSDirectory implements Closeable {
short replication, short replication,
long modificationTime, long modificationTime,
long atime, long atime,
long preferredBlockSize) long preferredBlockSize,
String clientName,
String clientMachine)
throws UnresolvedLinkException { throws UnresolvedLinkException {
INode newNode; INode newNode;
long diskspace = UNKNOWN_DISK_SPACE;
assert hasWriteLock(); assert hasWriteLock();
if (blocks == null) if (blocks == null)
newNode = new INodeDirectory(permissions, modificationTime); newNode = new INodeDirectory(permissions, modificationTime);
else { else if(blocks.length == 0 || blocks[blocks.length-1].getBlockUCState()
== BlockUCState.UNDER_CONSTRUCTION) {
newNode = new INodeFileUnderConstruction(
permissions, blocks.length, replication,
preferredBlockSize, modificationTime, clientName,
clientMachine, null);
} else {
newNode = new INodeFile(permissions, blocks.length, replication, newNode = new INodeFile(permissions, blocks.length, replication,
modificationTime, atime, preferredBlockSize); modificationTime, atime, preferredBlockSize);
diskspace = ((INodeFile)newNode).diskspaceConsumed(blocks);
} }
writeLock(); writeLock();
try { try {
try { try {
newNode = addNode(path, newNode, diskspace); newNode = addNode(path, newNode, UNKNOWN_DISK_SPACE);
if(newNode != null && blocks != null) { if(newNode != null && blocks != null) {
int nrBlocks = blocks.length; int nrBlocks = blocks.length;
// Add file->block mapping // Add file->block mapping
@ -321,6 +327,74 @@ public class FSDirectory implements Closeable {
} }
/**
* Update files in-memory data structures with new block information.
* @throws IOException
*/
void updateFile(INodeFile file,
String path,
PermissionStatus permissions,
BlockInfo[] blocks,
short replication,
long mtime,
long atime,
long preferredBlockSize) throws IOException {
// Update the salient file attributes.
file.setAccessTime(atime);
file.setModificationTimeForce(mtime);
// Update its block list
BlockInfo[] oldBlocks = file.getBlocks();
// Are we only updating the last block's gen stamp.
boolean isGenStampUpdate = oldBlocks.length == blocks.length;
// First, update blocks in common
BlockInfo oldBlock = null;
for (int i = 0; i < oldBlocks.length && i < blocks.length; i++) {
oldBlock = oldBlocks[i];
Block newBlock = blocks[i];
boolean isLastBlock = i == oldBlocks.length - 1;
if (oldBlock.getBlockId() != newBlock.getBlockId() ||
(oldBlock.getGenerationStamp() != newBlock.getGenerationStamp() &&
!(isGenStampUpdate && isLastBlock))) {
throw new IOException("Mismatched block IDs or generation stamps, " +
"attempting to replace block " + oldBlock + " with " + newBlock +
" as block # " + i + "/" + blocks.length + " of " + path);
}
oldBlock.setNumBytes(newBlock.getNumBytes());
oldBlock.setGenerationStamp(newBlock.getGenerationStamp());
}
if (blocks.length < oldBlocks.length) {
// We're removing a block from the file, e.g. abandonBlock(...)
if (!file.isUnderConstruction()) {
throw new IOException("Trying to remove a block from file " +
path + " which is not under construction.");
}
if (blocks.length != oldBlocks.length - 1) {
throw new IOException("Trying to remove more than one block from file "
+ path);
}
unprotectedRemoveBlock(path,
(INodeFileUnderConstruction)file, oldBlocks[oldBlocks.length - 1]);
} else if (blocks.length > oldBlocks.length) {
// We're adding blocks
// First complete last old Block
getBlockManager().completeBlock(file, oldBlocks.length-1, true);
// Add the new blocks
for (int i = oldBlocks.length; i < blocks.length; i++) {
// addBlock();
BlockInfo newBI = blocks[i];
getBlockManager().addINode(newBI, file);
file.addBlock(newBI);
}
}
}
INodeDirectory addToParent(byte[] src, INodeDirectory parentINode, INodeDirectory addToParent(byte[] src, INodeDirectory parentINode,
INode newNode, boolean propagateModTime) throws UnresolvedLinkException { INode newNode, boolean propagateModTime) throws UnresolvedLinkException {
// NOTE: This does not update space counts for parents // NOTE: This does not update space counts for parents
@ -442,28 +516,33 @@ public class FSDirectory implements Closeable {
writeLock(); writeLock();
try { try {
// modify file-> block and blocksMap unprotectedRemoveBlock(path, fileNode, block);
fileNode.removeLastBlock(block);
getBlockManager().removeBlockFromMap(block);
// write modified block locations to log // write modified block locations to log
fsImage.getEditLog().logOpenFile(path, fileNode); fsImage.getEditLog().logOpenFile(path, fileNode);
if(NameNode.stateChangeLog.isDebugEnabled()) {
NameNode.stateChangeLog.debug("DIR* FSDirectory.removeBlock: "
+path+" with "+block
+" block is removed from the file system");
}
// update space consumed
INode[] pathINodes = getExistingPathINodes(path);
updateCount(pathINodes, pathINodes.length-1, 0,
-fileNode.getPreferredBlockSize()*fileNode.getReplication(), true);
} finally { } finally {
writeUnlock(); writeUnlock();
} }
return true; return true;
} }
void unprotectedRemoveBlock(String path, INodeFileUnderConstruction fileNode,
Block block) throws IOException {
// modify file-> block and blocksMap
fileNode.removeLastBlock(block);
getBlockManager().removeBlockFromMap(block);
if(NameNode.stateChangeLog.isDebugEnabled()) {
NameNode.stateChangeLog.debug("DIR* FSDirectory.removeBlock: "
+path+" with "+block
+" block is removed from the file system");
}
// update space consumed
INode[] pathINodes = getExistingPathINodes(path);
updateCount(pathINodes, pathINodes.length-1, 0,
-fileNode.getPreferredBlockSize()*fileNode.getReplication(), true);
}
/** /**
* @see #unprotectedRenameTo(String, String, long) * @see #unprotectedRenameTo(String, String, long)
* @deprecated Use {@link #renameTo(String, String, Rename...)} instead. * @deprecated Use {@link #renameTo(String, String, Rename...)} instead.

View File

@ -187,31 +187,53 @@ public class FSEditLogLoader {
" clientMachine " + addCloseOp.clientMachine); " clientMachine " + addCloseOp.clientMachine);
} }
fsDir.unprotectedDelete(addCloseOp.path, addCloseOp.mtime); // There are four cases here:
// 1. OP_ADD to create a new file
// 2. OP_ADD to update file blocks
// 3. OP_ADD to open file for append
// 4. OP_CLOSE to close the file
// add to the file tree // See if the file already exists
INodeFile node = (INodeFile)fsDir.unprotectedAddFile( INodeFile oldFile = fsDir.getFileINode(addCloseOp.path);
addCloseOp.path, permissions, if (oldFile == null) { // OP_ADD for a new file
blocks, replication, assert addCloseOp.opCode == FSEditLogOpCodes.OP_ADD :
addCloseOp.mtime, addCloseOp.atime, blockSize); "Expected opcode OP_ADD, but got " + addCloseOp.opCode;
if (addCloseOp.opCode == FSEditLogOpCodes.OP_ADD) { fsDir.unprotectedAddFile(
// addCloseOp.path, permissions, blocks, replication,
// Replace current node with a INodeUnderConstruction. addCloseOp.mtime, addCloseOp.atime, blockSize,
// Recreate in-memory lease record. addCloseOp.clientName, addCloseOp.clientMachine);
// } else {
INodeFileUnderConstruction cons = new INodeFileUnderConstruction( fsDir.updateFile(oldFile,
node.getLocalNameBytes(), addCloseOp.path, permissions, blocks, replication,
node.getReplication(), addCloseOp.mtime, addCloseOp.atime, blockSize);
node.getModificationTime(), if(addCloseOp.opCode == FSEditLogOpCodes.OP_CLOSE) { // OP_CLOSE
node.getPreferredBlockSize(), assert oldFile.isUnderConstruction() :
node.getBlocks(), "File is not under construction: " + addCloseOp.path;
node.getPermissionStatus(), fsNamesys.getBlockManager().completeBlock(
addCloseOp.clientName, oldFile, blocks.length-1, true);
addCloseOp.clientMachine, INodeFile newFile =
null); ((INodeFileUnderConstruction)oldFile).convertToInodeFile();
fsDir.replaceNode(addCloseOp.path, node, cons); fsDir.replaceNode(addCloseOp.path, oldFile, newFile);
fsNamesys.leaseManager.addLease(cons.getClientName(), } else if(! oldFile.isUnderConstruction()) { // OP_ADD for append
addCloseOp.path); INodeFileUnderConstruction cons = new INodeFileUnderConstruction(
oldFile.getLocalNameBytes(),
oldFile.getReplication(),
oldFile.getModificationTime(),
oldFile.getPreferredBlockSize(),
oldFile.getBlocks(),
oldFile.getPermissionStatus(),
addCloseOp.clientName,
addCloseOp.clientMachine,
null);
fsDir.replaceNode(addCloseOp.path, oldFile, cons);
}
}
// Update file lease
if(addCloseOp.opCode == FSEditLogOpCodes.OP_ADD) {
fsNamesys.leaseManager.addLease(addCloseOp.clientName, addCloseOp.path);
} else { // Ops.OP_CLOSE
fsNamesys.leaseManager.removeLease(
((INodeFileUnderConstruction)oldFile).getClientName(), addCloseOp.path);
} }
break; break;
} }

View File

@ -41,8 +41,20 @@ public class INodeFileUnderConstruction extends INodeFile {
String clientName, String clientName,
String clientMachine, String clientMachine,
DatanodeDescriptor clientNode) { DatanodeDescriptor clientNode) {
super(permissions.applyUMask(UMASK), 0, replication, modTime, modTime, this(permissions, 0, replication, preferredBlockSize, modTime,
preferredBlockSize); clientName, clientMachine, clientNode);
}
INodeFileUnderConstruction(PermissionStatus permissions,
int nrBlocks,
short replication,
long preferredBlockSize,
long modTime,
String clientName,
String clientMachine,
DatanodeDescriptor clientNode) {
super(permissions.applyUMask(UMASK), nrBlocks, replication,
modTime, modTime, preferredBlockSize);
this.clientName = clientName; this.clientName = clientName;
this.clientMachine = clientMachine; this.clientMachine = clientMachine;
this.clientNode = clientNode; this.clientNode = clientNode;

View File

@ -72,12 +72,20 @@ public class TestAbandonBlock {
// Now abandon the last block // Now abandon the last block
DFSClient dfsclient = DFSClientAdapter.getDFSClient((DistributedFileSystem)fs); DFSClient dfsclient = DFSClientAdapter.getDFSClient((DistributedFileSystem)fs);
LocatedBlocks blocks = dfsclient.getNamenode().getBlockLocations(src, 0, 1); LocatedBlocks blocks =
dfsclient.getNamenode().getBlockLocations(src, 0, Integer.MAX_VALUE);
int orginalNumBlocks = blocks.locatedBlockCount();
LocatedBlock b = blocks.getLastLocatedBlock(); LocatedBlock b = blocks.getLastLocatedBlock();
dfsclient.getNamenode().abandonBlock(b.getBlock(), src, dfsclient.clientName); dfsclient.getNamenode().abandonBlock(b.getBlock(), src, dfsclient.clientName);
// And close the file // And close the file
fout.close(); fout.close();
// Close cluster and check the block has been abandoned after restart
cluster.restartNameNode();
blocks = dfsclient.getNamenode().getBlockLocations(src, 0, Integer.MAX_VALUE);
assert orginalNumBlocks == blocks.locatedBlockCount() + 1 :
"Blocks " + b + " has not been abandoned.";
} }
@Test @Test

View File

@ -111,10 +111,12 @@ public class TestEditLog extends TestCase {
int numTransactions; int numTransactions;
short replication = 3; short replication = 3;
long blockSize = 64; long blockSize = 64;
int startIndex;
Transactions(FSNamesystem ns, int num) { Transactions(FSNamesystem ns, int numTx, int startIdx) {
namesystem = ns; namesystem = ns;
numTransactions = num; numTransactions = numTx;
startIndex = startIdx;
} }
// add a bunch of transactions. // add a bunch of transactions.
@ -126,8 +128,8 @@ public class TestEditLog extends TestCase {
for (int i = 0; i < numTransactions; i++) { for (int i = 0; i < numTransactions; i++) {
INodeFileUnderConstruction inode = new INodeFileUnderConstruction( INodeFileUnderConstruction inode = new INodeFileUnderConstruction(
p, replication, blockSize, 0, "", "", null); p, replication, blockSize, 0, "", "", null);
editLog.logOpenFile("/filename" + i, inode); editLog.logOpenFile("/filename" + startIndex + i, inode);
editLog.logCloseFile("/filename" + i, inode); editLog.logCloseFile("/filename" + startIndex + i, inode);
editLog.logSync(); editLog.logSync();
} }
} }
@ -275,7 +277,8 @@ public class TestEditLog extends TestCase {
// Create threads and make them run transactions concurrently. // Create threads and make them run transactions concurrently.
Thread threadId[] = new Thread[NUM_THREADS]; Thread threadId[] = new Thread[NUM_THREADS];
for (int i = 0; i < NUM_THREADS; i++) { for (int i = 0; i < NUM_THREADS; i++) {
Transactions trans = new Transactions(namesystem, NUM_TRANSACTIONS); Transactions trans =
new Transactions(namesystem, NUM_TRANSACTIONS, i*NUM_TRANSACTIONS);
threadId[i] = new Thread(trans, "TransactionThread-" + i); threadId[i] = new Thread(trans, "TransactionThread-" + i);
threadId[i].start(); threadId[i].start();
} }
@ -288,11 +291,16 @@ public class TestEditLog extends TestCase {
i--; // retry i--; // retry
} }
} }
// Reopen some files as for append
Transactions trans =
new Transactions(namesystem, NUM_TRANSACTIONS, NUM_TRANSACTIONS / 2);
trans.run();
// Roll another time to finalize edits_inprogress_3 // Roll another time to finalize edits_inprogress_3
fsimage.rollEditLog(); fsimage.rollEditLog();
long expectedTxns = (NUM_THREADS * 2 * NUM_TRANSACTIONS) + 2; // +2 for start/end txns long expectedTxns = ((NUM_THREADS+1) * 2 * NUM_TRANSACTIONS) + 2; // +2 for start/end txns
// Verify that we can read in all the transactions that we have written. // Verify that we can read in all the transactions that we have written.
// If there were any corruptions, it is likely that the reading in // If there were any corruptions, it is likely that the reading in