Merging r1466653 through r1467712 from trunk.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/HDFS-2802@1467713 13f79535-47bb-0310-9956-ffa450edef68
Commit bf807063bc by Tsz-wo Sze, 2013-04-13 23:05:54 +00:00
210 changed files with 11726 additions and 3072 deletions

View File

@@ -143,10 +143,6 @@
<groupId>org.mortbay.jetty</groupId>
<artifactId>jetty</artifactId>
</exclusion>
<exclusion>
<groupId>org.mortbay.jetty</groupId>
<artifactId>jetty-util</artifactId>
</exclusion>
<exclusion>
<groupId>com.sun.jersey</groupId>
<artifactId>jersey-core</artifactId>

View File

@@ -617,6 +617,17 @@ Release 2.0.5-beta - UNRELEASED
HADOOP-9429. TestConfiguration fails with IBM JAVA. (Amir Sanjar via
suresh)
HADOOP-9222. Cover package with org.apache.hadoop.io.lz4 unit tests (Vadim
Bondarev via jlowe)
HADOOP-9233. Cover package org.apache.hadoop.io.compress.zlib with unit
tests (Vadim Bondarev via jlowe)
HADOOP-9211. Set default max heap size in HADOOP_CLIENT_OPTS to 512m
in order to avoid OOME. (Plamen Jeliazkov via shv)
HADOOP-9473. Typo in FileUtil copy() method. (Glen Mazza via suresh)
Release 2.0.4-alpha - UNRELEASED
INCOMPATIBLE CHANGES
@@ -643,6 +654,9 @@ Release 2.0.4-alpha - UNRELEASED
HADOOP-9444. Modify hadoop-policy.xml to replace unexpanded variables to a
default value of '*'. (Roman Shaposhnik via vinodkv)
HADOOP-9471. hadoop-client wrongfully excludes jetty-util JAR,
breaking webhdfs. (tucu)
Release 2.0.3-alpha - 2013-02-06
INCOMPATIBLE CHANGES
@@ -1606,6 +1620,24 @@ Release 2.0.0-alpha - 05-23-2012
HADOOP-8655. Fix TextInputFormat for large deliminators. (Gelesh via
bobby)
Release 0.23.8 - UNRELEASED
INCOMPATIBLE CHANGES
NEW FEATURES
IMPROVEMENTS
OPTIMIZATIONS
BUG FIXES
HADOOP-9222. Cover package with org.apache.hadoop.io.lz4 unit tests (Vadim
Bondarev via jlowe)
HADOOP-9233. Cover package org.apache.hadoop.io.compress.zlib with unit
tests (Vadim Bondarev via jlowe)
Release 0.23.7 - UNRELEASED
INCOMPATIBLE CHANGES

View File

@@ -522,6 +522,7 @@
<javahClassName>org.apache.hadoop.io.compress.lz4.Lz4Compressor</javahClassName>
<javahClassName>org.apache.hadoop.io.compress.lz4.Lz4Decompressor</javahClassName>
<javahClassName>org.apache.hadoop.util.NativeCrc32</javahClassName>
<javahClassName>org.apache.hadoop.net.unix.DomainSocket</javahClassName>
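<!-- javah will emit org_apache_hadoop_net_unix_DomainSocket.h for the native methods declared in DomainSocket; the new DomainSocket.c below includes that header. -->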
</javahClassNames>
<javahOutputDirectory>${project.build.directory}/native/javah</javahOutputDirectory>
</configuration>

View File

@@ -163,10 +163,10 @@ add_executable(test_bulk_crc32
${D}/util/bulk_crc32.c
${T}/util/test_bulk_crc32.c
)
set_property(SOURCE main.cpp PROPERTY INCLUDE_DIRECTORIES "\"-Werror\" \"-Wall\"")
SET(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE)
add_dual_library(hadoop
main/native/src/exception.c
${D}/io/compress/lz4/Lz4Compressor.c
${D}/io/compress/lz4/Lz4Decompressor.c
${D}/io/compress/lz4/lz4.c
@@ -177,6 +177,7 @@ add_dual_library(hadoop
${D}/io/nativeio/NativeIO.c
${D}/io/nativeio/errno_enum.c
${D}/io/nativeio/file_descriptor.c
${D}/net/unix/DomainSocket.c
${D}/security/JniBasedUnixGroupsMapping.c
${D}/security/JniBasedUnixGroupsNetgroupMapping.c
${D}/security/getGroup.c

View File

@@ -62,7 +62,7 @@ export HADOOP_DATANODE_OPTS="-Dhadoop.security.logger=ERROR,RFAS $HADOOP_DATANODE_OPTS"
export HADOOP_SECONDARYNAMENODE_OPTS="-Dhadoop.security.logger=${HADOOP_SECURITY_LOGGER:-INFO,RFAS} -Dhdfs.audit.logger=${HDFS_AUDIT_LOGGER:-INFO,NullAppender} $HADOOP_SECONDARYNAMENODE_OPTS"
# The following applies to multiple commands (fs, dfs, fsck, distcp etc)
export HADOOP_CLIENT_OPTS="-Xmx128m $HADOOP_CLIENT_OPTS"
export HADOOP_CLIENT_OPTS="-Xmx512m $HADOOP_CLIENT_OPTS"
#HADOOP_JAVA_PLATFORM_OPTS="-XX:-UsePerfData $HADOOP_JAVA_PLATFORM_OPTS"
# On secure datanodes, user to run the datanode as after dropping privileges

View File

@@ -743,6 +743,21 @@ public class Configuration implements Iterable<Map.Entry<String,String>>,
return value.trim();
}
}
/**
* Get the value of the <code>name</code> property as a trimmed <code>String</code>,
* or <code>defaultValue</code> if no such property exists.
* See {@link Configuration#getTrimmed(String)} for more details.
*
* @param name the property name.
* @param defaultValue the property default value.
* @return the value of the <code>name</code> or defaultValue
* if it is not set.
*/
public String getTrimmed(String name, String defaultValue) {
String ret = getTrimmed(name);
return ret == null ? defaultValue : ret;
}
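// A minimal usage sketch (hypothetical key and default value):
//
//   Configuration conf = new Configuration();
//   // falls back to the default when the key is unset; trims whitespace otherwise
//   String path = conf.getTrimmed("dfs.domain.socket.path", "/var/run/hadoop/sock");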
/**
* Get the value of the <code>name</code> property, without doing
@@ -877,7 +892,7 @@ public class Configuration implements Iterable<Map.Entry<String,String>>,
}
return result;
}
/**
* Get the value of the <code>name</code> property as an <code>int</code>.
*

View File

@@ -284,7 +284,7 @@ public class FileUtil {
// Check if dest is directory
if (!dstFS.exists(dst)) {
throw new IOException("`" + dst +"': specified destination directory " +
"doest not exist");
"does not exist");
} else {
FileStatus sdst = dstFS.getFileStatus(dst);
if (!sdst.isDirectory())

View File

@@ -19,6 +19,7 @@
package org.apache.hadoop.net;
import java.io.IOException;
import org.apache.hadoop.classification.InterfaceAudience;
import java.io.InputStream;
import java.net.Socket;
import java.net.SocketTimeoutException;
@@ -37,7 +38,8 @@ import java.nio.channels.SelectionKey;
* IllegalBlockingModeException.
* Please use {@link SocketOutputStream} for writing.
*/
class SocketInputStream extends InputStream
@InterfaceAudience.LimitedPrivate("HDFS")
public class SocketInputStream extends InputStream
implements ReadableByteChannel {
private Reader reader;

View File

@@ -260,4 +260,8 @@ public class SocketOutputStream extends OutputStream
throws IOException {
transferToFully(fileCh, position, count, null, null);
}
public void setTimeout(int timeoutMs) {
writer.setTimeout(timeoutMs);
}
}

View File

@@ -0,0 +1,704 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.net.unix;
import java.io.Closeable;
import org.apache.hadoop.classification.InterfaceAudience;
import java.io.FileDescriptor;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.SocketException;
import java.nio.channels.AsynchronousCloseException;
import java.nio.channels.ClosedChannelException;
import java.nio.channels.ReadableByteChannel;
import java.nio.ByteBuffer;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.commons.lang.SystemUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.util.NativeCodeLoader;
import com.google.common.annotations.VisibleForTesting;
/**
* The implementation of UNIX domain sockets in Java.
*
* See {@link DomainSocket} for more information about UNIX domain sockets.
*/
@InterfaceAudience.LimitedPrivate("HDFS")
public class DomainSocket implements Closeable {
static {
if (SystemUtils.IS_OS_WINDOWS) {
loadingFailureReason = "UNIX Domain sockets are not available on Windows.";
} else if (!NativeCodeLoader.isNativeCodeLoaded()) {
loadingFailureReason = "libhadoop cannot be loaded.";
} else {
String problem;
try {
anchorNative();
problem = null;
} catch (Throwable t) {
problem = "DomainSocket#anchorNative got error: " + t.getMessage();
}
loadingFailureReason = problem;
}
}
static Log LOG = LogFactory.getLog(DomainSocket.class);
/**
* True only if we should validate the paths used in {@link DomainSocket#bindAndListen(String)}.
*/
private static boolean validateBindPaths = true;
/**
* The reason why DomainSocket is not available, or null if it is available.
*/
private final static String loadingFailureReason;
/**
* Initialize the native library code.
*/
private static native void anchorNative();
/**
* This function is designed to validate that the path chosen for a UNIX
* domain socket is secure. A socket path is secure if it doesn't allow
* unprivileged users to perform a man-in-the-middle attack against it.
* For example, one way to perform a man-in-the-middle attack would be for
* a malicious user to move the server socket out of the way and create his
* own socket in the same place. Not good.
*
* Note that we only check the path once. It's possible that the
* permissions on the path could change, perhaps to something more relaxed,
* immediately after the path passes our validation test-- hence creating a
* security hole. However, the purpose of this check is to spot common
* misconfigurations. System administrators do not commonly change
* permissions on these paths while the server is running.
*
* @param path the path to validate
* @param skipComponents the number of starting path components to skip
* validation for (used only for testing)
*/
@VisibleForTesting
native static void validateSocketPathSecurity0(String path,
int skipComponents) throws IOException;
/**
* Return the reason why UNIX domain sockets are not available, or null if
* they are available.
*/
public static String getLoadingFailureReason() {
return loadingFailureReason;
}
/**
* Disable validation of the server bind paths.
*/
@VisibleForTesting
public static void disableBindPathValidation() {
validateBindPaths = false;
}
/**
* Given a path and a port, compute the effective path by replacing
* occurrences of _PORT with the port. This is mainly to make it
* possible to run multiple DataNodes locally for testing purposes.
*
* @param path The source path
* @param port Port number to use
*
* @return The effective path
*/
public static String getEffectivePath(String path, int port) {
return path.replace("_PORT", String.valueOf(port));
}
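// For example (hypothetical path):
//   getEffectivePath("/var/run/hadoop/dn._PORT", 50010)
// returns "/var/run/hadoop/dn.50010".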
/**
* Tracks the reference count of the file descriptor, and also whether it is
* open or closed.
*/
private static class Status {
/**
* Bit mask representing a closed domain socket.
*/
private static final int STATUS_CLOSED_MASK = 1 << 30;
/**
* Status bits
*
* Bit 30: 0 = DomainSocket open, 1 = DomainSocket closed
* Bits 29 to 0: the reference count.
*/
private final AtomicInteger bits = new AtomicInteger(0);
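// Example: a value of ((1 << 30) | 3) encodes a closed socket that still has
// three outstanding references keeping the file descriptor alive.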
Status() { }
/**
* Increment the reference count of the underlying file descriptor.
*
* @throws ClosedChannelException If the file descriptor is closed.
*/
void reference() throws ClosedChannelException {
int curBits = bits.incrementAndGet();
if ((curBits & STATUS_CLOSED_MASK) != 0) {
bits.decrementAndGet();
throw new ClosedChannelException();
}
}
/**
* Decrement the reference count of the underlying file descriptor.
*
* @param checkClosed Whether to throw an exception if the file
* descriptor is closed.
*
* @throws AsynchronousCloseException If the file descriptor is closed and
* checkClosed is set.
*/
void unreference(boolean checkClosed) throws AsynchronousCloseException {
int newCount = bits.decrementAndGet();
assert (newCount & ~STATUS_CLOSED_MASK) >= 0;
if (checkClosed && ((newCount & STATUS_CLOSED_MASK) != 0)) {
throw new AsynchronousCloseException();
}
}
/**
* Return true if the file descriptor is currently open.
*
* @return True if the file descriptor is currently open.
*/
boolean isOpen() {
return ((bits.get() & STATUS_CLOSED_MASK) == 0);
}
/**
* Mark the file descriptor as closed.
*
* Once the file descriptor is closed, it cannot be reopened.
*
* @return The current reference count.
* @throws ClosedChannelException If someone else closes the file
* descriptor before we do.
*/
int setClosed() throws ClosedChannelException {
while (true) {
int curBits = bits.get();
if ((curBits & STATUS_CLOSED_MASK) != 0) {
throw new ClosedChannelException();
}
if (bits.compareAndSet(curBits, curBits | STATUS_CLOSED_MASK)) {
return curBits & (~STATUS_CLOSED_MASK);
}
}
}
/**
* Get the current reference count.
*
* @return The current reference count.
*/
int getReferenceCount() {
return bits.get() & (~STATUS_CLOSED_MASK);
}
}
/**
* The socket status.
*/
private final Status status;
/**
* The file descriptor associated with this UNIX domain socket.
*/
private final int fd;
/**
* The path associated with this UNIX domain socket.
*/
private final String path;
/**
* The InputStream associated with this socket.
*/
private final DomainInputStream inputStream = new DomainInputStream();
/**
* The OutputStream associated with this socket.
*/
private final DomainOutputStream outputStream = new DomainOutputStream();
/**
* The Channel associated with this socket.
*/
private final DomainChannel channel = new DomainChannel();
private DomainSocket(String path, int fd) {
this.status = new Status();
this.fd = fd;
this.path = path;
}
private static native int bind0(String path) throws IOException;
/**
* Create a new DomainSocket listening on the given path.
*
* @param path The path to bind and listen on.
* @return The new DomainSocket.
*/
public static DomainSocket bindAndListen(String path) throws IOException {
if (loadingFailureReason != null) {
throw new UnsupportedOperationException(loadingFailureReason);
}
if (validateBindPaths) {
validateSocketPathSecurity0(path, 0);
}
int fd = bind0(path);
return new DomainSocket(path, fd);
}
private static native int accept0(int fd) throws IOException;
/**
* Accept a new UNIX domain connection.
*
* This method can only be used on sockets that were bound with bind().
*
* @return The new connection.
* @throws IOException If there was an I/O error
* performing the accept-- such as the
* socket being closed from under us.
* @throws SocketTimeoutException If the accept timed out.
*/
public DomainSocket accept() throws IOException {
status.reference();
boolean exc = true;
try {
DomainSocket ret = new DomainSocket(path, accept0(fd));
exc = false;
return ret;
} finally {
status.unreference(exc);
}
}
private static native int connect0(String path);
/**
* Create a new DomainSocket connected to the given path.
*
* @param path The path to connect to.
* @return The new DomainSocket.
*/
public static DomainSocket connect(String path) throws IOException {
if (loadingFailureReason != null) {
throw new UnsupportedOperationException(loadingFailureReason);
}
int fd = connect0(path);
return new DomainSocket(path, fd);
}
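// A minimal end-to-end sketch (hypothetical path; error handling omitted).
// connect() may precede accept() because bindAndListen() establishes a
// listen backlog; the path's parent directories must also pass the security
// validation above.
//
//   DomainSocket server = DomainSocket.bindAndListen("/var/run/hadoop/test.sock");
//   DomainSocket client = DomainSocket.connect("/var/run/hadoop/test.sock");
//   DomainSocket peer = server.accept();
//   client.getOutputStream().write(42);
//   int b = peer.getInputStream().read();   // reads 42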
/**
* Return true if the file descriptor is currently open.
*
* @return True if the file descriptor is currently open.
*/
public boolean isOpen() {
return status.isOpen();
}
/**
* @return The socket path.
*/
public String getPath() {
return path;
}
/**
* @return The socket InputStream
*/
public DomainInputStream getInputStream() {
return inputStream;
}
/**
* @return The socket OutputStream
*/
public DomainOutputStream getOutputStream() {
return outputStream;
}
/**
* @return The socket Channel
*/
public DomainChannel getChannel() {
return channel;
}
public static final int SEND_BUFFER_SIZE = 1;
public static final int RECEIVE_BUFFER_SIZE = 2;
public static final int SEND_TIMEOUT = 3;
public static final int RECEIVE_TIMEOUT = 4;
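// These constants select which socket option setAttribute()/getAttribute()
// operate on; the native code maps them to SO_SNDBUF, SO_RCVBUF, SO_SNDTIMEO,
// and SO_RCVTIMEO respectively.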
private static native void setAttribute0(int fd, int type, int val)
throws IOException;
public void setAttribute(int type, int size) throws IOException {
status.reference();
boolean exc = true;
try {
setAttribute0(fd, type, size);
exc = false;
} finally {
status.unreference(exc);
}
}
private native int getAttribute0(int fd, int type) throws IOException;
public int getAttribute(int type) throws IOException {
status.reference();
int attribute;
boolean exc = true;
try {
attribute = getAttribute0(fd, type);
exc = false;
return attribute;
} finally {
status.unreference(exc);
}
}
private static native void close0(int fd) throws IOException;
private static native void closeFileDescriptor0(FileDescriptor fd)
throws IOException;
private static native void shutdown0(int fd) throws IOException;
/**
* Close the Socket.
*/
@Override
public void close() throws IOException {
// Set the closed bit on this DomainSocket
int refCount;
try {
refCount = status.setClosed();
} catch (ClosedChannelException e) {
// Someone else already closed the DomainSocket.
return;
}
// Wait for all references to go away
boolean didShutdown = false;
boolean interrupted = false;
while (refCount > 0) {
if (!didShutdown) {
try {
// Calling shutdown on the socket will interrupt blocking system
// calls like accept, write, and read that are going on in a
// different thread.
shutdown0(fd);
} catch (IOException e) {
LOG.error("shutdown error: ", e);
}
didShutdown = true;
}
try {
Thread.sleep(10);
} catch (InterruptedException e) {
interrupted = true;
}
refCount = status.getReferenceCount();
}
// At this point, nobody has a reference to the file descriptor,
// and nobody will be able to get one in the future either.
// We now call close(2) on the file descriptor.
// After this point, the file descriptor number will be reused by
// something else. Although this DomainSocket object continues to hold
// the old file descriptor number (it's a final field), we never use it
// again because this DomainSocket is closed.
close0(fd);
if (interrupted) {
Thread.currentThread().interrupt();
}
}
private native static void sendFileDescriptors0(int fd,
FileDescriptor descriptors[],
byte jbuf[], int offset, int length) throws IOException;
/**
* Send some FileDescriptor objects to the process on the other side of this
* socket.
*
* @param descriptors The file descriptors to send.
* @param jbuf Some bytes to send. You must send at least
* one byte.
* @param offset The offset in the jbuf array to start at.
* @param length Length of the jbuf array to use.
*/
public void sendFileDescriptors(FileDescriptor descriptors[],
byte jbuf[], int offset, int length) throws IOException {
status.reference();
boolean exc = true;
try {
sendFileDescriptors0(fd, descriptors, jbuf, offset, length);
exc = false;
} finally {
status.unreference(exc);
}
}
private static native int receiveFileDescriptors0(int fd,
FileDescriptor[] descriptors,
byte jbuf[], int offset, int length) throws IOException;
/**
* Receive some FileDescriptor objects from the process on the other side of
* this socket.
*
* @param descriptors (output parameter) Array of FileDescriptors.
* We will fill as many slots as possible with file
* descriptors passed from the remote process. The
* other slots will contain NULL.
* @param jbuf (output parameter) Buffer to read into.
* The UNIX domain sockets API requires you to read
* at least one byte from the remote process, even
* if all you care about is the file descriptors
* you will receive.
* @param offset Offset into the byte buffer to load data
* @param length Length of the byte buffer to use for data
*
* @return The number of bytes read. This will be -1 if we
* reached EOF (similar to SocketInputStream);
* otherwise, it will be positive.
* @throws IOException if there was an I/O error.
*/
public int receiveFileDescriptors(FileDescriptor[] descriptors,
byte jbuf[], int offset, int length) throws IOException {
status.reference();
boolean exc = true;
try {
int nBytes = receiveFileDescriptors0(fd, descriptors, jbuf, offset, length);
exc = false;
return nBytes;
} finally {
status.unreference(exc);
}
}
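// Hypothetical sketch of descriptor passing ('sender' and 'receiver' are
// assumed to be two already-connected DomainSockets):
//
//   FileInputStream fis = new FileInputStream("/etc/hosts");
//   sender.sendFileDescriptors(new FileDescriptor[] { fis.getFD() },
//       new byte[] { 1 }, 0, 1);             // at least one byte must be sent
//   FileDescriptor[] fds = new FileDescriptor[1];
//   receiver.receiveFileDescriptors(fds, new byte[1], 0, 1);
//   // fds[0] now refers to the sender's open file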
/**
* Receive some FileDescriptor objects from the process on the other side of
* this socket, and wrap them in FileInputStream objects.
*
* See {@link DomainSocket#receiveFileDescriptors(FileDescriptor[], byte[], int, int)}.
*/
public int recvFileInputStreams(FileInputStream[] streams, byte buf[],
int offset, int length) throws IOException {
FileDescriptor descriptors[] = new FileDescriptor[streams.length];
boolean success = false;
for (int i = 0; i < streams.length; i++) {
streams[i] = null;
}
status.reference();
try {
int ret = receiveFileDescriptors0(fd, descriptors, buf, offset, length);
for (int i = 0, j = 0; i < descriptors.length; i++) {
if (descriptors[i] != null) {
streams[j++] = new FileInputStream(descriptors[i]);
descriptors[i] = null;
}
}
success = true;
return ret;
} finally {
if (!success) {
for (int i = 0; i < descriptors.length; i++) {
if (descriptors[i] != null) {
try {
closeFileDescriptor0(descriptors[i]);
} catch (Throwable t) {
LOG.warn(t);
}
} else if (streams[i] != null) {
try {
streams[i].close();
} catch (Throwable t) {
LOG.warn(t);
} finally {
streams[i] = null;
}
}
}
}
status.unreference(!success);
}
}
private native static int readArray0(int fd, byte b[], int off, int len)
throws IOException;
private native static int available0(int fd) throws IOException;
private static native void write0(int fd, int b) throws IOException;
private static native void writeArray0(int fd, byte b[], int offset, int length)
throws IOException;
private native static int readByteBufferDirect0(int fd, ByteBuffer dst,
int position, int remaining) throws IOException;
/**
* Input stream for UNIX domain sockets.
*/
@InterfaceAudience.LimitedPrivate("HDFS")
public class DomainInputStream extends InputStream {
@Override
public int read() throws IOException {
status.reference();
boolean exc = true;
try {
byte b[] = new byte[1];
int ret = DomainSocket.readArray0(DomainSocket.this.fd, b, 0, 1);
exc = false;
return (ret >= 0) ? b[0] : -1;
} finally {
status.unreference(exc);
}
}
@Override
public int read(byte b[], int off, int len) throws IOException {
status.reference();
boolean exc = true;
try {
int nRead = DomainSocket.readArray0(DomainSocket.this.fd, b, off, len);
exc = false;
return nRead;
} finally {
status.unreference(exc);
}
}
@Override
public int available() throws IOException {
status.reference();
boolean exc = true;
try {
int nAvailable = DomainSocket.available0(DomainSocket.this.fd);
exc = false;
return nAvailable;
} finally {
status.unreference(exc);
}
}
@Override
public void close() throws IOException {
DomainSocket.this.close();
}
}
/**
* Output stream for UNIX domain sockets.
*/
@InterfaceAudience.LimitedPrivate("HDFS")
public class DomainOutputStream extends OutputStream {
@Override
public void close() throws IOException {
DomainSocket.this.close();
}
@Override
public void write(int val) throws IOException {
status.reference();
boolean exc = true;
try {
byte b[] = new byte[1];
b[0] = (byte)val;
DomainSocket.writeArray0(DomainSocket.this.fd, b, 0, 1);
exc = false;
} finally {
status.unreference(exc);
}
}
@Override
public void write(byte[] b, int off, int len) throws IOException {
status.reference();
boolean exc = true;
try {
DomainSocket.writeArray0(DomainSocket.this.fd, b, off, len);
exc = false;
} finally {
status.unreference(exc);
}
}
}
@InterfaceAudience.LimitedPrivate("HDFS")
public class DomainChannel implements ReadableByteChannel {
@Override
public boolean isOpen() {
return DomainSocket.this.isOpen();
}
@Override
public void close() throws IOException {
DomainSocket.this.close();
}
@Override
public int read(ByteBuffer dst) throws IOException {
status.reference();
boolean exc = true;
try {
int nread = 0;
if (dst.isDirect()) {
nread = DomainSocket.readByteBufferDirect0(DomainSocket.this.fd,
dst, dst.position(), dst.remaining());
} else if (dst.hasArray()) {
nread = DomainSocket.readArray0(DomainSocket.this.fd,
dst.array(), dst.position() + dst.arrayOffset(),
dst.remaining());
} else {
throw new AssertionError("we don't support " +
"using ByteBuffers that aren't either direct or backed by " +
"arrays");
}
if (nread > 0) {
dst.position(dst.position() + nread);
}
exc = false;
return nread;
} finally {
status.unreference(exc);
}
}
}
@Override
public String toString() {
return String.format("DomainSocket(fd=%d,path=%s)", fd, path);
}
}

View File

@@ -27,8 +27,10 @@ import java.io.DataInputStream;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import javax.crypto.SecretKey;
@@ -144,6 +146,10 @@ extends AbstractDelegationTokenIdentifier>
return;
}
protected void logExpireToken(TokenIdent ident) throws IOException {
return;
}
/**
* Update the current master key
* This is called once by startThreads before tokenRemoverThread is created,
@@ -363,15 +369,25 @@ extends AbstractDelegationTokenIdentifier>
}
/** Remove expired delegation tokens from cache */
private synchronized void removeExpiredToken() {
private void removeExpiredToken() throws IOException {
long now = Time.now();
Iterator<DelegationTokenInformation> i = currentTokens.values().iterator();
while (i.hasNext()) {
long renewDate = i.next().getRenewDate();
if (now > renewDate) {
i.remove();
Set<TokenIdent> expiredTokens = new HashSet<TokenIdent>();
synchronized (this) {
Iterator<Map.Entry<TokenIdent, DelegationTokenInformation>> i =
currentTokens.entrySet().iterator();
while (i.hasNext()) {
Map.Entry<TokenIdent, DelegationTokenInformation> entry = i.next();
long renewDate = entry.getValue().getRenewDate();
if (renewDate < now) {
expiredTokens.add(entry.getKey());
i.remove();
}
}
}
// don't hold lock on 'this' to avoid edit log updates blocking token ops
for (TokenIdent ident : expiredTokens) {
logExpireToken(ident);
}
}
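// Subclasses that persist tokens override logExpireToken(); performing those
// (possibly slow) writes outside the synchronized block keeps token issue and
// renew operations responsive.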
public void stopThreads() {

View File

@@ -104,7 +104,7 @@ public class DataChecksum implements Checksum {
( (bytes[offset+2] & 0xff) << 16 ) |
( (bytes[offset+3] & 0xff) << 8 ) |
( (bytes[offset+4] & 0xff) );
return newDataChecksum( Type.valueOf(bytes[0]), bytesPerChecksum );
return newDataChecksum( Type.valueOf(bytes[offset]), bytesPerChecksum );
}
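// The five-byte header decoded above is { type, 4-byte big-endian
// bytesPerChecksum }. A hypothetical encoder, for illustration only:
//
//   static byte[] encodeHeader(byte type, int bpc) {
//     return new byte[] { type, (byte) (bpc >>> 24), (byte) (bpc >>> 16),
//                         (byte) (bpc >>> 8), (byte) bpc };
//   }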
/**

View File

@@ -0,0 +1,109 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "exception.h"
#include <jni.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
jthrowable newExceptionV(JNIEnv* env, const char *name,
const char *fmt, va_list ap)
{
int need;
char buf[1], *msg = NULL;
va_list ap2;
jstring jstr = NULL;
jthrowable jthr;
jclass clazz;
jmethodID excCtor;
va_copy(ap2, ap);
clazz = (*env)->FindClass(env, name);
if (!clazz) {
jthr = (*env)->ExceptionOccurred(env);
(*env)->ExceptionClear(env);
goto done;
}
excCtor = (*env)->GetMethodID(env,
clazz, "<init>", "(Ljava/lang/String;)V");
if (!excCtor) {
jthr = (*env)->ExceptionOccurred(env);
(*env)->ExceptionClear(env);
goto done;
}
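/* Two-pass formatting: the first vsnprintf() call, with a one-byte buffer,
* only measures the required length; the second call below renders into the
* heap buffer using the va_copy'd argument list. */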
need = vsnprintf(buf, sizeof(buf), fmt, ap);
if (need < 0) {
fmt = "vsnprintf error";
need = strlen(fmt);
}
msg = malloc(need + 1);
vsnprintf(msg, need + 1, fmt, ap2);
jstr = (*env)->NewStringUTF(env, msg);
if (!jstr) {
jthr = (*env)->ExceptionOccurred(env);
(*env)->ExceptionClear(env);
goto done;
}
jthr = (*env)->NewObject(env, clazz, excCtor, jstr);
if (!jthr) {
jthr = (*env)->ExceptionOccurred(env);
(*env)->ExceptionClear(env);
goto done;
}
done:
free(msg);
va_end(ap2);
(*env)->DeleteLocalRef(env, jstr);
return jthr;
}
jthrowable newException(JNIEnv* env, const char *name, const char *fmt, ...)
{
va_list ap;
jthrowable jthr;
va_start(ap, fmt);
jthr = newExceptionV(env, name, fmt, ap);
va_end(ap);
return jthr;
}
jthrowable newRuntimeException(JNIEnv* env, const char *fmt, ...)
{
va_list ap;
jthrowable jthr;
va_start(ap, fmt);
jthr = newExceptionV(env, "java/lang/RuntimeException", fmt, ap);
va_end(ap);
return jthr;
}
jthrowable newIOException(JNIEnv* env, const char *fmt, ...)
{
va_list ap;
jthrowable jthr;
va_start(ap, fmt);
jthr = newExceptionV(env, "java/io/IOException", fmt, ap);
va_end(ap);
return jthr;
}

View File

@@ -0,0 +1,82 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef HADOOP_MAIN_NATIVE_SRC_EXCEPTION_H
#define HADOOP_MAIN_NATIVE_SRC_EXCEPTION_H
#include <jni.h> /* for jthrowable */
#include <stdarg.h> /* for va_list */
/**
* Create a new Exception.
*
* No exceptions will be pending on return.
*
* @param env The JNI environment
* @param name full name of the Java exception class
* @param fmt printf-style format string
* @param ap printf-style arguments
*
* @return The newly-constructed exception
*/
jthrowable newExceptionV(JNIEnv* env, const char *name,
const char *fmt, va_list ap);
/**
* Create a new Exception.
*
* No exceptions will be pending on return.
*
* @param env The JNI environment
* @param name full name of the Java exception class
* @param fmt printf-style format string
* @param ... printf-style arguments
*
* @return The newly-constructed exception
*/
jthrowable newException(JNIEnv* env, const char *name, const char *fmt, ...)
__attribute__((format(printf, 3, 4)));
/**
* Create a new RuntimeException.
*
* No exceptions will be pending on return.
*
* @param env The JNI environment
* @param fmt printf-style format string
* @param ... printf-style arguments
*
* @return The RuntimeException
*/
jthrowable newRuntimeException(JNIEnv* env, const char *fmt, ...)
__attribute__((format(printf, 2, 3)));
/**
* Create a new IOException.
*
* No exceptions will be pending on return.
*
* @param env The JNI environment
* @param fmt printf-style format string
* @param ... printf-style arguments
*
* @return The IOException, or another exception if we failed
* to create the IOException.
*/
jthrowable newIOException(JNIEnv* env, const char *fmt, ...)
__attribute__((format(printf, 2, 3)));
#endif

View File

@@ -0,0 +1,944 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#define _GNU_SOURCE
#include "exception.h"
#include "org/apache/hadoop/io/nativeio/file_descriptor.h"
#include "org_apache_hadoop.h"
#include "org_apache_hadoop_net_unix_DomainSocket.h"
#include <errno.h>
#include <fcntl.h>
#include <inttypes.h>
#include <jni.h>
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h> /* for FIONREAD */
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/un.h>
#include <unistd.h>
#define SEND_BUFFER_SIZE org_apache_hadoop_net_unix_DomainSocket_SEND_BUFFER_SIZE
#define RECEIVE_BUFFER_SIZE org_apache_hadoop_net_unix_DomainSocket_RECEIVE_BUFFER_SIZE
#define SEND_TIMEOUT org_apache_hadoop_net_unix_DomainSocket_SEND_TIMEOUT
#define RECEIVE_TIMEOUT org_apache_hadoop_net_unix_DomainSocket_RECEIVE_TIMEOUT
#define DEFAULT_RECEIVE_TIMEOUT 120000
#define DEFAULT_SEND_TIMEOUT 120000
#define LISTEN_BACKLOG 128
/**
* Can't pass more than this number of file descriptors in a single message.
*/
#define MAX_PASSED_FDS 16
static jthrowable setAttribute0(JNIEnv *env, jint fd, jint type, jint val);
/**
* Convert an errno to a socket exception name.
*
* Note: we assume that all of these exceptions have a one-argument constructor
* that takes a string.
*
* @return The exception class name
*/
static const char *errnoToSocketExceptionName(int errnum)
{
switch (errnum) {
case EAGAIN:
/* accept(2) returns EAGAIN when a socket timeout has been set, and that
* timeout elapses without an incoming connection. This error code is also
* used in non-blocking I/O, but we don't support that. */
case ETIMEDOUT:
return "java/net/SocketTimeoutException";
case EHOSTDOWN:
case EHOSTUNREACH:
case ECONNREFUSED:
return "java/net/NoRouteToHostException";
case ENOTSUP:
return "java/lang/UnsupportedOperationException";
default:
return "java/net/SocketException";
}
}
static jthrowable newSocketException(JNIEnv *env, int errnum,
const char *fmt, ...)
__attribute__((format(printf, 3, 4)));
static jthrowable newSocketException(JNIEnv *env, int errnum,
const char *fmt, ...)
{
va_list ap;
jthrowable jthr;
va_start(ap, fmt);
jthr = newExceptionV(env, errnoToSocketExceptionName(errnum), fmt, ap);
va_end(ap);
return jthr;
}
static const char* terror(int errnum)
{
if ((errnum < 0) || (errnum >= sys_nerr)) {
return "unknown error.";
}
return sys_errlist[errnum];
}
/**
* Flexible buffer that will try to fit data on the stack, and fall back
* to the heap if necessary.
*/
struct flexibleBuffer {
int8_t *curBuf;
int8_t *allocBuf;
int8_t stackBuf[8196];
};
static jthrowable flexBufInit(JNIEnv *env, struct flexibleBuffer *flexBuf, jint length)
{
flexBuf->curBuf = flexBuf->allocBuf = NULL;
if (length < sizeof(flexBuf->stackBuf)) {
flexBuf->curBuf = flexBuf->stackBuf;
return NULL;
}
flexBuf->allocBuf = malloc(length);
if (!flexBuf->allocBuf) {
return newException(env, "java/lang/OutOfMemoryError",
"OOM allocating space for %d bytes of data.", length);
}
flexBuf->curBuf = flexBuf->allocBuf;
return NULL;
}
static void flexBufFree(struct flexibleBuffer *flexBuf)
{
free(flexBuf->allocBuf);
}
static jthrowable setup(JNIEnv *env, int *ofd, jobject jpath, int doConnect)
{
const char *cpath = NULL;
struct sockaddr_un addr;
jthrowable jthr = NULL;
int fd = -1, ret;
fd = socket(PF_UNIX, SOCK_STREAM, 0);
if (fd < 0) {
ret = errno;
jthr = newSocketException(env, ret,
"error creating UNIX domain socket with SOCK_STREAM: %s",
terror(ret));
goto done;
}
memset(&addr, 0, sizeof(addr));
addr.sun_family = AF_UNIX;
cpath = (*env)->GetStringUTFChars(env, jpath, NULL);
if (!cpath) {
jthr = (*env)->ExceptionOccurred(env);
(*env)->ExceptionClear(env);
goto done;
}
ret = snprintf(addr.sun_path, sizeof(addr.sun_path),
"%s", cpath);
if (ret < 0) {
ret = errno;
jthr = newSocketException(env, EIO,
"error computing UNIX domain socket path: error %d (%s)",
ret, terror(ret));
goto done;
}
if (ret >= sizeof(addr.sun_path)) {
jthr = newSocketException(env, ENAMETOOLONG,
"error computing UNIX domain socket path: path too long. "
"The longest UNIX domain socket path possible on this host "
"is %zd bytes.", sizeof(addr.sun_path) - 1);
goto done;
}
if (doConnect) {
RETRY_ON_EINTR(ret, connect(fd,
(struct sockaddr*)&addr, sizeof(addr)));
if (ret < 0) {
ret = errno;
jthr = newException(env, "java/net/ConnectException",
"connect(2) error: %s when trying to connect to '%s'",
terror(ret), addr.sun_path);
goto done;
}
} else {
RETRY_ON_EINTR(ret, unlink(addr.sun_path));
RETRY_ON_EINTR(ret, bind(fd, (struct sockaddr*)&addr, sizeof(addr)));
if (ret < 0) {
ret = errno;
jthr = newException(env, "java/net/BindException",
"bind(2) error: %s when trying to bind to '%s'",
terror(ret), addr.sun_path);
goto done;
}
/* We need to make the socket readable and writable for all users in the
* system.
*
* If the system administrator doesn't want the socket to be accessible to
* all users, he can simply adjust the +x permissions on one of the socket's
* parent directories.
*
* See HDFS-4485 for more discussion.
*/
if (chmod(addr.sun_path, 0666)) {
ret = errno;
jthr = newException(env, "java/net/BindException",
"chmod(%s, 0666) failed: %s", addr.sun_path, terror(ret));
goto done;
}
if (listen(fd, LISTEN_BACKLOG) < 0) {
ret = errno;
jthr = newException(env, "java/net/BindException",
"listen(2) error: %s when trying to listen to '%s'",
terror(ret), addr.sun_path);
goto done;
}
}
done:
if (cpath) {
(*env)->ReleaseStringUTFChars(env, jpath, cpath);
}
if (jthr) {
if (fd > 0) {
RETRY_ON_EINTR(ret, close(fd));
fd = -1;
}
} else {
*ofd = fd;
}
return jthr;
}
JNIEXPORT void JNICALL
Java_org_apache_hadoop_net_unix_DomainSocket_anchorNative(
JNIEnv *env, jclass clazz)
{
fd_init(env); // for fd_get, fd_create, etc.
}
JNIEXPORT void JNICALL
Java_org_apache_hadoop_net_unix_DomainSocket_validateSocketPathSecurity0(
JNIEnv *env, jclass clazz, jobject jstr, jint skipComponents)
{
jint utfLength;
char path[PATH_MAX], check[PATH_MAX], *token, *rest;
struct stat st;
int ret, mode, strlenPath;
uid_t uid;
jthrowable jthr = NULL;
utfLength = (*env)->GetStringUTFLength(env, jstr);
if (utfLength > sizeof(path)) {
jthr = newIOException(env, "path is too long! We expected a path "
"no longer than %zd UTF-8 bytes.", sizeof(path));
goto done;
}
(*env)->GetStringUTFRegion(env, jstr, 0, utfLength, path);
jthr = (*env)->ExceptionOccurred(env);
if (jthr) {
(*env)->ExceptionClear(env);
goto done;
}
uid = geteuid();
strlenPath = strlen(path);
if (strlenPath == 0) {
jthr = newIOException(env, "socket path is empty.");
goto done;
}
if (path[strlenPath - 1] == '/') {
/* It makes no sense to have a socket path that ends in a slash, since
* sockets are not directories. */
jthr = newIOException(env, "bad socket path '%s'. The socket path "
"must not end in a slash.", path);
goto done;
}
// This loop iterates through all of the path components except for the very
// last one. We don't validate the last component, since it's not supposed to
// be a directory. (If it is a directory, we will fail to create the socket
// later with EISDIR or similar.)
for (check[0] = '/', check[1] = '\0', rest = path, token = "";
token && rest[0];
token = strtok_r(rest, "/", &rest)) {
if (strcmp(check, "/") != 0) {
// If the previous directory we checked was '/', we skip appending another
// slash to the end because it would be unnecessary. Otherwise we do it.
strcat(check, "/");
}
// These strcats are safe because the length of 'check' is the same as the
// length of 'path' and we never add more slashes than were in the original
// path.
strcat(check, token);
if (skipComponents > 0) {
skipComponents--;
continue;
}
if (stat(check, &st) < 0) {
ret = errno;
jthr = newIOException(env, "failed to stat a path component: '%s'. "
"error code %d (%s)", check, ret, terror(ret));
goto done;
}
mode = st.st_mode & 0777;
if (mode & 0002) {
jthr = newIOException(env, "the path component: '%s' is "
"world-writable. Its permissions are 0%03o. Please fix "
"this or select a different socket path.", check, mode);
goto done;
}
if ((mode & 0020) && (st.st_gid != 0)) {
jthr = newIOException(env, "the path component: '%s' is "
"group-writable, and the group is not root. Its permissions are "
"0%03o, and it is owned by gid %d. Please fix this or "
"select a different socket path.", check, mode, st.st_gid);
goto done;
}
if ((mode & 0200) && (st.st_uid != 0) &&
(st.st_uid != uid)) {
jthr = newIOException(env, "the path component: '%s' is "
"owned by a user who is not root and not you. Your effective user "
"id is %d; the path is owned by user id %d, and its permissions are "
"0%03o. Please fix this or select a different socket path.",
check, uid, st.st_uid, mode);
goto done;
}
}
done:
if (jthr) {
(*env)->Throw(env, jthr);
}
}
JNIEXPORT jint JNICALL
Java_org_apache_hadoop_net_unix_DomainSocket_bind0(
JNIEnv *env, jclass clazz, jstring path)
{
int fd;
jthrowable jthr = NULL;
jthr = setup(env, &fd, path, 0);
if (jthr) {
(*env)->Throw(env, jthr);
}
return fd;
}
JNIEXPORT jint JNICALL
Java_org_apache_hadoop_net_unix_DomainSocket_accept0(
JNIEnv *env, jclass clazz, jint fd)
{
int ret, newFd = -1;
socklen_t slen;
struct sockaddr_un remote;
jthrowable jthr = NULL;
slen = sizeof(remote);
do {
newFd = accept(fd, (struct sockaddr*)&remote, &slen);
} while ((newFd < 0) && (errno == EINTR));
if (newFd < 0) {
ret = errno;
jthr = newSocketException(env, ret, "accept(2) error: %s", terror(ret));
goto done;
}
done:
if (jthr) {
if (newFd > 0) {
RETRY_ON_EINTR(ret, close(newFd));
newFd = -1;
}
(*env)->Throw(env, jthr);
}
return newFd;
}
JNIEXPORT jint JNICALL
Java_org_apache_hadoop_net_unix_DomainSocket_connect0(
JNIEnv *env, jclass clazz, jstring path)
{
int ret, fd;
jthrowable jthr = NULL;
jthr = setup(env, &fd, path, 1);
if (jthr) {
(*env)->Throw(env, jthr);
return -1;
}
if (((jthr = setAttribute0(env, fd, SEND_TIMEOUT, DEFAULT_SEND_TIMEOUT))) ||
((jthr = setAttribute0(env, fd, RECEIVE_TIMEOUT, DEFAULT_RECEIVE_TIMEOUT)))) {
RETRY_ON_EINTR(ret, close(fd));
(*env)->Throw(env, jthr);
return -1;
}
return fd;
}
static void javaMillisToTimeVal(int javaMillis, struct timeval *tv)
{
tv->tv_sec = javaMillis / 1000;
tv->tv_usec = (javaMillis - (tv->tv_sec * 1000)) * 1000;
}
static jthrowable setAttribute0(JNIEnv *env, jint fd, jint type, jint val)
{
struct timeval tv;
int ret, buf;
switch (type) {
case SEND_BUFFER_SIZE:
buf = val;
if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &buf, sizeof(buf))) {
ret = errno;
return newSocketException(env, ret,
"setsockopt(SO_SNDBUF) error: %s", terror(ret));
}
return NULL;
case RECEIVE_BUFFER_SIZE:
buf = val;
if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &buf, sizeof(buf))) {
ret = errno;
return newSocketException(env, ret,
"setsockopt(SO_RCVBUF) error: %s", terror(ret));
}
return NULL;
case SEND_TIMEOUT:
javaMillisToTimeVal(val, &tv);
if (setsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, (struct timeval *)&tv,
sizeof(tv))) {
ret = errno;
return newSocketException(env, ret,
"setsockopt(SO_SNDTIMEO) error: %s", terror(ret));
}
return NULL;
case RECEIVE_TIMEOUT:
javaMillisToTimeVal(val, &tv);
if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, (struct timeval *)&tv,
sizeof(tv))) {
ret = errno;
return newSocketException(env, ret,
"setsockopt(SO_RCVTIMEO) error: %s", terror(ret));
}
return NULL;
default:
break;
}
return newRuntimeException(env, "Invalid attribute type %d.", type);
}
JNIEXPORT void JNICALL
Java_org_apache_hadoop_net_unix_DomainSocket_setAttribute0(
JNIEnv *env, jclass clazz, jint fd, jint type, jint val)
{
jthrowable jthr = setAttribute0(env, fd, type, val);
if (jthr) {
(*env)->Throw(env, jthr);
}
}
static jint getSockOptBufSizeToJavaBufSize(int size)
{
#ifdef __linux__
// Linux always doubles the value that you set with setsockopt.
// We cut it in half here so that programs can at least read back the same
// value they set.
size /= 2;
#endif
return size;
}
static int timeValToJavaMillis(const struct timeval *tv)
{
return (tv->tv_sec * 1000) + (tv->tv_usec / 1000);
}
JNIEXPORT jint JNICALL
Java_org_apache_hadoop_net_unix_DomainSocket_getAttribute0(
JNIEnv *env, jclass clazz, jint fd, jint type)
{
struct timeval tv;
socklen_t len;
int ret, rval = 0;
switch (type) {
case SEND_BUFFER_SIZE:
len = sizeof(rval);
if (getsockopt(fd, SOL_SOCKET, SO_SNDBUF, &rval, &len)) {
ret = errno;
(*env)->Throw(env, newSocketException(env, ret,
"getsockopt(SO_SNDBUF) error: %s", terror(ret)));
return -1;
}
return getSockOptBufSizeToJavaBufSize(rval);
case RECEIVE_BUFFER_SIZE:
len = sizeof(rval);
if (getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rval, &len)) {
ret = errno;
(*env)->Throw(env, newSocketException(env, ret,
"getsockopt(SO_RCVBUF) error: %s", terror(ret)));
return -1;
}
return getSockOptBufSizeToJavaBufSize(rval);
case SEND_TIMEOUT:
memset(&tv, 0, sizeof(tv));
len = sizeof(struct timeval);
if (getsockopt(fd, SOL_SOCKET, SO_SNDTIMEO, &tv, &len)) {
ret = errno;
(*env)->Throw(env, newSocketException(env, ret,
"getsockopt(SO_SNDTIMEO) error: %s", terror(ret)));
return -1;
}
return timeValToJavaMillis(&tv);
case RECEIVE_TIMEOUT:
memset(&tv, 0, sizeof(tv));
len = sizeof(struct timeval);
if (getsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, &len)) {
ret = errno;
(*env)->Throw(env, newSocketException(env, ret,
"getsockopt(SO_RCVTIMEO) error: %s", terror(ret)));
return -1;
}
return timeValToJavaMillis(&tv);
default:
(*env)->Throw(env, newRuntimeException(env,
"Invalid attribute type %d.", type));
return -1;
}
}
JNIEXPORT void JNICALL
Java_org_apache_hadoop_net_unix_DomainSocket_close0(
JNIEnv *env, jclass clazz, jint fd)
{
int ret;
RETRY_ON_EINTR(ret, close(fd));
if (ret) {
ret = errno;
(*env)->Throw(env, newSocketException(env, ret,
"close(2) error: %s", terror(ret)));
}
}
JNIEXPORT void JNICALL
Java_org_apache_hadoop_net_unix_DomainSocket_closeFileDescriptor0(
JNIEnv *env, jclass clazz, jobject jfd)
{
Java_org_apache_hadoop_net_unix_DomainSocket_close0(
env, clazz, fd_get(env, jfd));
}
JNIEXPORT void JNICALL
Java_org_apache_hadoop_net_unix_DomainSocket_shutdown0(
JNIEnv *env, jclass clazz, jint fd)
{
int ret;
RETRY_ON_EINTR(ret, shutdown(fd, SHUT_RDWR));
if (ret) {
ret = errno;
(*env)->Throw(env, newSocketException(env, ret,
"shutdown(2) error: %s", terror(ret)));
}
}
/**
* Write an entire buffer to a file descriptor.
*
* @param env The JNI environment.
* @param fd The fd to write to.
* @param buf The buffer to write
* @param amt The length of the buffer to write.
* @return NULL on success; or the unraised exception representing
* the problem.
*/
static jthrowable write_fully(JNIEnv *env, int fd, int8_t *buf, int amt)
{
int err, res;
while (amt > 0) {
res = write(fd, buf, amt);
if (res < 0) {
err = errno;
if (err == EINTR) {
continue;
}
return newSocketException(env, err, "write(2) error: %s", terror(err));
}
amt -= res;
buf += res;
}
return NULL;
}
/**
* Our auxiliary data setup.
*
* See man 3 cmsg for more information about auxiliary socket data on UNIX.
*
* We use __attribute__((packed)) to ensure that the compiler doesn't insert any
* padding between 'hdr' and 'fds'.
* We use __attribute__((aligned(8))) to ensure that the compiler puts the start
* of the structure at an address which is a multiple of 8. If we did not do
* this, the attribute((packed)) would cause the compiler to generate a lot of
* slow code for accessing unaligned memory.
*/
struct cmsghdr_with_fds {
struct cmsghdr hdr;
int fds[MAX_PASSED_FDS];
} __attribute__((packed,aligned(8)));
JNIEXPORT void JNICALL
Java_org_apache_hadoop_net_unix_DomainSocket_sendFileDescriptors0(
JNIEnv *env, jclass clazz, jint fd, jobject jfds, jobject jbuf,
jint offset, jint length)
{
struct iovec vec[1];
struct flexibleBuffer flexBuf;
struct cmsghdr_with_fds aux;
jint jfdsLen;
int i, ret = -1, auxLen;
struct msghdr socketMsg;
jthrowable jthr = NULL;
jthr = flexBufInit(env, &flexBuf, length);
if (jthr) {
goto done;
}
if (length <= 0) {
jthr = newException(env, "java/lang/IllegalArgumentException",
"You must write at least one byte.");
goto done;
}
jfdsLen = (*env)->GetArrayLength(env, jfds);
if (jfdsLen <= 0) {
jthr = newException(env, "java/lang/IllegalArgumentException",
"Called sendFileDescriptors with no file descriptors.");
goto done;
} else if (jfdsLen > MAX_PASSED_FDS) {
jthr = newException(env, "java/lang/IllegalArgumentException",
"Called sendFileDescriptors with an array of %d length. "
"The maximum is %d.", jfdsLen, MAX_PASSED_FDS);
jfdsLen = 0;
goto done;
}
(*env)->GetByteArrayRegion(env, jbuf, offset, length, flexBuf.curBuf);
jthr = (*env)->ExceptionOccurred(env);
if (jthr) {
(*env)->ExceptionClear(env);
goto done;
}
memset(&vec, 0, sizeof(vec));
vec[0].iov_base = flexBuf.curBuf;
vec[0].iov_len = length;
auxLen = CMSG_LEN(jfdsLen * sizeof(int));
memset(&aux, 0, auxLen);
memset(&socketMsg, 0, sizeof(socketMsg));
socketMsg.msg_iov = vec;
socketMsg.msg_iovlen = 1;
socketMsg.msg_control = &aux;
socketMsg.msg_controllen = auxLen;
aux.hdr.cmsg_len = auxLen;
aux.hdr.cmsg_level = SOL_SOCKET;
aux.hdr.cmsg_type = SCM_RIGHTS;
for (i = 0; i < jfdsLen; i++) {
jobject jfd = (*env)->GetObjectArrayElement(env, jfds, i);
if (!jfd) {
jthr = (*env)->ExceptionOccurred(env);
if (jthr) {
(*env)->ExceptionClear(env);
goto done;
}
jthr = newException(env, "java/lang/NullPointerException",
"element %d of jfds was NULL.", i);
goto done;
}
aux.fds[i] = fd_get(env, jfd);
(*env)->DeleteLocalRef(env, jfd);
if (jthr) {
goto done;
}
}
RETRY_ON_EINTR(ret, sendmsg(fd, &socketMsg, 0));
if (ret < 0) {
ret = errno;
jthr = newSocketException(env, ret, "sendmsg(2) error: %s", terror(ret));
goto done;
}
length -= ret;
if (length > 0) {
// Write the rest of the bytes we were asked to send.
// This time, no fds will be attached.
jthr = write_fully(env, fd, flexBuf.curBuf + ret, length);
if (jthr) {
goto done;
}
}
done:
flexBufFree(&flexBuf);
if (jthr) {
(*env)->Throw(env, jthr);
}
}
JNIEXPORT jint JNICALL
Java_org_apache_hadoop_net_unix_DomainSocket_receiveFileDescriptors0(
JNIEnv *env, jclass clazz, jint fd, jarray jfds, jarray jbuf,
jint offset, jint length)
{
struct iovec vec[1];
struct flexibleBuffer flexBuf;
struct cmsghdr_with_fds aux;
int i, jRecvFdsLen = 0, auxLen;
jint jfdsLen = 0;
struct msghdr socketMsg;
ssize_t bytesRead = -1;
jobject fdObj;
jthrowable jthr = NULL;
jthr = flexBufInit(env, &flexBuf, length);
if (jthr) {
goto done;
}
if (length <= 0) {
jthr = newRuntimeException(env, "You must read at least one byte.");
goto done;
}
jfdsLen = (*env)->GetArrayLength(env, jfds);
if (jfdsLen <= 0) {
jthr = newException(env, "java/lang/IllegalArgumentException",
"Called receiveFileDescriptors with an array of %d length. "
"You must pass at least one fd.", jfdsLen);
goto done;
} else if (jfdsLen > MAX_PASSED_FDS) {
jthr = newException(env, "java/lang/IllegalArgumentException",
"Called receiveFileDescriptors with an array of %d length. "
"The maximum is %d.", jfdsLen, MAX_PASSED_FDS);
jfdsLen = 0; /* keep the cleanup path from walking the oversized array */
goto done;
}
for (i = 0; i < jfdsLen; i++) {
(*env)->SetObjectArrayElement(env, jfds, i, NULL);
}
vec[0].iov_base = flexBuf.curBuf;
vec[0].iov_len = length;
auxLen = CMSG_LEN(jfdsLen * sizeof(int));
memset(&aux, 0, auxLen);
memset(&socketMsg, 0, sizeof(socketMsg));
socketMsg.msg_iov = vec;
socketMsg.msg_iovlen = 1;
socketMsg.msg_control = &aux;
socketMsg.msg_controllen = auxLen;
aux.hdr.cmsg_len = auxLen;
aux.hdr.cmsg_level = SOL_SOCKET;
aux.hdr.cmsg_type = SCM_RIGHTS;
RETRY_ON_EINTR(bytesRead, recvmsg(fd, &socketMsg, 0));
if (bytesRead < 0) {
int ret = errno;
if (ret == ECONNABORTED) {
// The remote peer disconnected on us. Treat this as an EOF.
bytesRead = -1;
goto done;
}
jthr = newSocketException(env, ret, "recvmsg(2) failed: %s",
terror(ret));
goto done;
} else if (bytesRead == 0) {
bytesRead = -1;
goto done;
}
jRecvFdsLen = (aux.hdr.cmsg_len - sizeof(struct cmsghdr)) / sizeof(int);
for (i = 0; i < jRecvFdsLen; i++) {
fdObj = fd_create(env, aux.fds[i]);
if (!fdObj) {
jthr = (*env)->ExceptionOccurred(env);
(*env)->ExceptionClear(env);
goto done;
}
// Make this -1 so we don't attempt to close it twice in an error path.
aux.fds[i] = -1;
(*env)->SetObjectArrayElement(env, jfds, i, fdObj);
// There is no point keeping around a local reference to the fdObj.
// The array continues to reference it.
(*env)->DeleteLocalRef(env, fdObj);
}
(*env)->SetByteArrayRegion(env, jbuf, offset, length, flexBuf.curBuf);
jthr = (*env)->ExceptionOccurred(env);
if (jthr) {
(*env)->ExceptionClear(env);
goto done;
}
done:
flexBufFree(&flexBuf);
if (jthr) {
// Free any FileDescriptor references we may have created,
// or file descriptors we may have been passed.
for (i = 0; i < jRecvFdsLen; i++) {
if (aux.fds[i] >= 0) {
int ret;
RETRY_ON_EINTR(ret, close(aux.fds[i])); /* use a scratch variable, not the loop index */
aux.fds[i] = -1;
}
fdObj = (*env)->GetObjectArrayElement(env, jfds, i);
if (fdObj) {
int ret, afd = fd_get(env, fdObj);
if (afd >= 0) {
RETRY_ON_EINTR(ret, close(afd));
}
(*env)->SetObjectArrayElement(env, jfds, i, NULL);
(*env)->DeleteLocalRef(env, fdObj);
}
}
(*env)->Throw(env, jthr);
}
return bytesRead;
}
JNIEXPORT jint JNICALL
Java_org_apache_hadoop_net_unix_DomainSocket_readArray0(
JNIEnv *env, jclass clazz, jint fd, jarray b, jint offset, jint length)
{
int ret = -1;
struct flexibleBuffer flexBuf;
jthrowable jthr;
jthr = flexBufInit(env, &flexBuf, length);
if (jthr) {
goto done;
}
RETRY_ON_EINTR(ret, read(fd, flexBuf.curBuf, length));
if (ret < 0) {
ret = errno;
if (ret == ECONNABORTED) {
// The remote peer disconnected on us. Treat this as an EOF.
ret = -1;
goto done;
}
jthr = newSocketException(env, ret, "read(2) error: %s",
terror(ret));
goto done;
}
if (ret == 0) {
goto done;
}
(*env)->SetByteArrayRegion(env, b, offset, ret, flexBuf.curBuf);
jthr = (*env)->ExceptionOccurred(env);
if (jthr) {
(*env)->ExceptionClear(env);
goto done;
}
done:
flexBufFree(&flexBuf);
if (jthr) {
(*env)->Throw(env, jthr);
}
return ret == 0 ? -1 : ret; // Java wants -1 on EOF
}
JNIEXPORT jint JNICALL
Java_org_apache_hadoop_net_unix_DomainSocket_available0(
JNIEnv *env, jclass clazz, jint fd)
{
int ret, avail = 0;
jthrowable jthr = NULL;
RETRY_ON_EINTR(ret, ioctl(fd, FIONREAD, &avail));
if (ret < 0) {
ret = errno;
jthr = newSocketException(env, ret,
"ioctl(%d, FIONREAD) error: %s", fd, terror(ret));
goto done;
}
done:
if (jthr) {
(*env)->Throw(env, jthr);
}
return avail;
}
JNIEXPORT void JNICALL
Java_org_apache_hadoop_net_unix_DomainSocket_writeArray0(
JNIEnv *env, jclass clazz, jint fd, jarray b, jint offset, jint length)
{
struct flexibleBuffer flexBuf;
jthrowable jthr;
jthr = flexBufInit(env, &flexBuf, length);
if (jthr) {
goto done;
}
(*env)->GetByteArrayRegion(env, b, offset, length, flexBuf.curBuf);
jthr = (*env)->ExceptionOccurred(env);
if (jthr) {
(*env)->ExceptionClear(env);
goto done;
}
jthr = write_fully(env, fd, flexBuf.curBuf, length);
if (jthr) {
goto done;
}
done:
flexBufFree(&flexBuf);
if (jthr) {
(*env)->Throw(env, jthr);
}
}
JNIEXPORT jint JNICALL
Java_org_apache_hadoop_net_unix_DomainSocket_readByteBufferDirect0(
JNIEnv *env, jclass clazz, jint fd, jobject dst, jint position, jint remaining)
{
uint8_t *buf;
jthrowable jthr = NULL;
int res = -1;
buf = (*env)->GetDirectBufferAddress(env, dst);
if (!buf) {
jthr = newRuntimeException(env, "GetDirectBufferAddress failed.");
goto done;
}
RETRY_ON_EINTR(res, read(fd, buf + position, remaining));
if (res < 0) {
res = errno;
if (res != ECONNABORTED) {
jthr = newSocketException(env, res, "read(2) error: %s",
terror(res));
goto done;
} else {
// The remote peer disconnected on us. Treat this as an EOF.
res = -1;
}
}
done:
if (jthr) {
(*env)->Throw(env, jthr);
}
return res;
}

View File

@@ -180,6 +180,10 @@ static FARPROC WINAPI do_dlsym(JNIEnv *env, HMODULE handle, LPCSTR symbol) {
THROW(env, "java/lang/InternalError", exception_msg); \
}
#define RETRY_ON_EINTR(ret, expr) do { \
ret = expr; \
} while ((ret == -1) && (errno == EINTR));
#endif
//vim: sw=2: ts=2: et

View File

@@ -0,0 +1,524 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.io.compress;
import static org.junit.Assert.assertArrayEquals;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.lz4.Lz4Compressor;
import org.apache.hadoop.io.compress.snappy.SnappyCompressor;
import org.apache.hadoop.io.compress.zlib.BuiltInZlibDeflater;
import org.apache.hadoop.io.compress.zlib.ZlibCompressor;
import org.apache.hadoop.io.compress.zlib.ZlibFactory;
import org.apache.hadoop.util.NativeCodeLoader;
import org.apache.log4j.Logger;
import org.junit.Assert;
import com.google.common.base.Joiner;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
public class CompressDecompressTester<T extends Compressor, E extends Decompressor> {
private static final Logger logger = Logger
.getLogger(CompressDecompressTester.class);
private final byte[] originalRawData;
private ImmutableList<TesterPair<T, E>> pairs = ImmutableList.of();
private ImmutableList.Builder<TesterPair<T, E>> builder = ImmutableList.builder();
private ImmutableSet<CompressionTestStrategy> strategies = ImmutableSet.of();
private PreAssertionTester<T, E> assertionDelegate;
public CompressDecompressTester(byte[] originalRawData) {
this.originalRawData = Arrays.copyOf(originalRawData,
originalRawData.length);
this.assertionDelegate = new PreAssertionTester<T, E>() {
@Override
public ImmutableList<TesterPair<T, E>> filterOnAssumeWhat(
ImmutableList<TesterPair<T, E>> pairs) {
ImmutableList.Builder<TesterPair<T, E>> builder = ImmutableList
.builder();
for (TesterPair<T, E> pair : pairs) {
if (isAvailable(pair))
builder.add(pair);
}
return builder.build();
}
};
}
private static boolean isNativeSnappyLoadable() {
boolean snappyAvailable = false;
boolean loaded = false;
try {
System.loadLibrary("snappy");
logger.warn("Snappy native library is available");
snappyAvailable = true;
boolean hadoopNativeAvailable = NativeCodeLoader.isNativeCodeLoaded();
loaded = snappyAvailable && hadoopNativeAvailable;
if (loaded) {
logger.info("Snappy native library loaded");
} else {
logger.warn("Snappy native library not loaded");
}
} catch (Throwable t) {
logger.warn("Failed to load snappy: ", t);
return false;
}
return loaded;
}
public static <T extends Compressor, E extends Decompressor> CompressDecompressTester<T, E> of(
byte[] rawData) {
return new CompressDecompressTester<T, E>(rawData);
}
public CompressDecompressTester<T, E> withCompressDecompressPair(
T compressor, E decompressor) {
addPair(
compressor,
decompressor,
Joiner.on("_").join(compressor.getClass().getCanonicalName(),
decompressor.getClass().getCanonicalName()));
return this;
}
public CompressDecompressTester<T, E> withTestCases(
ImmutableSet<CompressionTestStrategy> strategies) {
this.strategies = ImmutableSet.copyOf(strategies);
return this;
}
private void addPair(T compressor, E decompressor, String name) {
builder.add(new TesterPair<T, E>(name, compressor, decompressor));
}
public void test() throws InstantiationException, IllegalAccessException {
pairs = builder.build();
pairs = assertionDelegate.filterOnAssumeWhat(pairs);
for (TesterPair<T, E> pair : pairs) {
for (CompressionTestStrategy strategy : strategies) {
strategy.getTesterStrategy().assertCompression(pair.getName(),
pair.getCompressor(), pair.getDecompressor(),
Arrays.copyOf(originalRawData, originalRawData.length));
}
}
endAll(pairs);
}
private void endAll(ImmutableList<TesterPair<T, E>> pairs) {
for (TesterPair<T, E> pair : pairs)
pair.end();
}
interface PreAssertionTester<T extends Compressor, E extends Decompressor> {
ImmutableList<TesterPair<T, E>> filterOnAssumeWhat(
ImmutableList<TesterPair<T, E>> pairs);
}
public enum CompressionTestStrategy {
COMPRESS_DECOMPRESS_ERRORS(new TesterCompressionStrategy() {
private final Joiner joiner = Joiner.on("- ");
@Override
public void assertCompression(String name, Compressor compressor,
Decompressor decompressor, byte[] rawData) {
assertTrue(checkSetInputNullPointerException(compressor));
assertTrue(checkSetInputNullPointerException(decompressor));
assertTrue(checkCompressArrayIndexOutOfBoundsException(compressor,
rawData));
assertTrue(checkCompressArrayIndexOutOfBoundsException(decompressor,
rawData));
assertTrue(checkCompressNullPointerException(compressor, rawData));
assertTrue(checkCompressNullPointerException(decompressor, rawData));
assertTrue(checkSetInputArrayIndexOutOfBoundsException(compressor));
assertTrue(checkSetInputArrayIndexOutOfBoundsException(decompressor));
}
private boolean checkSetInputNullPointerException(Compressor compressor) {
try {
compressor.setInput(null, 0, 1);
} catch (NullPointerException npe) {
return true;
} catch (Exception ex) {
logger.error(joiner.join(compressor.getClass().getCanonicalName(),
"checkSetInputNullPointerException error !!!"));
}
return false;
}
private boolean checkCompressNullPointerException(Compressor compressor,
byte[] rawData) {
try {
compressor.setInput(rawData, 0, rawData.length);
compressor.compress(null, 0, 1);
} catch (NullPointerException npe) {
return true;
} catch (Exception ex) {
logger.error(joiner.join(compressor.getClass().getCanonicalName(),
"checkCompressNullPointerException error !!!"));
}
return false;
}
private boolean checkCompressNullPointerException(
Decompressor decompressor, byte[] rawData) {
try {
decompressor.setInput(rawData, 0, rawData.length);
decompressor.decompress(null, 0, 1);
} catch (NullPointerException npe) {
return true;
} catch (Exception ex) {
logger.error(joiner.join(decompressor.getClass().getCanonicalName(),
"checkCompressNullPointerException error !!!"));
}
return false;
}
private boolean checkSetInputNullPointerException(
Decompressor decompressor) {
try {
decompressor.setInput(null, 0, 1);
} catch (NullPointerException npe) {
return true;
} catch (Exception ex) {
logger.error(joiner.join(decompressor.getClass().getCanonicalName(),
"checkSetInputNullPointerException error !!!"));
}
return false;
}
private boolean checkSetInputArrayIndexOutOfBoundsException(
Compressor compressor) {
try {
compressor.setInput(new byte[] { (byte) 0 }, 0, -1);
} catch (ArrayIndexOutOfBoundsException e) {
return true;
} catch (Exception e) {
logger.error(joiner.join(compressor.getClass().getCanonicalName(),
"checkSetInputArrayIndexOutOfBoundsException error !!!"));
}
return false;
}
private boolean checkCompressArrayIndexOutOfBoundsException(
Compressor compressor, byte[] rawData) {
try {
compressor.setInput(rawData, 0, rawData.length);
compressor.compress(new byte[rawData.length], 0, -1);
} catch (ArrayIndexOutOfBoundsException e) {
return true;
} catch (Exception e) {
logger.error(joiner.join(compressor.getClass().getCanonicalName(),
"checkCompressArrayIndexOutOfBoundsException error !!!"));
}
return false;
}
private boolean checkCompressArrayIndexOutOfBoundsException(
Decompressor decompressor, byte[] rawData) {
try {
decompressor.setInput(rawData, 0, rawData.length);
decompressor.decompress(new byte[rawData.length], 0, -1);
} catch (ArrayIndexOutOfBoundsException e) {
return true;
} catch (Exception e) {
logger.error(joiner.join(decompressor.getClass().getCanonicalName(),
"checkCompressArrayIndexOutOfBoundsException error !!!"));
}
return false;
}
private boolean checkSetInputArrayIndexOutOfBoundsException(
Decompressor decompressor) {
try {
decompressor.setInput(new byte[] { (byte) 0 }, 0, -1);
} catch (ArrayIndexOutOfBoundsException e) {
return true;
} catch (Exception e) {
logger.error(joiner.join(decompressor.getClass().getCanonicalName(),
"checkNullPointerException error !!!"));
}
return false;
}
}),
COMPRESS_DECOMPRESS_SINGLE_BLOCK(new TesterCompressionStrategy() {
final Joiner joiner = Joiner.on("- ");
@Override
public void assertCompression(String name, Compressor compressor,
Decompressor decompressor, byte[] rawData) {
int cSize = 0;
int decompressedSize = 0;
byte[] compressedResult = new byte[rawData.length];
byte[] decompressedBytes = new byte[rawData.length];
try {
assertTrue(
joiner.join(name, "compressor.needsInput before error !!!"),
compressor.needsInput());
assertTrue(
joiner.join(name, "compressor.getBytesWritten before error !!!"),
compressor.getBytesWritten() == 0);
compressor.setInput(rawData, 0, rawData.length);
compressor.finish();
while (!compressor.finished()) {
cSize += compressor.compress(compressedResult, 0,
compressedResult.length);
}
compressor.reset();
assertTrue(
joiner.join(name, "decompressor.needsInput() before error !!!"),
decompressor.needsInput());
decompressor.setInput(compressedResult, 0, cSize);
assertFalse(
joiner.join(name, "decompressor.needsInput() after error !!!"),
decompressor.needsInput());
while (!decompressor.finished()) {
decompressedSize = decompressor.decompress(decompressedBytes, 0,
decompressedBytes.length);
}
decompressor.reset();
assertTrue(joiner.join(name, " byte size not equals error !!!"),
decompressedSize == rawData.length);
assertArrayEquals(
joiner.join(name, " byte arrays not equals error !!!"), rawData,
decompressedBytes);
} catch (Exception ex) {
fail(joiner.join(name, ex.getMessage()));
}
}
}),
COMPRESS_DECOMPRESS_WITH_EMPTY_STREAM(new TesterCompressionStrategy() {
final Joiner joiner = Joiner.on("- ");
final ImmutableMap<Class<? extends Compressor>, Integer> emptySize = ImmutableMap
.of(Lz4Compressor.class, 4, ZlibCompressor.class, 16,
SnappyCompressor.class, 4, BuiltInZlibDeflater.class, 16);
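// Expected compressed output size for an empty stream, per codec: the
// block codecs (lz4, snappy) emit just a 4-byte length header, while
// the zlib-based codecs produce 16 bytes (the framing plus, presumably,
// zlib's own stream header and trailer).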
@Override
void assertCompression(String name, Compressor compressor,
Decompressor decompressor, byte[] originalRawData) {
byte[] buf = null;
ByteArrayInputStream bytesIn = null;
BlockDecompressorStream blockDecompressorStream = null;
ByteArrayOutputStream bytesOut = new ByteArrayOutputStream();
// close without write
try {
compressor.reset();
BlockCompressorStream blockCompressorStream = new BlockCompressorStream(
bytesOut, compressor, 1024, 0);
blockCompressorStream.close();
// check compressed output
buf = bytesOut.toByteArray();
int emSize = emptySize.get(compressor.getClass());
Assert.assertEquals(
joiner.join(name, "empty stream compressed output size != "
+ emSize), emSize, buf.length);
// use compressed output as input for decompression
bytesIn = new ByteArrayInputStream(buf);
// create decompression stream
blockDecompressorStream = new BlockDecompressorStream(bytesIn,
decompressor, 1024);
// no byte is available because stream was closed
assertEquals(joiner.join(name, " return value is not -1"), -1,
blockDecompressorStream.read());
} catch (IOException e) {
fail(joiner.join(name, e.getMessage()));
} finally {
if (blockDecompressorStream != null) {
try {
bytesOut.close();
bytesIn.close();
blockDecompressorStream.close();
} catch (IOException e) {
// ignore close failures during cleanup
}
}
}
}
}),
COMPRESS_DECOMPRESS_BLOCK(new TesterCompressionStrategy() {
private final Joiner joiner = Joiner.on("- ");
private static final int BLOCK_SIZE = 512;
private final byte[] operationBlock = new byte[BLOCK_SIZE];
// Use default of 512 as bufferSize and compressionOverhead of
// (1% of bufferSize + 12 bytes) = 18 bytes (zlib algorithm).
private static final int overheadSpace = BLOCK_SIZE / 100 + 12;
@Override
public void assertCompression(String name, Compressor compressor,
Decompressor decompressor, byte[] originalRawData) {
int off = 0;
int len = originalRawData.length;
int maxSize = BLOCK_SIZE - overheadSpace;
int compressSize = 0;
List<Integer> blockLabels = new ArrayList<Integer>();
ByteArrayOutputStream compressedOut = new ByteArrayOutputStream();
ByteArrayOutputStream decompressOut = new ByteArrayOutputStream();
try {
if (originalRawData.length > maxSize) {
do {
int bufLen = Math.min(len, maxSize);
compressor.setInput(originalRawData, off, bufLen);
compressor.finish();
while (!compressor.finished()) {
compressSize = compressor.compress(operationBlock, 0,
operationBlock.length);
compressedOut.write(operationBlock, 0, compressSize);
blockLabels.add(compressSize);
}
compressor.reset();
off += bufLen;
len -= bufLen;
} while (len > 0);
}
off = 0;
// compressed bytes
byte[] compressedBytes = compressedOut.toByteArray();
for (Integer step : blockLabels) {
decompressor.setInput(compressedBytes, off, step);
while (!decompressor.finished()) {
int dSize = decompressor.decompress(operationBlock, 0,
operationBlock.length);
decompressOut.write(operationBlock, 0, dSize);
}
decompressor.reset();
off = off + step;
}
assertArrayEquals(
joiner.join(name, "byte arrays not equals error !!!"),
originalRawData, decompressOut.toByteArray());
} catch (Exception ex) {
fail(joiner.join(name, ex.getMessage()));
} finally {
try {
compressedOut.close();
} catch (IOException e) {
}
try {
decompressOut.close();
} catch (IOException e) {
}
}
}
});
private final TesterCompressionStrategy testerStrategy;
CompressionTestStrategy(TesterCompressionStrategy testStrategy) {
this.testerStrategy = testStrategy;
}
public TesterCompressionStrategy getTesterStrategy() {
return testerStrategy;
}
}
static final class TesterPair<T extends Compressor, E extends Decompressor> {
private final T compressor;
private final E decompressor;
private final String name;
TesterPair(String name, T compressor, E decompressor) {
this.compressor = compressor;
this.decompressor = decompressor;
this.name = name;
}
public void end() {
Configuration cfg = new Configuration();
compressor.reinit(cfg);
compressor.end();
decompressor.end();
}
public T getCompressor() {
return compressor;
}
public E getDecompressor() {
return decompressor;
}
public String getName() {
return name;
}
}
/**
* Method for compressor availability check
*/
private static <T extends Compressor, E extends Decompressor> boolean isAvailable(TesterPair<T, E> pair) {
Compressor compressor = pair.compressor;
if (compressor.getClass().isAssignableFrom(Lz4Compressor.class)
&& (NativeCodeLoader.isNativeCodeLoaded()))
return true;
else if (compressor.getClass().isAssignableFrom(BuiltInZlibDeflater.class)
&& NativeCodeLoader.isNativeCodeLoaded())
return true;
else if (compressor.getClass().isAssignableFrom(ZlibCompressor.class)) {
return ZlibFactory.isNativeZlibLoaded(new Configuration());
}
else if (compressor.getClass().isAssignableFrom(SnappyCompressor.class)
&& isNativeSnappyLoadable())
return true;
return false;
}
abstract static class TesterCompressionStrategy {
protected final Logger logger = Logger.getLogger(getClass());
abstract void assertCompression(String name, Compressor compressor,
Decompressor decompressor, byte[] originalRawData);
}
}

View File

@ -0,0 +1,101 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.io.compress;
import static org.junit.Assert.fail;
import java.util.Random;
import org.apache.hadoop.io.compress.CompressDecompressTester.CompressionTestStrategy;
import org.apache.hadoop.io.compress.lz4.Lz4Compressor;
import org.apache.hadoop.io.compress.lz4.Lz4Decompressor;
import org.apache.hadoop.io.compress.snappy.SnappyCompressor;
import org.apache.hadoop.io.compress.snappy.SnappyDecompressor;
import org.apache.hadoop.io.compress.zlib.BuiltInZlibDeflater;
import org.apache.hadoop.io.compress.zlib.BuiltInZlibInflater;
import org.junit.Test;
import com.google.common.collect.ImmutableSet;
/**
* Test for pairs:
* <pre>
* SnappyCompressor/SnappyDecompressor
* Lz4Compressor/Lz4Decompressor
* BuiltInZlibDeflater/new BuiltInZlibInflater
*
*
* Note: we can't use ZlibCompressor/ZlibDecompressor here
* because its constructors can throw an exception if the native libraries are not found.
* The ZlibCompressor/ZlibDecompressor pair is covered by {@code TestZlibCompressorDecompressor}
*
* </pre>
*
*/
public class TestCompressorDecompressor {
private static final Random rnd = new Random(12345L);
@Test
public void testCompressorDecompressor() {
// 44 KB of random data
int SIZE = 44 * 1024;
byte[] rawData = generate(SIZE);
try {
CompressDecompressTester.of(rawData)
.withCompressDecompressPair(new SnappyCompressor(), new SnappyDecompressor())
.withCompressDecompressPair(new Lz4Compressor(), new Lz4Decompressor())
.withCompressDecompressPair(new BuiltInZlibDeflater(), new BuiltInZlibInflater())
.withTestCases(ImmutableSet.of(CompressionTestStrategy.COMPRESS_DECOMPRESS_SINGLE_BLOCK,
CompressionTestStrategy.COMPRESS_DECOMPRESS_BLOCK,
CompressionTestStrategy.COMPRESS_DECOMPRESS_ERRORS,
CompressionTestStrategy.COMPRESS_DECOMPRESS_WITH_EMPTY_STREAM))
.test();
} catch (Exception ex) {
fail("testCompressorDecompressor error !!!" + ex);
}
}
@Test
public void testCompressorDecompressorWithExceedBufferLimit() {
int BYTE_SIZE = 100 * 1024;
byte[] rawData = generate(BYTE_SIZE);
try {
CompressDecompressTester.of(rawData)
.withCompressDecompressPair(
new SnappyCompressor(BYTE_SIZE + BYTE_SIZE / 2),
new SnappyDecompressor(BYTE_SIZE + BYTE_SIZE / 2))
.withCompressDecompressPair(new Lz4Compressor(BYTE_SIZE),
new Lz4Decompressor(BYTE_SIZE))
.withTestCases(ImmutableSet.of(CompressionTestStrategy.COMPRESS_DECOMPRESS_SINGLE_BLOCK,
CompressionTestStrategy.COMPRESS_DECOMPRESS_BLOCK,
CompressionTestStrategy.COMPRESS_DECOMPRESS_ERRORS,
CompressionTestStrategy.COMPRESS_DECOMPRESS_WITH_EMPTY_STREAM))
.test();
} catch (Exception ex) {
fail("testCompressorDecompressorWithExeedBufferLimit error !!!" + ex);
}
}
public static byte[] generate(int size) {
byte[] array = new byte[size];
for (int i = 0; i < size; i++)
array[i] = (byte) rnd.nextInt(16);
return array;
}
}
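The javadoc above excludes the ZlibCompressor/ZlibDecompressor pair because its constructors can fail when the native library is absent. A minimal sketch of the usual guard, using only the ZlibFactory and Configuration calls that appear elsewhere in this patch; the class name ZlibGuardSketch is hypothetical and not part of this change:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.Compressor;
import org.apache.hadoop.io.compress.Decompressor;
import org.apache.hadoop.io.compress.zlib.ZlibCompressor;
import org.apache.hadoop.io.compress.zlib.ZlibDecompressor;
import org.apache.hadoop.io.compress.zlib.ZlibFactory;

public class ZlibGuardSketch {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // Constructing ZlibCompressor/ZlibDecompressor can throw when the
    // native zlib library is missing, so check availability first.
    if (!ZlibFactory.isNativeZlibLoaded(conf)) {
      System.err.println("native zlib not loaded; skipping");
      return;
    }
    Compressor compressor = new ZlibCompressor();
    Decompressor decompressor = new ZlibDecompressor();
    try {
      // ... exercise the pair here ...
    } finally {
      compressor.end();   // release native resources
      decompressor.end();
    }
  }
}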

View File

@ -0,0 +1,316 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.io.compress.lz4;
import static org.junit.Assert.*;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.Random;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.compress.BlockCompressorStream;
import org.apache.hadoop.io.compress.BlockDecompressorStream;
import org.apache.hadoop.io.compress.CompressionInputStream;
import org.apache.hadoop.io.compress.CompressionOutputStream;
import org.apache.hadoop.io.compress.Lz4Codec;
import org.junit.Before;
import org.junit.Test;
import static org.junit.Assume.*;
public class TestLz4CompressorDecompressor {
private static final Random rnd = new Random(12345L);
@Before
public void before() {
assumeTrue(Lz4Codec.isNativeCodeLoaded());
}
//test on NullPointerException in {@code compressor.setInput()}
@Test
public void testCompressorSetInputNullPointerException() {
try {
Lz4Compressor compressor = new Lz4Compressor();
compressor.setInput(null, 0, 10);
fail("testCompressorSetInputNullPointerException error !!!");
} catch (NullPointerException ex) {
// expected
} catch (Exception e) {
fail("testCompressorSetInputNullPointerException ex error !!!");
}
}
//test on NullPointerException in {@code decompressor.setInput()}
@Test
public void testDecompressorSetInputNullPointerException() {
try {
Lz4Decompressor decompressor = new Lz4Decompressor();
decompressor.setInput(null, 0, 10);
fail("testDecompressorSetInputNullPointerException error !!!");
} catch (NullPointerException ex) {
// expected
} catch (Exception e) {
fail("testDecompressorSetInputNullPointerException ex error !!!");
}
}
//test on ArrayIndexOutOfBoundsException in {@code compressor.setInput()}
@Test
public void testCompressorSetInputAIOBException() {
try {
Lz4Compressor compressor = new Lz4Compressor();
compressor.setInput(new byte[] {}, -5, 10);
fail("testCompressorSetInputAIOBException error !!!");
} catch (ArrayIndexOutOfBoundsException ex) {
// expected
} catch (Exception ex) {
fail("testCompressorSetInputAIOBException ex error !!!");
}
}
//test on ArrayIndexOutOfBoundsException in {@code decompressor.setInput()}
@Test
public void testDecompressorSetInputAIOBException() {
try {
Lz4Decompressor decompressor = new Lz4Decompressor();
decompressor.setInput(new byte[] {}, -5, 10);
fail("testDecompressorSetInputAIOBException error !!!");
} catch (ArrayIndexOutOfBoundsException ex) {
// expected
} catch (Exception e) {
fail("testDecompressorSetInputAIOBException ex error !!!");
}
}
//test on NullPointerException in {@code compressor.compress()}
@Test
public void testCompressorCompressNullPointerException() {
try {
Lz4Compressor compressor = new Lz4Compressor();
byte[] bytes = generate(1024 * 6);
compressor.setInput(bytes, 0, bytes.length);
compressor.compress(null, 0, 0);
fail("testCompressorCompressNullPointerException error !!!");
} catch (NullPointerException ex) {
// expected
} catch (Exception e) {
fail("testCompressorCompressNullPointerException ex error !!!");
}
}
//test on NullPointerException in {@code decompressor.decompress()}
@Test
public void testDecompressorCompressNullPointerException() {
try {
Lz4Decompressor decompressor = new Lz4Decompressor();
byte[] bytes = generate(1024 * 6);
decompressor.setInput(bytes, 0, bytes.length);
decompressor.decompress(null, 0, 0);
fail("testDecompressorCompressNullPointerException error !!!");
} catch (NullPointerException ex) {
// expected
} catch (Exception e) {
fail("testDecompressorCompressNullPointerException ex error !!!");
}
}
//test on ArrayIndexOutOfBoundsException in {@code compressor.compress()}
@Test
public void testCompressorCompressAIOBException() {
try {
Lz4Compressor compressor = new Lz4Compressor();
byte[] bytes = generate(1024 * 6);
compressor.setInput(bytes, 0, bytes.length);
compressor.compress(new byte[] {}, 0, -1);
fail("testCompressorCompressAIOBException error !!!");
} catch (ArrayIndexOutOfBoundsException ex) {
// expected
} catch (Exception e) {
fail("testCompressorCompressAIOBException ex error !!!");
}
}
//test on ArrayIndexOutOfBoundsException in decompressor.decompress()
@Test
public void testDecompressorCompressAIOBException() {
try {
Lz4Decompressor decompressor = new Lz4Decompressor();
byte[] bytes = generate(1024 * 6);
decompressor.setInput(bytes, 0, bytes.length);
decompressor.decompress(new byte[] {}, 0, -1);
fail("testDecompressorCompressAIOBException error !!!");
} catch (ArrayIndexOutOfBoundsException ex) {
// expected
} catch (Exception e) {
fail("testDecompressorCompressAIOBException ex error !!!");
}
}
// test Lz4Compressor compressor.compress()
@Test
public void testSetInputWithBytesSizeMoreThanDefaultLz4CompressorBufferSize() {
int BYTES_SIZE = 1024 * 64 + 1;
try {
Lz4Compressor compressor = new Lz4Compressor();
byte[] bytes = generate(BYTES_SIZE);
assertTrue("needsInput error !!!", compressor.needsInput());
compressor.setInput(bytes, 0, bytes.length);
byte[] emptyBytes = new byte[BYTES_SIZE];
int csize = compressor.compress(emptyBytes, 0, bytes.length);
assertTrue(
"testSetInputWithBytesSizeMoreThenDefaultLz4CompressorByfferSize error !!!",
csize != 0);
} catch (Exception ex) {
fail("testSetInputWithBytesSizeMoreThenDefaultLz4CompressorByfferSize ex error !!!");
}
}
// test compress/decompress process
@Test
public void testCompressDecompress() {
int BYTE_SIZE = 1024 * 54;
byte[] bytes = generate(BYTE_SIZE);
Lz4Compressor compressor = new Lz4Compressor();
try {
compressor.setInput(bytes, 0, bytes.length);
assertTrue("Lz4CompressDecompress getBytesRead error !!!",
compressor.getBytesRead() > 0);
assertTrue(
"Lz4CompressDecompress getBytesWritten before compress error !!!",
compressor.getBytesWritten() == 0);
byte[] compressed = new byte[BYTE_SIZE];
int cSize = compressor.compress(compressed, 0, compressed.length);
assertTrue(
"Lz4CompressDecompress getBytesWritten after compress error !!!",
compressor.getBytesWritten() > 0);
Lz4Decompressor decompressor = new Lz4Decompressor();
// set as input for decompressor only compressed data indicated with cSize
decompressor.setInput(compressed, 0, cSize);
byte[] decompressed = new byte[BYTE_SIZE];
decompressor.decompress(decompressed, 0, decompressed.length);
assertTrue("testLz4CompressDecompress finished error !!!", decompressor.finished());
assertArrayEquals(bytes, decompressed);
compressor.reset();
decompressor.reset();
assertTrue("decompressor getRemaining error !!!",decompressor.getRemaining() == 0);
} catch (Exception e) {
fail("testLz4CompressDecompress ex error!!!");
}
}
// test compress/decompress with empty stream
@Test
public void testCompressorDecompressorEmptyStreamLogic() {
ByteArrayInputStream bytesIn = null;
ByteArrayOutputStream bytesOut = null;
byte[] buf = null;
BlockDecompressorStream blockDecompressorStream = null;
try {
// compress empty stream
bytesOut = new ByteArrayOutputStream();
BlockCompressorStream blockCompressorStream = new BlockCompressorStream(
bytesOut, new Lz4Compressor(), 1024, 0);
// close without write
blockCompressorStream.close();
// check compressed output
buf = bytesOut.toByteArray();
assertEquals("empty stream compressed output size != 4", 4, buf.length);
// use compressed output as input for decompression
bytesIn = new ByteArrayInputStream(buf);
// create decompression stream
blockDecompressorStream = new BlockDecompressorStream(bytesIn,
new Lz4Decompressor(), 1024);
// no byte is available because stream was closed
assertEquals("return value is not -1", -1, blockDecompressorStream.read());
} catch (Exception e) {
fail("testCompressorDecompressorEmptyStreamLogic ex error !!!"
+ e.getMessage());
} finally {
if (blockDecompressorStream != null)
try {
bytesIn.close();
bytesOut.close();
blockDecompressorStream.close();
} catch (IOException e) {
}
}
}
// test compress/decompress process through CompressionOutputStream/CompressionInputStream api
@Test
public void testCompressorDecompressorLogicWithCompressionStreams() {
DataOutputStream deflateOut = null;
DataInputStream inflateIn = null;
int BYTE_SIZE = 1024 * 100;
byte[] bytes = generate(BYTE_SIZE);
int bufferSize = 262144;
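// Worst-case expansion allowance for incompressible input; lz4's
// documented bound is roughly len + len/255 + 16 bytes, so
// bufferSize / 6 + 32 stays comfortably above it.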
int compressionOverhead = (bufferSize / 6) + 32;
try {
DataOutputBuffer compressedDataBuffer = new DataOutputBuffer();
CompressionOutputStream deflateFilter = new BlockCompressorStream(
compressedDataBuffer, new Lz4Compressor(bufferSize), bufferSize,
compressionOverhead);
deflateOut = new DataOutputStream(new BufferedOutputStream(deflateFilter));
deflateOut.write(bytes, 0, bytes.length);
deflateOut.flush();
deflateFilter.finish();
DataInputBuffer deCompressedDataBuffer = new DataInputBuffer();
deCompressedDataBuffer.reset(compressedDataBuffer.getData(), 0,
compressedDataBuffer.getLength());
CompressionInputStream inflateFilter = new BlockDecompressorStream(
deCompressedDataBuffer, new Lz4Decompressor(bufferSize), bufferSize);
inflateIn = new DataInputStream(new BufferedInputStream(inflateFilter));
byte[] result = new byte[BYTE_SIZE];
inflateIn.readFully(result);
assertArrayEquals("original array not equals compress/decompressed array", result,
bytes);
} catch (IOException e) {
fail("testLz4CompressorDecopressorLogicWithCompressionStreams ex error !!!");
} finally {
try {
if (deflateOut != null)
deflateOut.close();
if (inflateIn != null)
inflateIn.close();
} catch (Exception e) {
}
}
}
public static byte[] generate(int size) {
byte[] array = new byte[size];
for (int i = 0; i < size; i++)
array[i] = (byte)rnd.nextInt(16);
return array;
}
}
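The empty-stream test above expects exactly 4 bytes from a BlockCompressorStream that is closed without a write: the block framing's length header. A minimal sketch of that behavior, assuming the native lz4 library is loadable (the tests above make the same assumption through Lz4Codec.isNativeCodeLoaded()); the class name EmptyStreamSketch is hypothetical:

import java.io.ByteArrayOutputStream;
import org.apache.hadoop.io.compress.BlockCompressorStream;
import org.apache.hadoop.io.compress.lz4.Lz4Compressor;

public class EmptyStreamSketch {
  public static void main(String[] args) throws Exception {
    ByteArrayOutputStream bytesOut = new ByteArrayOutputStream();
    BlockCompressorStream out =
        new BlockCompressorStream(bytesOut, new Lz4Compressor(), 1024, 0);
    out.close(); // close without writing anything
    // Only the 4-byte block-length header is emitted for the empty
    // stream, which is what testCompressorDecompressorEmptyStreamLogic
    // asserts above.
    System.out.println(bytesOut.toByteArray().length); // expected: 4
  }
}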

View File

@ -0,0 +1,362 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.io.compress.zlib;
import static org.junit.Assert.*;
import static org.junit.Assume.*;
import java.io.IOException;
import java.io.InputStream;
import java.util.Random;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.CommonConfigurationKeys;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.compress.CompressDecompressTester;
import org.apache.hadoop.io.compress.Compressor;
import org.apache.hadoop.io.compress.Decompressor;
import org.apache.hadoop.io.compress.DecompressorStream;
import org.apache.hadoop.io.compress.CompressDecompressTester.CompressionTestStrategy;
import org.apache.hadoop.io.compress.zlib.ZlibCompressor.CompressionLevel;
import org.apache.hadoop.io.compress.zlib.ZlibCompressor.CompressionStrategy;
import org.junit.Before;
import org.junit.Test;
import com.google.common.collect.ImmutableSet;
public class TestZlibCompressorDecompressor {
private static final Random random = new Random(12345L);
@Before
public void before() {
assumeTrue(ZlibFactory.isNativeZlibLoaded(new Configuration()));
}
@Test
public void testZlibCompressorDecompressor() {
try {
int SIZE = 44 * 1024;
byte[] rawData = generate(SIZE);
CompressDecompressTester.of(rawData)
.withCompressDecompressPair(new ZlibCompressor(), new ZlibDecompressor())
.withTestCases(ImmutableSet.of(CompressionTestStrategy.COMPRESS_DECOMPRESS_SINGLE_BLOCK,
CompressionTestStrategy.COMPRESS_DECOMPRESS_BLOCK,
CompressionTestStrategy.COMPRESS_DECOMPRESS_ERRORS,
CompressionTestStrategy.COMPRESS_DECOMPRESS_WITH_EMPTY_STREAM))
.test();
} catch (Exception ex) {
fail("testCompressorDecompressor error !!!" + ex);
}
}
@Test
public void testCompressorDecompressorWithExceedBufferLimit() {
int BYTE_SIZE = 100 * 1024;
byte[] rawData = generate(BYTE_SIZE);
try {
CompressDecompressTester.of(rawData)
.withCompressDecompressPair(
new ZlibCompressor(
CompressionLevel.BEST_COMPRESSION,
CompressionStrategy.DEFAULT_STRATEGY,
ZlibCompressor.CompressionHeader.DEFAULT_HEADER,
BYTE_SIZE),
new ZlibDecompressor(
ZlibDecompressor.CompressionHeader.DEFAULT_HEADER,
BYTE_SIZE))
.withTestCases(ImmutableSet.of(CompressionTestStrategy.COMPRESS_DECOMPRESS_SINGLE_BLOCK,
CompressionTestStrategy.COMPRESS_DECOMPRESS_BLOCK,
CompressionTestStrategy.COMPRESS_DECOMPRESS_ERRORS,
CompressionTestStrategy.COMPRESS_DECOMPRESS_WITH_EMPTY_STREAM))
.test();
} catch (Exception ex) {
fail("testCompressorDecompressorWithExeedBufferLimit error !!!" + ex);
}
}
@Test
public void testZlibCompressorDecompressorWithConfiguration() {
Configuration conf = new Configuration();
conf.setBoolean(CommonConfigurationKeys.IO_NATIVE_LIB_AVAILABLE_KEY, true);
if (ZlibFactory.isNativeZlibLoaded(conf)) {
byte[] rawData;
int tryNumber = 5;
int BYTE_SIZE = 10 * 1024;
Compressor zlibCompressor = ZlibFactory.getZlibCompressor(conf);
Decompressor zlibDecompressor = ZlibFactory.getZlibDecompressor(conf);
rawData = generate(BYTE_SIZE);
try {
for (int i = 0; i < tryNumber; i++)
compressDecompressZlib(rawData, (ZlibCompressor) zlibCompressor,
(ZlibDecompressor) zlibDecompressor);
zlibCompressor.reinit(conf);
} catch (Exception ex) {
fail("testZlibCompressorDecompressorWithConfiguration ex error " + ex);
}
} else {
fail("Native zlib was requested but is not loaded");
}
}
@Test
public void testZlibCompressDecompress() {
byte[] rawData = null;
int rawDataSize = 0;
rawDataSize = 1024 * 64;
rawData = generate(rawDataSize);
try {
ZlibCompressor compressor = new ZlibCompressor();
ZlibDecompressor decompressor = new ZlibDecompressor();
assertFalse("testZlibCompressDecompress finished error",
compressor.finished());
compressor.setInput(rawData, 0, rawData.length);
assertTrue("testZlibCompressDecompress getBytesRead before error",
compressor.getBytesRead() == 0);
compressor.finish();
byte[] compressedResult = new byte[rawDataSize];
int cSize = compressor.compress(compressedResult, 0, rawDataSize);
assertTrue("testZlibCompressDecompress getBytesRead ather error",
compressor.getBytesRead() == rawDataSize);
assertTrue(
"testZlibCompressDecompress compressed size no less then original size",
cSize < rawDataSize);
decompressor.setInput(compressedResult, 0, cSize);
byte[] decompressedBytes = new byte[rawDataSize];
decompressor.decompress(decompressedBytes, 0, decompressedBytes.length);
assertArrayEquals("testZlibCompressDecompress arrays not equals ",
rawData, decompressedBytes);
compressor.reset();
decompressor.reset();
} catch (IOException ex) {
fail("testZlibCompressDecompress ex !!!" + ex);
}
}
@Test
public void testZlibCompressorDecompressorSetDictionary() {
Configuration conf = new Configuration();
conf.setBoolean(CommonConfigurationKeys.IO_NATIVE_LIB_AVAILABLE_KEY, true);
if (ZlibFactory.isNativeZlibLoaded(conf)) {
Compressor zlibCompressor = ZlibFactory.getZlibCompressor(conf);
Decompressor zlibDecompressor = ZlibFactory.getZlibDecompressor(conf);
checkSetDictionaryNullPointerException(zlibCompressor);
checkSetDictionaryNullPointerException(zlibDecompressor);
checkSetDictionaryArrayIndexOutOfBoundsException(zlibDecompressor);
checkSetDictionaryArrayIndexOutOfBoundsException(zlibCompressor);
} else {
fail("Native zlib was requested but is not loaded");
}
}
@Test
public void testZlibFactory() {
Configuration cfg = new Configuration();
assertTrue("testZlibFactory compression level error !!!",
CompressionLevel.DEFAULT_COMPRESSION == ZlibFactory
.getCompressionLevel(cfg));
assertTrue("testZlibFactory compression strategy error !!!",
CompressionStrategy.DEFAULT_STRATEGY == ZlibFactory
.getCompressionStrategy(cfg));
ZlibFactory.setCompressionLevel(cfg, CompressionLevel.BEST_COMPRESSION);
assertTrue("testZlibFactory compression strategy error !!!",
CompressionLevel.BEST_COMPRESSION == ZlibFactory
.getCompressionLevel(cfg));
ZlibFactory.setCompressionStrategy(cfg, CompressionStrategy.FILTERED);
assertTrue("testZlibFactory compression strategy error !!!",
CompressionStrategy.FILTERED == ZlibFactory.getCompressionStrategy(cfg));
}
private boolean checkSetDictionaryNullPointerException(
Decompressor decompressor) {
try {
decompressor.setDictionary(null, 0, 1);
} catch (NullPointerException ex) {
return true;
} catch (Exception ex) {
}
return false;
}
private boolean checkSetDictionaryNullPointerException(Compressor compressor) {
try {
compressor.setDictionary(null, 0, 1);
} catch (NullPointerException ex) {
return true;
} catch (Exception ex) {
}
return false;
}
private boolean checkSetDictionaryArrayIndexOutOfBoundsException(
Compressor compressor) {
try {
compressor.setDictionary(new byte[] { (byte) 0 }, 0, -1);
} catch (ArrayIndexOutOfBoundsException e) {
return true;
} catch (Exception e) {
}
return false;
}
private boolean checkSetDictionaryArrayIndexOutOfBoundsException(
Decompressor decompressor) {
try {
decompressor.setDictionary(new byte[] { (byte) 0 }, 0, -1);
} catch (ArrayIndexOutOfBoundsException e) {
return true;
} catch (Exception e) {
}
return false;
}
private byte[] compressDecompressZlib(byte[] rawData,
ZlibCompressor zlibCompressor, ZlibDecompressor zlibDecompressor)
throws IOException {
int cSize = 0;
byte[] compressedByte = new byte[rawData.length];
byte[] decompressedRawData = new byte[rawData.length];
zlibCompressor.setInput(rawData, 0, rawData.length);
zlibCompressor.finish();
while (!zlibCompressor.finished()) {
cSize = zlibCompressor.compress(compressedByte, 0, compressedByte.length);
}
zlibCompressor.reset();
assertTrue(zlibDecompressor.getBytesWritten() == 0);
assertTrue(zlibDecompressor.getBytesRead() == 0);
assertTrue(zlibDecompressor.needsInput());
zlibDecompressor.setInput(compressedByte, 0, cSize);
assertFalse(zlibDecompressor.needsInput());
while (!zlibDecompressor.finished()) {
zlibDecompressor.decompress(decompressedRawData, 0,
decompressedRawData.length);
}
assertTrue(zlibDecompressor.getBytesWritten() == rawData.length);
assertTrue(zlibDecompressor.getBytesRead() == cSize);
zlibDecompressor.reset();
assertTrue(zlibDecompressor.getRemaining() == 0);
assertArrayEquals(
"testZlibCompressorDecompressorWithConfiguration array equals error",
rawData, decompressedRawData);
return decompressedRawData;
}
@Test
public void testBuiltInGzipDecompressorExceptions() {
BuiltInGzipDecompressor gzipDecompressor = new BuiltInGzipDecompressor();
try {
gzipDecompressor.setInput(null, 0, 1);
fail("testBuiltInGzipDecompressorExceptions: expected NullPointerException");
} catch (NullPointerException ex) {
// expected
} catch (Exception ex) {
fail("testBuiltInGzipDecompressorExceptions npe error " + ex);
}
try {
gzipDecompressor.setInput(new byte[] { 0 }, 0, -1);
fail("testBuiltInGzipDecompressorExceptions: expected ArrayIndexOutOfBoundsException");
} catch (ArrayIndexOutOfBoundsException ex) {
// expected
} catch (Exception ex) {
fail("testBuiltInGzipDecompressorExceptions aioob error" + ex);
}
assertTrue("decompresser.getBytesRead error",
decompresser.getBytesRead() == 0);
assertTrue("decompresser.getRemaining error",
decompresser.getRemaining() == 0);
decompresser.reset();
decompresser.end();
InputStream decompStream = null;
try {
// invalid bytes 0 and 1: the gzip magic must be 31, -117
int buffSize = 1 * 1024;
byte buffer[] = new byte[buffSize];
Decompressor decompressor = new BuiltInGzipDecompressor();
DataInputBuffer gzbuf = new DataInputBuffer();
decompStream = new DecompressorStream(gzbuf, decompressor);
gzbuf.reset(new byte[] { 0, 0, 1, 1, 1, 1, 11, 1, 1, 1, 1 }, 11);
decompStream.read(buffer);
} catch (IOException ioex) {
// expected
} catch (Exception ex) {
fail("invalid 0 and 1 byte in gzip stream" + ex);
}
// invalid byte 2: the compression method must be 8 (deflate)
try {
int buffSize = 1 * 1024;
byte buffer[] = new byte[buffSize];
Decompressor decompressor = new BuiltInGzipDecompressor();
DataInputBuffer gzbuf = new DataInputBuffer();
decompStream = new DecompressorStream(gzbuf, decompressor);
gzbuf.reset(new byte[] { 31, -117, 7, 1, 1, 1, 1, 11, 1, 1, 1, 1 }, 11);
decompStream.read(buffer);
} catch (IOException ioex) {
// expected
} catch (Exception ex) {
fail("invalid 2 byte in gzip stream" + ex);
}
try {
int buffSize = 1 * 1024;
byte buffer[] = new byte[buffSize];
Decompressor decompressor = new BuiltInGzipDecompressor();
DataInputBuffer gzbuf = new DataInputBuffer();
decompStream = new DecompressorStream(gzbuf, decompressor);
gzbuf.reset(new byte[] { 31, -117, 8, -32, 1, 1, 1, 11, 1, 1, 1, 1 }, 11);
decompStream.read(buffer);
} catch (IOException ioex) {
// expected
} catch (Exception ex) {
fail("invalid 3 byte in gzip stream" + ex);
}
try {
int buffSize = 1 * 1024;
byte buffer[] = new byte[buffSize];
Decompressor decompressor = new BuiltInGzipDecompressor();
DataInputBuffer gzbuf = new DataInputBuffer();
decompStream = new DecompressorStream(gzbuf, decompressor);
gzbuf.reset(new byte[] { 31, -117, 8, 4, 1, 1, 1, 11, 1, 1, 1, 1 }, 11);
decompStream.read(buffer);
} catch (IOException ioex) {
// expected
} catch (Exception ex) {
fail("invalid 3 byte make hasExtraField" + ex);
}
}
public static byte[] generate(int size) {
byte[] data = new byte[size];
for (int i = 0; i < size; i++)
data[i] = (byte)random.nextInt(16);
return data;
}
}
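The corrupt-header cases above patch individual gzip header bytes: a valid member begins with the magic pair 31, -117 (0x1f, 0x8b) followed by compression method 8 (deflate). A self-contained sketch of that check, independent of the Hadoop classes; GzipHeaderSketch and looksLikeGzip are hypothetical names:

public class GzipHeaderSketch {
  // True if buf starts with a plausible gzip member header:
  // ID1 = 31, ID2 = -117 (0x8b as a signed byte), CM = 8 (deflate).
  static boolean looksLikeGzip(byte[] buf) {
    return buf.length >= 3 && buf[0] == 31 && buf[1] == -117 && buf[2] == 8;
  }

  public static void main(String[] args) {
    System.out.println(looksLikeGzip(new byte[] { 31, -117, 8, 0 })); // true
    System.out.println(looksLikeGzip(new byte[] { 0, 0, 1, 1 }));     // false
  }
}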

View File

@ -0,0 +1,58 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.net.unix;
import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.util.Random;
import org.apache.commons.io.FileUtils;
/**
* Create a temporary directory in which sockets can be created.
* When creating a UNIX domain socket, the name
* must be fairly short (around 110 bytes on most platforms).
*/
public class TemporarySocketDirectory implements Closeable {
private File dir;
public TemporarySocketDirectory() {
String tmp = System.getProperty("java.io.tmpdir", "/tmp");
dir = new File(tmp, "socks." + (System.currentTimeMillis() +
"." + (new Random().nextInt())));
dir.mkdirs();
dir.setWritable(true, true);
}
public File getDir() {
return dir;
}
@Override
public void close() throws IOException {
if (dir != null) {
FileUtils.deleteDirectory(dir);
dir = null;
}
}
protected void finalize() throws IOException {
close();
}
}
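A minimal usage sketch for TemporarySocketDirectory together with DomainSocket, assuming the native support is loadable (TestDomainSocket below skips itself when DomainSocket.getLoadingFailureReason() is non-null); SocketDirSketch is a hypothetical name:

import java.io.File;

import org.apache.hadoop.net.unix.DomainSocket;
import org.apache.hadoop.net.unix.TemporarySocketDirectory;

public class SocketDirSketch {
  public static void main(String[] args) throws Exception {
    TemporarySocketDirectory sockDir = new TemporarySocketDirectory();
    try {
      // Keep the name short: UNIX domain socket paths are limited to
      // roughly 110 bytes on most platforms.
      String path =
          new File(sockDir.getDir(), "sketch_sock").getAbsolutePath();
      DomainSocket server = DomainSocket.bindAndListen(path);
      server.close();
    } finally {
      sockDir.close(); // deletes the directory and any sockets in it
    }
  }
}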

View File

@ -0,0 +1,706 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.net.unix;
import java.io.File;
import java.io.FileDescriptor;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.SocketTimeoutException;
import java.nio.ByteBuffer;
import java.nio.channels.AsynchronousCloseException;
import java.nio.channels.ClosedChannelException;
import java.util.Arrays;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.Assume;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
import org.apache.commons.lang.exception.ExceptionUtils;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.net.unix.DomainSocket.DomainChannel;
import org.apache.hadoop.test.GenericTestUtils;
import org.apache.hadoop.util.Shell;
import com.google.common.io.Files;
public class TestDomainSocket {
private static TemporarySocketDirectory sockDir;
@BeforeClass
public static void init() {
sockDir = new TemporarySocketDirectory();
DomainSocket.disableBindPathValidation();
}
@AfterClass
public static void shutdown() throws IOException {
sockDir.close();
}
@Before
public void before() {
Assume.assumeTrue(DomainSocket.getLoadingFailureReason() == null);
}
/**
* Test that we can create a socket and close it, even if it hasn't been
* opened.
*
* @throws IOException
*/
@Test(timeout=180000)
public void testSocketCreateAndClose() throws IOException {
DomainSocket serv = DomainSocket.bindAndListen(
new File(sockDir.getDir(), "test_sock_create_and_close").
getAbsolutePath());
serv.close();
}
/**
* Test DomainSocket path setting and getting.
*
* @throws IOException
*/
@Test(timeout=180000)
public void testSocketPathSetGet() throws IOException {
Assert.assertEquals("/var/run/hdfs/sock.100",
DomainSocket.getEffectivePath("/var/run/hdfs/sock._PORT", 100));
}
/**
* Test that we get a read result of -1 on EOF.
*
* @throws IOException
*/
@Test(timeout=180000)
public void testSocketReadEof() throws Exception {
final String TEST_PATH = new File(sockDir.getDir(),
"testSocketReadEof").getAbsolutePath();
final DomainSocket serv = DomainSocket.bindAndListen(TEST_PATH);
ExecutorService exeServ = Executors.newSingleThreadExecutor();
Callable<Void> callable = new Callable<Void>() {
public Void call(){
DomainSocket conn;
try {
conn = serv.accept();
} catch (IOException e) {
throw new RuntimeException("unexpected IOException", e);
}
byte buf[] = new byte[100];
for (int i = 0; i < buf.length; i++) {
buf[i] = 0;
}
try {
Assert.assertEquals(-1, conn.getInputStream().read());
} catch (IOException e) {
throw new RuntimeException("unexpected IOException", e);
}
return null;
}
};
Future<Void> future = exeServ.submit(callable);
DomainSocket conn = DomainSocket.connect(serv.getPath());
Thread.sleep(50);
conn.close();
serv.close();
future.get(2, TimeUnit.MINUTES);
}
/**
* Test that if one thread is blocking in a read or write operation, another
* thread can close the socket and stop the accept.
*
* @throws IOException
*/
@Test(timeout=180000)
public void testSocketAcceptAndClose() throws Exception {
final String TEST_PATH =
new File(sockDir.getDir(), "test_sock_accept_and_close").getAbsolutePath();
final DomainSocket serv = DomainSocket.bindAndListen(TEST_PATH);
ExecutorService exeServ = Executors.newSingleThreadExecutor();
Callable<Void> callable = new Callable<Void>() {
public Void call(){
try {
serv.accept();
throw new RuntimeException("expected the accept() to be " +
"interrupted and fail");
} catch (AsynchronousCloseException e) {
return null;
} catch (IOException e) {
throw new RuntimeException("unexpected IOException", e);
}
}
};
Future<Void> future = exeServ.submit(callable);
Thread.sleep(500);
serv.close();
future.get(2, TimeUnit.MINUTES);
}
/**
* Test that we get an AsynchronousCloseException when the DomainSocket
* we're using is closed during a read or write operation.
*
* @throws IOException
*/
private void testAsyncCloseDuringIO(final boolean closeDuringWrite)
throws Exception {
final String TEST_PATH = new File(sockDir.getDir(),
"testAsyncCloseDuringIO(" + closeDuringWrite + ")").getAbsolutePath();
final DomainSocket serv = DomainSocket.bindAndListen(TEST_PATH);
ExecutorService exeServ = Executors.newFixedThreadPool(2);
Callable<Void> serverCallable = new Callable<Void>() {
public Void call() {
DomainSocket serverConn = null;
try {
serverConn = serv.accept();
byte buf[] = new byte[100];
for (int i = 0; i < buf.length; i++) {
buf[i] = 0;
}
// The server just continues either writing or reading until someone
// asynchronously closes the client's socket. At that point, all our
// reads return EOF, and writes get a socket error.
if (closeDuringWrite) {
try {
while (true) {
serverConn.getOutputStream().write(buf);
}
} catch (IOException e) {
}
} else {
do { ; } while
(serverConn.getInputStream().read(buf, 0, buf.length) != -1);
}
} catch (IOException e) {
throw new RuntimeException("unexpected IOException", e);
} finally {
IOUtils.cleanup(DomainSocket.LOG, serverConn);
}
return null;
}
};
Future<Void> serverFuture = exeServ.submit(serverCallable);
final DomainSocket clientConn = DomainSocket.connect(serv.getPath());
Callable<Void> clientCallable = new Callable<Void>() {
public Void call(){
// The client writes or reads until another thread
// asynchronously closes the socket. At that point, we should
// get ClosedChannelException, or possibly its subclass
// AsynchronousCloseException.
byte buf[] = new byte[100];
for (int i = 0; i < buf.length; i++) {
buf[i] = 0;
}
try {
if (closeDuringWrite) {
while (true) {
clientConn.getOutputStream().write(buf);
}
} else {
while (true) {
clientConn.getInputStream().read(buf, 0, buf.length);
}
}
} catch (ClosedChannelException e) {
return null;
} catch (IOException e) {
throw new RuntimeException("unexpected IOException", e);
}
}
};
Future<Void> clientFuture = exeServ.submit(clientCallable);
Thread.sleep(500);
clientConn.close();
serv.close();
clientFuture.get(2, TimeUnit.MINUTES);
serverFuture.get(2, TimeUnit.MINUTES);
}
@Test(timeout=180000)
public void testAsyncCloseDuringWrite() throws Exception {
testAsyncCloseDuringIO(true);
}
@Test(timeout=180000)
public void testAsyncCloseDuringRead() throws Exception {
testAsyncCloseDuringIO(false);
}
/**
* Test that attempting to connect to an invalid path doesn't work.
*
* @throws IOException
*/
@Test(timeout=180000)
public void testInvalidOperations() throws IOException {
try {
DomainSocket.connect(
new File(sockDir.getDir(), "test_sock_invalid_operation").
getAbsolutePath());
Assert.fail("expected connect() to fail on a nonexistent path");
} catch (IOException e) {
GenericTestUtils.assertExceptionContains("connect(2) error: ", e);
}
}
/**
* Test setting some server options.
*
* @throws IOException
*/
@Test(timeout=180000)
public void testServerOptions() throws Exception {
final String TEST_PATH = new File(sockDir.getDir(),
"test_sock_server_options").getAbsolutePath();
DomainSocket serv = DomainSocket.bindAndListen(TEST_PATH);
try {
// Let's set a new receive buffer size
int bufSize = serv.getAttribute(DomainSocket.RECEIVE_BUFFER_SIZE);
int newBufSize = bufSize / 2;
serv.setAttribute(DomainSocket.RECEIVE_BUFFER_SIZE, newBufSize);
int nextBufSize = serv.getAttribute(DomainSocket.RECEIVE_BUFFER_SIZE);
Assert.assertEquals(newBufSize, nextBufSize);
// Let's set a server timeout
int newTimeout = 1000;
serv.setAttribute(DomainSocket.RECEIVE_TIMEOUT, newTimeout);
int nextTimeout = serv.getAttribute(DomainSocket.RECEIVE_TIMEOUT);
Assert.assertEquals(newTimeout, nextTimeout);
try {
serv.accept();
Assert.fail("expected the accept() to time out and fail");
} catch (SocketTimeoutException e) {
GenericTestUtils.assertExceptionContains("accept(2) error: ", e);
}
} finally {
serv.close();
Assert.assertFalse(serv.isOpen());
}
}
/**
* A Throwable representing success.
*
* We can't use null to represent this, because you cannot insert null into
* ArrayBlockingQueue.
*/
static class Success extends Throwable {
private static final long serialVersionUID = 1L;
}
static interface WriteStrategy {
/**
* Initialize a WriteStrategy object from a Socket.
*/
public void init(DomainSocket s) throws IOException;
/**
* Write some bytes.
*/
public void write(byte b[]) throws IOException;
}
static class OutputStreamWriteStrategy implements WriteStrategy {
private OutputStream outs = null;
public void init(DomainSocket s) throws IOException {
outs = s.getOutputStream();
}
public void write(byte b[]) throws IOException {
outs.write(b);
}
}
abstract static class ReadStrategy {
/**
* Initialize a ReadStrategy object from a DomainSocket.
*/
public abstract void init(DomainSocket s) throws IOException;
/**
* Read some bytes.
*/
public abstract int read(byte b[], int off, int length) throws IOException;
public void readFully(byte buf[], int off, int len) throws IOException {
int toRead = len;
while (toRead > 0) {
int ret = read(buf, off, toRead);
if (ret < 0) {
throw new IOException( "Premature EOF from inputStream");
}
toRead -= ret;
off += ret;
}
}
}
static class InputStreamReadStrategy extends ReadStrategy {
private InputStream ins = null;
@Override
public void init(DomainSocket s) throws IOException {
ins = s.getInputStream();
}
@Override
public int read(byte b[], int off, int length) throws IOException {
return ins.read(b, off, length);
}
}
static class DirectByteBufferReadStrategy extends ReadStrategy {
private DomainChannel ch = null;
@Override
public void init(DomainSocket s) throws IOException {
ch = s.getChannel();
}
@Override
public int read(byte b[], int off, int length) throws IOException {
ByteBuffer buf = ByteBuffer.allocateDirect(b.length);
int nread = ch.read(buf);
if (nread < 0) return nread;
buf.flip();
buf.get(b, off, nread);
return nread;
}
}
static class ArrayBackedByteBufferReadStrategy extends ReadStrategy {
private DomainChannel ch = null;
@Override
public void init(DomainSocket s) throws IOException {
ch = s.getChannel();
}
@Override
public int read(byte b[], int off, int length) throws IOException {
ByteBuffer buf = ByteBuffer.wrap(b);
int nread = ch.read(buf);
if (nread < 0) return nread;
buf.flip();
buf.get(b, off, nread);
return nread;
}
}
/**
* Test a simple client/server interaction.
*
* @throws IOException
*/
void testClientServer1(final Class<? extends WriteStrategy> writeStrategyClass,
final Class<? extends ReadStrategy> readStrategyClass) throws Exception {
final String TEST_PATH = new File(sockDir.getDir(),
"test_sock_client_server1").getAbsolutePath();
final byte clientMsg1[] = new byte[] { 0x1, 0x2, 0x3, 0x4, 0x5, 0x6 };
final byte serverMsg1[] = new byte[] { 0x9, 0x8, 0x7, 0x6, 0x5 };
final byte clientMsg2 = 0x45;
final ArrayBlockingQueue<Throwable> threadResults =
new ArrayBlockingQueue<Throwable>(2);
final DomainSocket serv = DomainSocket.bindAndListen(TEST_PATH);
Thread serverThread = new Thread() {
public void run(){
// Run server
DomainSocket conn = null;
try {
conn = serv.accept();
byte in1[] = new byte[clientMsg1.length];
ReadStrategy reader = readStrategyClass.newInstance();
reader.init(conn);
reader.readFully(in1, 0, in1.length);
Assert.assertTrue(Arrays.equals(clientMsg1, in1));
WriteStrategy writer = writeStrategyClass.newInstance();
writer.init(conn);
writer.write(serverMsg1);
InputStream connInputStream = conn.getInputStream();
int in2 = connInputStream.read();
Assert.assertEquals((int)clientMsg2, in2);
conn.close();
} catch (Throwable e) {
threadResults.add(e);
Assert.fail(e.getMessage());
}
threadResults.add(new Success());
}
};
serverThread.start();
Thread clientThread = new Thread() {
public void run(){
try {
DomainSocket client = DomainSocket.connect(TEST_PATH);
WriteStrategy writer = writeStrategyClass.newInstance();
writer.init(client);
writer.write(clientMsg1);
ReadStrategy reader = readStrategyClass.newInstance();
reader.init(client);
byte in1[] = new byte[serverMsg1.length];
reader.readFully(in1, 0, in1.length);
Assert.assertTrue(Arrays.equals(serverMsg1, in1));
OutputStream clientOutputStream = client.getOutputStream();
clientOutputStream.write(clientMsg2);
client.close();
} catch (Throwable e) {
threadResults.add(e);
}
threadResults.add(new Success());
}
};
clientThread.start();
for (int i = 0; i < 2; i++) {
Throwable t = threadResults.take();
if (!(t instanceof Success)) {
Assert.fail(t.getMessage() + ExceptionUtils.getStackTrace(t));
}
}
serverThread.join(120000);
clientThread.join(120000);
serv.close();
}
@Test(timeout=180000)
public void testClientServerOutStreamInStream() throws Exception {
testClientServer1(OutputStreamWriteStrategy.class,
InputStreamReadStrategy.class);
}
@Test(timeout=180000)
public void testClientServerOutStreamInDbb() throws Exception {
testClientServer1(OutputStreamWriteStrategy.class,
DirectByteBufferReadStrategy.class);
}
@Test(timeout=180000)
public void testClientServerOutStreamInAbb() throws Exception {
testClientServer1(OutputStreamWriteStrategy.class,
ArrayBackedByteBufferReadStrategy.class);
}
static private class PassedFile {
private final int idx;
private final byte[] contents;
private FileInputStream fis;
public PassedFile(int idx) throws IOException {
this.idx = idx;
this.contents = new byte[] { (byte)(idx % 127) };
Files.write(contents, new File(getPath()));
this.fis = new FileInputStream(getPath());
}
public String getPath() {
return new File(sockDir.getDir(), "passed_file" + idx).getAbsolutePath();
}
public FileInputStream getInputStream() throws IOException {
return fis;
}
public void cleanup() throws IOException {
new File(getPath()).delete();
fis.close();
}
public void checkInputStream(FileInputStream fis) throws IOException {
byte buf[] = new byte[contents.length];
IOUtils.readFully(fis, buf, 0, buf.length);
Assert.assertTrue(Arrays.equals(contents, buf));
}
protected void finalize() {
try {
cleanup();
} catch(Throwable t) {
// ignore
}
}
}
/**
* Test file descriptor passing.
*
* @throws IOException
*/
@Test(timeout=180000)
public void testFdPassing() throws Exception {
final String TEST_PATH =
new File(sockDir.getDir(), "test_sock").getAbsolutePath();
final byte clientMsg1[] = new byte[] { 0x11, 0x22, 0x33, 0x44, 0x55, 0x66 };
final byte serverMsg1[] = new byte[] { 0x31, 0x30, 0x32, 0x34, 0x31, 0x33,
0x44, 0x1, 0x1, 0x1, 0x1, 0x1 };
final ArrayBlockingQueue<Throwable> threadResults =
new ArrayBlockingQueue<Throwable>(2);
final DomainSocket serv = DomainSocket.bindAndListen(TEST_PATH);
final PassedFile passedFiles[] =
new PassedFile[] { new PassedFile(1), new PassedFile(2) };
final FileDescriptor passedFds[] = new FileDescriptor[passedFiles.length];
for (int i = 0; i < passedFiles.length; i++) {
passedFds[i] = passedFiles[i].getInputStream().getFD();
}
Thread serverThread = new Thread() {
public void run(){
// Run server
DomainSocket conn = null;
try {
conn = serv.accept();
byte in1[] = new byte[clientMsg1.length];
InputStream connInputStream = conn.getInputStream();
IOUtils.readFully(connInputStream, in1, 0, in1.length);
Assert.assertTrue(Arrays.equals(clientMsg1, in1));
DomainSocket domainConn = (DomainSocket)conn;
domainConn.sendFileDescriptors(passedFds, serverMsg1, 0,
serverMsg1.length);
conn.close();
} catch (Throwable e) {
threadResults.add(e);
Assert.fail(e.getMessage());
}
threadResults.add(new Success());
}
};
serverThread.start();
Thread clientThread = new Thread() {
public void run(){
try {
DomainSocket client = DomainSocket.connect(TEST_PATH);
OutputStream clientOutputStream = client.getOutputStream();
InputStream clientInputStream = client.getInputStream();
clientOutputStream.write(clientMsg1);
DomainSocket domainConn = (DomainSocket)client;
byte in1[] = new byte[serverMsg1.length];
FileInputStream recvFis[] = new FileInputStream[passedFds.length];
int r = domainConn.
recvFileInputStreams(recvFis, in1, 0, in1.length - 1);
Assert.assertTrue(r > 0);
IOUtils.readFully(clientInputStream, in1, r, in1.length - r);
Assert.assertTrue(Arrays.equals(serverMsg1, in1));
for (int i = 0; i < passedFds.length; i++) {
Assert.assertNotNull(recvFis[i]);
passedFiles[i].checkInputStream(recvFis[i]);
}
for (FileInputStream fis : recvFis) {
fis.close();
}
client.close();
} catch (Throwable e) {
threadResults.add(e);
}
threadResults.add(new Success());
}
};
clientThread.start();
for (int i = 0; i < 2; i++) {
Throwable t = threadResults.take();
if (!(t instanceof Success)) {
Assert.fail(t.getMessage() + ExceptionUtils.getStackTrace(t));
}
}
serverThread.join(120000);
clientThread.join(120000);
serv.close();
for (PassedFile pf : passedFiles) {
pf.cleanup();
}
}
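// Distilled from the test above: a minimal sketch of the descriptor-passing
// round trip, using only the DomainSocket API exercised in this class. The
// socket path, payload, and /etc/hosts descriptor are illustrative, and
// error handling is abbreviated.
private static void fdPassingSketch(String path) throws IOException {
  DomainSocket serv = DomainSocket.bindAndListen(path);
  DomainSocket client = DomainSocket.connect(path);
  DomainSocket conn = serv.accept();
  // Sender side: descriptors ride along with at least one byte of payload.
  FileDescriptor fds[] = { new FileInputStream("/etc/hosts").getFD() };
  byte payload[] = { 0x1 };
  conn.sendFileDescriptors(fds, payload, 0, payload.length);
  // Receiver side: each descriptor arrives wrapped in a FileInputStream.
  FileInputStream recv[] = new FileInputStream[1];
  byte buf[] = new byte[1];
  client.recvFileInputStreams(recv, buf, 0, buf.length);
  IOUtils.cleanup(null, recv[0], conn, client, serv);
}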
/**
* Run validateSocketPathSecurity0, skipping validation for every
* component of the given prefix.
*
* @param str The path to validate
* @param prefix A prefix whose components (including the root) are
*               skipped; e.g., for the prefix /tmp/socks, skipComponents
*               works out to 3 (socks, tmp, and /).
* @throws IOException
*/
private static void testValidateSocketPath(String str, String prefix)
throws IOException {
int skipComponents = 1;
File prefixFile = new File(prefix);
while (true) {
prefixFile = prefixFile.getParentFile();
if (prefixFile == null) {
break;
}
skipComponents++;
}
DomainSocket.validateSocketPathSecurity0(str,
skipComponents);
}
/**
* Test file descriptor path security.
*
* @throws IOException
*/
@Test(timeout=180000)
public void testFdPassingPathSecurity() throws Exception {
TemporarySocketDirectory tmp = new TemporarySocketDirectory();
try {
String prefix = tmp.getDir().getAbsolutePath();
Shell.execCommand(new String [] {
"mkdir", "-p", prefix + "/foo/bar/baz" });
Shell.execCommand(new String [] {
"chmod", "0700", prefix + "/foo/bar/baz" });
Shell.execCommand(new String [] {
"chmod", "0700", prefix + "/foo/bar" });
Shell.execCommand(new String [] {
"chmod", "0707", prefix + "/foo" });
Shell.execCommand(new String [] {
"mkdir", "-p", prefix + "/q1/q2" });
Shell.execCommand(new String [] {
"chmod", "0700", prefix + "/q1" });
Shell.execCommand(new String [] {
"chmod", "0700", prefix + "/q1/q2" });
testValidateSocketPath(prefix + "/q1/q2", prefix);
try {
testValidateSocketPath(prefix + "/foo/bar/baz", prefix);
} catch (IOException e) {
GenericTestUtils.assertExceptionContains("/foo' is world-writable. " +
"Its permissions are 0707. Please fix this or select a " +
"different socket path.", e);
}
try {
testValidateSocketPath(prefix + "/nope", prefix);
} catch (IOException e) {
GenericTestUtils.assertExceptionContains("failed to stat a path " +
"component: ", e);
}
// Root should be secure
DomainSocket.validateSocketPathSecurity0("/foo", 1);
} finally {
tmp.close();
}
}
}
@ -187,6 +187,9 @@ Trunk (Unreleased)
HDFS-4633 TestDFSClientExcludedNodes fails sporadically if excluded nodes
cache expires too quickly (Chris Nauroth via Sanjay)
HDFS-347. DFS read performance suboptimal when client co-located on nodes
with data. (Colin Patrick McCabe via todd and atm)
OPTIMIZATIONS
BUG FIXES
@ -355,6 +358,62 @@ Trunk (Unreleased)
HDFS-4674. TestBPOfferService fails on Windows due to failure parsing
datanode data directory as URI. (Chris Nauroth via suresh)
BREAKDOWN OF HDFS-347 SUBTASKS AND RELATED JIRAS
HDFS-4353. Encapsulate connections to peers in Peer and PeerServer classes.
(Colin Patrick McCabe via todd)
HDFS-4354. Create DomainSocket and DomainPeer and associated unit tests.
(Colin Patrick McCabe via todd)
HDFS-4356. BlockReaderLocal should use passed file descriptors rather than paths.
(Colin Patrick McCabe via todd)
HDFS-4388. DomainSocket should throw AsynchronousCloseException when appropriate.
(Colin Patrick McCabe via todd)
HDFS-4390. Bypass UNIX domain socket unit tests when they cannot be run.
(Colin Patrick McCabe via todd)
HDFS-4400. DFSInputStream#getBlockReader: last retries should ignore the cache
(Colin Patrick McCabe via todd)
HDFS-4401. Fix bug in DomainSocket path validation
(Colin Patrick McCabe via todd)
HDFS-4402. Some small DomainSocket fixes: avoid findbugs warning, change
log level, etc. (Colin Patrick McCabe via todd)
HDFS-4418. increase default FileInputStreamCache size (todd)
HDFS-4416. Rename dfs.datanode.domain.socket.path to dfs.domain.socket.path
(Colin Patrick McCabe via todd)
HDFS-4417. Fix case where local reads get disabled incorrectly
(Colin Patrick McCabe and todd via todd)
HDFS-4433. Make TestPeerCache not flaky (Colin Patrick McCabe via todd)
HDFS-4438. TestDomainSocket fails when system umask is set to 0002. (Colin
Patrick McCabe via atm)
HDFS-4440. Avoid annoying log message when dfs.domain.socket.path is not
set. (Colin Patrick McCabe via atm)
HDFS-4473. Don't create domain socket unless we need it. (Colin Patrick McCabe via atm)
HDFS-4485. DN should chmod socket path a+w. (Colin Patrick McCabe via atm)
HDFS-4453. Make a simple doc to describe the usage and design of the
shortcircuit read feature. (Colin Patrick McCabe via atm)
HDFS-4496. DFSClient: don't create a domain socket unless we need it (Colin
Patrick McCabe via todd)
HDFS-347: style cleanups (Colin Patrick McCabe via atm)
HDFS-4538. Allow use of legacy blockreader (Colin Patrick McCabe via todd)
Release 2.0.5-beta - UNRELEASED
INCOMPATIBLE CHANGES
@ -399,6 +458,9 @@ Release 2.0.5-beta - UNRELEASED
HDFS-3940. Add Gset#clear method and clear the block map when namenode is
shutdown. (suresh)
HDFS-4679. Namenode operation checks should be done in a consistent
manner. (suresh)
OPTIMIZATIONS
BUG FIXES
@ -500,6 +562,9 @@ Release 2.0.5-beta - UNRELEASED
HDFS-4643. Fix flakiness in TestQuorumJournalManager. (todd)
HDFS-4639. startFileInternal() should not increment generation stamp.
(Plamen Jeliazkov via shv)
Release 2.0.4-alpha - UNRELEASED
INCOMPATIBLE CHANGES
@ -2473,6 +2538,20 @@ Release 2.0.0-alpha - 05-23-2012
HDFS-3039. Address findbugs and javadoc warnings on branch. (todd via atm)
Release 0.23.8 - UNRELEASED
INCOMPATIBLE CHANGES
NEW FEATURES
IMPROVEMENTS
OPTIMIZATIONS
BUG FIXES
HDFS-4477. Secondary namenode may retain old tokens (daryn via kihwal)
Release 0.23.7 - UNRELEASED
INCOMPATIBLE CHANGES
@ -290,6 +290,14 @@
<Method name="persistPaxosData" />
<Bug pattern="OS_OPEN_STREAM" />
</Match>
<!-- getShortCircuitFdsForRead is supposed to return open streams. -->
<Match>
<Class name="org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.FsDatasetImpl" />
<Method name="getShortCircuitFdsForRead" />
<Bug pattern="OBL_UNSATISFIED_OBLIGATION" />
</Match>
<!-- Don't complain about LocalDatanodeInfo's anonymous class -->
<Match>
<Class name="org.apache.hadoop.hdfs.BlockReaderLocal$LocalDatanodeInfo$1" />
@ -515,6 +515,7 @@ http://maven.apache.org/xsd/maven-4.0.0.xsd">
<excludes>
<exclude>CHANGES.txt</exclude>
<exclude>CHANGES.HDFS-1623.txt</exclude>
<exclude>CHANGES.HDFS-347.txt</exclude>
<exclude>.idea/**</exclude>
<exclude>src/main/conf/*</exclude>
<exclude>src/main/docs/**</exclude>
@ -18,10 +18,8 @@
package org.apache.hadoop.hdfs;
import java.io.IOException;
import java.net.Socket;
import org.apache.hadoop.fs.ByteBufferReadable;
import org.apache.hadoop.hdfs.protocol.datatransfer.IOStreamPair;
/**
* A BlockReader is responsible for reading a single block
@ -43,7 +41,29 @@ public interface BlockReader extends ByteBufferReadable {
*/
long skip(long n) throws IOException;
void close() throws IOException;
/**
* Returns an estimate of the number of bytes that can be read
* (or skipped over) from this input stream without performing
* network I/O.
*/
int available() throws IOException;
/**
* Close the block reader.
*
* @param peerCache The PeerCache to put the Peer we're using back
* into, or null if we should simply close the Peer
* we're using (along with its Socket).
* Ignored by Readers that don't maintain Peers.
* @param fisCache The FileInputStreamCache to put our FileInputStreams
* back into, or null if we should simply close them.
* Ignored by Readers that don't maintain
* FileInputStreams.
*
* @throws IOException
*/
void close(PeerCache peerCache, FileInputStreamCache fisCache)
throws IOException;
/**
* Read exactly the given amount of data, throwing an exception
@ -60,20 +80,4 @@ public interface BlockReader extends ByteBufferReadable {
* filled or the next call will return EOF.
*/
int readAll(byte[] buf, int offset, int len) throws IOException;
/**
* Take the socket used to talk to the DN.
*/
Socket takeSocket();
/**
* Whether the BlockReader has reached the end of its input stream
* and successfully sent a status code back to the datanode.
*/
boolean hasSentStatusCode();
/**
* @return a reference to the streams this block reader is using.
*/
IOStreamPair getStreams();
}
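A caller-side sketch of the revised close contract above (hypothetical helper; BlockReader, PeerCache, and FileInputStreamCache are the types declared in this patch):

static void readAndRecycle(BlockReader reader, PeerCache peers,
    FileInputStreamCache streams, byte[] dst) throws IOException {
  reader.readFully(dst, 0, dst.length);
  // Non-null caches receive the reader's Peer / FileInputStreams for
  // reuse; passing null for either simply closes that resource instead.
  reader.close(peers, streams);
}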
@ -17,20 +17,31 @@
*/
package org.apache.hadoop.hdfs;
import java.io.BufferedOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.Socket;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.DFSClient.Conf;
import org.apache.hadoop.hdfs.net.Peer;
import org.apache.hadoop.hdfs.protocol.DatanodeID;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.protocol.datatransfer.DataTransferEncryptor;
import org.apache.hadoop.hdfs.protocol.datatransfer.IOStreamPair;
import org.apache.hadoop.hdfs.security.token.block.DataEncryptionKey;
import org.apache.hadoop.hdfs.protocol.datatransfer.Sender;
import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.BlockOpResponseProto;
import org.apache.hadoop.hdfs.protocolPB.PBHelper;
import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier;
import org.apache.hadoop.hdfs.security.token.block.InvalidBlockTokenException;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.ipc.RemoteException;
import org.apache.hadoop.net.unix.DomainSocket;
import org.apache.hadoop.security.AccessControlException;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.security.token.SecretManager.InvalidToken;
import org.apache.hadoop.security.token.Token;
@ -39,75 +50,182 @@ import org.apache.hadoop.security.token.Token;
*/
@InterfaceAudience.Private
public class BlockReaderFactory {
/**
* @see #newBlockReader(Conf, Socket, String, ExtendedBlock, Token, long, long, int, boolean, String)
*/
public static BlockReader newBlockReader(
Configuration conf,
Socket sock, String file,
ExtendedBlock block, Token<BlockTokenIdentifier> blockToken,
long startOffset, long len, DataEncryptionKey encryptionKey)
throws IOException {
int bufferSize = conf.getInt(DFSConfigKeys.IO_FILE_BUFFER_SIZE_KEY,
DFSConfigKeys.IO_FILE_BUFFER_SIZE_DEFAULT);
return newBlockReader(new Conf(conf),
sock, file, block, blockToken, startOffset,
len, bufferSize, true, "", encryptionKey, null);
}
/**
* Create a new BlockReader specifically to satisfy a read.
* This method also sends the OP_READ_BLOCK request.
*
* @param conf the DFSClient configuration
* @param sock An established Socket to the DN. The BlockReader will not close it normally
* @param file File location
* @param block The block object
* @param blockToken The block token for security
* @param startOffset The read offset, relative to block head
* @param len The number of bytes to read
* @param len The number of bytes to read, or -1 to read as many as
* possible.
* @param bufferSize The IO buffer size (not the client buffer size)
* Ignored except on the legacy BlockReader.
* @param verifyChecksum Whether to verify checksum
* @param clientName Client name
* @return New BlockReader instance, or null on error.
* @param clientName Client name. Used for log messages.
* @param peer The peer
* @param datanodeID The datanode that the Peer is connected to
* @param domainSocketFactory The DomainSocketFactory to notify if the Peer
* is a DomainPeer which turns out to be faulty.
* If null, no factory will be notified in this
* case.
* @param allowShortCircuitLocalReads True if short-circuit local reads
* should be allowed.
* @return New BlockReader instance
*/
@SuppressWarnings("deprecation")
public static BlockReader newBlockReader(
Conf conf,
Socket sock, String file,
Configuration conf,
String file,
ExtendedBlock block,
Token<BlockTokenIdentifier> blockToken,
long startOffset, long len,
int bufferSize, boolean verifyChecksum,
boolean verifyChecksum,
String clientName,
DataEncryptionKey encryptionKey,
IOStreamPair ioStreams)
throws IOException {
if (conf.useLegacyBlockReader) {
if (encryptionKey != null) {
throw new RuntimeException("Encryption is not supported with the legacy block reader.");
}
return RemoteBlockReader.newBlockReader(
sock, file, block, blockToken, startOffset, len, bufferSize, verifyChecksum, clientName);
} else {
if (ioStreams == null) {
ioStreams = new IOStreamPair(NetUtils.getInputStream(sock),
NetUtils.getOutputStream(sock, HdfsServerConstants.WRITE_TIMEOUT));
if (encryptionKey != null) {
IOStreamPair encryptedStreams =
DataTransferEncryptor.getEncryptedStreams(
ioStreams.out, ioStreams.in, encryptionKey);
ioStreams = encryptedStreams;
Peer peer,
DatanodeID datanodeID,
DomainSocketFactory domSockFactory,
boolean allowShortCircuitLocalReads)
throws IOException {
peer.setReadTimeout(conf.getInt(DFSConfigKeys.DFS_CLIENT_SOCKET_TIMEOUT_KEY,
HdfsServerConstants.READ_TIMEOUT));
peer.setWriteTimeout(HdfsServerConstants.WRITE_TIMEOUT);
if (peer.getDomainSocket() != null) {
if (allowShortCircuitLocalReads &&
(!conf.getBoolean(DFSConfigKeys.DFS_CLIENT_USE_LEGACY_BLOCKREADERLOCAL,
DFSConfigKeys.DFS_CLIENT_USE_LEGACY_BLOCKREADERLOCAL_DEFAULT))) {
// If this is a domain socket, and short-circuit local reads are
// enabled, try to set up a BlockReaderLocal.
BlockReader reader = newShortCircuitBlockReader(conf, file,
block, blockToken, startOffset, len, peer, datanodeID,
domSockFactory, verifyChecksum);
if (reader != null) {
// Once we've constructed the short-circuit block reader, we don't
// need the socket any more. So let's return it to the cache.
PeerCache peerCache = PeerCache.getInstance(
conf.getInt(DFSConfigKeys.DFS_CLIENT_SOCKET_CACHE_CAPACITY_KEY,
DFSConfigKeys.DFS_CLIENT_SOCKET_CACHE_CAPACITY_DEFAULT),
conf.getLong(DFSConfigKeys.DFS_CLIENT_SOCKET_CACHE_EXPIRY_MSEC_KEY,
DFSConfigKeys.DFS_CLIENT_SOCKET_CACHE_EXPIRY_MSEC_DEFAULT));
peerCache.put(datanodeID, peer);
return reader;
}
}
// If this is a domain socket and we couldn't (or didn't want to) set
// up a BlockReaderLocal, check that we are allowed to pass data traffic
// over the socket before proceeding.
if (!conf.getBoolean(DFSConfigKeys.DFS_CLIENT_DOMAIN_SOCKET_DATA_TRAFFIC,
DFSConfigKeys.DFS_CLIENT_DOMAIN_SOCKET_DATA_TRAFFIC_DEFAULT)) {
throw new IOException("Because we can't do short-circuit access, " +
"and data traffic over domain sockets is disabled, " +
"we cannot use this socket to talk to " + datanodeID);
}
}
if (conf.getBoolean(DFSConfigKeys.DFS_CLIENT_USE_LEGACY_BLOCKREADER,
DFSConfigKeys.DFS_CLIENT_USE_LEGACY_BLOCKREADER_DEFAULT)) {
return RemoteBlockReader.newBlockReader(file,
block, blockToken, startOffset, len,
conf.getInt(DFSConfigKeys.IO_FILE_BUFFER_SIZE_KEY,
DFSConfigKeys.IO_FILE_BUFFER_SIZE_DEFAULT),
verifyChecksum, clientName, peer, datanodeID);
} else {
return RemoteBlockReader2.newBlockReader(
sock, file, block, blockToken, startOffset, len, bufferSize,
verifyChecksum, clientName, encryptionKey, ioStreams);
file, block, blockToken, startOffset, len,
verifyChecksum, clientName, peer, datanodeID);
}
}
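// Sketch of the client-side keys the branches above consult (key names as
// defined in DFSConfigKeys; the values here are illustrative and would
// normally live in hdfs-site.xml rather than be set programmatically).
private static Configuration shortCircuitConfSketch() {
  Configuration conf = new Configuration();
  conf.set("dfs.domain.socket.path", "/var/run/hadoop-hdfs/dn_socket");
  conf.setBoolean("dfs.client.read.shortcircuit", true);
  conf.setBoolean("dfs.client.domain.socket.data.traffic", false);
  conf.setBoolean("dfs.client.use.legacy.blockreader.local", false);
  return conf;
}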
/**
* Create a new short-circuit BlockReader.
*
* Here, we ask the DataNode to pass us file descriptors over our
* DomainSocket. If the DataNode declines to do so, we'll return null here;
* otherwise, we'll return the BlockReaderLocal. If the DataNode declines,
* this function will inform the DomainSocketFactory that short-circuit local
* reads are disabled for this DataNode, so that we don't ask again.
*
* @param conf the configuration.
* @param file the file name. Used in log messages.
* @param block The block object.
* @param blockToken The block token for security.
* @param startOffset The read offset, relative to block head.
* @param len The number of bytes to read, or -1 to read
* as many as possible.
* @param peer The peer to use.
* @param datanodeID The datanode that the Peer is connected to.
* @param domSockFactory The DomainSocketFactory to notify if the Peer
* is a DomainPeer which turns out to be faulty.
* If null, no factory will be notified in this
* case.
* @param verifyChecksum True if we should verify the checksums.
* Note: even if this is true, when
* DFS_CLIENT_READ_SHORTCIRCUIT_SKIP_CHECKSUM_KEY is
* set, we will skip checksums.
*
* @return The BlockReaderLocal, or null if the
* DataNode declined to provide short-circuit
* access.
* @throws IOException If there was a communication error.
*/
private static BlockReaderLocal newShortCircuitBlockReader(
Configuration conf, String file, ExtendedBlock block,
Token<BlockTokenIdentifier> blockToken, long startOffset,
long len, Peer peer, DatanodeID datanodeID,
DomainSocketFactory domSockFactory, boolean verifyChecksum)
throws IOException {
final DataOutputStream out =
new DataOutputStream(new BufferedOutputStream(
peer.getOutputStream()));
new Sender(out).requestShortCircuitFds(block, blockToken, 1);
DataInputStream in =
new DataInputStream(peer.getInputStream());
BlockOpResponseProto resp = BlockOpResponseProto.parseFrom(
PBHelper.vintPrefixed(in));
DomainSocket sock = peer.getDomainSocket();
switch (resp.getStatus()) {
case SUCCESS:
BlockReaderLocal reader = null;
byte buf[] = new byte[1];
FileInputStream fis[] = new FileInputStream[2];
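// The one-byte buffer is not incidental: the descriptors travel as
// ancillary data attached to the regular byte(s) received here.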
sock.recvFileInputStreams(fis, buf, 0, buf.length);
try {
reader = new BlockReaderLocal(conf, file, block,
startOffset, len, fis[0], fis[1], datanodeID, verifyChecksum);
} finally {
if (reader == null) {
IOUtils.cleanup(DFSClient.LOG, fis[0], fis[1]);
}
}
return reader;
case ERROR_UNSUPPORTED:
if (!resp.hasShortCircuitAccessVersion()) {
DFSClient.LOG.warn("short-circuit read access is disabled for " +
"DataNode " + datanodeID + ". reason: " + resp.getMessage());
domSockFactory.disableShortCircuitForPath(sock.getPath());
} else {
DFSClient.LOG.warn("short-circuit read access for the file " +
file + " is disabled for DataNode " + datanodeID +
". reason: " + resp.getMessage());
}
return null;
case ERROR_ACCESS_TOKEN:
String msg = "access control error while " +
"attempting to set up short-circuit access to " +
file + ": " + resp.getMessage();
DFSClient.LOG.debug(msg);
throw new InvalidBlockTokenException(msg);
default:
DFSClient.LOG.warn("error while attempting to set up short-circuit " +
"access to " + file + ": " + resp.getMessage());
domSockFactory.disableShortCircuitForPath(sock.getPath());
return null;
}
}
/**
* File name to print when accessing a block directly (from servlets)
* @param s Address of the block location
@ -119,4 +237,24 @@ public class BlockReaderFactory {
final String poolId, final long blockId) {
return s.toString() + ":" + poolId + ":" + blockId;
}
/**
* Get {@link BlockReaderLocalLegacy} for short circuited local reads.
* This block reader implements the path-based style of local reads
* first introduced in HDFS-2246.
*/
static BlockReader getLegacyBlockReaderLocal(UserGroupInformation ugi,
Configuration conf, String src, ExtendedBlock blk,
Token<BlockTokenIdentifier> accessToken, DatanodeInfo chosenNode,
int socketTimeout, long offsetIntoBlock,
boolean connectToDnViaHostname) throws InvalidToken, IOException {
try {
return BlockReaderLocalLegacy.newBlockReader(ugi, conf, src,
blk, accessToken, chosenNode, socketTimeout, offsetIntoBlock,
blk.getNumBytes() - offsetIntoBlock, connectToDnViaHostname);
} catch (RemoteException re) {
throw re.unwrapRemoteException(InvalidToken.class,
AccessControlException.class);
}
}
}
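A package-internal usage sketch of the legacy entry point above (hypothetical helper; the timeout and offset values are illustrative, and getLegacyBlockReaderLocal is package-private, so this only compiles inside org.apache.hadoop.hdfs):

static BlockReader openLegacyLocal(UserGroupInformation ugi,
    Configuration conf, String src, ExtendedBlock blk,
    Token<BlockTokenIdentifier> token, DatanodeInfo dn)
    throws IOException {
  // Read from the start of the block via the path-based (HDFS-2246) path.
  return BlockReaderFactory.getLegacyBlockReaderLocal(ugi, conf, src,
      blk, token, dn, 60000 /* socketTimeout, ms */,
      0L /* offsetIntoBlock */, false /* connectToDnViaHostname */);
}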
@ -18,33 +18,20 @@
package org.apache.hadoop.hdfs;
import java.io.DataInputStream;
import java.io.File;
import org.apache.hadoop.conf.Configuration;
import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.net.Socket;
import java.nio.ByteBuffer;
import java.security.PrivilegedExceptionAction;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hdfs.protocol.BlockLocalPathInfo;
import org.apache.hadoop.hdfs.protocol.ClientDatanodeProtocol;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.DatanodeID;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.protocol.datatransfer.IOStreamPair;
import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier;
import org.apache.hadoop.hdfs.server.datanode.BlockMetadataHeader;
import org.apache.hadoop.hdfs.util.DirectBufferPool;
import org.apache.hadoop.ipc.RPC;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.util.DataChecksum;
/**
@ -56,84 +43,19 @@ import org.apache.hadoop.util.DataChecksum;
* <ul>
* <li>The client performing short circuit reads must be configured at the
* datanode.</li>
* <li>The client gets the path to the file where block is stored using
* {@link org.apache.hadoop.hdfs.protocol.ClientDatanodeProtocol#getBlockLocalPathInfo(ExtendedBlock, Token)}
* RPC call</li>
* <li>Client uses kerberos authentication to connect to the datanode over RPC,
* if security is enabled.</li>
* <li>The client gets the file descriptors for the metadata file and the data
* file for the block using
* {@link org.apache.hadoop.hdfs.server.datanode.DataXceiver#requestShortCircuitFds}.
* </li>
* <li>The client reads the file descriptors.</li>
* </ul>
*/
class BlockReaderLocal implements BlockReader {
private static final Log LOG = LogFactory.getLog(DFSClient.class);
//Stores the cache and proxy for a local datanode.
private static class LocalDatanodeInfo {
private ClientDatanodeProtocol proxy = null;
private final Map<ExtendedBlock, BlockLocalPathInfo> cache;
LocalDatanodeInfo() {
final int cacheSize = 10000;
final float hashTableLoadFactor = 0.75f;
int hashTableCapacity = (int) Math.ceil(cacheSize / hashTableLoadFactor) + 1;
cache = Collections
.synchronizedMap(new LinkedHashMap<ExtendedBlock, BlockLocalPathInfo>(
hashTableCapacity, hashTableLoadFactor, true) {
private static final long serialVersionUID = 1;
@Override
protected boolean removeEldestEntry(
Map.Entry<ExtendedBlock, BlockLocalPathInfo> eldest) {
return size() > cacheSize;
}
});
}
private synchronized ClientDatanodeProtocol getDatanodeProxy(
UserGroupInformation ugi, final DatanodeInfo node,
final Configuration conf, final int socketTimeout,
final boolean connectToDnViaHostname) throws IOException {
if (proxy == null) {
try {
proxy = ugi.doAs(new PrivilegedExceptionAction<ClientDatanodeProtocol>() {
@Override
public ClientDatanodeProtocol run() throws Exception {
return DFSUtil.createClientDatanodeProtocolProxy(node, conf,
socketTimeout, connectToDnViaHostname);
}
});
} catch (InterruptedException e) {
LOG.warn("encountered exception ", e);
}
}
return proxy;
}
private synchronized void resetDatanodeProxy() {
if (null != proxy) {
RPC.stopProxy(proxy);
proxy = null;
}
}
private BlockLocalPathInfo getBlockLocalPathInfo(ExtendedBlock b) {
return cache.get(b);
}
private void setBlockLocalPathInfo(ExtendedBlock b, BlockLocalPathInfo info) {
cache.put(b, info);
}
private void removeBlockLocalPathInfo(ExtendedBlock b) {
cache.remove(b);
}
}
// Multiple datanodes could be running on the local machine. Store proxies in
// a map keyed by the ipc port of the datanode.
private static Map<Integer, LocalDatanodeInfo> localDatanodeInfoMap = new HashMap<Integer, LocalDatanodeInfo>();
static final Log LOG = LogFactory.getLog(BlockReaderLocal.class);
private final FileInputStream dataIn; // reader for the data file
private final FileInputStream checksumIn; // reader for the checksum file
private final boolean verifyChecksum;
/**
* Offset from the most recent chunk boundary at which the next read should
@ -153,7 +75,6 @@ class BlockReaderLocal implements BlockReader {
private ByteBuffer slowReadBuff = null;
private ByteBuffer checksumBuff = null;
private DataChecksum checksum;
private final boolean verifyChecksum;
private static DirectBufferPool bufferPool = new DirectBufferPool();
@ -163,187 +84,92 @@ class BlockReaderLocal implements BlockReader {
/** offset in block where reader wants to actually read */
private long startOffset;
private final String filename;
private final DatanodeID datanodeID;
private final ExtendedBlock block;
/**
* The only way this object can be instantiated.
*/
static BlockReaderLocal newBlockReader(UserGroupInformation ugi,
Configuration conf, String file, ExtendedBlock blk,
Token<BlockTokenIdentifier> token, DatanodeInfo node, int socketTimeout,
long startOffset, long length, boolean connectToDnViaHostname)
throws IOException {
private static int getSlowReadBufferNumChunks(Configuration conf,
int bytesPerChecksum) {
LocalDatanodeInfo localDatanodeInfo = getLocalDatanodeInfo(node
.getIpcPort());
// check the cache first
BlockLocalPathInfo pathinfo = localDatanodeInfo.getBlockLocalPathInfo(blk);
if (pathinfo == null) {
pathinfo = getBlockPathInfo(ugi, blk, node, conf, socketTimeout, token,
connectToDnViaHostname);
}
int bufSize =
conf.getInt(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_BUFFER_SIZE_KEY,
DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_BUFFER_SIZE_DEFAULT);
// check to see if the file exists. It may so happen that the
// HDFS file has been deleted and this block-lookup is occurring
// on behalf of a new HDFS file. This time, the block file could
// be residing in a different portion of the fs.data.dir directory.
// In this case, we remove this entry from the cache. The next
// call to this method will re-populate the cache.
FileInputStream dataIn = null;
FileInputStream checksumIn = null;
BlockReaderLocal localBlockReader = null;
boolean skipChecksumCheck = skipChecksumCheck(conf);
try {
// get a local file system
File blkfile = new File(pathinfo.getBlockPath());
dataIn = new FileInputStream(blkfile);
if (LOG.isDebugEnabled()) {
LOG.debug("New BlockReaderLocal for file " + blkfile + " of size "
+ blkfile.length() + " startOffset " + startOffset + " length "
+ length + " short circuit checksum " + !skipChecksumCheck);
}
if (!skipChecksumCheck) {
// get the metadata file
File metafile = new File(pathinfo.getMetaPath());
checksumIn = new FileInputStream(metafile);
// read and handle the common header here. For now just a version
BlockMetadataHeader header = BlockMetadataHeader
.readHeader(new DataInputStream(checksumIn));
short version = header.getVersion();
if (version != BlockMetadataHeader.VERSION) {
LOG.warn("Wrong version (" + version + ") for metadata file for "
+ blk + " ignoring ...");
}
DataChecksum checksum = header.getChecksum();
long firstChunkOffset = startOffset
- (startOffset % checksum.getBytesPerChecksum());
localBlockReader = new BlockReaderLocal(conf, file, blk, token,
startOffset, length, pathinfo, checksum, true, dataIn,
firstChunkOffset, checksumIn);
} else {
localBlockReader = new BlockReaderLocal(conf, file, blk, token,
startOffset, length, pathinfo, dataIn);
}
} catch (IOException e) {
// remove from cache
localDatanodeInfo.removeBlockLocalPathInfo(blk);
DFSClient.LOG.warn("BlockReaderLocal: Removing " + blk
+ " from cache because local file " + pathinfo.getBlockPath()
+ " could not be opened.");
throw e;
} finally {
if (localBlockReader == null) {
if (dataIn != null) {
dataIn.close();
}
if (checksumIn != null) {
checksumIn.close();
}
}
}
return localBlockReader;
}
private static synchronized LocalDatanodeInfo getLocalDatanodeInfo(int port) {
LocalDatanodeInfo ldInfo = localDatanodeInfoMap.get(port);
if (ldInfo == null) {
ldInfo = new LocalDatanodeInfo();
localDatanodeInfoMap.put(port, ldInfo);
}
return ldInfo;
}
private static BlockLocalPathInfo getBlockPathInfo(UserGroupInformation ugi,
ExtendedBlock blk, DatanodeInfo node, Configuration conf, int timeout,
Token<BlockTokenIdentifier> token, boolean connectToDnViaHostname)
throws IOException {
LocalDatanodeInfo localDatanodeInfo = getLocalDatanodeInfo(node.getIpcPort());
BlockLocalPathInfo pathinfo = null;
ClientDatanodeProtocol proxy = localDatanodeInfo.getDatanodeProxy(ugi, node,
conf, timeout, connectToDnViaHostname);
try {
// make RPC to local datanode to find local pathnames of blocks
pathinfo = proxy.getBlockLocalPathInfo(blk, token);
if (pathinfo != null) {
if (LOG.isDebugEnabled()) {
LOG.debug("Cached location of block " + blk + " as " + pathinfo);
}
localDatanodeInfo.setBlockLocalPathInfo(blk, pathinfo);
}
} catch (IOException e) {
localDatanodeInfo.resetDatanodeProxy(); // Reset proxy on error
throw e;
}
return pathinfo;
}
private static boolean skipChecksumCheck(Configuration conf) {
return conf.getBoolean(
DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_SKIP_CHECKSUM_KEY,
DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_SKIP_CHECKSUM_DEFAULT);
}
private static int getSlowReadBufferNumChunks(Configuration conf, int bytesPerChecksum) {
int bufferSizeBytes = conf.getInt(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_BUFFER_SIZE_KEY,
DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_BUFFER_SIZE_DEFAULT);
if (bufferSizeBytes < bytesPerChecksum) {
throw new IllegalArgumentException("Configured BlockReaderLocal buffer size (" + bufferSizeBytes + ") " +
"is not large enough to hold a single chunk (" + bytesPerChecksum + "). Please configure " +
if (bufSize < bytesPerChecksum) {
throw new IllegalArgumentException("Configured BlockReaderLocal buffer size (" +
bufSize + ") is not large enough to hold a single chunk (" +
bytesPerChecksum + "). Please configure " +
DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_BUFFER_SIZE_KEY + " appropriately");
}
// Round down to nearest chunk size
return bufferSizeBytes / bytesPerChecksum;
return bufSize / bytesPerChecksum;
}
private BlockReaderLocal(Configuration conf, String hdfsfile,
ExtendedBlock block, Token<BlockTokenIdentifier> token, long startOffset,
long length, BlockLocalPathInfo pathinfo, FileInputStream dataIn)
throws IOException {
this(conf, hdfsfile, block, token, startOffset, length, pathinfo,
DataChecksum.newDataChecksum(DataChecksum.Type.NULL, 4), false,
dataIn, startOffset, null);
}
private BlockReaderLocal(Configuration conf, String hdfsfile,
ExtendedBlock block, Token<BlockTokenIdentifier> token, long startOffset,
long length, BlockLocalPathInfo pathinfo, DataChecksum checksum,
boolean verifyChecksum, FileInputStream dataIn, long firstChunkOffset,
FileInputStream checksumIn) throws IOException {
this.filename = hdfsfile;
this.checksum = checksum;
this.verifyChecksum = verifyChecksum;
this.startOffset = Math.max(startOffset, 0);
bytesPerChecksum = this.checksum.getBytesPerChecksum();
checksumSize = this.checksum.getChecksumSize();
public BlockReaderLocal(Configuration conf, String filename,
ExtendedBlock block, long startOffset, long length,
FileInputStream dataIn, FileInputStream checksumIn,
DatanodeID datanodeID, boolean verifyChecksum) throws IOException {
this.dataIn = dataIn;
this.checksumIn = checksumIn;
this.offsetFromChunkBoundary = (int) (startOffset-firstChunkOffset);
this.startOffset = Math.max(startOffset, 0);
this.filename = filename;
this.datanodeID = datanodeID;
this.block = block;
int chunksPerChecksumRead = getSlowReadBufferNumChunks(conf, bytesPerChecksum);
slowReadBuff = bufferPool.getBuffer(bytesPerChecksum * chunksPerChecksumRead);
checksumBuff = bufferPool.getBuffer(checksumSize * chunksPerChecksumRead);
// Initially the buffers have nothing to read.
slowReadBuff.flip();
checksumBuff.flip();
// read and handle the common header here. For now just a version
checksumIn.getChannel().position(0);
BlockMetadataHeader header = BlockMetadataHeader
.readHeader(new DataInputStream(
new BufferedInputStream(checksumIn,
BlockMetadataHeader.getHeaderSize())));
short version = header.getVersion();
if (version != BlockMetadataHeader.VERSION) {
throw new IOException("Wrong version (" + version + ") of the " +
"metadata file for " + filename + ".");
}
if (!verifyChecksum) {
this.verifyChecksum = false;
} else {
this.verifyChecksum = !conf.getBoolean(DFSConfigKeys.
DFS_CLIENT_READ_SHORTCIRCUIT_SKIP_CHECKSUM_KEY,
DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_SKIP_CHECKSUM_DEFAULT);
}
long firstChunkOffset;
if (this.verifyChecksum) {
this.checksum = header.getChecksum();
this.bytesPerChecksum = this.checksum.getBytesPerChecksum();
this.checksumSize = this.checksum.getChecksumSize();
firstChunkOffset = startOffset
- (startOffset % checksum.getBytesPerChecksum());
this.offsetFromChunkBoundary = (int) (startOffset - firstChunkOffset);
int chunksPerChecksumRead = getSlowReadBufferNumChunks(conf, bytesPerChecksum);
slowReadBuff = bufferPool.getBuffer(bytesPerChecksum * chunksPerChecksumRead);
checksumBuff = bufferPool.getBuffer(checksumSize * chunksPerChecksumRead);
// Initially the buffers have nothing to read.
slowReadBuff.flip();
checksumBuff.flip();
long checkSumOffset = (firstChunkOffset / bytesPerChecksum) * checksumSize;
IOUtils.skipFully(checksumIn, checkSumOffset);
} else {
firstChunkOffset = startOffset;
this.checksum = null;
this.bytesPerChecksum = 0;
this.checksumSize = 0;
this.offsetFromChunkBoundary = 0;
}
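// Worked example (illustrative numbers): startOffset = 1000 and
// bytesPerChecksum = 512 give firstChunkOffset = 512 and
// offsetFromChunkBoundary = 488; the checksum stream is then skipped
// ahead by (512 / 512) * checksumSize bytes.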
boolean success = false;
try {
// Skip both input streams to beginning of the chunk containing startOffset
IOUtils.skipFully(dataIn, firstChunkOffset);
if (checksumIn != null) {
long checkSumOffset = (firstChunkOffset / bytesPerChecksum) * checksumSize;
IOUtils.skipFully(checksumIn, checkSumOffset);
}
// Reposition both input streams to the beginning of the chunk
// containing startOffset
this.dataIn.getChannel().position(firstChunkOffset);
success = true;
} finally {
if (!success) {
bufferPool.returnBuffer(slowReadBuff);
bufferPool.returnBuffer(checksumBuff);
if (slowReadBuff != null) bufferPool.returnBuffer(slowReadBuff);
if (checksumBuff != null) bufferPool.returnBuffer(checksumBuff);
}
}
}
@ -663,10 +489,17 @@ class BlockReaderLocal implements BlockReader {
}
@Override
public synchronized void close() throws IOException {
dataIn.close();
if (checksumIn != null) {
checksumIn.close();
public synchronized void close(PeerCache peerCache,
FileInputStreamCache fisCache) throws IOException {
if (fisCache != null) {
if (LOG.isDebugEnabled()) {
LOG.debug("putting FileInputStream for " + filename +
" back into FileInputStreamCache");
}
fisCache.put(datanodeID, block, new FileInputStream[] {dataIn, checksumIn});
} else {
LOG.debug("closing FileInputStream for " + filename);
IOUtils.cleanup(LOG, dataIn, checksumIn);
}
if (slowReadBuff != null) {
bufferPool.returnBuffer(slowReadBuff);
@ -691,17 +524,8 @@ class BlockReaderLocal implements BlockReader {
}
@Override
public Socket takeSocket() {
return null;
}
@Override
public boolean hasSentStatusCode() {
return false;
}
@Override
public IOStreamPair getStreams() {
return null;
public int available() throws IOException {
// We never do network I/O in BlockReaderLocal.
return Integer.MAX_VALUE;
}
}
@ -0,0 +1,704 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.net.Socket;
import java.nio.ByteBuffer;
import java.security.PrivilegedExceptionAction;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hdfs.protocol.BlockLocalPathInfo;
import org.apache.hadoop.hdfs.protocol.ClientDatanodeProtocol;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.protocol.datatransfer.IOStreamPair;
import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier;
import org.apache.hadoop.hdfs.server.datanode.BlockMetadataHeader;
import org.apache.hadoop.hdfs.util.DirectBufferPool;
import org.apache.hadoop.ipc.RPC;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.util.DataChecksum;
/**
* BlockReaderLocalLegacy enables local short circuited reads. If the DFS client is on
* the same machine as the datanode, then the client can read files directly
* from the local file system rather than going through the datanode for better
* performance. <br>
*
* This is the legacy implementation based on HDFS-2246, which requires
* permissions on the datanode to be set so that clients can directly access the
* blocks. The new implementation based on HDFS-347 should be preferred on UNIX
* systems where the required native code has been implemented.<br>
*
* {@link BlockReaderLocalLegacy} works as follows:
* <ul>
* <li>The client performing short circuit reads must be configured at the
* datanode.</li>
* <li>The client gets the path to the file where block is stored using
* {@link org.apache.hadoop.hdfs.protocol.ClientDatanodeProtocol#getBlockLocalPathInfo(ExtendedBlock, Token)}
* RPC call</li>
* <li>Client uses kerberos authentication to connect to the datanode over RPC,
* if security is enabled.</li>
* </ul>
*/
class BlockReaderLocalLegacy implements BlockReader {
private static final Log LOG = LogFactory.getLog(DFSClient.class);
//Stores the cache and proxy for a local datanode.
private static class LocalDatanodeInfo {
private ClientDatanodeProtocol proxy = null;
private final Map<ExtendedBlock, BlockLocalPathInfo> cache;
LocalDatanodeInfo() {
final int cacheSize = 10000;
final float hashTableLoadFactor = 0.75f;
int hashTableCapacity = (int) Math.ceil(cacheSize / hashTableLoadFactor) + 1;
cache = Collections
.synchronizedMap(new LinkedHashMap<ExtendedBlock, BlockLocalPathInfo>(
hashTableCapacity, hashTableLoadFactor, true) {
private static final long serialVersionUID = 1;
@Override
protected boolean removeEldestEntry(
Map.Entry<ExtendedBlock, BlockLocalPathInfo> eldest) {
return size() > cacheSize;
}
});
}
private synchronized ClientDatanodeProtocol getDatanodeProxy(
UserGroupInformation ugi, final DatanodeInfo node,
final Configuration conf, final int socketTimeout,
final boolean connectToDnViaHostname) throws IOException {
if (proxy == null) {
try {
proxy = ugi.doAs(new PrivilegedExceptionAction<ClientDatanodeProtocol>() {
@Override
public ClientDatanodeProtocol run() throws Exception {
return DFSUtil.createClientDatanodeProtocolProxy(node, conf,
socketTimeout, connectToDnViaHostname);
}
});
} catch (InterruptedException e) {
LOG.warn("encountered exception ", e);
}
}
return proxy;
}
private synchronized void resetDatanodeProxy() {
if (null != proxy) {
RPC.stopProxy(proxy);
proxy = null;
}
}
private BlockLocalPathInfo getBlockLocalPathInfo(ExtendedBlock b) {
return cache.get(b);
}
private void setBlockLocalPathInfo(ExtendedBlock b, BlockLocalPathInfo info) {
cache.put(b, info);
}
private void removeBlockLocalPathInfo(ExtendedBlock b) {
cache.remove(b);
}
}
// Multiple datanodes could be running on the local machine. Store proxies in
// a map keyed by the ipc port of the datanode.
private static Map<Integer, LocalDatanodeInfo> localDatanodeInfoMap = new HashMap<Integer, LocalDatanodeInfo>();
private final FileInputStream dataIn; // reader for the data file
private final FileInputStream checksumIn; // reader for the checksum file
/**
* Offset from the most recent chunk boundary at which the next read should
* take place. Is only set to non-zero at construction time, and is
* decremented (usually to 0) by subsequent reads. This avoids having to do a
* checksum read at construction to position the read cursor correctly.
*/
private int offsetFromChunkBoundary;
private byte[] skipBuf = null;
/**
* Used for checksummed reads that need to be staged before copying to their
* output buffer because they are either a) smaller than the checksum chunk
* size or b) issued by the slower read(byte[]...) path
*/
private ByteBuffer slowReadBuff = null;
private ByteBuffer checksumBuff = null;
private DataChecksum checksum;
private final boolean verifyChecksum;
private static DirectBufferPool bufferPool = new DirectBufferPool();
private final int bytesPerChecksum;
private final int checksumSize;
/** offset in block where reader wants to actually read */
private long startOffset;
private final String filename;
/**
* The only way this object can be instantiated.
*/
static BlockReaderLocalLegacy newBlockReader(UserGroupInformation ugi,
Configuration conf, String file, ExtendedBlock blk,
Token<BlockTokenIdentifier> token, DatanodeInfo node, int socketTimeout,
long startOffset, long length, boolean connectToDnViaHostname)
throws IOException {
LocalDatanodeInfo localDatanodeInfo = getLocalDatanodeInfo(node
.getIpcPort());
// check the cache first
BlockLocalPathInfo pathinfo = localDatanodeInfo.getBlockLocalPathInfo(blk);
if (pathinfo == null) {
pathinfo = getBlockPathInfo(ugi, blk, node, conf, socketTimeout, token,
connectToDnViaHostname);
}
// check to see if the file exists. It may so happen that the
// HDFS file has been deleted and this block-lookup is occurring
// on behalf of a new HDFS file. This time, the block file could
// be residing in a different portion of the fs.data.dir directory.
// In this case, we remove this entry from the cache. The next
// call to this method will re-populate the cache.
FileInputStream dataIn = null;
FileInputStream checksumIn = null;
BlockReaderLocalLegacy localBlockReader = null;
boolean skipChecksumCheck = skipChecksumCheck(conf);
try {
// get a local file system
File blkfile = new File(pathinfo.getBlockPath());
dataIn = new FileInputStream(blkfile);
if (LOG.isDebugEnabled()) {
LOG.debug("New BlockReaderLocalLegacy for file " + blkfile + " of size "
+ blkfile.length() + " startOffset " + startOffset + " length "
+ length + " short circuit checksum " + !skipChecksumCheck);
}
if (!skipChecksumCheck) {
// get the metadata file
File metafile = new File(pathinfo.getMetaPath());
checksumIn = new FileInputStream(metafile);
// read and handle the common header here. For now just a version
BlockMetadataHeader header = BlockMetadataHeader
.readHeader(new DataInputStream(checksumIn));
short version = header.getVersion();
if (version != BlockMetadataHeader.VERSION) {
LOG.warn("Wrong version (" + version + ") for metadata file for "
+ blk + " ignoring ...");
}
DataChecksum checksum = header.getChecksum();
long firstChunkOffset = startOffset
- (startOffset % checksum.getBytesPerChecksum());
localBlockReader = new BlockReaderLocalLegacy(conf, file, blk, token,
startOffset, length, pathinfo, checksum, true, dataIn,
firstChunkOffset, checksumIn);
} else {
localBlockReader = new BlockReaderLocalLegacy(conf, file, blk, token,
startOffset, length, pathinfo, dataIn);
}
} catch (IOException e) {
// remove from cache
localDatanodeInfo.removeBlockLocalPathInfo(blk);
DFSClient.LOG.warn("BlockReaderLocalLegacy: Removing " + blk
+ " from cache because local file " + pathinfo.getBlockPath()
+ " could not be opened.");
throw e;
} finally {
if (localBlockReader == null) {
if (dataIn != null) {
dataIn.close();
}
if (checksumIn != null) {
checksumIn.close();
}
}
}
return localBlockReader;
}
private static synchronized LocalDatanodeInfo getLocalDatanodeInfo(int port) {
LocalDatanodeInfo ldInfo = localDatanodeInfoMap.get(port);
if (ldInfo == null) {
ldInfo = new LocalDatanodeInfo();
localDatanodeInfoMap.put(port, ldInfo);
}
return ldInfo;
}
private static BlockLocalPathInfo getBlockPathInfo(UserGroupInformation ugi,
ExtendedBlock blk, DatanodeInfo node, Configuration conf, int timeout,
Token<BlockTokenIdentifier> token, boolean connectToDnViaHostname)
throws IOException {
LocalDatanodeInfo localDatanodeInfo = getLocalDatanodeInfo(node.getIpcPort());
BlockLocalPathInfo pathinfo = null;
ClientDatanodeProtocol proxy = localDatanodeInfo.getDatanodeProxy(ugi, node,
conf, timeout, connectToDnViaHostname);
try {
// make RPC to local datanode to find local pathnames of blocks
pathinfo = proxy.getBlockLocalPathInfo(blk, token);
if (pathinfo != null) {
if (LOG.isDebugEnabled()) {
LOG.debug("Cached location of block " + blk + " as " + pathinfo);
}
localDatanodeInfo.setBlockLocalPathInfo(blk, pathinfo);
}
} catch (IOException e) {
localDatanodeInfo.resetDatanodeProxy(); // Reset proxy on error
throw e;
}
return pathinfo;
}
private static boolean skipChecksumCheck(Configuration conf) {
return conf.getBoolean(
DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_SKIP_CHECKSUM_KEY,
DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_SKIP_CHECKSUM_DEFAULT);
}
private static int getSlowReadBufferNumChunks(Configuration conf, int bytesPerChecksum) {
int bufferSizeBytes = conf.getInt(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_BUFFER_SIZE_KEY,
DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_BUFFER_SIZE_DEFAULT);
if (bufferSizeBytes < bytesPerChecksum) {
throw new IllegalArgumentException("Configured BlockReaderLocalLegacy " +
"buffer size (" + bufferSizeBytes + ") is not large enough to hold " +
"a single chunk (" + bytesPerChecksum + "). Please configure " +
DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_BUFFER_SIZE_KEY +
" appropriately");
}
// Round down to nearest chunk size
return bufferSizeBytes / bytesPerChecksum;
}
private BlockReaderLocalLegacy(Configuration conf, String hdfsfile,
ExtendedBlock block, Token<BlockTokenIdentifier> token, long startOffset,
long length, BlockLocalPathInfo pathinfo, FileInputStream dataIn)
throws IOException {
this(conf, hdfsfile, block, token, startOffset, length, pathinfo,
DataChecksum.newDataChecksum(DataChecksum.Type.NULL, 4), false,
dataIn, startOffset, null);
}
private BlockReaderLocalLegacy(Configuration conf, String hdfsfile,
ExtendedBlock block, Token<BlockTokenIdentifier> token, long startOffset,
long length, BlockLocalPathInfo pathinfo, DataChecksum checksum,
boolean verifyChecksum, FileInputStream dataIn, long firstChunkOffset,
FileInputStream checksumIn) throws IOException {
this.filename = hdfsfile;
this.checksum = checksum;
this.verifyChecksum = verifyChecksum;
this.startOffset = Math.max(startOffset, 0);
bytesPerChecksum = this.checksum.getBytesPerChecksum();
checksumSize = this.checksum.getChecksumSize();
this.dataIn = dataIn;
this.checksumIn = checksumIn;
this.offsetFromChunkBoundary = (int) (startOffset-firstChunkOffset);
int chunksPerChecksumRead = getSlowReadBufferNumChunks(conf, bytesPerChecksum);
slowReadBuff = bufferPool.getBuffer(bytesPerChecksum * chunksPerChecksumRead);
checksumBuff = bufferPool.getBuffer(checksumSize * chunksPerChecksumRead);
// Initially the buffers have nothing to read.
slowReadBuff.flip();
checksumBuff.flip();
boolean success = false;
try {
// Skip both input streams to beginning of the chunk containing startOffset
IOUtils.skipFully(dataIn, firstChunkOffset);
if (checksumIn != null) {
long checkSumOffset = (firstChunkOffset / bytesPerChecksum) * checksumSize;
IOUtils.skipFully(checksumIn, checkSumOffset);
}
success = true;
} finally {
if (!success) {
bufferPool.returnBuffer(slowReadBuff);
bufferPool.returnBuffer(checksumBuff);
}
}
}
/**
* Reads bytes into a buffer until EOF or the buffer's limit is reached
*/
private int fillBuffer(FileInputStream stream, ByteBuffer buf)
throws IOException {
int bytesRead = stream.getChannel().read(buf);
if (bytesRead < 0) {
//EOF
return bytesRead;
}
while (buf.remaining() > 0) {
int n = stream.getChannel().read(buf);
if (n < 0) {
//EOF
return bytesRead;
}
bytesRead += n;
}
return bytesRead;
}
/**
* Utility method used by read(ByteBuffer) to partially copy a ByteBuffer into
* another.
*/
private void writeSlice(ByteBuffer from, ByteBuffer to, int length) {
int oldLimit = from.limit();
from.limit(from.position() + length);
try {
to.put(from);
} finally {
from.limit(oldLimit);
}
}
@Override
public synchronized int read(ByteBuffer buf) throws IOException {
int nRead = 0;
if (verifyChecksum) {
// A 'direct' read actually has three phases. The first drains any
// remaining bytes from the slow read buffer. After this the read is
// guaranteed to be on a checksum chunk boundary. If there are still bytes
// to read, the fast direct path is used for as many remaining bytes as
// possible, up to a multiple of the checksum chunk size. Finally, any
// 'odd' bytes remaining at the end of the read cause another slow read to
// be issued, which involves an extra copy.
// Every 'slow' read tries to fill the slow read buffer in one go for
// efficiency's sake. As described above, all non-checksum-chunk-aligned
// reads will be served from the slower read path.
if (slowReadBuff.hasRemaining()) {
// There are remaining bytes from a small read available. This usually
// means this read is unaligned, which falls back to the slow path.
int fromSlowReadBuff = Math.min(buf.remaining(), slowReadBuff.remaining());
writeSlice(slowReadBuff, buf, fromSlowReadBuff);
nRead += fromSlowReadBuff;
}
if (buf.remaining() >= bytesPerChecksum && offsetFromChunkBoundary == 0) {
// Since we have drained the 'small read' buffer, we are guaranteed to
// be chunk-aligned
int len = buf.remaining() - (buf.remaining() % bytesPerChecksum);
// There's only enough checksum buffer space available to checksum one
// entire slow read buffer. This saves keeping the number of checksum
// chunks around.
len = Math.min(len, slowReadBuff.capacity());
int oldlimit = buf.limit();
buf.limit(buf.position() + len);
int readResult = 0;
try {
readResult = doByteBufferRead(buf);
} finally {
buf.limit(oldlimit);
}
if (readResult == -1) {
return nRead;
} else {
nRead += readResult;
buf.position(buf.position() + readResult);
}
}
// offsetFromChunkBoundary > 0 => unaligned read, use slow path to read
// until chunk boundary
if ((buf.remaining() > 0 && buf.remaining() < bytesPerChecksum) || offsetFromChunkBoundary > 0) {
int toRead = Math.min(buf.remaining(), bytesPerChecksum - offsetFromChunkBoundary);
int readResult = fillSlowReadBuffer(toRead);
if (readResult == -1) {
return nRead;
} else {
int fromSlowReadBuff = Math.min(readResult, buf.remaining());
writeSlice(slowReadBuff, buf, fromSlowReadBuff);
nRead += fromSlowReadBuff;
}
}
} else {
// Non-checksummed reads are much easier; we can just fill the buffer directly.
nRead = doByteBufferRead(buf);
if (nRead > 0) {
buf.position(buf.position() + nRead);
}
}
return nRead;
}
/**
* Tries to read as many bytes as possible into supplied buffer, checksumming
* each chunk if needed.
*
* <b>Preconditions:</b>
* <ul>
* <li>
* If checksumming is enabled, buf.remaining must be a multiple of
* bytesPerChecksum. Note that this is not a requirement for clients of
* read(ByteBuffer) - in the case of non-checksum-sized read requests,
* read(ByteBuffer) will substitute a suitably sized buffer to pass to this
* method.
* </li>
* </ul>
* <b>Postconditions:</b>
* <ul>
* <li>buf.limit and buf.mark are unchanged.</li>
* <li>buf.position += min(offsetFromChunkBoundary, totalBytesRead) - so the
* requested bytes can be read straight from the buffer</li>
* </ul>
*
* @param buf
* byte buffer to write bytes to. If checksums are not required, buf
* can have any number of bytes remaining, otherwise there must be a
* multiple of the checksum chunk size remaining.
* @return <tt>max(min(totalBytesRead, len) - offsetFromChunkBoundary, 0)</tt>
* that is, the number of useful bytes (up to the amount
* requested) readable from the buffer by the client.
*/
private synchronized int doByteBufferRead(ByteBuffer buf) throws IOException {
if (verifyChecksum) {
assert buf.remaining() % bytesPerChecksum == 0;
}
int dataRead = -1;
int oldpos = buf.position();
// Read as much as we can into the buffer.
dataRead = fillBuffer(dataIn, buf);
if (dataRead == -1) {
return -1;
}
if (verifyChecksum) {
ByteBuffer toChecksum = buf.duplicate();
toChecksum.position(oldpos);
toChecksum.limit(oldpos + dataRead);
checksumBuff.clear();
// Equivalent to (int)Math.ceil(toChecksum.remaining() * 1.0 / bytesPerChecksum );
int numChunks =
(toChecksum.remaining() + bytesPerChecksum - 1) / bytesPerChecksum;
checksumBuff.limit(checksumSize * numChunks);
fillBuffer(checksumIn, checksumBuff);
checksumBuff.flip();
checksum.verifyChunkedSums(toChecksum, checksumBuff, filename,
this.startOffset);
}
if (dataRead >= 0) {
buf.position(oldpos + Math.min(offsetFromChunkBoundary, dataRead));
}
if (dataRead < offsetFromChunkBoundary) {
// yikes, didn't even get enough bytes to honour offset. This can happen
// even if we are verifying checksums if we are at EOF.
offsetFromChunkBoundary -= dataRead;
dataRead = 0;
} else {
dataRead -= offsetFromChunkBoundary;
offsetFromChunkBoundary = 0;
}
return dataRead;
}
/**
* Ensures that up to len bytes are available and checksummed in the slow read
* buffer. The number of bytes available to read is returned. If the buffer is
* not already empty, the number of remaining bytes is returned and no actual
* read happens.
*
* @param len
* the maximum number of bytes to make available. After len bytes
* are read, the underlying bytestream <b>must</b> be at a checksum
* boundary, or EOF. That is, (len + currentPosition) %
* bytesPerChecksum == 0.
* @return the number of bytes available to read, or -1 if EOF.
*/
private synchronized int fillSlowReadBuffer(int len) throws IOException {
int nRead = -1;
if (slowReadBuff.hasRemaining()) {
// Already got data, good to go.
nRead = Math.min(len, slowReadBuff.remaining());
} else {
// Round a complete read of len bytes (plus any implicit offset) to the
// next chunk boundary, since we try and read in multiples of a chunk
int nextChunk = len + offsetFromChunkBoundary +
(bytesPerChecksum - ((len + offsetFromChunkBoundary) % bytesPerChecksum));
int limit = Math.min(nextChunk, slowReadBuff.capacity());
assert limit % bytesPerChecksum == 0;
slowReadBuff.clear();
slowReadBuff.limit(limit);
nRead = doByteBufferRead(slowReadBuff);
if (nRead > 0) {
// So that next time we call slowReadBuff.hasRemaining(), we don't get a
// false positive.
slowReadBuff.limit(nRead + slowReadBuff.position());
}
}
return nRead;
}
@Override
public synchronized int read(byte[] buf, int off, int len) throws IOException {
if (LOG.isTraceEnabled()) {
LOG.trace("read off " + off + " len " + len);
}
if (!verifyChecksum) {
return dataIn.read(buf, off, len);
}
int nRead = fillSlowReadBuffer(slowReadBuff.capacity());
if (nRead > 0) {
// Possible that buffer is filled with a larger read than we need, since
// we tried to read as much as possible at once
nRead = Math.min(len, nRead);
slowReadBuff.get(buf, off, nRead);
}
return nRead;
}
@Override
public synchronized long skip(long n) throws IOException {
if (LOG.isDebugEnabled()) {
LOG.debug("skip " + n);
}
if (n <= 0) {
return 0;
}
if (!verifyChecksum) {
return dataIn.skip(n);
}
// caller made sure newPosition is not beyond EOF.
int remaining = slowReadBuff.remaining();
int position = slowReadBuff.position();
int newPosition = position + (int)n;
// if the new offset is already read into dataBuff, just reposition
if (n <= remaining) {
assert offsetFromChunkBoundary == 0;
slowReadBuff.position(newPosition);
return n;
}
// for small gap, read through to keep the data/checksum in sync
if (n - remaining <= bytesPerChecksum) {
slowReadBuff.position(position + remaining);
if (skipBuf == null) {
skipBuf = new byte[bytesPerChecksum];
}
int ret = read(skipBuf, 0, (int)(n - remaining));
return ret;
}
// optimize for big gap: discard the current buffer, skip to
// the beginning of the appropriate checksum chunk and then
// read to the middle of that chunk to be in sync with checksums.
// We can't use this.offsetFromChunkBoundary because we need to know how
// many bytes of the offset were really read. Calling read(..) with a
// positive this.offsetFromChunkBoundary causes that many bytes to get
// silently skipped.
int myOffsetFromChunkBoundary = newPosition % bytesPerChecksum;
long toskip = n - remaining - myOffsetFromChunkBoundary;
slowReadBuff.position(slowReadBuff.limit());
checksumBuff.position(checksumBuff.limit());
IOUtils.skipFully(dataIn, toskip);
long checkSumOffset = (toskip / bytesPerChecksum) * checksumSize;
IOUtils.skipFully(checksumIn, checkSumOffset);
// read into the middle of the chunk
if (skipBuf == null) {
skipBuf = new byte[bytesPerChecksum];
}
assert skipBuf.length == bytesPerChecksum;
assert myOffsetFromChunkBoundary < bytesPerChecksum;
int ret = read(skipBuf, 0, myOffsetFromChunkBoundary);
if (ret == -1) { // EOS
return toskip;
} else {
return (toskip + ret);
}
}
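The checksum-stream arithmetic in the big-gap branch is worth spelling out once with hypothetical values (a sketch, not part of the patch):
// One checksum of checksumSize bytes covers each bytesPerChecksum-byte
// chunk, and toskip always lands on a chunk boundary, so the division
// below is exact.
static long checksumBytesToSkip(long toskip, int bytesPerChecksum,
    int checksumSize) {
  return (toskip / bytesPerChecksum) * checksumSize;
}
// e.g. checksumBytesToSkip(1536, 512, 4) == 12, i.e. three 4-byte checksums.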
@Override
public synchronized void close(PeerCache peerCache,
FileInputStreamCache fisCache) throws IOException {
IOUtils.cleanup(LOG, dataIn, checksumIn);
if (slowReadBuff != null) {
bufferPool.returnBuffer(slowReadBuff);
slowReadBuff = null;
}
if (checksumBuff != null) {
bufferPool.returnBuffer(checksumBuff);
checksumBuff = null;
}
startOffset = -1;
checksum = null;
}
@Override
public int readAll(byte[] buf, int offset, int len) throws IOException {
return BlockReaderUtil.readAll(this, buf, offset, len);
}
@Override
public void readFully(byte[] buf, int off, int len) throws IOException {
BlockReaderUtil.readFully(this, buf, off, len);
}
@Override
public int available() throws IOException {
// We never do network I/O in BlockReaderLocalLegacy.
return Integer.MAX_VALUE;
}
}

View File

@ -131,7 +131,6 @@ import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.BlockOpResponseP
import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.OpBlockChecksumResponseProto;
import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.Status;
import org.apache.hadoop.hdfs.protocolPB.PBHelper;
import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier;
import org.apache.hadoop.hdfs.security.token.block.DataEncryptionKey;
import org.apache.hadoop.hdfs.security.token.block.InvalidBlockTokenException;
import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier;
@ -196,12 +195,13 @@ public class DFSClient implements java.io.Closeable {
final FileSystem.Statistics stats;
final int hdfsTimeout; // timeout value for a DFS operation.
private final String authority;
final SocketCache socketCache;
final PeerCache peerCache;
final Conf dfsClientConf;
private Random r = new Random();
private SocketAddress[] localInterfaceAddrs;
private DataEncryptionKey encryptionKey;
private boolean shouldUseLegacyBlockReaderLocal;
/**
* DFSClient configuration
*/
@ -228,11 +228,16 @@ public class DFSClient implements java.io.Closeable {
final short defaultReplication;
final String taskId;
final FsPermission uMask;
final boolean useLegacyBlockReader;
final boolean useLegacyBlockReaderLocal;
final boolean connectToDnViaHostname;
final boolean getHdfsBlocksMetadataEnabled;
final int getFileBlockStorageLocationsNumThreads;
final int getFileBlockStorageLocationsTimeout;
final String domainSocketPath;
final boolean skipShortCircuitChecksums;
final int shortCircuitBufferSize;
final boolean shortCircuitLocalReads;
final boolean domainSocketDataTraffic;
Conf(Configuration conf) {
maxFailoverAttempts = conf.getInt(
@ -283,9 +288,9 @@ public class DFSClient implements java.io.Closeable {
.getInt(DFS_CLIENT_BLOCK_WRITE_LOCATEFOLLOWINGBLOCK_RETRIES_KEY,
DFS_CLIENT_BLOCK_WRITE_LOCATEFOLLOWINGBLOCK_RETRIES_DEFAULT);
uMask = FsPermission.getUMask(conf);
useLegacyBlockReader = conf.getBoolean(
DFS_CLIENT_USE_LEGACY_BLOCKREADER,
DFS_CLIENT_USE_LEGACY_BLOCKREADER_DEFAULT);
useLegacyBlockReaderLocal = conf.getBoolean(
DFSConfigKeys.DFS_CLIENT_USE_LEGACY_BLOCKREADERLOCAL,
DFSConfigKeys.DFS_CLIENT_USE_LEGACY_BLOCKREADERLOCAL_DEFAULT);
connectToDnViaHostname = conf.getBoolean(DFS_CLIENT_USE_DN_HOSTNAME,
DFS_CLIENT_USE_DN_HOSTNAME_DEFAULT);
getHdfsBlocksMetadataEnabled = conf.getBoolean(
@ -297,6 +302,20 @@ public class DFSClient implements java.io.Closeable {
getFileBlockStorageLocationsTimeout = conf.getInt(
DFSConfigKeys.DFS_CLIENT_FILE_BLOCK_STORAGE_LOCATIONS_TIMEOUT,
DFSConfigKeys.DFS_CLIENT_FILE_BLOCK_STORAGE_LOCATIONS_TIMEOUT_DEFAULT);
domainSocketPath = conf.getTrimmed(DFSConfigKeys.DFS_DOMAIN_SOCKET_PATH_KEY,
DFSConfigKeys.DFS_DOMAIN_SOCKET_PATH_DEFAULT);
skipShortCircuitChecksums = conf.getBoolean(
DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_SKIP_CHECKSUM_KEY,
DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_SKIP_CHECKSUM_DEFAULT);
shortCircuitBufferSize = conf.getInt(
DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_BUFFER_SIZE_KEY,
DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_BUFFER_SIZE_DEFAULT);
shortCircuitLocalReads = conf.getBoolean(
DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_KEY,
DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_DEFAULT);
domainSocketDataTraffic = conf.getBoolean(
DFSConfigKeys.DFS_CLIENT_DOMAIN_SOCKET_DATA_TRAFFIC,
DFSConfigKeys.DFS_CLIENT_DOMAIN_SOCKET_DATA_TRAFFIC_DEFAULT);
}
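For context, a minimal client-side sketch of turning these settings on, assuming the usual org.apache.hadoop.conf.Configuration import (the socket path below is a placeholder and must match the path the DataNode is configured to listen on):
Configuration conf = new Configuration();
conf.setBoolean(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_KEY, true);
conf.set(DFSConfigKeys.DFS_DOMAIN_SOCKET_PATH_KEY,
    "/var/run/hadoop-hdfs/dn_socket");  // placeholder path
// Optional: pass ordinary read traffic over the domain socket too.
conf.setBoolean(DFSConfigKeys.DFS_CLIENT_DOMAIN_SOCKET_DATA_TRAFFIC, false);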
private DataChecksum.Type getChecksumType(Configuration conf) {
@ -354,7 +373,7 @@ public class DFSClient implements java.io.Closeable {
private final Map<String, DFSOutputStream> filesBeingWritten
= new HashMap<String, DFSOutputStream>();
private boolean shortCircuitLocalReads;
private final DomainSocketFactory domainSocketFactory;
/**
* Same as this(NameNode.getAddress(conf), conf);
@ -398,6 +417,11 @@ public class DFSClient implements java.io.Closeable {
throws IOException {
// Copy only the required DFSClient configuration
this.dfsClientConf = new Conf(conf);
this.shouldUseLegacyBlockReaderLocal =
this.dfsClientConf.useLegacyBlockReaderLocal;
if (this.dfsClientConf.useLegacyBlockReaderLocal) {
LOG.debug("Using legacy short-circuit local reads.");
}
this.conf = conf;
this.stats = stats;
this.socketFactory = NetUtils.getSocketFactory(conf, ClientProtocol.class);
@ -427,12 +451,8 @@ public class DFSClient implements java.io.Closeable {
}
// read directly from the block file if configured.
this.shortCircuitLocalReads = conf.getBoolean(
DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_KEY,
DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_DEFAULT);
if (LOG.isDebugEnabled()) {
LOG.debug("Short circuit read is " + shortCircuitLocalReads);
}
this.domainSocketFactory = new DomainSocketFactory(dfsClientConf);
String localInterfaces[] =
conf.getTrimmedStrings(DFSConfigKeys.DFS_CLIENT_LOCAL_INTERFACES);
localInterfaceAddrs = getLocalInterfaceAddrs(localInterfaces);
@ -442,7 +462,7 @@ public class DFSClient implements java.io.Closeable {
Joiner.on(',').join(localInterfaceAddrs) + "]");
}
this.socketCache = SocketCache.getInstance(dfsClientConf.socketCacheCapacity, dfsClientConf.socketCacheExpiry);
this.peerCache = PeerCache.getInstance(dfsClientConf.socketCacheCapacity, dfsClientConf.socketCacheExpiry);
}
/**
@ -797,29 +817,11 @@ public class DFSClient implements java.io.Closeable {
AccessControlException.class);
}
}
/**
* Get {@link BlockReader} for short circuited local reads.
*/
static BlockReader getLocalBlockReader(UserGroupInformation ugi,
Configuration conf, String src, ExtendedBlock blk,
Token<BlockTokenIdentifier> accessToken, DatanodeInfo chosenNode,
int socketTimeout, long offsetIntoBlock, boolean connectToDnViaHostname)
throws InvalidToken, IOException {
try {
return BlockReaderLocal.newBlockReader(ugi, conf, src, blk, accessToken,
chosenNode, socketTimeout, offsetIntoBlock, blk.getNumBytes()
- offsetIntoBlock, connectToDnViaHostname);
} catch (RemoteException re) {
throw re.unwrapRemoteException(InvalidToken.class,
AccessControlException.class);
}
}
private static Map<String, Boolean> localAddrMap = Collections
.synchronizedMap(new HashMap<String, Boolean>());
private static boolean isLocalAddress(InetSocketAddress targetAddr) {
static boolean isLocalAddress(InetSocketAddress targetAddr) {
InetAddress addr = targetAddr.getAddress();
Boolean cached = localAddrMap.get(addr.getHostAddress());
if (cached != null) {
@ -2319,10 +2321,6 @@ public class DFSClient implements java.io.Closeable {
super(in);
}
}
boolean shouldTryShortCircuitRead(InetSocketAddress targetAddr) {
return shortCircuitLocalReads && isLocalAddress(targetAddr);
}
void reportChecksumFailure(String file, ExtendedBlock blk, DatanodeInfo dn) {
DatanodeInfo [] dnArr = { dn };
@ -2346,13 +2344,15 @@ public class DFSClient implements java.io.Closeable {
+ ", ugi=" + ugi + "]";
}
void disableShortCircuit() {
LOG.info("Short circuit is disabled");
shortCircuitLocalReads = false;
public DomainSocketFactory getDomainSocketFactory() {
return domainSocketFactory;
}
@VisibleForTesting
boolean getShortCircuitLocalReads() {
return shortCircuitLocalReads;
public void disableLegacyBlockReaderLocal() {
shouldUseLegacyBlockReaderLocal = false;
}
public boolean useLegacyBlockReaderLocal() {
return shouldUseLegacyBlockReaderLocal;
}
}

View File

@ -265,6 +265,8 @@ public class DFSConfigKeys extends CommonConfigurationKeys {
public static final int DFS_CLIENT_MAX_BLOCK_ACQUIRE_FAILURES_DEFAULT = 3;
public static final String DFS_CLIENT_USE_LEGACY_BLOCKREADER = "dfs.client.use.legacy.blockreader";
public static final boolean DFS_CLIENT_USE_LEGACY_BLOCKREADER_DEFAULT = false;
public static final String DFS_CLIENT_USE_LEGACY_BLOCKREADERLOCAL = "dfs.client.use.legacy.blockreader.local";
public static final boolean DFS_CLIENT_USE_LEGACY_BLOCKREADERLOCAL_DEFAULT = false;
public static final String DFS_BALANCER_MOVEDWINWIDTH_KEY = "dfs.balancer.movedWinWidth";
public static final long DFS_BALANCER_MOVEDWINWIDTH_DEFAULT = 5400*1000L;
public static final String DFS_DATANODE_ADDRESS_KEY = "dfs.datanode.address";
@ -347,7 +349,13 @@ public class DFSConfigKeys extends CommonConfigurationKeys {
public static final String DFS_CLIENT_READ_SHORTCIRCUIT_SKIP_CHECKSUM_KEY = "dfs.client.read.shortcircuit.skip.checksum";
public static final boolean DFS_CLIENT_READ_SHORTCIRCUIT_SKIP_CHECKSUM_DEFAULT = false;
public static final String DFS_CLIENT_READ_SHORTCIRCUIT_BUFFER_SIZE_KEY = "dfs.client.read.shortcircuit.buffer.size";
public static final String DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_KEY = "dfs.client.read.shortcircuit.streams.cache.size";
public static final int DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_DEFAULT = 100;
public static final String DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS_KEY = "dfs.client.read.shortcircuit.streams.cache.expiry.ms";
public static final long DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS_DEFAULT = 5000;
public static final int DFS_CLIENT_READ_SHORTCIRCUIT_BUFFER_SIZE_DEFAULT = 1024 * 1024;
public static final String DFS_CLIENT_DOMAIN_SOCKET_DATA_TRAFFIC = "dfs.client.domain.socket.data.traffic";
public static final boolean DFS_CLIENT_DOMAIN_SOCKET_DATA_TRAFFIC_DEFAULT = false;
// property for fsimage compression
public static final String DFS_IMAGE_COMPRESS_KEY = "dfs.image.compress";
@ -404,6 +412,8 @@ public class DFSConfigKeys extends CommonConfigurationKeys {
public static final int DFS_NAMENODE_MAX_OP_SIZE_DEFAULT = 50 * 1024 * 1024;
public static final String DFS_BLOCK_LOCAL_PATH_ACCESS_USER_KEY = "dfs.block.local-path-access.user";
public static final String DFS_DOMAIN_SOCKET_PATH_KEY = "dfs.domain.socket.path";
public static final String DFS_DOMAIN_SOCKET_PATH_DEFAULT = "";
// HA related configuration
public static final String DFS_HA_NAMENODES_KEY_PREFIX = "dfs.ha.namenodes";

View File

@ -17,6 +17,7 @@
*/
package org.apache.hadoop.hdfs;
import java.io.FileInputStream;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.Socket;
@ -32,18 +33,20 @@ import java.util.Map.Entry;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.fs.ChecksumException;
import org.apache.hadoop.fs.ByteBufferReadable;
import org.apache.hadoop.fs.ChecksumException;
import org.apache.hadoop.fs.FSInputStream;
import org.apache.hadoop.fs.UnresolvedLinkException;
import org.apache.hadoop.hdfs.SocketCache.SocketAndStreams;
import org.apache.hadoop.hdfs.net.DomainPeer;
import org.apache.hadoop.hdfs.net.Peer;
import org.apache.hadoop.hdfs.net.TcpPeerServer;
import org.apache.hadoop.hdfs.protocol.ClientDatanodeProtocol;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
import org.apache.hadoop.hdfs.protocol.datatransfer.IOStreamPair;
import org.apache.hadoop.hdfs.protocol.datatransfer.InvalidEncryptionKeyException;
import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier;
import org.apache.hadoop.hdfs.security.token.block.InvalidBlockTokenException;
@ -51,20 +54,23 @@ import org.apache.hadoop.hdfs.server.datanode.ReplicaNotFoundException;
import org.apache.hadoop.ipc.RPC;
import org.apache.hadoop.ipc.RemoteException;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.net.unix.DomainSocket;
import org.apache.hadoop.security.AccessControlException;
import org.apache.hadoop.security.token.Token;
import com.google.common.annotations.VisibleForTesting;
/****************************************************************
* DFSInputStream provides bytes from a named file. It handles
* negotiation of the namenode and various datanodes as necessary.
****************************************************************/
@InterfaceAudience.Private
public class DFSInputStream extends FSInputStream implements ByteBufferReadable {
private final SocketCache socketCache;
@VisibleForTesting
static boolean tcpReadsDisabledForTesting = false;
private final PeerCache peerCache;
private final DFSClient dfsClient;
private boolean closed = false;
private final String src;
private final long prefetchSize;
private BlockReader blockReader = null;
@ -76,6 +82,8 @@ public class DFSInputStream extends FSInputStream implements ByteBufferReadable
private long pos = 0;
private long blockEnd = -1;
private final FileInputStreamCache fileInputStreamCache;
/**
* This variable tracks the number of failures since the start of the
* most recent user-facing operation. That is to say, it should be reset
@ -110,7 +118,14 @@ public class DFSInputStream extends FSInputStream implements ByteBufferReadable
this.verifyChecksum = verifyChecksum;
this.buffersize = buffersize;
this.src = src;
this.socketCache = dfsClient.socketCache;
this.peerCache = dfsClient.peerCache;
this.fileInputStreamCache = new FileInputStreamCache(
dfsClient.conf.getInt(DFSConfigKeys.
DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_KEY,
DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_DEFAULT),
dfsClient.conf.getLong(DFSConfigKeys.
DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS_KEY,
DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_EXPIRY_MS_DEFAULT));
prefetchSize = dfsClient.getConf().prefetchSize;
timeWindow = dfsClient.getConf().timeWindow;
nCachedConnRetry = dfsClient.getConf().nCachedConnRetry;
@ -243,7 +258,9 @@ public class DFSInputStream extends FSInputStream implements ByteBufferReadable
locatedBlocks.getFileLength() + lastBlockBeingWrittenLength;
}
private synchronized boolean blockUnderConstruction() {
// Short circuit local reads are forbidden for files that are
// under construction. See HDFS-2757.
synchronized boolean shortCircuitForbidden() {
return locatedBlocks.isUnderConstruction();
}
@ -424,7 +441,7 @@ public class DFSInputStream extends FSInputStream implements ByteBufferReadable
// Will be getting a new BlockReader.
if (blockReader != null) {
closeBlockReader(blockReader);
blockReader.close(peerCache, fileInputStreamCache);
blockReader = null;
}
@ -462,7 +479,7 @@ public class DFSInputStream extends FSInputStream implements ByteBufferReadable
return chosenNode;
} catch (AccessControlException ex) {
DFSClient.LOG.warn("Short circuit access failed " + ex);
dfsClient.disableShortCircuit();
dfsClient.disableLegacyBlockReaderLocal();
continue;
} catch (IOException ex) {
if (ex instanceof InvalidEncryptionKeyException && refetchEncryptionKey > 0) {
@ -510,10 +527,11 @@ public class DFSInputStream extends FSInputStream implements ByteBufferReadable
dfsClient.checkOpen();
if (blockReader != null) {
closeBlockReader(blockReader);
blockReader.close(peerCache, fileInputStreamCache);
blockReader = null;
}
super.close();
fileInputStreamCache.close();
closed = true;
}
@ -811,7 +829,7 @@ public class DFSInputStream extends FSInputStream implements ByteBufferReadable
addIntoCorruptedBlockMap(block.getBlock(), chosenNode, corruptedBlockMap);
} catch (AccessControlException ex) {
DFSClient.LOG.warn("Short circuit access failed " + ex);
dfsClient.disableShortCircuit();
dfsClient.disableLegacyBlockReaderLocal();
continue;
} catch (IOException e) {
if (e instanceof InvalidEncryptionKeyException && refetchEncryptionKey > 0) {
@ -837,7 +855,7 @@ public class DFSInputStream extends FSInputStream implements ByteBufferReadable
}
} finally {
if (reader != null) {
closeBlockReader(reader);
reader.close(peerCache, fileInputStreamCache);
}
}
// Put chosen node into dead list, continue
@ -845,22 +863,34 @@ public class DFSInputStream extends FSInputStream implements ByteBufferReadable
}
}
/**
* Close the given BlockReader and cache its socket.
*/
private void closeBlockReader(BlockReader reader) throws IOException {
if (reader.hasSentStatusCode()) {
IOStreamPair ioStreams = reader.getStreams();
Socket oldSock = reader.takeSocket();
socketCache.put(oldSock, ioStreams);
private Peer newTcpPeer(InetSocketAddress addr) throws IOException {
Peer peer = null;
boolean success = false;
Socket sock = null;
try {
sock = dfsClient.socketFactory.createSocket();
NetUtils.connect(sock, addr,
dfsClient.getRandomLocalInterfaceAddr(),
dfsClient.getConf().socketTimeout);
peer = TcpPeerServer.peerFromSocketAndKey(sock,
dfsClient.getDataEncryptionKey());
success = true;
return peer;
} finally {
if (!success) {
IOUtils.closeQuietly(peer);
IOUtils.closeQuietly(sock);
}
}
reader.close();
}
/**
* Retrieve a BlockReader suitable for reading.
* This method will reuse the cached connection to the DN if appropriate.
* Otherwise, it will create a new connection.
* Throwing an IOException from this method is basically equivalent to
* declaring the DataNode bad, so we try to connect a lot of different ways
* before doing that.
*
* @param dnAddr Address of the datanode
* @param chosenNode Chosen datanode information
@ -885,82 +915,113 @@ public class DFSInputStream extends FSInputStream implements ByteBufferReadable
boolean verifyChecksum,
String clientName)
throws IOException {
// Can't local read a block under construction, see HDFS-2757
if (dfsClient.shouldTryShortCircuitRead(dnAddr) &&
!blockUnderConstruction()) {
return DFSClient.getLocalBlockReader(dfsClient.ugi, dfsClient.conf,
src, block, blockToken, chosenNode, dfsClient.hdfsTimeout,
startOffset, dfsClient.connectToDnViaHostname());
// Firstly, we check to see if we have cached any file descriptors for
// local blocks. If so, we can just re-use those file descriptors.
FileInputStream fis[] = fileInputStreamCache.get(chosenNode, block);
if (fis != null) {
if (DFSClient.LOG.isDebugEnabled()) {
DFSClient.LOG.debug("got FileInputStreams for " + block + " from " +
"the FileInputStreamCache.");
}
return new BlockReaderLocal(dfsClient.conf, file,
block, startOffset, len, fis[0], fis[1], chosenNode, verifyChecksum);
}
IOException err = null;
boolean fromCache = true;
// Allow retry since there is no way of knowing whether the cached socket
// is good until we actually use it.
for (int retries = 0; retries <= nCachedConnRetry && fromCache; ++retries) {
SocketAndStreams sockAndStreams = null;
// Don't use the cache on the last attempt - it's possible that there
// are arbitrarily many unusable sockets in the cache, but we don't
// want to fail the read.
if (retries < nCachedConnRetry) {
sockAndStreams = socketCache.get(dnAddr);
}
Socket sock;
if (sockAndStreams == null) {
fromCache = false;
sock = dfsClient.socketFactory.createSocket();
// TCP_NODELAY is crucial here because of bad interactions between
// Nagle's Algorithm and Delayed ACKs. With connection keepalive
// between the client and DN, the conversation looks like:
// 1. Client -> DN: Read block X
// 2. DN -> Client: data for block X
// 3. Client -> DN: Status OK (successful read)
// 4. Client -> DN: Read block Y
// The fact that step #3 and #4 are both in the client->DN direction
// triggers Nagling. If the DN is using delayed ACKs, this results
// in a delay of 40ms or more.
//
// TCP_NODELAY disables nagling and thus avoids this performance
// disaster.
sock.setTcpNoDelay(true);
NetUtils.connect(sock, dnAddr,
dfsClient.getRandomLocalInterfaceAddr(),
dfsClient.getConf().socketTimeout);
sock.setSoTimeout(dfsClient.getConf().socketTimeout);
} else {
sock = sockAndStreams.sock;
}
// If the legacy local block reader is enabled and we are reading a local
// block, try to create a BlockReaderLocalLegacy. The legacy local block
// reader implements local reads in the style first introduced by HDFS-2246.
if ((dfsClient.useLegacyBlockReaderLocal()) &&
DFSClient.isLocalAddress(dnAddr) &&
(!shortCircuitForbidden())) {
try {
// The OP_READ_BLOCK request is sent as we make the BlockReader
BlockReader reader =
BlockReaderFactory.newBlockReader(dfsClient.getConf(),
sock, file, block,
blockToken,
startOffset, len,
bufferSize, verifyChecksum,
clientName,
dfsClient.getDataEncryptionKey(),
sockAndStreams == null ? null : sockAndStreams.ioStreams);
return BlockReaderFactory.getLegacyBlockReaderLocal(dfsClient.ugi,
dfsClient.conf, clientName, block, blockToken, chosenNode,
dfsClient.hdfsTimeout, startOffset, dfsClient.connectToDnViaHostname());
} catch (IOException e) {
DFSClient.LOG.warn("error creating legacy BlockReaderLocal. " +
"Disabling legacy local reads.", e);
dfsClient.disableLegacyBlockReaderLocal();
}
}
// Look for cached domain peers.
int cacheTries = 0;
DomainSocketFactory dsFactory = dfsClient.getDomainSocketFactory();
BlockReader reader = null;
for (; cacheTries < nCachedConnRetry; ++cacheTries) {
Peer peer = peerCache.get(chosenNode, true);
if (peer == null) break;
try {
boolean allowShortCircuitLocalReads = dfsClient.getConf().
shortCircuitLocalReads && (!shortCircuitForbidden());
reader = BlockReaderFactory.newBlockReader(
dfsClient.conf, file, block, blockToken, startOffset,
len, verifyChecksum, clientName, peer, chosenNode,
dsFactory, allowShortCircuitLocalReads);
return reader;
} catch (IOException ex) {
// Our socket is no good.
DFSClient.LOG.debug("Error making BlockReader. Closing stale " + sock, ex);
if (sockAndStreams != null) {
sockAndStreams.close();
} else {
sock.close();
DFSClient.LOG.debug("Error making BlockReader with DomainSocket. " +
"Closing stale " + peer, ex);
} finally {
if (reader == null) {
IOUtils.closeQuietly(peer);
}
err = ex;
}
}
throw err;
// Try to create a DomainPeer.
DomainSocket domSock = dsFactory.create(dnAddr, this);
if (domSock != null) {
Peer peer = new DomainPeer(domSock);
try {
boolean allowShortCircuitLocalReads = dfsClient.getConf().
shortCircuitLocalReads && (!shortCircuitForbidden());
reader = BlockReaderFactory.newBlockReader(
dfsClient.conf, file, block, blockToken, startOffset,
len, verifyChecksum, clientName, peer, chosenNode,
dsFactory, allowShortCircuitLocalReads);
return reader;
} catch (IOException e) {
DFSClient.LOG.warn("failed to connect to " + domSock, e);
} finally {
if (reader == null) {
// If the Peer that we got the error from was a DomainPeer,
// mark the socket path as bad, so that newDataSocket will not try
// to re-open this socket for a while.
dsFactory.disableDomainSocketPath(domSock.getPath());
IOUtils.closeQuietly(peer);
}
}
}
// Look for cached peers.
for (; cacheTries < nCachedConnRetry; ++cacheTries) {
Peer peer = peerCache.get(chosenNode, false);
if (peer == null) break;
try {
reader = BlockReaderFactory.newBlockReader(
dfsClient.conf, file, block, blockToken, startOffset,
len, verifyChecksum, clientName, peer, chosenNode,
dsFactory, false);
return reader;
} catch (IOException ex) {
DFSClient.LOG.debug("Error making BlockReader. Closing stale " +
peer, ex);
} finally {
if (reader == null) {
IOUtils.closeQuietly(peer);
}
}
}
if (tcpReadsDisabledForTesting) {
throw new IOException("TCP reads are disabled.");
}
// Try to create a new remote peer.
Peer peer = newTcpPeer(dnAddr);
return BlockReaderFactory.newBlockReader(
dfsClient.conf, file, block, blockToken, startOffset,
len, verifyChecksum, clientName, peer, chosenNode,
dsFactory, false);
}
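In summary (an editorial aid, not part of the patch), the selection ladder implemented above is:
// 1. cached local FileInputStreams (FileInputStreamCache);
// 2. the legacy BlockReaderLocalLegacy (HDFS-2246 style), if enabled;
// 3. cached domain-socket Peers from the PeerCache;
// 4. a freshly created DomainSocket, if the DomainSocketFactory allows it;
// 5. cached TCP Peers from the PeerCache;
// 6. a brand-new TCP Peer from newTcpPeer().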
@ -1094,7 +1155,7 @@ public class DFSInputStream extends FSInputStream implements ByteBufferReadable
// the TCP buffer, then just eat up the intervening data.
//
int diff = (int)(targetPos - pos);
if (diff <= DFSClient.TCP_WINDOW_SIZE) {
if (diff <= blockReader.available()) {
try {
pos += blockReader.skip(diff);
if (pos == targetPos) {

View File

@ -0,0 +1,142 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.util.concurrent.TimeUnit;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hdfs.DFSClient.Conf;
import org.apache.hadoop.net.unix.DomainSocket;
import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
class DomainSocketFactory {
public static final Log LOG = LogFactory.getLog(DomainSocketFactory.class);
private final Conf conf;
enum PathStatus {
UNUSABLE,
SHORT_CIRCUIT_DISABLED,
}
/**
* Information about domain socket paths.
*/
Cache<String, PathStatus> pathInfo =
CacheBuilder.newBuilder()
.expireAfterWrite(10, TimeUnit.MINUTES)
.build();
public DomainSocketFactory(Conf conf) {
this.conf = conf;
String feature = null;
if (conf.shortCircuitLocalReads && (!conf.useLegacyBlockReaderLocal)) {
feature = "The short-circuit local reads feature";
} else if (conf.domainSocketDataTraffic) {
feature = "UNIX domain socket data traffic";
}
if (feature != null) {
if (conf.domainSocketPath.isEmpty()) {
LOG.warn(feature + " is disabled because you have not set " +
DFSConfigKeys.DFS_DOMAIN_SOCKET_PATH_KEY);
} else if (DomainSocket.getLoadingFailureReason() != null) {
LOG.warn(feature + " is disabled because " +
DomainSocket.getLoadingFailureReason());
} else {
LOG.debug(feature + "is enabled.");
}
}
}
/**
* Create a DomainSocket.
*
* @param addr The address of the DataNode
* @param stream The DFSInputStream the socket will be created for.
*
* @return null if the socket could not be created; the
* socket otherwise. If there was an error while
* creating the socket, we will add the socket path
* to our list of failed domain socket paths.
*/
DomainSocket create(InetSocketAddress addr, DFSInputStream stream) {
// If there is no domain socket path configured, we can't use domain
// sockets.
if (conf.domainSocketPath.isEmpty()) return null;
// If we can't do anything with the domain socket, don't create it.
if ((conf.domainSocketDataTraffic == false) &&
((!conf.shortCircuitLocalReads) || conf.useLegacyBlockReaderLocal)) {
return null;
}
// UNIX domain sockets can only be used to talk to local peers
if (!DFSClient.isLocalAddress(addr)) return null;
// If the DomainSocket code is not loaded, we can't create
// DomainSocket objects.
if (DomainSocket.getLoadingFailureReason() != null) return null;
String escapedPath = DomainSocket.
getEffectivePath(conf.domainSocketPath, addr.getPort());
PathStatus info = pathInfo.getIfPresent(escapedPath);
if (info == PathStatus.UNUSABLE) {
// We tried to connect to this domain socket before, and it was totally
// unusable.
return null;
}
if ((!conf.domainSocketDataTraffic) &&
((info == PathStatus.SHORT_CIRCUIT_DISABLED) ||
stream.shortCircuitForbidden())) {
// If we don't want to pass data over domain sockets, and we don't want
// to pass file descriptors over them either, we have no use for domain
// sockets.
return null;
}
boolean success = false;
DomainSocket sock = null;
try {
sock = DomainSocket.connect(escapedPath);
sock.setAttribute(DomainSocket.RECEIVE_TIMEOUT, conf.socketTimeout);
success = true;
} catch (IOException e) {
LOG.warn("error creating DomainSocket", e);
// fall through
} finally {
if (!success) {
if (sock != null) {
IOUtils.closeQuietly(sock);
}
pathInfo.put(escapedPath, PathStatus.UNUSABLE);
sock = null;
}
}
return sock;
}
public void disableShortCircuitForPath(String path) {
pathInfo.put(path, PathStatus.SHORT_CIRCUIT_DISABLED);
}
public void disableDomainSocketPath(String path) {
pathInfo.put(path, PathStatus.UNUSABLE);
}
}
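A hedged usage sketch (mirroring the DFSInputStream changes earlier; dnAddr and stream stand in for the caller's own values):
// Try a domain socket first; create() returns null when one cannot be used.
DomainSocket domSock = dsFactory.create(dnAddr, stream);
if (domSock != null) {
  Peer peer = new DomainPeer(domSock);
  // ... hand the peer to BlockReaderFactory.newBlockReader(...)
} else {
  // ... fall back to a plain TCP peer, as in DFSInputStream#newTcpPeer()
}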

View File

@ -0,0 +1,264 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs;
import java.io.FileInputStream;
import java.util.Iterator;
import java.util.List;
import java.util.Map.Entry;
import java.util.concurrent.ScheduledThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hdfs.protocol.DatanodeID;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.util.Time;
import com.google.common.collect.LinkedListMultimap;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
/**
* FileInputStream cache is used to cache FileInputStream objects that we
* have received from the DataNode.
*/
class FileInputStreamCache {
private final static Log LOG = LogFactory.getLog(FileInputStreamCache.class);
/**
* The executor service that runs the cacheCleaner. There is only one of
* these per VM.
*/
private final static ScheduledThreadPoolExecutor executor
= new ScheduledThreadPoolExecutor(1, new ThreadFactoryBuilder().
setDaemon(true).setNameFormat("FileInputStreamCache Cleaner").
build());
/**
* The CacheCleaner for this FileInputStreamCache. We don't create this
* and schedule it until it becomes necessary.
*/
private CacheCleaner cacheCleaner;
/**
* Maximum number of entries to allow in the cache.
*/
private final int maxCacheSize;
/**
* The minimum time in milliseconds to preserve an element in the cache.
*/
private final long expiryTimeMs;
/**
* True if the FileInputStreamCache is closed.
*/
private boolean closed = false;
/**
* Cache entries.
*/
private final LinkedListMultimap<Key, Value> map = LinkedListMultimap.create();
/**
* Expiry thread which makes sure that the file descriptors get closed
* after a while.
*/
class CacheCleaner implements Runnable {
@Override
public void run() {
synchronized(FileInputStreamCache.this) {
if (closed) return;
long curTime = Time.monotonicNow();
for (Iterator<Entry<Key, Value>> iter = map.entries().iterator();
iter.hasNext();
iter = map.entries().iterator()) {
Entry<Key, Value> entry = iter.next();
if (entry.getValue().getTime() + expiryTimeMs >= curTime) {
break;
}
entry.getValue().close();
iter.remove();
}
}
}
}
/**
* The key identifying a FileInputStream array.
*/
static class Key {
private final DatanodeID datanodeID;
private final ExtendedBlock block;
public Key(DatanodeID datanodeID, ExtendedBlock block) {
this.datanodeID = datanodeID;
this.block = block;
}
@Override
public boolean equals(Object other) {
if (!(other instanceof FileInputStreamCache.Key)) {
return false;
}
FileInputStreamCache.Key otherKey = (FileInputStreamCache.Key)other;
return (block.equals(otherKey.block) &
(block.getGenerationStamp() == otherKey.block.getGenerationStamp()) &
datanodeID.equals(otherKey.datanodeID));
}
@Override
public int hashCode() {
return block.hashCode();
}
}
/**
* The value containing a FileInputStream array and the time it was added to
* the cache.
*/
static class Value {
private final FileInputStream fis[];
private final long time;
public Value (FileInputStream fis[]) {
this.fis = fis;
this.time = Time.monotonicNow();
}
public FileInputStream[] getFileInputStreams() {
return fis;
}
public long getTime() {
return time;
}
public void close() {
IOUtils.cleanup(LOG, fis);
}
}
/**
* Create a new FileInputStreamCache.
*
* @param maxCacheSize The maximum number of elements to allow in
* the cache.
* @param expiryTimeMs The minimum time in milliseconds to preserve
* elements in the cache.
*/
public FileInputStreamCache(int maxCacheSize, long expiryTimeMs) {
this.maxCacheSize = maxCacheSize;
this.expiryTimeMs = expiryTimeMs;
}
/**
* Put an array of FileInputStream objects into the cache.
*
* @param datanodeID The DatanodeID to store the streams under.
* @param block The Block to store the streams under.
* @param fis The streams.
*/
public void put(DatanodeID datanodeID, ExtendedBlock block,
FileInputStream fis[]) {
boolean inserted = false;
try {
synchronized(this) {
if (closed) return;
if (map.size() + 1 > maxCacheSize) {
Iterator<Entry<Key, Value>> iter = map.entries().iterator();
if (!iter.hasNext()) return;
Entry<Key, Value> entry = iter.next();
entry.getValue().close();
iter.remove();
}
if (cacheCleaner == null) {
cacheCleaner = new CacheCleaner();
executor.scheduleAtFixedRate(cacheCleaner, expiryTimeMs, expiryTimeMs,
TimeUnit.MILLISECONDS);
}
map.put(new Key(datanodeID, block), new Value(fis));
inserted = true;
}
} finally {
if (!inserted) {
IOUtils.cleanup(LOG, fis);
}
}
}
/**
* Find and remove an array of FileInputStream objects from the cache.
*
* @param datanodeID The DatanodeID to search for.
* @param block The Block to search for.
*
* @return null if no streams can be found; the
* array otherwise. If this is non-null, the
* array will have been removed from the cache.
*/
public synchronized FileInputStream[] get(DatanodeID datanodeID,
ExtendedBlock block) {
Key key = new Key(datanodeID, block);
List<Value> ret = map.get(key);
if (ret.isEmpty()) return null;
Value val = ret.get(0);
map.remove(key, val);
return val.getFileInputStreams();
}
/**
* Close the cache and free all associated resources.
*/
public synchronized void close() {
if (closed) return;
closed = true;
if (cacheCleaner != null) {
executor.remove(cacheCleaner);
}
for (Iterator<Entry<Key, Value>> iter = map.entries().iterator();
iter.hasNext();
iter = map.entries().iterator()) {
Entry<Key, Value> entry = iter.next();
entry.getValue().close();
iter.remove();
}
}
public synchronized String toString() {
StringBuilder bld = new StringBuilder();
bld.append("FileInputStreamCache(");
String prefix = "";
for (Entry<Key, Value> entry : map.entries()) {
bld.append(prefix);
bld.append(entry.getKey());
prefix = ", ";
}
bld.append(")");
return bld.toString();
}
public long getExpiryTimeMs() {
return expiryTimeMs;
}
public int getMaxCacheSize() {
return maxCacheSize;
}
}
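A short usage sketch from inside the org.apache.hadoop.hdfs package (datanodeID, block, dataIn and metaIn are hypothetical); note that get() removes what it returns, so two readers never share descriptors:
FileInputStreamCache cache = new FileInputStreamCache(100, 5000);
cache.put(datanodeID, block, new FileInputStream[] { dataIn, metaIn });
FileInputStream[] fis = cache.get(datanodeID, block);  // entry now removed
if (fis == null) {
  // cache miss: request fresh descriptors from the DataNode
}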

View File

@ -18,92 +18,104 @@
package org.apache.hadoop.hdfs;
import java.io.Closeable;
import java.net.Socket;
import java.net.SocketAddress;
import java.util.Iterator;
import java.util.List;
import java.util.Map.Entry;
import java.io.IOException;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.collect.LinkedListMultimap;
import org.apache.commons.logging.Log;
import org.apache.hadoop.HadoopIllegalArgumentException;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.hdfs.protocol.datatransfer.IOStreamPair;
import org.apache.hadoop.hdfs.protocol.DatanodeID;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.hdfs.net.Peer;
import org.apache.hadoop.util.Daemon;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Time;
/**
* A cache of input stream sockets to Data Node.
*/
class SocketCache {
private static final Log LOG = LogFactory.getLog(SocketCache.class);
@InterfaceAudience.Private
static class SocketAndStreams implements Closeable {
public final Socket sock;
public final IOStreamPair ioStreams;
long createTime;
class PeerCache {
private static final Log LOG = LogFactory.getLog(PeerCache.class);
private static class Key {
final DatanodeID dnID;
final boolean isDomain;
public SocketAndStreams(Socket s, IOStreamPair ioStreams) {
this.sock = s;
this.ioStreams = ioStreams;
this.createTime = Time.monotonicNow();
Key(DatanodeID dnID, boolean isDomain) {
this.dnID = dnID;
this.isDomain = isDomain;
}
@Override
public void close() {
if (ioStreams != null) {
IOUtils.closeStream(ioStreams.in);
IOUtils.closeStream(ioStreams.out);
public boolean equals(Object o) {
if (!(o instanceof Key)) {
return false;
}
IOUtils.closeSocket(sock);
Key other = (Key)o;
return dnID.equals(other.dnID) && isDomain == other.isDomain;
}
public long getCreateTime() {
return this.createTime;
@Override
public int hashCode() {
return dnID.hashCode() ^ (isDomain ? 1 : 0);
}
}
private static class Value {
private final Peer peer;
private final long time;
Value(Peer peer, long time) {
this.peer = peer;
this.time = time;
}
Peer getPeer() {
return peer;
}
long getTime() {
return time;
}
}
private Daemon daemon;
/** A map for per user per datanode. */
private static LinkedListMultimap<SocketAddress, SocketAndStreams> multimap =
private final LinkedListMultimap<Key, Value> multimap =
LinkedListMultimap.create();
private static int capacity;
private static long expiryPeriod;
private static SocketCache scInstance = new SocketCache();
private static boolean isInitedOnce = false;
public static synchronized SocketCache getInstance(int c, long e) {
// capacity is only initialized once
if (isInitedOnce == false) {
capacity = c;
expiryPeriod = e;
private final int capacity;
private final long expiryPeriod;
private static PeerCache instance = null;
@VisibleForTesting
PeerCache(int c, long e) {
this.capacity = c;
this.expiryPeriod = e;
if (capacity == 0 ) {
LOG.info("SocketCache disabled.");
}
else if (expiryPeriod == 0) {
throw new IllegalStateException("Cannot initialize expiryPeriod to " +
expiryPeriod + "when cache is enabled.");
}
isInitedOnce = true;
if (capacity == 0 ) {
LOG.info("SocketCache disabled.");
}
else if (expiryPeriod == 0) {
throw new IllegalStateException("Cannot initialize expiryPeriod to " +
expiryPeriod + "when cache is enabled.");
}
}
public static synchronized PeerCache getInstance(int c, long e) {
// capacity is only initialized once
if (instance == null) {
instance = new PeerCache(c, e);
} else { //already initialized once
if (capacity != c || expiryPeriod != e) {
LOG.info("capacity and expiry periods already set to " + capacity +
" and " + expiryPeriod + " respectively. Cannot set it to " + c +
" and " + e);
if (instance.capacity != c || instance.expiryPeriod != e) {
LOG.info("capacity and expiry periods already set to " +
instance.capacity + " and " + instance.expiryPeriod +
" respectively. Cannot set it to " + c + " and " + e);
}
}
return scInstance;
return instance;
}
private boolean isDaemonStarted() {
@ -120,44 +132,47 @@ class SocketCache {
@Override
public void run() {
try {
SocketCache.this.run();
PeerCache.this.run();
} catch(InterruptedException e) {
//noop
} finally {
SocketCache.this.clear();
PeerCache.this.clear();
}
}
@Override
public String toString() {
return String.valueOf(SocketCache.this);
return String.valueOf(PeerCache.this);
}
});
daemon.start();
}
/**
* Get a cached socket to the given address.
* @param remote Remote address the socket is connected to.
* @return A socket with unknown state, possibly closed underneath. Or null.
* Get a cached peer connected to the given DataNode.
* @param dnId The DataNode to get a Peer for.
* @param isDomain Whether to retrieve a DomainPeer or not.
*
* @return An open Peer connected to the DN, or null if none
* was found.
*/
public synchronized SocketAndStreams get(SocketAddress remote) {
public synchronized Peer get(DatanodeID dnId, boolean isDomain) {
if (capacity <= 0) { // disabled
return null;
}
List<SocketAndStreams> sockStreamList = multimap.get(remote);
List<Value> sockStreamList = multimap.get(new Key(dnId, isDomain));
if (sockStreamList == null) {
return null;
}
Iterator<SocketAndStreams> iter = sockStreamList.iterator();
Iterator<Value> iter = sockStreamList.iterator();
while (iter.hasNext()) {
SocketAndStreams candidate = iter.next();
Value candidate = iter.next();
iter.remove();
if (!candidate.sock.isClosed()) {
return candidate;
if (!candidate.getPeer().isClosed()) {
return candidate.getPeer();
}
}
return null;
@ -167,30 +182,23 @@ class SocketCache {
* Give an unused socket to the cache.
* @param sock socket not used by anyone.
*/
public synchronized void put(Socket sock, IOStreamPair ioStreams) {
Preconditions.checkNotNull(sock);
SocketAndStreams s = new SocketAndStreams(sock, ioStreams);
public synchronized void put(DatanodeID dnId, Peer peer) {
Preconditions.checkNotNull(dnId);
Preconditions.checkNotNull(peer);
if (peer.isClosed()) return;
if (capacity <= 0) {
// Cache disabled.
s.close();
IOUtils.cleanup(LOG, peer);
return;
}
startExpiryDaemon();
SocketAddress remoteAddr = sock.getRemoteSocketAddress();
if (remoteAddr == null) {
LOG.warn("Cannot cache (unconnected) socket with no remote address: " +
sock);
IOUtils.closeSocket(sock);
return;
}
if (capacity == multimap.size()) {
evictOldest();
}
multimap.put(remoteAddr, s);
multimap.put(new Key(dnId, peer.getDomainSocket() != null),
new Value(peer, Time.monotonicNow()));
}
public synchronized int size() {
@ -202,18 +210,17 @@ class SocketCache {
*/
private synchronized void evictExpired(long expiryPeriod) {
while (multimap.size() != 0) {
Iterator<Entry<SocketAddress, SocketAndStreams>> iter =
Iterator<Entry<Key, Value>> iter =
multimap.entries().iterator();
Entry<SocketAddress, SocketAndStreams> entry = iter.next();
Entry<Key, Value> entry = iter.next();
// if oldest socket expired, remove it
if (entry == null ||
Time.monotonicNow() - entry.getValue().getCreateTime() <
Time.monotonicNow() - entry.getValue().getTime() <
expiryPeriod) {
break;
}
IOUtils.cleanup(LOG, entry.getValue().getPeer());
iter.remove();
SocketAndStreams s = entry.getValue();
s.close();
}
}
@ -221,16 +228,18 @@ class SocketCache {
* Evict the oldest entry in the cache.
*/
private synchronized void evictOldest() {
Iterator<Entry<SocketAddress, SocketAndStreams>> iter =
// We can get the oldest element immediately, because of an interesting
// property of LinkedListMultimap: its iterator traverses entries in the
// order that they were added.
Iterator<Entry<Key, Value>> iter =
multimap.entries().iterator();
if (!iter.hasNext()) {
throw new IllegalStateException("Cannot evict from empty cache! " +
"capacity: " + capacity);
}
Entry<SocketAddress, SocketAndStreams> entry = iter.next();
Entry<Key, Value> entry = iter.next();
IOUtils.cleanup(LOG, entry.getValue().getPeer());
iter.remove();
SocketAndStreams s = entry.getValue();
s.close();
}
/**
@ -255,11 +264,24 @@ class SocketCache {
* Empty the cache, and close all sockets.
*/
@VisibleForTesting
protected synchronized void clear() {
for (SocketAndStreams sockAndStream : multimap.values()) {
sockAndStream.close();
synchronized void clear() {
for (Value value : multimap.values()) {
IOUtils.cleanup(LOG, value.getPeer());
}
multimap.clear();
}
@VisibleForTesting
void close() {
clear();
if (daemon != null) {
daemon.interrupt();
try {
daemon.join();
} catch (InterruptedException e) {
throw new RuntimeException("failed to join thread");
}
}
daemon = null;
}
}
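A sketch of the intended round trip, again from inside the package (connectToDataNode is a hypothetical helper); TCP and domain-socket peers to the same DataNode are cached under distinct keys:
PeerCache cache = PeerCache.getInstance(16, 30000);
Peer peer = cache.get(datanodeID, false);  // false: plain TCP peer
if (peer == null) {
  peer = connectToDataNode(datanodeID);    // hypothetical helper
}
// ... after a clean read, return the peer for reuse:
cache.put(datanodeID, peer);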

View File

@ -29,6 +29,8 @@ import java.nio.ByteBuffer;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.fs.FSInputChecker;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.net.Peer;
import org.apache.hadoop.hdfs.protocol.DatanodeID;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.protocol.datatransfer.DataTransferProtoUtil;
import org.apache.hadoop.hdfs.protocol.datatransfer.IOStreamPair;
@ -54,8 +56,8 @@ import org.apache.hadoop.util.DataChecksum;
@InterfaceAudience.Private
@Deprecated
public class RemoteBlockReader extends FSInputChecker implements BlockReader {
Socket dnSock; //for now just sending the status code (e.g. checksumOk) after the read.
private final Peer peer;
private final DatanodeID datanodeID;
private final DataInputStream in;
private DataChecksum checksum;
@ -125,9 +127,9 @@ public class RemoteBlockReader extends FSInputChecker implements BlockReader {
// if eos was set in the previous read, send a status code to the DN
if (eos && !eosBefore && nRead >= 0) {
if (needChecksum()) {
sendReadResult(dnSock, Status.CHECKSUM_OK);
sendReadResult(peer, Status.CHECKSUM_OK);
} else {
sendReadResult(dnSock, Status.SUCCESS);
sendReadResult(peer, Status.SUCCESS);
}
}
return nRead;
@ -321,7 +323,8 @@ public class RemoteBlockReader extends FSInputChecker implements BlockReader {
private RemoteBlockReader(String file, String bpid, long blockId,
DataInputStream in, DataChecksum checksum, boolean verifyChecksum,
long startOffset, long firstChunkOffset, long bytesToRead, Socket dnSock) {
long startOffset, long firstChunkOffset, long bytesToRead, Peer peer,
DatanodeID datanodeID) {
// Path is used only for printing block and file information in debug
super(new Path("/blk_" + blockId + ":" + bpid + ":of:"+ file)/*too non path-like?*/,
1, verifyChecksum,
@ -329,7 +332,8 @@ public class RemoteBlockReader extends FSInputChecker implements BlockReader {
checksum.getBytesPerChecksum(),
checksum.getChecksumSize());
this.dnSock = dnSock;
this.peer = peer;
this.datanodeID = datanodeID;
this.in = in;
this.checksum = checksum;
this.startOffset = Math.max( startOffset, 0 );
@ -348,13 +352,6 @@ public class RemoteBlockReader extends FSInputChecker implements BlockReader {
checksumSize = this.checksum.getChecksumSize();
}
public static RemoteBlockReader newBlockReader(Socket sock, String file,
ExtendedBlock block, Token<BlockTokenIdentifier> blockToken,
long startOffset, long len, int bufferSize) throws IOException {
return newBlockReader(sock, file, block, blockToken, startOffset,
len, bufferSize, true, "");
}
/**
* Create a new BlockReader specifically to satisfy a read.
* This method also sends the OP_READ_BLOCK request.
@ -370,16 +367,17 @@ public class RemoteBlockReader extends FSInputChecker implements BlockReader {
* @param clientName Client name
* @return New BlockReader instance, or null on error.
*/
public static RemoteBlockReader newBlockReader( Socket sock, String file,
public static RemoteBlockReader newBlockReader(String file,
ExtendedBlock block,
Token<BlockTokenIdentifier> blockToken,
long startOffset, long len,
int bufferSize, boolean verifyChecksum,
String clientName)
String clientName, Peer peer,
DatanodeID datanodeID)
throws IOException {
// in and out will be closed when sock is closed (by the caller)
final DataOutputStream out = new DataOutputStream(new BufferedOutputStream(
NetUtils.getOutputStream(sock, HdfsServerConstants.WRITE_TIMEOUT)));
final DataOutputStream out =
new DataOutputStream(new BufferedOutputStream(peer.getOutputStream()));
new Sender(out).readBlock(block, blockToken, clientName, startOffset, len,
verifyChecksum);
@ -388,12 +386,11 @@ public class RemoteBlockReader extends FSInputChecker implements BlockReader {
//
DataInputStream in = new DataInputStream(
new BufferedInputStream(NetUtils.getInputStream(sock),
bufferSize));
new BufferedInputStream(peer.getInputStream(), bufferSize));
BlockOpResponseProto status = BlockOpResponseProto.parseFrom(
PBHelper.vintPrefixed(in));
RemoteBlockReader2.checkSuccess(status, sock, block, file);
RemoteBlockReader2.checkSuccess(status, peer, block, file);
ReadOpChecksumInfoProto checksumInfo =
status.getReadOpChecksumInfo();
DataChecksum checksum = DataTransferProtoUtil.fromProto(
@ -411,15 +408,19 @@ public class RemoteBlockReader extends FSInputChecker implements BlockReader {
}
return new RemoteBlockReader(file, block.getBlockPoolId(), block.getBlockId(),
in, checksum, verifyChecksum, startOffset, firstChunkOffset, len, sock);
in, checksum, verifyChecksum, startOffset, firstChunkOffset, len,
peer, datanodeID);
}
@Override
public synchronized void close() throws IOException {
public synchronized void close(PeerCache peerCache,
FileInputStreamCache fisCache) throws IOException {
startOffset = -1;
checksum = null;
if (dnSock != null) {
dnSock.close();
if (peerCache != null & sentStatusCode) {
peerCache.put(datanodeID, peer);
} else {
peer.close();
}
// in will be closed when its Socket is closed.
@ -436,37 +437,21 @@ public class RemoteBlockReader extends FSInputChecker implements BlockReader {
return readFully(this, buf, offset, len);
}
@Override
public Socket takeSocket() {
assert hasSentStatusCode() :
"BlockReader shouldn't give back sockets mid-read";
Socket res = dnSock;
dnSock = null;
return res;
}
@Override
public boolean hasSentStatusCode() {
return sentStatusCode;
}
/**
* When the reader reaches end of the read, it sends a status response
* (e.g. CHECKSUM_OK) to the DN. Failure to do so could lead to the DN
* closing our connection (which we will re-open), but won't affect
* data correctness.
*/
void sendReadResult(Socket sock, Status statusCode) {
assert !sentStatusCode : "already sent status code to " + sock;
void sendReadResult(Peer peer, Status statusCode) {
assert !sentStatusCode : "already sent status code to " + peer;
try {
RemoteBlockReader2.writeReadResult(
NetUtils.getOutputStream(sock, HdfsServerConstants.WRITE_TIMEOUT),
statusCode);
RemoteBlockReader2.writeReadResult(peer.getOutputStream(), statusCode);
sentStatusCode = true;
} catch (IOException e) {
// It's ok not to be able to send this. But something is probably wrong.
LOG.info("Could not send read status (" + statusCode + ") to datanode " +
sock.getInetAddress() + ": " + e.getMessage());
peer.getRemoteAddressString() + ": " + e.getMessage());
}
}
@ -486,12 +471,11 @@ public class RemoteBlockReader extends FSInputChecker implements BlockReader {
public int read(ByteBuffer buf) throws IOException {
throw new UnsupportedOperationException("readDirect unsupported in RemoteBlockReader");
}
@Override
public IOStreamPair getStreams() {
// This class doesn't support encryption, which is the only thing this
// method is used for. See HDFS-3637.
return null;
public int available() throws IOException {
// An optimistic estimate of how much data is available
// to us without doing network I/O.
return DFSClient.TCP_WINDOW_SIZE;
}
}
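// Note (added for clarity): available() is an optimistic estimate of how
// much can be skipped without fresh network I/O. The remote readers report
// DFSClient.TCP_WINDOW_SIZE, preserving the old "diff <= TCP_WINDOW_SIZE"
// seek heuristic in DFSInputStream, while BlockReaderLocalLegacy reports
// Integer.MAX_VALUE because a local read never touches the network.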

View File

@ -23,16 +23,16 @@ import java.io.DataOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.net.InetSocketAddress;
import java.net.Socket;
import java.nio.ByteBuffer;
import java.nio.channels.ReadableByteChannel;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.hdfs.net.Peer;
import org.apache.hadoop.hdfs.protocol.DatanodeID;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.protocol.datatransfer.DataTransferProtoUtil;
import org.apache.hadoop.hdfs.protocol.datatransfer.IOStreamPair;
import org.apache.hadoop.hdfs.protocol.datatransfer.PacketHeader;
import org.apache.hadoop.hdfs.protocol.datatransfer.PacketReceiver;
import org.apache.hadoop.hdfs.protocol.datatransfer.Sender;
@ -44,10 +44,11 @@ import org.apache.hadoop.hdfs.security.token.block.DataEncryptionKey;
import org.apache.hadoop.hdfs.protocolPB.PBHelper;
import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier;
import org.apache.hadoop.hdfs.security.token.block.InvalidBlockTokenException;
import org.apache.hadoop.net.SocketInputWrapper;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.util.DataChecksum;
import com.google.common.annotations.VisibleForTesting;
/**
* This is a wrapper around connection to datanode
* and understands checksum, offset etc.
@ -79,9 +80,8 @@ public class RemoteBlockReader2 implements BlockReader {
static final Log LOG = LogFactory.getLog(RemoteBlockReader2.class);
Socket dnSock;
// for now just sending the status code (e.g. checksumOk) after the read.
private IOStreamPair ioStreams;
final private Peer peer;
final private DatanodeID datanodeID;
private final ReadableByteChannel in;
private DataChecksum checksum;
@ -114,6 +114,11 @@ public class RemoteBlockReader2 implements BlockReader {
/** Amount of unread data in the current received packet */
int dataLeft = 0;
@VisibleForTesting
public Peer getPeer() {
return peer;
}
@Override
public synchronized int read(byte[] buf, int off, int len)
throws IOException {
@ -246,13 +251,13 @@ public class RemoteBlockReader2 implements BlockReader {
}
protected RemoteBlockReader2(String file, String bpid, long blockId,
ReadableByteChannel in, DataChecksum checksum, boolean verifyChecksum,
long startOffset, long firstChunkOffset, long bytesToRead, Socket dnSock,
IOStreamPair ioStreams) {
DataChecksum checksum, boolean verifyChecksum,
long startOffset, long firstChunkOffset, long bytesToRead, Peer peer,
DatanodeID datanodeID) {
// Path is used only for printing block and file information in debug
this.dnSock = dnSock;
this.ioStreams = ioStreams;
this.in = in;
this.peer = peer;
this.datanodeID = datanodeID;
this.in = peer.getInputStreamChannel();
this.checksum = checksum;
this.verifyChecksum = verifyChecksum;
this.startOffset = Math.max( startOffset, 0 );
@ -269,39 +274,20 @@ public class RemoteBlockReader2 implements BlockReader {
@Override
public synchronized void close() throws IOException {
public synchronized void close(PeerCache peerCache,
FileInputStreamCache fisCache) throws IOException {
packetReceiver.close();
startOffset = -1;
checksum = null;
if (dnSock != null) {
dnSock.close();
if (peerCache != null && sentStatusCode) {
peerCache.put(datanodeID, peer);
} else {
peer.close();
}
// in will be closed when its Socket is closed.
}
/**
* Take the socket used to talk to the DN.
*/
@Override
public Socket takeSocket() {
assert hasSentStatusCode() :
"BlockReader shouldn't give back sockets mid-read";
Socket res = dnSock;
dnSock = null;
return res;
}
/**
* Whether the BlockReader has reached the end of its input stream
* and successfully sent a status code back to the datanode.
*/
@Override
public boolean hasSentStatusCode() {
return sentStatusCode;
}
/**
* When the reader reaches end of the read, it sends a status response
* (e.g. CHECKSUM_OK) to the DN. Failure to do so could lead to the DN
@@ -309,14 +295,14 @@ public class RemoteBlockReader2 implements BlockReader {
* data correctness.
*/
void sendReadResult(Status statusCode) {
assert !sentStatusCode : "already sent status code to " + dnSock;
assert !sentStatusCode : "already sent status code to " + peer;
try {
writeReadResult(ioStreams.out, statusCode);
writeReadResult(peer.getOutputStream(), statusCode);
sentStatusCode = true;
} catch (IOException e) {
// It's ok not to be able to send this. But something is probably wrong.
LOG.info("Could not send read status (" + statusCode + ") to datanode " +
dnSock.getInetAddress() + ": " + e.getMessage());
peer.getRemoteAddressString() + ": " + e.getMessage());
}
}
@@ -367,42 +353,34 @@ public class RemoteBlockReader2 implements BlockReader {
* @param blockToken The block token for security
* @param startOffset The read offset, relative to block head
* @param len The number of bytes to read
* @param bufferSize The IO buffer size (not the client buffer size)
* @param verifyChecksum Whether to verify checksum
* @param clientName Client name
* @param peer The Peer to use
* @param datanodeID The DatanodeID this peer is connected to
* @return New BlockReader instance, or null on error.
*/
public static BlockReader newBlockReader(Socket sock, String file,
public static BlockReader newBlockReader(String file,
ExtendedBlock block,
Token<BlockTokenIdentifier> blockToken,
long startOffset, long len,
int bufferSize, boolean verifyChecksum,
boolean verifyChecksum,
String clientName,
DataEncryptionKey encryptionKey,
IOStreamPair ioStreams)
Peer peer, DatanodeID datanodeID)
throws IOException {
ReadableByteChannel ch;
if (ioStreams.in instanceof SocketInputWrapper) {
ch = ((SocketInputWrapper)ioStreams.in).getReadableByteChannel();
} else {
ch = (ReadableByteChannel) ioStreams.in;
}
// in and out will be closed when sock is closed (by the caller)
final DataOutputStream out = new DataOutputStream(new BufferedOutputStream(
ioStreams.out));
peer.getOutputStream()));
new Sender(out).readBlock(block, blockToken, clientName, startOffset, len,
verifyChecksum);
//
// Get bytes in block
//
DataInputStream in = new DataInputStream(ioStreams.in);
DataInputStream in = new DataInputStream(peer.getInputStream());
BlockOpResponseProto status = BlockOpResponseProto.parseFrom(
PBHelper.vintPrefixed(in));
checkSuccess(status, sock, block, file);
checkSuccess(status, peer, block, file);
ReadOpChecksumInfoProto checksumInfo =
status.getReadOpChecksumInfo();
DataChecksum checksum = DataTransferProtoUtil.fromProto(
@@ -420,34 +398,36 @@ public class RemoteBlockReader2 implements BlockReader {
}
return new RemoteBlockReader2(file, block.getBlockPoolId(), block.getBlockId(),
ch, checksum, verifyChecksum, startOffset, firstChunkOffset, len, sock,
ioStreams);
checksum, verifyChecksum, startOffset, firstChunkOffset, len, peer,
datanodeID);
}
static void checkSuccess(
BlockOpResponseProto status, Socket sock,
BlockOpResponseProto status, Peer peer,
ExtendedBlock block, String file)
throws IOException {
if (status.getStatus() != Status.SUCCESS) {
if (status.getStatus() == Status.ERROR_ACCESS_TOKEN) {
throw new InvalidBlockTokenException(
"Got access token error for OP_READ_BLOCK, self="
+ sock.getLocalSocketAddress() + ", remote="
+ sock.getRemoteSocketAddress() + ", for file " + file
+ peer.getLocalAddressString() + ", remote="
+ peer.getRemoteAddressString() + ", for file " + file
+ ", for pool " + block.getBlockPoolId() + " block "
+ block.getBlockId() + "_" + block.getGenerationStamp());
} else {
throw new IOException("Got error for OP_READ_BLOCK, self="
+ sock.getLocalSocketAddress() + ", remote="
+ sock.getRemoteSocketAddress() + ", for file " + file
+ peer.getLocalAddressString() + ", remote="
+ peer.getRemoteAddressString() + ", for file " + file
+ ", for pool " + block.getBlockPoolId() + " block "
+ block.getBlockId() + "_" + block.getGenerationStamp());
}
}
}
@Override
public IOStreamPair getStreams() {
return ioStreams;
public int available() throws IOException {
// An optimistic estimate of how much data is available
// to us without doing network I/O.
return DFSClient.TCP_WINDOW_SIZE;
}
}
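The net effect of the changes above is that RemoteBlockReader2 no longer owns a raw Socket: it borrows a Peer, derives its ReadableByteChannel from it, and on close() either returns the peer to a PeerCache or tears it down. A minimal caller-side sketch of the new entry point (the variables sock, file, block, blockToken, encryptionKey and datanodeID are illustrative assumptions, not part of this patch):

// Hedged usage sketch; assumes an already-connected, channel-backed Socket.
Peer peer = TcpPeerServer.peerFromSocketAndKey(sock, encryptionKey);
BlockReader reader = RemoteBlockReader2.newBlockReader(
    file, block, blockToken,
    0,                    // startOffset, relative to the block head
    block.getNumBytes(),  // bytesToRead
    true,                 // verifyChecksum
    "example-client",     // clientName
    peer, datanodeID);
try {
  byte[] buf = new byte[4096];
  while (reader.read(buf, 0, buf.length) > 0) {
    // consume the data
  }
} finally {
  // A non-null PeerCache would let a fully-read connection be reused;
  // passing null simply closes the underlying Peer.
  reader.close(null, null);
}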

View File

@@ -0,0 +1,128 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.net;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.Socket;
import java.nio.channels.ReadableByteChannel;
import org.apache.hadoop.net.unix.DomainSocket;
/**
* Represents a peer that we communicate with by using a basic Socket
* that has no associated Channel.
*
*/
class BasicInetPeer implements Peer {
private final Socket socket;
private final OutputStream out;
private final InputStream in;
private final boolean isLocal;
public BasicInetPeer(Socket socket) throws IOException {
this.socket = socket;
this.out = socket.getOutputStream();
this.in = socket.getInputStream();
this.isLocal = socket.getInetAddress().equals(socket.getLocalAddress());
}
@Override
public ReadableByteChannel getInputStreamChannel() {
/*
* This Socket has no channel, so there's nothing to return here.
*/
return null;
}
@Override
public void setReadTimeout(int timeoutMs) throws IOException {
socket.setSoTimeout(timeoutMs);
}
@Override
public int getReceiveBufferSize() throws IOException {
return socket.getReceiveBufferSize();
}
@Override
public boolean getTcpNoDelay() throws IOException {
return socket.getTcpNoDelay();
}
@Override
public void setWriteTimeout(int timeoutMs) {
/*
* We can't implement write timeouts. :(
*
* Java provides no facility to set a blocking write timeout on a Socket.
* You can simulate a blocking write with a timeout by using
* non-blocking I/O. However, we can't use nio here, because this Socket
* doesn't have an associated Channel.
*
* See http://bugs.sun.com/bugdatabase/view_bug.do?bug_id=4031100 for
* more details.
*/
}
@Override
public boolean isClosed() {
return socket.isClosed();
}
@Override
public void close() throws IOException {
socket.close();
}
@Override
public String getRemoteAddressString() {
return socket.getRemoteSocketAddress().toString();
}
@Override
public String getLocalAddressString() {
return socket.getLocalSocketAddress().toString();
}
@Override
public InputStream getInputStream() throws IOException {
return in;
}
@Override
public OutputStream getOutputStream() throws IOException {
return out;
}
@Override
public boolean isLocal() {
return isLocal;
}
@Override
public String toString() {
return "BasicInetPeer(" + socket.toString() + ")";
}
@Override
public DomainSocket getDomainSocket() {
return null;
}
}
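BasicInetPeer is the fallback for Sockets created without an NIO channel; the null return from getInputStreamChannel() and the no-op setWriteTimeout() both follow from that missing channel. A small, self-contained illustration of when each case arises:

import java.net.Socket;
import java.nio.channels.SocketChannel;

public class ChannelOrNot {
  public static void main(String[] args) throws Exception {
    // A directly constructed Socket has no channel, so only blocking
    // stream I/O (and therefore BasicInetPeer) is possible.
    Socket plain = new Socket();
    System.out.println(plain.getChannel()); // prints "null"

    // A Socket obtained from SocketChannel.open() carries a channel,
    // which enables the NIO-based timeouts that NioInetPeer relies on.
    Socket nio = SocketChannel.open().socket();
    System.out.println(nio.getChannel()); // prints a channel reference
  }
}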

View File

@@ -0,0 +1,117 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.net;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.channels.ReadableByteChannel;
import org.apache.hadoop.net.unix.DomainSocket;
import org.apache.hadoop.classification.InterfaceAudience;
/**
* Represents a peer that we communicate with by using blocking I/O
* on a UNIX domain socket.
*/
@InterfaceAudience.Private
public class DomainPeer implements Peer {
private final DomainSocket socket;
private final OutputStream out;
private final InputStream in;
private final ReadableByteChannel channel;
public DomainPeer(DomainSocket socket) {
this.socket = socket;
this.out = socket.getOutputStream();
this.in = socket.getInputStream();
this.channel = socket.getChannel();
}
@Override
public ReadableByteChannel getInputStreamChannel() {
return channel;
}
@Override
public void setReadTimeout(int timeoutMs) throws IOException {
socket.setAttribute(DomainSocket.RECEIVE_TIMEOUT, timeoutMs);
}
@Override
public int getReceiveBufferSize() throws IOException {
return socket.getAttribute(DomainSocket.RECEIVE_BUFFER_SIZE);
}
@Override
public boolean getTcpNoDelay() throws IOException {
/* No TCP, no TCP_NODELAY. */
return false;
}
@Override
public void setWriteTimeout(int timeoutMs) throws IOException {
socket.setAttribute(DomainSocket.SEND_TIMEOUT, timeoutMs);
}
@Override
public boolean isClosed() {
return !socket.isOpen();
}
@Override
public void close() throws IOException {
socket.close();
}
@Override
public String getRemoteAddressString() {
return "unix:" + socket.getPath();
}
@Override
public String getLocalAddressString() {
return "<local>";
}
@Override
public InputStream getInputStream() throws IOException {
return in;
}
@Override
public OutputStream getOutputStream() throws IOException {
return out;
}
@Override
public boolean isLocal() {
/* UNIX domain sockets can only be used for local communication. */
return true;
}
@Override
public String toString() {
return "DomainPeer(" + getRemoteAddressString() + ")";
}
@Override
public DomainSocket getDomainSocket() {
return socket;
}
}
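On the client side, the counterpart of this class is a connect on the configured socket path. A minimal sketch, assuming a DataNode is already listening at that path and that DomainSocket.connect(String) is available from the common library this branch adds:

// Sketch only; "/var/run/hdfs/dn_socket" is an assumed path.
DomainSocket sock = DomainSocket.connect("/var/run/hdfs/dn_socket");
Peer peer = new DomainPeer(sock);
peer.setReadTimeout(60000);   // stored as DomainSocket.RECEIVE_TIMEOUT
peer.setWriteTimeout(60000);  // stored as DomainSocket.SEND_TIMEOUT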

View File

@@ -0,0 +1,89 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.net;
import java.io.IOException;
import java.net.SocketTimeoutException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.hdfs.net.PeerServer;
import org.apache.hadoop.net.unix.DomainSocket;
import org.apache.hadoop.classification.InterfaceAudience;
@InterfaceAudience.Private
public class DomainPeerServer implements PeerServer {
static Log LOG = LogFactory.getLog(DomainPeerServer.class);
private final DomainSocket sock;
DomainPeerServer(DomainSocket sock) {
this.sock = sock;
}
public DomainPeerServer(String path, int port)
throws IOException {
this(DomainSocket.bindAndListen(DomainSocket.getEffectivePath(path, port)));
}
public String getBindPath() {
return sock.getPath();
}
@Override
public void setReceiveBufferSize(int size) throws IOException {
sock.setAttribute(DomainSocket.RECEIVE_BUFFER_SIZE, size);
}
@Override
public Peer accept() throws IOException, SocketTimeoutException {
DomainSocket connSock = sock.accept();
Peer peer = null;
boolean success = false;
try {
peer = new DomainPeer(connSock);
success = true;
return peer;
} finally {
if (!success) {
if (peer != null) peer.close();
connSock.close();
}
}
}
@Override
public String getListeningString() {
return "unix:" + sock.getPath();
}
@Override
public void close() throws IOException {
try {
sock.close();
} catch (IOException e) {
LOG.error("error closing DomainPeerServer: ", e);
}
}
@Override
public String toString() {
return "DomainPeerServer(" + getListeningString() + ")";
}
}
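getEffectivePath matters because the configured path may embed a _PORT placeholder that is substituted with the streaming port, so several DataNodes on one host can share a single configuration value. A hedged construction sketch (path and port are examples):

// getEffectivePath("/var/run/hdfs/dn._PORT", 50010) is expected to yield
// "/var/run/hdfs/dn.50010".
DomainPeerServer server =
    new DomainPeerServer("/var/run/hdfs/dn._PORT", 50010);
server.setReceiveBufferSize(HdfsConstants.DEFAULT_DATA_SOCKET_SIZE);
Peer peer = server.accept(); // blocks until a local client connects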

View File

@@ -0,0 +1,142 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.net;
import java.io.IOException;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.hdfs.protocol.datatransfer.DataTransferEncryptor;
import org.apache.hadoop.hdfs.protocol.datatransfer.IOStreamPair;
import org.apache.hadoop.hdfs.security.token.block.DataEncryptionKey;
import org.apache.hadoop.net.unix.DomainSocket;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.channels.ReadableByteChannel;
/**
* Represents a peer that we communicate with by using an encrypted
* communications medium.
*/
@InterfaceAudience.Private
public class EncryptedPeer implements Peer {
private final Peer enclosedPeer;
/**
* An encrypted InputStream.
*/
private final InputStream in;
/**
* An encrypted OutputStream.
*/
private final OutputStream out;
/**
* An encrypted ReadableByteChannel.
*/
private final ReadableByteChannel channel;
public EncryptedPeer(Peer enclosedPeer, DataEncryptionKey key)
throws IOException {
this.enclosedPeer = enclosedPeer;
IOStreamPair ios = DataTransferEncryptor.getEncryptedStreams(
enclosedPeer.getOutputStream(), enclosedPeer.getInputStream(), key);
this.in = ios.in;
this.out = ios.out;
this.channel = ios.in instanceof ReadableByteChannel ?
(ReadableByteChannel)ios.in : null;
}
@Override
public ReadableByteChannel getInputStreamChannel() {
return channel;
}
@Override
public void setReadTimeout(int timeoutMs) throws IOException {
enclosedPeer.setReadTimeout(timeoutMs);
}
@Override
public int getReceiveBufferSize() throws IOException {
return enclosedPeer.getReceiveBufferSize();
}
@Override
public boolean getTcpNoDelay() throws IOException {
return enclosedPeer.getTcpNoDelay();
}
@Override
public void setWriteTimeout(int timeoutMs) throws IOException {
enclosedPeer.setWriteTimeout(timeoutMs);
}
@Override
public boolean isClosed() {
return enclosedPeer.isClosed();
}
@Override
public void close() throws IOException {
try {
in.close();
} finally {
try {
out.close();
} finally {
enclosedPeer.close();
}
}
}
@Override
public String getRemoteAddressString() {
return enclosedPeer.getRemoteAddressString();
}
@Override
public String getLocalAddressString() {
return enclosedPeer.getLocalAddressString();
}
@Override
public InputStream getInputStream() throws IOException {
return in;
}
@Override
public OutputStream getOutputStream() throws IOException {
return out;
}
@Override
public boolean isLocal() {
return enclosedPeer.isLocal();
}
@Override
public String toString() {
return "EncryptedPeer(" + enclosedPeer + ")";
}
@Override
public DomainSocket getDomainSocket() {
return enclosedPeer.getDomainSocket();
}
}
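EncryptedPeer is a plain decorator: the negotiated encrypted streams replace the unencrypted ones, while connection-level calls (timeouts, close, addresses) delegate to the wrapped peer. A sketch of how a caller composes it, equivalent to the peerFromSocketAndKey convenience method added below in TcpPeerServer:

Peer peer = TcpPeerServer.peerFromSocket(socket); // "socket" is assumed
if (encryptionKey != null) {
  // The encryption handshake runs in the constructor; on failure it
  // throws, and the caller must clean up the original peer.
  peer = new EncryptedPeer(peer, encryptionKey);
}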

View File

@@ -0,0 +1,131 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.net;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.Socket;
import java.nio.channels.ReadableByteChannel;
import org.apache.hadoop.net.SocketInputStream;
import org.apache.hadoop.net.SocketOutputStream;
import org.apache.hadoop.net.unix.DomainSocket;
/**
* Represents a peer that we communicate with by using non-blocking I/O
* on a Socket.
*/
class NioInetPeer implements Peer {
private final Socket socket;
/**
* An InputStream which simulates blocking I/O with timeouts using NIO.
*/
private final SocketInputStream in;
/**
* An OutputStream which simulates blocking I/O with timeouts using NIO.
*/
private final SocketOutputStream out;
private final boolean isLocal;
NioInetPeer(Socket socket) throws IOException {
this.socket = socket;
this.in = new SocketInputStream(socket.getChannel(), 0);
this.out = new SocketOutputStream(socket.getChannel(), 0);
this.isLocal = socket.getInetAddress().equals(socket.getLocalAddress());
}
@Override
public ReadableByteChannel getInputStreamChannel() {
return in;
}
@Override
public void setReadTimeout(int timeoutMs) throws IOException {
in.setTimeout(timeoutMs);
}
@Override
public int getReceiveBufferSize() throws IOException {
return socket.getReceiveBufferSize();
}
@Override
public boolean getTcpNoDelay() throws IOException {
return socket.getTcpNoDelay();
}
@Override
public void setWriteTimeout(int timeoutMs) throws IOException {
out.setTimeout(timeoutMs);
}
@Override
public boolean isClosed() {
return socket.isClosed();
}
@Override
public void close() throws IOException {
// We always close the outermost streams-- in this case, 'in' and 'out'
// Closing either one of these will also close the Socket.
try {
in.close();
} finally {
out.close();
}
}
@Override
public String getRemoteAddressString() {
return socket.getRemoteSocketAddress().toString();
}
@Override
public String getLocalAddressString() {
return socket.getLocalSocketAddress().toString();
}
@Override
public InputStream getInputStream() throws IOException {
return in;
}
@Override
public OutputStream getOutputStream() throws IOException {
return out;
}
@Override
public boolean isLocal() {
return isLocal;
}
@Override
public String toString() {
return "NioInetPeer(" + socket.toString() + ")";
}
@Override
public DomainSocket getDomainSocket() {
return null;
}
}
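Unlike BasicInetPeer, this variant can honor both read and write timeouts, because SocketInputStream and SocketOutputStream simulate blocking I/O over the channel. A sketch of obtaining one; the address is an assumption, and since NioInetPeer is package-private, callers go through TcpPeerServer.peerFromSocket:

Socket s = SocketChannel.open().socket();
s.connect(new InetSocketAddress("127.0.0.1", 50010)); // assumed address
Peer peer = TcpPeerServer.peerFromSocket(s); // channel present => NioInetPeer
peer.setWriteTimeout(8 * 60 * 1000); // honored here; a no-op on BasicInetPeer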

View File

@@ -0,0 +1,115 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.net;
import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.channels.ReadableByteChannel;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.net.unix.DomainSocket;
/**
* Represents a connection to a peer.
*/
@InterfaceAudience.Private
public interface Peer extends Closeable {
/**
* @return The input stream channel associated with this
* peer, or null if it has none.
*/
public ReadableByteChannel getInputStreamChannel();
/**
* Set the read timeout on this peer.
*
* @param timeoutMs The timeout in milliseconds.
*/
public void setReadTimeout(int timeoutMs) throws IOException;
/**
* @return The receive buffer size.
*/
public int getReceiveBufferSize() throws IOException;
/**
* @return True if TCP_NODELAY is turned on.
*/
public boolean getTcpNoDelay() throws IOException;
/**
* Set the write timeout on this peer.
*
* Note: this is not honored for BasicInetPeer.
* See {@link BasicInetPeer#setWriteTimeout} for details.
*
* @param timeoutMs The timeout in milliseconds.
*/
public void setWriteTimeout(int timeoutMs) throws IOException;
/**
* @return true only if the peer is closed.
*/
public boolean isClosed();
/**
* Close the peer.
*
* It's safe to re-close a Peer that is already closed.
*/
public void close() throws IOException;
/**
* @return A string representing the remote end of our
* connection to the peer.
*/
public String getRemoteAddressString();
/**
* @return A string representing the local end of our
* connection to the peer.
*/
public String getLocalAddressString();
/**
* @return An InputStream associated with the Peer.
* This InputStream will be valid until you close
* this peer with Peer#close.
*/
public InputStream getInputStream() throws IOException;
/**
* @return An OutputStream associated with the Peer.
* This OutputStream will be valid until you close
* this peer with Peer#close.
*/
public OutputStream getOutputStream() throws IOException;
/**
* @return True if the peer resides on the same
* computer as we do.
*/
public boolean isLocal();
/**
* @return The DomainSocket associated with the current
* peer, or null if there is none.
*/
public DomainSocket getDomainSocket();
}
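The point of the interface is that protocol code can stay transport-agnostic. An illustrative helper, not part of the patch:

// Everything this needs is available on any Peer, whether plain TCP,
// NIO-backed, encrypted, or UNIX domain.
static String describe(Peer peer) {
  return (peer.isLocal() ? "local " : "remote ")
      + (peer.getDomainSocket() != null ? "unix" : "inet")
      + " peer at " + peer.getRemoteAddressString();
}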

View File

@@ -0,0 +1,60 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.net;
import java.io.Closeable;
import org.apache.hadoop.classification.InterfaceAudience;
import java.io.IOException;
import java.net.SocketTimeoutException;
@InterfaceAudience.Private
public interface PeerServer extends Closeable {
/**
* Set the receive buffer size of the PeerServer.
*
* @param size The receive buffer size.
*/
public void setReceiveBufferSize(int size) throws IOException;
/**
* Listens for a connection to be made to this server and accepts
* it. The method blocks until a connection is made.
*
* @exception IOException if an I/O error occurs when waiting for a
* connection.
* @exception SecurityException if a security manager exists and its
* <code>checkAccept</code> method doesn't allow the operation.
* @exception SocketTimeoutException if a timeout was previously set and
* the timeout has been reached.
*/
public Peer accept() throws IOException, SocketTimeoutException;
/**
* @return A string representation of the address we're
* listening on.
*/
public String getListeningString();
/**
* Free the resources associated with this peer server.
* This normally includes sockets, etc.
*
* @throws IOException If there is an error closing the PeerServer
*/
public void close() throws IOException;
}
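A hedged sketch of the accept loop a consumer such as DataXceiverServer runs over any implementation, TCP or UNIX domain alike; handle() is a hypothetical per-connection handler:

void serve(PeerServer server) throws IOException {
  while (true) {
    final Peer peer = server.accept(); // may throw SocketTimeoutException
    new Daemon(new Runnable() {
      @Override
      public void run() {
        handle(peer); // hypothetical; close the peer when done
      }
    }).start();
  }
}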

View File

@@ -0,0 +1,156 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.net;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.ServerSocket;
import java.net.Socket;
import java.net.SocketTimeoutException;
import java.nio.channels.ServerSocketChannel;
import java.nio.channels.SocketChannel;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.hdfs.security.token.block.DataEncryptionKey;
import org.apache.hadoop.hdfs.server.datanode.SecureDataNodeStarter.SecureResources;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.ipc.Server;
@InterfaceAudience.Private
public class TcpPeerServer implements PeerServer {
static Log LOG = LogFactory.getLog(TcpPeerServer.class);
private final ServerSocket serverSocket;
public static Peer peerFromSocket(Socket socket)
throws IOException {
Peer peer = null;
boolean success = false;
try {
// TCP_NODELAY is crucial here because of bad interactions between
// Nagle's Algorithm and Delayed ACKs. With connection keepalive
// between the client and DN, the conversation looks like:
// 1. Client -> DN: Read block X
// 2. DN -> Client: data for block X
// 3. Client -> DN: Status OK (successful read)
// 4. Client -> DN: Read block Y
// The fact that step #3 and #4 are both in the client->DN direction
// triggers Nagling. If the DN is using delayed ACKs, this results
// in a delay of 40ms or more.
//
// TCP_NODELAY disables nagling and thus avoids this performance
// disaster.
socket.setTcpNoDelay(true);
SocketChannel channel = socket.getChannel();
if (channel == null) {
peer = new BasicInetPeer(socket);
} else {
peer = new NioInetPeer(socket);
}
success = true;
return peer;
} finally {
if (!success) {
if (peer != null) peer.close();
socket.close();
}
}
}
public static Peer peerFromSocketAndKey(Socket s,
DataEncryptionKey key) throws IOException {
Peer peer = null;
boolean success = false;
try {
peer = peerFromSocket(s);
if (key != null) {
peer = new EncryptedPeer(peer, key);
}
success = true;
return peer;
} finally {
if (!success) {
IOUtils.cleanup(null, peer);
}
}
}
/**
* Create a non-secure TcpPeerServer.
*
* @param socketWriteTimeout The Socket write timeout in ms.
* @param bindAddr The address to bind to.
* @throws IOException
*/
public TcpPeerServer(int socketWriteTimeout,
InetSocketAddress bindAddr) throws IOException {
this.serverSocket = (socketWriteTimeout > 0) ?
ServerSocketChannel.open().socket() : new ServerSocket();
Server.bind(serverSocket, bindAddr, 0);
}
/**
* Create a secure TcpPeerServer.
*
* @param secureResources Security resources.
*/
public TcpPeerServer(SecureResources secureResources) {
this.serverSocket = secureResources.getStreamingSocket();
}
/**
* @return the IP address which this TcpPeerServer is listening on.
*/
public InetSocketAddress getStreamingAddr() {
return new InetSocketAddress(
serverSocket.getInetAddress().getHostAddress(),
serverSocket.getLocalPort());
}
@Override
public void setReceiveBufferSize(int size) throws IOException {
this.serverSocket.setReceiveBufferSize(size);
}
@Override
public Peer accept() throws IOException, SocketTimeoutException {
Peer peer = peerFromSocket(serverSocket.accept());
return peer;
}
@Override
public String getListeningString() {
return serverSocket.getLocalSocketAddress().toString();
}
@Override
public void close() throws IOException {
try {
serverSocket.close();
} catch(IOException e) {
LOG.error("error closing TcpPeerServer: ", e);
}
}
@Override
public String toString() {
return "TcpPeerServer(" + getListeningString() + ")";
}
}
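Putting the TCP side together: a hedged sketch of the non-secure setup path. The bind address is an assumption, and HdfsServerConstants.WRITE_TIMEOUT is assumed to be the usual 8-minute default:

TcpPeerServer server = new TcpPeerServer(
    HdfsServerConstants.WRITE_TIMEOUT,          // > 0, so a channel is used
    new InetSocketAddress("0.0.0.0", 50010));   // assumed streaming address
server.setReceiveBufferSize(HdfsConstants.DEFAULT_DATA_SOCKET_SIZE);
Peer peer = server.accept(); // TCP_NODELAY is set inside peerFromSocket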

View File

@@ -107,6 +107,18 @@ public interface DataTransferProtocol {
final String clientName,
final DatanodeInfo[] targets) throws IOException;
/**
* Request short circuit access file descriptors from a DataNode.
*
* @param blk The block to get file descriptors for.
* @param blockToken Security token for accessing the block.
* @param maxVersion Maximum version of the block data the client
* can understand.
*/
public void requestShortCircuitFds(final ExtendedBlock blk,
final Token<BlockTokenIdentifier> blockToken,
int maxVersion) throws IOException;
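A hedged sketch of the exchange from the client's side, assuming the peer is a DomainPeer and that DomainSocket.recvFileInputStreams is the receiving counterpart of the sendFileDescriptors call the DataNode makes (see DataXceiver below); in, out, block and blockToken are assumed to be set up as in the RemoteBlockReader2 sketch above:

new Sender(out).requestShortCircuitFds(block, blockToken, 1);
BlockOpResponseProto resp =
    BlockOpResponseProto.parseFrom(PBHelper.vintPrefixed(in));
if (resp.getStatus() == Status.SUCCESS) {
  FileInputStream[] fis = new FileInputStream[2]; // block data + metadata
  byte[] buf = new byte[1];
  peer.getDomainSocket().recvFileInputStreams(fis, buf, 0, buf.length);
}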
/**
* Receive a block from a source datanode
* and then notifies the namenode

View File

@@ -34,7 +34,8 @@ public enum Op {
REPLACE_BLOCK((byte)83),
COPY_BLOCK((byte)84),
BLOCK_CHECKSUM((byte)85),
TRANSFER_BLOCK((byte)86);
TRANSFER_BLOCK((byte)86),
REQUEST_SHORT_CIRCUIT_FDS((byte)87);
/** The code for this operation. */
public final byte code;

View File

@@ -30,6 +30,7 @@ import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.OpCopyBlockProto
import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.OpReadBlockProto;
import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.OpReplaceBlockProto;
import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.OpTransferBlockProto;
import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.OpRequestShortCircuitAccessProto;
import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.OpWriteBlockProto;
import org.apache.hadoop.hdfs.protocolPB.PBHelper;
@@ -76,6 +77,9 @@ public abstract class Receiver implements DataTransferProtocol {
case TRANSFER_BLOCK:
opTransferBlock(in);
break;
case REQUEST_SHORT_CIRCUIT_FDS:
opRequestShortCircuitFds(in);
break;
default:
throw new IOException("Unknown op " + op + " in data stream");
}
@@ -117,6 +121,15 @@ public abstract class Receiver implements DataTransferProtocol {
PBHelper.convert(proto.getTargetsList()));
}
/** Receive {@link Op#REQUEST_SHORT_CIRCUIT_FDS} */
private void opRequestShortCircuitFds(DataInputStream in) throws IOException {
final OpRequestShortCircuitAccessProto proto =
OpRequestShortCircuitAccessProto.parseFrom(vintPrefixed(in));
requestShortCircuitFds(PBHelper.convert(proto.getHeader().getBlock()),
PBHelper.convert(proto.getHeader().getToken()),
proto.getMaxVersion());
}
/** Receive OP_REPLACE_BLOCK */
private void opReplaceBlock(DataInputStream in) throws IOException {
OpReplaceBlockProto proto = OpReplaceBlockProto.parseFrom(vintPrefixed(in));

View File

@@ -34,6 +34,7 @@ import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.OpCopyBlockProto
import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.OpReadBlockProto;
import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.OpReplaceBlockProto;
import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.OpTransferBlockProto;
import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.OpRequestShortCircuitAccessProto;
import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.OpWriteBlockProto;
import org.apache.hadoop.hdfs.protocolPB.PBHelper;
import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier;
@@ -140,6 +141,17 @@ public class Sender implements DataTransferProtocol {
send(out, Op.TRANSFER_BLOCK, proto);
}
@Override
public void requestShortCircuitFds(final ExtendedBlock blk,
final Token<BlockTokenIdentifier> blockToken,
int maxVersion) throws IOException {
OpRequestShortCircuitAccessProto proto =
OpRequestShortCircuitAccessProto.newBuilder()
.setHeader(DataTransferProtoUtil.buildBaseHeader(
blk, blockToken)).setMaxVersion(maxVersion).build();
send(out, Op.REQUEST_SHORT_CIRCUIT_FDS, proto);
}
@Override
public void replaceBlock(final ExtendedBlock blk,
final Token<BlockTokenIdentifier> blockToken,

View File

@@ -310,6 +310,23 @@ public class DelegationTokenSecretManager
namesystem.logUpdateMasterKey(key);
}
}
@Override //AbstractDelegationTokenManager
protected void logExpireToken(final DelegationTokenIdentifier dtId)
throws IOException {
synchronized (noInterruptsLock) {
// The edit logging code will fail catastrophically if it
// is interrupted during a logSync, since the interrupt
// closes the edit log files. Doing this inside the
// above lock and then checking interruption status
// prevents this bug.
if (Thread.interrupted()) {
throw new InterruptedIOException(
"Interrupted before expiring delegation token");
}
namesystem.logExpireDelegationToken(dtId);
}
}
/** A utility method for creating credentials. */
public static Credentials createCredentials(final NameNode namenode,

View File

@@ -45,6 +45,8 @@ import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.BlockReader;
import org.apache.hadoop.hdfs.BlockReaderFactory;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.net.TcpPeerServer;
import org.apache.hadoop.hdfs.protocol.DatanodeID;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
@@ -208,9 +210,12 @@ public class JspHelper {
// Use the block name for file name.
String file = BlockReaderFactory.getFileName(addr, poolId, blockId);
BlockReader blockReader = BlockReaderFactory.newBlockReader(
conf, s, file,
conf, file,
new ExtendedBlock(poolId, blockId, 0, genStamp), blockToken,
offsetIntoBlock, amtToRead, encryptionKey);
offsetIntoBlock, amtToRead, true,
"JspHelper", TcpPeerServer.peerFromSocketAndKey(s, encryptionKey),
new DatanodeID(addr.getAddress().toString(),
addr.getHostName(), poolId, addr.getPort(), 0, 0), null, false);
byte[] buf = new byte[(int)amtToRead];
int readOffset = 0;
@@ -229,8 +234,7 @@ public class JspHelper {
amtToRead -= numRead;
readOffset += numRead;
}
blockReader = null;
s.close();
blockReader.close(null, null);
out.print(HtmlQuoting.quoteHtmlChars(new String(buf, Charsets.UTF_8)));
}

View File

@@ -53,19 +53,18 @@ import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.PrintStream;
import java.net.InetSocketAddress;
import java.net.ServerSocket;
import java.net.Socket;
import java.net.SocketException;
import java.net.SocketTimeoutException;
import java.net.URI;
import java.net.UnknownHostException;
import java.nio.channels.ClosedByInterruptException;
import java.nio.channels.ServerSocketChannel;
import java.nio.channels.SocketChannel;
import java.security.PrivilegedExceptionAction;
import java.util.AbstractList;
@@ -93,6 +92,8 @@ import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.HDFSPolicyProvider;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.net.DomainPeerServer;
import org.apache.hadoop.hdfs.net.TcpPeerServer;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.BlockLocalPathInfo;
import org.apache.hadoop.hdfs.protocol.ClientDatanodeProtocol;
@@ -151,11 +152,11 @@ import org.apache.hadoop.io.ReadaheadPool;
import org.apache.hadoop.ipc.ProtobufRpcEngine;
import org.apache.hadoop.ipc.RPC;
import org.apache.hadoop.ipc.RemoteException;
import org.apache.hadoop.ipc.Server;
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
import org.apache.hadoop.metrics2.util.MBeans;
import org.apache.hadoop.net.DNS;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.net.unix.DomainSocket;
import org.apache.hadoop.security.AccessControlException;
import org.apache.hadoop.security.SecurityUtil;
import org.apache.hadoop.security.UserGroupInformation;
@@ -235,6 +236,7 @@ public class DataNode extends Configured
LogFactory.getLog(DataNode.class.getName() + ".clienttrace");
private static final String USAGE = "Usage: java DataNode [-rollback | -regular]";
static final int CURRENT_BLOCK_FORMAT_VERSION = 1;
/**
* Use {@link NetUtils#createSocketAddr(String)} instead.
@@ -252,6 +254,7 @@ public class DataNode extends Configured
public final static String EMPTY_DEL_HINT = "";
AtomicInteger xmitsInProgress = new AtomicInteger();
Daemon dataXceiverServer = null;
Daemon localDataXceiverServer = null;
ThreadGroup threadGroup = null;
private DNConf dnConf;
private volatile boolean heartbeatsDisabledForTests = false;
@@ -263,6 +266,7 @@ public class DataNode extends Configured
private String hostName;
private DatanodeID id;
final private String fileDescriptorPassingDisabledReason;
boolean isBlockTokenEnabled;
BlockPoolTokenSecretManager blockPoolTokenSecretManager;
private boolean hasAnyBlockPoolRegistered = false;
@@ -311,6 +315,24 @@ public class DataNode extends Configured
this.getHdfsBlockLocationsEnabled = conf.getBoolean(
DFSConfigKeys.DFS_HDFS_BLOCKS_METADATA_ENABLED,
DFSConfigKeys.DFS_HDFS_BLOCKS_METADATA_ENABLED_DEFAULT);
// Determine whether we should try to pass file descriptors to clients.
if (conf.getBoolean(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_KEY,
DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_DEFAULT)) {
String reason = DomainSocket.getLoadingFailureReason();
if (reason != null) {
LOG.warn("File descriptor passing is disabled because " + reason);
this.fileDescriptorPassingDisabledReason = reason;
} else {
LOG.info("File descriptor passing is enabled.");
this.fileDescriptorPassingDisabledReason = null;
}
} else {
this.fileDescriptorPassingDisabledReason =
"File descriptor passing was not configured.";
LOG.debug(this.fileDescriptorPassingDisabledReason);
}
try {
hostName = getHostName(conf);
LOG.info("Configured hostname is " + hostName);
@@ -525,25 +547,63 @@ public class DataNode extends Configured
private void initDataXceiver(Configuration conf) throws IOException {
// find free port or use privileged port provided
ServerSocket ss;
if (secureResources == null) {
InetSocketAddress addr = DataNode.getStreamingAddr(conf);
ss = (dnConf.socketWriteTimeout > 0) ?
ServerSocketChannel.open().socket() : new ServerSocket();
Server.bind(ss, addr, 0);
TcpPeerServer tcpPeerServer;
if (secureResources != null) {
tcpPeerServer = new TcpPeerServer(secureResources);
} else {
ss = secureResources.getStreamingSocket();
tcpPeerServer = new TcpPeerServer(dnConf.socketWriteTimeout,
DataNode.getStreamingAddr(conf));
}
ss.setReceiveBufferSize(HdfsConstants.DEFAULT_DATA_SOCKET_SIZE);
streamingAddr = new InetSocketAddress(ss.getInetAddress().getHostAddress(),
ss.getLocalPort());
tcpPeerServer.setReceiveBufferSize(HdfsConstants.DEFAULT_DATA_SOCKET_SIZE);
streamingAddr = tcpPeerServer.getStreamingAddr();
LOG.info("Opened streaming server at " + streamingAddr);
this.threadGroup = new ThreadGroup("dataXceiverServer");
this.dataXceiverServer = new Daemon(threadGroup,
new DataXceiverServer(ss, conf, this));
new DataXceiverServer(tcpPeerServer, conf, this));
this.threadGroup.setDaemon(true); // auto destroy when empty
if (conf.getBoolean(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_KEY,
DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_DEFAULT) ||
conf.getBoolean(DFSConfigKeys.DFS_CLIENT_DOMAIN_SOCKET_DATA_TRAFFIC,
DFSConfigKeys.DFS_CLIENT_DOMAIN_SOCKET_DATA_TRAFFIC_DEFAULT)) {
DomainPeerServer domainPeerServer =
getDomainPeerServer(conf, streamingAddr.getPort());
if (domainPeerServer != null) {
this.localDataXceiverServer = new Daemon(threadGroup,
new DataXceiverServer(domainPeerServer, conf, this));
LOG.info("Listening on UNIX domain socket: " +
domainPeerServer.getBindPath());
}
}
}
static DomainPeerServer getDomainPeerServer(Configuration conf,
int port) throws IOException {
String domainSocketPath =
conf.getTrimmed(DFSConfigKeys.DFS_DOMAIN_SOCKET_PATH_KEY,
DFSConfigKeys.DFS_DOMAIN_SOCKET_PATH_DEFAULT);
if (domainSocketPath.isEmpty()) {
if (conf.getBoolean(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_KEY,
DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_DEFAULT) &&
(!conf.getBoolean(DFSConfigKeys.DFS_CLIENT_USE_LEGACY_BLOCKREADERLOCAL,
DFSConfigKeys.DFS_CLIENT_USE_LEGACY_BLOCKREADERLOCAL_DEFAULT))) {
LOG.warn("Although short-circuit local reads are configured, " +
"they are disabled because you didn't configure " +
DFSConfigKeys.DFS_DOMAIN_SOCKET_PATH_KEY);
}
return null;
}
if (DomainSocket.getLoadingFailureReason() != null) {
throw new RuntimeException("Although a UNIX domain socket " +
"path is configured as " + domainSocketPath + ", we cannot " +
"start a localDataXceiverServer because " +
DomainSocket.getLoadingFailureReason());
}
DomainPeerServer domainPeerServer =
new DomainPeerServer(domainSocketPath, port);
domainPeerServer.setReceiveBufferSize(
HdfsConstants.DEFAULT_DATA_SOCKET_SIZE);
return domainPeerServer;
}
// calls specific to BP
@@ -1044,6 +1104,42 @@ public class DataNode extends Configured
return info;
}
@InterfaceAudience.LimitedPrivate("HDFS")
static public class ShortCircuitFdsUnsupportedException extends IOException {
private static final long serialVersionUID = 1L;
public ShortCircuitFdsUnsupportedException(String msg) {
super(msg);
}
}
@InterfaceAudience.LimitedPrivate("HDFS")
static public class ShortCircuitFdsVersionException extends IOException {
private static final long serialVersionUID = 1L;
public ShortCircuitFdsVersionException(String msg) {
super(msg);
}
}
FileInputStream[] requestShortCircuitFdsForRead(final ExtendedBlock blk,
final Token<BlockTokenIdentifier> token, int maxVersion)
throws ShortCircuitFdsUnsupportedException,
ShortCircuitFdsVersionException, IOException {
if (fileDescriptorPassingDisabledReason != null) {
throw new ShortCircuitFdsUnsupportedException(
fileDescriptorPassingDisabledReason);
}
checkBlockToken(blk, token, BlockTokenSecretManager.AccessMode.READ);
int blkVersion = CURRENT_BLOCK_FORMAT_VERSION;
if (maxVersion < blkVersion) {
throw new ShortCircuitFdsVersionException("Your client is too old " +
"to read this block! Its format version is " +
blkVersion + ", but the highest format version you can read is " +
maxVersion);
}
metrics.incrBlocksGetLocalPathInfo();
return data.getShortCircuitFdsForRead(blk);
}
@Override
public HdfsBlocksMetadata getHdfsBlocksMetadata(List<ExtendedBlock> blocks,
List<Token<BlockTokenIdentifier>> tokens) throws IOException,
@@ -1118,32 +1214,45 @@ public class DataNode extends Configured
if (dataXceiverServer != null) {
((DataXceiverServer) this.dataXceiverServer.getRunnable()).kill();
this.dataXceiverServer.interrupt();
// wait for all data receiver threads to exit
if (this.threadGroup != null) {
int sleepMs = 2;
while (true) {
this.threadGroup.interrupt();
LOG.info("Waiting for threadgroup to exit, active threads is " +
this.threadGroup.activeCount());
if (this.threadGroup.activeCount() == 0) {
break;
}
try {
Thread.sleep(sleepMs);
} catch (InterruptedException e) {}
sleepMs = sleepMs * 3 / 2; // exponential backoff
if (sleepMs > 1000) {
sleepMs = 1000;
}
}
if (localDataXceiverServer != null) {
((DataXceiverServer) this.localDataXceiverServer.getRunnable()).kill();
this.localDataXceiverServer.interrupt();
}
// wait for all data receiver threads to exit
if (this.threadGroup != null) {
int sleepMs = 2;
while (true) {
this.threadGroup.interrupt();
LOG.info("Waiting for threadgroup to exit, active threads is " +
this.threadGroup.activeCount());
if (this.threadGroup.activeCount() == 0) {
break;
}
try {
Thread.sleep(sleepMs);
} catch (InterruptedException e) {}
sleepMs = sleepMs * 3 / 2; // exponential backoff
if (sleepMs > 1000) {
sleepMs = 1000;
}
}
// wait for dataXceiveServer to terminate
this.threadGroup = null;
}
if (this.dataXceiverServer != null) {
// wait for dataXceiverServer to terminate
try {
this.dataXceiverServer.join();
} catch (InterruptedException ie) {
}
}
if (this.localDataXceiverServer != null) {
// wait for localDataXceiverServer to terminate
try {
this.localDataXceiverServer.join();
} catch (InterruptedException ie) {
}
}
if(blockPoolManager != null) {
try {
@@ -1538,6 +1647,9 @@ public class DataNode extends Configured
// start dataXceiverServer
dataXceiverServer.start();
if (localDataXceiverServer != null) {
localDataXceiverServer.start();
}
ipcServer.start();
startPlugins(conf);
}

View File

@@ -18,6 +18,7 @@
package org.apache.hadoop.hdfs.server.datanode;
import static org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.Status.ERROR;
import static org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.Status.ERROR_UNSUPPORTED;
import static org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.Status.ERROR_ACCESS_TOKEN;
import static org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.Status.SUCCESS;
import static org.apache.hadoop.util.Time.now;
@@ -28,6 +29,8 @@ import java.io.BufferedOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.EOFException;
import java.io.FileDescriptor;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InterruptedIOException;
@@ -39,6 +42,7 @@ import java.nio.channels.ClosedChannelException;
import java.util.Arrays;
import org.apache.commons.logging.Log;
import org.apache.hadoop.hdfs.net.Peer;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
@@ -59,12 +63,13 @@ import org.apache.hadoop.hdfs.protocolPB.PBHelper;
import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier;
import org.apache.hadoop.hdfs.security.token.block.BlockTokenSecretManager;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants;
import org.apache.hadoop.hdfs.server.datanode.DataNode.ShortCircuitFdsUnsupportedException;
import org.apache.hadoop.hdfs.server.datanode.DataNode.ShortCircuitFdsVersionException;
import org.apache.hadoop.hdfs.server.datanode.fsdataset.LengthInputStream;
import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.MD5Hash;
import org.apache.hadoop.net.NetUtils;
import org.apache.hadoop.net.SocketInputWrapper;
import org.apache.hadoop.security.token.SecretManager.InvalidToken;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.util.DataChecksum;
@@ -79,8 +84,7 @@ class DataXceiver extends Receiver implements Runnable {
public static final Log LOG = DataNode.LOG;
static final Log ClientTraceLog = DataNode.ClientTraceLog;
private final Socket s;
private final boolean isLocal; //is a local connection?
private final Peer peer;
private final String remoteAddress; // address of remote side
private final String localAddress; // local address of this daemon
private final DataNode datanode;
@@ -88,7 +92,7 @@ class DataXceiver extends Receiver implements Runnable {
private final DataXceiverServer dataXceiverServer;
private final boolean connectToDnViaHostname;
private long opStartTime; //the start time of receiving an Op
private final SocketInputWrapper socketIn;
private final InputStream socketIn;
private OutputStream socketOut;
/**
@@ -97,25 +101,23 @@
*/
private String previousOpClientName;
public static DataXceiver create(Socket s, DataNode dn,
public static DataXceiver create(Peer peer, DataNode dn,
DataXceiverServer dataXceiverServer) throws IOException {
return new DataXceiver(s, dn, dataXceiverServer);
return new DataXceiver(peer, dn, dataXceiverServer);
}
private DataXceiver(Socket s,
DataNode datanode,
private DataXceiver(Peer peer, DataNode datanode,
DataXceiverServer dataXceiverServer) throws IOException {
this.s = s;
this.peer = peer;
this.dnConf = datanode.getDnConf();
this.socketIn = NetUtils.getInputStream(s);
this.socketOut = NetUtils.getOutputStream(s, dnConf.socketWriteTimeout);
this.isLocal = s.getInetAddress().equals(s.getLocalAddress());
this.socketIn = peer.getInputStream();
this.socketOut = peer.getOutputStream();
this.datanode = datanode;
this.dataXceiverServer = dataXceiverServer;
this.connectToDnViaHostname = datanode.getDnConf().connectToDnViaHostname;
remoteAddress = s.getRemoteSocketAddress().toString();
localAddress = s.getLocalSocketAddress().toString();
remoteAddress = peer.getRemoteAddressString();
localAddress = peer.getLocalAddressString();
if (LOG.isDebugEnabled()) {
LOG.debug("Number of active connections is: "
@@ -155,11 +157,10 @@ class DataXceiver extends Receiver implements Runnable {
public void run() {
int opsProcessed = 0;
Op op = null;
dataXceiverServer.childSockets.add(s);
dataXceiverServer.addPeer(peer);
try {
peer.setWriteTimeout(datanode.getDnConf().socketWriteTimeout);
InputStream input = socketIn;
if (dnConf.encryptDataTransfer) {
IOStreamPair encryptedStreams = null;
@@ -169,8 +170,9 @@
dnConf.encryptionAlgorithm);
} catch (InvalidMagicNumberException imne) {
LOG.info("Failed to read expected encryption handshake from client " +
"at " + s.getInetAddress() + ". Perhaps the client is running an " +
"older version of Hadoop which does not support encryption");
"at " + peer.getRemoteAddressString() + ". Perhaps the client " +
"is running an older version of Hadoop which does not support " +
"encryption");
return;
}
input = encryptedStreams.in;
@@ -189,9 +191,9 @@
try {
if (opsProcessed != 0) {
assert dnConf.socketKeepaliveTimeout > 0;
socketIn.setTimeout(dnConf.socketKeepaliveTimeout);
peer.setReadTimeout(dnConf.socketKeepaliveTimeout);
} else {
socketIn.setTimeout(dnConf.socketTimeout);
peer.setReadTimeout(dnConf.socketTimeout);
}
op = readOp();
} catch (InterruptedIOException ignored) {
@@ -202,7 +204,7 @@
if (opsProcessed > 0 &&
(err instanceof EOFException || err instanceof ClosedChannelException)) {
if (LOG.isDebugEnabled()) {
LOG.debug("Cached " + s.toString() + " closing after " + opsProcessed + " ops");
LOG.debug("Cached " + peer + " closing after " + opsProcessed + " ops");
}
} else {
throw err;
@@ -212,13 +214,13 @@
// restore normal timeout
if (opsProcessed != 0) {
s.setSoTimeout(dnConf.socketTimeout);
peer.setReadTimeout(dnConf.socketTimeout);
}
opStartTime = now();
processOp(op);
++opsProcessed;
} while (!s.isClosed() && dnConf.socketKeepaliveTimeout > 0);
} while (!peer.isClosed() && dnConf.socketKeepaliveTimeout > 0);
} catch (Throwable t) {
LOG.error(datanode.getDisplayName() + ":DataXceiver error processing " +
((op == null) ? "unknown" : op.name()) + " operation " +
@@ -230,9 +232,70 @@
+ datanode.getXceiverCount());
}
updateCurrentThreadName("Cleaning up");
dataXceiverServer.closePeer(peer);
IOUtils.closeStream(in);
IOUtils.closeSocket(s);
dataXceiverServer.childSockets.remove(s);
}
}
@Override
public void requestShortCircuitFds(final ExtendedBlock blk,
final Token<BlockTokenIdentifier> token,
int maxVersion) throws IOException {
updateCurrentThreadName("Passing file descriptors for block " + blk);
BlockOpResponseProto.Builder bld = BlockOpResponseProto.newBuilder();
FileInputStream fis[] = null;
try {
if (peer.getDomainSocket() == null) {
throw new IOException("You cannot pass file descriptors over " +
"anything but a UNIX domain socket.");
}
fis = datanode.requestShortCircuitFdsForRead(blk, token, maxVersion);
bld.setStatus(SUCCESS);
bld.setShortCircuitAccessVersion(DataNode.CURRENT_BLOCK_FORMAT_VERSION);
} catch (ShortCircuitFdsVersionException e) {
bld.setStatus(ERROR_UNSUPPORTED);
bld.setShortCircuitAccessVersion(DataNode.CURRENT_BLOCK_FORMAT_VERSION);
bld.setMessage(e.getMessage());
} catch (ShortCircuitFdsUnsupportedException e) {
bld.setStatus(ERROR_UNSUPPORTED);
bld.setMessage(e.getMessage());
} catch (InvalidToken e) {
bld.setStatus(ERROR_ACCESS_TOKEN);
bld.setMessage(e.getMessage());
} catch (IOException e) {
bld.setStatus(ERROR);
bld.setMessage(e.getMessage());
}
try {
bld.build().writeDelimitedTo(socketOut);
if (fis != null) {
FileDescriptor fds[] = new FileDescriptor[fis.length];
for (int i = 0; i < fds.length; i++) {
fds[i] = fis[i].getFD();
}
byte buf[] = new byte[] { (byte)0 };
peer.getDomainSocket().
sendFileDescriptors(fds, buf, 0, buf.length);
}
} finally {
if (ClientTraceLog.isInfoEnabled()) {
DatanodeRegistration dnR = datanode.getDNRegistrationForBP(blk
.getBlockPoolId());
BlockSender.ClientTraceLog.info(String.format(
"src: %s, dest: %s, op: %s, blockid: %s, srvID: %s, " +
"success: %b",
"127.0.0.1", // src IP
"127.0.0.1", // dst IP
"REQUEST_SHORT_CIRCUIT_FDS", // operation
blk.getBlockId(), // block id
dnR.getStorageID(),
(fis != null)
));
}
if (fis != null) {
IOUtils.cleanup(LOG, fis);
}
}
}
@@ -287,8 +350,9 @@
ClientReadStatusProto stat = ClientReadStatusProto.parseFrom(
PBHelper.vintPrefixed(in));
if (!stat.hasStatus()) {
LOG.warn("Client " + s.getInetAddress() + " did not send a valid status " +
"code after reading. Will close connection.");
LOG.warn("Client " + peer.getRemoteAddressString() +
" did not send a valid status code after reading. " +
"Will close connection.");
IOUtils.closeStream(out);
}
} catch (IOException ioe) {
@@ -321,7 +385,7 @@
//update metrics
datanode.metrics.addReadBlockOp(elapsed());
datanode.metrics.incrReadsFromClient(isLocal);
datanode.metrics.incrReadsFromClient(peer.isLocal());
}
@Override
@@ -359,8 +423,8 @@
LOG.debug("isDatanode=" + isDatanode
+ ", isClient=" + isClient
+ ", isTransfer=" + isTransfer);
LOG.debug("writeBlock receive buf size " + s.getReceiveBufferSize() +
" tcp no delay " + s.getTcpNoDelay());
LOG.debug("writeBlock receive buf size " + peer.getReceiveBufferSize() +
" tcp no delay " + peer.getTcpNoDelay());
}
// We later mutate block's generation stamp and length, but we need to
@@ -391,8 +455,8 @@
stage != BlockConstructionStage.PIPELINE_CLOSE_RECOVERY) {
// open a block receiver
blockReceiver = new BlockReceiver(block, in,
s.getRemoteSocketAddress().toString(),
s.getLocalSocketAddress().toString(),
peer.getRemoteAddressString(),
peer.getLocalAddressString(),
stage, latestGenerationStamp, minBytesRcvd, maxBytesRcvd,
clientname, srcDataNode, datanode, requestedChecksum);
} else {
@@ -547,7 +611,7 @@
//update metrics
datanode.metrics.addWriteBlockOp(elapsed());
datanode.metrics.incrWritesFromClient(isLocal);
datanode.metrics.incrWritesFromClient(peer.isLocal());
}
@Override
@@ -555,7 +619,7 @@
final Token<BlockTokenIdentifier> blockToken,
final String clientName,
final DatanodeInfo[] targets) throws IOException {
checkAccess(null, true, blk, blockToken,
checkAccess(socketOut, true, blk, blockToken,
Op.TRANSFER_BLOCK, BlockTokenSecretManager.AccessMode.COPY);
previousOpClientName = clientName;
updateCurrentThreadName(Op.TRANSFER_BLOCK + " " + blk);
@@ -642,8 +706,9 @@
}
if (!dataXceiverServer.balanceThrottler.acquire()) { // not able to start
String msg = "Not able to copy block " + block.getBlockId() + " to "
+ s.getRemoteSocketAddress() + " because threads quota is exceeded.";
String msg = "Not able to copy block " + block.getBlockId() + " " +
"to " + peer.getRemoteAddressString() + " because threads " +
"quota is exceeded.";
LOG.info(msg);
sendResponse(ERROR, msg);
return;
@@ -672,7 +737,7 @@
datanode.metrics.incrBytesRead((int) read);
datanode.metrics.incrBlocksRead();
LOG.info("Copied " + block + " to " + s.getRemoteSocketAddress());
LOG.info("Copied " + block + " to " + peer.getRemoteAddressString());
} catch (IOException ioe) {
isOpSuccess = false;
LOG.info("opCopyBlock " + block + " received exception " + ioe);
@@ -717,8 +782,9 @@
}
if (!dataXceiverServer.balanceThrottler.acquire()) { // not able to start
String msg = "Not able to receive block " + block.getBlockId() + " from "
+ s.getRemoteSocketAddress() + " because threads quota is exceeded.";
String msg = "Not able to receive block " + block.getBlockId() +
" from " + peer.getRemoteAddressString() + " because threads " +
"quota is exceeded.";
LOG.warn(msg);
sendResponse(ERROR, msg);
return;
@ -795,7 +861,7 @@ class DataXceiver extends Receiver implements Runnable {
// notify name node
datanode.notifyNamenodeReceivedBlock(block, delHint);
LOG.info("Moved " + block + " from " + s.getRemoteSocketAddress());
LOG.info("Moved " + block + " from " + peer.getRemoteAddressString());
} catch (IOException ioe) {
opStatus = ERROR;
@ -818,7 +884,7 @@ class DataXceiver extends Receiver implements Runnable {
try {
sendResponse(opStatus, errMsg);
} catch (IOException ioe) {
LOG.warn("Error writing reply back to " + s.getRemoteSocketAddress());
LOG.warn("Error writing reply back to " + peer.getRemoteAddressString());
}
IOUtils.closeStream(proxyOut);
IOUtils.closeStream(blockReceiver);
@ -872,7 +938,7 @@ class DataXceiver extends Receiver implements Runnable {
}
private void checkAccess(DataOutputStream out, final boolean reply,
private void checkAccess(OutputStream out, final boolean reply,
final ExtendedBlock blk,
final Token<BlockTokenIdentifier> t,
final Op op,
@ -887,11 +953,6 @@ class DataXceiver extends Receiver implements Runnable {
} catch(InvalidToken e) {
try {
if (reply) {
if (out == null) {
out = new DataOutputStream(
NetUtils.getOutputStream(s, dnConf.socketWriteTimeout));
}
BlockOpResponseProto.Builder resp = BlockOpResponseProto.newBuilder()
.setStatus(ERROR_ACCESS_TOKEN);
if (mode == BlockTokenSecretManager.AccessMode.WRITE) {

View File

@ -18,18 +18,16 @@
package org.apache.hadoop.hdfs.server.datanode;
import java.io.IOException;
import java.net.ServerSocket;
import java.net.Socket;
import java.net.SocketTimeoutException;
import java.nio.channels.AsynchronousCloseException;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import org.apache.commons.logging.Log;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.net.Peer;
import org.apache.hadoop.hdfs.net.PeerServer;
import org.apache.hadoop.hdfs.server.balancer.Balancer;
import org.apache.hadoop.hdfs.util.DataTransferThrottler;
import org.apache.hadoop.io.IOUtils;
@ -45,11 +43,9 @@ import org.apache.hadoop.util.Daemon;
class DataXceiverServer implements Runnable {
public static final Log LOG = DataNode.LOG;
ServerSocket ss;
DataNode datanode;
// Record all sockets opened for data transfer
Set<Socket> childSockets = Collections.synchronizedSet(
new HashSet<Socket>());
private final PeerServer peerServer;
private final DataNode datanode;
private final Set<Peer> peers = new HashSet<Peer>();
/**
* Maximal number of concurrent xceivers per node.
@ -109,10 +105,10 @@ class DataXceiverServer implements Runnable {
long estimateBlockSize;
DataXceiverServer(ServerSocket ss, Configuration conf,
DataXceiverServer(PeerServer peerServer, Configuration conf,
DataNode datanode) {
this.ss = ss;
this.peerServer = peerServer;
this.datanode = datanode;
this.maxXceiverCount =
@ -130,12 +126,10 @@ class DataXceiverServer implements Runnable {
@Override
public void run() {
Peer peer = null;
while (datanode.shouldRun) {
Socket s = null;
try {
s = ss.accept();
s.setTcpNoDelay(true);
// Timeouts are set within DataXceiver.run()
peer = peerServer.accept();
// Make sure the xceiver count is not exceeded
int curXceiverCount = datanode.getXceiverCount();
@ -146,7 +140,7 @@ class DataXceiverServer implements Runnable {
}
new Daemon(datanode.threadGroup,
DataXceiver.create(s, datanode, this))
DataXceiver.create(peer, datanode, this))
.start();
} catch (SocketTimeoutException ignored) {
// wake up to see if should continue to run
@ -157,10 +151,10 @@ class DataXceiverServer implements Runnable {
LOG.warn(datanode.getDisplayName() + ":DataXceiverServer: ", ace);
}
} catch (IOException ie) {
IOUtils.closeSocket(s);
IOUtils.cleanup(null, peer);
LOG.warn(datanode.getDisplayName() + ":DataXceiverServer: ", ie);
} catch (OutOfMemoryError ie) {
IOUtils.closeSocket(s);
IOUtils.cleanup(null, peer);
// DataNode can run out of memory if there are too many transfers.
// Log the event and sleep for 30 seconds; other transfers may complete
// by then.
@ -176,33 +170,35 @@ class DataXceiverServer implements Runnable {
datanode.shouldRun = false;
}
}
synchronized (this) {
for (Peer p : peers) {
IOUtils.cleanup(LOG, p);
}
}
try {
ss.close();
peerServer.close();
} catch (IOException ie) {
LOG.warn(datanode.getDisplayName()
+ " :DataXceiverServer: close exception", ie);
}
}
void kill() {
assert datanode.shouldRun == false :
"shoudRun should be set to false before killing";
try {
this.ss.close();
this.peerServer.close();
} catch (IOException ie) {
LOG.warn(datanode.getDisplayName() + ":DataXceiverServer.kill(): ", ie);
}
}
synchronized void addPeer(Peer peer) {
peers.add(peer);
}
// close all the sockets that were accepted earlier
synchronized (childSockets) {
for (Iterator<Socket> it = childSockets.iterator();
it.hasNext();) {
Socket thissock = it.next();
try {
thissock.close();
} catch (IOException e) {
}
}
}
synchronized void closePeer(Peer peer) {
peers.remove(peer);
IOUtils.cleanup(null, peer);
}
}

View File

@ -19,6 +19,7 @@ package org.apache.hadoop.hdfs.server.datanode.fsdataset;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.List;
@ -386,4 +387,6 @@ public interface FsDatasetSpi<V extends FsVolumeSpi> extends FSDatasetMBean {
public HdfsBlocksMetadata getHdfsBlocksMetadata(List<ExtendedBlock> blocks)
throws IOException;
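/**
 * Open streams on the block's data file and its metadata file, in that
 * order, so that their file descriptors can be handed to a co-located
 * client for short-circuit reads.
 */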
FileInputStream[] getShortCircuitFdsForRead(ExtendedBlock block)
throws IOException;
}

View File

@ -76,6 +76,7 @@ import org.apache.hadoop.hdfs.server.datanode.fsdataset.VolumeChoosingPolicy;
import org.apache.hadoop.hdfs.server.datanode.metrics.FSDatasetMBean;
import org.apache.hadoop.hdfs.server.protocol.BlockRecoveryCommand.RecoveringBlock;
import org.apache.hadoop.hdfs.server.protocol.ReplicaRecoveryInfo;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.nativeio.NativeIO;
import org.apache.hadoop.metrics2.util.MBeans;
import org.apache.hadoop.util.DataChecksum;
@ -1700,6 +1701,26 @@ class FsDatasetImpl implements FsDatasetSpi<FsVolumeImpl> {
return info;
}
@Override // FsDatasetSpi
public FileInputStream[] getShortCircuitFdsForRead(ExtendedBlock block)
throws IOException {
File datafile = getBlockFile(block);
File metafile = FsDatasetUtil.getMetaFile(datafile,
block.getGenerationStamp());
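// fis[0] is the block data stream and fis[1] the checksum metadata stream;
// if either open fails, close whatever was opened so no descriptors leak.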
FileInputStream fis[] = new FileInputStream[2];
boolean success = false;
try {
fis[0] = new FileInputStream(datafile);
fis[1] = new FileInputStream(metafile);
success = true;
return fis;
} finally {
if (!success) {
IOUtils.cleanup(null, fis);
}
}
}
@Override // FsDatasetSpi
public HdfsBlocksMetadata getHdfsBlocksMetadata(List<ExtendedBlock> blocks)
throws IOException {

View File

@ -246,8 +246,7 @@ public class FSDirectory implements Closeable {
long preferredBlockSize,
String clientName,
String clientMachine,
DatanodeDescriptor clientNode,
long generationStamp)
DatanodeDescriptor clientNode)
throws FileAlreadyExistsException, QuotaExceededException,
UnresolvedLinkException, SnapshotAccessControlException {
waitForReady();

View File

@ -1221,7 +1221,6 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
writeLock();
try {
checkOperation(OperationCategory.WRITE);
if (isInSafeMode()) {
throw new SafeModeException("Cannot set permission for " + src, safeMode);
}
@ -1259,7 +1258,6 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
writeLock();
try {
checkOperation(OperationCategory.WRITE);
if (isInSafeMode()) {
throw new SafeModeException("Cannot set owner for " + src, safeMode);
}
@ -1313,9 +1311,8 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
LocatedBlocks getBlockLocations(String src, long offset, long length,
boolean doAccessTime, boolean needBlockToken, boolean checkSafeMode)
throws FileNotFoundException, UnresolvedLinkException, IOException {
FSPermissionChecker pc = getPermissionChecker();
try {
return getBlockLocationsInt(pc, src, offset, length, doAccessTime,
return getBlockLocationsInt(src, offset, length, doAccessTime,
needBlockToken, checkSafeMode);
} catch (AccessControlException e) {
logAuditEvent(false, "open", src);
@ -1323,14 +1320,10 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
}
}
private LocatedBlocks getBlockLocationsInt(FSPermissionChecker pc,
String src, long offset, long length, boolean doAccessTime,
boolean needBlockToken, boolean checkSafeMode)
private LocatedBlocks getBlockLocationsInt(String src, long offset,
long length, boolean doAccessTime, boolean needBlockToken,
boolean checkSafeMode)
throws FileNotFoundException, UnresolvedLinkException, IOException {
if (isPermissionEnabled) {
checkPathAccess(pc, src, FsAction.READ);
}
if (offset < 0) {
throw new HadoopIllegalArgumentException(
"Negative offset is not supported. File: " + src);
@ -1358,13 +1351,11 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
* Get block locations within the specified range, updating the
* access times if necessary.
*/
private LocatedBlocks getBlockLocationsUpdateTimes(String src,
long offset,
long length,
boolean doAccessTime,
boolean needBlockToken)
throws FileNotFoundException, UnresolvedLinkException, IOException {
private LocatedBlocks getBlockLocationsUpdateTimes(String src, long offset,
long length, boolean doAccessTime, boolean needBlockToken)
throws FileNotFoundException,
UnresolvedLinkException, IOException {
FSPermissionChecker pc = getPermissionChecker();
for (int attempt = 0; attempt < 2; attempt++) {
boolean isReadOp = (attempt == 0);
if (isReadOp) { // first attempt is with readlock
@ -1380,6 +1371,9 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
} else {
checkOperation(OperationCategory.WRITE);
}
if (isPermissionEnabled) {
checkPathAccess(pc, src, FsAction.READ);
}
// if the namenode is in safemode, then do not update access time
if (isInSafeMode()) {
@ -1426,6 +1420,10 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
*/
void concat(String target, String [] srcs)
throws IOException, UnresolvedLinkException {
if(FSNamesystem.LOG.isDebugEnabled()) {
FSNamesystem.LOG.debug("concat " + Arrays.toString(srcs) +
" to " + target);
}
try {
concatInt(target, srcs);
} catch (AccessControlException e) {
@ -1436,11 +1434,6 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
private void concatInt(String target, String [] srcs)
throws IOException, UnresolvedLinkException {
if(FSNamesystem.LOG.isDebugEnabled()) {
FSNamesystem.LOG.debug("concat " + Arrays.toString(srcs) +
" to " + target);
}
// verify args
if(target.isEmpty()) {
throw new IllegalArgumentException("Target file name is empty");
@ -1594,6 +1587,10 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
*/
void setTimes(String src, long mtime, long atime)
throws IOException, UnresolvedLinkException {
if (!isAccessTimeSupported() && atime != -1) {
throw new IOException("Access time for hdfs is not configured. " +
" Please set " + DFS_NAMENODE_ACCESSTIME_PRECISION_KEY + " configuration parameter.");
}
try {
setTimesInt(src, mtime, atime);
} catch (AccessControlException e) {
@ -1604,16 +1601,15 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
private void setTimesInt(String src, long mtime, long atime)
throws IOException, UnresolvedLinkException {
if (!isAccessTimeSupported() && atime != -1) {
throw new IOException("Access time for hdfs is not configured. " +
" Please set " + DFS_NAMENODE_ACCESSTIME_PRECISION_KEY + " configuration parameter.");
}
HdfsFileStatus resultingStat = null;
FSPermissionChecker pc = getPermissionChecker();
checkOperation(OperationCategory.WRITE);
writeLock();
try {
checkOperation(OperationCategory.WRITE);
if (isInSafeMode()) {
throw new SafeModeException("Cannot set times " + src, safeMode);
}
// Write access is required to set access and modification times
if (isPermissionEnabled) {
@ -1639,6 +1635,9 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
void createSymlink(String target, String link,
PermissionStatus dirPerms, boolean createParent)
throws IOException, UnresolvedLinkException {
if (!DFSUtil.isValidName(link)) {
throw new InvalidPathException("Invalid file name: " + link);
}
try {
createSymlinkInt(target, link, dirPerms, createParent);
} catch (AccessControlException e) {
@ -1650,17 +1649,34 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
private void createSymlinkInt(String target, String link,
PermissionStatus dirPerms, boolean createParent)
throws IOException, UnresolvedLinkException {
if (NameNode.stateChangeLog.isDebugEnabled()) {
NameNode.stateChangeLog.debug("DIR* NameSystem.createSymlink: target="
+ target + " link=" + link);
}
HdfsFileStatus resultingStat = null;
FSPermissionChecker pc = getPermissionChecker();
checkOperation(OperationCategory.WRITE);
writeLock();
try {
checkOperation(OperationCategory.WRITE);
if (isInSafeMode()) {
throw new SafeModeException("Cannot create symlink " + link, safeMode);
}
if (!createParent) {
verifyParentDir(link);
}
createSymlinkInternal(pc, target, link, dirPerms, createParent);
if (!dir.isValidToCreate(link)) {
throw new IOException("failed to create link " + link
+" either because the filename is invalid or the file exists");
}
if (isPermissionEnabled) {
checkAncestorAccess(pc, link, FsAction.WRITE);
}
// validate that we have enough inodes.
checkFsObjectLimit();
// add symbolic link to namespace
dir.addSymlink(link, target, dirPerms, createParent);
resultingStat = getAuditFileInfo(link, false);
} finally {
writeUnlock();
@ -1669,37 +1685,6 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
logAuditEvent(true, "createSymlink", link, target, resultingStat);
}
/**
* Create a symbolic link.
*/
private void createSymlinkInternal(FSPermissionChecker pc, String target,
String link, PermissionStatus dirPerms, boolean createParent)
throws IOException, UnresolvedLinkException {
assert hasWriteLock();
if (NameNode.stateChangeLog.isDebugEnabled()) {
NameNode.stateChangeLog.debug("DIR* NameSystem.createSymlink: target=" +
target + " link=" + link);
}
if (isInSafeMode()) {
throw new SafeModeException("Cannot create symlink " + link, safeMode);
}
if (!DFSUtil.isValidName(link)) {
throw new InvalidPathException("Invalid file name: " + link);
}
if (!dir.isValidToCreate(link)) {
throw new IOException("failed to create link " + link
+" either because the filename is invalid or the file exists");
}
if (isPermissionEnabled) {
checkAncestorAccess(pc, link, FsAction.WRITE);
}
// validate that we have enough inodes.
checkFsObjectLimit();
// add symbolic link to namespace
dir.addSymlink(link, target, dirPerms, createParent);
}
/**
* Set replication for an existing file.
*
@ -1819,13 +1804,24 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
throws AccessControlException, SafeModeException,
FileAlreadyExistsException, UnresolvedLinkException,
FileNotFoundException, ParentNotDirectoryException, IOException {
if (NameNode.stateChangeLog.isDebugEnabled()) {
NameNode.stateChangeLog.debug("DIR* NameSystem.startFile: src=" + src
+ ", holder=" + holder
+ ", clientMachine=" + clientMachine
+ ", createParent=" + createParent
+ ", replication=" + replication
+ ", createFlag=" + flag.toString());
}
if (!DFSUtil.isValidName(src)) {
throw new InvalidPathException(src);
}
boolean skipSync = false;
final HdfsFileStatus stat;
FSPermissionChecker pc = getPermissionChecker();
checkOperation(OperationCategory.WRITE);
writeLock();
try {
checkOperation(OperationCategory.WRITE);
startFileInternal(pc, src, permissions, holder, clientMachine, flag,
createParent, replication, blockSize);
stat = dir.getFileInfo(src, false);
@ -1868,21 +1864,10 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
AccessControlException, UnresolvedLinkException, FileNotFoundException,
ParentNotDirectoryException, IOException {
assert hasWriteLock();
if (NameNode.stateChangeLog.isDebugEnabled()) {
NameNode.stateChangeLog.debug("DIR* NameSystem.startFile: src=" + src
+ ", holder=" + holder
+ ", clientMachine=" + clientMachine
+ ", createParent=" + createParent
+ ", replication=" + replication
+ ", createFlag=" + flag.toString());
}
checkOperation(OperationCategory.WRITE);
if (isInSafeMode()) {
throw new SafeModeException("Cannot create file" + src, safeMode);
}
if (!DFSUtil.isValidName(src)) {
throw new InvalidPathException(src);
}
// Verify that the destination does not exist as a directory already.
final INodesInPath iip = dir.getINodesInPath4Write(src);
final INode inode = iip.getLastINode();
@ -1945,9 +1930,8 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
checkFsObjectLimit();
// increment global generation stamp
long genstamp = nextGenerationStamp();
INodeFileUnderConstruction newNode = dir.addFile(src, permissions,
replication, blockSize, holder, clientMachine, clientNode, genstamp);
replication, blockSize, holder, clientMachine, clientNode);
if (newNode == null) {
throw new IOException("DIR* NameSystem.startFile: " +
"Unable to add file to namespace.");
@ -2013,21 +1997,20 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
*/
boolean recoverLease(String src, String holder, String clientMachine)
throws IOException {
if (!DFSUtil.isValidName(src)) {
throw new IOException("Invalid file name: " + src);
}
boolean skipSync = false;
FSPermissionChecker pc = getPermissionChecker();
checkOperation(OperationCategory.WRITE);
writeLock();
try {
checkOperation(OperationCategory.WRITE);
if (isInSafeMode()) {
throw new SafeModeException(
"Cannot recover the lease of " + src, safeMode);
}
if (!DFSUtil.isValidName(src)) {
throw new IOException("Invalid file name: " + src);
}
final INodeFile inode = INodeFile.valueOf(dir.getINode(src), src);
if (!inode.isUnderConstruction()) {
return true;
@ -2151,13 +2134,20 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
"Append is not enabled on this NameNode. Use the " +
DFS_SUPPORT_APPEND_KEY + " configuration option to enable it.");
}
if (NameNode.stateChangeLog.isDebugEnabled()) {
NameNode.stateChangeLog.debug("DIR* NameSystem.appendFile: src=" + src
+ ", holder=" + holder
+ ", clientMachine=" + clientMachine);
}
if (!DFSUtil.isValidName(src)) {
throw new InvalidPathException(src);
}
LocatedBlock lb = null;
FSPermissionChecker pc = getPermissionChecker();
checkOperation(OperationCategory.WRITE);
writeLock();
try {
checkOperation(OperationCategory.WRITE);
lb = startFileInternal(pc, src, null, holder, clientMachine,
EnumSet.of(CreateFlag.APPEND),
false, blockManager.maxReplication, 0);
@ -2448,21 +2438,21 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
boolean abandonBlock(ExtendedBlock b, String src, String holder)
throws LeaseExpiredException, FileNotFoundException,
UnresolvedLinkException, IOException {
if(NameNode.stateChangeLog.isDebugEnabled()) {
NameNode.stateChangeLog.debug("BLOCK* NameSystem.abandonBlock: " + b
+ "of file " + src);
}
checkOperation(OperationCategory.WRITE);
writeLock();
try {
checkOperation(OperationCategory.WRITE);
//
// Remove the block from the pending creates list
//
if(NameNode.stateChangeLog.isDebugEnabled()) {
NameNode.stateChangeLog.debug("BLOCK* NameSystem.abandonBlock: "
+b+"of file "+src);
}
if (isInSafeMode()) {
throw new SafeModeException("Cannot abandon block " + b +
" for fle" + src, safeMode);
}
//
// Remove the block from the pending creates list
//
INodeFileUnderConstruction file = checkLease(src, holder);
dir.removeBlock(src, file, ExtendedBlock.getLocalBlock(b));
if(NameNode.stateChangeLog.isDebugEnabled()) {
@ -2524,19 +2514,23 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
*/
boolean completeFile(String src, String holder, ExtendedBlock last)
throws SafeModeException, UnresolvedLinkException, IOException {
if (NameNode.stateChangeLog.isDebugEnabled()) {
NameNode.stateChangeLog.debug("DIR* NameSystem.completeFile: " +
src + " for " + holder);
}
checkBlock(last);
boolean success = false;
checkOperation(OperationCategory.WRITE);
writeLock();
try {
checkOperation(OperationCategory.WRITE);
success = completeFileInternal(src, holder,
ExtendedBlock.getLocalBlock(last));
success = completeFileInternal(src, holder,
ExtendedBlock.getLocalBlock(last));
} finally {
writeUnlock();
}
getEditLog().logSync();
NameNode.stateChangeLog.info("DIR* completeFile: " + src + " is closed by "
+ holder);
return success;
}
@ -2544,10 +2538,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
String holder, Block last) throws SafeModeException,
UnresolvedLinkException, IOException {
assert hasWriteLock();
if (NameNode.stateChangeLog.isDebugEnabled()) {
NameNode.stateChangeLog.debug("DIR* NameSystem.completeFile: " +
src + " for " + holder);
}
checkOperation(OperationCategory.WRITE);
if (isInSafeMode()) {
throw new SafeModeException("Cannot complete file " + src, safeMode);
}
@ -2588,9 +2579,6 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
finalizeINodeFileUnderConstruction(src, pendingFile,
iip.getLatestSnapshot());
NameNode.stateChangeLog.info("DIR* completeFile: " + src + " is closed by "
+ holder);
return true;
}
@ -2691,18 +2679,19 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
private boolean renameToInt(String src, String dst)
throws IOException, UnresolvedLinkException {
boolean status = false;
HdfsFileStatus resultingStat = null;
if (NameNode.stateChangeLog.isDebugEnabled()) {
NameNode.stateChangeLog.debug("DIR* NameSystem.renameTo: " + src +
" to " + dst);
}
if (!DFSUtil.isValidName(dst)) {
throw new IOException("Invalid name: " + dst);
}
FSPermissionChecker pc = getPermissionChecker();
checkOperation(OperationCategory.WRITE);
boolean status = false;
HdfsFileStatus resultingStat = null;
writeLock();
try {
checkOperation(OperationCategory.WRITE);
status = renameToInternal(pc, src, dst);
if (status) {
resultingStat = getAuditFileInfo(dst, false);
@ -2722,12 +2711,10 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
private boolean renameToInternal(FSPermissionChecker pc, String src, String dst)
throws IOException, UnresolvedLinkException {
assert hasWriteLock();
checkOperation(OperationCategory.WRITE);
if (isInSafeMode()) {
throw new SafeModeException("Cannot rename " + src, safeMode);
}
if (!DFSUtil.isValidName(dst)) {
throw new IOException("Invalid name: " + dst);
}
if (isPermissionEnabled) {
//We should not be doing this. This is move() not renameTo().
//but for now,
@ -2749,16 +2736,18 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
/** Rename src to dst */
void renameTo(String src, String dst, Options.Rename... options)
throws IOException, UnresolvedLinkException {
HdfsFileStatus resultingStat = null;
if (NameNode.stateChangeLog.isDebugEnabled()) {
NameNode.stateChangeLog.debug("DIR* NameSystem.renameTo: with options - "
+ src + " to " + dst);
}
if (!DFSUtil.isValidName(dst)) {
throw new InvalidPathException("Invalid name: " + dst);
}
FSPermissionChecker pc = getPermissionChecker();
checkOperation(OperationCategory.WRITE);
HdfsFileStatus resultingStat = null;
writeLock();
try {
checkOperation(OperationCategory.WRITE);
renameToInternal(pc, src, dst, options);
resultingStat = getAuditFileInfo(dst, false);
} finally {
@ -2777,12 +2766,10 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
private void renameToInternal(FSPermissionChecker pc, String src, String dst,
Options.Rename... options) throws IOException {
assert hasWriteLock();
checkOperation(OperationCategory.WRITE);
if (isInSafeMode()) {
throw new SafeModeException("Cannot rename " + src, safeMode);
}
if (!DFSUtil.isValidName(dst)) {
throw new InvalidPathException("Invalid name: " + dst);
}
if (isPermissionEnabled) {
checkParentAccess(pc, src, FsAction.WRITE);
checkAncestorAccess(pc, dst, FsAction.WRITE);
@ -2969,16 +2956,15 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
HdfsFileStatus getFileInfo(String src, boolean resolveLink)
throws AccessControlException, UnresolvedLinkException,
StandbyException, IOException {
if (!DFSUtil.isValidName(src)) {
throw new InvalidPathException("Invalid file name: " + src);
}
HdfsFileStatus stat = null;
FSPermissionChecker pc = getPermissionChecker();
checkOperation(OperationCategory.READ);
readLock();
try {
checkOperation(OperationCategory.READ);
if (!DFSUtil.isValidName(src)) {
throw new InvalidPathException("Invalid file name: " + src);
}
if (isPermissionEnabled) {
checkTraverse(pc, src);
}
@ -3035,16 +3021,18 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
private boolean mkdirsInt(String src, PermissionStatus permissions,
boolean createParent) throws IOException, UnresolvedLinkException {
HdfsFileStatus resultingStat = null;
boolean status = false;
if(NameNode.stateChangeLog.isDebugEnabled()) {
NameNode.stateChangeLog.debug("DIR* NameSystem.mkdirs: " + src);
}
if (!DFSUtil.isValidName(src)) {
throw new InvalidPathException(src);
}
FSPermissionChecker pc = getPermissionChecker();
checkOperation(OperationCategory.WRITE);
HdfsFileStatus resultingStat = null;
boolean status = false;
writeLock();
try {
checkOperation(OperationCategory.WRITE);
status = mkdirsInternal(pc, src, permissions, createParent);
if (status) {
resultingStat = dir.getFileInfo(src, false);
@ -3066,6 +3054,7 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
PermissionStatus permissions, boolean createParent)
throws IOException, UnresolvedLinkException {
assert hasWriteLock();
checkOperation(OperationCategory.WRITE);
if (isInSafeMode()) {
throw new SafeModeException("Cannot create directory " + src, safeMode);
}
@ -3077,9 +3066,6 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
// a new directory is not created.
return true;
}
if (!DFSUtil.isValidName(src)) {
throw new InvalidPathException(src);
}
if (isPermissionEnabled) {
checkAncestorAccess(pc, src, FsAction.WRITE);
}
@ -3355,8 +3341,15 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
boolean closeFile, boolean deleteblock, DatanodeID[] newtargets,
String[] newtargetstorages)
throws IOException, UnresolvedLinkException {
String src = "";
LOG.info("commitBlockSynchronization(lastblock=" + lastblock
+ ", newgenerationstamp=" + newgenerationstamp
+ ", newlength=" + newlength
+ ", newtargets=" + Arrays.asList(newtargets)
+ ", closeFile=" + closeFile
+ ", deleteBlock=" + deleteblock
+ ")");
checkOperation(OperationCategory.WRITE);
String src = "";
writeLock();
try {
checkOperation(OperationCategory.WRITE);
@ -3368,13 +3361,6 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
"Cannot commitBlockSynchronization while in safe mode",
safeMode);
}
LOG.info("commitBlockSynchronization(lastblock=" + lastblock
+ ", newgenerationstamp=" + newgenerationstamp
+ ", newlength=" + newlength
+ ", newtargets=" + Arrays.asList(newtargets)
+ ", closeFile=" + closeFile
+ ", deleteBlock=" + deleteblock
+ ")");
final BlockInfo storedBlock = blockManager.getStoredBlock(ExtendedBlock
.getLocalBlock(lastblock));
if (storedBlock == null) {
@ -3465,7 +3451,6 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
writeLock();
try {
checkOperation(OperationCategory.WRITE);
if (isInSafeMode()) {
throw new SafeModeException("Cannot renew lease for " + holder, safeMode);
}
@ -4951,11 +4936,10 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
*/
void reportBadBlocks(LocatedBlock[] blocks) throws IOException {
checkOperation(OperationCategory.WRITE);
NameNode.stateChangeLog.info("*DIR* reportBadBlocks");
writeLock();
try {
checkOperation(OperationCategory.WRITE);
NameNode.stateChangeLog.info("*DIR* reportBadBlocks");
for (int i = 0; i < blocks.length; i++) {
ExtendedBlock blk = blocks[i].getBlock();
DatanodeInfo[] nodes = blocks[i].getLocations();
@ -5018,6 +5002,12 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
ExtendedBlock newBlock, DatanodeID[] newNodes)
throws IOException {
checkOperation(OperationCategory.WRITE);
LOG.info("updatePipeline(block=" + oldBlock
+ ", newGenerationStamp=" + newBlock.getGenerationStamp()
+ ", newLength=" + newBlock.getNumBytes()
+ ", newNodes=" + Arrays.asList(newNodes)
+ ", clientName=" + clientName
+ ")");
writeLock();
try {
checkOperation(OperationCategory.WRITE);
@ -5027,12 +5017,6 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
}
assert newBlock.getBlockId()==oldBlock.getBlockId() : newBlock + " and "
+ oldBlock + " has different block identifier";
LOG.info("updatePipeline(block=" + oldBlock
+ ", newGenerationStamp=" + newBlock.getGenerationStamp()
+ ", newLength=" + newBlock.getNumBytes()
+ ", newNodes=" + Arrays.asList(newNodes)
+ ", clientName=" + clientName
+ ")");
updatePipelineInternal(clientName, oldBlock, newBlock, newNodes);
} finally {
writeUnlock();
@ -5284,7 +5268,6 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
writeLock();
try {
checkOperation(OperationCategory.WRITE);
if (isInSafeMode()) {
throw new SafeModeException("Cannot issue delegation token", safeMode);
}
@ -5409,6 +5392,21 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
getEditLog().logSync();
}
/**
* Log the cancellation of expired tokens to edit logs
*
* @param id token identifier to cancel
*/
public void logExpireDelegationToken(DelegationTokenIdentifier id) {
assert !isInSafeMode() :
"this should never be called while in safemode, since we stop " +
"the DT manager before entering safemode!";
// No need to hold FSN lock since we don't access any internal
// structures, and this is stopped before the FSN shuts itself
// down, etc.
getEditLog().logCancelDelegationToken(id);
}
private void logReassignLease(String leaseHolder, String src,
String newHolder) {
assert hasWriteLock();

View File

@ -42,6 +42,7 @@ import org.apache.hadoop.hdfs.BlockReaderFactory;
import org.apache.hadoop.hdfs.DFSClient;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.net.TcpPeerServer;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.DirectoryListing;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
@ -559,9 +560,10 @@ public class NamenodeFsck {
String file = BlockReaderFactory.getFileName(targetAddr, block.getBlockPoolId(),
block.getBlockId());
blockReader = BlockReaderFactory.newBlockReader(
conf, s, file, block, lblock
.getBlockToken(), 0, -1,
namenode.getRpcServer().getDataEncryptionKey());
conf, file, block, lblock.getBlockToken(), 0, -1, true, "fsck",
TcpPeerServer.peerFromSocketAndKey(s, namenode.getRpcServer().
getDataEncryptionKey()),
chosenNode, null, false);
} catch (IOException ex) {
// Put chosen node into dead list, continue

View File

@ -74,6 +74,8 @@ message DeleteBlockPoolResponseProto {
* Gets the file information where a block and its metadata are stored
* block - block for which path information is being requested
* token - block token
*
* This message is deprecated in favor of file descriptor passing.
*/
message GetBlockLocalPathInfoRequestProto {
required ExtendedBlockProto block = 1;
@ -84,6 +86,8 @@ message GetBlockLocalPathInfoRequestProto {
* block - block for which file path information is being returned
* localPath - file path where the block data is stored
* localMetaPath - file path where the block meta data is stored
*
* This message is deprecated in favor of file descriptor passing.
*/
message GetBlockLocalPathInfoResponseProto {
required ExtendedBlockProto block = 1;

View File

@ -115,6 +115,16 @@ message OpBlockChecksumProto {
required BaseHeaderProto header = 1;
}
message OpRequestShortCircuitAccessProto {
required BaseHeaderProto header = 1;
/** In order to get short-circuit access to block data, clients must set this
* to the highest version of the block data that they can understand.
* Currently 1 is the only version, but more versions may exist in the future
* if the on-disk format changes.
*/
required uint32 maxVersion = 2;
}
message PacketHeaderProto {
// All fields must be fixed-length!
@ -133,6 +143,7 @@ enum Status {
ERROR_EXISTS = 4;
ERROR_ACCESS_TOKEN = 5;
CHECKSUM_OK = 6;
ERROR_UNSUPPORTED = 7;
}
message PipelineAckProto {
@ -165,6 +176,16 @@ message BlockOpResponseProto {
/** explanatory text which may be useful to log on the client side */
optional string message = 5;
/** If the server chooses to agree to the request of a client for
* short-circuit access, it will send a response message with the relevant
* file descriptors attached.
*
* In the body of the message, this version number will be set to the
* specific version number of the block data that the client is about to
* read.
*/
optional uint32 shortCircuitAccessVersion = 6;
}
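/* A rough sketch of how a client might tie these two messages together,
 * assuming the generated DataTransferProtos classes and an already-built
 * BaseHeaderProto; the "header" and "in" values below are placeholders,
 * not defined in this file:
 *
 *   OpRequestShortCircuitAccessProto request =
 *       OpRequestShortCircuitAccessProto.newBuilder()
 *           .setHeader(header)  // BaseHeaderProto built elsewhere
 *           .setMaxVersion(1)   // highest block-data version understood
 *           .build();
 *   // ... write the request to the DataNode, then read its reply ...
 *   BlockOpResponseProto response =
 *       BlockOpResponseProto.parseFrom(PBHelper.vintPrefixed(in));
 *   if (response.getStatus() == Status.SUCCESS &&
 *       response.hasShortCircuitAccessVersion()) {
 *     // Interpret the attached file descriptors using this on-disk version.
 *     int version = response.getShortCircuitAccessVersion();
 *   }
 */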
/**

View File

@ -1231,6 +1231,17 @@
</description>
</property>
<property>
<name>dfs.domain.socket.path</name>
<value></value>
<description>
Optional. This is a path to a UNIX domain socket that will be used for
communication between the DataNode and local HDFS clients.
If the string "_PORT" is present in this path, it will be replaced by the
TCP port of the DataNode.
</description>
</property>
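<!-- Illustrative example only (not a default): a host running several
     DataNodes could set the value to /var/run/hadoop-hdfs/dn._PORT.sock
     so that each DataNode gets a distinct socket path. -->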
<property>
<name>dfs.datanode.fsdataset.volume.choosing.balanced-space-threshold</name>
<value>10737418240</value> <!-- 10 GB -->

View File

@ -0,0 +1,68 @@
~~ Licensed under the Apache License, Version 2.0 (the "License");
~~ you may not use this file except in compliance with the License.
~~ You may obtain a copy of the License at
~~
~~ http://www.apache.org/licenses/LICENSE-2.0
~~
~~ Unless required by applicable law or agreed to in writing, software
~~ distributed under the License is distributed on an "AS IS" BASIS,
~~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
~~ See the License for the specific language governing permissions and
~~ limitations under the License. See accompanying LICENSE file.
---
Hadoop Distributed File System-${project.version} - Short-Circuit Local Reads
---
---
${maven.build.timestamp}
HDFS Short-Circuit Local Reads
\[ {{{./index.html}Go Back}} \]
%{toc|section=1|fromDepth=0}
* {Background}
In <<<HDFS>>>, reads normally go through the <<<DataNode>>>. Thus, when the
client asks the <<<DataNode>>> to read a file, the <<<DataNode>>> reads that
file off the disk and sends the data to the client over a TCP socket.
So-called "short-circuit" reads bypass the <<<DataNode>>>, allowing the client
to read the file directly. Obviously, this is only possible in cases where
the client is co-located with the data. Short-circuit reads provide a
substantial performance boost to many applications.
* {Configuration}
To configure short-circuit local reads, you will need to enable
<<<libhadoop.so>>>. See
{{{../hadoop-common/NativeLibraries.html}Native
Libraries}} for details on enabling this library.
Short-circuit reads make use of a UNIX domain socket. This is a special path
in the filesystem that allows the client and the DataNodes to communicate.
You will need to set a path to this socket. The DataNode needs to be able to
create this path. On the other hand, it should not be possible for any user
except the hdfs user or root to create this path. For this reason, paths
under <<</var/run>>> or <<</var/lib>>> are often used.
Short-circuit local reads need to be configured on both the <<<DataNode>>>
and the client.
* {Example Configuration}
Here is an example configuration.
----
<configuration>
<property>
<name>dfs.client.read.shortcircuit</name>
<value>true</value>
</property>
<property>
<name>dfs.domain.socket.path</name>
<value>/var/lib/hadoop-hdfs/dn_socket</value>
</property>
</configuration>
----
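The string <<<_PORT>>> may also appear in <<<dfs.domain.socket.path>>>; the
DataNode replaces it with its TCP port, which is convenient when several
DataNodes share a host.

Once these properties are set, no client code changes are needed. An
ordinary read, as in the sketch below (illustrative only, not part of the
configuration), is served through short-circuit file descriptors whenever
the client is co-located with the block:

----
FileSystem fs = FileSystem.get(conf);  // conf carries the properties above
FSDataInputStream in = fs.open(new Path("/some/file"));
byte[] buf = new byte[4096];
int nRead = in.read(buf);              // short-circuited when possible
in.close();
----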

View File

@ -28,9 +28,9 @@ import java.net.Socket;
import java.util.List;
import java.util.Random;
import org.apache.hadoop.fs.CommonConfigurationKeys;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.net.TcpPeerServer;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
@ -150,12 +150,12 @@ public class BlockReaderTestUtil {
sock.setSoTimeout(HdfsServerConstants.READ_TIMEOUT);
return BlockReaderFactory.newBlockReader(
new DFSClient.Conf(conf),
sock, targetAddr.toString()+ ":" + block.getBlockId(), block,
conf,
targetAddr.toString()+ ":" + block.getBlockId(), block,
testBlock.getBlockToken(),
offset, lenToRead,
conf.getInt(CommonConfigurationKeys.IO_FILE_BUFFER_SIZE_KEY, 4096),
true, "", null, null);
true, "BlockReaderTestUtil", TcpPeerServer.peerFromSocket(sock),
nodes[0], null, false);
}
/**
@ -166,5 +166,4 @@ public class BlockReaderTestUtil {
int ipcport = nodes[0].getIpcPort();
return cluster.getDataNode(ipcport);
}
}

View File

@ -2188,14 +2188,27 @@ public class MiniDFSCluster {
/**
* Get the file corresponding to a block
* @param storageDir storage directory
* @param blk block to be corrupted
* @return file corresponding to the block
* @param blk the block
* @return data file corresponding to the block
*/
public static File getBlockFile(File storageDir, ExtendedBlock blk) {
return new File(getFinalizedDir(storageDir, blk.getBlockPoolId()),
blk.getBlockName());
}
/**
* Get the latest metadata file corresponding to a block
* @param storageDir storage directory
* @param blk the block
* @return metadata file corresponding to the block
*/
public static File getBlockMetadataFile(File storageDir, ExtendedBlock blk) {
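// e.g. "blk_<id>_<genstamp>.meta" in the block pool's finalized directory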
return new File(getFinalizedDir(storageDir, blk.getBlockPoolId()),
blk.getBlockName() + "_" + blk.getGenerationStamp() +
Block.METADATA_EXTENSION);
}
/**
* Shut down a cluster if it is not null
* @param cluster cluster reference or null
@ -2223,7 +2236,7 @@ public class MiniDFSCluster {
}
/**
* Get files related to a block for a given datanode
* Get the block data file for a block from a given datanode
* @param dnIndex Index of the datanode to get block files for
* @param block block for which corresponding files are needed
*/
@ -2238,6 +2251,24 @@ public class MiniDFSCluster {
}
return null;
}
/**
* Get the block metadata file for a block from a given datanode
*
* @param dnIndex Index of the datanode to get block files for
* @param block block for which corresponding files are needed
*/
public static File getBlockMetadataFile(int dnIndex, ExtendedBlock block) {
// Check for the block metadata file in the two storage directories of the datanode
for (int i = 0; i <= 1; i++) {
File storageDir = MiniDFSCluster.getStorageDir(dnIndex, i);
File blockMetaFile = getBlockMetadataFile(storageDir, block);
if (blockMetaFile.exists()) {
return blockMetaFile;
}
}
return null;
}
/**
* Throw an exception if the MiniDFSCluster is not started with a single

View File

@ -17,90 +17,333 @@
*/
package org.apache.hadoop.hdfs;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.util.concurrent.TimeoutException;
import org.apache.hadoop.fs.ChecksumException;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.protocol.DatanodeID;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.security.UserGroupInformation;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.apache.hadoop.io.IOUtils;
import org.junit.Assert;
import org.junit.Test;
public class TestBlockReaderLocal {
static MiniDFSCluster cluster;
static HdfsConfiguration conf;
@BeforeClass
public static void setupCluster() throws IOException {
conf = new HdfsConfiguration();
conf.setBoolean(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_KEY, true);
conf.setBoolean(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_SKIP_CHECKSUM_KEY,
false);
conf.set(DFSConfigKeys.DFS_BLOCK_LOCAL_PATH_ACCESS_USER_KEY,
UserGroupInformation.getCurrentUser().getShortUserName());
cluster = new MiniDFSCluster.Builder(conf).numDataNodes(1).build();
}
@AfterClass
public static void teardownCluster() {
cluster.shutdown();
public static void assertArrayRegionsEqual(byte []buf1, int off1, byte []buf2,
int off2, int len) {
for (int i = 0; i < len; i++) {
if (buf1[off1 + i] != buf2[off2 + i]) {
Assert.fail("arrays differ at byte " + i + ". " +
"The first array has " + (int)buf1[off1 + i] +
", but the second array has " + (int)buf2[off2 + i]);
}
}
}
/**
* Test that, in the case of an error, the position and limit of a ByteBuffer
* are left unchanged. This is not mandated by ByteBufferReadable, but clients
* of this class might immediately issue a retry on failure, so it's polite.
* Similar to IOUtils#readFully(). Reads bytes in a loop.
*
* @param reader The BlockReaderLocal to read bytes from
* @param buf The ByteBuffer to read into
* @param off The offset in the buffer to read into
* @param len The number of bytes to read.
*
* @throws IOException If it could not read the requested number of bytes
*/
private static void readFully(BlockReaderLocal reader,
ByteBuffer buf, int off, int len) throws IOException {
int amt = len;
while (amt > 0) {
// The end of the requested region stays fixed: off + amt == original off + len.
buf.limit(off + amt);
buf.position(off);
long ret = reader.read(buf);
if (ret < 0) {
throw new EOFException("Premature EOF from BlockReaderLocal " +
"after reading " + (len - amt) + " byte(s).");
}
amt -= ret;
off += ret;
}
}
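/**
 * One test scenario: setup() may corrupt the on-disk block file before the
 * reader is created, and doTest() exercises the reader against the
 * original, uncorrupted contents.
 */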
private static interface BlockReaderLocalTest {
final int TEST_LENGTH = 12345;
public void setup(File blockFile, boolean usingChecksums)
throws IOException;
public void doTest(BlockReaderLocal reader, byte original[])
throws IOException;
}
public void runBlockReaderLocalTest(BlockReaderLocalTest test,
boolean checksum) throws IOException {
MiniDFSCluster cluster = null;
HdfsConfiguration conf = new HdfsConfiguration();
conf.setBoolean(DFSConfigKeys.
DFS_CLIENT_READ_SHORTCIRCUIT_SKIP_CHECKSUM_KEY, !checksum);
conf.set(DFSConfigKeys.DFS_CHECKSUM_TYPE_KEY, "CRC32C");
FileInputStream dataIn = null, checkIn = null;
final Path TEST_PATH = new Path("/a");
final long RANDOM_SEED = 4567L;
BlockReaderLocal blockReaderLocal = null;
FSDataInputStream fsIn = null;
byte original[] = new byte[BlockReaderLocalTest.TEST_LENGTH];
try {
cluster = new MiniDFSCluster.Builder(conf).numDataNodes(1).build();
cluster.waitActive();
FileSystem fs = cluster.getFileSystem();
DFSTestUtil.createFile(fs, TEST_PATH,
BlockReaderLocalTest.TEST_LENGTH, (short)1, RANDOM_SEED);
try {
DFSTestUtil.waitReplication(fs, TEST_PATH, (short)1);
} catch (InterruptedException e) {
Assert.fail("unexpected InterruptedException during " +
"waitReplication: " + e);
} catch (TimeoutException e) {
Assert.fail("unexpected TimeoutException during " +
"waitReplication: " + e);
}
fsIn = fs.open(TEST_PATH);
IOUtils.readFully(fsIn, original, 0,
BlockReaderLocalTest.TEST_LENGTH);
fsIn.close();
fsIn = null;
ExtendedBlock block = DFSTestUtil.getFirstBlock(fs, TEST_PATH);
File dataFile = MiniDFSCluster.getBlockFile(0, block);
File metaFile = MiniDFSCluster.getBlockMetadataFile(0, block);
DatanodeID datanodeID = cluster.getDataNodes().get(0).getDatanodeId();
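// The cluster was only needed to write the block; shut it down so the
// block and meta files can be read directly from disk.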
cluster.shutdown();
cluster = null;
test.setup(dataFile, checksum);
dataIn = new FileInputStream(dataFile);
checkIn = new FileInputStream(metaFile);
blockReaderLocal = new BlockReaderLocal(conf,
TEST_PATH.getName(), block, 0, -1,
dataIn, checkIn, datanodeID, checksum);
dataIn = null;
checkIn = null;
test.doTest(blockReaderLocal, original);
} finally {
if (fsIn != null) fsIn.close();
if (cluster != null) cluster.shutdown();
if (dataIn != null) dataIn.close();
if (checkIn != null) checkIn.close();
if (blockReaderLocal != null) blockReaderLocal.close(null, null);
}
}
private static class TestBlockReaderLocalImmediateClose
implements BlockReaderLocalTest {
@Override
public void setup(File blockFile, boolean usingChecksums)
throws IOException { }
@Override
public void doTest(BlockReaderLocal reader, byte original[])
throws IOException { }
}
@Test
public void testStablePositionAfterCorruptRead() throws Exception {
final short REPL_FACTOR = 1;
final long FILE_LENGTH = 512L;
cluster.waitActive();
FileSystem fs = cluster.getFileSystem();
public void testBlockReaderLocalImmediateClose() throws IOException {
runBlockReaderLocalTest(new TestBlockReaderLocalImmediateClose(), true);
runBlockReaderLocalTest(new TestBlockReaderLocalImmediateClose(), false);
}
private static class TestBlockReaderSimpleReads
implements BlockReaderLocalTest {
@Override
public void setup(File blockFile, boolean usingChecksums)
throws IOException { }
@Override
public void doTest(BlockReaderLocal reader, byte original[])
throws IOException {
byte buf[] = new byte[TEST_LENGTH];
reader.readFully(buf, 0, 512);
assertArrayRegionsEqual(original, 0, buf, 0, 512);
reader.readFully(buf, 512, 512);
assertArrayRegionsEqual(original, 512, buf, 512, 512);
reader.readFully(buf, 1024, 513);
assertArrayRegionsEqual(original, 1024, buf, 1024, 513);
reader.readFully(buf, 1537, 514);
assertArrayRegionsEqual(original, 1537, buf, 1537, 514);
}
}
@Test
public void testBlockReaderSimpleReads() throws IOException {
runBlockReaderLocalTest(new TestBlockReaderSimpleReads(), true);
}
Path path = new Path("/corrupted");
@Test
public void testBlockReaderSimpleReadsNoChecksum() throws IOException {
runBlockReaderLocalTest(new TestBlockReaderSimpleReads(), false);
}
private static class TestBlockReaderLocalArrayReads2
implements BlockReaderLocalTest {
@Override
public void setup(File blockFile, boolean usingChecksums)
throws IOException { }
@Override
public void doTest(BlockReaderLocal reader, byte original[])
throws IOException {
byte buf[] = new byte[TEST_LENGTH];
reader.readFully(buf, 0, 10);
assertArrayRegionsEqual(original, 0, buf, 0, 10);
reader.readFully(buf, 10, 100);
assertArrayRegionsEqual(original, 10, buf, 10, 100);
reader.readFully(buf, 110, 700);
assertArrayRegionsEqual(original, 110, buf, 110, 700);
reader.readFully(buf, 810, 1); // from offset 810 to offset 811
reader.readFully(buf, 811, 5);
assertArrayRegionsEqual(original, 811, buf, 811, 5);
reader.readFully(buf, 816, 900); // skip from offset 816 to offset 1716
reader.readFully(buf, 1716, 5);
assertArrayRegionsEqual(original, 1716, buf, 1716, 5);
}
}
@Test
public void testBlockReaderLocalArrayReads2() throws IOException {
runBlockReaderLocalTest(new TestBlockReaderLocalArrayReads2(),
true);
}
DFSTestUtil.createFile(fs, path, FILE_LENGTH, REPL_FACTOR, 12345L);
DFSTestUtil.waitReplication(fs, path, REPL_FACTOR);
@Test
public void testBlockReaderLocalArrayReads2NoChecksum()
throws IOException {
runBlockReaderLocalTest(new TestBlockReaderLocalArrayReads2(),
false);
}
ExtendedBlock block = DFSTestUtil.getFirstBlock(fs, path);
int blockFilesCorrupted = cluster.corruptBlockOnDataNodes(block);
assertEquals("All replicas not corrupted", REPL_FACTOR, blockFilesCorrupted);
private static class TestBlockReaderLocalByteBufferReads
implements BlockReaderLocalTest {
@Override
public void setup(File blockFile, boolean usingChecksums)
throws IOException { }
@Override
public void doTest(BlockReaderLocal reader, byte original[])
throws IOException {
ByteBuffer buf = ByteBuffer.wrap(new byte[TEST_LENGTH]);
readFully(reader, buf, 0, 10);
assertArrayRegionsEqual(original, 0, buf.array(), 0, 10);
readFully(reader, buf, 10, 100);
assertArrayRegionsEqual(original, 10, buf.array(), 10, 100);
readFully(reader, buf, 110, 700);
assertArrayRegionsEqual(original, 110, buf.array(), 110, 700);
reader.skip(1); // skip from offset 810 to offset 811
readFully(reader, buf, 811, 5);
assertArrayRegionsEqual(original, 811, buf.array(), 811, 5);
}
}
@Test
public void testBlockReaderLocalByteBufferReads()
throws IOException {
runBlockReaderLocalTest(
new TestBlockReaderLocalByteBufferReads(), true);
}
FSDataInputStream dis = cluster.getFileSystem().open(path);
ByteBuffer buf = ByteBuffer.allocateDirect((int)FILE_LENGTH);
boolean sawException = false;
try {
dis.read(buf);
} catch (ChecksumException ex) {
sawException = true;
@Test
public void testBlockReaderLocalByteBufferReadsNoChecksum()
throws IOException {
runBlockReaderLocalTest(
new TestBlockReaderLocalByteBufferReads(), false);
}
private static class TestBlockReaderLocalReadCorruptStart
implements BlockReaderLocalTest {
boolean usingChecksums = false;
@Override
public void setup(File blockFile, boolean usingChecksums)
throws IOException {
RandomAccessFile bf = null;
this.usingChecksums = usingChecksums;
try {
bf = new RandomAccessFile(blockFile, "rw");
bf.write(new byte[] {0,0,0,0,0,0,0,0,0,0,0,0,0,0});
} finally {
if (bf != null) bf.close();
}
}
assertTrue(sawException);
assertEquals(0, buf.position());
assertEquals(buf.capacity(), buf.limit());
dis = cluster.getFileSystem().open(path);
buf.position(3);
buf.limit(25);
sawException = false;
try {
dis.read(buf);
} catch (ChecksumException ex) {
sawException = true;
public void doTest(BlockReaderLocal reader, byte original[])
throws IOException {
byte buf[] = new byte[TEST_LENGTH];
if (usingChecksums) {
try {
reader.readFully(buf, 0, 10);
Assert.fail("did not detect corruption");
} catch (IOException e) {
// expected
}
} else {
reader.readFully(buf, 0, 10);
}
}
}
@Test
public void testBlockReaderLocalReadCorruptStart()
throws IOException {
runBlockReaderLocalTest(new TestBlockReaderLocalReadCorruptStart(), true);
}
private static class TestBlockReaderLocalReadCorrupt
implements BlockReaderLocalTest {
boolean usingChecksums = false;
@Override
public void setup(File blockFile, boolean usingChecksums)
throws IOException {
RandomAccessFile bf = null;
this.usingChecksums = usingChecksums;
try {
bf = new RandomAccessFile(blockFile, "rw");
bf.seek(1539);
bf.write(new byte[] {0,0,0,0,0,0,0,0,0,0,0,0,0,0});
} finally {
if (bf != null) bf.close();
}
}
assertTrue(sawException);
assertEquals(3, buf.position());
assertEquals(25, buf.limit());
public void doTest(BlockReaderLocal reader, byte original[])
throws IOException {
byte buf[] = new byte[TEST_LENGTH];
try {
reader.readFully(buf, 0, 10);
assertArrayRegionsEqual(original, 0, buf, 0, 10);
reader.readFully(buf, 10, 100);
assertArrayRegionsEqual(original, 10, buf, 10, 100);
reader.readFully(buf, 110, 700);
assertArrayRegionsEqual(original, 110, buf, 110, 700);
reader.skip(1); // skip from offset 810 to offset 811
reader.readFully(buf, 811, 5);
assertArrayRegionsEqual(original, 811, buf, 811, 5);
reader.readFully(buf, 816, 900);
if (usingChecksums) {
// We should detect the corruption when using a checksum file.
Assert.fail("did not detect corruption");
}
} catch (ChecksumException e) {
if (!usingChecksums) {
Assert.fail("didn't expect to get ChecksumException: not " +
"using checksums.");
}
}
}
}
@Test
public void testBlockReaderLocalReadCorrupt()
throws IOException {
runBlockReaderLocalTest(new TestBlockReaderLocalReadCorrupt(), true);
runBlockReaderLocalTest(new TestBlockReaderLocalReadCorrupt(), false);
}
}

View File

@ -0,0 +1,154 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.io.File;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.Arrays;
import org.apache.hadoop.fs.ChecksumException;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.net.unix.DomainSocket;
import org.apache.hadoop.net.unix.TemporarySocketDirectory;
import org.apache.hadoop.security.UserGroupInformation;
import org.junit.Assert;
import org.junit.Assume;
import org.junit.BeforeClass;
import org.junit.Test;
public class TestBlockReaderLocalLegacy {
@BeforeClass
public static void setupCluster() throws IOException {
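// Make any fallback to the TCP read path fail loudly; these tests are
// meant to exercise the local (short-circuit) reader.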
DFSInputStream.tcpReadsDisabledForTesting = true;
DomainSocket.disableBindPathValidation();
}
private static HdfsConfiguration getConfiguration(
TemporarySocketDirectory socketDir) throws IOException {
HdfsConfiguration conf = new HdfsConfiguration();
if (socketDir == null) {
conf.set(DFSConfigKeys.DFS_DOMAIN_SOCKET_PATH_KEY, "");
} else {
conf.set(DFSConfigKeys.DFS_DOMAIN_SOCKET_PATH_KEY,
new File(socketDir.getDir(), "TestBlockReaderLocalLegacy.%d.sock").
getAbsolutePath());
}
conf.setBoolean(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_KEY, true);
conf.setBoolean(DFSConfigKeys.DFS_CLIENT_USE_LEGACY_BLOCKREADERLOCAL, true);
conf.setBoolean(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_SKIP_CHECKSUM_KEY,
false);
conf.set(DFSConfigKeys.DFS_BLOCK_LOCAL_PATH_ACCESS_USER_KEY,
UserGroupInformation.getCurrentUser().getShortUserName());
conf.setBoolean(DFSConfigKeys.DFS_CLIENT_DOMAIN_SOCKET_DATA_TRAFFIC, false);
return conf;
}
/**
* Test that, in the case of an error, the position and limit of a ByteBuffer
* are left unchanged. This is not mandated by ByteBufferReadable, but clients
* of this class might immediately issue a retry on failure, so it's polite.
*/
@Test
public void testStablePositionAfterCorruptRead() throws Exception {
final short REPL_FACTOR = 1;
final long FILE_LENGTH = 512L;
HdfsConfiguration conf = getConfiguration(null);
MiniDFSCluster cluster =
new MiniDFSCluster.Builder(conf).numDataNodes(1).build();
cluster.waitActive();
FileSystem fs = cluster.getFileSystem();
Path path = new Path("/corrupted");
DFSTestUtil.createFile(fs, path, FILE_LENGTH, REPL_FACTOR, 12345L);
DFSTestUtil.waitReplication(fs, path, REPL_FACTOR);
ExtendedBlock block = DFSTestUtil.getFirstBlock(fs, path);
int blockFilesCorrupted = cluster.corruptBlockOnDataNodes(block);
assertEquals("All replicas not corrupted", REPL_FACTOR, blockFilesCorrupted);
FSDataInputStream dis = cluster.getFileSystem().open(path);
ByteBuffer buf = ByteBuffer.allocateDirect((int)FILE_LENGTH);
boolean sawException = false;
try {
dis.read(buf);
} catch (ChecksumException ex) {
sawException = true;
}
assertTrue(sawException);
assertEquals(0, buf.position());
assertEquals(buf.capacity(), buf.limit());
dis = cluster.getFileSystem().open(path);
buf.position(3);
buf.limit(25);
sawException = false;
try {
dis.read(buf);
} catch (ChecksumException ex) {
sawException = true;
}
assertTrue(sawException);
assertEquals(3, buf.position());
assertEquals(25, buf.limit());
cluster.shutdown();
}
@Test
public void testBothOldAndNewShortCircuitConfigured() throws Exception {
final short REPL_FACTOR = 1;
final int FILE_LENGTH = 512;
Assume.assumeTrue(null == DomainSocket.getLoadingFailureReason());
TemporarySocketDirectory socketDir = new TemporarySocketDirectory();
HdfsConfiguration conf = getConfiguration(socketDir);
MiniDFSCluster cluster =
new MiniDFSCluster.Builder(conf).numDataNodes(1).build();
cluster.waitActive();
socketDir.close();
FileSystem fs = cluster.getFileSystem();
Path path = new Path("/foo");
byte orig[] = new byte[FILE_LENGTH];
for (int i = 0; i < orig.length; i++) {
orig[i] = (byte)(i%10);
}
FSDataOutputStream fos = fs.create(path, (short)1);
fos.write(orig);
fos.close();
DFSTestUtil.waitReplication(fs, path, REPL_FACTOR);
FSDataInputStream fis = cluster.getFileSystem().open(path);
byte buf[] = new byte[FILE_LENGTH];
IOUtils.readFully(fis, buf, 0, FILE_LENGTH);
fis.close();
Assert.assertArrayEquals(orig, buf);
cluster.shutdown();
}
}

View File

@ -61,7 +61,7 @@ public class TestClientBlockVerification {
util.getBlockReader(testBlock, 0, FILE_SIZE_K * 1024));
util.readAndCheckEOS(reader, FILE_SIZE_K * 1024, true);
verify(reader).sendReadResult(Status.CHECKSUM_OK);
reader.close();
reader.close(null, null);
}
/**
@ -76,7 +76,7 @@ public class TestClientBlockVerification {
// We asked the blockreader for the whole file, and only read
// half of it, so no CHECKSUM_OK
verify(reader, never()).sendReadResult(Status.CHECKSUM_OK);
reader.close();
reader.close(null, null);
}
/**
@ -92,7 +92,7 @@ public class TestClientBlockVerification {
// And read half the file
util.readAndCheckEOS(reader, FILE_SIZE_K * 1024 / 2, true);
verify(reader).sendReadResult(Status.CHECKSUM_OK);
reader.close();
reader.close(null, null);
}
/**
@ -111,7 +111,7 @@ public class TestClientBlockVerification {
util.getBlockReader(testBlock, startOffset, length));
util.readAndCheckEOS(reader, length, true);
verify(reader).sendReadResult(Status.CHECKSUM_OK);
reader.close();
reader.close(null, null);
}
}
}

View File

@ -18,28 +18,20 @@
package org.apache.hadoop.hdfs;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertSame;
import static org.junit.Assert.assertTrue;
import static org.mockito.Mockito.spy;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.Socket;
import java.security.PrivilegedExceptionAction;
import junit.framework.Assert;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier;
import org.apache.hadoop.hdfs.server.datanode.DataNode;
import org.apache.hadoop.hdfs.net.Peer;
import org.apache.hadoop.security.token.Token;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
import org.mockito.Matchers;
import org.mockito.Mockito;
@ -55,59 +47,31 @@ public class TestConnCache {
static final int BLOCK_SIZE = 4096;
static final int FILE_SIZE = 3 * BLOCK_SIZE;
final static int CACHE_SIZE = 4;
final static long CACHE_EXPIRY_MS = 200;
static Configuration conf = null;
static MiniDFSCluster cluster = null;
static FileSystem fs = null;
static SocketCache cache;
static final Path testFile = new Path("/testConnCache.dat");
static byte authenticData[] = null;
static BlockReaderTestUtil util = null;
/**
* A mock Answer to remember the BlockReader used.
*
* It verifies that all invocations of DFSInputStream.getBlockReader()
* use the same socket.
* use the same peer.
*/
private class MockGetBlockReader implements Answer<RemoteBlockReader2> {
public RemoteBlockReader2 reader = null;
private Socket sock = null;
private Peer peer = null;
@Override
public RemoteBlockReader2 answer(InvocationOnMock invocation) throws Throwable {
RemoteBlockReader2 prevReader = reader;
reader = (RemoteBlockReader2) invocation.callRealMethod();
if (sock == null) {
sock = reader.dnSock;
if (peer == null) {
peer = reader.getPeer();
} else if (prevReader != null) {
assertSame("DFSInputStream should use the same socket",
sock, reader.dnSock);
Assert.assertSame("DFSInputStream should use the same peer",
peer, reader.getPeer());
}
return reader;
}
}
@BeforeClass
public static void setupCluster() throws Exception {
final int REPLICATION_FACTOR = 1;
/* create a socket cache. There is only one socket cache per jvm */
cache = SocketCache.getInstance(CACHE_SIZE, CACHE_EXPIRY_MS);
util = new BlockReaderTestUtil(REPLICATION_FACTOR);
cluster = util.getCluster();
conf = util.getConf();
fs = cluster.getFileSystem();
authenticData = util.writeFile(testFile, FILE_SIZE / 1024);
}
/**
* (Optionally) seek to position, read and verify data.
*
@ -117,9 +81,10 @@ public class TestConnCache {
long pos,
byte[] buffer,
int offset,
int length)
int length,
byte[] authenticData)
throws IOException {
assertTrue("Test buffer too small", buffer.length >= offset + length);
Assert.assertTrue("Test buffer too small", buffer.length >= offset + length);
if (pos >= 0)
in.seek(pos);
@ -129,7 +94,7 @@ public class TestConnCache {
while (length > 0) {
int cnt = in.read(buffer, offset, length);
assertTrue("Error in read", cnt > 0);
Assert.assertTrue("Error in read", cnt > 0);
offset += cnt;
length -= cnt;
}
@ -144,116 +109,23 @@ public class TestConnCache {
}
}
/**
* Test the SocketCache itself.
*/
@Test
public void testSocketCache() throws Exception {
// Make a client
InetSocketAddress nnAddr =
new InetSocketAddress("localhost", cluster.getNameNodePort());
DFSClient client = new DFSClient(nnAddr, conf);
// Find out the DN addr
LocatedBlock block =
client.getNamenode().getBlockLocations(
testFile.toString(), 0, FILE_SIZE)
.getLocatedBlocks().get(0);
DataNode dn = util.getDataNode(block);
InetSocketAddress dnAddr = dn.getXferAddress();
// Make some sockets to the DN
Socket[] dnSockets = new Socket[CACHE_SIZE];
for (int i = 0; i < dnSockets.length; ++i) {
dnSockets[i] = client.socketFactory.createSocket(
dnAddr.getAddress(), dnAddr.getPort());
}
// Insert a socket to the NN
Socket nnSock = new Socket(nnAddr.getAddress(), nnAddr.getPort());
cache.put(nnSock, null);
assertSame("Read the write", nnSock, cache.get(nnAddr).sock);
cache.put(nnSock, null);
// Insert DN socks
for (Socket dnSock : dnSockets) {
cache.put(dnSock, null);
}
assertEquals("NN socket evicted", null, cache.get(nnAddr));
assertTrue("Evicted socket closed", nnSock.isClosed());
// Lookup the DN socks
for (Socket dnSock : dnSockets) {
assertEquals("Retrieve cached sockets", dnSock, cache.get(dnAddr).sock);
dnSock.close();
}
assertEquals("Cache is empty", 0, cache.size());
}
/**
* Test the SocketCache expiry.
* Verify that socket cache entries expire after the set
* expiry time.
*/
@Test
public void testSocketCacheExpiry() throws Exception {
// Make a client
InetSocketAddress nnAddr =
new InetSocketAddress("localhost", cluster.getNameNodePort());
DFSClient client = new DFSClient(nnAddr, conf);
// Find out the DN addr
LocatedBlock block =
client.getNamenode().getBlockLocations(
testFile.toString(), 0, FILE_SIZE)
.getLocatedBlocks().get(0);
DataNode dn = util.getDataNode(block);
InetSocketAddress dnAddr = dn.getXferAddress();
// Make some sockets to the DN and put in cache
Socket[] dnSockets = new Socket[CACHE_SIZE];
for (int i = 0; i < dnSockets.length; ++i) {
dnSockets[i] = client.socketFactory.createSocket(
dnAddr.getAddress(), dnAddr.getPort());
cache.put(dnSockets[i], null);
}
// Client side still has the sockets cached
assertEquals(CACHE_SIZE, client.socketCache.size());
//sleep for a second and see if it expired
Thread.sleep(CACHE_EXPIRY_MS + 1000);
// Client side has no sockets cached
assertEquals(0, client.socketCache.size());
//sleep for another second and see if
//the daemon thread runs fine on empty cache
Thread.sleep(CACHE_EXPIRY_MS + 1000);
}
/**
* Read a file served entirely from one DN. Seek around and read from
* different offsets. And verify that they all use the same socket.
*
* @throws java.io.IOException
* @throws Exception
*/
@Test
@SuppressWarnings("unchecked")
public void testReadFromOneDN() throws IOException {
LOG.info("Starting testReadFromOneDN()");
public void testReadFromOneDN() throws Exception {
BlockReaderTestUtil util = new BlockReaderTestUtil(1,
new HdfsConfiguration());
final Path testFile = new Path("/testConnCache.dat");
byte authenticData[] = util.writeFile(testFile, FILE_SIZE / 1024);
DFSClient client = new DFSClient(
new InetSocketAddress("localhost", cluster.getNameNodePort()), conf);
DFSInputStream in = spy(client.open(testFile.toString()));
new InetSocketAddress("localhost",
util.getCluster().getNameNodePort()), util.getConf());
DFSInputStream in = Mockito.spy(client.open(testFile.toString()));
LOG.info("opened " + testFile.toString());
byte[] dataBuf = new byte[BLOCK_SIZE];
MockGetBlockReader answer = new MockGetBlockReader();
@ -270,18 +142,15 @@ public class TestConnCache {
Matchers.anyString());
// Initial read
pread(in, 0, dataBuf, 0, dataBuf.length);
pread(in, 0, dataBuf, 0, dataBuf.length, authenticData);
// Read again and verify that the socket is the same
pread(in, FILE_SIZE - dataBuf.length, dataBuf, 0, dataBuf.length);
pread(in, 1024, dataBuf, 0, dataBuf.length);
pread(in, -1, dataBuf, 0, dataBuf.length); // No seek; just read
pread(in, 64, dataBuf, 0, dataBuf.length / 2);
pread(in, FILE_SIZE - dataBuf.length, dataBuf, 0, dataBuf.length,
authenticData);
pread(in, 1024, dataBuf, 0, dataBuf.length, authenticData);
// No seek; just read
pread(in, -1, dataBuf, 0, dataBuf.length, authenticData);
pread(in, 64, dataBuf, 0, dataBuf.length / 2, authenticData);
in.close();
}
@AfterClass
public static void teardownCluster() throws Exception {
util.shutdown();
}
}

View File

@ -53,7 +53,7 @@ import com.google.common.base.Joiner;
*/
public class TestDFSUpgrade {
private static final int EXPECTED_TXID = 49;
private static final int EXPECTED_TXID = 45;
private static final Log LOG = LogFactory.getLog(TestDFSUpgrade.class.getName());
private Configuration conf;
private int testCounter = 0;

View File

@ -35,6 +35,7 @@ import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.MiniDFSCluster.DataNodeProperties;
import org.apache.hadoop.hdfs.net.Peer;
import org.apache.hadoop.hdfs.server.datanode.DataNode;
import org.apache.hadoop.hdfs.server.datanode.DataNodeTestUtils;
import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
@ -70,7 +71,7 @@ public class TestDataTransferKeepalive {
.numDataNodes(1).build();
fs = cluster.getFileSystem();
dfsClient = ((DistributedFileSystem)fs).dfs;
dfsClient.socketCache.clear();
dfsClient.peerCache.clear();
String poolId = cluster.getNamesystem().getBlockPoolId();
dn = cluster.getDataNodes().get(0);
@ -93,13 +94,13 @@ public class TestDataTransferKeepalive {
DFSTestUtil.createFile(fs, TEST_FILE, 1L, (short)1, 0L);
// Clients that write aren't currently re-used.
assertEquals(0, dfsClient.socketCache.size());
assertEquals(0, dfsClient.peerCache.size());
assertXceiverCount(0);
// Reads the file, so we should get a
// cached socket, and should have an xceiver on the other side.
DFSTestUtil.readFile(fs, TEST_FILE);
assertEquals(1, dfsClient.socketCache.size());
assertEquals(1, dfsClient.peerCache.size());
assertXceiverCount(1);
// Sleep for a bit longer than the keepalive timeout
@ -110,13 +111,13 @@ public class TestDataTransferKeepalive {
// The socket is still in the cache, because we don't
// notice that it's closed until we try to read
// from it again.
assertEquals(1, dfsClient.socketCache.size());
assertEquals(1, dfsClient.peerCache.size());
// Take it out of the cache - reading should
// give an EOF.
Socket s = dfsClient.socketCache.get(dnAddr).sock;
assertNotNull(s);
assertEquals(-1, NetUtils.getInputStream(s).read());
Peer peer = dfsClient.peerCache.get(dn.getDatanodeId(), false);
assertNotNull(peer);
assertEquals(-1, peer.getInputStream().read());
}
/**
@ -175,14 +176,14 @@ public class TestDataTransferKeepalive {
}
DFSClient client = ((DistributedFileSystem)fs).dfs;
assertEquals(5, client.socketCache.size());
assertEquals(5, client.peerCache.size());
// Let all the xceivers timeout
Thread.sleep(1500);
assertXceiverCount(0);
// Client side still has the sockets cached
assertEquals(5, client.socketCache.size());
assertEquals(5, client.peerCache.size());
// Reading should not throw an exception.
DFSTestUtil.readFile(fs, TEST_FILE);

View File

@ -0,0 +1,62 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs;
import static org.junit.Assert.assertEquals;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.junit.Test;
/**
* This class tests disabling client connection caching in a single node
* mini-cluster.
*/
public class TestDisableConnCache {
static final Log LOG = LogFactory.getLog(TestDisableConnCache.class);
static final int BLOCK_SIZE = 4096;
static final int FILE_SIZE = 3 * BLOCK_SIZE;
/**
* Test that the socket cache can be disabled by setting the capacity to
* 0. Regression test for HDFS-3365.
* @throws Exception
*/
@Test
public void testDisableCache() throws Exception {
HdfsConfiguration confWithoutCache = new HdfsConfiguration();
// Configure a new instance with no peer caching and ensure that it
// doesn't cache anything
confWithoutCache.setInt(
DFSConfigKeys.DFS_CLIENT_SOCKET_CACHE_CAPACITY_KEY, 0);
BlockReaderTestUtil util = new BlockReaderTestUtil(1, confWithoutCache);
final Path testFile = new Path("/testConnCache.dat");
util.writeFile(testFile, FILE_SIZE / 1024);
FileSystem fsWithoutCache = FileSystem.newInstance(util.getConf());
try {
DFSTestUtil.readFile(fsWithoutCache, testFile);
assertEquals(0, ((DistributedFileSystem)fsWithoutCache).dfs.peerCache.size());
} finally {
fsWithoutCache.close();
util.shutdown();
}
}
}
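Stripped to its essentials, the knob this regression test covers is a single client setting; a minimal sketch using the same constants as above:
// Sketch: capacity 0 disables the peer (socket) cache entirely (HDFS-3365).
static FileSystem newUncachedFileSystem() throws IOException {
HdfsConfiguration conf = new HdfsConfiguration();
conf.setInt(DFSConfigKeys.DFS_CLIENT_SOCKET_CACHE_CAPACITY_KEY, 0);
return FileSystem.newInstance(conf);
}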

View File

@ -0,0 +1,127 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import junit.framework.Assert;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hdfs.protocol.DatanodeID;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.net.unix.TemporarySocketDirectory;
import org.junit.Test;
public class TestFileInputStreamCache {
static final Log LOG = LogFactory.getLog(TestFileInputStreamCache.class);
@Test
public void testCreateAndDestroy() throws Exception {
FileInputStreamCache cache = new FileInputStreamCache(10, 1000);
cache.close();
}
private static class TestFileDescriptorPair {
TemporarySocketDirectory dir = new TemporarySocketDirectory();
FileInputStream fis[];
public TestFileDescriptorPair() throws IOException {
fis = new FileInputStream[2];
for (int i = 0; i < 2; i++) {
String name = dir.getDir() + "/file" + i;
FileOutputStream fos = new FileOutputStream(name);
fos.write(1);
fos.close();
fis[i] = new FileInputStream(name);
}
}
public FileInputStream[] getFileInputStreams() {
return fis;
}
public void close() throws IOException {
IOUtils.cleanup(LOG, fis);
dir.close();
}
public boolean compareWith(FileInputStream other[]) {
if ((other == null) || (fis == null)) {
return other == fis;
}
if (fis.length != other.length) return false;
for (int i = 0; i < fis.length; i++) {
if (fis[i] != other[i]) return false;
}
return true;
}
}
@Test
public void testAddAndRetrieve() throws Exception {
FileInputStreamCache cache = new FileInputStreamCache(1, 1000000);
DatanodeID dnId = new DatanodeID("127.0.0.1", "localhost",
"xyzzy", 8080, 9090, 7070);
ExtendedBlock block = new ExtendedBlock("poolid", 123);
TestFileDescriptorPair pair = new TestFileDescriptorPair();
cache.put(dnId, block, pair.getFileInputStreams());
FileInputStream fis[] = cache.get(dnId, block);
Assert.assertTrue(pair.compareWith(fis));
pair.close();
cache.close();
}
@Test
public void testExpiry() throws Exception {
FileInputStreamCache cache = new FileInputStreamCache(1, 10);
DatanodeID dnId = new DatanodeID("127.0.0.1", "localhost",
"xyzzy", 8080, 9090, 7070);
ExtendedBlock block = new ExtendedBlock("poolid", 123);
TestFileDescriptorPair pair = new TestFileDescriptorPair();
cache.put(dnId, block, pair.getFileInputStreams());
Thread.sleep(cache.getExpiryTimeMs() * 100);
FileInputStream fis[] = cache.get(dnId, block);
Assert.assertNull(fis);
pair.close();
cache.close();
}
@Test
public void testEviction() throws Exception {
FileInputStreamCache cache = new FileInputStreamCache(1, 10000000);
DatanodeID dnId = new DatanodeID("127.0.0.1", "localhost",
"xyzzy", 8080, 9090, 7070);
ExtendedBlock block = new ExtendedBlock("poolid", 123);
TestFileDescriptorPair pair = new TestFileDescriptorPair();
cache.put(dnId, block, pair.getFileInputStreams());
DatanodeID dnId2 = new DatanodeID("127.0.0.1", "localhost",
"xyzzy", 8081, 9091, 7071);
TestFileDescriptorPair pair2 = new TestFileDescriptorPair();
cache.put(dnId2, block, pair2.getFileInputStreams());
FileInputStream fis[] = cache.get(dnId, block);
Assert.assertNull(fis);
FileInputStream fis2[] = cache.get(dnId2, block);
Assert.assertTrue(pair2.compareWith(fis2));
pair.close();
cache.close();
}
}
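Condensed, the (DatanodeID, ExtendedBlock)-keyed contract these three tests pin down looks roughly like the sketch below; the method is illustrative and assumes the dnId, block, and stream objects are built as in the tests above:
static void demoCacheContract(DatanodeID dnId, ExtendedBlock block,
FileInputStream[] streams) throws IOException {
FileInputStreamCache cache = new FileInputStreamCache(1, 1000); // capacity, expiry ms
cache.put(dnId, block, streams); // keyed by (DatanodeID, ExtendedBlock)
FileInputStream[] hit = cache.get(dnId, block); // null once expired or evicted
cache.close();
}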

View File

@ -17,53 +17,30 @@
*/
package org.apache.hadoop.hdfs;
import java.io.IOException;
import org.apache.commons.logging.impl.Log4JLogger;
import org.apache.hadoop.hdfs.protocol.datatransfer.DataTransferProtocol;
import org.apache.log4j.Level;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
public class TestParallelRead extends TestParallelReadUtil {
@BeforeClass
static public void setupCluster() throws Exception {
setupCluster(DEFAULT_REPLICATION_FACTOR, new HdfsConfiguration());
// This is a test of the normal (TCP) read path. For this reason, we turn
// off both short-circuit local reads and UNIX domain socket data traffic.
HdfsConfiguration conf = new HdfsConfiguration();
conf.setBoolean(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_KEY, false);
conf.setBoolean(DFSConfigKeys.DFS_CLIENT_DOMAIN_SOCKET_DATA_TRAFFIC,
false);
// dfs.domain.socket.path should be ignored because the previous two keys
// were set to false. This is a regression test for HDFS-4473.
conf.set(DFSConfigKeys.DFS_DOMAIN_SOCKET_PATH_KEY, "/will/not/be/created");
setupCluster(DEFAULT_REPLICATION_FACTOR, conf);
}
@AfterClass
static public void teardownCluster() throws Exception {
TestParallelReadUtil.teardownCluster();
}
/**
* Do parallel read several times with different number of files and threads.
*
* Note that while this is the only "test" in a junit sense, we're actually
* dispatching a lot more. Failures in the other methods (and other threads)
* need to be manually collected, which is inconvenient.
*/
@Test
public void testParallelReadCopying() throws IOException {
runTestWorkload(new CopyingReadWorkerHelper());
}
@Test
public void testParallelReadByteBuffer() throws IOException {
runTestWorkload(new DirectReadWorkerHelper());
}
@Test
public void testParallelReadMixed() throws IOException {
runTestWorkload(new MixedWorkloadHelper());
}
@Test
public void testParallelNoChecksums() throws IOException {
verifyChecksums = false;
runTestWorkload(new MixedWorkloadHelper());
}
}
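A hypothetical follow-up assertion, not part of this test, would make the HDFS-4473 expectation explicit: with both read paths disabled above, the configured socket path should never be bound or created.
static void assertSocketPathIgnored() {
// Illustrative only: the path set in setupCluster() must not exist,
// since both short-circuit keys were set to false.
org.junit.Assert.assertFalse(
new java.io.File("/will/not/be/created").exists());
}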

View File

@ -32,12 +32,18 @@ import org.apache.hadoop.hdfs.server.datanode.DataNode;
import org.apache.hadoop.util.Time;
import org.apache.log4j.Level;
import org.apache.log4j.LogManager;
import org.junit.Assume;
import org.junit.Ignore;
import org.junit.Test;
/**
* Driver class for testing the use of DFSInputStream by multiple concurrent
* readers, using the different read APIs. See subclasses for the actual test
* cases.
* readers, using the different read APIs.
*
* This class is marked as @Ignore so that junit doesn't try to execute the
* tests in here directly. They are executed from subclasses.
*/
@Ignore
public class TestParallelReadUtil {
static final Log LOG = LogFactory.getLog(TestParallelReadUtil.class);
@ -388,4 +394,31 @@ public class TestParallelReadUtil {
util.shutdown();
}
/**
* Do parallel read several times with different number of files and threads.
*
* Note that while this is the only "test" in a junit sense, we're actually
* dispatching a lot more. Failures in the other methods (and other threads)
* need to be manually collected, which is inconvenient.
*/
@Test
public void testParallelReadCopying() throws IOException {
runTestWorkload(new CopyingReadWorkerHelper());
}
@Test
public void testParallelReadByteBuffer() throws IOException {
runTestWorkload(new DirectReadWorkerHelper());
}
@Test
public void testParallelReadMixed() throws IOException {
runTestWorkload(new MixedWorkloadHelper());
}
@Test
public void testParallelNoChecksums() throws IOException {
verifyChecksums = false;
runTestWorkload(new MixedWorkloadHelper());
}
}
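The @Ignore-base-class arrangement the javadoc above describes, in miniature; the class and method names here are illustrative, not from this commit:
import java.io.IOException;
import org.junit.BeforeClass;
import org.junit.Ignore;
import org.junit.Test;
@Ignore // junit skips the base class itself
class BaseReadSuite {
@Test
public void testRead() throws IOException { /* shared workload */ }
}
// Each subclass supplies its own cluster configuration and inherits
// every @Test from the base, so one workload runs per transport.
class TcpOnlyReadSuite extends BaseReadSuite {
@BeforeClass
public static void setup() throws Exception { /* TCP-only conf */ }
}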

View File

@ -17,25 +17,25 @@
*/
package org.apache.hadoop.hdfs;
import java.io.IOException;
import org.apache.hadoop.net.unix.DomainSocket;
import org.apache.hadoop.security.UserGroupInformation;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
public class TestParallelLocalRead extends TestParallelReadUtil {
public class TestParallelShortCircuitLegacyRead extends TestParallelReadUtil {
@BeforeClass
static public void setupCluster() throws Exception {
DFSInputStream.tcpReadsDisabledForTesting = true;
HdfsConfiguration conf = new HdfsConfiguration();
conf.set(DFSConfigKeys.DFS_DOMAIN_SOCKET_PATH_KEY, "");
conf.setBoolean(DFSConfigKeys.DFS_CLIENT_USE_LEGACY_BLOCKREADERLOCAL, true);
conf.setBoolean(DFSConfigKeys.DFS_CLIENT_DOMAIN_SOCKET_DATA_TRAFFIC, false);
conf.setBoolean(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_KEY, true);
conf.setBoolean(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_SKIP_CHECKSUM_KEY,
false);
conf.setBoolean(DFSConfigKeys.
DFS_CLIENT_READ_SHORTCIRCUIT_SKIP_CHECKSUM_KEY, false);
conf.set(DFSConfigKeys.DFS_BLOCK_LOCAL_PATH_ACCESS_USER_KEY,
UserGroupInformation.getCurrentUser().getShortUserName());
DomainSocket.disableBindPathValidation();
setupCluster(1, conf);
}
@ -43,26 +43,4 @@ public class TestParallelLocalRead extends TestParallelReadUtil {
static public void teardownCluster() throws Exception {
TestParallelReadUtil.teardownCluster();
}
/**
* Do parallel read several times with different number of files and threads.
*
* Note that while this is the only "test" in a junit sense, we're actually
* dispatching a lot more. Failures in the other methods (and other threads)
* need to be manually collected, which is inconvenient.
*/
@Test
public void testParallelReadCopying() throws IOException {
runTestWorkload(new CopyingReadWorkerHelper());
}
@Test
public void testParallelReadByteBuffer() throws IOException {
runTestWorkload(new DirectReadWorkerHelper());
}
@Test
public void testParallelReadMixed() throws IOException {
runTestWorkload(new MixedWorkloadHelper());
}
}

View File

@ -0,0 +1,59 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs;
import java.io.File;
import org.apache.hadoop.net.unix.DomainSocket;
import org.apache.hadoop.net.unix.TemporarySocketDirectory;
import org.junit.AfterClass;
import org.junit.Assume;
import org.junit.Before;
import org.junit.BeforeClass;
import static org.hamcrest.CoreMatchers.*;
public class TestParallelShortCircuitRead extends TestParallelReadUtil {
private static TemporarySocketDirectory sockDir;
@BeforeClass
static public void setupCluster() throws Exception {
if (DomainSocket.getLoadingFailureReason() != null) return;
DFSInputStream.tcpReadsDisabledForTesting = true;
sockDir = new TemporarySocketDirectory();
HdfsConfiguration conf = new HdfsConfiguration();
conf.set(DFSConfigKeys.DFS_DOMAIN_SOCKET_PATH_KEY,
new File(sockDir.getDir(), "TestParallelLocalRead.%d.sock").getAbsolutePath());
conf.setBoolean(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_KEY, true);
conf.setBoolean(DFSConfigKeys.
DFS_CLIENT_READ_SHORTCIRCUIT_SKIP_CHECKSUM_KEY, false);
DomainSocket.disableBindPathValidation();
setupCluster(1, conf);
}
@Before
public void before() {
Assume.assumeThat(DomainSocket.getLoadingFailureReason(), equalTo(null));
}
@AfterClass
static public void teardownCluster() throws Exception {
if (DomainSocket.getLoadingFailureReason() != null) return;
sockDir.close();
TestParallelReadUtil.teardownCluster();
}
}

View File

@ -0,0 +1,59 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs;
import java.io.File;
import org.apache.hadoop.net.unix.DomainSocket;
import org.apache.hadoop.net.unix.TemporarySocketDirectory;
import org.junit.AfterClass;
import org.junit.Assume;
import org.junit.Before;
import org.junit.BeforeClass;
import static org.hamcrest.CoreMatchers.*;
public class TestParallelShortCircuitReadNoChecksum extends TestParallelReadUtil {
private static TemporarySocketDirectory sockDir;
@BeforeClass
static public void setupCluster() throws Exception {
if (DomainSocket.getLoadingFailureReason() != null) return;
DFSInputStream.tcpReadsDisabledForTesting = true;
sockDir = new TemporarySocketDirectory();
HdfsConfiguration conf = new HdfsConfiguration();
conf.set(DFSConfigKeys.DFS_DOMAIN_SOCKET_PATH_KEY,
new File(sockDir.getDir(), "TestParallelLocalRead.%d.sock").getAbsolutePath());
conf.setBoolean(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_KEY, true);
conf.setBoolean(DFSConfigKeys.
DFS_CLIENT_READ_SHORTCIRCUIT_SKIP_CHECKSUM_KEY, true);
DomainSocket.disableBindPathValidation();
setupCluster(1, conf);
}
@Before
public void before() {
Assume.assumeThat(DomainSocket.getLoadingFailureReason(), equalTo(null));
}
@AfterClass
static public void teardownCluster() throws Exception {
if (DomainSocket.getLoadingFailureReason() != null) return;
sockDir.close();
TestParallelReadUtil.teardownCluster();
}
}

View File

@ -0,0 +1,73 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs;
import java.io.File;
import org.apache.hadoop.net.unix.DomainSocket;
import org.apache.hadoop.net.unix.TemporarySocketDirectory;
import org.junit.AfterClass;
import org.junit.Assume;
import org.junit.Before;
import org.junit.BeforeClass;
import static org.hamcrest.CoreMatchers.*;
/**
* This class tests short-circuit local reads without any FileInputStream or
* Socket caching. This is a regression test for HDFS-4417.
*/
public class TestParallelShortCircuitReadUnCached extends TestParallelReadUtil {
private static TemporarySocketDirectory sockDir;
@BeforeClass
static public void setupCluster() throws Exception {
if (DomainSocket.getLoadingFailureReason() != null) return;
sockDir = new TemporarySocketDirectory();
HdfsConfiguration conf = new HdfsConfiguration();
conf.set(DFSConfigKeys.DFS_DOMAIN_SOCKET_PATH_KEY,
new File(sockDir.getDir(),
"TestParallelShortCircuitReadUnCached._PORT.sock").getAbsolutePath());
conf.setBoolean(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_KEY, true);
conf.setBoolean(DFSConfigKeys.
DFS_CLIENT_READ_SHORTCIRCUIT_SKIP_CHECKSUM_KEY, false);
conf.setBoolean(DFSConfigKeys.
DFS_CLIENT_DOMAIN_SOCKET_DATA_TRAFFIC, true);
// We want to test reading from stale sockets.
conf.setInt(DFSConfigKeys.DFS_DATANODE_SOCKET_REUSE_KEEPALIVE_KEY, 1);
conf.setLong(DFSConfigKeys.DFS_CLIENT_SOCKET_CACHE_EXPIRY_MSEC_KEY,
5 * 60 * 1000);
conf.setInt(DFSConfigKeys.DFS_CLIENT_SOCKET_CACHE_CAPACITY_KEY, 32);
// Avoid using the FileInputStreamCache.
conf.setInt(DFSConfigKeys.
DFS_CLIENT_READ_SHORTCIRCUIT_STREAMS_CACHE_SIZE_KEY, 0);
DomainSocket.disableBindPathValidation();
DFSInputStream.tcpReadsDisabledForTesting = true;
setupCluster(1, conf);
}
@Before
public void before() {
Assume.assumeThat(DomainSocket.getLoadingFailureReason(), equalTo(null));
}
@AfterClass
static public void teardownCluster() throws Exception {
if (DomainSocket.getLoadingFailureReason() != null) return;
sockDir.close();
TestParallelReadUtil.teardownCluster();
}
}

View File

@ -0,0 +1,58 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs;
import java.io.File;
import org.apache.hadoop.net.unix.DomainSocket;
import org.apache.hadoop.net.unix.TemporarySocketDirectory;
import org.junit.AfterClass;
import org.junit.Assume;
import org.junit.Before;
import org.junit.BeforeClass;
import static org.hamcrest.CoreMatchers.*;
public class TestParallelUnixDomainRead extends TestParallelReadUtil {
private static TemporarySocketDirectory sockDir;
@BeforeClass
static public void setupCluster() throws Exception {
if (DomainSocket.getLoadingFailureReason() != null) return;
DFSInputStream.tcpReadsDisabledForTesting = true;
sockDir = new TemporarySocketDirectory();
HdfsConfiguration conf = new HdfsConfiguration();
conf.set(DFSConfigKeys.DFS_DOMAIN_SOCKET_PATH_KEY,
new File(sockDir.getDir(), "TestParallelLocalRead.%d.sock").getAbsolutePath());
conf.setBoolean(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_KEY, false);
conf.setBoolean(DFSConfigKeys.DFS_CLIENT_DOMAIN_SOCKET_DATA_TRAFFIC, true);
DomainSocket.disableBindPathValidation();
setupCluster(1, conf);
}
@Before
public void before() {
Assume.assumeThat(DomainSocket.getLoadingFailureReason(), equalTo(null));
}
@AfterClass
static public void teardownCluster() throws Exception {
if (DomainSocket.getLoadingFailureReason() != null) return;
sockDir.close();
TestParallelReadUtil.teardownCluster();
}
}

View File

@ -0,0 +1,288 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertSame;
import static org.junit.Assert.assertTrue;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.channels.ReadableByteChannel;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hdfs.protocol.DatanodeID;
import org.apache.hadoop.hdfs.net.Peer;
import org.apache.hadoop.net.unix.DomainSocket;
import org.junit.Test;
import org.mockito.Mockito;
import org.mockito.invocation.InvocationOnMock;
import org.mockito.stubbing.Answer;
import com.google.common.collect.HashMultiset;
public class TestPeerCache {
static final Log LOG = LogFactory.getLog(TestPeerCache.class);
private static class FakePeer implements Peer {
private boolean closed = false;
private final boolean hasDomain;
private DatanodeID dnId;
public FakePeer(DatanodeID dnId, boolean hasDomain) {
this.dnId = dnId;
this.hasDomain = hasDomain;
}
@Override
public ReadableByteChannel getInputStreamChannel() {
throw new UnsupportedOperationException();
}
@Override
public void setReadTimeout(int timeoutMs) throws IOException {
throw new UnsupportedOperationException();
}
@Override
public int getReceiveBufferSize() throws IOException {
throw new UnsupportedOperationException();
}
@Override
public boolean getTcpNoDelay() throws IOException {
return false;
}
@Override
public void setWriteTimeout(int timeoutMs) throws IOException {
throw new UnsupportedOperationException();
}
@Override
public boolean isClosed() {
return closed;
}
@Override
public void close() throws IOException {
closed = true;
}
@Override
public String getRemoteAddressString() {
return dnId.getInfoAddr();
}
@Override
public String getLocalAddressString() {
return "127.0.0.1:123";
}
@Override
public InputStream getInputStream() throws IOException {
throw new UnsupportedOperationException();
}
@Override
public OutputStream getOutputStream() throws IOException {
throw new UnsupportedOperationException();
}
@Override
public boolean isLocal() {
return true;
}
@Override
public String toString() {
return "FakePeer(dnId=" + dnId + ")";
}
@Override
public DomainSocket getDomainSocket() {
if (!hasDomain) return null;
// Return a mock which throws an exception whenever any function is
// called.
return Mockito.mock(DomainSocket.class,
new Answer<Object>() {
@Override
public Object answer(InvocationOnMock invocation)
throws Throwable {
throw new RuntimeException("injected fault.");
} });
}
@Override
public boolean equals(Object o) {
if (!(o instanceof FakePeer)) return false;
FakePeer other = (FakePeer)o;
return hasDomain == other.hasDomain &&
dnId.equals(other.dnId);
}
@Override
public int hashCode() {
return dnId.hashCode() ^ (hasDomain ? 1 : 0);
}
}
@Test
public void testAddAndRetrieve() throws Exception {
PeerCache cache = new PeerCache(3, 100000);
DatanodeID dnId = new DatanodeID("192.168.0.1",
"fakehostname", "fake_storage_id",
100, 101, 102);
FakePeer peer = new FakePeer(dnId, false);
cache.put(dnId, peer);
assertTrue(!peer.isClosed());
assertEquals(1, cache.size());
assertEquals(peer, cache.get(dnId, false));
assertEquals(0, cache.size());
cache.close();
}
@Test
public void testExpiry() throws Exception {
final int CAPACITY = 3;
final int EXPIRY_PERIOD = 10;
PeerCache cache = new PeerCache(CAPACITY, EXPIRY_PERIOD);
DatanodeID dnIds[] = new DatanodeID[CAPACITY];
FakePeer peers[] = new FakePeer[CAPACITY];
for (int i = 0; i < CAPACITY; ++i) {
dnIds[i] = new DatanodeID("192.168.0.1",
"fakehostname_" + i, "fake_storage_id",
100, 101, 102);
peers[i] = new FakePeer(dnIds[i], false);
}
for (int i = 0; i < CAPACITY; ++i) {
cache.put(dnIds[i], peers[i]);
}
// Wait for the peers to expire
Thread.sleep(EXPIRY_PERIOD * 50);
assertEquals(0, cache.size());
// make sure that the peers were closed when they were expired
for (int i = 0; i < CAPACITY; ++i) {
assertTrue(peers[i].isClosed());
}
// sleep again and make sure that
// the daemon thread runs fine on an empty cache
Thread.sleep(EXPIRY_PERIOD * 50);
cache.close();
}
@Test
public void testEviction() throws Exception {
final int CAPACITY = 3;
PeerCache cache = new PeerCache(CAPACITY, 100000);
DatanodeID dnIds[] = new DatanodeID[CAPACITY + 1];
FakePeer peers[] = new FakePeer[CAPACITY + 1];
for (int i = 0; i < dnIds.length; ++i) {
dnIds[i] = new DatanodeID("192.168.0.1",
"fakehostname_" + i, "fake_storage_id_" + i,
100, 101, 102);
peers[i] = new FakePeer(dnIds[i], false);
}
for (int i = 0; i < CAPACITY; ++i) {
cache.put(dnIds[i], peers[i]);
}
// Check that the peers are cached
assertEquals(CAPACITY, cache.size());
// Add another entry and check that the first entry was evicted
cache.put(dnIds[CAPACITY], peers[CAPACITY]);
assertEquals(CAPACITY, cache.size());
assertSame(null, cache.get(dnIds[0], false));
// Make sure that the other entries are still there
for (int i = 1; i < CAPACITY; ++i) {
Peer peer = cache.get(dnIds[i], false);
assertSame(peers[i], peer);
assertTrue(!peer.isClosed());
peer.close();
}
assertEquals(1, cache.size());
cache.close();
}
@Test
public void testMultiplePeersWithSameKey() throws Exception {
final int CAPACITY = 3;
PeerCache cache = new PeerCache(CAPACITY, 100000);
DatanodeID dnId = new DatanodeID("192.168.0.1",
"fakehostname", "fake_storage_id",
100, 101, 102);
HashMultiset<FakePeer> peers = HashMultiset.create(CAPACITY);
for (int i = 0; i < CAPACITY; ++i) {
FakePeer peer = new FakePeer(dnId, false);
peers.add(peer);
cache.put(dnId, peer);
}
// Check that all of the peers ended up in the cache
assertEquals(CAPACITY, cache.size());
while (!peers.isEmpty()) {
Peer peer = cache.get(dnId, false);
assertTrue(peer != null);
assertTrue(!peer.isClosed());
peers.remove(peer);
}
assertEquals(0, cache.size());
cache.close();
}
@Test
public void testDomainSocketPeers() throws Exception {
final int CAPACITY = 3;
PeerCache cache = new PeerCache(CAPACITY, 100000);
DatanodeID dnId = new DatanodeID("192.168.0.1",
"fakehostname", "fake_storage_id",
100, 101, 102);
HashMultiset<FakePeer> peers = HashMultiset.create(CAPACITY);
for (int i = 0; i < CAPACITY; ++i) {
FakePeer peer = new FakePeer(dnId, i == CAPACITY - 1);
peers.add(peer);
cache.put(dnId, peer);
}
// Check that all of the peers ended up in the cache
assertEquals(CAPACITY, cache.size());
// Test that get(requireDomainPeer=true) finds the peer with the
// domain socket.
Peer peer = cache.get(dnId, true);
assertTrue(peer.getDomainSocket() != null);
peers.remove(peer);
// Test that get(requireDomainPeer=true) returns null when there are
// no more peers with domain sockets.
peer = cache.get(dnId, true);
assertTrue(peer == null);
// Check that all of the other peers ended up in the cache.
while (!peers.isEmpty()) {
peer = cache.get(dnId, false);
assertTrue(peer != null);
assertTrue(!peer.isClosed());
peers.remove(peer);
}
assertEquals(0, cache.size());
cache.close();
}
}
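Condensed, the PeerCache behavior the tests above rely on, as a sketch; the method and its parameters are illustrative, with constructor arguments as used in the tests:
static void demoPeerCacheContract(DatanodeID dnId, Peer tcpPeer,
Peer domainPeer) throws IOException {
PeerCache cache = new PeerCache(3, 100000); // capacity, expiry ms
cache.put(dnId, tcpPeer);
cache.put(dnId, domainPeer); // several peers may share one DatanodeID key
Peer dom = cache.get(dnId, true); // true: only a peer with a DomainSocket matches
Peer any = cache.get(dnId, false); // get() removes and returns a cached peer
cache.close();
}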

View File

@ -99,7 +99,7 @@ public class TestSafeMode {
*/
@Test
public void testManualSafeMode() throws IOException {
fs = (DistributedFileSystem)cluster.getFileSystem();
fs = cluster.getFileSystem();
Path file1 = new Path("/tmp/testManualSafeMode/file1");
Path file2 = new Path("/tmp/testManualSafeMode/file2");
@ -112,7 +112,7 @@ public class TestSafeMode {
// now bring up just the NameNode.
cluster = new MiniDFSCluster.Builder(conf).numDataNodes(0).format(false).build();
cluster.waitActive();
dfs = (DistributedFileSystem)cluster.getFileSystem();
dfs = cluster.getFileSystem();
assertTrue("No datanode is started. Should be in SafeMode",
dfs.setSafeMode(SafeModeAction.SAFEMODE_GET));
@ -322,11 +322,11 @@ public class TestSafeMode {
fs.rename(file1, new Path("file2"));
}});
try {
fs.setTimes(file1, 0, 0);
} catch (IOException ioe) {
fail("Set times failed while in SM");
}
runFsFun("Set time while in SM", new FSRun() {
@Override
public void run(FileSystem fs) throws IOException {
fs.setTimes(file1, 0, 0);
}});
try {
DFSTestUtil.readFile(fs, file1);
@ -350,7 +350,7 @@ public class TestSafeMode {
conf.setInt(DFSConfigKeys.DFS_NAMENODE_SAFEMODE_MIN_DATANODES_KEY, 1);
cluster.restartNameNode();
fs = (DistributedFileSystem)cluster.getFileSystem();
fs = cluster.getFileSystem();
String tipMsg = cluster.getNamesystem().getSafemode();
assertTrue("Safemode tip message looks right: " + tipMsg,
@ -375,7 +375,7 @@ public class TestSafeMode {
* @throws IOException when there's an issue connecting to the test DFS.
*/
public void testSafeModeUtils() throws IOException {
dfs = (DistributedFileSystem)cluster.getFileSystem();
dfs = cluster.getFileSystem();
// Enter safemode.
dfs.setSafeMode(SafeModeAction.SAFEMODE_ENTER);

View File

@ -21,10 +21,13 @@ import static org.junit.Assert.assertTrue;
import static org.junit.Assert.assertFalse;
import java.io.EOFException;
import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.net.URI;
import java.nio.ByteBuffer;
import java.security.PrivilegedExceptionAction;
import java.util.concurrent.TimeoutException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
@ -32,7 +35,6 @@ import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.client.HdfsDataInputStream;
import org.apache.hadoop.hdfs.protocol.BlockLocalPathInfo;
import org.apache.hadoop.hdfs.protocol.ClientDatanodeProtocol;
import org.apache.hadoop.hdfs.protocol.DatanodeID;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
@ -40,15 +42,21 @@ import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
import org.apache.hadoop.hdfs.protocol.datatransfer.DataTransferProtocol;
import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier;
import org.apache.hadoop.hdfs.server.datanode.DataNode;
import org.apache.hadoop.hdfs.server.datanode.DataNodeTestUtils;
import org.apache.hadoop.hdfs.server.datanode.SimulatedFSDataset;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.net.unix.DomainSocket;
import org.apache.hadoop.net.unix.TemporarySocketDirectory;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Time;
import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.Assume;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
import static org.hamcrest.CoreMatchers.*;
/**
* Test for short circuit read functionality using {@link BlockReaderLocal}.
@ -58,9 +66,24 @@ import org.junit.Test;
* system.
*/
public class TestShortCircuitLocalRead {
private static TemporarySocketDirectory sockDir;
static final String DIR = "/" + TestShortCircuitLocalRead.class.getSimpleName() + "/";
@BeforeClass
public static void init() {
sockDir = new TemporarySocketDirectory();
DomainSocket.disableBindPathValidation();
}
@AfterClass
public static void shutdown() throws IOException {
sockDir.close();
}
@Before
public void before() {
Assume.assumeThat(DomainSocket.getLoadingFailureReason(), equalTo(null));
}
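For orientation, the short-circuit client setup the tests in this file keep repeating, collapsed into one sketch; the keys are the ones used below, while the path value is illustrative (with the same "_PORT" placeholder convention seen elsewhere in this file):
static Configuration shortCircuitConf(File sockDir) {
Configuration conf = new Configuration();
conf.setBoolean(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_KEY, true);
conf.set(DFSConfigKeys.DFS_DOMAIN_SOCKET_PATH_KEY,
new File(sockDir, "example._PORT.sock").getAbsolutePath());
DomainSocket.disableBindPathValidation(); // as the tests here do in init()
return conf;
}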
static final long seed = 0xDEADBEEFL;
static final int blockSize = 5120;
boolean simulatedStorage = false;
@ -84,7 +107,9 @@ public class TestShortCircuitLocalRead {
for (int idx = 0; idx < len; idx++) {
if (expected[from + idx] != actual[idx]) {
Assert.fail(message + " byte " + (from + idx) + " differs. expected "
+ expected[from + idx] + " actual " + actual[idx]);
+ expected[from + idx] + " actual " + actual[idx] +
"\nexpected: " + StringUtils.byteToHexString(expected, from, from + len) +
"\nactual: " + StringUtils.byteToHexString(actual, 0, len));
}
}
}
@ -96,11 +121,13 @@ public class TestShortCircuitLocalRead {
/** Check file content, reading as user {@code readingUser} */
static void checkFileContent(URI uri, Path name, byte[] expected,
int readOffset, String readingUser, Configuration conf,
boolean shortCircuitFails)
boolean legacyShortCircuitFails)
throws IOException, InterruptedException {
// Ensure short circuit is enabled
DistributedFileSystem fs = getFileSystem(readingUser, uri, conf);
assertTrue(fs.getClient().getShortCircuitLocalReads());
if (legacyShortCircuitFails) {
assertTrue(fs.getClient().useLegacyBlockReaderLocal());
}
FSDataInputStream stm = fs.open(name);
byte[] actual = new byte[expected.length-readOffset];
@ -127,9 +154,8 @@ public class TestShortCircuitLocalRead {
}
checkData(actual, readOffset, expected, "Read 3");
if (shortCircuitFails) {
// short circuit should be disabled due to failure
assertFalse(fs.getClient().getShortCircuitLocalReads());
if (legacyShortCircuitFails) {
assertFalse(fs.getClient().useLegacyBlockReaderLocal());
}
stm.close();
}
@ -145,11 +171,13 @@ public class TestShortCircuitLocalRead {
/** Check the file content, reading as user {@code readingUser} */
static void checkFileContentDirect(URI uri, Path name, byte[] expected,
int readOffset, String readingUser, Configuration conf,
boolean shortCircuitFails)
boolean legacyShortCircuitFails)
throws IOException, InterruptedException {
// Ensure short circuit is enabled
DistributedFileSystem fs = getFileSystem(readingUser, uri, conf);
assertTrue(fs.getClient().getShortCircuitLocalReads());
if (legacyShortCircuitFails) {
assertTrue(fs.getClient().useLegacyBlockReaderLocal());
}
HdfsDataInputStream stm = (HdfsDataInputStream)fs.open(name);
@ -180,33 +208,45 @@ public class TestShortCircuitLocalRead {
nread += nbytes;
}
checkData(arrayFromByteBuffer(actual), readOffset, expected, "Read 3");
if (shortCircuitFails) {
// short circuit should be disabled due to failure
assertFalse(fs.getClient().getShortCircuitLocalReads());
if (legacyShortCircuitFails) {
assertFalse(fs.getClient().useLegacyBlockReaderLocal());
}
stm.close();
}
public void doTestShortCircuitReadLegacy(boolean ignoreChecksum, int size,
int readOffset, String shortCircuitUser, String readingUser,
boolean legacyShortCircuitFails) throws IOException, InterruptedException {
doTestShortCircuitReadImpl(ignoreChecksum, size, readOffset,
shortCircuitUser, readingUser, legacyShortCircuitFails);
}
public void doTestShortCircuitRead(boolean ignoreChecksum, int size,
int readOffset) throws IOException, InterruptedException {
String shortCircuitUser = getCurrentUser();
doTestShortCircuitRead(ignoreChecksum, size, readOffset, shortCircuitUser,
shortCircuitUser, false);
doTestShortCircuitReadImpl(ignoreChecksum, size, readOffset,
null, getCurrentUser(), false);
}
/**
* Test that file data can be read by reading the block file
* directly from the local store.
*/
public void doTestShortCircuitRead(boolean ignoreChecksum, int size,
public void doTestShortCircuitReadImpl(boolean ignoreChecksum, int size,
int readOffset, String shortCircuitUser, String readingUser,
boolean shortCircuitFails) throws IOException, InterruptedException {
boolean legacyShortCircuitFails) throws IOException, InterruptedException {
Configuration conf = new Configuration();
conf.setBoolean(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_KEY, true);
conf.setBoolean(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_SKIP_CHECKSUM_KEY,
ignoreChecksum);
conf.set(DFSConfigKeys.DFS_BLOCK_LOCAL_PATH_ACCESS_USER_KEY,
shortCircuitUser);
conf.set(DFSConfigKeys.DFS_DOMAIN_SOCKET_PATH_KEY,
new File(sockDir.getDir(),
"TestShortCircuitLocalRead._PORT.sock").getAbsolutePath());
if (shortCircuitUser != null) {
conf.set(DFSConfigKeys.DFS_BLOCK_LOCAL_PATH_ACCESS_USER_KEY,
shortCircuitUser);
conf.setBoolean(DFSConfigKeys.DFS_CLIENT_USE_LEGACY_BLOCKREADERLOCAL, true);
}
if (simulatedStorage) {
SimulatedFSDataset.setFactory(conf);
}
@ -228,9 +268,9 @@ public class TestShortCircuitLocalRead {
URI uri = cluster.getURI();
checkFileContent(uri, file1, fileData, readOffset, readingUser, conf,
shortCircuitFails);
legacyShortCircuitFails);
checkFileContentDirect(uri, file1, fileData, readOffset, readingUser,
conf, shortCircuitFails);
conf, legacyShortCircuitFails);
} finally {
fs.close();
cluster.shutdown();
@ -255,6 +295,12 @@ public class TestShortCircuitLocalRead {
doTestShortCircuitRead(true, 13, 5);
}
@Test(timeout=10000)
public void testLocalReadLegacy() throws Exception {
doTestShortCircuitReadLegacy(true, 13, 0, getCurrentUser(),
getCurrentUser(), false);
}
/**
* Try a short circuit from a reader that is not allowed to
* use short circuit. The test ensures the reader falls back to non
@ -262,7 +308,7 @@ public class TestShortCircuitLocalRead {
*/
@Test(timeout=10000)
public void testLocalReadFallback() throws Exception {
doTestShortCircuitRead(true, 13, 0, getCurrentUser(), "notallowed", true);
doTestShortCircuitReadLegacy(true, 13, 0, getCurrentUser(), "notallowed", true);
}
@Test(timeout=10000)
@ -276,7 +322,7 @@ public class TestShortCircuitLocalRead {
doTestShortCircuitRead(false, 10*blockSize+100, 777);
doTestShortCircuitRead(true, 10*blockSize+100, 777);
}
private ClientDatanodeProtocol getProxy(UserGroupInformation ugi,
final DatanodeID dnInfo, final Configuration conf) throws IOException,
InterruptedException {
@ -301,21 +347,15 @@ public class TestShortCircuitLocalRead {
}
@Test(timeout=10000)
public void testGetBlockLocalPathInfo() throws IOException, InterruptedException {
public void testDeprecatedGetBlockLocalPathInfoRpc()
throws IOException, InterruptedException {
final Configuration conf = new Configuration();
conf.set(DFSConfigKeys.DFS_BLOCK_LOCAL_PATH_ACCESS_USER_KEY,
"alloweduser1,alloweduser2");
MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).numDataNodes(1)
.format(true).build();
cluster.waitActive();
final DataNode dn = cluster.getDataNodes().get(0);
FileSystem fs = cluster.getFileSystem();
try {
DFSTestUtil.createFile(fs, new Path("/tmp/x"), 16, (short) 1, 23);
UserGroupInformation aUgi1 =
UserGroupInformation.createRemoteUser("alloweduser1");
UserGroupInformation aUgi2 =
UserGroupInformation.createRemoteUser("alloweduser2");
LocatedBlocks lb = cluster.getNameNode().getRpcServer()
.getBlockLocations("/tmp/x", 0, 16);
// Create a new block object, because the block inside LocatedBlock at
@ -323,29 +363,11 @@ public class TestShortCircuitLocalRead {
ExtendedBlock blk = new ExtendedBlock(lb.get(0).getBlock());
Token<BlockTokenIdentifier> token = lb.get(0).getBlockToken();
final DatanodeInfo dnInfo = lb.get(0).getLocations()[0];
ClientDatanodeProtocol proxy = getProxy(aUgi1, dnInfo, conf);
// This should succeed
BlockLocalPathInfo blpi = proxy.getBlockLocalPathInfo(blk, token);
Assert.assertEquals(
DataNodeTestUtils.getFSDataset(dn).getBlockLocalPathInfo(blk).getBlockPath(),
blpi.getBlockPath());
// Try with the other allowed user
proxy = getProxy(aUgi2, dnInfo, conf);
// This should succeed as well
blpi = proxy.getBlockLocalPathInfo(blk, token);
Assert.assertEquals(
DataNodeTestUtils.getFSDataset(dn).getBlockLocalPathInfo(blk).getBlockPath(),
blpi.getBlockPath());
// Now try with a disallowed user
UserGroupInformation bUgi = UserGroupInformation
.createRemoteUser("notalloweduser");
proxy = getProxy(bUgi, dnInfo, conf);
ClientDatanodeProtocol proxy =
DFSUtil.createClientDatanodeProtocolProxy(dnInfo, conf, 60000, false);
try {
proxy.getBlockLocalPathInfo(blk, token);
Assert.fail("The call should have failed as " + bUgi.getShortUserName()
Assert.fail("The call should have failed as this user "
+ " is not allowed to call getBlockLocalPathInfo");
} catch (IOException ex) {
Assert.assertTrue(ex.getMessage().contains(
@ -363,8 +385,9 @@ public class TestShortCircuitLocalRead {
Configuration conf = new Configuration();
conf.setBoolean(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_KEY, true);
conf.setBoolean(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_SKIP_CHECKSUM_KEY, false);
conf.set(DFSConfigKeys.DFS_BLOCK_LOCAL_PATH_ACCESS_USER_KEY,
getCurrentUser());
conf.set(DFSConfigKeys.DFS_DOMAIN_SOCKET_PATH_KEY,
"/tmp/testSkipWithVerifyChecksum._PORT");
DomainSocket.disableBindPathValidation();
if (simulatedStorage) {
SimulatedFSDataset.setFactory(conf);
}
@ -402,6 +425,88 @@ public class TestShortCircuitLocalRead {
cluster.shutdown();
}
}
@Test
public void testHandleTruncatedBlockFile() throws IOException {
MiniDFSCluster cluster = null;
HdfsConfiguration conf = new HdfsConfiguration();
conf.setBoolean(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_KEY, true);
conf.setBoolean(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_SKIP_CHECKSUM_KEY, false);
conf.set(DFSConfigKeys.DFS_DOMAIN_SOCKET_PATH_KEY,
"/tmp/testHandleTruncatedBlockFile._PORT");
conf.set(DFSConfigKeys.DFS_CHECKSUM_TYPE_KEY, "CRC32C");
final Path TEST_PATH = new Path("/a");
final Path TEST_PATH2 = new Path("/b");
final long RANDOM_SEED = 4567L;
final long RANDOM_SEED2 = 4568L;
FSDataInputStream fsIn = null;
final int TEST_LENGTH = 3456;
try {
cluster = new MiniDFSCluster.Builder(conf).numDataNodes(1).build();
cluster.waitActive();
FileSystem fs = cluster.getFileSystem();
DFSTestUtil.createFile(fs, TEST_PATH,
TEST_LENGTH, (short)1, RANDOM_SEED);
DFSTestUtil.createFile(fs, TEST_PATH2,
TEST_LENGTH, (short)1, RANDOM_SEED2);
fsIn = cluster.getFileSystem().open(TEST_PATH2);
byte original[] = new byte[TEST_LENGTH];
IOUtils.readFully(fsIn, original, 0, TEST_LENGTH);
fsIn.close();
fsIn = null;
try {
DFSTestUtil.waitReplication(fs, TEST_PATH, (short)1);
} catch (InterruptedException e) {
Assert.fail("unexpected InterruptedException during " +
"waitReplication: " + e);
} catch (TimeoutException e) {
Assert.fail("unexpected TimeoutException during " +
"waitReplication: " + e);
}
ExtendedBlock block = DFSTestUtil.getFirstBlock(fs, TEST_PATH);
File dataFile = MiniDFSCluster.getBlockFile(0, block);
cluster.shutdown();
cluster = null;
RandomAccessFile raf = null;
try {
raf = new RandomAccessFile(dataFile, "rw");
raf.setLength(0);
} finally {
if (raf != null) raf.close();
}
cluster = new MiniDFSCluster.Builder(conf).numDataNodes(1).format(false).build();
cluster.waitActive();
fs = cluster.getFileSystem();
fsIn = fs.open(TEST_PATH);
try {
byte buf[] = new byte[100];
fsIn.seek(2000);
fsIn.readFully(buf, 0, buf.length);
Assert.fail("shouldn't be able to read from corrupt 0-length " +
"block file.");
} catch (IOException e) {
DFSClient.LOG.error("caught exception ", e);
}
fsIn.close();
fsIn = null;
// We should still be able to read the other file.
// This is important because it shows we detected that the previous
// block was corrupt, rather than blaming the problem on a
// communication failure.
fsIn = fs.open(TEST_PATH2);
byte buf[] = new byte[original.length];
fsIn.readFully(buf, 0, buf.length);
TestBlockReaderLocal.assertArrayRegionsEqual(original, 0, buf, 0,
original.length);
fsIn.close();
fsIn = null;
} finally {
if (fsIn != null) fsIn.close();
if (cluster != null) cluster.shutdown();
}
}
/**
* Test to run benchmarks comparing short circuit read vs regular read with
@ -424,6 +529,8 @@ public class TestShortCircuitLocalRead {
// Setup create a file
final Configuration conf = new Configuration();
conf.setBoolean(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_KEY, shortcircuit);
conf.set(DFSConfigKeys.DFS_DOMAIN_SOCKET_PATH_KEY,
"/tmp/TestShortCircuitLocalRead._PORT");
conf.setBoolean(DFSConfigKeys.DFS_CLIENT_READ_SHORTCIRCUIT_SKIP_CHECKSUM_KEY,
checksum);

View File

@ -1,171 +0,0 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertSame;
import static org.junit.Assert.assertTrue;
import static org.mockito.Mockito.spy;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.Socket;
import java.security.PrivilegedExceptionAction;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier;
import org.apache.hadoop.hdfs.server.datanode.DataNode;
import org.apache.hadoop.security.token.Token;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
import org.mockito.Matchers;
import org.mockito.Mockito;
import org.mockito.invocation.InvocationOnMock;
import org.mockito.stubbing.Answer;
/**
* This class tests the client connection caching in a single node
* mini-cluster.
*/
public class TestSocketCache {
static final Log LOG = LogFactory.getLog(TestSocketCache.class);
static final int BLOCK_SIZE = 4096;
static final int FILE_SIZE = 3 * BLOCK_SIZE;
final static int CACHE_SIZE = 4;
final static long CACHE_EXPIRY_MS = 200;
static Configuration conf = null;
static MiniDFSCluster cluster = null;
static FileSystem fs = null;
static SocketCache cache;
static final Path testFile = new Path("/testConnCache.dat");
static byte authenticData[] = null;
static BlockReaderTestUtil util = null;
/**
* A mock Answer to remember the BlockReader used.
*
* It verifies that all invocations of DFSInputStream.getBlockReader()
* use the same socket.
*/
private class MockGetBlockReader implements Answer<RemoteBlockReader2> {
public RemoteBlockReader2 reader = null;
private Socket sock = null;
@Override
public RemoteBlockReader2 answer(InvocationOnMock invocation) throws Throwable {
RemoteBlockReader2 prevReader = reader;
reader = (RemoteBlockReader2) invocation.callRealMethod();
if (sock == null) {
sock = reader.dnSock;
} else if (prevReader != null) {
assertSame("DFSInputStream should use the same socket",
sock, reader.dnSock);
}
return reader;
}
}
@BeforeClass
public static void setupCluster() throws Exception {
final int REPLICATION_FACTOR = 1;
HdfsConfiguration confWithoutCache = new HdfsConfiguration();
confWithoutCache.setInt(
DFSConfigKeys.DFS_CLIENT_SOCKET_CACHE_CAPACITY_KEY, 0);
util = new BlockReaderTestUtil(REPLICATION_FACTOR, confWithoutCache);
cluster = util.getCluster();
conf = util.getConf();
authenticData = util.writeFile(testFile, FILE_SIZE / 1024);
}
/**
* (Optionally) seek to position, read and verify data.
*
* Seek to specified position if pos is non-negative.
*/
private void pread(DFSInputStream in,
long pos,
byte[] buffer,
int offset,
int length)
throws IOException {
assertTrue("Test buffer too small", buffer.length >= offset + length);
if (pos >= 0)
in.seek(pos);
LOG.info("Reading from file of size " + in.getFileLength() +
" at offset " + in.getPos());
int nread = 0;
while (nread < length) {
int cnt = in.read(buffer, offset + nread, length - nread);
assertTrue("Error in read", cnt > 0);
nread += cnt;
}
// Verify the bytes just read: the loop above reads exactly length bytes
// starting at offset, so compare that region against the authentic data
for (int i = 0; i < length; ++i) {
byte actual = buffer[offset + i];
byte expect = authenticData[(int)pos + i];
assertEquals("Read data mismatch at file offset " + (pos + i) +
". Expects " + expect + "; got " + actual,
expect, actual);
}
}
/**
* Test that the socket cache can be disabled by setting the capacity to
* 0. Regression test for HDFS-3365.
*/
@Test
public void testDisableCache() throws IOException {
LOG.info("Starting testDisableCache()");
// Configure a new instance with no caching, ensure that it doesn't
// cache anything
FileSystem fsWithoutCache = FileSystem.newInstance(conf);
try {
DFSTestUtil.readFile(fsWithoutCache, testFile);
assertEquals(0, ((DistributedFileSystem)fsWithoutCache).dfs.socketCache.size());
} finally {
fsWithoutCache.close();
}
}
@AfterClass
public static void teardownCluster() throws Exception {
util.shutdown();
}
}


@@ -42,6 +42,7 @@ import org.apache.hadoop.hdfs.DFSClient;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSTestUtil;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.net.TcpPeerServer;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;
@@ -145,8 +146,9 @@ public class TestBlockTokenWithDFS {
String file = BlockReaderFactory.getFileName(targetAddr,
"test-blockpoolid", block.getBlockId());
blockReader = BlockReaderFactory.newBlockReader(
conf, s, file, block,
lblock.getBlockToken(), 0, -1, null);
conf, file, block, lblock.getBlockToken(), 0, -1,
true, "TestBlockTokenWithDFS", TcpPeerServer.peerFromSocket(s),
nodes[0], null, false);
} catch (IOException ex) {
if (ex instanceof InvalidBlockTokenException) {


@@ -18,6 +18,7 @@
package org.apache.hadoop.hdfs.server.datanode;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
@@ -962,6 +963,12 @@ public class SimulatedFSDataset implements FsDatasetSpi<FsVolumeSpi> {
public BlockLocalPathInfo getBlockLocalPathInfo(ExtendedBlock b) {
throw new UnsupportedOperationException();
}
@Override
public FileInputStream[] getShortCircuitFdsForRead(ExtendedBlock block)
throws IOException {
throw new UnsupportedOperationException();
}
@Override
public HdfsBlocksMetadata getHdfsBlocksMetadata(List<ExtendedBlock> blocks)


@@ -32,11 +32,13 @@ import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.BlockReader;
import org.apache.hadoop.hdfs.BlockReaderFactory;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSTestUtil;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.net.TcpPeerServer;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
@@ -280,10 +282,11 @@ public class TestDataNodeVolumeFailure {
String file = BlockReaderFactory.getFileName(targetAddr,
"test-blockpoolid",
block.getBlockId());
BlockReaderFactory.newBlockReader(conf, s, file, block, lblock
.getBlockToken(), 0, -1, null);
// nothing - if it fails - it will throw an exception
BlockReader blockReader =
BlockReaderFactory.newBlockReader(conf, file, block,
lblock.getBlockToken(), 0, -1, true, "TestDataNodeVolumeFailure",
TcpPeerServer.peerFromSocket(s), datanode, null, false);
blockReader.close(null, null);
}
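Both call sites above migrate to the same factory signature; a compilable restatement of the new call shape follows. The wrapper method and the client-name string are mine, and the positional comments are my reading of the arguments, with the trailing two left exactly as both hunks pass them:

import java.io.IOException;
import java.net.Socket;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.BlockReader;
import org.apache.hadoop.hdfs.BlockReaderFactory;
import org.apache.hadoop.hdfs.net.TcpPeerServer;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
import org.apache.hadoop.hdfs.protocol.LocatedBlock;

class NewBlockReaderCallSketch {
  static BlockReader open(Configuration conf, String file, ExtendedBlock block,
      LocatedBlock lblock, Socket s, DatanodeInfo datanode) throws IOException {
    return BlockReaderFactory.newBlockReader(
        conf, file, block,
        lblock.getBlockToken(),          // block access token
        0, -1,                           // start offset; -1 reads to the end
        true,                            // verify checksums
        "ExampleClient",                 // client name recorded in logs
        TcpPeerServer.peerFromSocket(s), // wrap the already-connected socket
        datanode,                        // datanode to read from
        null, false);                    // trailing args as passed in both hunks
  }
}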
/**


@@ -1129,7 +1129,7 @@ public class TestCheckpoint {
throw new IOException(e);
}
final int EXPECTED_TXNS_FIRST_SEG = 12;
final int EXPECTED_TXNS_FIRST_SEG = 11;
// the following steps should have happened:
// edits_inprogress_1 -> edits_1-12 (finalized)


@@ -17,6 +17,7 @@
*/
package org.apache.hadoop.hdfs.server.namenode;
import static org.apache.hadoop.hdfs.DFSConfigKeys.*;
import static org.junit.Assert.assertEquals;
import java.io.File;
@@ -30,12 +31,14 @@ import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier;
import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenSecretManager;
import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeDirType;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.security.token.Token;
import org.junit.Test;
import static org.mockito.Mockito.*;
/**
* This class tests the creation and validation of a checkpoint.
@@ -163,4 +166,70 @@ public class TestSecurityTokenEditLog {
if(cluster != null) cluster.shutdown();
}
}
@Test(timeout=10000)
public void testEditsForCancelOnTokenExpire() throws IOException,
InterruptedException {
long renewInterval = 2000;
Configuration conf = new Configuration();
conf.setBoolean(
DFSConfigKeys.DFS_NAMENODE_DELEGATION_TOKEN_ALWAYS_USE_KEY, true);
conf.setLong(DFS_NAMENODE_DELEGATION_TOKEN_RENEW_INTERVAL_KEY, renewInterval);
conf.setLong(DFS_NAMENODE_DELEGATION_TOKEN_MAX_LIFETIME_KEY, renewInterval*2);
Text renewer = new Text(UserGroupInformation.getCurrentUser().getUserName());
FSImage fsImage = mock(FSImage.class);
FSEditLog log = mock(FSEditLog.class);
doReturn(log).when(fsImage).getEditLog();
FSNamesystem fsn = new FSNamesystem(conf, fsImage);
DelegationTokenSecretManager dtsm = fsn.getDelegationTokenSecretManager();
try {
dtsm.startThreads();
// get two tokens
Token<DelegationTokenIdentifier> token1 = fsn.getDelegationToken(renewer);
Token<DelegationTokenIdentifier> token2 = fsn.getDelegationToken(renewer);
DelegationTokenIdentifier ident1 =
(DelegationTokenIdentifier)token1.decodeIdentifier();
DelegationTokenIdentifier ident2 =
(DelegationTokenIdentifier)token2.decodeIdentifier();
// verify we got the tokens
verify(log, times(1)).logGetDelegationToken(eq(ident1), anyLong());
verify(log, times(1)).logGetDelegationToken(eq(ident2), anyLong());
// this is a little tricky because DTSM doesn't let us set the scan interval,
// so we need to periodically sleep, then stop/start threads to force a scan
// renew first token 1/2 to expire
Thread.sleep(renewInterval/2);
fsn.renewDelegationToken(token2);
verify(log, times(1)).logRenewDelegationToken(eq(ident2), anyLong());
// force scan and give it a little time to complete
dtsm.stopThreads(); dtsm.startThreads();
Thread.sleep(250);
// no token has expired yet
verify(log, times(0)).logCancelDelegationToken(eq(ident1));
verify(log, times(0)).logCancelDelegationToken(eq(ident2));
// sleep past expiration of 1st non-renewed token
Thread.sleep(renewInterval/2);
dtsm.stopThreads(); dtsm.startThreads();
Thread.sleep(250);
// non-renewed token should have implicitly been cancelled
verify(log, times(1)).logCancelDelegationToken(eq(ident1));
verify(log, times(0)).logCancelDelegationToken(eq(ident2));
// sleep past expiration of 2nd renewed token
Thread.sleep(renewInterval/2);
dtsm.stopThreads(); dtsm.startThreads();
Thread.sleep(250);
// both tokens should have been implicitly cancelled by now
verify(log, times(1)).logCancelDelegationToken(eq(ident1));
verify(log, times(1)).logCancelDelegationToken(eq(ident2));
} finally {
dtsm.stopThreads();
}
}
}
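The sleep/stop/start sequence above is the only way to force an expiry scan here, since the secret manager's scan interval is not settable in this version. A small helper capturing the pattern, assuming the imports already present in this test (the name forceTokenScan is mine):

// Restart the secret manager's background threads so its expiry-removal
// thread runs a scan promptly, then allow the settle time the test uses.
private static void forceTokenScan(DelegationTokenSecretManager dtsm)
    throws IOException, InterruptedException {
  dtsm.stopThreads();
  dtsm.startThreads();
  Thread.sleep(250);
}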


@@ -14,10 +14,11 @@ Trunk (Unreleased)
MAPREDUCE-4887. Add RehashPartitioner, to smooth distributions
with poor implementations of Object#hashCode(). (Radim Kolar via cutting)
IMPROVEMENTS
HADOOP-8562. Enhancements to support Hadoop on Windows Server and Windows
Azure environments. (See breakdown of tasks below for subtasks and
contributors)
MAPREDUCE-3787. [Gridmix] Optimize job monitoring and STRESS mode for
faster job submission. (amarrk)
IMPROVEMENTS
MAPREDUCE-3481. [Gridmix] Improve Gridmix STRESS mode. (amarrk)
@@ -30,9 +31,6 @@ Trunk (Unreleased)
MAPREDUCE-2733. [Gridmix] Gridmix3 cpu emulation system tests.
(Vinay Thota via amarrk)
MAPREDUCE-3008. Improvements to cumulative CPU emulation for short running
tasks in Gridmix. (amarrk)
MAPREDUCE-2836. Provide option to fail jobs when submitted to non-existent
fair scheduler pools. (Ahmed Radwan via todd)
@@ -71,39 +69,14 @@ Trunk (Unreleased)
MAPREDUCE-4735. Make arguments in TestDFSIO case insensitive.
(Brandon Li via suresh)
MAPREDUCE-5014. Extend Distcp to accept a custom CopyListing.
(Srikanth Sundarrajan via amareshwari)
BUG FIXES
MAPREDUCE-4272. SortedRanges.Range#compareTo is not spec compliant.
(Yu Gao via llu)
MAPREDUCE-4356. [Rumen] Provide access to the method
ParsedTask.obtainTaskAttempts(). (ravigummadi)
MAPREDUCE-4100. [Gridmix] Bug fixed in compression emulation feature for
map only jobs. (amarrk)
MAPREDUCE-4149. [Rumen] Rumen fails to parse certain counter
strings. (ravigummadi)
MAPREDUCE-4083. [Gridmix] NPE in cpu emulation. (amarrk)
MAPREDUCE-4087. [Gridmix] GenerateDistCacheData job of Gridmix can
become slow in some cases (ravigummadi).
MAPREDUCE-3953. [Gridmix] Gridmix throws NPE and does not simulate a
job if the trace contains null taskStatus for a task.
(ravigummadi)
MAPREDUCE-3829. [Gridmix] Gridmix should give better error message when
input data directory already exists and -generate option is
given. (ravigummadi)
MAPREDUCE-2722. [Gridmix] Gridmix simulated job's map's hdfsBytesRead
counter is wrong when compressed input is used. (ravigummadi)
MAPREDUCE-3757. [Rumen] Fixed Rumen Folder to adjust shuffleFinished and
sortFinished times when needed.(ravigummadi)
MAPREDUCE-3194. "mapred mradmin" command is broken in mrv2
(Jason Lowe via bobby)
@@ -155,10 +128,34 @@ Trunk (Unreleased)
MAPREDUCE-5012. Typo in javadoc for IdentityMapper class. (Adam Monsen
via suresh)
MAPREDUCE-5006. Fix failing streaming tests due to MAPREDUCE-4994.
(Sandy Ryza via tomwhite)
MAPREDUCE-5078. TestMRAppMaster fails on Windows due to mismatched path
separators. (Chris Nauroth via sseth)
Release 2.0.5-alpha - UNRELEASED
MAPREDUCE-4885. Streaming tests have multiple failures on Windows. (Chris
Nauroth via bikas)
BREAKDOWN OF HADOOP-8562 SUBTASKS
MAPREDUCE-4739. Some MapReduce tests fail to find winutils.
(Chris Nauroth via suresh)
MAPREDUCE-4780. MapReduce distribution build fails on Windows.
(Chris Nauroth via suresh)
MAPREDUCE-4790. MapReduce build script would be more readable using abspath.
(Chris Nauroth via suresh)
MAPREDUCE-4869. Fix TestMapReduceChildJVM. (Chris Nauroth via acmurthy)
MAPREDUCE-4870. Fix TestMRJobsWithHistoryService. (Chris Nauroth via acmurthy)
MAPREDUCE-4983. Fixed various platform specific assumptions in various tests,
so that they can pass on Windows too. (Chris Nauroth via vinodkv)
HADOOP-9372. Fix bad timeout annotations on tests.
(Arpit Agarwal via suresh)
Release 2.0.5-beta - UNRELEASED
INCOMPATIBLE CHANGES
@@ -166,31 +163,40 @@ Release 2.0.5-alpha - UNRELEASED
IMPROVEMENTS
MAPREDUCE-3008. Improvements to cumulative CPU emulation for short running
tasks in Gridmix. (amarrk via tgraves)
MAPREDUCE-5033. mapred shell script should respect usage flags
(--help -help -h). (Andrew Wang via atm)
MAPREDUCE-4892. Modify CombineFileInputFormat to not skew input splits'
allocation on small clusters. (Bikas Saha via vinodkv)
MAPREDUCE-4990. Construct debug strings conditionally in
ShuffleHandler.Shuffle#sendMapOutput(). (kkambatl via tucu)
MAPREDUCE-4875. coverage fixing for org.apache.hadoop.mapred
(Aleksey Gorshkov via bobby)
MAPREDUCE-5129. Allow tags to JobHistory for deeper analytics. (billie via
acmurthy)
OPTIMIZATIONS
MAPREDUCE-3787. [Gridmix] Optimize job monitoring and STRESS mode for
faster job submission. (amarrk via tgraves)
BUG FIXES
MAPREDUCE-5079. Changes job recovery to restore state directly from job
history, instead of simulating state machine events.
(Jason Lowe and Robert Parker via sseth)
MAPREDUCE-5113. Streaming input/output types are ignored with java
mapper/reducer. (sandyr via tucu)
MAPREDUCE-4981. Add WordMean, WordMedian, WordStandardDeviation
to ExamplesDriver. (Plamen Jeliazkov via shv)
MAPREDUCE-5098. Fix findbugs warnings in gridmix. (kkambatl via tucu)
MAPREDUCE-5059. Change average merge time on Job overview page to be the
time delta between the end of the shuffle and the start of the reduce.
(Omkar Vinit Joshi via vinodkv)
MAPREDUCE-5086. MR app master deletes staging dir when sent a reboot
command from the RM. (Jian He via jlowe)
Release 2.0.4-beta - UNRELEASED
INCOMPATIBLE CHANGES
NEW FEATURES
IMPROVEMENTS
MAPREDUCE-5033. mapred shell script should respect usage flags
(--help -help -h). (Andrew Wang via atm)
MAPREDUCE-4985. Add compression option to TestDFSIO usage.
(Plamen Jeliazkov via shv)
OPTIMIZATIONS
@@ -218,15 +224,108 @@ Release 2.0.4-beta - UNRELEASED
MAPREDUCE-5008. Merger progress miscounts with respect to EOF_MARKER.
(Sandy Ryza via tomwhite)
MAPREDUCE-5117. Changed MRClientProtocolPBClientImpl to be closeable and thus
fix failures in renewal of HistoryServer's delegation tokens. (Siddharth
Seth via vinodkv)
MAPREDUCE-4693. History server should include counters for failed tasks.
(Xuan Gong via sseth)
MAPREDUCE-4896. mapred queue -info spits out ugly exception when queue does
not exist. (sandyr via tucu)
MAPREDUCE-3685. Fix bugs in MergeManager to ensure compression codec is
appropriately used and that on-disk segments are correctly sorted on
file-size. (Anty Rao and Ravi Prakash via acmurthy)
MAPREDUCE-4571. TestHsWebServicesJobs fails on jdk7. (tgraves via tucu)
MAPREDUCE-4716. TestHsWebServicesJobsQuery.testJobsQueryStateInvalid
fails with jdk7. (tgraves via tucu)
MAPREDUCE-5075. DistCp leaks input file handles since ThrottledInputStream
does not close the wrapped InputStream. (Chris Nauroth via szetszwo)
MAPREDUCE-3872. Fix event handling races in ContainerLauncherImpl.
(Robert Kanter via sseth)
MAPREDUCE-5062. Fix MR AM to read max-retries from the RM. (Zhijie Shen via
vinodkv)
MAPREDUCE-3829. [Gridmix] Gridmix should give better error message when
input data directory already exists and -generate option is
given. (ravigummadi via tgraves)
MAPREDUCE-2722. [Gridmix] Gridmix simulated job's map's hdfsBytesRead
counter is wrong when compressed input is used. (ravigummadi via tgraves)
MAPREDUCE-3953. [Gridmix] Gridmix throws NPE and does not simulate a
job if the trace contains null taskStatus for a task. (ravigummadi via
tgraves)
MAPREDUCE-4087. [Gridmix] GenerateDistCacheData job of Gridmix can
become slow in some cases (ravigummadi via tgraves).
MAPREDUCE-5077. Remove mapreduce.util.ResourceCalculatorPlugin and related
code. (Karthik Kambatla via sseth)
MAPREDUCE-4083. [Gridmix] NPE in cpu emulation. (amarrk via tgraves)
MAPREDUCE-4100. [Gridmix] Bug fixed in compression emulation feature for
map only jobs. (amarrk via tgraves)
MAPREDUCE-4356. [Rumen] Provide access to the method
ParsedTask.obtainTaskAttempts(). (ravigummadi via tgraves)
MAPREDUCE-4149. [Rumen] Rumen fails to parse certain counter
strings. (ravigummadi via tgraves)
MAPREDUCE-3757. [Rumen] Fixed Rumen Folder to adjust shuffleFinished and
sortFinished times when needed. (Ravi Gummadi via tgraves)
MAPREDUCE-5138. Fix LocalDistributedCacheManager after YARN-112. (Omkar Vinit
Joshi via vinodkv)
MAPREDUCE-5113. Streaming input/output types are ignored with java
mapper/reducer. (sandyr via tucu)
MAPREDUCE-5098. Fix findbugs warnings in gridmix. (kkambatl via tucu)
MAPREDUCE-5086. MR app master deletes staging dir when sent a reboot
command from the RM. (Jian He via jlowe)
MAPREDUCE-5137. AM web UI: clicking on Map Task results in 500 error
(Thomas Graves via jlowe)
MAPREDUCE-5136. TestJobImpl->testJobNoTasks fails with IBM JAVA (Amir
Sanjar via jlowe)
MAPREDUCE-5139. Update MR AM to use the modified startContainer API after
YARN-486. (Xuan Gong via vinodkv)
Release 2.0.4-alpha - UNRELEASED
INCOMPATIBLE CHANGES
NEW FEATURES
IMPROVEMENTS
OPTIMIZATIONS
BUG FIXES
MAPREDUCE-5006. Fix failing streaming tests due to MAPREDUCE-4994.
(Sandy Ryza via tomwhite)
MAPREDUCE-5088. MR Client gets a renewer token exception while Oozie is
submitting a job (Daryn Sharp via cos)
MAPREDUCE-5138. Fix LocalDistributedCacheManager after YARN-112. (Omkar Vinit
Joshi via vinodkv)
MAPREDUCE-5117. Changed MRClientProtocolPBClientImpl to be closeable and thus
fix failures in renewal of HistoryServer's delegation tokens. (Siddharth
Seth via vinodkv)
MAPREDUCE-5083. MiniMRCluster should use a random component when creating an
actual cluster (Siddharth Seth via hitesh)
MAPREDUCE-5094. Disabled memory monitoring by default in MiniMRYarnCluster
to avoid some downstream tests failing. (Siddharth Seth via vinodkv)
Release 2.0.3-alpha - 2013-02-06
@@ -744,6 +843,18 @@ Release 2.0.0-alpha - 05-23-2012
MAPREDUCE-4444. nodemanager fails to start when one of the local-dirs is
bad (Jason Lowe via bobby)
Release 0.23.8 - UNRELEASED
INCOMPATIBLE CHANGES
NEW FEATURES
IMPROVEMENTS
OPTIMIZATIONS
BUG FIXES
Release 0.23.7 - UNRELEASED
INCOMPATIBLE CHANGES
@@ -758,6 +869,12 @@ Release 0.23.7 - UNRELEASED
MAPREDUCE-4989. JSONify DataTables input data for Attempts page (Ravi
Prakash via jlowe)
MAPREDUCE-5027. Shuffle does not limit number of outstanding connections
(Robert Parker via jeagles)
MAPREDUCE-4972. Coverage fixing for org.apache.hadoop.mapreduce.jobhistory
(Aleksey Gorshkov via bobby)
OPTIMIZATIONS
MAPREDUCE-4946. Fix a performance problem for large jobs by reducing the
@@ -777,11 +894,36 @@ Release 0.23.7 - UNRELEASED
MAPREDUCE-5009. Killing the Task Attempt slated for commit does not clear
the value from the Task commitAttempt member (Robert Parker via jeagles)
MAPREDUCE-4871. AM uses mapreduce.jobtracker.split.metainfo.maxsize but
mapred-default has mapreduce.job.split.metainfo.maxsize (Jason Lowe via
jeagles)
MAPREDUCE-4794. DefaultSpeculator generates error messages on normal
shutdown (Jason Lowe via jeagles)
MAPREDUCE-5043. Fetch failure processing can cause AM event queue to
backup and eventually OOM (Jason Lowe via bobby)
MAPREDUCE-5023. History Server Web Services missing Job Counters (Ravi
Prakash via tgraves)
MAPREDUCE-5060. Fetch failures that time out only count against the first
map task (Robert Joseph Evans via jlowe)
MAPREDUCE-5042. Reducer unable to fetch for a map task that was recovered
(Jason Lowe via bobby)
MAPREDUCE-5053. java.lang.InternalError from decompression codec cause
reducer to fail (Robert Parker via jeagles)
MAPREDUCE-4991. coverage for gridmix (Aleksey Gorshkov via tgraves)
MAPREDUCE-5007. fix coverage org.apache.hadoop.mapreduce.v2.hs (Aleksey
Gorshkov via tgraves)
MAPREDUCE-5137. AM web UI: clicking on Map Task results in 500 error
(Thomas Graves via jlowe)
Release 0.23.6 - UNRELEASED
INCOMPATIBLE CHANGES


@@ -24,9 +24,12 @@ import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.security.PrivilegedExceptionAction;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.concurrent.ConcurrentHashMap;
import org.apache.commons.io.IOUtils;
@@ -46,6 +49,7 @@ import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.TypeConverter;
import org.apache.hadoop.mapreduce.jobhistory.AMStartedEvent;
import org.apache.hadoop.mapreduce.jobhistory.EventReader;
@@ -54,6 +58,9 @@ import org.apache.hadoop.mapreduce.jobhistory.HistoryEvent;
import org.apache.hadoop.mapreduce.jobhistory.JobHistoryCopyService;
import org.apache.hadoop.mapreduce.jobhistory.JobHistoryEvent;
import org.apache.hadoop.mapreduce.jobhistory.JobHistoryEventHandler;
import org.apache.hadoop.mapreduce.jobhistory.JobHistoryParser;
import org.apache.hadoop.mapreduce.jobhistory.JobHistoryParser.JobInfo;
import org.apache.hadoop.mapreduce.jobhistory.JobHistoryParser.TaskAttemptInfo;
import org.apache.hadoop.mapreduce.jobhistory.JobHistoryParser.TaskInfo;
import org.apache.hadoop.mapreduce.security.TokenCache;
import org.apache.hadoop.mapreduce.security.token.JobTokenSecretManager;
@@ -61,6 +68,7 @@ import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
import org.apache.hadoop.mapreduce.v2.api.records.AMInfo;
import org.apache.hadoop.mapreduce.v2.api.records.JobId;
import org.apache.hadoop.mapreduce.v2.api.records.TaskId;
import org.apache.hadoop.mapreduce.v2.api.records.TaskState;
import org.apache.hadoop.mapreduce.v2.api.records.TaskType;
import org.apache.hadoop.mapreduce.v2.app.client.ClientService;
import org.apache.hadoop.mapreduce.v2.app.client.MRClientService;
@@ -74,6 +82,7 @@ import org.apache.hadoop.mapreduce.v2.app.job.TaskAttempt;
import org.apache.hadoop.mapreduce.v2.app.job.event.JobEvent;
import org.apache.hadoop.mapreduce.v2.app.job.event.JobEventType;
import org.apache.hadoop.mapreduce.v2.app.job.event.JobFinishEvent;
import org.apache.hadoop.mapreduce.v2.app.job.event.JobStartEvent;
import org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptEvent;
import org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptEventType;
import org.apache.hadoop.mapreduce.v2.app.job.event.TaskEvent;
@@ -84,8 +93,6 @@ import org.apache.hadoop.mapreduce.v2.app.launcher.ContainerLauncherEvent;
import org.apache.hadoop.mapreduce.v2.app.launcher.ContainerLauncherImpl;
import org.apache.hadoop.mapreduce.v2.app.local.LocalContainerAllocator;
import org.apache.hadoop.mapreduce.v2.app.metrics.MRAppMetrics;
import org.apache.hadoop.mapreduce.v2.app.recover.Recovery;
import org.apache.hadoop.mapreduce.v2.app.recover.RecoveryService;
import org.apache.hadoop.mapreduce.v2.app.rm.ContainerAllocator;
import org.apache.hadoop.mapreduce.v2.app.rm.ContainerAllocatorEvent;
import org.apache.hadoop.mapreduce.v2.app.rm.RMCommunicator;
@@ -94,6 +101,7 @@ import org.apache.hadoop.mapreduce.v2.app.rm.RMHeartbeatHandler;
import org.apache.hadoop.mapreduce.v2.app.speculate.DefaultSpeculator;
import org.apache.hadoop.mapreduce.v2.app.speculate.Speculator;
import org.apache.hadoop.mapreduce.v2.app.speculate.SpeculatorEvent;
import org.apache.hadoop.mapreduce.v2.jobhistory.JobHistoryUtils;
import org.apache.hadoop.mapreduce.v2.util.MRApps;
import org.apache.hadoop.mapreduce.v2.util.MRBuilderUtils;
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
@@ -167,7 +175,6 @@ public class MRAppMaster extends CompositeService {
private AppContext context;
private Dispatcher dispatcher;
private ClientService clientService;
private Recovery recoveryServ;
private ContainerAllocator containerAllocator;
private ContainerLauncher containerLauncher;
private EventHandler<CommitterEvent> committerEventHandler;
@@ -180,7 +187,6 @@ public class MRAppMaster extends CompositeService {
private OutputCommitter committer;
private JobEventDispatcher jobEventDispatcher;
private JobHistoryEventHandler jobHistoryEventHandler;
private boolean inRecovery = false;
private SpeculatorEventDispatcher speculatorEventDispatcher;
private Job job;
@@ -193,6 +199,8 @@ public class MRAppMaster extends CompositeService {
private String shutDownMessage = null;
JobStateInternal forcedState = null;
private long recoveredJobStartTime = 0;
public MRAppMaster(ApplicationAttemptId applicationAttemptId,
ContainerId containerId, String nmHost, int nmPort, int nmHttpPort,
long appSubmitTime, int maxAppAttempts) {
@@ -340,34 +348,9 @@ public class MRAppMaster extends CompositeService {
}
} else {
committer = createOutputCommitter(conf);
boolean recoveryEnabled = conf.getBoolean(
MRJobConfig.MR_AM_JOB_RECOVERY_ENABLE, true);
boolean recoverySupportedByCommitter = committer.isRecoverySupported();
// If a shuffle secret was not provided by the job client then this app
// attempt will generate one. However that disables recovery if there
// are reducers as the shuffle secret would be app attempt specific.
boolean shuffleKeyValidForRecovery = (numReduceTasks > 0 &&
TokenCache.getShuffleSecretKey(fsTokens) != null);
if (recoveryEnabled && recoverySupportedByCommitter
&& shuffleKeyValidForRecovery && appAttemptID.getAttemptId() > 1) {
LOG.info("Recovery is enabled. "
+ "Will try to recover from previous life on best effort basis.");
recoveryServ = createRecoveryService(context);
addIfService(recoveryServ);
dispatcher = recoveryServ.getDispatcher();
clock = recoveryServ.getClock();
inRecovery = true;
} else {
LOG.info("Not starting RecoveryService: recoveryEnabled: "
+ recoveryEnabled + " recoverySupportedByCommitter: "
+ recoverySupportedByCommitter + " shuffleKeyValidForRecovery: "
+ shuffleKeyValidForRecovery + " ApplicationAttemptID: "
+ appAttemptID.getAttemptId());
dispatcher = createDispatcher();
addIfService(dispatcher);
}
dispatcher = createDispatcher();
addIfService(dispatcher);
//service to handle requests from JobClient
clientService = createClientService(context);
@@ -595,15 +578,6 @@ public class MRAppMaster extends CompositeService {
return new JobFinishEventHandler();
}
/**
* Create the recovery service.
* @return an instance of the recovery service.
*/
protected Recovery createRecoveryService(AppContext appContext) {
return new RecoveryService(appContext.getApplicationAttemptId(),
appContext.getClock(), getCommitter(), isNewApiCommitter());
}
/** Create and initialize (but don't start) a single job.
* @param forcedState a state to force the job into or null for normal operation.
* @param diagnostic a diagnostic message to include with the job.
@@ -615,7 +589,8 @@ public class MRAppMaster extends CompositeService {
Job newJob =
new JobImpl(jobId, appAttemptID, conf, dispatcher.getEventHandler(),
taskAttemptListener, jobTokenSecretManager, fsTokens, clock,
completedTasksFromPreviousRun, metrics, newApiCommitter,
completedTasksFromPreviousRun, metrics,
committer, newApiCommitter,
currentUser.getUserName(), appSubmitTime, amInfos, context,
forcedState, diagnostic);
((RunningAppContext) context).jobs.put(newJob.getID(), newJob);
@@ -978,18 +953,8 @@ public class MRAppMaster extends CompositeService {
public void start() {
amInfos = new LinkedList<AMInfo>();
// Pull completedTasks etc from recovery
if (inRecovery) {
completedTasksFromPreviousRun = recoveryServ.getCompletedTasks();
amInfos = recoveryServ.getAMInfos();
} else {
// Get the amInfos anyways irrespective of whether recovery is enabled or
// not IF this is not the first AM generation
if (appAttemptID.getAttemptId() != 1) {
amInfos.addAll(readJustAMInfos());
}
}
completedTasksFromPreviousRun = new HashMap<TaskId, TaskInfo>();
processRecovery();
// Create an AMInfo for the current AM generation.
AMInfo amInfo =
@@ -1051,13 +1016,105 @@ public class MRAppMaster extends CompositeService {
startJobs();
}
private void processRecovery() {
if (appAttemptID.getAttemptId() == 1) {
return; // no need to recover on the first attempt
}
boolean recoveryEnabled = getConfig().getBoolean(
MRJobConfig.MR_AM_JOB_RECOVERY_ENABLE,
MRJobConfig.MR_AM_JOB_RECOVERY_ENABLE_DEFAULT);
boolean recoverySupportedByCommitter =
committer != null && committer.isRecoverySupported();
// If a shuffle secret was not provided by the job client, then this app
// attempt will generate one. However, that disables recovery if there
// are reducers, as the shuffle secret would be app-attempt specific.
int numReduceTasks = getConfig().getInt(MRJobConfig.NUM_REDUCES, 0);
boolean shuffleKeyValidForRecovery = (numReduceTasks > 0 &&
TokenCache.getShuffleSecretKey(fsTokens) != null);
if (recoveryEnabled && recoverySupportedByCommitter
&& shuffleKeyValidForRecovery) {
LOG.info("Recovery is enabled. "
+ "Will try to recover from previous life on best effort basis.");
try {
parsePreviousJobHistory();
} catch (IOException e) {
LOG.warn("Unable to parse prior job history, aborting recovery", e);
// try to get just the AMInfos
amInfos.addAll(readJustAMInfos());
}
} else {
LOG.info("Will not try to recover. recoveryEnabled: "
+ recoveryEnabled + " recoverySupportedByCommitter: "
+ recoverySupportedByCommitter + " shuffleKeyValidForRecovery: "
+ shuffleKeyValidForRecovery + " ApplicationAttemptID: "
+ appAttemptID.getAttemptId());
// Get the amInfos anyway, whether recovery is enabled or not
amInfos.addAll(readJustAMInfos());
}
}
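Condensed, the gate above is a single predicate; a hedged restatement using only names that appear in the hunk (the wrapper method and class names are mine, not the commit's):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.security.TokenCache;
import org.apache.hadoop.security.Credentials;

class RecoveryGateSketch {
  static boolean shouldAttemptRecovery(Configuration conf,
      OutputCommitter committer, Credentials fsTokens, int attemptId) {
    boolean recoveryEnabled = conf.getBoolean(
        MRJobConfig.MR_AM_JOB_RECOVERY_ENABLE,
        MRJobConfig.MR_AM_JOB_RECOVERY_ENABLE_DEFAULT);
    boolean supportedByCommitter =
        committer != null && committer.isRecoverySupported();
    // Only a job-client-supplied shuffle secret survives across attempts,
    // so recovery is refused unless reducers exist and the key is present.
    int numReduceTasks = conf.getInt(MRJobConfig.NUM_REDUCES, 0);
    boolean shuffleKeyValid = numReduceTasks > 0
        && TokenCache.getShuffleSecretKey(fsTokens) != null;
    return attemptId > 1 && recoveryEnabled && supportedByCommitter
        && shuffleKeyValid;
  }
}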
private static FSDataInputStream getPreviousJobHistoryStream(
Configuration conf, ApplicationAttemptId appAttemptId)
throws IOException {
Path historyFile = JobHistoryUtils.getPreviousJobHistoryPath(conf,
appAttemptId);
LOG.info("Previous history file is at " + historyFile);
return historyFile.getFileSystem(conf).open(historyFile);
}
private void parsePreviousJobHistory() throws IOException {
FSDataInputStream in = getPreviousJobHistoryStream(getConfig(),
appAttemptID);
JobHistoryParser parser = new JobHistoryParser(in);
JobInfo jobInfo = parser.parse();
Exception parseException = parser.getParseException();
if (parseException != null) {
LOG.info("Got an error parsing job-history file" +
", ignoring incomplete events.", parseException);
}
Map<org.apache.hadoop.mapreduce.TaskID, TaskInfo> taskInfos = jobInfo
.getAllTasks();
for (TaskInfo taskInfo : taskInfos.values()) {
if (TaskState.SUCCEEDED.toString().equals(taskInfo.getTaskStatus())) {
Iterator<Entry<TaskAttemptID, TaskAttemptInfo>> taskAttemptIterator =
taskInfo.getAllTaskAttempts().entrySet().iterator();
while (taskAttemptIterator.hasNext()) {
Map.Entry<TaskAttemptID, TaskAttemptInfo> currentEntry = taskAttemptIterator.next();
if (!jobInfo.getAllCompletedTaskAttempts().containsKey(currentEntry.getKey())) {
taskAttemptIterator.remove();
}
}
completedTasksFromPreviousRun
.put(TypeConverter.toYarn(taskInfo.getTaskId()), taskInfo);
LOG.info("Read from history task "
+ TypeConverter.toYarn(taskInfo.getTaskId()));
}
}
LOG.info("Read completed tasks from history "
+ completedTasksFromPreviousRun.size());
recoveredJobStartTime = jobInfo.getLaunchTime();
// recover AMInfos
List<JobHistoryParser.AMInfo> jhAmInfoList = jobInfo.getAMInfos();
if (jhAmInfoList != null) {
for (JobHistoryParser.AMInfo jhAmInfo : jhAmInfoList) {
AMInfo amInfo = MRBuilderUtils.newAMInfo(jhAmInfo.getAppAttemptId(),
jhAmInfo.getStartTime(), jhAmInfo.getContainerId(),
jhAmInfo.getNodeManagerHost(), jhAmInfo.getNodeManagerPort(),
jhAmInfo.getNodeManagerHttpPort());
amInfos.add(amInfo);
}
}
}
private List<AMInfo> readJustAMInfos() {
List<AMInfo> amInfos = new ArrayList<AMInfo>();
FSDataInputStream inputStream = null;
try {
inputStream =
RecoveryService.getPreviousJobHistoryFileStream(getConfig(),
appAttemptID);
inputStream = getPreviousJobHistoryStream(getConfig(), appAttemptID);
EventReader jobHistoryEventReader = new EventReader(inputStream);
// All AMInfos are contiguous. Track when the first AMStartedEvent
@@ -1108,7 +1165,8 @@ public class MRAppMaster extends CompositeService {
@SuppressWarnings("unchecked")
protected void startJobs() {
/** create a job-start event to get this ball rolling */
JobEvent startJobEvent = new JobEvent(job.getID(), JobEventType.JOB_START);
JobEvent startJobEvent = new JobStartEvent(job.getID(),
recoveredJobStartTime);
/** send the job-start event. this triggers the job execution. */
dispatcher.getEventHandler().handle(startJobEvent);
}


@@ -1,4 +1,4 @@
/*
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
@@ -15,6 +15,25 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
@InterfaceAudience.Private
package org.apache.hadoop.mapreduce.v2.app.recover;
import org.apache.hadoop.classification.InterfaceAudience;
package org.apache.hadoop.mapreduce.v2.app.job.event;
import org.apache.hadoop.mapreduce.v2.api.records.JobId;
public class JobStartEvent extends JobEvent {
long recoveredJobStartTime;
public JobStartEvent(JobId jobID) {
this(jobID, 0);
}
public JobStartEvent(JobId jobID, long recoveredJobStartTime) {
super(jobID, JobEventType.JOB_START);
this.recoveredJobStartTime = recoveredJobStartTime;
}
public long getRecoveredJobStartTime() {
return recoveredJobStartTime;
}
}


@@ -26,6 +26,7 @@ public enum TaskAttemptEventType {
//Producer:Task
TA_SCHEDULE,
TA_RESCHEDULE,
TA_RECOVER,
//Producer:Client, Task
TA_KILL,


@@ -0,0 +1,50 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.mapreduce.v2.app.job.event;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.jobhistory.JobHistoryParser.TaskAttemptInfo;
import org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptId;
public class TaskAttemptRecoverEvent extends TaskAttemptEvent {
private TaskAttemptInfo taInfo;
private OutputCommitter committer;
private boolean recoverAttemptOutput;
public TaskAttemptRecoverEvent(TaskAttemptId id, TaskAttemptInfo taInfo,
OutputCommitter committer, boolean recoverOutput) {
super(id, TaskAttemptEventType.TA_RECOVER);
this.taInfo = taInfo;
this.committer = committer;
this.recoverAttemptOutput = recoverOutput;
}
public TaskAttemptInfo getTaskAttemptInfo() {
return taInfo;
}
public OutputCommitter getCommitter() {
return committer;
}
public boolean getRecoverOutput() {
return recoverAttemptOutput;
}
}


@@ -28,6 +28,7 @@ public enum TaskEventType {
//Producer:Job
T_SCHEDULE,
T_RECOVER,
//Producer:Speculator
T_ADD_SPEC_ATTEMPT,


@@ -0,0 +1,50 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.mapreduce.v2.app.job.event;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.jobhistory.JobHistoryParser.TaskInfo;
import org.apache.hadoop.mapreduce.v2.api.records.TaskId;
public class TaskRecoverEvent extends TaskEvent {
private TaskInfo taskInfo;
private OutputCommitter committer;
private boolean recoverTaskOutput;
public TaskRecoverEvent(TaskId taskID, TaskInfo taskInfo,
OutputCommitter committer, boolean recoverTaskOutput) {
super(taskID, TaskEventType.T_RECOVER);
this.taskInfo = taskInfo;
this.committer = committer;
this.recoverTaskOutput = recoverTaskOutput;
}
public TaskInfo getTaskInfo() {
return taskInfo;
}
public OutputCommitter getOutputCommitter() {
return committer;
}
public boolean getRecoverTaskOutput() {
return recoverTaskOutput;
}
}
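Together with the TA_RECOVER and T_RECOVER values added above, these two events form a chain from the job down to individual attempts. A schematic sketch; the TaskImpl transition is assumed here, since it is not part of this excerpt:

import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.jobhistory.JobHistoryParser.TaskAttemptInfo;
import org.apache.hadoop.mapreduce.jobhistory.JobHistoryParser.TaskInfo;
import org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptId;
import org.apache.hadoop.mapreduce.v2.api.records.TaskId;
import org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptRecoverEvent;
import org.apache.hadoop.mapreduce.v2.app.job.event.TaskRecoverEvent;
import org.apache.hadoop.yarn.event.EventHandler;

class RecoveryFlowSketch {
  @SuppressWarnings({"unchecked", "rawtypes"})
  static void recoverOne(EventHandler eventHandler, TaskId taskId,
      TaskInfo taskInfo, TaskAttemptId attemptId, TaskAttemptInfo taInfo,
      OutputCommitter committer) {
    // 1. JobImpl.scheduleTasks() (later in this diff) raises T_RECOVER for
    //    each task found completed in the prior attempt's job history.
    eventHandler.handle(new TaskRecoverEvent(taskId, taskInfo, committer, true));
    // 2. TaskImpl's T_RECOVER transition (assumed) replays each prior attempt:
    eventHandler.handle(
        new TaskAttemptRecoverEvent(attemptId, taInfo, committer, true));
    // 3. TaskAttemptImpl's RecoverTransition invokes recover(...), which ends
    //    the attempt as SUCCEEDED, FAILED, or KILLED without relaunching it.
  }
}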


@@ -49,6 +49,7 @@ import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.JobACL;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.TypeConverter;
import org.apache.hadoop.mapreduce.jobhistory.JobFinishedEvent;
import org.apache.hadoop.mapreduce.jobhistory.JobHistoryEvent;
@@ -92,6 +93,7 @@ import org.apache.hadoop.mapreduce.v2.app.job.event.JobEvent;
import org.apache.hadoop.mapreduce.v2.app.job.event.JobEventType;
import org.apache.hadoop.mapreduce.v2.app.job.event.JobFinishEvent;
import org.apache.hadoop.mapreduce.v2.app.job.event.JobSetupFailedEvent;
import org.apache.hadoop.mapreduce.v2.app.job.event.JobStartEvent;
import org.apache.hadoop.mapreduce.v2.app.job.event.JobTaskAttemptCompletedEvent;
import org.apache.hadoop.mapreduce.v2.app.job.event.JobTaskAttemptFetchFailureEvent;
import org.apache.hadoop.mapreduce.v2.app.job.event.JobTaskEvent;
@@ -101,6 +103,7 @@ import org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptEventType;
import org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptKillEvent;
import org.apache.hadoop.mapreduce.v2.app.job.event.TaskEvent;
import org.apache.hadoop.mapreduce.v2.app.job.event.TaskEventType;
import org.apache.hadoop.mapreduce.v2.app.job.event.TaskRecoverEvent;
import org.apache.hadoop.mapreduce.v2.app.metrics.MRAppMetrics;
import org.apache.hadoop.mapreduce.v2.util.MRApps;
import org.apache.hadoop.mapreduce.v2.util.MRBuilderUtils;
@@ -159,6 +162,7 @@ public class JobImpl implements org.apache.hadoop.mapreduce.v2.app.job.Job,
private final Lock writeLock;
private final JobId jobId;
private final String jobName;
private final OutputCommitter committer;
private final boolean newApiCommitter;
private final org.apache.hadoop.mapreduce.JobID oldJobId;
private final TaskAttemptListener taskAttemptListener;
@@ -602,7 +606,7 @@ JobTokenSecretManager jobTokenSecretManager,
JobTokenSecretManager jobTokenSecretManager,
Credentials fsTokenCredentials, Clock clock,
Map<TaskId, TaskInfo> completedTasksFromPreviousRun, MRAppMetrics metrics,
boolean newApiCommitter, String userName,
OutputCommitter committer, boolean newApiCommitter, String userName,
long appSubmitTime, List<AMInfo> amInfos, AppContext appContext,
JobStateInternal forcedState, String forcedDiagnostic) {
this.applicationAttemptId = applicationAttemptId;
@@ -618,6 +622,7 @@ public class JobImpl implements org.apache.hadoop.mapreduce.v2.app.job.Job,
this.queueName = conf.get(MRJobConfig.QUEUE_NAME, "default");
this.appSubmitTime = appSubmitTime;
this.oldJobId = TypeConverter.fromYarn(jobId);
this.committer = committer;
this.newApiCommitter = newApiCommitter;
this.taskAttemptListener = taskAttemptListener;
@@ -888,10 +893,16 @@ public class JobImpl implements org.apache.hadoop.mapreduce.v2.app.job.Job,
}
}
protected void scheduleTasks(Set<TaskId> taskIDs) {
protected void scheduleTasks(Set<TaskId> taskIDs,
boolean recoverTaskOutput) {
for (TaskId taskID : taskIDs) {
eventHandler.handle(new TaskEvent(taskID,
TaskEventType.T_SCHEDULE));
TaskInfo taskInfo = completedTasksFromPreviousRun.remove(taskID);
if (taskInfo != null) {
eventHandler.handle(new TaskRecoverEvent(taskID, taskInfo,
committer, recoverTaskOutput));
} else {
eventHandler.handle(new TaskEvent(taskID, TaskEventType.T_SCHEDULE));
}
}
}
@@ -1421,7 +1432,7 @@ public class JobImpl implements org.apache.hadoop.mapreduce.v2.app.job.Job,
job.conf, splits[i],
job.taskAttemptListener,
job.jobToken, job.fsTokens,
job.clock, job.completedTasksFromPreviousRun,
job.clock,
job.applicationAttemptId.getAttemptId(),
job.metrics, job.appContext);
job.addTask(task);
@@ -1439,7 +1450,6 @@ public class JobImpl implements org.apache.hadoop.mapreduce.v2.app.job.Job,
job.conf, job.numMapTasks,
job.taskAttemptListener, job.jobToken,
job.fsTokens, job.clock,
job.completedTasksFromPreviousRun,
job.applicationAttemptId.getAttemptId(),
job.metrics, job.appContext);
job.addTask(task);
@@ -1475,8 +1485,8 @@ public class JobImpl implements org.apache.hadoop.mapreduce.v2.app.job.Job,
@Override
public void transition(JobImpl job, JobEvent event) {
job.setupProgress = 1.0f;
job.scheduleTasks(job.mapTasks); // schedule (i.e., start) the maps
job.scheduleTasks(job.reduceTasks);
job.scheduleTasks(job.mapTasks, job.numReduceTasks == 0);
job.scheduleTasks(job.reduceTasks, true);
// If we have no tasks, just transition to job completed
if (job.numReduceTasks == 0 && job.numMapTasks == 0) {
@@ -1507,7 +1517,12 @@ public class JobImpl implements org.apache.hadoop.mapreduce.v2.app.job.Job,
*/
@Override
public void transition(JobImpl job, JobEvent event) {
job.startTime = job.clock.getTime();
JobStartEvent jse = (JobStartEvent) event;
if (jse.getRecoveredJobStartTime() != 0) {
job.startTime = jse.getRecoveredJobStartTime();
} else {
job.startTime = job.clock.getTime();
}
JobInitedEvent jie =
new JobInitedEvent(job.oldJobId,
job.startTime,


@@ -18,17 +18,13 @@
package org.apache.hadoop.mapreduce.v2.app.job.impl;
import java.util.Map;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapTaskAttemptImpl;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.jobhistory.JobHistoryParser.TaskInfo;
import org.apache.hadoop.mapreduce.security.token.JobTokenIdentifier;
import org.apache.hadoop.mapreduce.split.JobSplit.TaskSplitMetaInfo;
import org.apache.hadoop.mapreduce.v2.api.records.JobId;
import org.apache.hadoop.mapreduce.v2.api.records.TaskId;
import org.apache.hadoop.mapreduce.v2.api.records.TaskType;
import org.apache.hadoop.mapreduce.v2.app.AppContext;
import org.apache.hadoop.mapreduce.v2.app.TaskAttemptListener;
@@ -49,11 +45,10 @@ public class MapTaskImpl extends TaskImpl {
TaskAttemptListener taskAttemptListener,
Token<JobTokenIdentifier> jobToken,
Credentials credentials, Clock clock,
Map<TaskId, TaskInfo> completedTasksFromPreviousRun, int startCount,
MRAppMetrics metrics, AppContext appContext) {
int appAttemptId, MRAppMetrics metrics, AppContext appContext) {
super(jobId, TaskType.MAP, partition, eventHandler, remoteJobConfFile,
conf, taskAttemptListener, jobToken, credentials, clock,
completedTasksFromPreviousRun, startCount, metrics, appContext);
appAttemptId, metrics, appContext);
this.taskSplitMetaInfo = taskSplitMetaInfo;
}


@@ -18,16 +18,12 @@
package org.apache.hadoop.mapreduce.v2.app.job.impl;
import java.util.Map;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.ReduceTaskAttemptImpl;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.jobhistory.JobHistoryParser.TaskInfo;
import org.apache.hadoop.mapreduce.security.token.JobTokenIdentifier;
import org.apache.hadoop.mapreduce.v2.api.records.JobId;
import org.apache.hadoop.mapreduce.v2.api.records.TaskId;
import org.apache.hadoop.mapreduce.v2.api.records.TaskType;
import org.apache.hadoop.mapreduce.v2.app.AppContext;
import org.apache.hadoop.mapreduce.v2.app.TaskAttemptListener;
@@ -47,11 +43,10 @@ public class ReduceTaskImpl extends TaskImpl {
int numMapTasks, TaskAttemptListener taskAttemptListener,
Token<JobTokenIdentifier> jobToken,
Credentials credentials, Clock clock,
Map<TaskId, TaskInfo> completedTasksFromPreviousRun, int startCount,
MRAppMetrics metrics, AppContext appContext) {
int appAttemptId, MRAppMetrics metrics, AppContext appContext) {
super(jobId, TaskType.REDUCE, partition, eventHandler, jobFile, conf,
taskAttemptListener, jobToken, credentials, clock,
completedTasksFromPreviousRun, startCount, metrics, appContext);
appAttemptId, metrics, appContext);
this.numMapTasks = numMapTasks;
}


@@ -56,10 +56,12 @@ import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.JobCounter;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskCounter;
import org.apache.hadoop.mapreduce.TypeConverter;
import org.apache.hadoop.mapreduce.jobhistory.JobHistoryEvent;
import org.apache.hadoop.mapreduce.jobhistory.JobHistoryParser.TaskAttemptInfo;
import org.apache.hadoop.mapreduce.jobhistory.MapAttemptFinishedEvent;
import org.apache.hadoop.mapreduce.jobhistory.ReduceAttemptFinishedEvent;
import org.apache.hadoop.mapreduce.jobhistory.TaskAttemptStartedEvent;
@@ -89,6 +91,7 @@ import org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptDiagnosticsUpdate
import org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptEvent;
import org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptEventType;
import org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptKillEvent;
import org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptRecoverEvent;
import org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptStatusUpdateEvent;
import org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptStatusUpdateEvent.TaskAttemptStatus;
import org.apache.hadoop.mapreduce.v2.app.job.event.TaskEventType;
@@ -111,6 +114,7 @@ import org.apache.hadoop.yarn.Clock;
import org.apache.hadoop.yarn.YarnException;
import org.apache.hadoop.yarn.api.ApplicationConstants.Environment;
import org.apache.hadoop.yarn.api.records.ApplicationAccessType;
import org.apache.hadoop.yarn.api.records.Container;
import org.apache.hadoop.yarn.api.records.ContainerId;
import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
import org.apache.hadoop.yarn.api.records.ContainerToken;
@@ -204,6 +208,11 @@ public abstract class TaskAttemptImpl implements
TaskAttemptEventType.TA_KILL, new KilledTransition())
.addTransition(TaskAttemptStateInternal.NEW, TaskAttemptStateInternal.FAILED,
TaskAttemptEventType.TA_FAILMSG, new FailedTransition())
.addTransition(TaskAttemptStateInternal.NEW,
EnumSet.of(TaskAttemptStateInternal.FAILED,
TaskAttemptStateInternal.KILLED,
TaskAttemptStateInternal.SUCCEEDED),
TaskAttemptEventType.TA_RECOVER, new RecoverTransition())
.addTransition(TaskAttemptStateInternal.NEW,
TaskAttemptStateInternal.NEW,
TaskAttemptEventType.TA_DIAGNOSTICS_UPDATE,
@@ -759,8 +768,8 @@ public abstract class TaskAttemptImpl implements
// The null fields are per-container and will be constructed for each
// container separately.
ContainerLaunchContext container = BuilderUtils
.newContainerLaunchContext(null, conf
.get(MRJobConfig.USER_NAME), null, localResources,
.newContainerLaunchContext(conf
.get(MRJobConfig.USER_NAME), localResources,
environment, null, serviceData, taskCredentialsBuffer,
applicationACLs);
@@ -769,10 +778,9 @@ public abstract class TaskAttemptImpl implements
static ContainerLaunchContext createContainerLaunchContext(
Map<ApplicationAccessType, String> applicationACLs,
ContainerId containerID, Configuration conf,
Token<JobTokenIdentifier> jobToken, Task remoteTask,
Configuration conf, Token<JobTokenIdentifier> jobToken, Task remoteTask,
final org.apache.hadoop.mapred.JobID oldJobId,
Resource assignedCapability, WrappedJvmID jvmID,
WrappedJvmID jvmID,
TaskAttemptListener taskAttemptListener,
Credentials credentials) {
@@ -805,7 +813,7 @@ public abstract class TaskAttemptImpl implements
// Construct the actual Container
ContainerLaunchContext container = BuilderUtils.newContainerLaunchContext(
containerID, commonContainerSpec.getUser(), assignedCapability,
commonContainerSpec.getUser(),
commonContainerSpec.getLocalResources(), myEnv, commands,
myServiceData, commonContainerSpec.getContainerTokens().duplicate(),
applicationACLs);
@@ -1082,6 +1090,102 @@ public abstract class TaskAttemptImpl implements
this.avataar = avataar;
}
@SuppressWarnings("unchecked")
public TaskAttemptStateInternal recover(TaskAttemptInfo taInfo,
OutputCommitter committer, boolean recoverOutput) {
containerID = taInfo.getContainerId();
containerNodeId = ConverterUtils.toNodeId(taInfo.getHostname() + ":"
+ taInfo.getPort());
containerMgrAddress = StringInterner.weakIntern(
containerNodeId.toString());
nodeHttpAddress = StringInterner.weakIntern(taInfo.getHostname() + ":"
+ taInfo.getHttpPort());
computeRackAndLocality();
launchTime = taInfo.getStartTime();
finishTime = (taInfo.getFinishTime() != -1) ?
taInfo.getFinishTime() : clock.getTime();
shufflePort = taInfo.getShufflePort();
trackerName = taInfo.getHostname();
httpPort = taInfo.getHttpPort();
sendLaunchedEvents();
reportedStatus.id = attemptId;
reportedStatus.progress = 1.0f;
reportedStatus.counters = taInfo.getCounters();
reportedStatus.stateString = taInfo.getState();
reportedStatus.phase = Phase.CLEANUP;
reportedStatus.mapFinishTime = taInfo.getMapFinishTime();
reportedStatus.shuffleFinishTime = taInfo.getShuffleFinishTime();
reportedStatus.sortFinishTime = taInfo.getSortFinishTime();
addDiagnosticInfo(taInfo.getError());
boolean needToClean = false;
String recoveredState = taInfo.getTaskStatus();
if (recoverOutput
&& TaskAttemptState.SUCCEEDED.toString().equals(recoveredState)) {
TaskAttemptContext tac = new TaskAttemptContextImpl(conf,
TypeConverter.fromYarn(attemptId));
try {
committer.recoverTask(tac);
LOG.info("Recovered output from task attempt " + attemptId);
} catch (Exception e) {
LOG.error("Unable to recover task attempt " + attemptId, e);
LOG.info("Task attempt " + attemptId + " will be recovered as KILLED");
recoveredState = TaskAttemptState.KILLED.toString();
needToClean = true;
}
}
TaskAttemptStateInternal attemptState;
if (TaskAttemptState.SUCCEEDED.toString().equals(recoveredState)) {
attemptState = TaskAttemptStateInternal.SUCCEEDED;
reportedStatus.taskState = TaskAttemptState.SUCCEEDED;
eventHandler.handle(createJobCounterUpdateEventTASucceeded(this));
logAttemptFinishedEvent(attemptState);
} else if (TaskAttemptState.FAILED.toString().equals(recoveredState)) {
attemptState = TaskAttemptStateInternal.FAILED;
reportedStatus.taskState = TaskAttemptState.FAILED;
eventHandler.handle(createJobCounterUpdateEventTAFailed(this, false));
TaskAttemptUnsuccessfulCompletionEvent tauce =
createTaskAttemptUnsuccessfulCompletionEvent(this,
TaskAttemptStateInternal.FAILED);
eventHandler.handle(
new JobHistoryEvent(attemptId.getTaskId().getJobId(), tauce));
} else {
if (!TaskAttemptState.KILLED.toString().equals(recoveredState)) {
if (String.valueOf(recoveredState).isEmpty()) {
LOG.info("TaskAttempt" + attemptId
+ " had not completed, recovering as KILLED");
} else {
LOG.warn("TaskAttempt " + attemptId + " found in unexpected state "
+ recoveredState + ", recovering as KILLED");
}
addDiagnosticInfo("Killed during application recovery");
needToClean = true;
}
attemptState = TaskAttemptStateInternal.KILLED;
reportedStatus.taskState = TaskAttemptState.KILLED;
eventHandler.handle(createJobCounterUpdateEventTAKilled(this, false));
TaskAttemptUnsuccessfulCompletionEvent tauce =
createTaskAttemptUnsuccessfulCompletionEvent(this,
TaskAttemptStateInternal.KILLED);
eventHandler.handle(
new JobHistoryEvent(attemptId.getTaskId().getJobId(), tauce));
}
if (needToClean) {
TaskAttemptContext tac = new TaskAttemptContextImpl(conf,
TypeConverter.fromYarn(attemptId));
try {
committer.abortTask(tac);
} catch (Exception e) {
LOG.warn("Task cleanup failed for attempt " + attemptId, e);
}
}
return attemptState;
}
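In summary, recover() resolves the recovered history status as sketched below; this condenses the branches above (the class and method names are mine, and the real method also emits counters, history events, and cleanup):

import org.apache.hadoop.mapreduce.v2.api.records.TaskAttemptState;
import org.apache.hadoop.mapreduce.v2.app.job.TaskAttemptStateInternal;

class RecoveredStateSketch {
  // outputRecovered is false only when output recovery was requested for a
  // SUCCEEDED attempt and committer.recoverTask() threw.
  static TaskAttemptStateInternal map(String recoveredState,
      boolean outputRecovered) {
    if (TaskAttemptState.SUCCEEDED.toString().equals(recoveredState)
        && outputRecovered) {
      return TaskAttemptStateInternal.SUCCEEDED;
    }
    if (TaskAttemptState.FAILED.toString().equals(recoveredState)) {
      return TaskAttemptStateInternal.FAILED;
    }
    // KILLED, empty, or unexpected states are recovered as KILLED; the real
    // method also adds a diagnostic and aborts the attempt's output.
    return TaskAttemptStateInternal.KILLED;
  }
}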
private static TaskAttemptState getExternalState(
TaskAttemptStateInternal smState) {
switch (smState) {
@@ -1122,6 +1226,24 @@ public abstract class TaskAttemptImpl implements
}
}
private void computeRackAndLocality() {
nodeRackName = RackResolver.resolve(
containerNodeId.getHost()).getNetworkLocation();
locality = Locality.OFF_SWITCH;
if (dataLocalHosts.size() > 0) {
String cHost = resolveHost(containerNodeId.getHost());
if (dataLocalHosts.contains(cHost)) {
locality = Locality.NODE_LOCAL;
}
}
if (locality == Locality.OFF_SWITCH) {
if (dataLocalRacks.contains(nodeRackName)) {
locality = Locality.RACK_LOCAL;
}
}
}
private static long computeSlotMillis(TaskAttemptImpl taskAttempt) {
TaskType taskType = taskAttempt.getID().getTaskId().getTaskType();
int slotMemoryReq =
@@ -1141,6 +1263,18 @@ public abstract class TaskAttemptImpl implements
return slotMillisIncrement;
}
private static JobCounterUpdateEvent createJobCounterUpdateEventTASucceeded(
TaskAttemptImpl taskAttempt) {
long slotMillis = computeSlotMillis(taskAttempt);
TaskId taskId = taskAttempt.attemptId.getTaskId();
JobCounterUpdateEvent jce = new JobCounterUpdateEvent(taskId.getJobId());
jce.addCounterUpdate(
taskId.getTaskType() == TaskType.MAP ?
JobCounter.SLOTS_MILLIS_MAPS : JobCounter.SLOTS_MILLIS_REDUCES,
slotMillis);
return jce;
}
private static JobCounterUpdateEvent createJobCounterUpdateEventTAFailed(
TaskAttemptImpl taskAttempt, boolean taskAlreadyCompleted) {
TaskType taskType = taskAttempt.getID().getTaskId().getTaskType();
@@ -1210,6 +1344,26 @@ public abstract class TaskAttemptImpl implements
return tauce;
}
@SuppressWarnings("unchecked")
private void sendLaunchedEvents() {
JobCounterUpdateEvent jce = new JobCounterUpdateEvent(attemptId.getTaskId()
.getJobId());
jce.addCounterUpdate(attemptId.getTaskId().getTaskType() == TaskType.MAP ?
JobCounter.TOTAL_LAUNCHED_MAPS : JobCounter.TOTAL_LAUNCHED_REDUCES, 1);
eventHandler.handle(jce);
LOG.info("TaskAttempt: [" + attemptId
+ "] using containerId: [" + containerID + " on NM: ["
+ containerMgrAddress + "]");
TaskAttemptStartedEvent tase =
new TaskAttemptStartedEvent(TypeConverter.fromYarn(attemptId),
TypeConverter.fromYarn(attemptId.getTaskId().getTaskType()),
launchTime, trackerName, httpPort, shufflePort, containerID,
locality.toString(), avataar.toString());
eventHandler.handle(
new JobHistoryEvent(attemptId.getTaskId().getJobId(), tase));
}
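// Accesses this attempt's progress-splits block under the read lock.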
private WrappedProgressSplitsBlock getProgressSplitBlock() {
readLock.lock();
try {
@@ -1342,8 +1496,6 @@ public abstract class TaskAttemptImpl implements
taskAttempt.containerNodeId.toString());
taskAttempt.nodeHttpAddress = StringInterner.weakIntern(
cEvent.getContainer().getNodeHttpAddress());
taskAttempt.nodeRackName = RackResolver.resolve(
taskAttempt.containerNodeId.getHost()).getNetworkLocation();
taskAttempt.containerToken = cEvent.getContainer().getContainerToken();
taskAttempt.assignedCapability = cEvent.getContainer().getResource();
// this is a _real_ Task (classic Hadoop mapred flavor):
@@ -1354,32 +1506,18 @@ public abstract class TaskAttemptImpl implements
taskAttempt.taskAttemptListener.registerPendingTask(
taskAttempt.remoteTask, taskAttempt.jvmID);
taskAttempt.locality = Locality.OFF_SWITCH;
if (taskAttempt.dataLocalHosts.size() > 0) {
String cHost = taskAttempt.resolveHost(
taskAttempt.containerNodeId.getHost());
if (taskAttempt.dataLocalHosts.contains(cHost)) {
taskAttempt.locality = Locality.NODE_LOCAL;
}
}
if (taskAttempt.locality == Locality.OFF_SWITCH) {
if (taskAttempt.dataLocalRacks.contains(taskAttempt.nodeRackName)) {
taskAttempt.locality = Locality.RACK_LOCAL;
}
}
taskAttempt.computeRackAndLocality();
//launch the container
//create the container object to be launched for a given Task attempt
ContainerLaunchContext launchContext = createContainerLaunchContext(
cEvent.getApplicationACLs(), taskAttempt.containerID,
taskAttempt.conf, taskAttempt.jobToken, taskAttempt.remoteTask,
taskAttempt.oldJobId, taskAttempt.assignedCapability,
taskAttempt.jvmID, taskAttempt.taskAttemptListener,
taskAttempt.credentials);
cEvent.getApplicationACLs(), taskAttempt.conf, taskAttempt.jobToken,
taskAttempt.remoteTask, taskAttempt.oldJobId, taskAttempt.jvmID,
taskAttempt.taskAttemptListener, taskAttempt.credentials);
taskAttempt.eventHandler.handle(new ContainerRemoteLaunchEvent(
taskAttempt.attemptId, taskAttempt.containerID,
taskAttempt.containerMgrAddress, taskAttempt.containerToken,
launchContext, taskAttempt.remoteTask));
launchContext, taskAttempt.assignedCapability, taskAttempt.remoteTask));
// send event to speculator that our container needs are satisfied
taskAttempt.eventHandler.handle
@@ -1471,27 +1609,7 @@ public abstract class TaskAttemptImpl implements
// Costly?
taskAttempt.trackerName = nodeHttpInetAddr.getHostName();
taskAttempt.httpPort = nodeHttpInetAddr.getPort();
JobCounterUpdateEvent jce =
new JobCounterUpdateEvent(taskAttempt.attemptId.getTaskId()
.getJobId());
jce.addCounterUpdate(
taskAttempt.attemptId.getTaskId().getTaskType() == TaskType.MAP ?
JobCounter.TOTAL_LAUNCHED_MAPS: JobCounter.TOTAL_LAUNCHED_REDUCES
, 1);
taskAttempt.eventHandler.handle(jce);
LOG.info("TaskAttempt: [" + taskAttempt.attemptId
+ "] using containerId: [" + taskAttempt.containerID + " on NM: ["
+ taskAttempt.containerMgrAddress + "]");
TaskAttemptStartedEvent tase =
new TaskAttemptStartedEvent(TypeConverter.fromYarn(taskAttempt.attemptId),
TypeConverter.fromYarn(taskAttempt.attemptId.getTaskId().getTaskType()),
taskAttempt.launchTime,
nodeHttpInetAddr.getHostName(), nodeHttpInetAddr.getPort(),
taskAttempt.shufflePort, taskAttempt.containerID,
taskAttempt.locality.toString(), taskAttempt.avataar.toString());
taskAttempt.eventHandler.handle
(new JobHistoryEvent(taskAttempt.attemptId.getTaskId().getJobId(), tase));
taskAttempt.sendLaunchedEvents();
taskAttempt.eventHandler.handle
(new SpeculatorEvent
(taskAttempt.attemptId, true, taskAttempt.clock.getTime()));
@@ -1540,14 +1658,8 @@ public abstract class TaskAttemptImpl implements
TaskAttemptEvent event) {
//set the finish time
taskAttempt.setFinishTime();
long slotMillis = computeSlotMillis(taskAttempt);
TaskId taskId = taskAttempt.attemptId.getTaskId();
JobCounterUpdateEvent jce = new JobCounterUpdateEvent(taskId.getJobId());
jce.addCounterUpdate(
taskId.getTaskType() == TaskType.MAP ?
JobCounter.SLOTS_MILLIS_MAPS : JobCounter.SLOTS_MILLIS_REDUCES,
slotMillis);
taskAttempt.eventHandler.handle(jce);
taskAttempt.eventHandler.handle(
createJobCounterUpdateEventTASucceeded(taskAttempt));
taskAttempt.logAttemptFinishedEvent(TaskAttemptStateInternal.SUCCEEDED);
taskAttempt.eventHandler.handle(new TaskTAttemptEvent(
taskAttempt.attemptId,
@@ -1585,6 +1697,18 @@ public abstract class TaskAttemptImpl implements
}
}
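// Recovery transition: replays a previously run attempt from the
// TaskAttemptRecoverEvent and returns the recovered internal state.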
private static class RecoverTransition implements
MultipleArcTransition<TaskAttemptImpl, TaskAttemptEvent, TaskAttemptStateInternal> {
@Override
public TaskAttemptStateInternal transition(TaskAttemptImpl taskAttempt,
TaskAttemptEvent event) {
TaskAttemptRecoverEvent tare = (TaskAttemptRecoverEvent) event;
return taskAttempt.recover(tare.getTaskAttemptInfo(),
tare.getCommitter(), tare.getRecoverOutput());
}
}
@SuppressWarnings({ "unchecked" })
private void logAttemptFinishedEvent(TaskAttemptStateInternal state) {
//Log finished events only if an attempt started.