Merge r1550130 through r1555020 from trunk.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/HDFS-5535@1555021 13f79535-47bb-0310-9956-ffa450edef68
Tsz-wo Sze 2014-01-03 07:26:52 +00:00
commit 498f9674ff
427 changed files with 18824 additions and 7189 deletions

View File

@@ -105,6 +105,9 @@ Trunk (Unreleased)
HADOOP-9833 move slf4j to version 1.7.5 (Kousuke Saruta via stevel)
+ HADOOP-10141. Create KeyProvider API to separate encryption key storage
+ from the applications. (omalley)
BUG FIXES
HADOOP-9451. Fault single-layer config if node group topology is enabled.
@@ -280,6 +283,8 @@ Trunk (Unreleased)
HDFS-5471. CacheAdmin -listPools fails when user lacks permissions to view
all pools (Andrew Wang via Colin Patrick McCabe)
+ HADOOP-10044 Improve the javadoc of rpc code (sanjay Radia)
OPTIMIZATIONS
HADOOP-7761. Improve the performance of raw comparisons. (todd)
@@ -395,6 +400,16 @@ Release 2.4.0 - UNRELEASED
HADOOP-10102. Update commons IO from 2.1 to 2.4 (Akira Ajisaka via stevel)
+ HADOOP-10168. fix javadoc of ReflectionUtils#copy. (Thejas Nair via suresh)
+ HADOOP-10164. Allow UGI to login with a known Subject (bobby)
+ HADOOP-10169. Remove the unnecessary synchronized in JvmMetrics class.
+ (Liang Xie via jing9)
+ HADOOP-10198. DomainSocket: add support for socketpair.
+ (Colin Patrick McCabe via wang)
OPTIMIZATIONS
HADOOP-9748. Reduce blocking on UGI.ensureInitialized (daryn)
@@ -402,6 +417,11 @@ Release 2.4.0 - UNRELEASED
HADOOP-10047. Add a direct-buffer based apis for compression. (Gopal V
via acmurthy)
+ HADOOP-10172. Cache SASL server factories (daryn)
+ HADOOP-10173. Remove UGI from DIGEST-MD5 SASL server creation (daryn via
+ kihwal)
BUG FIXES
HADOOP-9964. Fix deadlocks in TestHttpServer by synchronize
@@ -465,6 +485,19 @@ Release 2.4.0 - UNRELEASED
HADOOP-10058. TestMetricsSystemImpl#testInitFirstVerifyStopInvokedImmediately
fails on trunk (Chen He via jeagles)
+ HADOOP-8753. LocalDirAllocator throws "ArithmeticException: / by zero" when
+ there is no available space on configured local dir. (Benoy Antony via hitesh)
+ HADOOP-10106. Incorrect thread name in RPC log messages. (Ming Ma via jing9)
+ HADOOP-9611 mvn-rpmbuild against google-guice > 3.0 yields missing cglib
+ dependency (Timothy St. Clair via stevel)
+ HADOOP-10171. TestRPC fails intermittently on jkd7 (Mit Desai via jeagles)
+ HADOOP-10147 HDFS-5678 Upgrade to commons-logging 1.1.3 to avoid potential
+ deadlock in MiniDFSCluster (stevel)
Release 2.3.0 - UNRELEASED
INCOMPATIBLE CHANGES
@@ -538,6 +571,15 @@ Release 2.3.0 - UNRELEASED
HADOOP-10081. Client.setupIOStreams can leak socket resources on exception
or error (Tsuyoshi OZAWA via jlowe)
+ HADOOP-10087. UserGroupInformation.getGroupNames() fails to return primary
+ group first when JniBasedUnixGroupsMappingWithFallback is used (cmccabe)
+ HADOOP-10175. Har files system authority should preserve userinfo.
+ (Chuan Liu via cnauroth)
+ HADOOP-10090. Jobtracker metrics not updated properly after execution
+ of a mapreduce job. (ivanmi)
Release 2.2.0 - 2013-10-13
INCOMPATIBLE CHANGES

View File

@@ -209,6 +209,10 @@
<artifactId>protobuf-java</artifactId>
<scope>compile</scope>
</dependency>
+ <dependency>
+ <groupId>com.google.code.gson</groupId>
+ <artifactId>gson</artifactId>
+ </dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-auth</artifactId>

View File

@@ -0,0 +1,313 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.crypto.key;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import javax.crypto.spec.SecretKeySpec;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import java.net.URI;
import java.security.Key;
import java.security.KeyStore;
import java.security.KeyStoreException;
import java.security.NoSuchAlgorithmException;
import java.security.UnrecoverableKeyException;
import java.security.cert.CertificateException;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
/**
* KeyProvider based on Java's KeyStore file format. The file may be stored in
* any Hadoop FileSystem using the following name mangling:
* jceks://hdfs@nn1.example.com/my/keys.jks -> hdfs://nn1.example.com/my/keys.jks
* jceks://file/home/owen/keys.jks -> file:///home/owen/keys.jks
*
* The password for the keystore is taken from the HADOOP_KEYSTORE_PASSWORD
* environment variable with a default of 'none'.
*
* It is expected for encrypted InputFormats and OutputFormats to copy the keys
* from the original provider into the job's Credentials object, which is
* accessed via the UserProvider. Therefore, this provider won't be used by
* MapReduce tasks.
*/
@InterfaceAudience.Private
public class JavaKeyStoreProvider extends KeyProvider {
public static final String SCHEME_NAME = "jceks";
public static final String KEYSTORE_PASSWORD_NAME =
"HADOOP_KEYSTORE_PASSWORD";
public static final String KEYSTORE_PASSWORD_DEFAULT = "none";
private final URI uri;
private final Path path;
private final FileSystem fs;
private final KeyStore keyStore;
private final char[] password;
private boolean changed = false;
private final Map<String, Metadata> cache = new HashMap<String, Metadata>();
private JavaKeyStoreProvider(URI uri, Configuration conf) throws IOException {
this.uri = uri;
path = unnestUri(uri);
fs = FileSystem.get(conf);
// Get the password from the user's environment
String pw = System.getenv(KEYSTORE_PASSWORD_NAME);
if (pw == null) {
pw = KEYSTORE_PASSWORD_DEFAULT;
}
password = pw.toCharArray();
try {
keyStore = KeyStore.getInstance(SCHEME_NAME);
if (fs.exists(path)) {
keyStore.load(fs.open(path), password);
} else {
// required to create an empty keystore. *sigh*
keyStore.load(null, password);
}
} catch (KeyStoreException e) {
throw new IOException("Can't create keystore", e);
} catch (NoSuchAlgorithmException e) {
throw new IOException("Can't load keystore " + path, e);
} catch (CertificateException e) {
throw new IOException("Can't load keystore " + path, e);
}
}
@Override
public KeyVersion getKeyVersion(String versionName) throws IOException {
SecretKeySpec key = null;
try {
if (!keyStore.containsAlias(versionName)) {
return null;
}
key = (SecretKeySpec) keyStore.getKey(versionName, password);
} catch (KeyStoreException e) {
throw new IOException("Can't get key " + versionName + " from " +
path, e);
} catch (NoSuchAlgorithmException e) {
throw new IOException("Can't get algorithm for key " + key + " from " +
path, e);
} catch (UnrecoverableKeyException e) {
throw new IOException("Can't recover key " + key + " from " + path, e);
}
return new KeyVersion(versionName, key.getEncoded());
}
@Override
public Metadata getMetadata(String name) throws IOException {
if (cache.containsKey(name)) {
return cache.get(name);
}
try {
if (!keyStore.containsAlias(name)) {
return null;
}
Metadata meta = ((KeyMetadata) keyStore.getKey(name, password)).metadata;
cache.put(name, meta);
return meta;
} catch (KeyStoreException e) {
throw new IOException("Can't get metadata for " + name +
" from keystore " + path, e);
} catch (NoSuchAlgorithmException e) {
throw new IOException("Can't get algorithm for " + name +
" from keystore " + path, e);
} catch (UnrecoverableKeyException e) {
throw new IOException("Can't recover key for " + name +
" from keystore " + path, e);
}
}
@Override
public KeyVersion createKey(String name, byte[] material,
Options options) throws IOException {
try {
if (keyStore.containsAlias(name) || cache.containsKey(name)) {
throw new IOException("Key " + name + " already exists in " + this);
}
} catch (KeyStoreException e) {
throw new IOException("Problem looking up key " + name + " in " + this,
e);
}
Metadata meta = new Metadata(options.getCipher(), options.getBitLength(),
new Date(), 1);
if (options.getBitLength() != 8 * material.length) {
throw new IOException("Wrong key length. Required " +
options.getBitLength() + ", but got " + (8 * material.length));
}
cache.put(name, meta);
String versionName = buildVersionName(name, 0);
return innerSetKeyVersion(versionName, material, meta.getCipher());
}
@Override
public void deleteKey(String name) throws IOException {
Metadata meta = getMetadata(name);
if (meta == null) {
throw new IOException("Key " + name + " does not exist in " + this);
}
for(int v=0; v < meta.getVersions(); ++v) {
String versionName = buildVersionName(name, v);
try {
if (keyStore.containsAlias(versionName)) {
keyStore.deleteEntry(versionName);
}
} catch (KeyStoreException e) {
throw new IOException("Problem removing " + versionName + " from " +
this, e);
}
}
try {
if (keyStore.containsAlias(name)) {
keyStore.deleteEntry(name);
}
} catch (KeyStoreException e) {
throw new IOException("Problem removing " + name + " from " + this, e);
}
cache.remove(name);
changed = true;
}
KeyVersion innerSetKeyVersion(String versionName, byte[] material,
String cipher) throws IOException {
try {
keyStore.setKeyEntry(versionName, new SecretKeySpec(material, cipher),
password, null);
} catch (KeyStoreException e) {
throw new IOException("Can't store key " + versionName + " in " + this,
e);
}
changed = true;
return new KeyVersion(versionName, material);
}
@Override
public KeyVersion rollNewVersion(String name,
byte[] material) throws IOException {
Metadata meta = getMetadata(name);
if (meta == null) {
throw new IOException("Key " + name + " not found");
}
if (meta.getBitLength() != 8 * material.length) {
throw new IOException("Wrong key length. Required " +
meta.getBitLength() + ", but got " + (8 * material.length));
}
int nextVersion = meta.addVersion();
String versionName = buildVersionName(name, nextVersion);
return innerSetKeyVersion(versionName, material, meta.getCipher());
}
@Override
public void flush() throws IOException {
if (!changed) {
return;
}
// put all of the updates into the keystore
for(Map.Entry<String, Metadata> entry: cache.entrySet()) {
try {
keyStore.setKeyEntry(entry.getKey(), new KeyMetadata(entry.getValue()),
password, null);
} catch (KeyStoreException e) {
throw new IOException("Can't set metadata key " + entry.getKey(),e );
}
}
// write out the keystore
FSDataOutputStream out = fs.create(path, true);
try {
keyStore.store(out, password);
} catch (KeyStoreException e) {
throw new IOException("Can't store keystore " + this, e);
} catch (NoSuchAlgorithmException e) {
throw new IOException("No such algorithm storing keystore " + this, e);
} catch (CertificateException e) {
throw new IOException("Certificate exception storing keystore " + this,
e);
}
out.close();
changed = false;
}
@Override
public String toString() {
return uri.toString();
}
/**
* The factory to create JksProviders, which is used by the ServiceLoader.
*/
public static class Factory extends KeyProviderFactory {
@Override
public KeyProvider createProvider(URI providerName,
Configuration conf) throws IOException {
if (SCHEME_NAME.equals(providerName.getScheme())) {
return new JavaKeyStoreProvider(providerName, conf);
}
return null;
}
}
/**
* An adapter between a KeyStore Key and our Metadata. This is used to store
* the metadata in a KeyStore even though it isn't really a key.
*/
public static class KeyMetadata implements Key, Serializable {
private Metadata metadata;
private final static long serialVersionUID = 8405872419967874451L;
private KeyMetadata(Metadata meta) {
this.metadata = meta;
}
@Override
public String getAlgorithm() {
return metadata.getCipher();
}
@Override
public String getFormat() {
return "KeyMetadata";
}
@Override
public byte[] getEncoded() {
return new byte[0];
}
private void writeObject(ObjectOutputStream out) throws IOException {
byte[] serialized = metadata.serialize();
out.writeInt(serialized.length);
out.write(serialized);
}
private void readObject(ObjectInputStream in
) throws IOException, ClassNotFoundException {
byte[] buf = new byte[in.readInt()];
in.readFully(buf);
metadata = new Metadata(buf);
}
}
}
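The name mangling described in the class comment is implemented by KeyProvider.unnestUri (next file). A minimal sketch, not part of the patch, of how a keystore URI resolves to the underlying FileSystem path; the class name, host name and paths below are illustrative, and the scheme matches the SCHEME_NAME constant ("jceks") that the Factory above accepts.

import java.net.URI;

import org.apache.hadoop.crypto.key.KeyProvider;
import org.apache.hadoop.fs.Path;

public class JceksUriSketch {
  public static void main(String[] args) throws Exception {
    // Keystore kept in HDFS: jceks://hdfs@nn1.example.com/my/keys.jceks
    Path onHdfs = KeyProvider.unnestUri(
        new URI("jceks://hdfs@nn1.example.com/my/keys.jceks"));
    System.out.println(onHdfs); // resolves to hdfs://nn1.example.com/my/keys.jceks

    // Keystore kept on the local file system: jceks://file/home/owen/keys.jceks
    Path onLocal = KeyProvider.unnestUri(
        new URI("jceks://file/home/owen/keys.jceks"));
    System.out.println(onLocal); // resolves to a file: path under /home/owen
  }
}

The keystore password for either location still comes from the HADOOP_KEYSTORE_PASSWORD environment variable, falling back to "none", exactly as the constructor above reads it.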

View File

@@ -0,0 +1,384 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.crypto.key;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.URI;
import java.util.Date;
import java.util.List;
import com.google.gson.stream.JsonReader;
import com.google.gson.stream.JsonWriter;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
/**
* A provider of secret key material for Hadoop applications. Provides an
* abstraction to separate key storage from users of encryption. It
* is intended to support getting or storing keys in a variety of ways,
* including third party bindings.
*/
@InterfaceAudience.Public
@InterfaceStability.Unstable
public abstract class KeyProvider {
public static final String DEFAULT_CIPHER_NAME =
"hadoop.security.key.default.cipher";
public static final String DEFAULT_CIPHER = "AES/CTR/NoPadding";
public static final String DEFAULT_BITLENGTH_NAME =
"hadoop.security.key.default.bitlength";
public static final int DEFAULT_BITLENGTH = 256;
/**
* The combination of both the key version name and the key material.
*/
public static class KeyVersion {
private final String versionName;
private final byte[] material;
protected KeyVersion(String versionName,
byte[] material) {
this.versionName = versionName;
this.material = material;
}
public String getVersionName() {
return versionName;
}
public byte[] getMaterial() {
return material;
}
public String toString() {
StringBuilder buf = new StringBuilder();
buf.append("key(");
buf.append(versionName);
buf.append(")=");
if (material == null) {
buf.append("null");
} else {
for(byte b: material) {
buf.append(' ');
int right = b & 0xff;
if (right < 0x10) {
buf.append('0');
}
buf.append(Integer.toHexString(right));
}
}
return buf.toString();
}
}
/**
* Key metadata that is associated with the key.
*/
public static class Metadata {
private final static String CIPHER_FIELD = "cipher";
private final static String BIT_LENGTH_FIELD = "bitLength";
private final static String CREATED_FIELD = "created";
private final static String VERSIONS_FIELD = "versions";
private final String cipher;
private final int bitLength;
private final Date created;
private int versions;
protected Metadata(String cipher, int bitLength,
Date created, int versions) {
this.cipher = cipher;
this.bitLength = bitLength;
this.created = created;
this.versions = versions;
}
public Date getCreated() {
return created;
}
public String getCipher() {
return cipher;
}
/**
* Get the algorithm from the cipher.
* @return the algorithm name
*/
public String getAlgorithm() {
int slash = cipher.indexOf('/');
if (slash == - 1) {
return cipher;
} else {
return cipher.substring(0, slash);
}
}
public int getBitLength() {
return bitLength;
}
public int getVersions() {
return versions;
}
protected int addVersion() {
return versions++;
}
/**
* Serialize the metadata to a set of bytes.
* @return the serialized bytes
* @throws IOException
*/
protected byte[] serialize() throws IOException {
ByteArrayOutputStream buffer = new ByteArrayOutputStream();
JsonWriter writer = new JsonWriter(new OutputStreamWriter(buffer));
writer.beginObject();
if (cipher != null) {
writer.name(CIPHER_FIELD).value(cipher);
}
if (bitLength != 0) {
writer.name(BIT_LENGTH_FIELD).value(bitLength);
}
if (created != null) {
writer.name(CREATED_FIELD).value(created.getTime());
}
writer.name(VERSIONS_FIELD).value(versions);
writer.endObject();
writer.flush();
return buffer.toByteArray();
}
/**
* Deserialize a new metadata object from a set of bytes.
* @param bytes the serialized metadata
* @throws IOException
*/
protected Metadata(byte[] bytes) throws IOException {
String cipher = null;
int bitLength = 0;
Date created = null;
int versions = 0;
JsonReader reader = new JsonReader(new InputStreamReader
(new ByteArrayInputStream(bytes)));
reader.beginObject();
while (reader.hasNext()) {
String field = reader.nextName();
if (CIPHER_FIELD.equals(field)) {
cipher = reader.nextString();
} else if (BIT_LENGTH_FIELD.equals(field)) {
bitLength = reader.nextInt();
} else if (CREATED_FIELD.equals(field)) {
created = new Date(reader.nextLong());
} else if (VERSIONS_FIELD.equals(field)) {
versions = reader.nextInt();
}
}
reader.endObject();
this.cipher = cipher;
this.bitLength = bitLength;
this.created = created;
this.versions = versions;
}
}
/**
* Options when creating key objects.
*/
public static class Options {
private String cipher;
private int bitLength;
public Options(Configuration conf) {
cipher = conf.get(DEFAULT_CIPHER_NAME, DEFAULT_CIPHER);
bitLength = conf.getInt(DEFAULT_BITLENGTH_NAME, DEFAULT_BITLENGTH);
}
public Options setCipher(String cipher) {
this.cipher = cipher;
return this;
}
public Options setBitLength(int bitLength) {
this.bitLength = bitLength;
return this;
}
protected String getCipher() {
return cipher;
}
protected int getBitLength() {
return bitLength;
}
}
/**
* A helper function to create an options object.
* @param conf the configuration to use
* @return a new options object
*/
public static Options options(Configuration conf) {
return new Options(conf);
}
/**
* Get the key material for a specific version of the key. This method is used
* when decrypting data.
* @param versionName the name of a specific version of the key
* @return the key material
* @throws IOException
*/
public abstract KeyVersion getKeyVersion(String versionName
) throws IOException;
/**
* Get the current version of the key, which should be used for encrypting new
* data.
* @param name the base name of the key
* @return the version name of the current version of the key or null if the
* key version doesn't exist
* @throws IOException
*/
public KeyVersion getCurrentKey(String name) throws IOException {
Metadata meta = getMetadata(name);
if (meta == null) {
return null;
}
return getKeyVersion(buildVersionName(name, meta.getVersions() - 1));
}
/**
* Get metadata about the key.
* @param name the basename of the key
* @return the key's metadata or null if the key doesn't exist
* @throws IOException
*/
public abstract Metadata getMetadata(String name) throws IOException;
/**
* Create a new key. The given key must not already exist.
* @param name the base name of the key
* @param material the key material for the first version of the key.
* @param options the options for the new key.
* @return the version name of the first version of the key.
* @throws IOException
*/
public abstract KeyVersion createKey(String name, byte[] material,
Options options) throws IOException;
/**
* Delete the given key.
* @param name the name of the key to delete
* @throws IOException
*/
public abstract void deleteKey(String name) throws IOException;
/**
* Roll a new version of the given key.
* @param name the basename of the key
* @param material the new key material
* @return the name of the new version of the key
* @throws IOException
*/
public abstract KeyVersion rollNewVersion(String name,
byte[] material
) throws IOException;
/**
* Ensures that any changes to the keys are written to persistent store.
* @throws IOException
*/
public abstract void flush() throws IOException;
/**
* Split the versionName into a base name. Converts "/aaa/bbb@3" to
* "/aaa/bbb".
* @param versionName the version name to split
* @return the base name of the key
* @throws IOException
*/
public static String getBaseName(String versionName) throws IOException {
int div = versionName.lastIndexOf('@');
if (div == -1) {
throw new IOException("No version in key path " + versionName);
}
return versionName.substring(0, div);
}
/**
* Build a version string from a basename and version number. Converts
* "/aaa/bbb" and 3 to "/aaa/bbb@3".
* @param name the basename of the key
* @param version the version of the key
* @return the versionName of the key.
*/
protected static String buildVersionName(String name, int version) {
return name + "@" + version;
}
/**
* Convert a nested URI to decode the underlying path. The translation takes
* the authority and parses it into the underlying scheme and authority.
* For example, "myscheme://hdfs@nn/my/path" is converted to
* "hdfs://nn/my/path".
* @param nestedUri the URI from the nested URI
* @return the unnested path
*/
public static Path unnestUri(URI nestedUri) {
String[] parts = nestedUri.getAuthority().split("@", 2);
StringBuilder result = new StringBuilder(parts[0]);
result.append("://");
if (parts.length == 2) {
result.append(parts[1]);
}
result.append(nestedUri.getPath());
if (nestedUri.getQuery() != null) {
result.append("?");
result.append(nestedUri.getQuery());
}
if (nestedUri.getFragment() != null) {
result.append("#");
result.append(nestedUri.getFragment());
}
return new Path(result.toString());
}
/**
* Find the provider with the given key.
* @param providerList the list of providers
* @param keyName the key name we are looking for
* @return the KeyProvider that has the key
*/
public static KeyProvider findProvider(List<KeyProvider> providerList,
String keyName) throws IOException {
for(KeyProvider provider: providerList) {
if (provider.getMetadata(keyName) != null) {
return provider;
}
}
throw new IOException("Can't find KeyProvider for key " + keyName);
}
}
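A hedged usage sketch of the API above, not part of the patch: the provider would come from KeyProviderFactory (next file), and the key name, bit length and material here are placeholders chosen so the length checks in createKey/rollNewVersion pass.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.crypto.key.KeyProvider;

public class KeyProviderUsageSketch {
  static void roundTrip(KeyProvider provider, Configuration conf)
      throws Exception {
    // 16 bytes of material to match the 128-bit length requested below
    byte[] material = new byte[16];

    // Create "mykey"; the first version is named "mykey@0"
    KeyProvider.KeyVersion first = provider.createKey("mykey", material,
        KeyProvider.options(conf).setCipher("AES/CTR/NoPadding").setBitLength(128));

    // Encrypt new data with the current version; decrypt with a specific one
    KeyProvider.KeyVersion current = provider.getCurrentKey("mykey");
    KeyProvider.KeyVersion pinned =
        provider.getKeyVersion(first.getVersionName());

    // Roll to "mykey@1", then persist whatever the provider has buffered
    provider.rollNewVersion("mykey", new byte[16]);
    provider.flush();

    // Metadata carries the cipher, bit length, creation date and version count
    KeyProvider.Metadata meta = provider.getMetadata("mykey");
    System.out.println(meta.getCipher() + ", versions=" + meta.getVersions()
        + ", current=" + current.getVersionName()
        + ", pinned=" + pinned.getVersionName());
  }
}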

View File

@@ -0,0 +1,76 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.crypto.key;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;
import java.util.ServiceLoader;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
/**
* A factory to create a list of KeyProvider based on the path given in a
* Configuration. It uses a service loader interface to find the available
* KeyProviders and create them based on the list of URIs.
*/
@InterfaceAudience.Public
@InterfaceStability.Unstable
public abstract class KeyProviderFactory {
public static final String KEY_PROVIDER_PATH =
"hadoop.security.key.provider.path";
public abstract KeyProvider createProvider(URI providerName,
Configuration conf
) throws IOException;
private static final ServiceLoader<KeyProviderFactory> serviceLoader =
ServiceLoader.load(KeyProviderFactory.class);
public static List<KeyProvider> getProviders(Configuration conf
) throws IOException {
List<KeyProvider> result = new ArrayList<KeyProvider>();
for(String path: conf.getStringCollection(KEY_PROVIDER_PATH)) {
try {
URI uri = new URI(path);
boolean found = false;
for(KeyProviderFactory factory: serviceLoader) {
KeyProvider kp = factory.createProvider(uri, conf);
if (kp != null) {
result.add(kp);
found = true;
break;
}
}
if (!found) {
throw new IOException("No KeyProviderFactory for " + uri + " in " +
KEY_PROVIDER_PATH);
}
} catch (URISyntaxException error) {
throw new IOException("Bad configuration of " + KEY_PROVIDER_PATH +
" at " + path, error);
}
}
return result;
}
}
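A sketch of the lookup path this factory provides; the provider URIs are illustrative, and it assumes the Factory implementations above are registered for java.util.ServiceLoader (the META-INF/services registration is not shown in this excerpt).

import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.crypto.key.KeyProvider;
import org.apache.hadoop.crypto.key.KeyProviderFactory;

public class ProviderLookupSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Comma-separated list: a keystore-backed provider plus the UGI-backed one
    conf.set(KeyProviderFactory.KEY_PROVIDER_PATH,
        "jceks://file/tmp/test.jceks,user:///");

    List<KeyProvider> providers = KeyProviderFactory.getProviders(conf);
    for (KeyProvider provider : providers) {
      System.out.println("loaded " + provider);
    }
  }
}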

View File

@@ -0,0 +1,145 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.crypto.key;
import java.io.IOException;
import java.net.URI;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.security.Credentials;
import org.apache.hadoop.security.UserGroupInformation;
/**
* A KeyProvider factory for UGIs. It uses the credentials object associated
* with the current user to find keys. This provider is created using a
* URI of "user:///".
*/
@InterfaceAudience.Private
public class UserProvider extends KeyProvider {
public static final String SCHEME_NAME = "user";
private final UserGroupInformation user;
private final Credentials credentials;
private final Map<String, Metadata> cache = new HashMap<String, Metadata>();
private UserProvider() throws IOException {
user = UserGroupInformation.getCurrentUser();
credentials = user.getCredentials();
}
@Override
public KeyVersion getKeyVersion(String versionName) {
byte[] bytes = credentials.getSecretKey(new Text(versionName));
if (bytes == null) {
return null;
}
return new KeyVersion(versionName, bytes);
}
@Override
public Metadata getMetadata(String name) throws IOException {
if (cache.containsKey(name)) {
return cache.get(name);
}
byte[] serialized = credentials.getSecretKey(new Text(name));
if (serialized == null) {
return null;
}
Metadata result = new Metadata(serialized);
cache.put(name, result);
return result;
}
@Override
public KeyVersion createKey(String name, byte[] material,
Options options) throws IOException {
Text nameT = new Text(name);
if (credentials.getSecretKey(nameT) != null) {
throw new IOException("Key " + name + " already exists in " + this);
}
if (options.getBitLength() != 8 * material.length) {
throw new IOException("Wrong key length. Required " +
options.getBitLength() + ", but got " + (8 * material.length));
}
Metadata meta = new Metadata(options.getCipher(), options.getBitLength(),
new Date(), 1);
cache.put(name, meta);
String versionName = buildVersionName(name, 0);
credentials.addSecretKey(nameT, meta.serialize());
credentials.addSecretKey(new Text(versionName), material);
return new KeyVersion(versionName, material);
}
@Override
public void deleteKey(String name) throws IOException {
Metadata meta = getMetadata(name);
if (meta == null) {
throw new IOException("Key " + name + " does not exist in " + this);
}
for(int v=0; v < meta.getVersions(); ++v) {
credentials.removeSecretKey(new Text(buildVersionName(name, v)));
}
credentials.removeSecretKey(new Text(name));
cache.remove(name);
}
@Override
public KeyVersion rollNewVersion(String name,
byte[] material) throws IOException {
Metadata meta = getMetadata(name);
if (meta == null) {
throw new IOException("Key " + name + " not found");
}
if (meta.getBitLength() != 8 * material.length) {
throw new IOException("Wrong key length. Required " +
meta.getBitLength() + ", but got " + (8 * material.length));
}
int nextVersion = meta.addVersion();
credentials.addSecretKey(new Text(name), meta.serialize());
String versionName = buildVersionName(name, nextVersion);
credentials.addSecretKey(new Text(versionName), material);
return new KeyVersion(versionName, material);
}
@Override
public String toString() {
return SCHEME_NAME + ":///";
}
@Override
public void flush() {
user.addCredentials(credentials);
}
public static class Factory extends KeyProviderFactory {
@Override
public KeyProvider createProvider(URI providerName,
Configuration conf) throws IOException {
if (SCHEME_NAME.equals(providerName.getScheme())) {
return new UserProvider();
}
return null;
}
}
}
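A sketch of the pattern the class comment describes: key material is pushed into the current user's Credentials and read back through the user:/// scheme. It assumes the same ServiceLoader registration as above; names and material are placeholders.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.crypto.key.KeyProvider;
import org.apache.hadoop.crypto.key.KeyProviderFactory;

public class UserProviderSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    conf.set(KeyProviderFactory.KEY_PROVIDER_PATH, "user:///");

    // The only provider for "user:///" is a UserProvider backed by the
    // current user's Credentials object.
    KeyProvider provider = KeyProviderFactory.getProviders(conf).get(0);

    provider.createKey("job.key", new byte[16],
        KeyProvider.options(conf).setBitLength(128));
    provider.flush(); // pushes the updated credentials back onto the UGI

    System.out.println(provider.getCurrentKey("job.key").getVersionName());
  }
}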

View File

@@ -294,6 +294,10 @@ public class HarFileSystem extends FileSystem {
private String getHarAuth(URI underLyingUri) {
String auth = underLyingUri.getScheme() + "-";
if (underLyingUri.getHost() != null) {
+ if (underLyingUri.getUserInfo() != null) {
+ auth += underLyingUri.getUserInfo();
+ auth += "@";
+ }
auth += underLyingUri.getHost();
if (underLyingUri.getPort() != -1) {
auth += ":";

View File

@@ -365,6 +365,10 @@ public class LocalDirAllocator {
totalAvailable += availableOnDisk[i];
}
+ if (totalAvailable == 0){
+ throw new DiskErrorException("No space available in any of the local directories.");
+ }
// Keep rolling the wheel till we get a valid path
Random r = new java.util.Random();
while (numDirsSearched < numDirs && returnPath == null) {

View File

@@ -305,12 +305,13 @@ public class HttpServer implements FilterContainer {
}
}
- if (endpoints.size() == 0) {
+ if (endpoints.size() == 0 && connector == null) {
throw new HadoopIllegalArgumentException("No endpoints specified");
}
if (hostName == null) {
- hostName = endpoints.get(0).getHost();
+ hostName = endpoints.size() == 0 ? connector.getHost() : endpoints.get(
+ 0).getHost();
}
if (this.conf == null) {

View File

@@ -37,10 +37,24 @@ public class RpcConstants {
public static final int INVALID_RETRY_COUNT = -1;
+ /**
+ * The Rpc-connection header is as follows
+ * +----------------------------------+
+ * | "hrpc" 4 bytes |
+ * +----------------------------------+
+ * | Version (1 byte) |
+ * +----------------------------------+
+ * | Service Class (1 byte) |
+ * +----------------------------------+
+ * | AuthProtocol (1 byte) |
+ * +----------------------------------+
+ */
/**
* The first four bytes of Hadoop RPC connections
*/
public static final ByteBuffer HEADER = ByteBuffer.wrap("hrpc".getBytes());
+ public static final int HEADER_LEN_AFTER_HRPC_PART = 3; // 3 bytes that follow
// 1 : Introduce ping and server does not throw away RPCs
// 3 : Introduce the protocol into the RPC connection header
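A sketch (not from the patch; the version and service-class values are placeholders) of assembling a client preamble that matches the layout documented above: four magic bytes followed by the HEADER_LEN_AFTER_HRPC_PART trailing bytes.

import java.nio.ByteBuffer;

public class RpcPreambleSketch {
  // "hrpc" + version + service class + auth protocol = 7 bytes in total.
  static ByteBuffer buildPreamble(byte version, byte serviceClass,
      byte authProtocol) {
    ByteBuffer preamble = ByteBuffer.allocate(4 + 3);
    preamble.put("hrpc".getBytes());
    preamble.put(version);
    preamble.put(serviceClass);
    preamble.put(authProtocol);
    preamble.flip();
    return preamble;
  }
}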

View File

@@ -551,14 +551,14 @@ public abstract class Server {
@Override
public void run() {
- LOG.info("Starting " + getName());
+ LOG.info("Starting " + Thread.currentThread().getName());
try {
doRunLoop();
} finally {
try {
readSelector.close();
} catch (IOException ioe) {
- LOG.error("Error closing read selector in " + this.getName(), ioe);
+ LOG.error("Error closing read selector in " + Thread.currentThread().getName(), ioe);
}
}
}
@@ -589,7 +589,7 @@
}
} catch (InterruptedException e) {
if (running) { // unexpected -- log it
- LOG.info(getName() + " unexpectedly interrupted", e);
+ LOG.info(Thread.currentThread().getName() + " unexpectedly interrupted", e);
}
} catch (IOException ex) {
LOG.error("Error in Reader", ex);
@@ -620,7 +620,7 @@
@Override
public void run() {
- LOG.info(getName() + ": starting");
+ LOG.info(Thread.currentThread().getName() + ": starting");
SERVER.set(Server.this);
connectionManager.startIdleScan();
while (running) {
@@ -652,7 +652,7 @@
closeCurrentConnection(key, e);
}
}
- LOG.info("Stopping " + this.getName());
+ LOG.info("Stopping " + Thread.currentThread().getName());
synchronized (this) {
try {
@@ -710,14 +710,14 @@
try {
count = c.readAndProcess();
} catch (InterruptedException ieo) {
- LOG.info(getName() + ": readAndProcess caught InterruptedException", ieo);
+ LOG.info(Thread.currentThread().getName() + ": readAndProcess caught InterruptedException", ieo);
throw ieo;
} catch (Exception e) {
// a WrappedRpcServerException is an exception that has been sent
// to the client, so the stacktrace is unnecessary; any other
// exceptions are unexpected internal server errors and thus the
// stacktrace should be logged
- LOG.info(getName() + ": readAndProcess from client " +
+ LOG.info(Thread.currentThread().getName() + ": readAndProcess from client " +
c.getHostAddress() + " threw exception [" + e + "]",
(e instanceof WrappedRpcServerException) ? null : e);
count = -1; //so that the (count < 0) block is executed
@@ -740,7 +740,7 @@
try {
acceptChannel.socket().close();
} catch (IOException e) {
- LOG.info(getName() + ":Exception in closing listener socket. " + e);
+ LOG.info(Thread.currentThread().getName() + ":Exception in closing listener socket. " + e);
}
}
for (Reader r : readers) {
@@ -773,16 +773,16 @@
@Override
public void run() {
- LOG.info(getName() + ": starting");
+ LOG.info(Thread.currentThread().getName() + ": starting");
SERVER.set(Server.this);
try {
doRunLoop();
} finally {
- LOG.info("Stopping " + this.getName());
+ LOG.info("Stopping " + Thread.currentThread().getName());
try {
writeSelector.close();
} catch (IOException ioe) {
- LOG.error("Couldn't close write selector in " + this.getName(), ioe);
+ LOG.error("Couldn't close write selector in " + Thread.currentThread().getName(), ioe);
}
}
}
@@ -803,7 +803,7 @@
doAsyncWrite(key);
}
} catch (IOException e) {
- LOG.info(getName() + ": doAsyncWrite threw exception " + e);
+ LOG.info(Thread.currentThread().getName() + ": doAsyncWrite threw exception " + e);
}
}
long now = Time.now();
@@ -918,7 +918,7 @@
call = responseQueue.removeFirst();
SocketChannel channel = call.connection.channel;
if (LOG.isDebugEnabled()) {
- LOG.debug(getName() + ": responding to " + call);
+ LOG.debug(Thread.currentThread().getName() + ": responding to " + call);
}
//
// Send as much data as we can in the non-blocking fashion
@@ -937,7 +937,7 @@
done = false; // more calls pending to be sent.
}
if (LOG.isDebugEnabled()) {
- LOG.debug(getName() + ": responding to " + call
+ LOG.debug(Thread.currentThread().getName() + ": responding to " + call
+ " Wrote " + numBytes + " bytes.");
}
} else {
@@ -965,7 +965,7 @@
}
}
if (LOG.isDebugEnabled()) {
- LOG.debug(getName() + ": responding to " + call
+ LOG.debug(Thread.currentThread().getName() + ": responding to " + call
+ " Wrote partial " + numBytes + " bytes.");
}
}
@@ -973,7 +973,7 @@
}
} finally {
if (error && call != null) {
- LOG.warn(getName()+", call " + call + ": output error");
+ LOG.warn(Thread.currentThread().getName()+", call " + call + ": output error");
done = true; // error. no more data for this channel.
closeConnection(call.connection);
}
@@ -1105,6 +1105,9 @@
this.channel = channel;
this.lastContact = lastContact;
this.data = null;
+ // the buffer is initialized to read the "hrpc" and after that to read
+ // the length of the Rpc-packet (i.e 4 bytes)
this.dataLengthBuffer = ByteBuffer.allocate(4);
this.unwrappedData = null;
this.unwrappedDataLengthBuffer = ByteBuffer.allocate(4);
@@ -1200,7 +1203,16 @@
}
}
- private Throwable getCauseForInvalidToken(IOException e) {
+ /**
+ * Some exceptions ({@link RetriableException} and {@link StandbyException})
+ * that are wrapped as a cause of parameter e are unwrapped so that they can
+ * be sent as the true cause to the client side. In case of
+ * {@link InvalidToken} we go one level deeper to get the true cause.
+ *
+ * @param e the exception that may have a cause we want to unwrap.
+ * @return the true cause for some exceptions.
+ */
+ private Throwable getTrueCause(IOException e) {
Throwable cause = e;
while (cause != null) {
if (cause instanceof RetriableException) {
@@ -1223,6 +1235,18 @@
return e;
}
+ /**
+ * Process saslMessage and send saslResponse back
+ * @param saslMessage received SASL message
+ * @throws WrappedRpcServerException setup failed due to SASL negotiation
+ * failure, premature or invalid connection context, or other state
+ * errors. This exception needs to be sent to the client. This
+ * exception will wrap {@link RetriableException},
+ * {@link InvalidToken}, {@link StandbyException} or
+ * {@link SaslException}.
+ * @throws IOException if sending reply fails
+ * @throws InterruptedException
+ */
private void saslProcess(RpcSaslProto saslMessage)
throws WrappedRpcServerException, IOException, InterruptedException {
if (saslContextEstablished) {
@@ -1239,7 +1263,7 @@
// attempting user could be null
AUDITLOG.warn(AUTH_FAILED_FOR + this.toString() + ":"
+ attemptingUser + " (" + e.getLocalizedMessage() + ")");
- throw (IOException) getCauseForInvalidToken(e);
+ throw (IOException) getTrueCause(e);
}
if (saslServer != null && saslServer.isComplete()) {
@@ -1274,13 +1298,26 @@
}
}
+ /**
+ * Process a saslMessge.
+ * @param saslMessage received SASL message
+ * @return the sasl response to send back to client
+ * @throws SaslException if authentication or generating response fails,
+ * or SASL protocol mixup
+ * @throws IOException if a SaslServer cannot be created
+ * @throws AccessControlException if the requested authentication type
+ * is not supported or trying to re-attempt negotiation.
+ * @throws InterruptedException
+ */
private RpcSaslProto processSaslMessage(RpcSaslProto saslMessage)
- throws IOException, InterruptedException {
+ throws SaslException, IOException, AccessControlException,
+ InterruptedException {
RpcSaslProto saslResponse = null;
final SaslState state = saslMessage.getState(); // required
switch (state) {
case NEGOTIATE: {
if (sentNegotiate) {
+ // FIXME shouldn't this be SaslException?
throw new AccessControlException(
"Client already attempted negotiation");
}
@@ -1402,12 +1439,30 @@
}
}
+ /**
+ * This method reads in a non-blocking fashion from the channel:
+ * this method is called repeatedly when data is present in the channel;
+ * when it has enough data to process one rpc it processes that rpc.
+ *
+ * On the first pass, it processes the connectionHeader,
+ * connectionContext (an outOfBand RPC) and at most one RPC request that
+ * follows that. On future passes it will process at most one RPC request.
+ *
+ * Quirky things: dataLengthBuffer (4 bytes) is used to read "hrpc" OR
+ * rpc request length.
+ *
+ * @return -1 in case of error, else num bytes read so far
+ * @throws WrappedRpcServerException - an exception that has already been
+ * sent back to the client that does not require verbose logging
+ * by the Listener thread
+ * @throws IOException - internal error that should not be returned to
+ * client, typically failure to respond to client
+ * @throws InterruptedException
+ */
public int readAndProcess()
throws WrappedRpcServerException, IOException, InterruptedException {
while (true) {
- /* Read at most one RPC. If the header is not read completely yet
- * then iterate until we read first RPC or until there is no data left.
- */
+ // dataLengthBuffer is used to read "hrpc" or the rpc-packet length
int count = -1;
if (dataLengthBuffer.remaining() > 0) {
count = channelRead(channel, dataLengthBuffer);
@@ -1416,9 +1471,11 @@
}
if (!connectionHeaderRead) {
- //Every connection is expected to send the header.
+ // Every connection is expected to send the header;
+ // so far we read "hrpc" of the connection header.
if (connectionHeaderBuf == null) {
- connectionHeaderBuf = ByteBuffer.allocate(3);
+ // for the bytes that follow "hrpc", in the connection header
+ connectionHeaderBuf = ByteBuffer.allocate(HEADER_LEN_AFTER_HRPC_PART);
}
count = channelRead(channel, connectionHeaderBuf);
if (count < 0 || connectionHeaderBuf.remaining() > 0) {
@@ -1451,27 +1508,30 @@
// this may switch us into SIMPLE
authProtocol = initializeAuthContext(connectionHeaderBuf.get(2));
- dataLengthBuffer.clear();
+ dataLengthBuffer.clear(); // clear to next read rpc packet len
connectionHeaderBuf = null;
connectionHeaderRead = true;
- continue;
+ continue; // connection header read, now read 4 bytes rpc packet len
}
- if (data == null) {
+ if (data == null) { // just read 4 bytes - length of RPC packet
dataLengthBuffer.flip();
dataLength = dataLengthBuffer.getInt();
checkDataLength(dataLength);
+ // Set buffer for reading EXACTLY the RPC-packet length and no more.
data = ByteBuffer.allocate(dataLength);
}
+ // Now read the RPC packet
count = channelRead(channel, data);
if (data.remaining() == 0) {
- dataLengthBuffer.clear();
+ dataLengthBuffer.clear(); // to read length of future rpc packets
data.flip();
boolean isHeaderRead = connectionContextRead;
processOneRpc(data.array());
data = null;
+ // the last rpc-request we processed could have simply been the
+ // connectionContext; if so continue to read the first RPC.
if (!isHeaderRead) {
continue;
}
@@ -1508,8 +1568,16 @@
return authProtocol;
}
+ /**
+ * Process the Sasl's Negotiate request, including the optimization of
+ * accelerating token negotiation.
+ * @return the response to Negotiate request - the list of enabled
+ * authMethods and challenge if the TOKENS are supported.
+ * @throws SaslException - if attempt to generate challenge fails.
+ * @throws IOException - if it fails to create the SASL server for Tokens
+ */
private RpcSaslProto buildSaslNegotiateResponse()
- throws IOException, InterruptedException {
+ throws InterruptedException, SaslException, IOException {
RpcSaslProto negotiateMessage = negotiateResponse;
// accelerate token negotiation by sending initial challenge
// in the negotiation response
@@ -1635,8 +1703,11 @@
/**
* Process a wrapped RPC Request - unwrap the SASL packet and process
* each embedded RPC request
- * @param buf - SASL wrapped request of one or more RPCs
+ * @param inBuf - SASL wrapped request of one or more RPCs
* @throws IOException - SASL packet cannot be unwrapped
+ * @throws WrappedRpcServerException - an exception that has already been
+ * sent back to the client that does not require verbose logging
+ * by the Listener thread
* @throws InterruptedException
*/
private void unwrapPacketAndProcessRpcs(byte[] inBuf)
@@ -1677,13 +1748,21 @@
}
/**
- * Process an RPC Request - handle connection setup and decoding of
- * request into a Call
+ * Process one RPC Request from buffer read from socket stream
+ * - decode rpc in a rpc-Call
+ * - handle out-of-band RPC requests such as the initial connectionContext
+ * - A successfully decoded RpcCall will be deposited in RPC-Q and
+ * its response will be sent later when the request is processed.
+ *
+ * Prior to this call the connectionHeader ("hrpc...") has been handled and
+ * if SASL then SASL has been established and the buf we are passed
+ * has been unwrapped from SASL.
*
* @param buf - contains the RPC request header and the rpc request
* @throws IOException - internal error that should not be returned to
* client, typically failure to respond to client
- * @throws WrappedRpcServerException - an exception to be sent back to
- * the client that does not require verbose logging by the
+ * @throws WrappedRpcServerException - an exception that is sent back to the
+ * client in this method and does not require verbose logging by the
* Listener thread
* @throws InterruptedException
*/
@@ -1753,8 +1832,11 @@
}
/**
- * Process an RPC Request - the connection headers and context must
- * have been already read
+ * Process an RPC Request
+ * - the connection headers and context must have been already read.
+ * - Based on the rpcKind, decode the rpcRequest.
+ * - A successfully decoded RpcCall will be deposited in RPC-Q and
+ * its response will be sent later when the request is processed.
* @param header - RPC request header
* @param dis - stream to request payload
* @throws WrappedRpcServerException - due to fatal rpc layer issues such
@@ -1803,7 +1885,8 @@
* @param dis - stream to request payload
* @throws WrappedRpcServerException - setup failed due to SASL
* negotiation failure, premature or invalid connection context,
- * or other state errors
+ * or other state errors. This exception needs to be sent to the
+ * client.
* @throws IOException - failed to send a response back to the client
* @throws InterruptedException
*/
@@ -1928,7 +2011,7 @@
@Override
public void run() {
- LOG.debug(getName() + ": starting");
+ LOG.debug(Thread.currentThread().getName() + ": starting");
SERVER.set(Server.this);
ByteArrayOutputStream buf =
new ByteArrayOutputStream(INITIAL_RESP_BUF_SIZE);
@@ -1936,7 +2019,7 @@
try {
final Call call = callQueue.take(); // pop the queue; maybe blocked here
if (LOG.isDebugEnabled()) {
- LOG.debug(getName() + ": " + call + " for RpcKind " + call.rpcKind);
+ LOG.debug(Thread.currentThread().getName() + ": " + call + " for RpcKind " + call.rpcKind);
}
String errorClass = null;
String error = null;
@@ -1969,7 +2052,7 @@
if (e instanceof UndeclaredThrowableException) {
e = e.getCause();
}
- String logMsg = getName() + ", call " + call + ": error: " + e;
+ String logMsg = Thread.currentThread().getName() + ", call " + call + ": error: " + e;
if (e instanceof RuntimeException || e instanceof Error) {
// These exception types indicate something is probably wrong
// on the server side, as opposed to just a normal exceptional
@@ -2018,13 +2101,13 @@
}
} catch (InterruptedException e) {
if (running) { // unexpected -- log it
- LOG.info(getName() + " unexpectedly interrupted", e);
+ LOG.info(Thread.currentThread().getName() + " unexpectedly interrupted", e);
}
} catch (Exception e) {
- LOG.info(getName() + " caught an exception", e);
+ LOG.info(Thread.currentThread().getName() + " caught an exception", e);
}
}
- LOG.debug(getName() + ": exiting");
+ LOG.debug(Thread.currentThread().getName() + ": exiting");
}
}

View File

@@ -24,10 +24,8 @@ import java.lang.management.MemoryUsage;
import java.lang.management.ThreadInfo;
import java.lang.management.ThreadMXBean;
import java.lang.management.GarbageCollectorMXBean;
- import java.util.Map;
import java.util.List;
+ import java.util.concurrent.ConcurrentHashMap;
- import com.google.common.collect.Maps;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.log.metrics.EventCounter;
@@ -67,7 +65,8 @@ public class JvmMetrics implements MetricsSource {
ManagementFactory.getGarbageCollectorMXBeans();
final ThreadMXBean threadMXBean = ManagementFactory.getThreadMXBean();
final String processName, sessionId;
- final Map<String, MetricsInfo[]> gcInfoCache = Maps.newHashMap();
+ final ConcurrentHashMap<String, MetricsInfo[]> gcInfoCache =
+ new ConcurrentHashMap<String, MetricsInfo[]>();
JvmMetrics(String processName, String sessionId) {
this.processName = processName;
@@ -123,13 +122,17 @@ public class JvmMetrics implements MetricsSource {
.addCounter(GcTimeMillis, timeMillis);
}
- private synchronized MetricsInfo[] getGcInfo(String gcName) {
+ private MetricsInfo[] getGcInfo(String gcName) {
MetricsInfo[] gcInfo = gcInfoCache.get(gcName);
if (gcInfo == null) {
gcInfo = new MetricsInfo[2];
- gcInfo[0] = Interns.info("GcCount"+ gcName, "GC Count for "+ gcName);
- gcInfo[1] = Interns.info("GcTimeMillis"+ gcName, "GC Time for "+ gcName);
- gcInfoCache.put(gcName, gcInfo);
+ gcInfo[0] = Interns.info("GcCount" + gcName, "GC Count for " + gcName);
+ gcInfo[1] = Interns
+ .info("GcTimeMillis" + gcName, "GC Time for " + gcName);
+ MetricsInfo[] previousGcInfo = gcInfoCache.putIfAbsent(gcName, gcInfo);
+ if (previousGcInfo != null) {
+ return previousGcInfo;
+ }
}
return gcInfo;
}

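The getGcInfo() change above replaces a synchronized method with ConcurrentHashMap.putIfAbsent(). A minimal standalone sketch of the same lock-free caching idiom (hypothetical names, not the Hadoop class):

import java.util.concurrent.ConcurrentHashMap;

public class PutIfAbsentCacheSketch {
  private final ConcurrentHashMap<String, String[]> gcInfoCache =
      new ConcurrentHashMap<String, String[]>();

  String[] getGcInfo(String gcName) {
    String[] gcInfo = gcInfoCache.get(gcName);
    if (gcInfo == null) {
      gcInfo = new String[] { "GcCount" + gcName, "GcTimeMillis" + gcName };
      // If another thread raced us here, putIfAbsent returns its value and
      // we use that one, so all callers see the same cached array.
      String[] previous = gcInfoCache.putIfAbsent(gcName, gcInfo);
      if (previous != null) {
        return previous;
      }
    }
    return gcInfo;
  }
}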

@ -276,6 +276,24 @@ public class DomainSocket implements Closeable {
return new DomainSocket(path, fd); return new DomainSocket(path, fd);
} }
/**
* Create a pair of UNIX domain sockets which are connected to each other
* by calling socketpair(2).
*
* @return An array of two UNIX domain sockets connected to
* each other.
* @throws IOException on error.
*/
public static DomainSocket[] socketpair() throws IOException {
int fds[] = socketpair0();
return new DomainSocket[] {
new DomainSocket("(anonymous0)", fds[0]),
new DomainSocket("(anonymous1)", fds[1])
};
}
private static native int[] socketpair0() throws IOException;
private static native int accept0(int fd) throws IOException; private static native int accept0(int fd) throws IOException;
/** /**

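A hedged usage sketch of the new DomainSocket.socketpair() call; it assumes the existing stream accessors on DomainSocket and requires the native hadoop library to be loaded:

import java.io.IOException;
import org.apache.hadoop.net.unix.DomainSocket;

public class SocketpairSketch {
  public static void main(String[] args) throws IOException {
    // Two anonymous UNIX domain sockets, already connected to each other.
    DomainSocket[] pair = DomainSocket.socketpair();
    try {
      pair[0].getOutputStream().write(42);                  // write on one end
      System.out.println(pair[1].getInputStream().read());  // read it back
    } finally {
      pair[0].close();
      pair[1].close();
    }
  }
}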

@ -134,6 +134,14 @@ public class Credentials implements Writable {
secretKeysMap.put(alias, key); secretKeysMap.put(alias, key);
} }
/**
* Remove the key for a given alias.
* @param alias the alias for the key
*/
public void removeSecretKey(Text alias) {
secretKeysMap.remove(alias);
}
/** /**
* Convenience method for reading a token storage file, and loading the Tokens * Convenience method for reading a token storage file, and loading the Tokens
* therein in the passed UGI * therein in the passed UGI

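A small sketch pairing the new Credentials.removeSecretKey(Text) with the existing add/get methods; the alias below is made up for illustration:

import org.apache.hadoop.io.Text;
import org.apache.hadoop.security.Credentials;

public class RemoveSecretKeySketch {
  public static void main(String[] args) {
    Credentials creds = new Credentials();
    Text alias = new Text("key4@0");            // hypothetical alias
    creds.addSecretKey(alias, new byte[] { 1, 2, 3 });
    creds.removeSecretKey(alias);               // new in this change
    System.out.println(creds.getSecretKey(alias)); // prints null
  }
}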

@ -25,6 +25,10 @@ import java.io.DataOutput;
import java.io.IOException; import java.io.IOException;
import java.security.PrivilegedExceptionAction; import java.security.PrivilegedExceptionAction;
import java.security.Security; import java.security.Security;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.TreeMap; import java.util.TreeMap;
@ -38,6 +42,7 @@ import javax.security.sasl.RealmCallback;
import javax.security.sasl.Sasl; import javax.security.sasl.Sasl;
import javax.security.sasl.SaslException; import javax.security.sasl.SaslException;
import javax.security.sasl.SaslServer; import javax.security.sasl.SaslServer;
import javax.security.sasl.SaslServerFactory;
import org.apache.commons.codec.binary.Base64; import org.apache.commons.codec.binary.Base64;
import org.apache.commons.logging.Log; import org.apache.commons.logging.Log;
@ -63,6 +68,7 @@ public class SaslRpcServer {
public static final String SASL_DEFAULT_REALM = "default"; public static final String SASL_DEFAULT_REALM = "default";
public static final Map<String, String> SASL_PROPS = public static final Map<String, String> SASL_PROPS =
new TreeMap<String, String>(); new TreeMap<String, String>();
private static SaslServerFactory saslFactory;
public static enum QualityOfProtection { public static enum QualityOfProtection {
AUTHENTICATION("auth"), AUTHENTICATION("auth"),
@ -125,7 +131,7 @@ public class SaslRpcServer {
public SaslServer create(Connection connection, public SaslServer create(Connection connection,
SecretManager<TokenIdentifier> secretManager SecretManager<TokenIdentifier> secretManager
) throws IOException, InterruptedException { ) throws IOException, InterruptedException {
UserGroupInformation ugi = UserGroupInformation.getCurrentUser(); UserGroupInformation ugi = null;
final CallbackHandler callback; final CallbackHandler callback;
switch (authMethod) { switch (authMethod) {
case TOKEN: { case TOKEN: {
@ -133,6 +139,7 @@ public class SaslRpcServer {
break; break;
} }
case KERBEROS: { case KERBEROS: {
ugi = UserGroupInformation.getCurrentUser();
if (serverId.isEmpty()) { if (serverId.isEmpty()) {
throw new AccessControlException( throw new AccessControlException(
"Kerberos principal name does NOT have the expected " "Kerberos principal name does NOT have the expected "
@ -147,14 +154,20 @@ public class SaslRpcServer {
"Server does not support SASL " + authMethod); "Server does not support SASL " + authMethod);
} }
SaslServer saslServer = ugi.doAs( final SaslServer saslServer;
if (ugi != null) {
saslServer = ugi.doAs(
new PrivilegedExceptionAction<SaslServer>() { new PrivilegedExceptionAction<SaslServer>() {
@Override @Override
public SaslServer run() throws SaslException { public SaslServer run() throws SaslException {
return Sasl.createSaslServer(mechanism, protocol, serverId, return saslFactory.createSaslServer(mechanism, protocol, serverId,
SaslRpcServer.SASL_PROPS, callback); SaslRpcServer.SASL_PROPS, callback);
} }
}); });
} else {
saslServer = saslFactory.createSaslServer(mechanism, protocol, serverId,
SaslRpcServer.SASL_PROPS, callback);
}
if (saslServer == null) { if (saslServer == null) {
throw new AccessControlException( throw new AccessControlException(
"Unable to find SASL server implementation for " + mechanism); "Unable to find SASL server implementation for " + mechanism);
@ -180,6 +193,7 @@ public class SaslRpcServer {
SASL_PROPS.put(Sasl.QOP, saslQOP.getSaslQop()); SASL_PROPS.put(Sasl.QOP, saslQOP.getSaslQop());
SASL_PROPS.put(Sasl.SERVER_AUTH, "true"); SASL_PROPS.put(Sasl.SERVER_AUTH, "true");
Security.addProvider(new SaslPlainServer.SecurityProvider()); Security.addProvider(new SaslPlainServer.SecurityProvider());
saslFactory = new FastSaslServerFactory(SASL_PROPS);
} }
static String encodeIdentifier(byte[] identifier) { static String encodeIdentifier(byte[] identifier) {
@ -363,4 +377,47 @@ public class SaslRpcServer {
} }
} }
} }
// Sasl.createSaslServer is 100-200X slower than caching the factories!
private static class FastSaslServerFactory implements SaslServerFactory {
private final Map<String,List<SaslServerFactory>> factoryCache =
new HashMap<String,List<SaslServerFactory>>();
FastSaslServerFactory(Map<String,?> props) {
final Enumeration<SaslServerFactory> factories =
Sasl.getSaslServerFactories();
while (factories.hasMoreElements()) {
SaslServerFactory factory = factories.nextElement();
for (String mech : factory.getMechanismNames(props)) {
if (!factoryCache.containsKey(mech)) {
factoryCache.put(mech, new ArrayList<SaslServerFactory>());
}
factoryCache.get(mech).add(factory);
}
}
}
@Override
public SaslServer createSaslServer(String mechanism, String protocol,
String serverName, Map<String,?> props, CallbackHandler cbh)
throws SaslException {
SaslServer saslServer = null;
List<SaslServerFactory> factories = factoryCache.get(mechanism);
if (factories != null) {
for (SaslServerFactory factory : factories) {
saslServer = factory.createSaslServer(
mechanism, protocol, serverName, props, cbh);
if (saslServer != null) {
break;
}
}
}
return saslServer;
}
@Override
public String[] getMechanismNames(Map<String, ?> props) {
return factoryCache.keySet().toArray(new String[0]);
}
}
} }

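The comment in FastSaslServerFactory notes that Sasl.createSaslServer is far slower than going through cached factories. A standalone sketch of the enumerate-once idea it relies on (illustration only, not the Hadoop class):

import java.util.ArrayList;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import javax.security.sasl.Sasl;
import javax.security.sasl.SaslServerFactory;

public class SaslFactoryCacheSketch {
  public static void main(String[] args) {
    Map<String, List<SaslServerFactory>> byMech =
        new HashMap<String, List<SaslServerFactory>>();
    // Walk the installed factories once, then reuse the map per connection.
    Enumeration<SaslServerFactory> factories = Sasl.getSaslServerFactories();
    while (factories.hasMoreElements()) {
      SaslServerFactory factory = factories.nextElement();
      for (String mech : factory.getMechanismNames(null)) {
        if (!byMech.containsKey(mech)) {
          byMech.put(mech, new ArrayList<SaslServerFactory>());
        }
        byMech.get(mech).add(factory);
      }
    }
    System.out.println("cached mechanisms: " + byMech.keySet());
  }
}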

@ -682,9 +682,26 @@ public class UserGroupInformation {
public synchronized public synchronized
static UserGroupInformation getLoginUser() throws IOException { static UserGroupInformation getLoginUser() throws IOException {
if (loginUser == null) { if (loginUser == null) {
loginUserFromSubject(null);
}
return loginUser;
}
/**
* Log in a user using the given subject
* @param subject the subject to use when logging in a user, or null to
* create a new subject.
* @throws IOException if login fails
*/
@InterfaceAudience.Public
@InterfaceStability.Evolving
public synchronized
static void loginUserFromSubject(Subject subject) throws IOException {
ensureInitialized(); ensureInitialized();
try { try {
Subject subject = new Subject(); if (subject == null) {
subject = new Subject();
}
LoginContext login = LoginContext login =
newLoginContext(authenticationMethod.getLoginAppName(), newLoginContext(authenticationMethod.getLoginAppName(),
subject, new HadoopConfiguration()); subject, new HadoopConfiguration());
@ -719,8 +736,6 @@ public class UserGroupInformation {
LOG.debug("UGI loginUser:"+loginUser); LOG.debug("UGI loginUser:"+loginUser);
} }
} }
return loginUser;
}
@InterfaceAudience.Private @InterfaceAudience.Private
@InterfaceStability.Unstable @InterfaceStability.Unstable

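A hedged sketch of the new UserGroupInformation.loginUserFromSubject(Subject) entry point; the empty Subject stands in for one an embedding application would normally populate through its own JAAS login:

import java.io.IOException;
import javax.security.auth.Subject;
import org.apache.hadoop.security.UserGroupInformation;

public class LoginWithSubjectSketch {
  public static void main(String[] args) throws IOException {
    Subject subject = new Subject();  // placeholder; normally pre-populated
    // Passing null falls back to the old behaviour of creating a fresh Subject.
    UserGroupInformation.loginUserFromSubject(subject);
    System.out.println("login user: " + UserGroupInformation.getLoginUser());
  }
}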

@ -275,8 +275,9 @@ public class ReflectionUtils {
/** /**
* Make a copy of the writable object using serialization to a buffer * Make a copy of the writable object using serialization to a buffer
* @param dst the object to copy from * @param src the object to copy from
* @param src the object to copy into, which is destroyed * @param dst the object to copy into, which is destroyed
* @return dst param (the copy)
* @throws IOException * @throws IOException
*/ */
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")

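A usage sketch matching the corrected javadoc, assuming the usual copy(conf, src, dst) signature with Writable arguments:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.ReflectionUtils;

public class ReflectionCopySketch {
  public static void main(String[] args) throws IOException {
    Text src = new Text("hello");
    Text dst = new Text("overwritten");
    // src is serialized into dst; the returned object is dst itself.
    Text copy = ReflectionUtils.copy(new Configuration(), src, dst);
    System.out.println(copy + " " + (copy == dst));
  }
}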

@ -928,8 +928,10 @@ public class StringUtils {
* @param args List of arguments. * @param args List of arguments.
* @return null if the option was not found; the value of the * @return null if the option was not found; the value of the
* option otherwise. * option otherwise.
* @throws IllegalArgumentException if the option's argument is not present
*/ */
public static String popOptionWithArgument(String name, List<String> args) { public static String popOptionWithArgument(String name, List<String> args)
throws IllegalArgumentException {
String val = null; String val = null;
for (Iterator<String> iter = args.iterator(); iter.hasNext(); ) { for (Iterator<String> iter = args.iterator(); iter.hasNext(); ) {
String cur = iter.next(); String cur = iter.next();
@ -939,7 +941,7 @@ public class StringUtils {
} else if (cur.equals(name)) { } else if (cur.equals(name)) {
iter.remove(); iter.remove();
if (!iter.hasNext()) { if (!iter.hasNext()) {
throw new RuntimeException("option " + name + " requires 1 " + throw new IllegalArgumentException("option " + name + " requires 1 " +
"argument."); "argument.");
} }
val = iter.next(); val = iter.next();

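A hedged sketch of the new contract: a present option consumes its argument, while a missing argument now surfaces as IllegalArgumentException instead of a bare RuntimeException (the option name below is made up):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.hadoop.util.StringUtils;

public class PopOptionSketch {
  public static void main(String[] args) {
    List<String> ok = new ArrayList<String>(Arrays.asList("-ttl", "7d", "pool"));
    System.out.println(StringUtils.popOptionWithArgument("-ttl", ok)); // 7d
    System.out.println(ok); // [pool]

    List<String> broken = new ArrayList<String>(Arrays.asList("-ttl"));
    try {
      StringUtils.popOptionWithArgument("-ttl", broken);
    } catch (IllegalArgumentException e) {
      System.out.println("caught: " + e.getMessage());
    }
  }
}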

@ -364,6 +364,50 @@ JNIEnv *env, jclass clazz, jstring path)
return fd; return fd;
} }
#define SOCKETPAIR_ARRAY_LEN 2
JNIEXPORT jarray JNICALL
Java_org_apache_hadoop_net_unix_DomainSocket_socketpair0(
JNIEnv *env, jclass clazz)
{
jarray arr = NULL;
int idx, err, fds[SOCKETPAIR_ARRAY_LEN] = { -1, -1 };
jthrowable jthr = NULL;
arr = (*env)->NewIntArray(env, SOCKETPAIR_ARRAY_LEN);
jthr = (*env)->ExceptionOccurred(env);
if (jthr) {
(*env)->ExceptionClear(env);
goto done;
}
if (socketpair(PF_UNIX, SOCK_STREAM, 0, fds) < 0) {
err = errno;
jthr = newSocketException(env, err,
"socketpair(2) error: %s", terror(err));
goto done;
}
(*env)->SetIntArrayRegion(env, arr, 0, SOCKETPAIR_ARRAY_LEN, fds);
jthr = (*env)->ExceptionOccurred(env);
if (jthr) {
(*env)->ExceptionClear(env);
goto done;
}
done:
if (jthr) {
(*env)->DeleteLocalRef(env, arr);
arr = NULL;
for (idx = 0; idx < SOCKETPAIR_ARRAY_LEN; idx++) {
if (fds[idx] >= 0) {
close(fds[idx]);
fds[idx] = -1;
}
}
(*env)->Throw(env, jthr);
}
return arr;
}
JNIEXPORT jint JNICALL JNIEXPORT jint JNICALL
Java_org_apache_hadoop_net_unix_DomainSocket_accept0( Java_org_apache_hadoop_net_unix_DomainSocket_accept0(
JNIEnv *env, jclass clazz, jint fd) JNIEnv *env, jclass clazz, jint fd)


@ -122,13 +122,43 @@ int hadoop_user_info_fetch(struct hadoop_user_info *uinfo,
} }
} }
static int put_primary_gid_first(struct hadoop_user_info *uinfo)
{
int i, num_gids = uinfo->num_gids;
gid_t first_gid;
gid_t gid;
gid_t primary = uinfo->pwd.pw_gid;
if (num_gids < 1) {
// There are no gids, but we expected at least one.
return EINVAL;
}
first_gid = uinfo->gids[0];
if (first_gid == primary) {
// First gid is already the primary.
return 0;
}
for (i = 1; i < num_gids; i++) {
gid = uinfo->gids[i];
if (gid == primary) {
// swap first gid and this gid.
uinfo->gids[0] = gid;
uinfo->gids[i] = first_gid;
return 0;
}
}
// Did not find the primary gid in the list.
return EINVAL;
}
int hadoop_user_info_getgroups(struct hadoop_user_info *uinfo) int hadoop_user_info_getgroups(struct hadoop_user_info *uinfo)
{ {
int ret, ngroups; int ret, ngroups;
gid_t *ngids; gid_t *ngids;
if (!uinfo->pwd.pw_name) { if (!uinfo->pwd.pw_name) {
return EINVAL; // invalid user info // invalid user info
return EINVAL;
} }
uinfo->num_gids = 0; uinfo->num_gids = 0;
if (!uinfo->gids) { if (!uinfo->gids) {
@ -141,8 +171,12 @@ int hadoop_user_info_getgroups(struct hadoop_user_info *uinfo)
ngroups = uinfo->gids_size; ngroups = uinfo->gids_size;
ret = getgrouplist(uinfo->pwd.pw_name, uinfo->pwd.pw_gid, ret = getgrouplist(uinfo->pwd.pw_name, uinfo->pwd.pw_gid,
uinfo->gids, &ngroups); uinfo->gids, &ngroups);
if (ret != -1) { if (ret > 0) {
uinfo->num_gids = ngroups; uinfo->num_gids = ngroups;
ret = put_primary_gid_first(uinfo);
if (ret) {
return ret;
}
return 0; return 0;
} }
ngids = realloc(uinfo->gids, sizeof(uinfo->gids[0]) * ngroups); ngids = realloc(uinfo->gids, sizeof(uinfo->gids[0]) * ngroups);
@ -153,11 +187,12 @@ int hadoop_user_info_getgroups(struct hadoop_user_info *uinfo)
uinfo->gids_size = ngroups; uinfo->gids_size = ngroups;
ret = getgrouplist(uinfo->pwd.pw_name, uinfo->pwd.pw_gid, ret = getgrouplist(uinfo->pwd.pw_name, uinfo->pwd.pw_gid,
uinfo->gids, &ngroups); uinfo->gids, &ngroups);
if (ret != -1) { if (ret < 0) {
uinfo->num_gids = ngroups;
return 0;
}
return EIO; return EIO;
}
uinfo->num_gids = ngroups;
ret = put_primary_gid_first(uinfo);
return ret;
} }
#ifdef USER_TESTING #ifdef USER_TESTING


@ -0,0 +1,17 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
org.apache.hadoop.crypto.key.JavaKeyStoreProvider$Factory
org.apache.hadoop.crypto.key.UserProvider$Factory


@ -0,0 +1,112 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.crypto.key;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.junit.Test;
import java.io.IOException;
import java.net.URI;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Date;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.assertArrayEquals;
public class TestKeyProvider {
@Test
public void testBuildVersionName() throws Exception {
assertEquals("/a/b@3", KeyProvider.buildVersionName("/a/b", 3));
assertEquals("/aaa@12", KeyProvider.buildVersionName("/aaa", 12));
}
@Test
public void testParseVersionName() throws Exception {
assertEquals("/a/b", KeyProvider.getBaseName("/a/b@3"));
assertEquals("/aaa", KeyProvider.getBaseName("/aaa@112"));
try {
KeyProvider.getBaseName("no-slashes");
assertTrue("should have thrown", false);
} catch (IOException e) {
assertTrue(true);
}
}
@Test
public void testKeyMaterial() throws Exception {
byte[] key1 = new byte[]{1,2,3,4};
KeyProvider.KeyVersion obj = new KeyProvider.KeyVersion("key1@1", key1);
assertEquals("key1@1", obj.getVersionName());
assertArrayEquals(new byte[]{1,2,3,4}, obj.getMaterial());
}
@Test
public void testMetadata() throws Exception {
DateFormat format = new SimpleDateFormat("y/m/d");
Date date = format.parse("2013/12/25");
KeyProvider.Metadata meta = new KeyProvider.Metadata("myCipher", 100,
date, 123);
assertEquals("myCipher", meta.getCipher());
assertEquals(100, meta.getBitLength());
assertEquals(date, meta.getCreated());
assertEquals(123, meta.getVersions());
KeyProvider.Metadata second = new KeyProvider.Metadata(meta.serialize());
assertEquals(meta.getCipher(), second.getCipher());
assertEquals(meta.getBitLength(), second.getBitLength());
assertEquals(meta.getCreated(), second.getCreated());
assertEquals(meta.getVersions(), second.getVersions());
int newVersion = second.addVersion();
assertEquals(123, newVersion);
assertEquals(124, second.getVersions());
assertEquals(123, meta.getVersions());
}
@Test
public void testOptions() throws Exception {
Configuration conf = new Configuration();
conf.set(KeyProvider.DEFAULT_CIPHER_NAME, "myCipher");
conf.setInt(KeyProvider.DEFAULT_BITLENGTH_NAME, 512);
KeyProvider.Options options = KeyProvider.options(conf);
assertEquals("myCipher", options.getCipher());
assertEquals(512, options.getBitLength());
options.setCipher("yourCipher");
options.setBitLength(128);
assertEquals("yourCipher", options.getCipher());
assertEquals(128, options.getBitLength());
options = KeyProvider.options(new Configuration());
assertEquals(KeyProvider.DEFAULT_CIPHER, options.getCipher());
assertEquals(KeyProvider.DEFAULT_BITLENGTH, options.getBitLength());
}
@Test
public void testUnnestUri() throws Exception {
assertEquals(new Path("hdfs://nn.example.com/my/path"),
KeyProvider.unnestUri(new URI("myscheme://hdfs@nn.example.com/my/path")));
assertEquals(new Path("hdfs://nn/my/path?foo=bar&baz=bat#yyy"),
KeyProvider.unnestUri(new URI("myscheme://hdfs@nn/my/path?foo=bar&baz=bat#yyy")));
assertEquals(new Path("inner://hdfs@nn1.example.com/my/path"),
KeyProvider.unnestUri(new URI("outer://inner@hdfs@nn1.example.com/my/path")));
assertEquals(new Path("user:///"),
KeyProvider.unnestUri(new URI("outer://user/")));
}
}


@ -0,0 +1,191 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.crypto.key;
import java.io.File;
import java.io.IOException;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.security.Credentials;
import org.apache.hadoop.security.UserGroupInformation;
import org.junit.Test;
import static org.junit.Assert.assertArrayEquals;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
public class TestKeyProviderFactory {
private static final File tmpDir =
new File(System.getProperty("test.build.data", "/tmp"), "key");
@Test
public void testFactory() throws Exception {
Configuration conf = new Configuration();
conf.set(KeyProviderFactory.KEY_PROVIDER_PATH,
UserProvider.SCHEME_NAME + ":///," +
JavaKeyStoreProvider.SCHEME_NAME + "://file" + tmpDir + "/test.jks");
List<KeyProvider> providers = KeyProviderFactory.getProviders(conf);
assertEquals(2, providers.size());
assertEquals(UserProvider.class, providers.get(0).getClass());
assertEquals(JavaKeyStoreProvider.class, providers.get(1).getClass());
assertEquals(UserProvider.SCHEME_NAME +
":///", providers.get(0).toString());
assertEquals(JavaKeyStoreProvider.SCHEME_NAME +
"://file" + tmpDir + "/test.jks",
providers.get(1).toString());
}
@Test
public void testFactoryErrors() throws Exception {
Configuration conf = new Configuration();
conf.set(KeyProviderFactory.KEY_PROVIDER_PATH, "unknown:///");
try {
List<KeyProvider> providers = KeyProviderFactory.getProviders(conf);
assertTrue("should throw!", false);
} catch (IOException e) {
assertEquals("No KeyProviderFactory for unknown:/// in " +
KeyProviderFactory.KEY_PROVIDER_PATH,
e.getMessage());
}
}
@Test
public void testUriErrors() throws Exception {
Configuration conf = new Configuration();
conf.set(KeyProviderFactory.KEY_PROVIDER_PATH, "unkn@own:/x/y");
try {
List<KeyProvider> providers = KeyProviderFactory.getProviders(conf);
assertTrue("should throw!", false);
} catch (IOException e) {
assertEquals("Bad configuration of " +
KeyProviderFactory.KEY_PROVIDER_PATH +
" at unkn@own:/x/y", e.getMessage());
}
}
static void checkSpecificProvider(Configuration conf,
String ourUrl) throws Exception {
KeyProvider provider = KeyProviderFactory.getProviders(conf).get(0);
byte[] key1 = new byte[32];
byte[] key2 = new byte[32];
byte[] key3 = new byte[32];
for(int i =0; i < key1.length; ++i) {
key1[i] = (byte) i;
key2[i] = (byte) (i * 2);
key3[i] = (byte) (i * 3);
}
// ensure that we get nulls when the key isn't there
assertEquals(null, provider.getKeyVersion("no-such-key"));
assertEquals(null, provider.getMetadata("key"));
// create a new key
try {
provider.createKey("key3", key3, KeyProvider.options(conf));
} catch (Exception e) {
e.printStackTrace();
throw e;
}
// check the metadata for key3
KeyProvider.Metadata meta = provider.getMetadata("key3");
assertEquals(KeyProvider.DEFAULT_CIPHER, meta.getCipher());
assertEquals(KeyProvider.DEFAULT_BITLENGTH, meta.getBitLength());
assertEquals(1, meta.getVersions());
// make sure we get back the right key
assertArrayEquals(key3, provider.getCurrentKey("key3").getMaterial());
assertEquals("key3@0", provider.getCurrentKey("key3").getVersionName());
// try recreating key3
try {
provider.createKey("key3", key3, KeyProvider.options(conf));
assertTrue("should throw", false);
} catch (IOException e) {
assertEquals("Key key3 already exists in " + ourUrl, e.getMessage());
}
provider.deleteKey("key3");
try {
provider.deleteKey("key3");
assertTrue("should throw", false);
} catch (IOException e) {
assertEquals("Key key3 does not exist in " + ourUrl, e.getMessage());
}
provider.createKey("key3", key3, KeyProvider.options(conf));
try {
provider.createKey("key4", key3,
KeyProvider.options(conf).setBitLength(8));
assertTrue("should throw", false);
} catch (IOException e) {
assertEquals("Wrong key length. Required 8, but got 256", e.getMessage());
}
provider.createKey("key4", new byte[]{1},
KeyProvider.options(conf).setBitLength(8));
provider.rollNewVersion("key4", new byte[]{2});
meta = provider.getMetadata("key4");
assertEquals(2, meta.getVersions());
assertArrayEquals(new byte[]{2},
provider.getCurrentKey("key4").getMaterial());
assertArrayEquals(new byte[]{1},
provider.getKeyVersion("key4@0").getMaterial());
assertEquals("key4@1", provider.getCurrentKey("key4").getVersionName());
try {
provider.rollNewVersion("key4", key1);
assertTrue("should throw", false);
} catch (IOException e) {
assertEquals("Wrong key length. Required 8, but got 256", e.getMessage());
}
try {
provider.rollNewVersion("no-such-key", key1);
assertTrue("should throw", false);
} catch (IOException e) {
assertEquals("Key no-such-key not found", e.getMessage());
}
provider.flush();
// get a new instance of the provider to ensure it was saved correctly
provider = KeyProviderFactory.getProviders(conf).get(0);
assertArrayEquals(new byte[]{2},
provider.getCurrentKey("key4").getMaterial());
assertArrayEquals(key3, provider.getCurrentKey("key3").getMaterial());
assertEquals("key3@0", provider.getCurrentKey("key3").getVersionName());
}
@Test
public void testUserProvider() throws Exception {
Configuration conf = new Configuration();
final String ourUrl = UserProvider.SCHEME_NAME + ":///";
conf.set(KeyProviderFactory.KEY_PROVIDER_PATH, ourUrl);
checkSpecificProvider(conf, ourUrl);
// see if the credentials are actually in the UGI
Credentials credentials =
UserGroupInformation.getCurrentUser().getCredentials();
assertArrayEquals(new byte[]{1},
credentials.getSecretKey(new Text("key4@0")));
assertArrayEquals(new byte[]{2},
credentials.getSecretKey(new Text("key4@1")));
}
@Test
public void testJksProvider() throws Exception {
Configuration conf = new Configuration();
final String ourUrl =
JavaKeyStoreProvider.SCHEME_NAME + "://file" + tmpDir + "/test.jks";
File file = new File(tmpDir, "test.jks");
file.delete();
conf.set(KeyProviderFactory.KEY_PROVIDER_PATH, ourUrl);
checkSpecificProvider(conf, ourUrl);
assertTrue(file + " should exist", file.isFile());
}
}


@ -258,6 +258,22 @@ public class TestHarFileSystemBasics {
0, expectedFileNames.size()); 0, expectedFileNames.size());
} }
@Test
public void testMakeQualifiedPath() throws Exception {
// Construct a valid har file system path with authority that
// contains userinfo and port. The userinfo and port are useless
// in local fs uri. They are only used to verify har file system
// can correctly preserve the information for the underlying file system.
String harPathWithUserinfo = "har://file-user:passwd@localhost:80"
+ harPath.toUri().getPath().toString();
Path path = new Path(harPathWithUserinfo);
Path qualifiedPath = path.getFileSystem(conf).makeQualified(path);
assertTrue(String.format(
"The qualified path (%s) did not match the expected path (%s).",
qualifiedPath.toString(), harPathWithUserinfo),
qualifiedPath.toString().equals(harPathWithUserinfo));
}
// ========== Negative: // ========== Negative:
@Test @Test


@ -66,6 +66,8 @@ import org.mockito.internal.util.reflection.Whitebox;
import org.mortbay.jetty.Connector; import org.mortbay.jetty.Connector;
import org.mortbay.util.ajax.JSON; import org.mortbay.util.ajax.JSON;
import static org.mockito.Mockito.*;
public class TestHttpServer extends HttpServerFunctionalTest { public class TestHttpServer extends HttpServerFunctionalTest {
static final Log LOG = LogFactory.getLog(TestHttpServer.class); static final Log LOG = LogFactory.getLog(TestHttpServer.class);
private static HttpServer server; private static HttpServer server;
@ -588,4 +590,15 @@ public class TestHttpServer extends HttpServerFunctionalTest {
assertEquals(conn.getHeaderField("Expires"), conn.getHeaderField("Date")); assertEquals(conn.getHeaderField("Expires"), conn.getHeaderField("Date"));
} }
/**
* HTTPServer.Builder should proceed if an external connector is available.
*/
@Test
public void testHttpServerBuilderWithExternalConnector() throws Exception {
Connector c = mock(Connector.class);
doReturn("localhost").when(c).getHost();
HttpServer s = new HttpServer.Builder().setName("test").setConnector(c)
.build();
s.stop();
}
} }


@ -957,6 +957,7 @@ public class TestRPC {
proxy.sleep(pingInterval*4); proxy.sleep(pingInterval*4);
} finally { } finally {
if (proxy != null) RPC.stopProxy(proxy); if (proxy != null) RPC.stopProxy(proxy);
server.stop();
} }
} }


@ -137,7 +137,9 @@ public class TestSaslRPC {
LOG.info("Testing QOP:"+expectedQop); LOG.info("Testing QOP:"+expectedQop);
LOG.info("---------------------------------"); LOG.info("---------------------------------");
conf = new Configuration(); conf = new Configuration();
conf.set(HADOOP_SECURITY_AUTHENTICATION, KERBEROS.toString()); // the specific tests for kerberos will enable kerberos. forcing it
// for all tests will cause tests to fail if the user has a TGT
conf.set(HADOOP_SECURITY_AUTHENTICATION, SIMPLE.toString());
conf.set("hadoop.rpc.protection", expectedQop.name().toLowerCase()); conf.set("hadoop.rpc.protection", expectedQop.name().toLowerCase());
UserGroupInformation.setConfiguration(conf); UserGroupInformation.setConfiguration(conf);
enableSecretManager = null; enableSecretManager = null;


@ -0,0 +1,87 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.metrics2.impl;
import static org.junit.Assert.*;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.metrics2.MetricsSource;
import org.apache.hadoop.metrics2.MetricsTag;
import org.apache.hadoop.metrics2.annotation.Metric;
import org.apache.hadoop.metrics2.annotation.Metrics;
import org.apache.hadoop.metrics2.lib.MetricsAnnotations;
import org.apache.hadoop.metrics2.lib.MetricsRegistry;
import org.apache.hadoop.metrics2.lib.MetricsSourceBuilder;
import org.apache.hadoop.metrics2.lib.MutableCounterLong;
import org.junit.Test;
public class TestMetricsSourceAdapter {
@Test
public void testGetMetricsAndJmx() throws Exception {
// create test source with a single metric counter of value 0
TestSource source = new TestSource("test");
MetricsSourceBuilder sb = MetricsAnnotations.newSourceBuilder(source);
final MetricsSource s = sb.build();
List<MetricsTag> injectedTags = new ArrayList<MetricsTag>();
MetricsSourceAdapter sa = new MetricsSourceAdapter(
"test", "test", "test desc", s, injectedTags, null, null, 1, false);
// all metrics are initially assumed to have changed
MetricsCollectorImpl builder = new MetricsCollectorImpl();
Iterable<MetricsRecordImpl> metricsRecords = sa.getMetrics(builder, true);
// Validate getMetrics and JMX initial values
MetricsRecordImpl metricsRecord = metricsRecords.iterator().next();
assertEquals(0L,
metricsRecord.metrics().iterator().next().value().longValue());
Thread.sleep(100); // skip JMX cache TTL
assertEquals(0L, (Number)sa.getAttribute("C1"));
// change metric value
source.incrementCnt();
// validate getMetrics and JMX
builder = new MetricsCollectorImpl();
metricsRecords = sa.getMetrics(builder, true);
metricsRecord = metricsRecords.iterator().next();
assertTrue(metricsRecord.metrics().iterator().hasNext());
Thread.sleep(100); // skip JMX cache TTL
assertEquals(1L, (Number)sa.getAttribute("C1"));
}
@SuppressWarnings("unused")
@Metrics(context="test")
private static class TestSource {
@Metric("C1 desc") MutableCounterLong c1;
final MetricsRegistry registry;
TestSource(String recName) {
registry = new MetricsRegistry(recName);
}
public void incrementCnt() {
c1.incr();
}
}
}


@ -420,7 +420,8 @@ public class TestDomainSocket {
* @throws IOException * @throws IOException
*/ */
void testClientServer1(final Class<? extends WriteStrategy> writeStrategyClass, void testClientServer1(final Class<? extends WriteStrategy> writeStrategyClass,
final Class<? extends ReadStrategy> readStrategyClass) throws Exception { final Class<? extends ReadStrategy> readStrategyClass,
final DomainSocket preConnectedSockets[]) throws Exception {
final String TEST_PATH = new File(sockDir.getDir(), final String TEST_PATH = new File(sockDir.getDir(),
"test_sock_client_server1").getAbsolutePath(); "test_sock_client_server1").getAbsolutePath();
final byte clientMsg1[] = new byte[] { 0x1, 0x2, 0x3, 0x4, 0x5, 0x6 }; final byte clientMsg1[] = new byte[] { 0x1, 0x2, 0x3, 0x4, 0x5, 0x6 };
@ -428,13 +429,15 @@ public class TestDomainSocket {
final byte clientMsg2 = 0x45; final byte clientMsg2 = 0x45;
final ArrayBlockingQueue<Throwable> threadResults = final ArrayBlockingQueue<Throwable> threadResults =
new ArrayBlockingQueue<Throwable>(2); new ArrayBlockingQueue<Throwable>(2);
final DomainSocket serv = DomainSocket.bindAndListen(TEST_PATH); final DomainSocket serv = (preConnectedSockets != null) ?
null : DomainSocket.bindAndListen(TEST_PATH);
Thread serverThread = new Thread() { Thread serverThread = new Thread() {
public void run(){ public void run(){
// Run server // Run server
DomainSocket conn = null; DomainSocket conn = null;
try { try {
conn = serv.accept(); conn = preConnectedSockets != null ?
preConnectedSockets[0] : serv.accept();
byte in1[] = new byte[clientMsg1.length]; byte in1[] = new byte[clientMsg1.length];
ReadStrategy reader = readStrategyClass.newInstance(); ReadStrategy reader = readStrategyClass.newInstance();
reader.init(conn); reader.init(conn);
@ -459,7 +462,8 @@ public class TestDomainSocket {
Thread clientThread = new Thread() { Thread clientThread = new Thread() {
public void run(){ public void run(){
try { try {
DomainSocket client = DomainSocket.connect(TEST_PATH); DomainSocket client = preConnectedSockets != null ?
preConnectedSockets[1] : DomainSocket.connect(TEST_PATH);
WriteStrategy writer = writeStrategyClass.newInstance(); WriteStrategy writer = writeStrategyClass.newInstance();
writer.init(client); writer.init(client);
writer.write(clientMsg1); writer.write(clientMsg1);
@ -487,25 +491,45 @@ public class TestDomainSocket {
} }
serverThread.join(120000); serverThread.join(120000);
clientThread.join(120000); clientThread.join(120000);
if (serv != null) {
serv.close(); serv.close();
} }
}
@Test(timeout=180000) @Test(timeout=180000)
public void testClientServerOutStreamInStream() throws Exception { public void testClientServerOutStreamInStream() throws Exception {
testClientServer1(OutputStreamWriteStrategy.class, testClientServer1(OutputStreamWriteStrategy.class,
InputStreamReadStrategy.class); InputStreamReadStrategy.class, null);
}
@Test(timeout=180000)
public void testClientServerOutStreamInStreamWithSocketpair() throws Exception {
testClientServer1(OutputStreamWriteStrategy.class,
InputStreamReadStrategy.class, DomainSocket.socketpair());
} }
@Test(timeout=180000) @Test(timeout=180000)
public void testClientServerOutStreamInDbb() throws Exception { public void testClientServerOutStreamInDbb() throws Exception {
testClientServer1(OutputStreamWriteStrategy.class, testClientServer1(OutputStreamWriteStrategy.class,
DirectByteBufferReadStrategy.class); DirectByteBufferReadStrategy.class, null);
}
@Test(timeout=180000)
public void testClientServerOutStreamInDbbWithSocketpair() throws Exception {
testClientServer1(OutputStreamWriteStrategy.class,
DirectByteBufferReadStrategy.class, DomainSocket.socketpair());
} }
@Test(timeout=180000) @Test(timeout=180000)
public void testClientServerOutStreamInAbb() throws Exception { public void testClientServerOutStreamInAbb() throws Exception {
testClientServer1(OutputStreamWriteStrategy.class, testClientServer1(OutputStreamWriteStrategy.class,
ArrayBackedByteBufferReadStrategy.class); ArrayBackedByteBufferReadStrategy.class, null);
}
@Test(timeout=180000)
public void testClientServerOutStreamInAbbWithSocketpair() throws Exception {
testClientServer1(OutputStreamWriteStrategy.class,
ArrayBackedByteBufferReadStrategy.class, DomainSocket.socketpair());
} }
static private class PassedFile { static private class PassedFile {


@ -85,6 +85,7 @@ class OpenFileCtx {
private volatile boolean activeState; private volatile boolean activeState;
// The stream write-back status. True means one thread is doing write back. // The stream write-back status. True means one thread is doing write back.
private volatile boolean asyncStatus; private volatile boolean asyncStatus;
private volatile long asyncWriteBackStartOffset;
/** /**
* The current offset of the file in HDFS. All the content before this offset * The current offset of the file in HDFS. All the content before this offset
@ -209,6 +210,7 @@ class OpenFileCtx {
updateLastAccessTime(); updateLastAccessTime();
activeState = true; activeState = true;
asyncStatus = false; asyncStatus = false;
asyncWriteBackStartOffset = 0;
dumpOut = null; dumpOut = null;
raf = null; raf = null;
nonSequentialWriteInMemory = new AtomicLong(0); nonSequentialWriteInMemory = new AtomicLong(0);
@ -580,6 +582,7 @@ class OpenFileCtx {
+ nextOffset.get()); + nextOffset.get());
} }
asyncStatus = true; asyncStatus = true;
asyncWriteBackStartOffset = writeCtx.getOffset();
asyncDataService.execute(new AsyncDataService.WriteBackTask(this)); asyncDataService.execute(new AsyncDataService.WriteBackTask(this));
} else { } else {
if (LOG.isDebugEnabled()) { if (LOG.isDebugEnabled()) {
@ -903,9 +906,11 @@ class OpenFileCtx {
/** Invoked by AsynDataService to write back to HDFS */ /** Invoked by AsynDataService to write back to HDFS */
void executeWriteBack() { void executeWriteBack() {
Preconditions.checkState(asyncStatus, Preconditions.checkState(asyncStatus,
"The openFileCtx has false async status"); "openFileCtx has false asyncStatus, fileId:" + latestAttr.getFileid());
final long startOffset = asyncWriteBackStartOffset;
try { try {
while (activeState) { while (activeState) {
// asyncStatus could be changed to false in offerNextToWrite()
WriteCtx toWrite = offerNextToWrite(); WriteCtx toWrite = offerNextToWrite();
if (toWrite != null) { if (toWrite != null) {
// Do the write // Do the write
@ -921,8 +926,18 @@ class OpenFileCtx {
+ latestAttr.getFileId()); + latestAttr.getFileId());
} }
} finally { } finally {
// make sure we reset asyncStatus to false // Make sure to reset asyncStatus to false unless a race happens
synchronized (this) {
if (startOffset == asyncWriteBackStartOffset) {
asyncStatus = false; asyncStatus = false;
} else {
LOG.info("Another asyn task is already started before this one"
+ " is finalized. fileId:" + latestAttr.getFileid()
+ " asyncStatus:" + asyncStatus + " original startOffset:"
+ startOffset + " new startOffset:" + asyncWriteBackStartOffset
+ ". Won't change asyncStatus here.");
}
}
} }
} }

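The write-back change above only clears asyncStatus when the task's recorded start offset still matches the latest one, so a newer task scheduled in the meantime keeps the flag set. A minimal sketch of that guard pattern, with hypothetical names:

public class AsyncFlagGuardSketch {
  private volatile boolean asyncStatus;
  private volatile long asyncStartToken;

  synchronized void scheduleWriteBack(long token) {
    asyncStatus = true;
    asyncStartToken = token;
    // ... submit the actual work to an executor here ...
  }

  void onTaskFinished(long myToken) {
    synchronized (this) {
      if (myToken == asyncStartToken) {
        asyncStatus = false;          // we are still the current task
      }
      // otherwise a newer task owns the flag and will clear it itself
    }
  }
}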

@ -13,6 +13,10 @@ Trunk (Unreleased)
HDFS-3125. Add JournalService to enable Journal Daemon. (suresh) HDFS-3125. Add JournalService to enable Journal Daemon. (suresh)
HDFS-2832. Heterogeneous Storages support in HDFS phase 1 - treat DataNode
as a collection of storages (see breakdown of tasks below for features and
contributors).
IMPROVEMENTS IMPROVEMENTS
HDFS-4665. Move TestNetworkTopologyWithNodeGroup to common. HDFS-4665. Move TestNetworkTopologyWithNodeGroup to common.
@ -212,43 +216,48 @@ Trunk (Unreleased)
and INodeFileUnderConstructionWithSnapshot with FileUnderContructionFeature. and INodeFileUnderConstructionWithSnapshot with FileUnderContructionFeature.
(jing9 via szetszwo) (jing9 via szetszwo)
HDFS-5538. URLConnectionFactory should pick up the SSL related configuration
by default. (Haohui Mai via jing9)
HDFS-5286. Flatten INodeDirectory hierarchy: Replace INodeDirectoryWithQuota HDFS-5286. Flatten INodeDirectory hierarchy: Replace INodeDirectoryWithQuota
with DirectoryWithQuotaFeature. (szetszwo) with DirectoryWithQuotaFeature. (szetszwo)
HDFS-5556. Add some more NameNode cache statistics, cache pool stats HDFS-5556. Add some more NameNode cache statistics, cache pool stats
(cmccabe) (cmccabe)
HDFS-5545. Allow specifying endpoints for listeners in HttpServer. (Haohui
Mai via jing9)
HDFS-5537. Remove FileWithSnapshot interface. (jing9 via szetszwo) HDFS-5537. Remove FileWithSnapshot interface. (jing9 via szetszwo)
HDFS-5430. Support TTL on CacheDirectives. (wang) HDFS-5430. Support TTL on CacheDirectives. (wang)
HDFS-5536. Implement HTTP policy for Namenode and DataNode. (Haohui Mai via
jing9)
HDFS-5630. Hook up cache directive and pool usage statistics. (wang) HDFS-5630. Hook up cache directive and pool usage statistics. (wang)
HDFS-5312. Generate HTTP / HTTPS URL in DFSUtil#getInfoServer() based on the
configured http policy. (Haohui Mai via jing9)
HDFS-5554. Flatten INodeFile hierarchy: Replace INodeFileWithSnapshot with HDFS-5554. Flatten INodeFile hierarchy: Replace INodeFileWithSnapshot with
FileWithSnapshotFeature. (jing9 via szetszwo) FileWithSnapshotFeature. (jing9 via szetszwo)
HDFS-5629. Support HTTPS in JournalNode and SecondaryNameNode. HDFS-5647. Merge INodeDirectory.Feature and INodeFile.Feature. (Haohui Mai
(Haohui Mai via jing9) via jing9)
HDFS-5632. Flatten INodeDirectory hierarchy: Replace
INodeDirectoryWithSnapshot with DirectoryWithSnapshotFeature.
(jing9 via szetszwo)
HDFS-5431. Support cachepool-based limit management in path-based caching
(awang via cmccabe)
HDFS-5636. Enforce a max TTL per cache pool. (awang via cmccabe)
HDFS-5651. Remove dfs.namenode.caching.enabled and improve CRM locking.
(cmccabe via wang)
HDFS-5496. Make replication queue initialization asynchronous. (Vinay via HDFS-5496. Make replication queue initialization asynchronous. (Vinay via
jing9) jing9)
OPTIMIZATIONS OPTIMIZATIONS
HDFS-5349. DNA_CACHE and DNA_UNCACHE should be by blockId only. (cmccabe) HDFS-5349. DNA_CACHE and DNA_UNCACHE should be by blockId only. (cmccabe)
HDFS-5665. Remove the unnecessary writeLock while initializing CacheManager
in FsNameSystem Ctor. (Uma Maheswara Rao G via Andrew Wang)
BUG FIXES BUG FIXES
HADOOP-9635 Fix potential Stack Overflow in DomainSocket.c (V. Karthik Kumar HADOOP-9635 Fix potential Stack Overflow in DomainSocket.c (V. Karthik Kumar
via cmccabe) via cmccabe)
@ -443,6 +452,150 @@ Trunk (Unreleased)
HDFS-5626. dfsadmin -report shows incorrect cache values. (cmccabe) HDFS-5626. dfsadmin -report shows incorrect cache values. (cmccabe)
HDFS-5406. Send incremental block reports for all storages in a
single call. (Arpit Agarwal)
HDFS-5454. DataNode UUID should be assigned prior to FsDataset
initialization. (Arpit Agarwal)
HDFS-5679. TestCacheDirectives should handle the case where native code
is not available. (wang)
HDFS-5701. Fix the CacheAdmin -addPool -maxTtl option name.
(Stephen Chu via wang)
HDFS-5708. The CacheManager throws a NPE in the DataNode logs when
processing cache reports that refer to a block not known to the
BlockManager. (cmccabe via wang)
HDFS-5659. dfsadmin -report doesn't output cache information properly.
(wang)
BREAKDOWN OF HDFS-2832 SUBTASKS AND RELATED JIRAS
HDFS-4985. Add storage type to the protocol and expose it in block report
and block locations. (Arpit Agarwal)
HDFS-5115. Make StorageID a UUID. (Arpit Agarwal)
HDFS-5000. DataNode configuration should allow specifying storage type.
(Arpit Agarwal)
HDFS-4987. Namenode changes to track multiple storages per datanode.
(szetszwo)
HDFS-5154. Fix TestBlockManager and TestDatanodeDescriptor after HDFS-4987.
(Junping Du via szetszwo)
HDFS-5009. Include storage information in the LocatedBlock. (szetszwo)
HDFS-5134. Move blockContentsStale, heartbeatedSinceFailover and
firstBlockReport from DatanodeDescriptor to DatanodeStorageInfo; and
fix a synchronization problem in DatanodeStorageInfo. (szetszwo)
HDFS-5157. Add StorageType to FsVolume. (Junping Du via szetszwo)
HDFS-4990. Change BlockPlacementPolicy to choose storages instead of
datanodes. (szetszwo)
HDFS-5232. Protocol changes to transmit StorageUuid. (Arpit Agarwal)
HDFS-5233. Use Datanode UUID to identify Datanodes. (Arpit Agarwal)
HDFS-5222. Move block schedule information from DatanodeDescriptor to
DatanodeStorageInfo. (szetszwo)
HDFS-4988. Datanode must support all the volumes as individual storages.
(Arpit Agarwal)
HDFS-5377. Heartbeats from Datandode should include one storage report
per storage directory. (Arpit Agarwal)
HDFS-5398. NameNode changes to process storage reports per storage
directory. (Arpit Agarwal)
HDFS-5390. Send one incremental block report per storage directory.
(Arpit Agarwal)
HDFS-5401. Fix NPE in Directory Scanner. (Arpit Agarwal)
HDFS-5417. Fix storage IDs in PBHelper and UpgradeUtilities. (szetszwo)
HDFS-5214. Fix NPEs in BlockManager and DirectoryScanner. (Arpit Agarwal)
HDFS-5435. File append fails to initialize storageIDs. (Junping Du via
Arpit Agarwal)
HDFS-5437. Fix TestBlockReport and TestBPOfferService failures. (Arpit
Agarwal)
HDFS-5447. Fix TestJspHelper. (Arpit Agarwal)
HDFS-5452. Fix TestReplicationPolicy and TestBlocksScheduledCounter.
HDFS-5448. Datanode should generate its ID on first registration. (Arpit
Agarwal)
HDFS-5448. Fix break caused by previous checkin for HDFS-5448. (Arpit
Agarwal)
HDFS-5455. NN should update storageMap on first heartbeat. (Arpit Agarwal)
HDFS-5457. Fix TestDatanodeRegistration, TestFsck and TestAddBlockRetry.
(Contributed by szetszwo)
HDFS-5466. Update storage IDs when the pipeline is updated. (Contributed
by szetszwo)
HDFS-5439. Fix TestPendingReplication. (Contributed by Junping Du, Arpit
Agarwal)
HDFS-5470. Add back trunk's reportDiff algorithm to the branch.
(Contributed by szetszwo)
HDFS-5472. Fix TestDatanodeManager, TestSafeMode and
TestNNThroughputBenchmark (Contributed by szetszwo)
HDFS-5475. NN incorrectly tracks more than one replica per DN. (Arpit
Agarwal)
HDFS-5481. Fix TestDataNodeVolumeFailure in branch HDFS-2832. (Contributed
by Junping Du)
HDFS-5480. Update Balancer for HDFS-2832. (Contributed by szetszwo)
HDFS-5486. Fix TestNameNodeMetrics for HDFS-2832. (Arpit Agarwal)
HDFS-5491. Update editsStored for HDFS-2832. (Arpit Agarwal)
HDFS-5494. Fix findbugs warnings for HDFS-2832. (Arpit Agarwal)
HDFS-5508. Fix compilation error after merge. (Contributed by szetszwo)
HDFS-5501. Fix pendingReceivedRequests tracking in BPServiceActor. (Arpit
Agarwal)
HDFS-5510. Fix a findbug warning in DataStorage.java on HDFS-2832 branch.
(Junping Du via Arpit Agarwal)
HDFS-5515. Fix TestDFSStartupVersions for HDFS-2832. (Arpit Agarwal)
HDFS-5527. Fix TestUnderReplicatedBlocks on branch HDFS-2832. (Arpit
Agarwal)
HDFS-5547. Fix build break after merge from trunk to HDFS-2832. (Arpit
Agarwal)
HDFS-5542. Fix TODO and clean up the code in HDFS-2832. (Contributed by
szetszwo)
HDFS-5559. Fix TestDatanodeConfig in HDFS-2832. (Contributed by szetszwo)
HDFS-5484. StorageType and State in DatanodeStorageInfo in NameNode is
not accurate. (Eric Sirianni via Arpit Agarwal)
HDFS-5648. Get rid of FsDatasetImpl#perVolumeReplicaMap. (Arpit Agarwal)
Release 2.4.0 - UNRELEASED Release 2.4.0 - UNRELEASED
INCOMPATIBLE CHANGES INCOMPATIBLE CHANGES
@ -483,9 +636,6 @@ Release 2.4.0 - UNRELEASED
HDFS-5004. Add additional JMX bean for NameNode status data HDFS-5004. Add additional JMX bean for NameNode status data
(Trevor Lorimer via cos) (Trevor Lorimer via cos)
HDFS-5068. Convert NNThroughputBenchmark to a Tool to allow generic options.
(shv)
HDFS-4994. Audit log getContentSummary() calls. (Robert Parker via kihwal) HDFS-4994. Audit log getContentSummary() calls. (Robert Parker via kihwal)
HDFS-5144. Document time unit to NameNodeMetrics. (Akira Ajisaka via HDFS-5144. Document time unit to NameNodeMetrics. (Akira Ajisaka via
@ -590,6 +740,49 @@ Release 2.4.0 - UNRELEASED
HDFS-5633. Improve OfflineImageViewer to use less memory. (jing9) HDFS-5633. Improve OfflineImageViewer to use less memory. (jing9)
HDFS-5023. TestSnapshotPathINodes.testAllowSnapshot is failing with jdk7
(Mit Desai via jeagles)
HDFS-5637. Try to refeatchToken while local read InvalidToken occurred.
(Liang Xie via junping_du)
HDFS-5652. Refactor invalid block token exception handling in DFSInputStream.
(Liang Xie via junping_du)
HDFS-5350. Name Node should report fsimage transfer time as a metric.
(Jimmy Xiang via wang)
HDFS-5538. URLConnectionFactory should pick up the SSL related configuration
by default. (Haohui Mai via jing9)
HDFS-5545. Allow specifying endpoints for listeners in HttpServer. (Haohui
Mai via jing9)
HDFS-5536. Implement HTTP policy for Namenode and DataNode. (Haohui Mai via
jing9)
HDFS-5312. Generate HTTP / HTTPS URL in DFSUtil#getInfoServer() based on the
configured http policy. (Haohui Mai via jing9)
HDFS-5629. Support HTTPS in JournalNode and SecondaryNameNode.
(Haohui Mai via jing9)
HDFS-5674. Editlog code cleanup: remove @SuppressWarnings("deprecation") in
FSEditLogOp; change FSEditLogOpCodes.fromByte(..) to be more efficient; and
change Some fields in FSEditLog to final. (szetszwo)
HDFS-5634. Allow BlockReaderLocal to switch between checksumming and not
(cmccabe)
HDFS-5663 make the retry time and interval value configurable in openInfo()
(Liang Xie via stack)
HDFS-5540. Fix intermittent failure in TestBlocksWithNotEnoughRacks.
(Binglin Chang via junping_du)
HDFS-2933. Improve DataNode Web UI Index Page. (Vivek Ganesan via
Arpit Agarwal)
OPTIMIZATIONS OPTIMIZATIONS
HDFS-5239. Allow FSNamesystem lock fairness to be configurable (daryn) HDFS-5239. Allow FSNamesystem lock fairness to be configurable (daryn)
@ -597,6 +790,8 @@ Release 2.4.0 - UNRELEASED
HDFS-5341. Reduce fsdataset lock duration during directory scanning. HDFS-5341. Reduce fsdataset lock duration during directory scanning.
(Qus-Jiawei via kihwal) (Qus-Jiawei via kihwal)
HDFS-5681. renewLease should not hold fsn write lock. (daryn via Kihwal)
BUG FIXES BUG FIXES
HDFS-5034. Remove debug prints from GetFileLinkInfo (Andrew Wang via Colin HDFS-5034. Remove debug prints from GetFileLinkInfo (Andrew Wang via Colin
@ -645,6 +840,14 @@ Release 2.4.0 - UNRELEASED
HDFS-5580. Fix infinite loop in Balancer.waitForMoveCompletion. HDFS-5580. Fix infinite loop in Balancer.waitForMoveCompletion.
(Binglin Chang via junping_du) (Binglin Chang via junping_du)
HDFS-5676. fix inconsistent synchronization of CachingStrategy (cmccabe)
HDFS-5691. Fix typo in ShortCircuitLocalRead document.
(Akira Ajisaka via suresh)
HDFS-5690. DataNode fails to start in secure mode when dfs.http.policy equals to
HTTP_ONLY. (Haohui Mai via jing9)
Release 2.3.0 - UNRELEASED Release 2.3.0 - UNRELEASED
INCOMPATIBLE CHANGES INCOMPATIBLE CHANGES
@ -672,6 +875,18 @@ Release 2.3.0 - UNRELEASED
HDFS-4983. Numeric usernames do not work with WebHDFS FS. (Yongjun Zhang via HDFS-4983. Numeric usernames do not work with WebHDFS FS. (Yongjun Zhang via
jing9) jing9)
HDFS-5592. statechangeLog of completeFile should be logged only in case of success.
(Vinayakumar via umamahesh)
HDFS-5662. Can't decommission a DataNode due to file's replication factor
larger than the rest of the cluster size. (brandonli)
HDFS-5068. Convert NNThroughputBenchmark to a Tool to allow generic options.
(shv)
HDFS-5675. Add Mkdirs operation to NNThroughputBenchmark.
(Plamen Jeliazkov via shv)
OPTIMIZATIONS OPTIMIZATIONS
BUG FIXES BUG FIXES
@ -813,6 +1028,20 @@ Release 2.3.0 - UNRELEASED
HDFS-5074. Allow starting up from an fsimage checkpoint in the middle of a HDFS-5074. Allow starting up from an fsimage checkpoint in the middle of a
segment. (Todd Lipcon via atm) segment. (Todd Lipcon via atm)
HDFS-4201. NPE in BPServiceActor#sendHeartBeat. (jxiang via cmccabe)
HDFS-5666. Fix inconsistent synchronization in BPOfferService (jxiang via cmccabe)
HDFS-5657. race condition causes writeback state error in NFS gateway (brandonli)
HDFS-5661. Browsing FileSystem via web ui, should use datanode's fqdn instead of ip
address. (Benoy Antony via jing9)
HDFS-5582. hdfs getconf -excludeFile or -includeFile always failed (sathish
via cmccabe)
HDFS-5671. Fix socket leak in DFSInputStream#getBlockReader. (JamesLi via umamahesh)
Release 2.2.0 - 2013-10-13 Release 2.2.0 - 2013-10-13
INCOMPATIBLE CHANGES INCOMPATIBLE CHANGES
@ -942,9 +1171,6 @@ Release 2.1.1-beta - 2013-09-23
HDFS-5047. Supress logging of full stack trace of quota and lease HDFS-5047. Supress logging of full stack trace of quota and lease
exceptions. (Robert Parker via kihwal) exceptions. (Robert Parker via kihwal)
HDFS-2933. Improve DataNode Web UI Index Page. (Vivek Ganesan via
Arpit Agarwal)
HDFS-5111. Remove duplicated error message for snapshot commands when HDFS-5111. Remove duplicated error message for snapshot commands when
processing invalid arguments. (jing9) processing invalid arguments. (jing9)


@ -357,16 +357,9 @@
<Method name="insertInternal" /> <Method name="insertInternal" />
<Bug pattern="BC_UNCONFIRMED_CAST" /> <Bug pattern="BC_UNCONFIRMED_CAST" />
</Match> </Match>
<!-- These two are used for shutting down and kicking the CRMon, do not need strong sync -->
<Match> <Match>
<Class name="org.apache.hadoop.hdfs.server.blockmanagement.CacheReplicationMonitor" /> <Class name="org.apache.hadoop.hdfs.server.blockmanagement.CacheReplicationMonitor" />
<Field name="shutdown" /> <Bug pattern="RV_RETURN_VALUE_IGNORED_BAD_PRACTICE" />
<Bug pattern="IS2_INCONSISTENT_SYNC" />
</Match>
<Match>
<Class name="org.apache.hadoop.hdfs.server.blockmanagement.CacheReplicationMonitor" />
<Field name="rescanImmediately" />
<Bug pattern="IS2_INCONSISTENT_SYNC" />
</Match> </Match>
</FindBugsFilter> </FindBugsFilter>


@ -15,22 +15,30 @@
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
*/ */
package org.apache.hadoop.fs;
package org.apache.hadoop.yarn.exceptions;
import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.classification.InterfaceStability;
/** /**
* Exception to be thrown when an Active-Only operation is attempted on a * Specifies semantics for CacheDirective operations. Multiple flags can
* ResourceManager that is not Active. * be combined in an EnumSet.
*/ */
@InterfaceAudience.Private @InterfaceAudience.Public
@InterfaceStability.Evolving @InterfaceStability.Evolving
public class RMNotYetActiveException extends YarnException { public enum CacheFlag {
private static final long serialVersionUID = 1L;
public RMNotYetActiveException() { /**
super("ResourceManager is not yet Active!"); * Ignore cache pool resource limits when performing this operation.
*/
FORCE((short) 0x01);
private final short mode;
private CacheFlag(short mode) {
this.mode = mode;
}
short getMode() {
return mode;
} }
} }

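A short sketch of combining the new CacheFlag values in an EnumSet, as its javadoc describes:

import java.util.EnumSet;
import org.apache.hadoop.fs.CacheFlag;

public class CacheFlagSketch {
  public static void main(String[] args) {
    EnumSet<CacheFlag> none = EnumSet.noneOf(CacheFlag.class); // respect pool limits
    EnumSet<CacheFlag> force = EnumSet.of(CacheFlag.FORCE);    // ignore pool limits
    System.out.println(none + " vs " + force);
  }
}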

@ -18,8 +18,10 @@
package org.apache.hadoop.hdfs; package org.apache.hadoop.hdfs;
import java.io.IOException; import java.io.IOException;
import java.util.EnumSet;
import org.apache.hadoop.fs.ByteBufferReadable; import org.apache.hadoop.fs.ByteBufferReadable;
import org.apache.hadoop.fs.ReadOption;
import org.apache.hadoop.hdfs.client.ClientMmap; import org.apache.hadoop.hdfs.client.ClientMmap;
import org.apache.hadoop.hdfs.client.ClientMmapManager; import org.apache.hadoop.hdfs.client.ClientMmapManager;
import org.apache.hadoop.hdfs.protocol.LocatedBlock; import org.apache.hadoop.hdfs.protocol.LocatedBlock;
@ -89,10 +91,10 @@ public interface BlockReader extends ByteBufferReadable {
/** /**
* Get a ClientMmap object for this BlockReader. * Get a ClientMmap object for this BlockReader.
* *
* @param curBlock The current block. * @param opts The read options to use.
* @return The ClientMmap object, or null if mmap is not * @return The ClientMmap object, or null if mmap is not
* supported. * supported.
*/ */
ClientMmap getClientMmap(LocatedBlock curBlock, ClientMmap getClientMmap(EnumSet<ReadOption> opts,
ClientMmapManager mmapManager); ClientMmapManager mmapManager);
} }

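The getClientMmap signature now takes the caller's read options rather than the current block; a hedged sketch of building that argument (ReadOption.SKIP_CHECKSUMS is assumed to be the relevant member):

import java.util.EnumSet;
import org.apache.hadoop.fs.ReadOption;

public class ReadOptionSketch {
  public static void main(String[] args) {
    // An empty set keeps checksum verification; SKIP_CHECKSUMS lets the
    // reader hand out an mmap without verifying checksums first.
    EnumSet<ReadOption> opts = EnumSet.of(ReadOption.SKIP_CHECKSUMS);
    System.out.println(opts);
  }
}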

@@ -35,6 +35,7 @@ import org.apache.hadoop.hdfs.protocolPB.PBHelper;
 import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier;
 import org.apache.hadoop.hdfs.security.token.block.InvalidBlockTokenException;
 import org.apache.hadoop.hdfs.server.common.HdfsServerConstants;
+import org.apache.hadoop.hdfs.server.datanode.BlockMetadataHeader;
 import org.apache.hadoop.hdfs.server.datanode.CachingStrategy;
 import org.apache.hadoop.io.IOUtils;
 import org.apache.hadoop.ipc.RemoteException;
@@ -98,7 +99,7 @@ public class BlockReaderFactory {
       // enabled, try to set up a BlockReaderLocal.
       BlockReader reader = newShortCircuitBlockReader(conf, file,
           block, blockToken, startOffset, len, peer, datanodeID,
-          domSockFactory, verifyChecksum, fisCache);
+          domSockFactory, verifyChecksum, fisCache, cachingStrategy);
       if (reader != null) {
         // One we've constructed the short-circuit block reader, we don't
         // need the socket any more. So let's return it to the cache.
@@ -160,7 +161,8 @@ public class BlockReaderFactory {
    * @param verifyChecksum  True if we should verify the checksums.
    *                        Note: even if this is true, when
    *                        DFS_CLIENT_READ_CHECKSUM_SKIP_CHECKSUM_KEY is
-   *                        set, we will skip checksums.
+   *                        set or the block is mlocked, we will skip
+   *                        checksums.
    *
    * @return                The BlockReaderLocal, or null if the
    *                        DataNode declined to provide short-circuit
@@ -172,7 +174,8 @@
       Token<BlockTokenIdentifier> blockToken, long startOffset,
       long len, Peer peer, DatanodeID datanodeID,
       DomainSocketFactory domSockFactory, boolean verifyChecksum,
-      FileInputStreamCache fisCache) throws IOException {
+      FileInputStreamCache fisCache,
+      CachingStrategy cachingStrategy) throws IOException {
     final DataOutputStream out =
         new DataOutputStream(new BufferedOutputStream(
             peer.getOutputStream()));
@@ -189,9 +192,18 @@
       FileInputStream fis[] = new FileInputStream[2];
       sock.recvFileInputStreams(fis, buf, 0, buf.length);
       try {
-        reader = new BlockReaderLocal(conf, file, block,
-            startOffset, len, fis[0], fis[1], datanodeID, verifyChecksum,
-            fisCache);
+        reader = new BlockReaderLocal.Builder(conf).
+            setFilename(file).
+            setBlock(block).
+            setStartOffset(startOffset).
+            setStreams(fis).
+            setDatanodeID(datanodeID).
+            setVerifyChecksum(verifyChecksum).
+            setBlockMetadataHeader(
+                BlockMetadataHeader.preadHeader(fis[1].getChannel())).
+            setFileInputStreamCache(fisCache).
+            setCachingStrategy(cachingStrategy).
+            build();
       } finally {
         if (reader == null) {
           IOUtils.cleanup(DFSClient.LOG, fis[0], fis[1]);
@@ -24,10 +24,12 @@ import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.security.PrivilegedExceptionAction;
 import java.util.Collections;
+import java.util.EnumSet;
 import java.util.HashMap;
 import java.util.LinkedHashMap;
 import java.util.Map;
+import org.apache.hadoop.fs.ReadOption;
 import org.apache.hadoop.hdfs.client.ClientMmap;
 import org.apache.hadoop.hdfs.client.ClientMmapManager;
 import org.apache.commons.logging.Log;
@@ -706,7 +708,7 @@ class BlockReaderLocalLegacy implements BlockReader {
   }
   @Override
-  public ClientMmap getClientMmap(LocatedBlock curBlock,
+  public ClientMmap getClientMmap(EnumSet<ReadOption> opts,
       ClientMmapManager mmapManager) {
     return null;
   }
@@ -85,6 +85,7 @@ import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.BlockLocation;
 import org.apache.hadoop.fs.BlockStorageLocation;
+import org.apache.hadoop.fs.CacheFlag;
 import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
 import org.apache.hadoop.fs.ContentSummary;
 import org.apache.hadoop.fs.CreateFlag;
@@ -98,10 +99,10 @@ import org.apache.hadoop.fs.MD5MD5CRC32CastagnoliFileChecksum;
 import org.apache.hadoop.fs.MD5MD5CRC32FileChecksum;
 import org.apache.hadoop.fs.MD5MD5CRC32GzipFileChecksum;
 import org.apache.hadoop.fs.Options;
-import org.apache.hadoop.fs.RemoteIterator;
 import org.apache.hadoop.fs.Options.ChecksumOpt;
 import org.apache.hadoop.fs.ParentNotDirectoryException;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.RemoteIterator;
 import org.apache.hadoop.fs.UnresolvedLinkException;
 import org.apache.hadoop.fs.VolumeId;
 import org.apache.hadoop.fs.permission.FsPermission;
@@ -109,6 +110,7 @@ import org.apache.hadoop.hdfs.client.ClientMmapManager;
 import org.apache.hadoop.hdfs.client.HdfsDataInputStream;
 import org.apache.hadoop.hdfs.client.HdfsDataOutputStream;
 import org.apache.hadoop.hdfs.protocol.CacheDirectiveEntry;
+import org.apache.hadoop.hdfs.protocol.CacheDirectiveInfo;
 import org.apache.hadoop.hdfs.protocol.CacheDirectiveIterator;
 import org.apache.hadoop.hdfs.protocol.CachePoolEntry;
 import org.apache.hadoop.hdfs.protocol.CachePoolInfo;
@@ -121,7 +123,6 @@ import org.apache.hadoop.hdfs.protocol.DirectoryListing;
 import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
 import org.apache.hadoop.hdfs.protocol.HdfsBlocksMetadata;
 import org.apache.hadoop.hdfs.protocol.HdfsConstants;
-import org.apache.hadoop.hdfs.protocol.CacheDirectiveInfo;
 import org.apache.hadoop.hdfs.protocol.HdfsConstants.DatanodeReportType;
 import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction;
 import org.apache.hadoop.hdfs.protocol.HdfsFileStatus;
@@ -282,6 +283,8 @@ public class DFSClient implements java.io.Closeable {
     final boolean getHdfsBlocksMetadataEnabled;
     final int getFileBlockStorageLocationsNumThreads;
     final int getFileBlockStorageLocationsTimeout;
+    final int retryTimesForGetLastBlockLength;
+    final int retryIntervalForGetLastBlockLength;
     final boolean useLegacyBlockReader;
     final boolean useLegacyBlockReaderLocal;
@@ -355,6 +358,12 @@
       getFileBlockStorageLocationsTimeout = conf.getInt(
           DFSConfigKeys.DFS_CLIENT_FILE_BLOCK_STORAGE_LOCATIONS_TIMEOUT,
           DFSConfigKeys.DFS_CLIENT_FILE_BLOCK_STORAGE_LOCATIONS_TIMEOUT_DEFAULT);
+      retryTimesForGetLastBlockLength = conf.getInt(
+          DFSConfigKeys.DFS_CLIENT_RETRY_TIMES_GET_LAST_BLOCK_LENGTH,
+          DFSConfigKeys.DFS_CLIENT_RETRY_TIMES_GET_LAST_BLOCK_LENGTH_DEFAULT);
+      retryIntervalForGetLastBlockLength = conf.getInt(
+          DFSConfigKeys.DFS_CLIENT_RETRY_INTERVAL_GET_LAST_BLOCK_LENGTH,
+          DFSConfigKeys.DFS_CLIENT_RETRY_INTERVAL_GET_LAST_BLOCK_LENGTH_DEFAULT);
       useLegacyBlockReader = conf.getBoolean(
           DFSConfigKeys.DFS_CLIENT_USE_LEGACY_BLOCKREADER,
@@ -2295,20 +2304,20 @@
   }
   public long addCacheDirective(
-      CacheDirectiveInfo info) throws IOException {
+      CacheDirectiveInfo info, EnumSet<CacheFlag> flags) throws IOException {
     checkOpen();
     try {
-      return namenode.addCacheDirective(info);
+      return namenode.addCacheDirective(info, flags);
     } catch (RemoteException re) {
       throw re.unwrapRemoteException();
     }
   }
   public void modifyCacheDirective(
-      CacheDirectiveInfo info) throws IOException {
+      CacheDirectiveInfo info, EnumSet<CacheFlag> flags) throws IOException {
     checkOpen();
     try {
-      namenode.modifyCacheDirective(info);
+      namenode.modifyCacheDirective(info, flags);
     } catch (RemoteException re) {
       throw re.unwrapRemoteException();
     }
@@ -65,6 +65,10 @@ public class DFSConfigKeys extends CommonConfigurationKeys {
   public static final int DFS_CLIENT_FILE_BLOCK_STORAGE_LOCATIONS_NUM_THREADS_DEFAULT = 10;
   public static final String DFS_CLIENT_FILE_BLOCK_STORAGE_LOCATIONS_TIMEOUT = "dfs.client.file-block-storage-locations.timeout";
   public static final int DFS_CLIENT_FILE_BLOCK_STORAGE_LOCATIONS_TIMEOUT_DEFAULT = 60;
+  public static final String DFS_CLIENT_RETRY_TIMES_GET_LAST_BLOCK_LENGTH = "dfs.client.retry.times.get-last-block-length";
+  public static final int DFS_CLIENT_RETRY_TIMES_GET_LAST_BLOCK_LENGTH_DEFAULT = 3;
+  public static final String DFS_CLIENT_RETRY_INTERVAL_GET_LAST_BLOCK_LENGTH = "dfs.client.retry.interval-ms.get-last-block-length";
+  public static final int DFS_CLIENT_RETRY_INTERVAL_GET_LAST_BLOCK_LENGTH_DEFAULT = 4000;
   // HA related configuration
   public static final String DFS_CLIENT_FAILOVER_PROXY_PROVIDER_KEY_PREFIX = "dfs.client.failover.proxy.provider";
@@ -104,8 +108,9 @@ public class DFSConfigKeys extends CommonConfigurationKeys {
   public static final long DFS_DATANODE_MAX_LOCKED_MEMORY_DEFAULT = 0;
   public static final String DFS_DATANODE_FSDATASETCACHE_MAX_THREADS_PER_VOLUME_KEY = "dfs.datanode.fsdatasetcache.max.threads.per.volume";
   public static final int DFS_DATANODE_FSDATASETCACHE_MAX_THREADS_PER_VOLUME_DEFAULT = 4;
-  public static final String DFS_NAMENODE_CACHING_ENABLED_KEY = "dfs.namenode.caching.enabled";
-  public static final boolean DFS_NAMENODE_CACHING_ENABLED_DEFAULT = false;
+  public static final String DFS_NAMENODE_PATH_BASED_CACHE_BLOCK_MAP_ALLOCATION_PERCENT =
+      "dfs.namenode.path.based.cache.block.map.allocation.percent";
+  public static final float DFS_NAMENODE_PATH_BASED_CACHE_BLOCK_MAP_ALLOCATION_PERCENT_DEFAULT = 0.25f;
   public static final String DFS_NAMENODE_HTTP_PORT_KEY = "dfs.http.port";
   public static final int DFS_NAMENODE_HTTP_PORT_DEFAULT = 50070;
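A sketch of how a client could tune the two new last-block-length keys above; the values are arbitrary examples, not recommendations.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.DFSConfigKeys;

public class LastBlockLengthRetryConfSketch {
  public static Configuration tunedConf() {
    Configuration conf = new Configuration();
    // Retry fetching the length of an under-construction last block up to
    // 5 times, pausing 2 seconds between attempts (defaults: 3 and 4000 ms).
    conf.setInt(DFSConfigKeys.DFS_CLIENT_RETRY_TIMES_GET_LAST_BLOCK_LENGTH, 5);
    conf.setInt(DFSConfigKeys.DFS_CLIENT_RETRY_INTERVAL_GET_LAST_BLOCK_LENGTH, 2000);
    return conf;
  }
}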
@@ -57,6 +57,7 @@ import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
 import org.apache.hadoop.hdfs.protocol.datatransfer.InvalidEncryptionKeyException;
 import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier;
 import org.apache.hadoop.hdfs.security.token.block.InvalidBlockTokenException;
+import org.apache.hadoop.hdfs.server.datanode.BlockMetadataHeader;
 import org.apache.hadoop.hdfs.server.datanode.CachingStrategy;
 import org.apache.hadoop.hdfs.server.datanode.ReplicaNotFoundException;
 import org.apache.hadoop.io.ByteBufferPool;
@@ -65,6 +66,7 @@ import org.apache.hadoop.ipc.RemoteException;
 import org.apache.hadoop.net.NetUtils;
 import org.apache.hadoop.net.unix.DomainSocket;
 import org.apache.hadoop.security.AccessControlException;
+import org.apache.hadoop.security.token.SecretManager.InvalidToken;
 import org.apache.hadoop.security.token.Token;
 import org.apache.hadoop.util.IdentityHashStore;
@@ -226,7 +228,7 @@ implements ByteBufferReadable, CanSetDropBehind, CanSetReadahead,
         dfsClient.getConf().shortCircuitStreamsCacheSize,
         dfsClient.getConf().shortCircuitStreamsCacheExpiryMs);
     this.cachingStrategy =
-        dfsClient.getDefaultReadCachingStrategy().duplicate();
+        dfsClient.getDefaultReadCachingStrategy();
     openInfo();
   }
@@ -235,7 +237,7 @@
    */
   synchronized void openInfo() throws IOException, UnresolvedLinkException {
     lastBlockBeingWrittenLength = fetchLocatedBlocksAndGetLastBlockLength();
-    int retriesForLastBlockLength = 3;
+    int retriesForLastBlockLength = dfsClient.getConf().retryTimesForGetLastBlockLength;
     while (retriesForLastBlockLength > 0) {
       // Getting last block length as -1 is a special case. When cluster
       // restarts, DNs may not report immediately. At this time partial block
@@ -245,7 +247,7 @@
         DFSClient.LOG.warn("Last block locations not available. "
             + "Datanodes might not have reported blocks completely."
             + " Will retry for " + retriesForLastBlockLength + " times");
-        waitFor(4000);
+        waitFor(dfsClient.getConf().retryIntervalForGetLastBlockLength);
         lastBlockBeingWrittenLength = fetchLocatedBlocksAndGetLastBlockLength();
       } else {
         break;
@@ -572,7 +574,7 @@
       Token<BlockTokenIdentifier> accessToken = targetBlock.getBlockToken();
       blockReader = getBlockReader(targetAddr, chosenNode, src, blk,
           accessToken, offsetIntoBlock, blk.getNumBytes() - offsetIntoBlock,
-          buffersize, verifyChecksum, dfsClient.clientName);
+          buffersize, verifyChecksum, dfsClient.clientName, cachingStrategy);
       if(connectFailedOnce) {
         DFSClient.LOG.info("Successfully connected to " + targetAddr +
             " for " + blk);
@@ -590,20 +592,7 @@
         // The encryption key used is invalid.
         refetchEncryptionKey--;
         dfsClient.clearDataEncryptionKey();
-      } else if (ex instanceof InvalidBlockTokenException && refetchToken > 0) {
-        DFSClient.LOG.info("Will fetch a new access token and retry, "
-            + "access token was invalid when connecting to " + targetAddr
-            + " : " + ex);
-        /*
-         * Get a new access token and retry. Retry is needed in 2 cases. 1)
-         * When both NN and DN re-started while DFSClient holding a cached
-         * access token. 2) In the case that NN fails to update its
-         * access key at pre-set interval (by a wide margin) and
-         * subsequently restarts. In this case, DN re-registers itself with
-         * NN and receives a new access key, but DN will delete the old
-         * access key from its memory since it's considered expired based on
-         * the estimated expiration date.
-         */
+      } else if (refetchToken > 0 && tokenRefetchNeeded(ex, targetAddr)) {
         refetchToken--;
         fetchBlockAt(target);
       } else {
@@ -939,7 +928,11 @@
       // cached block locations may have been updated by chooseDataNode()
       // or fetchBlockAt(). Always get the latest list of locations at the
       // start of the loop.
+      CachingStrategy curCachingStrategy;
+      synchronized (this) {
         block = getBlockAt(block.getStartOffset(), false);
+        curCachingStrategy = cachingStrategy;
+      }
       DNAddrPair retval = chooseDataNode(block);
       DatanodeInfo chosenNode = retval.info;
       InetSocketAddress targetAddr = retval.addr;
@@ -951,7 +944,7 @@
         int len = (int) (end - start + 1);
         reader = getBlockReader(targetAddr, chosenNode, src, block.getBlock(),
             blockToken, start, len, buffersize, verifyChecksum,
-            dfsClient.clientName);
+            dfsClient.clientName, curCachingStrategy);
         int nread = reader.readAll(buf, offset, len);
         if (nread != len) {
           throw new IOException("truncated return from reader.read(): " +
@@ -976,10 +969,7 @@
           // The encryption key used is invalid.
           refetchEncryptionKey--;
           dfsClient.clearDataEncryptionKey();
-        } else if (e instanceof InvalidBlockTokenException && refetchToken > 0) {
-          DFSClient.LOG.info("Will get a new access token and retry, "
-              + "access token was invalid when connecting to " + targetAddr
-              + " : " + e);
+        } else if (refetchToken > 0 && tokenRefetchNeeded(e, targetAddr)) {
           refetchToken--;
           fetchBlockAt(block.getStartOffset());
           continue;
@@ -1000,6 +990,34 @@
     }
   }
+  /**
+   * Should the block access token be refetched on an exception
+   *
+   * @param ex Exception received
+   * @param targetAddr Target datanode address from where exception was received
+   * @return true if block access token has expired or invalid and it should be
+   *         refetched
+   */
+  private static boolean tokenRefetchNeeded(IOException ex,
+      InetSocketAddress targetAddr) {
+    /*
+     * Get a new access token and retry. Retry is needed in 2 cases. 1)
+     * When both NN and DN re-started while DFSClient holding a cached
+     * access token. 2) In the case that NN fails to update its
+     * access key at pre-set interval (by a wide margin) and
+     * subsequently restarts. In this case, DN re-registers itself with
+     * NN and receives a new access key, but DN will delete the old
+     * access key from its memory since it's considered expired based on
+     * the estimated expiration date.
+     */
+    if (ex instanceof InvalidBlockTokenException || ex instanceof InvalidToken) {
+      DFSClient.LOG.info("Access token was invalid when connecting to "
+          + targetAddr + " : " + ex);
+      return true;
+    }
+    return false;
+  }
   private Peer newTcpPeer(InetSocketAddress addr) throws IOException {
     Peer peer = null;
     boolean success = false;
@@ -1039,6 +1057,7 @@
    * @param bufferSize The IO buffer size (not the client buffer size)
    * @param verifyChecksum Whether to verify checksum
    * @param clientName Client name
+   * @param CachingStrategy caching strategy to use
    * @return New BlockReader instance
    */
   protected BlockReader getBlockReader(InetSocketAddress dnAddr,
@@ -1050,7 +1069,8 @@
       long len,
       int bufferSize,
       boolean verifyChecksum,
-      String clientName)
+      String clientName,
+      CachingStrategy curCachingStrategy)
       throws IOException {
     // Firstly, we check to see if we have cached any file descriptors for
     // local blocks. If so, we can just re-use those file descriptors.
@@ -1060,9 +1080,18 @@
         DFSClient.LOG.debug("got FileInputStreams for " + block + " from " +
             "the FileInputStreamCache.");
       }
-      return new BlockReaderLocal(dfsClient.getConf(), file,
-          block, startOffset, len, fis[0], fis[1], chosenNode, verifyChecksum,
-          fileInputStreamCache);
+      return new BlockReaderLocal.Builder(dfsClient.getConf()).
+          setFilename(file).
+          setBlock(block).
+          setStartOffset(startOffset).
+          setStreams(fis).
+          setDatanodeID(chosenNode).
+          setVerifyChecksum(verifyChecksum).
+          setBlockMetadataHeader(BlockMetadataHeader.
+              preadHeader(fis[1].getChannel())).
+          setFileInputStreamCache(fileInputStreamCache).
+          setCachingStrategy(curCachingStrategy).
+          build();
     }
     // If the legacy local block reader is enabled and we are reading a local
@@ -1096,7 +1125,7 @@
             dfsClient.getConf(), file, block, blockToken, startOffset,
             len, verifyChecksum, clientName, peer, chosenNode,
             dsFactory, peerCache, fileInputStreamCache,
-            allowShortCircuitLocalReads, cachingStrategy);
+            allowShortCircuitLocalReads, curCachingStrategy);
         return reader;
       } catch (IOException ex) {
         DFSClient.LOG.debug("Error making BlockReader with DomainSocket. " +
@@ -1119,7 +1148,7 @@
             dfsClient.getConf(), file, block, blockToken, startOffset,
             len, verifyChecksum, clientName, peer, chosenNode,
             dsFactory, peerCache, fileInputStreamCache,
-            allowShortCircuitLocalReads, cachingStrategy);
+            allowShortCircuitLocalReads, curCachingStrategy);
         return reader;
       } catch (IOException e) {
         DFSClient.LOG.warn("failed to connect to " + domSock, e);
@@ -1143,7 +1172,7 @@
             dfsClient.getConf(), file, block, blockToken, startOffset,
             len, verifyChecksum, clientName, peer, chosenNode,
             dsFactory, peerCache, fileInputStreamCache, false,
-            cachingStrategy);
+            curCachingStrategy);
         return reader;
       } catch (IOException ex) {
         DFSClient.LOG.debug("Error making BlockReader. Closing stale " +
@@ -1159,11 +1188,21 @@
     }
     // Try to create a new remote peer.
     Peer peer = newTcpPeer(dnAddr);
-    return BlockReaderFactory.newBlockReader(
-        dfsClient.getConf(), file, block, blockToken, startOffset,
-        len, verifyChecksum, clientName, peer, chosenNode,
-        dsFactory, peerCache, fileInputStreamCache, false,
-        cachingStrategy);
+    try {
+      reader = BlockReaderFactory.newBlockReader(dfsClient.getConf(), file,
+          block, blockToken, startOffset, len, verifyChecksum, clientName,
+          peer, chosenNode, dsFactory, peerCache, fileInputStreamCache, false,
+          curCachingStrategy);
+      return reader;
+    } catch (IOException ex) {
+      DFSClient.LOG.debug(
+          "Exception while getting block reader, closing stale " + peer, ex);
+      throw ex;
+    } finally {
+      if (reader == null) {
+        IOUtils.closeQuietly(peer);
+      }
+    }
   }
@@ -1344,7 +1383,7 @@
        * deadNodes and added currentNode again. Thats ok. */
       deadNodes.remove(oldNode);
     }
-    if (!oldNode.getStorageID().equals(newNode.getStorageID())) {
+    if (!oldNode.getDatanodeUuid().equals(newNode.getDatanodeUuid())) {
       currentNode = newNode;
       return true;
     } else {
@@ -1437,14 +1476,18 @@
   @Override
   public synchronized void setReadahead(Long readahead)
       throws IOException {
-    this.cachingStrategy.setReadahead(readahead);
+    this.cachingStrategy =
+        new CachingStrategy.Builder(this.cachingStrategy).
+            setReadahead(readahead).build();
     closeCurrentBlockReader();
   }
   @Override
   public synchronized void setDropBehind(Boolean dropBehind)
       throws IOException {
-    this.cachingStrategy.setDropBehind(dropBehind);
+    this.cachingStrategy =
+        new CachingStrategy.Builder(this.cachingStrategy).
+            setDropBehind(dropBehind).build();
     closeCurrentBlockReader();
   }
@@ -1466,23 +1509,19 @@
           "at position " + pos);
       }
     }
-    boolean canSkipChecksums = opts.contains(ReadOption.SKIP_CHECKSUMS);
-    if (canSkipChecksums) {
-      ByteBuffer buffer = tryReadZeroCopy(maxLength);
+    ByteBuffer buffer = tryReadZeroCopy(maxLength, opts);
     if (buffer != null) {
       return buffer;
     }
-    }
-    ByteBuffer buffer = ByteBufferUtil.
-        fallbackRead(this, bufferPool, maxLength);
+    buffer = ByteBufferUtil.fallbackRead(this, bufferPool, maxLength);
     if (buffer != null) {
       extendedReadBuffers.put(buffer, bufferPool);
     }
     return buffer;
   }
-  private synchronized ByteBuffer tryReadZeroCopy(int maxLength)
-      throws IOException {
+  private synchronized ByteBuffer tryReadZeroCopy(int maxLength,
+      EnumSet<ReadOption> opts) throws IOException {
     // Java ByteBuffers can't be longer than 2 GB, because they use
     // 4-byte signed integers to represent capacity, etc.
     // So we can't mmap the parts of the block higher than the 2 GB offset.
@@ -1505,8 +1544,7 @@
     long blockPos = curPos - blockStartInFile;
     long limit = blockPos + length;
     ClientMmap clientMmap =
-        blockReader.getClientMmap(currentLocatedBlock,
-            dfsClient.getMmapManager());
+        blockReader.getClientMmap(opts, dfsClient.getMmapManager());
     if (clientMmap == null) {
       if (DFSClient.LOG.isDebugEnabled()) {
         DFSClient.LOG.debug("unable to perform a zero-copy read from offset " +
@@ -150,7 +150,7 @@ public class DFSOutputStream extends FSOutputSummer
   private Progressable progress;
   private final short blockReplication; // replication factor of file
   private boolean shouldSyncBlock = false; // force blocks to disk upon close
-  private CachingStrategy cachingStrategy;
+  private AtomicReference<CachingStrategy> cachingStrategy;
   private boolean failPacket = false;
   private static class Packet {
@@ -312,6 +312,7 @@
     private DataInputStream blockReplyStream;
     private ResponseProcessor response = null;
     private volatile DatanodeInfo[] nodes = null; // list of targets for current block
+    private volatile String[] storageIDs = null;
     private LoadingCache<DatanodeInfo, DatanodeInfo> excludedNodes =
         CacheBuilder.newBuilder()
             .expireAfterWrite(
@@ -402,7 +403,7 @@
       }
       // setup pipeline to append to the last block XXX retries??
-      nodes = lastBlock.getLocations();
+      setPipeline(lastBlock);
       errorIndex = -1; // no errors yet.
       if (nodes.length < 1) {
         throw new IOException("Unable to retrieve blocks locations " +
@@ -412,6 +413,14 @@
       }
     }
+    private void setPipeline(LocatedBlock lb) {
+      setPipeline(lb.getLocations(), lb.getStorageIDs());
+    }
+    private void setPipeline(DatanodeInfo[] nodes, String[] storageIDs) {
+      this.nodes = nodes;
+      this.storageIDs = storageIDs;
+    }
     private void setFavoredNodes(String[] favoredNodes) {
       this.favoredNodes = favoredNodes;
     }
@@ -434,7 +443,7 @@
       this.setName("DataStreamer for file " + src);
       closeResponder();
       closeStream();
-      nodes = null;
+      setPipeline(null, null);
       stage = BlockConstructionStage.PIPELINE_SETUP_CREATE;
     }
@@ -503,7 +512,7 @@
           if(DFSClient.LOG.isDebugEnabled()) {
             DFSClient.LOG.debug("Allocating new block");
           }
-          nodes = nextBlockOutputStream();
+          setPipeline(nextBlockOutputStream());
           initDataStreaming();
         } else if (stage == BlockConstructionStage.PIPELINE_SETUP_APPEND) {
           if(DFSClient.LOG.isDebugEnabled()) {
@@ -917,9 +926,10 @@
       //get a new datanode
       final DatanodeInfo[] original = nodes;
       final LocatedBlock lb = dfsClient.namenode.getAdditionalDatanode(
-          src, block, nodes, failed.toArray(new DatanodeInfo[failed.size()]),
+          src, block, nodes, storageIDs,
+          failed.toArray(new DatanodeInfo[failed.size()]),
           1, dfsClient.clientName);
-      nodes = lb.getLocations();
+      setPipeline(lb);
       //find the new datanode
       final int d = findNewDatanode(original);
@@ -1019,7 +1029,14 @@
         System.arraycopy(nodes, 0, newnodes, 0, errorIndex);
         System.arraycopy(nodes, errorIndex+1, newnodes, errorIndex,
             newnodes.length-errorIndex);
-        nodes = newnodes;
+        final String[] newStorageIDs = new String[newnodes.length];
+        System.arraycopy(storageIDs, 0, newStorageIDs, 0, errorIndex);
+        System.arraycopy(storageIDs, errorIndex+1, newStorageIDs, errorIndex,
+            newStorageIDs.length-errorIndex);
+        setPipeline(newnodes, newStorageIDs);
         hasError = false;
         lastException.set(null);
         errorIndex = -1;
@@ -1055,7 +1072,8 @@
         // update pipeline at the namenode
         ExtendedBlock newBlock = new ExtendedBlock(
             block.getBlockPoolId(), block.getBlockId(), block.getNumBytes(), newGS);
-        dfsClient.namenode.updatePipeline(dfsClient.clientName, block, newBlock, nodes);
+        dfsClient.namenode.updatePipeline(dfsClient.clientName, block, newBlock,
+            nodes, storageIDs);
         // update client side generation stamp
         block = newBlock;
       }
@@ -1068,7 +1086,7 @@
      * Must get block ID and the IDs of the destinations from the namenode.
      * Returns the list of target datanodes.
      */
-    private DatanodeInfo[] nextBlockOutputStream() throws IOException {
+    private LocatedBlock nextBlockOutputStream() throws IOException {
       LocatedBlock lb = null;
       DatanodeInfo[] nodes = null;
       int count = dfsClient.getConf().nBlockWriteRetry;
@@ -1110,7 +1128,7 @@
       if (!success) {
         throw new IOException("Unable to create new block.");
       }
-      return nodes;
+      return lb;
     }
     // connects to the first datanode in the pipeline
@@ -1165,7 +1183,7 @@
         new Sender(out).writeBlock(block, accessToken, dfsClient.clientName,
             nodes, null, recoveryFlag? stage.getRecoveryStage() : stage,
             nodes.length, block.getNumBytes(), bytesSent, newGS, checksum,
-            cachingStrategy);
+            cachingStrategy.get());
         // receive ack for connect
         BlockOpResponseProto resp = BlockOpResponseProto.parseFrom(
@@ -1360,8 +1378,8 @@
     this.blockSize = stat.getBlockSize();
     this.blockReplication = stat.getReplication();
     this.progress = progress;
-    this.cachingStrategy =
-        dfsClient.getDefaultWriteCachingStrategy().duplicate();
+    this.cachingStrategy = new AtomicReference<CachingStrategy>(
+        dfsClient.getDefaultWriteCachingStrategy());
     if ((progress != null) && DFSClient.LOG.isDebugEnabled()) {
       DFSClient.LOG.debug(
           "Set non-null progress callback on DFSOutputStream " + src);
@@ -1975,7 +1993,14 @@
   @Override
   public void setDropBehind(Boolean dropBehind) throws IOException {
-    this.cachingStrategy.setDropBehind(dropBehind);
+    CachingStrategy prevStrategy, nextStrategy;
+    // CachingStrategy is immutable. So build a new CachingStrategy with the
+    // modifications we want, and compare-and-swap it in.
+    do {
+      prevStrategy = this.cachingStrategy.get();
+      nextStrategy = new CachingStrategy.Builder(prevStrategy).
+          setDropBehind(dropBehind).build();
+    } while (!this.cachingStrategy.compareAndSet(prevStrategy, nextStrategy));
   }
   @VisibleForTesting
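The setDropBehind rewrite above swaps an immutable CachingStrategy through an AtomicReference with compareAndSet. A standalone sketch of the same retry-until-swapped pattern; the types and names below are illustrative, not from the patch.

import java.util.concurrent.atomic.AtomicReference;

public class CasUpdateSketch {
  // Immutable value object, playing the role of CachingStrategy.
  static final class Settings {
    final boolean dropBehind;
    Settings(boolean dropBehind) { this.dropBehind = dropBehind; }
    Settings withDropBehind(boolean v) { return new Settings(v); }
  }

  private final AtomicReference<Settings> current =
      new AtomicReference<Settings>(new Settings(false));

  void setDropBehind(boolean dropBehind) {
    Settings prev, next;
    do {
      prev = current.get();
      next = prev.withDropBehind(dropBehind);
      // Retry if another thread swapped in a different Settings meanwhile.
    } while (!current.compareAndSet(prev, next));
  }
}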
@@ -145,6 +145,23 @@ public class DFSUtil {
     return SECURE_RANDOM.get();
   }
+  /** Shuffle the elements in the given array. */
+  public static <T> T[] shuffle(final T[] array) {
+    if (array != null && array.length > 0) {
+      final Random random = getRandom();
+      for (int n = array.length; n > 1; ) {
+        final int randomIndex = random.nextInt(n);
+        n--;
+        if (n != randomIndex) {
+          final T tmp = array[randomIndex];
+          array[randomIndex] = array[n];
+          array[n] = tmp;
+        }
+      }
+    }
+    return array;
+  }
   /**
    * Compartor for sorting DataNodeInfo[] based on decommissioned states.
    * Decommissioned nodes are moved to the end of the array on sorting with
@@ -1529,7 +1546,11 @@
    * Converts a time duration in milliseconds into DDD:HH:MM:SS format.
    */
   public static String durationToString(long durationMs) {
-    Preconditions.checkArgument(durationMs >= 0, "Invalid negative duration");
+    boolean negative = false;
+    if (durationMs < 0) {
+      negative = true;
+      durationMs = -durationMs;
+    }
     // Chop off the milliseconds
     long durationSec = durationMs / 1000;
     final int secondsPerMinute = 60;
@@ -1542,7 +1563,12 @@
     final long minutes = durationSec / secondsPerMinute;
     durationSec -= minutes * secondsPerMinute;
     final long seconds = durationSec;
-    return String.format("%03d:%02d:%02d:%02d", days, hours, minutes, seconds);
+    final long milliseconds = durationMs % 1000;
+    String format = "%03d:%02d:%02d:%02d.%03d";
+    if (negative) {
+      format = "-" + format;
+    }
+    return String.format(format, days, hours, minutes, seconds, milliseconds);
   }
   /**
@@ -1554,9 +1580,9 @@
           + ": too short");
     }
     String ttlString = relTime.substring(0, relTime.length()-1);
-    int ttl;
+    long ttl;
     try {
-      ttl = Integer.parseInt(ttlString);
+      ttl = Long.parseLong(ttlString);
     } catch (NumberFormatException e) {
       throw new IOException("Unable to parse relative time value of " + relTime
           + ": " + ttlString + " is not a number");
@@ -31,6 +31,7 @@ import org.apache.hadoop.classification.InterfaceStability;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.BlockLocation;
 import org.apache.hadoop.fs.BlockStorageLocation;
+import org.apache.hadoop.fs.CacheFlag;
 import org.apache.hadoop.fs.ContentSummary;
 import org.apache.hadoop.fs.CreateFlag;
 import org.apache.hadoop.fs.FSDataInputStream;
@@ -1585,40 +1586,56 @@ public class DistributedFileSystem extends FileSystem {
     }.resolve(this, absF);
   }
+  /**
+   * @see {@link #addCacheDirective(CacheDirectiveInfo, EnumSet)}
+   */
+  public long addCacheDirective(CacheDirectiveInfo info) throws IOException {
+    return addCacheDirective(info, EnumSet.noneOf(CacheFlag.class));
+  }
   /**
    * Add a new CacheDirective.
    *
    * @param info Information about a directive to add.
+   * @param flags {@link CacheFlag}s to use for this operation.
    * @return the ID of the directive that was created.
    * @throws IOException if the directive could not be added
    */
   public long addCacheDirective(
-      CacheDirectiveInfo info) throws IOException {
+      CacheDirectiveInfo info, EnumSet<CacheFlag> flags) throws IOException {
     Preconditions.checkNotNull(info.getPath());
     Path path = new Path(getPathName(fixRelativePart(info.getPath()))).
         makeQualified(getUri(), getWorkingDirectory());
     return dfs.addCacheDirective(
         new CacheDirectiveInfo.Builder(info).
             setPath(path).
-            build());
+            build(),
+        flags);
+  }
+  /**
+   * @see {@link #modifyCacheDirective(CacheDirectiveInfo, EnumSet)}
+   */
+  public void modifyCacheDirective(CacheDirectiveInfo info) throws IOException {
+    modifyCacheDirective(info, EnumSet.noneOf(CacheFlag.class));
   }
   /**
    * Modify a CacheDirective.
    *
-   * @param info Information about the directive to modify.
-   *             You must set the ID to indicate which CacheDirective you want
-   *             to modify.
+   * @param info Information about the directive to modify. You must set the ID
+   *             to indicate which CacheDirective you want to modify.
+   * @param flags {@link CacheFlag}s to use for this operation.
    * @throws IOException if the directive could not be modified
    */
   public void modifyCacheDirective(
-      CacheDirectiveInfo info) throws IOException {
+      CacheDirectiveInfo info, EnumSet<CacheFlag> flags) throws IOException {
     if (info.getPath() != null) {
       info = new CacheDirectiveInfo.Builder(info).
           setPath(new Path(getPathName(fixRelativePart(info.getPath()))).
               makeQualified(getUri(), getWorkingDirectory())).build();
     }
-    dfs.modifyCacheDirective(info);
+    dfs.modifyCacheDirective(info, flags);
   }
   /**
@@ -23,10 +23,12 @@ import java.io.DataInputStream;
 import java.io.DataOutputStream;
 import java.io.IOException;
 import java.nio.ByteBuffer;
+import java.util.EnumSet;
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.fs.FSInputChecker;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.ReadOption;
 import org.apache.hadoop.hdfs.client.ClientMmap;
 import org.apache.hadoop.hdfs.client.ClientMmapManager;
 import org.apache.hadoop.hdfs.net.Peer;
@@ -490,7 +492,7 @@ public class RemoteBlockReader extends FSInputChecker implements BlockReader {
   }
   @Override
-  public ClientMmap getClientMmap(LocatedBlock curBlock,
+  public ClientMmap getClientMmap(EnumSet<ReadOption> opts,
       ClientMmapManager mmapManager) {
     return null;
   }
@@ -25,10 +25,12 @@ import java.io.OutputStream;
 import java.net.InetSocketAddress;
 import java.nio.ByteBuffer;
 import java.nio.channels.ReadableByteChannel;
+import java.util.EnumSet;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.fs.ReadOption;
 import org.apache.hadoop.hdfs.client.ClientMmap;
 import org.apache.hadoop.hdfs.client.ClientMmapManager;
 import org.apache.hadoop.hdfs.net.Peer;
@@ -455,8 +457,8 @@ public class RemoteBlockReader2 implements BlockReader {
   }
   @Override
-  public ClientMmap getClientMmap(LocatedBlock curBlock,
-      ClientMmapManager manager) {
+  public ClientMmap getClientMmap(EnumSet<ReadOption> opts,
      ClientMmapManager mmapManager) {
     return null;
   }
 }
@@ -0,0 +1,35 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
/**
* Defines the types of supported storage media. The default storage
* medium is assumed to be DISK.
*/
@InterfaceAudience.Public
@InterfaceStability.Unstable
public enum StorageType {
DISK,
SSD;
public static StorageType DEFAULT = DISK;
}
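A trivial usage sketch of the new enum; nothing beyond what the file above declares.

import org.apache.hadoop.hdfs.StorageType;

public class StorageTypeSketch {
  public static void main(String[] args) {
    // DEFAULT is DISK unless a volume is explicitly tagged otherwise.
    System.out.println(StorageType.DEFAULT == StorageType.DISK); // true
  }
}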
@@ -19,10 +19,12 @@ package org.apache.hadoop.hdfs.client;
 import java.io.IOException;
 import java.net.URI;
+import java.util.EnumSet;
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.classification.InterfaceStability;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.CacheFlag;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.RemoteIterator;
@@ -131,25 +133,26 @@ public class HdfsAdmin {
    * Add a new CacheDirectiveInfo.
    *
    * @param info Information about a directive to add.
+   * @param flags {@link CacheFlag}s to use for this operation.
    * @return the ID of the directive that was created.
    * @throws IOException if the directive could not be added
    */
-  public long addCacheDirective(CacheDirectiveInfo info)
-      throws IOException {
-    return dfs.addCacheDirective(info);
+  public long addCacheDirective(CacheDirectiveInfo info,
+      EnumSet<CacheFlag> flags) throws IOException {
+    return dfs.addCacheDirective(info, flags);
   }
   /**
    * Modify a CacheDirective.
    *
-   * @param info Information about the directive to modify.
-   *             You must set the ID to indicate which CacheDirective you want
-   *             to modify.
+   * @param info Information about the directive to modify. You must set the ID
+   *             to indicate which CacheDirective you want to modify.
+   * @param flags {@link CacheFlag}s to use for this operation.
    * @throws IOException if the directive could not be modified
    */
-  public void modifyCacheDirective(CacheDirectiveInfo info)
-      throws IOException {
-    dfs.modifyCacheDirective(info);
+  public void modifyCacheDirective(CacheDirectiveInfo info,
+      EnumSet<CacheFlag> flags) throws IOException {
+    dfs.modifyCacheDirective(info, flags);
   }
   /**
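A hedged end-to-end sketch of the new HdfsAdmin signatures; the paths, pool name and replication below are hypothetical.

import java.util.EnumSet;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.CacheFlag;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.client.HdfsAdmin;
import org.apache.hadoop.hdfs.protocol.CacheDirectiveInfo;

public class CacheDirectiveAdminSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    HdfsAdmin admin = new HdfsAdmin(FileSystem.getDefaultUri(conf), conf);

    long id = admin.addCacheDirective(
        new CacheDirectiveInfo.Builder()
            .setPath(new Path("/datasets/hot"))   // hypothetical path
            .setPool("research")                  // hypothetical pool
            .build(),
        EnumSet.noneOf(CacheFlag.class));

    // Later: raise cached replication, ignoring the pool limit this once.
    admin.modifyCacheDirective(
        new CacheDirectiveInfo.Builder()
            .setId(id)
            .setReplication((short) 3)
            .build(),
        EnumSet.of(CacheFlag.FORCE));
  }
}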
@@ -19,7 +19,9 @@ package org.apache.hadoop.hdfs.protocol;
 import java.util.Iterator;
 import java.util.List;
+import java.util.Random;
+import com.google.common.annotations.VisibleForTesting;
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.classification.InterfaceStability;
 import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.ReplicaState;
@@ -250,33 +252,28 @@ public class BlockListAsLongs implements Iterable<Block> {
   }
   /**
-   * The block-id of the indexTh block
-   * @param index - the block whose block-id is desired
-   * @return the block-id
+   * Corrupt the generation stamp of the block with the given index.
+   * Not meant to be used outside of tests.
    */
-  @Deprecated
-  public long getBlockId(final int index) {
-    return blockId(index);
+  @VisibleForTesting
+  public long corruptBlockGSForTesting(final int blockIndex, Random rand) {
+    long oldGS = blockList[index2BlockId(blockIndex) + 2];
+    while (blockList[index2BlockId(blockIndex) + 2] == oldGS) {
+      blockList[index2BlockId(blockIndex) + 2] = rand.nextInt();
+    }
+    return oldGS;
   }
   /**
-   * The block-len of the indexTh block
-   * @param index - the block whose block-len is desired
-   * @return - the block-len
+   * Corrupt the length of the block with the given index by truncation.
+   * Not meant to be used outside of tests.
    */
-  @Deprecated
-  public long getBlockLen(final int index) {
-    return blockLength(index);
-  }
-  /**
-   * The generation stamp of the indexTh block
-   * @param index - the block whose block-len is desired
-   * @return - the generation stamp
-   */
-  @Deprecated
-  public long getBlockGenStamp(final int index) {
-    return blockGenerationStamp(index);
+  @VisibleForTesting
+  public long corruptBlockLengthForTesting(final int blockIndex, Random rand) {
+    long oldLength = blockList[index2BlockId(blockIndex) + 1];
+    blockList[index2BlockId(blockIndex) + 1] =
+        rand.nextInt((int) oldLength - 1);
+    return oldLength;
   }
   /**
@@ -52,6 +52,14 @@ public final class CacheDirective implements IntrusiveCollection.Element {
   private Element prev;
   private Element next;
+  public CacheDirective(CacheDirectiveInfo info) {
+    this(
+        info.getId(),
+        info.getPath().toUri().getPath(),
+        info.getReplication(),
+        info.getExpiration().getAbsoluteMillis());
+  }
   public CacheDirective(long id, String path,
       short replication, long expiryTime) {
     Preconditions.checkArgument(id > 0);
@@ -26,6 +26,8 @@ import org.apache.hadoop.classification.InterfaceStability;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hdfs.DFSUtil;
+import com.google.common.base.Preconditions;
 /**
  * Describes a path-based cache directive.
  */
@@ -138,11 +140,22 @@ public class CacheDirectiveInfo {
    */
   public static class Expiration {
-    /** Denotes a CacheDirectiveInfo that never expires **/
-    public static final int EXPIRY_NEVER = -1;
+    /**
+     * The maximum value we accept for a relative expiry.
+     */
+    public static final long MAX_RELATIVE_EXPIRY_MS =
+        Long.MAX_VALUE / 4; // This helps prevent weird overflow bugs
+    /**
+     * An relative Expiration that never expires.
+     */
+    public static final Expiration NEVER = newRelative(MAX_RELATIVE_EXPIRY_MS);
     /**
      * Create a new relative Expiration.
+     * <p>
+     * Use {@link Expiration#NEVER} to indicate an Expiration that never
+     * expires.
      *
      * @param ms how long until the CacheDirective expires, in milliseconds
      * @return A relative Expiration
@@ -153,6 +166,9 @@
     /**
      * Create a new absolute Expiration.
+     * <p>
+     * Use {@link Expiration#NEVER} to indicate an Expiration that never
+     * expires.
      *
      * @param date when the CacheDirective expires
      * @return An absolute Expiration
@@ -163,6 +179,9 @@
     /**
      * Create a new absolute Expiration.
+     * <p>
+     * Use {@link Expiration#NEVER} to indicate an Expiration that never
+     * expires.
      *
      * @param ms when the CacheDirective expires, in milliseconds since the Unix
      *           epoch.
@@ -176,6 +195,10 @@
     private final boolean isRelative;
     private Expiration(long ms, boolean isRelative) {
+      if (isRelative) {
+        Preconditions.checkArgument(ms <= MAX_RELATIVE_EXPIRY_MS,
+            "Expiration time is too far in the future!");
+      }
       this.ms = ms;
       this.isRelative = isRelative;
     }
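A sketch of how the new Expiration constants and factory methods might be used when building directives; the paths and pool names are made up.

import java.util.concurrent.TimeUnit;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.protocol.CacheDirectiveInfo;

public class ExpirationSketch {
  // Expires one week after the directive is added.
  public static CacheDirectiveInfo weekly() {
    return new CacheDirectiveInfo.Builder()
        .setPath(new Path("/reports/weekly"))
        .setPool("reports")
        .setExpiration(
            CacheDirectiveInfo.Expiration.newRelative(TimeUnit.DAYS.toMillis(7)))
        .build();
  }

  // NEVER is simply a relative expiry of MAX_RELATIVE_EXPIRY_MS; anything
  // larger is rejected by the Preconditions check in the constructor above.
  public static CacheDirectiveInfo pinned() {
    return new CacheDirectiveInfo.Builder()
        .setPath(new Path("/reference/dim_tables"))
        .setPool("reports")
        .setExpiration(CacheDirectiveInfo.Expiration.NEVER)
        .build();
  }
}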
@@ -18,8 +18,6 @@
 package org.apache.hadoop.hdfs.protocol;
-import java.io.DataInput;
-import java.io.DataOutput;
 import java.io.IOException;
 import javax.annotation.Nullable;
@@ -32,14 +30,7 @@ import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.classification.InterfaceStability;
 import org.apache.hadoop.fs.InvalidRequestException;
 import org.apache.hadoop.fs.permission.FsPermission;
-import org.apache.hadoop.fs.permission.PermissionStatus;
-import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp;
-import org.apache.hadoop.hdfs.util.XMLUtils;
-import org.apache.hadoop.hdfs.util.XMLUtils.InvalidXmlException;
-import org.apache.hadoop.hdfs.util.XMLUtils.Stanza;
-import org.apache.hadoop.io.Text;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
+import org.apache.hadoop.hdfs.protocol.CacheDirectiveInfo.Expiration;
 /**
  * CachePoolInfo describes a cache pool.
@@ -52,6 +43,20 @@ import org.xml.sax.SAXException;
 public class CachePoolInfo {
   public static final Log LOG = LogFactory.getLog(CachePoolInfo.class);
+  /**
+   * Indicates that the pool does not have a maximum relative expiry.
+   */
+  public static final long RELATIVE_EXPIRY_NEVER =
+      Expiration.MAX_RELATIVE_EXPIRY_MS;
+  /**
+   * Default max relative expiry for cache pools.
+   */
+  public static final long DEFAULT_MAX_RELATIVE_EXPIRY =
+      RELATIVE_EXPIRY_NEVER;
+  public static final long LIMIT_UNLIMITED = Long.MAX_VALUE;
+  public static final long DEFAULT_LIMIT = LIMIT_UNLIMITED;
   final String poolName;
   @Nullable
@@ -64,16 +69,26 @@ public class CachePoolInfo {
   FsPermission mode;
   @Nullable
-  Integer weight;
+  Long limit;
+  @Nullable
+  Long maxRelativeExpiryMs;
   public CachePoolInfo(String poolName) {
     this.poolName = poolName;
   }
+  /**
+   * @return Name of the pool.
+   */
   public String getPoolName() {
     return poolName;
   }
+  /**
+   * @return The owner of the pool. Along with the group and mode, determines
+   *         who has access to view and modify the pool.
+   */
   public String getOwnerName() {
     return ownerName;
   }
@@ -83,6 +98,10 @@
     return this;
   }
+  /**
+   * @return The group of the pool. Along with the owner and mode, determines
+   *         who has access to view and modify the pool.
+   */
   public String getGroupName() {
     return groupName;
   }
@@ -92,6 +111,10 @@
     return this;
   }
+  /**
+   * @return Unix-style permissions of the pool. Along with the owner and group,
+   *         determines who has access to view and modify the pool.
+   */
   public FsPermission getMode() {
     return mode;
   }
@@ -101,12 +124,36 @@
     return this;
   }
-  public Integer getWeight() {
-    return weight;
+  /**
+   * @return The maximum aggregate number of bytes that can be cached by
+   *         directives in this pool.
+   */
+  public Long getLimit() {
+    return limit;
   }
-  public CachePoolInfo setWeight(Integer weight) {
-    this.weight = weight;
+  public CachePoolInfo setLimit(Long bytes) {
+    this.limit = bytes;
+    return this;
+  }
+  /**
+   * @return The maximum relative expiration of directives of this pool in
+   *         milliseconds
+   */
+  public Long getMaxRelativeExpiryMs() {
+    return maxRelativeExpiryMs;
+  }
+  /**
+   * Set the maximum relative expiration of directives of this pool in
+   * milliseconds.
+   *
+   * @param ms in milliseconds
+   * @return This builder, for call chaining.
+   */
+  public CachePoolInfo setMaxRelativeExpiryMs(Long ms) {
+    this.maxRelativeExpiryMs = ms;
     return this;
   }
@@ -117,7 +164,8 @@
         append(", groupName:").append(groupName).
         append(", mode:").append((mode == null) ? "null" :
           String.format("0%03o", mode.toShort())).
-        append(", weight:").append(weight).
+        append(", limit:").append(limit).
+        append(", maxRelativeExpiryMs:").append(maxRelativeExpiryMs).
         append("}").toString();
} }
@ -134,7 +182,8 @@ public class CachePoolInfo {
append(ownerName, other.ownerName). append(ownerName, other.ownerName).
append(groupName, other.groupName). append(groupName, other.groupName).
append(mode, other.mode). append(mode, other.mode).
append(weight, other.weight). append(limit, other.limit).
append(maxRelativeExpiryMs, other.maxRelativeExpiryMs).
isEquals(); isEquals();
} }
@ -145,7 +194,8 @@ public class CachePoolInfo {
append(ownerName). append(ownerName).
append(groupName). append(groupName).
append(mode). append(mode).
append(weight). append(limit).
append(maxRelativeExpiryMs).
hashCode(); hashCode();
} }
@ -153,8 +203,17 @@ public class CachePoolInfo {
if (info == null) { if (info == null) {
throw new InvalidRequestException("CachePoolInfo is null"); throw new InvalidRequestException("CachePoolInfo is null");
} }
if ((info.getWeight() != null) && (info.getWeight() < 0)) { if ((info.getLimit() != null) && (info.getLimit() < 0)) {
throw new InvalidRequestException("CachePool weight is negative."); throw new InvalidRequestException("Limit is negative.");
}
if (info.getMaxRelativeExpiryMs() != null) {
long maxRelativeExpiryMs = info.getMaxRelativeExpiryMs();
if (maxRelativeExpiryMs < 0l) {
throw new InvalidRequestException("Max relative expiry is negative.");
}
if (maxRelativeExpiryMs > Expiration.MAX_RELATIVE_EXPIRY_MS) {
throw new InvalidRequestException("Max relative expiry is too big.");
}
} }
validateName(info.poolName); validateName(info.poolName);
} }
@ -167,66 +226,4 @@ public class CachePoolInfo {
throw new IOException("invalid empty cache pool name"); throw new IOException("invalid empty cache pool name");
} }
} }
public static CachePoolInfo readFrom(DataInput in) throws IOException {
String poolName = Text.readString(in);
CachePoolInfo info = new CachePoolInfo(poolName);
if (in.readBoolean()) {
info.setOwnerName(Text.readString(in));
}
if (in.readBoolean()) {
info.setGroupName(Text.readString(in));
}
if (in.readBoolean()) {
info.setMode(FsPermission.read(in));
}
if (in.readBoolean()) {
info.setWeight(in.readInt());
}
return info;
}
public void writeTo(DataOutput out) throws IOException {
Text.writeString(out, poolName);
boolean hasOwner, hasGroup, hasMode, hasWeight;
hasOwner = ownerName != null;
hasGroup = groupName != null;
hasMode = mode != null;
hasWeight = weight != null;
out.writeBoolean(hasOwner);
if (hasOwner) {
Text.writeString(out, ownerName);
}
out.writeBoolean(hasGroup);
if (hasGroup) {
Text.writeString(out, groupName);
}
out.writeBoolean(hasMode);
if (hasMode) {
mode.write(out);
}
out.writeBoolean(hasWeight);
if (hasWeight) {
out.writeInt(weight);
}
}
public void writeXmlTo(ContentHandler contentHandler) throws SAXException {
XMLUtils.addSaxString(contentHandler, "POOLNAME", poolName);
PermissionStatus perm = new PermissionStatus(ownerName,
groupName, mode);
FSEditLogOp.permissionStatusToXml(contentHandler, perm);
XMLUtils.addSaxString(contentHandler, "WEIGHT", Integer.toString(weight));
}
public static CachePoolInfo readXmlFrom(Stanza st) throws InvalidXmlException {
String poolName = st.getValue("POOLNAME");
PermissionStatus perm = FSEditLogOp.permissionStatusFromXml(st);
int weight = Integer.parseInt(st.getValue("WEIGHT"));
return new CachePoolInfo(poolName).
setOwnerName(perm.getUserName()).
setGroupName(perm.getGroupName()).
setMode(perm.getPermission()).
setWeight(weight);
}
} }
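Illustrative sketch, not part of this patch: setting the new limit and maxRelativeExpiryMs fields on a pool and running them through the static validation helper whose body is shown above (pool name, owner, and sizes are made up).

    import java.io.IOException;
    import org.apache.hadoop.fs.permission.FsPermission;
    import org.apache.hadoop.hdfs.protocol.CachePoolInfo;

    class CachePoolInfoExample {
      static CachePoolInfo analyticsPool() throws IOException {
        CachePoolInfo pool = new CachePoolInfo("analytics")
            .setOwnerName("etl")
            .setGroupName("hadoop")
            .setMode(new FsPermission((short) 0755))
            .setLimit(10L * 1024 * 1024 * 1024)            // cache at most 10 GB
            .setMaxRelativeExpiryMs(24L * 60 * 60 * 1000); // directives live at most a day
        CachePoolInfo.validate(pool); // rejects negative limits and oversized expiries
        return pool;
      }
    }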

CachePoolStats.java

@@ -30,6 +30,7 @@ public class CachePoolStats {
 public static class Builder {
 private long bytesNeeded;
 private long bytesCached;
+private long bytesOverlimit;
 private long filesNeeded;
 private long filesCached;

@@ -46,6 +47,11 @@ public class CachePoolStats {
 return this;
 }
+public Builder setBytesOverlimit(long bytesOverlimit) {
+this.bytesOverlimit = bytesOverlimit;
+return this;
+}
 public Builder setFilesNeeded(long filesNeeded) {
 this.filesNeeded = filesNeeded;
 return this;

@@ -57,20 +63,22 @@ public class CachePoolStats {
 }
 public CachePoolStats build() {
-return new CachePoolStats(bytesNeeded, bytesCached, filesNeeded,
-filesCached);
+return new CachePoolStats(bytesNeeded, bytesCached, bytesOverlimit,
+filesNeeded, filesCached);
 }
 };
 private final long bytesNeeded;
 private final long bytesCached;
+private final long bytesOverlimit;
 private final long filesNeeded;
 private final long filesCached;
-private CachePoolStats(long bytesNeeded, long bytesCached, long filesNeeded,
-long filesCached) {
+private CachePoolStats(long bytesNeeded, long bytesCached,
+long bytesOverlimit, long filesNeeded, long filesCached) {
 this.bytesNeeded = bytesNeeded;
 this.bytesCached = bytesCached;
+this.bytesOverlimit = bytesOverlimit;
 this.filesNeeded = filesNeeded;
 this.filesCached = filesCached;
 }

@@ -83,6 +91,10 @@ public class CachePoolStats {
 return bytesCached;
 }
+public long getBytesOverlimit() {
+return bytesOverlimit;
+}
 public long getFilesNeeded() {
 return filesNeeded;
 }

@@ -95,6 +107,7 @@ public class CachePoolStats {
 return new StringBuilder().append("{").
 append("bytesNeeded:").append(bytesNeeded).
 append(", bytesCached:").append(bytesCached).
+append(", bytesOverlimit:").append(bytesOverlimit).
 append(", filesNeeded:").append(filesNeeded).
 append(", filesCached:").append(filesCached).
 append("}").toString();

ClientProtocol.java

@@ -19,9 +19,11 @@ package org.apache.hadoop.hdfs.protocol;
 import java.io.FileNotFoundException;
 import java.io.IOException;
+import java.util.EnumSet;
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.fs.CacheFlag;
 import org.apache.hadoop.fs.ContentSummary;
 import org.apache.hadoop.fs.CreateFlag;
 import org.apache.hadoop.fs.FileAlreadyExistsException;

@@ -354,7 +356,8 @@ public interface ClientProtocol {
 */
 @Idempotent
 public LocatedBlock getAdditionalDatanode(final String src, final ExtendedBlock blk,
-final DatanodeInfo[] existings, final DatanodeInfo[] excludes,
+final DatanodeInfo[] existings, final String[] existingStorageIDs,
+final DatanodeInfo[] excludes,
 final int numAdditionalNodes, final String clientName
 ) throws AccessControlException, FileNotFoundException,
 SafeModeException, UnresolvedLinkException, IOException;

@@ -983,7 +986,7 @@ public interface ClientProtocol {
 */
 @AtMostOnce
 public void updatePipeline(String clientName, ExtendedBlock oldBlock,
-ExtendedBlock newBlock, DatanodeID[] newNodes)
+ExtendedBlock newBlock, DatanodeID[] newNodes, String[] newStorageIDs)
 throws IOException;
 /**

@@ -1099,23 +1102,24 @@ public interface ClientProtocol {
 * Add a CacheDirective to the CacheManager.
 *
 * @param directive A CacheDirectiveInfo to be added
+ * @param flags {@link CacheFlag}s to use for this operation.
 * @return A CacheDirectiveInfo associated with the added directive
 * @throws IOException if the directive could not be added
 */
 @AtMostOnce
-public long addCacheDirective(
-CacheDirectiveInfo directive) throws IOException;
+public long addCacheDirective(CacheDirectiveInfo directive,
+EnumSet<CacheFlag> flags) throws IOException;
 /**
 * Modify a CacheDirective in the CacheManager.
 *
-* @return directive The directive to modify. Must contain
-* a directive ID.
+* @return directive The directive to modify. Must contain a directive ID.
+* @param flags {@link CacheFlag}s to use for this operation.
 * @throws IOException if the directive could not be modified
 */
 @AtMostOnce
-public void modifyCacheDirective(
-CacheDirectiveInfo directive) throws IOException;
+public void modifyCacheDirective(CacheDirectiveInfo directive,
+EnumSet<CacheFlag> flags) throws IOException;
 /**
 * Remove a CacheDirectiveInfo from the CacheManager.
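Illustrative sketch, not part of this patch: calling the revised addCacheDirective signature. CacheFlag.FORCE is the only flag visible in this change; the helper method and its name are hypothetical.

    import java.io.IOException;
    import java.util.EnumSet;
    import org.apache.hadoop.fs.CacheFlag;
    import org.apache.hadoop.hdfs.protocol.CacheDirectiveInfo;
    import org.apache.hadoop.hdfs.protocol.ClientProtocol;

    class AddDirectiveExample {
      // Pass EnumSet.noneOf(CacheFlag.class) to keep the default behaviour.
      static long addForced(ClientProtocol namenode, CacheDirectiveInfo directive)
          throws IOException {
        return namenode.addCacheDirective(directive, EnumSet.of(CacheFlag.FORCE));
      }
    }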

DatanodeID.java

@@ -21,6 +21,8 @@ package org.apache.hadoop.hdfs.protocol;
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.classification.InterfaceStability;
+import com.google.common.annotations.VisibleForTesting;
 /**
  * This class represents the primary identifier for a Datanode.
  * Datanodes are identified by how they can be contacted (hostname

@@ -40,16 +42,22 @@ public class DatanodeID implements Comparable<DatanodeID> {
 private String ipAddr; // IP address
 private String hostName; // hostname claimed by datanode
 private String peerHostName; // hostname from the actual connection
-private String storageID; // unique per cluster storageID
 private int xferPort; // data streaming port
 private int infoPort; // info server port
 private int infoSecurePort; // info server port
 private int ipcPort; // IPC server port
+/**
+ * UUID identifying a given datanode. For upgraded Datanodes this is the
+ * same as the StorageID that was previously used by this Datanode.
+ * For newly formatted Datanodes it is a UUID.
+ */
+private String datanodeUuid = null;
 public DatanodeID(DatanodeID from) {
 this(from.getIpAddr(),
 from.getHostName(),
-from.getStorageID(),
+from.getDatanodeUuid(),
 from.getXferPort(),
 from.getInfoPort(),
 from.getInfoSecurePort(),

@@ -61,16 +69,19 @@ public class DatanodeID implements Comparable<DatanodeID> {
 * Create a DatanodeID
 * @param ipAddr IP
 * @param hostName hostname
-* @param storageID data storage ID
+* @param datanodeUuid data node ID, UUID for new Datanodes, may be the
+* storage ID for pre-UUID datanodes. NULL if unknown
+* e.g. if this is a new datanode. A new UUID will
+* be assigned by the namenode.
 * @param xferPort data transfer port
 * @param infoPort info server port
 * @param ipcPort ipc server port
 */
-public DatanodeID(String ipAddr, String hostName, String storageID,
+public DatanodeID(String ipAddr, String hostName, String datanodeUuid,
 int xferPort, int infoPort, int infoSecurePort, int ipcPort) {
 this.ipAddr = ipAddr;
 this.hostName = hostName;
-this.storageID = storageID;
+this.datanodeUuid = checkDatanodeUuid(datanodeUuid);
 this.xferPort = xferPort;
 this.infoPort = infoPort;
 this.infoSecurePort = infoSecurePort;

@@ -85,8 +96,24 @@ public class DatanodeID implements Comparable<DatanodeID> {
 this.peerHostName = peerHostName;
 }
-public void setStorageID(String storageID) {
-this.storageID = storageID;
+/**
+ * @return data node ID.
+ */
+public String getDatanodeUuid() {
+return datanodeUuid;
+}
+@VisibleForTesting
+public void setDatanodeUuidForTesting(String datanodeUuid) {
+this.datanodeUuid = datanodeUuid;
+}
+private String checkDatanodeUuid(String uuid) {
+if (uuid == null || uuid.isEmpty()) {
+return null;
+} else {
+return uuid;
+}
 }
 /**

@@ -168,13 +195,6 @@ public class DatanodeID implements Comparable<DatanodeID> {
 return useHostname ? getIpcAddrWithHostname() : getIpcAddr();
 }
-/**
- * @return data storage ID.
- */
-public String getStorageID() {
-return storageID;
-}
 /**
  * @return xferPort (the port for data streaming)
  */

@@ -212,12 +232,12 @@ public class DatanodeID implements Comparable<DatanodeID> {
 return false;
 }
 return (getXferAddr().equals(((DatanodeID)to).getXferAddr()) &&
-storageID.equals(((DatanodeID)to).getStorageID()));
+datanodeUuid.equals(((DatanodeID)to).getDatanodeUuid()));
 }
 @Override
 public int hashCode() {
-return getXferAddr().hashCode()^ storageID.hashCode();
+return getXferAddr().hashCode()^ datanodeUuid.hashCode();
 }
 @Override
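Illustrative sketch, not part of this patch: the constructor now takes a datanodeUuid where it used to take a storageID; the addresses, ports, and IDs below are made up.

    import org.apache.hadoop.hdfs.protocol.DatanodeID;

    class DatanodeIDExample {
      static DatanodeID newlyFormatted() {
        // A new datanode does not know its UUID yet; null (or "") leaves it unset
        // until the namenode assigns one at registration.
        return new DatanodeID("10.0.0.7", "dn7.example.com", null,
            50010, 50075, 50475, 50020);
      }

      static DatanodeID upgraded() {
        // An upgraded datanode keeps reporting its old StorageID as its UUID.
        return new DatanodeID("10.0.0.8", "dn8.example.com",
            "DS-1073741825-10.0.0.8-50010-1380148112372",
            50010, 50075, 50475, 50020);
      }
    }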

DatanodeInfo.java

@@ -115,7 +115,7 @@ public class DatanodeInfo extends DatanodeID implements Node {
 final long blockPoolUsed, final long cacheCapacity, final long cacheUsed,
 final long lastUpdate, final int xceiverCount,
 final AdminStates adminState) {
-this(nodeID.getIpAddr(), nodeID.getHostName(), nodeID.getStorageID(),
+this(nodeID.getIpAddr(), nodeID.getHostName(), nodeID.getDatanodeUuid(),
 nodeID.getXferPort(), nodeID.getInfoPort(), nodeID.getInfoSecurePort(),
 nodeID.getIpcPort(), capacity, dfsUsed, remaining, blockPoolUsed,
 cacheCapacity, cacheUsed, lastUpdate, xceiverCount, location,

@@ -124,13 +124,13 @@ public class DatanodeInfo extends DatanodeID implements Node {
 /** Constructor */
 public DatanodeInfo(final String ipAddr, final String hostName,
-final String storageID, final int xferPort, final int infoPort,
+final String datanodeUuid, final int xferPort, final int infoPort,
 final int infoSecurePort, final int ipcPort,
 final long capacity, final long dfsUsed, final long remaining,
 final long blockPoolUsed, final long cacheCapacity, final long cacheUsed,
 final long lastUpdate, final int xceiverCount,
 final String networkLocation, final AdminStates adminState) {
-super(ipAddr, hostName, storageID, xferPort, infoPort,
+super(ipAddr, hostName, datanodeUuid, xferPort, infoPort,
 infoSecurePort, ipcPort);
 this.capacity = capacity;
 this.dfsUsed = dfsUsed;

LayoutVersion.java

@@ -107,7 +107,10 @@ public class LayoutVersion {
 "block IDs in the edits log and image files"),
 EDITLOG_SUPPORT_RETRYCACHE(-47, "Record ClientId and CallId in editlog to "
 + "enable rebuilding retry cache in case of HA failover"),
-CACHING(-48, "Support for cache pools and path-based caching");
+CACHING(-48, "Support for cache pools and path-based caching"),
+ADD_DATANODE_AND_STORAGE_UUIDS(-49, "Replace StorageID with DatanodeUuid."
+    + " Use distinct StorageUuid per storage directory.");
 final int lv;
 final int ancestorLV;

@@ -248,3 +251,4 @@ public class LayoutVersion {
 throw new AssertionError("All layout versions are reserved.");
 }
 }

LocatedBlock.java

@@ -21,7 +21,9 @@ import java.util.List;
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.hdfs.StorageType;
 import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier;
+import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeStorageInfo;
 import org.apache.hadoop.security.token.Token;
 import com.google.common.base.Preconditions;

@@ -40,6 +42,10 @@ public class LocatedBlock {
 private ExtendedBlock b;
 private long offset; // offset of the first byte of the block in the file
 private DatanodeInfo[] locs;
+/** Storage ID for each replica */
+private String[] storageIDs;
+// Storage type for each replica, if reported.
+private StorageType[] storageTypes;
 // corrupt flag is true if all of the replicas of a block are corrupt.
 // else false. If block has few corrupt replicas, they are filtered and
 // their locations are not part of this object

@@ -54,19 +60,33 @@ public class LocatedBlock {
 private static final DatanodeInfo[] EMPTY_LOCS = new DatanodeInfo[0];
 public LocatedBlock(ExtendedBlock b, DatanodeInfo[] locs) {
-this(b, locs, -1); // startOffset is unknown
-}
-public LocatedBlock(ExtendedBlock b, DatanodeInfo[] locs, long startOffset) {
-this(b, locs, startOffset, false);
+this(b, locs, -1, false); // startOffset is unknown
 }
 public LocatedBlock(ExtendedBlock b, DatanodeInfo[] locs, long startOffset,
 boolean corrupt) {
-this(b, locs, startOffset, corrupt, EMPTY_LOCS);
+this(b, locs, null, null, startOffset, corrupt, EMPTY_LOCS);
 }
-public LocatedBlock(ExtendedBlock b, DatanodeInfo[] locs, long startOffset,
+public LocatedBlock(ExtendedBlock b, DatanodeStorageInfo[] storages) {
+this(b, storages, -1, false); // startOffset is unknown
+}
+public LocatedBlock(ExtendedBlock b, DatanodeInfo[] locs,
+String[] storageIDs, StorageType[] storageTypes) {
+this(b, locs, storageIDs, storageTypes, -1, false, EMPTY_LOCS);
+}
+public LocatedBlock(ExtendedBlock b, DatanodeStorageInfo[] storages,
+long startOffset, boolean corrupt) {
+this(b, DatanodeStorageInfo.toDatanodeInfos(storages),
+DatanodeStorageInfo.toStorageIDs(storages),
+DatanodeStorageInfo.toStorageTypes(storages),
+startOffset, corrupt, EMPTY_LOCS); // startOffset is unknown
+}
+public LocatedBlock(ExtendedBlock b, DatanodeInfo[] locs, String[] storageIDs,
+StorageType[] storageTypes, long startOffset,
 boolean corrupt, DatanodeInfo[] cachedLocs) {
 this.b = b;
 this.offset = startOffset;

@@ -76,6 +96,8 @@ public class LocatedBlock {
 } else {
 this.locs = locs;
 }
+this.storageIDs = storageIDs;
+this.storageTypes = storageTypes;
 Preconditions.checkArgument(cachedLocs != null,
 "cachedLocs should not be null, use a different constructor");
 if (cachedLocs.length == 0) {

@@ -101,6 +123,14 @@ public class LocatedBlock {
 return locs;
 }
+public StorageType[] getStorageTypes() {
+return storageTypes;
+}
+public String[] getStorageIDs() {
+return storageIDs;
+}
 public long getStartOffset() {
 return offset;
 }

@@ -161,3 +191,4 @@ public class LocatedBlock {
 + "}";
 }
 }
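Illustrative sketch, not part of this patch: reading the per-replica storage information that LocatedBlock now carries. Both getters may return null (for example when talking to an older server), and defaulting to DISK here is purely this example's choice.

    import org.apache.hadoop.hdfs.StorageType;
    import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
    import org.apache.hadoop.hdfs.protocol.LocatedBlock;

    class ReplicaStorageExample {
      static void printReplicaStorage(LocatedBlock block) {
        DatanodeInfo[] locs = block.getLocations();
        String[] storageIDs = block.getStorageIDs();
        StorageType[] storageTypes = block.getStorageTypes();
        for (int i = 0; i < locs.length; i++) {
          String id = (storageIDs == null) ? "unknown" : storageIDs[i];
          StorageType type = (storageTypes == null) ? StorageType.DISK : storageTypes[i];
          System.out.println(locs[i].getXferAddr() + " -> " + id + " (" + type + ")");
        }
      }
    }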

UnregisteredNodeException.java

@@ -51,7 +51,7 @@ public class UnregisteredNodeException extends IOException {
 */
 public UnregisteredNodeException(DatanodeID nodeID, DatanodeInfo storedNode) {
 super("Data node " + nodeID + " is attempting to report storage ID "
-+ nodeID.getStorageID() + ". Node "
++ nodeID.getDatanodeUuid() + ". Node "
 + storedNode + " is expected to serve this storage.");
 }
 }

ClientNamenodeProtocolServerSideTranslatorPB.java

@@ -320,7 +320,7 @@ public class ClientNamenodeProtocolServerSideTranslatorPB implements
 try {
 HdfsFileStatus result = server.create(req.getSrc(),
 PBHelper.convert(req.getMasked()), req.getClientName(),
-PBHelper.convert(req.getCreateFlag()), req.getCreateParent(),
+PBHelper.convertCreateFlag(req.getCreateFlag()), req.getCreateParent(),
 (short) req.getReplication(), req.getBlockSize());
 if (result != null) {

@@ -425,11 +425,14 @@ public class ClientNamenodeProtocolServerSideTranslatorPB implements
 throws ServiceException {
 try {
 List<DatanodeInfoProto> existingList = req.getExistingsList();
+List<String> existingStorageIDsList = req.getExistingStorageUuidsList();
 List<DatanodeInfoProto> excludesList = req.getExcludesList();
-LocatedBlock result = server.getAdditionalDatanode(
-req.getSrc(), PBHelper.convert(req.getBlk()),
+LocatedBlock result = server.getAdditionalDatanode(req.getSrc(),
+PBHelper.convert(req.getBlk()),
 PBHelper.convert(existingList.toArray(
 new DatanodeInfoProto[existingList.size()])),
+existingStorageIDsList.toArray(
+new String[existingStorageIDsList.size()]),
 PBHelper.convert(excludesList.toArray(
 new DatanodeInfoProto[excludesList.size()])),
 req.getNumAdditionalNodes(), req.getClientName());

@@ -833,10 +836,12 @@ public class ClientNamenodeProtocolServerSideTranslatorPB implements
 UpdatePipelineRequestProto req) throws ServiceException {
 try {
 List<DatanodeIDProto> newNodes = req.getNewNodesList();
-server
-.updatePipeline(req.getClientName(), PBHelper.convert(req
-.getOldBlock()), PBHelper.convert(req.getNewBlock()), PBHelper
-.convert(newNodes.toArray(new DatanodeIDProto[newNodes.size()])));
+List<String> newStorageIDs = req.getStorageIDsList();
+server.updatePipeline(req.getClientName(),
+PBHelper.convert(req.getOldBlock()),
+PBHelper.convert(req.getNewBlock()),
+PBHelper.convert(newNodes.toArray(new DatanodeIDProto[newNodes.size()])),
+newStorageIDs.toArray(new String[newStorageIDs.size()]));
 return VOID_UPDATEPIPELINE_RESPONSE;
 } catch (IOException e) {
 throw new ServiceException(e);

@@ -1029,9 +1034,11 @@ public class ClientNamenodeProtocolServerSideTranslatorPB implements
 RpcController controller, AddCacheDirectiveRequestProto request)
 throws ServiceException {
 try {
+long id = server.addCacheDirective(
+PBHelper.convert(request.getInfo()),
+PBHelper.convertCacheFlags(request.getCacheFlags()));
 return AddCacheDirectiveResponseProto.newBuilder().
-setId(server.addCacheDirective(
-PBHelper.convert(request.getInfo()))).build();
+setId(id).build();
 } catch (IOException e) {
 throw new ServiceException(e);
 }

@@ -1043,7 +1050,8 @@ public class ClientNamenodeProtocolServerSideTranslatorPB implements
 throws ServiceException {
 try {
 server.modifyCacheDirective(
-PBHelper.convert(request.getInfo()));
+PBHelper.convert(request.getInfo()),
+PBHelper.convertCacheFlags(request.getCacheFlags()));
 return ModifyCacheDirectiveResponseProto.newBuilder().build();
 } catch (IOException e) {
 throw new ServiceException(e);

ClientNamenodeProtocolTranslatorPB.java

@@ -21,10 +21,12 @@ import java.io.Closeable;
 import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.util.Arrays;
+import java.util.EnumSet;
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.classification.InterfaceStability;
 import org.apache.hadoop.fs.BatchedRemoteIterator.BatchedEntries;
+import org.apache.hadoop.fs.CacheFlag;
 import org.apache.hadoop.fs.ContentSummary;
 import org.apache.hadoop.fs.CreateFlag;
 import org.apache.hadoop.fs.FileAlreadyExistsException;

@@ -351,7 +353,8 @@ public class ClientNamenodeProtocolTranslatorPB implements
 @Override
 public LocatedBlock getAdditionalDatanode(String src, ExtendedBlock blk,
-DatanodeInfo[] existings, DatanodeInfo[] excludes,
+DatanodeInfo[] existings, String[] existingStorageIDs,
+DatanodeInfo[] excludes,
 int numAdditionalNodes, String clientName) throws AccessControlException,
 FileNotFoundException, SafeModeException, UnresolvedLinkException,
 IOException {

@@ -360,6 +363,7 @@ public class ClientNamenodeProtocolTranslatorPB implements
 .setSrc(src)
 .setBlk(PBHelper.convert(blk))
 .addAllExistings(PBHelper.convert(existings))
+.addAllExistingStorageUuids(Arrays.asList(existingStorageIDs))
 .addAllExcludes(PBHelper.convert(excludes))
 .setNumAdditionalNodes(numAdditionalNodes)
 .setClientName(clientName)

@@ -796,12 +800,13 @@ public class ClientNamenodeProtocolTranslatorPB implements
 @Override
 public void updatePipeline(String clientName, ExtendedBlock oldBlock,
-ExtendedBlock newBlock, DatanodeID[] newNodes) throws IOException {
+ExtendedBlock newBlock, DatanodeID[] newNodes, String[] storageIDs) throws IOException {
 UpdatePipelineRequestProto req = UpdatePipelineRequestProto.newBuilder()
 .setClientName(clientName)
 .setOldBlock(PBHelper.convert(oldBlock))
 .setNewBlock(PBHelper.convert(newBlock))
 .addAllNewNodes(Arrays.asList(PBHelper.convert(newNodes)))
+.addAllStorageIDs(storageIDs == null ? null : Arrays.asList(storageIDs))
 .build();
 try {
 rpcProxy.updatePipeline(null, req);

@@ -1000,24 +1005,32 @@ public class ClientNamenodeProtocolTranslatorPB implements
 }
 @Override
-public long addCacheDirective(
-CacheDirectiveInfo directive) throws IOException {
+public long addCacheDirective(CacheDirectiveInfo directive,
+EnumSet<CacheFlag> flags) throws IOException {
 try {
-return rpcProxy.addCacheDirective(null,
+AddCacheDirectiveRequestProto.Builder builder =
 AddCacheDirectiveRequestProto.newBuilder().
-setInfo(PBHelper.convert(directive)).build()).getId();
+setInfo(PBHelper.convert(directive));
+if (!flags.isEmpty()) {
+builder.setCacheFlags(PBHelper.convertCacheFlags(flags));
+}
+return rpcProxy.addCacheDirective(null, builder.build()).getId();
 } catch (ServiceException e) {
 throw ProtobufHelper.getRemoteException(e);
 }
 }
 @Override
-public void modifyCacheDirective(
-CacheDirectiveInfo directive) throws IOException {
+public void modifyCacheDirective(CacheDirectiveInfo directive,
+EnumSet<CacheFlag> flags) throws IOException {
 try {
-rpcProxy.modifyCacheDirective(null,
+ModifyCacheDirectiveRequestProto.Builder builder =
 ModifyCacheDirectiveRequestProto.newBuilder().
-setInfo(PBHelper.convert(directive)).build());
+setInfo(PBHelper.convert(directive));
+if (!flags.isEmpty()) {
+builder.setCacheFlags(PBHelper.convertCacheFlags(flags));
+}
+rpcProxy.modifyCacheDirective(null, builder.build());
 } catch (ServiceException e) {
 throw ProtobufHelper.getRemoteException(e);
 }

DatanodeProtocolClientSideTranslatorPB.java

@@ -245,7 +245,7 @@ public class DatanodeProtocolClientSideTranslatorPB implements
 for (StorageReceivedDeletedBlocks storageBlock : receivedAndDeletedBlocks) {
 StorageReceivedDeletedBlocksProto.Builder repBuilder =
 StorageReceivedDeletedBlocksProto.newBuilder();
-repBuilder.setStorageID(storageBlock.getStorageID());
+repBuilder.setStorageUuid(storageBlock.getStorageID());
 for (ReceivedDeletedBlockInfo rdBlock : storageBlock.getBlocks()) {
 repBuilder.addBlocks(PBHelper.convert(rdBlock));
 }

DatanodeProtocolServerSideTranslatorPB.java

@@ -42,7 +42,6 @@ import org.apache.hadoop.hdfs.protocol.proto.DatanodeProtocolProtos.ReportBadBlo
 import org.apache.hadoop.hdfs.protocol.proto.DatanodeProtocolProtos.ReportBadBlocksResponseProto;
 import org.apache.hadoop.hdfs.protocol.proto.DatanodeProtocolProtos.StorageBlockReportProto;
 import org.apache.hadoop.hdfs.protocol.proto.DatanodeProtocolProtos.StorageReceivedDeletedBlocksProto;
-import org.apache.hadoop.hdfs.protocol.proto.DatanodeProtocolProtos.StorageReportProto;
 import org.apache.hadoop.hdfs.protocol.proto.HdfsProtos.DatanodeIDProto;
 import org.apache.hadoop.hdfs.protocol.proto.HdfsProtos.LocatedBlockProto;
 import org.apache.hadoop.hdfs.protocol.proto.HdfsProtos.VersionRequestProto;

@@ -102,14 +101,8 @@ public class DatanodeProtocolServerSideTranslatorPB implements
 HeartbeatRequestProto request) throws ServiceException {
 HeartbeatResponse response;
 try {
-List<StorageReportProto> list = request.getReportsList();
-StorageReport[] report = new StorageReport[list.size()];
-int i = 0;
-for (StorageReportProto p : list) {
-report[i++] = new StorageReport(p.getStorageID(), p.getFailed(),
-p.getCapacity(), p.getDfsUsed(), p.getRemaining(),
-p.getBlockPoolUsed());
-}
+final StorageReport[] report = PBHelper.convertStorageReports(
+request.getReportsList());
 response = impl.sendHeartbeat(PBHelper.convert(request.getRegistration()),
 report, request.getCacheCapacity(), request.getCacheUsed(),
 request.getXmitsInProgress(),

@@ -198,7 +191,7 @@ public class DatanodeProtocolServerSideTranslatorPB implements
 for (int j = 0; j < list.size(); j++) {
 rdBlocks[j] = PBHelper.convert(list.get(j));
 }
-info[i] = new StorageReceivedDeletedBlocks(sBlock.getStorageID(), rdBlocks);
+info[i] = new StorageReceivedDeletedBlocks(sBlock.getStorageUuid(), rdBlocks);
 }
 try {
 impl.blockReceivedAndDeleted(PBHelper.convert(request.getRegistration()),

InterDatanodeProtocolServerSideTranslatorPB.java

@@ -82,6 +82,6 @@ public class InterDatanodeProtocolServerSideTranslatorPB implements
 throw new ServiceException(e);
 }
 return UpdateReplicaUnderRecoveryResponseProto.newBuilder()
-.setStorageID(storageID).build();
+.setStorageUuid(storageID).build();
 }
 }

InterDatanodeProtocolTranslatorPB.java

@@ -109,7 +109,7 @@ public class InterDatanodeProtocolTranslatorPB implements
 .setNewLength(newLength).setRecoveryId(recoveryId).build();
 try {
 return rpcProxy.updateReplicaUnderRecovery(NULL_CONTROLLER, req
-).getStorageID();
+).getStorageUuid();
 } catch (ServiceException e) {
 throw ProtobufHelper.getRemoteException(e);
 }

PBHelper.java

@@ -27,6 +27,7 @@ import java.util.Arrays;
 import java.util.EnumSet;
 import java.util.List;
+import org.apache.hadoop.fs.CacheFlag;
 import org.apache.hadoop.fs.ContentSummary;
 import org.apache.hadoop.fs.CreateFlag;
 import org.apache.hadoop.fs.FsServerDefaults;

@@ -35,6 +36,7 @@ import org.apache.hadoop.fs.permission.FsPermission;
 import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
 import org.apache.hadoop.ha.proto.HAServiceProtocolProtos;
 import org.apache.hadoop.hdfs.DFSUtil;
+import org.apache.hadoop.hdfs.StorageType;
 import org.apache.hadoop.hdfs.protocol.Block;
 import org.apache.hadoop.hdfs.protocol.CacheDirectiveEntry;
 import org.apache.hadoop.hdfs.protocol.CacheDirectiveStats;

@@ -52,17 +54,18 @@ import org.apache.hadoop.hdfs.protocol.DirectoryListing;
 import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
 import org.apache.hadoop.hdfs.protocol.HdfsConstants.DatanodeReportType;
 import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction;
-import org.apache.hadoop.hdfs.protocol.SnapshotDiffReport.DiffReportEntry;
-import org.apache.hadoop.hdfs.protocol.SnapshotDiffReport.DiffType;
 import org.apache.hadoop.hdfs.protocol.HdfsFileStatus;
 import org.apache.hadoop.hdfs.protocol.HdfsLocatedFileStatus;
 import org.apache.hadoop.hdfs.protocol.LocatedBlock;
 import org.apache.hadoop.hdfs.protocol.LocatedBlocks;
+import org.apache.hadoop.hdfs.protocol.SnapshotDiffReport.DiffReportEntry;
+import org.apache.hadoop.hdfs.protocol.SnapshotDiffReport.DiffType;
 import org.apache.hadoop.hdfs.protocol.SnapshottableDirectoryStatus;
 import org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos;
 import org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos.CacheDirectiveEntryProto;
 import org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos.CacheDirectiveInfoExpirationProto;
 import org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos.CacheDirectiveStatsProto;
+import org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos.CacheFlagProto;
 import org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos.CachePoolEntryProto;
 import org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos.CachePoolInfoProto;
 import org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos.CachePoolStatsProto;

@@ -122,6 +125,8 @@ import org.apache.hadoop.hdfs.protocol.proto.HdfsProtos.SnapshotDiffReportProto;
 import org.apache.hadoop.hdfs.protocol.proto.HdfsProtos.SnapshottableDirectoryListingProto;
 import org.apache.hadoop.hdfs.protocol.proto.HdfsProtos.SnapshottableDirectoryStatusProto;
 import org.apache.hadoop.hdfs.protocol.proto.HdfsProtos.StorageInfoProto;
+import org.apache.hadoop.hdfs.protocol.proto.HdfsProtos.StorageTypeProto;
+import org.apache.hadoop.hdfs.protocol.proto.HdfsProtos.StorageUuidsProto;
 import org.apache.hadoop.hdfs.protocol.proto.JournalProtocolProtos.JournalInfoProto;
 import org.apache.hadoop.hdfs.security.token.block.BlockKey;
 import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier;

@@ -242,17 +247,20 @@ public class PBHelper {
 // DatanodeId
 public static DatanodeID convert(DatanodeIDProto dn) {
-return new DatanodeID(dn.getIpAddr(), dn.getHostName(), dn.getStorageID(),
+return new DatanodeID(dn.getIpAddr(), dn.getHostName(), dn.getDatanodeUuid(),
 dn.getXferPort(), dn.getInfoPort(), dn.hasInfoSecurePort() ? dn
 .getInfoSecurePort() : 0, dn.getIpcPort());
 }
 public static DatanodeIDProto convert(DatanodeID dn) {
+// For wire compatibility with older versions we transmit the StorageID
+// which is the same as the DatanodeUuid. Since StorageID is a required
+// field we pass the empty string if the DatanodeUuid is not yet known.
 return DatanodeIDProto.newBuilder()
 .setIpAddr(dn.getIpAddr())
 .setHostName(dn.getHostName())
-.setStorageID(dn.getStorageID())
 .setXferPort(dn.getXferPort())
+.setDatanodeUuid(dn.getDatanodeUuid() != null ? dn.getDatanodeUuid() : "")
 .setInfoPort(dn.getInfoPort())
 .setInfoSecurePort(dn.getInfoSecurePort())
 .setIpcPort(dn.getIpcPort()).build();

@@ -294,12 +302,16 @@ public class PBHelper {
 public static BlockWithLocationsProto convert(BlockWithLocations blk) {
 return BlockWithLocationsProto.newBuilder()
 .setBlock(convert(blk.getBlock()))
-.addAllStorageIDs(Arrays.asList(blk.getStorageIDs())).build();
+.addAllDatanodeUuids(Arrays.asList(blk.getDatanodeUuids()))
+.addAllStorageUuids(Arrays.asList(blk.getStorageIDs())).build();
 }
 public static BlockWithLocations convert(BlockWithLocationsProto b) {
-return new BlockWithLocations(convert(b.getBlock()), b.getStorageIDsList()
-.toArray(new String[0]));
+final List<String> datanodeUuids = b.getDatanodeUuidsList();
+final List<String> storageUuids = b.getStorageUuidsList();
+return new BlockWithLocations(convert(b.getBlock()),
+datanodeUuids.toArray(new String[datanodeUuids.size()]),
+storageUuids.toArray(new String[storageUuids.size()]));
 }
 public static BlocksWithLocationsProto convert(BlocksWithLocations blks) {

@@ -499,21 +511,7 @@ public class PBHelper {
 static public DatanodeInfoProto convertDatanodeInfo(DatanodeInfo di) {
 if (di == null) return null;
-DatanodeInfoProto.Builder builder = DatanodeInfoProto.newBuilder();
-if (di.getNetworkLocation() != null) {
-builder.setLocation(di.getNetworkLocation());
-}
-return builder.
-setId(PBHelper.convert((DatanodeID) di)).
-setCapacity(di.getCapacity()).
-setDfsUsed(di.getDfsUsed()).
-setRemaining(di.getRemaining()).
-setBlockPoolUsed(di.getBlockPoolUsed()).
-setLastUpdate(di.getLastUpdate()).
-setXceiverCount(di.getXceiverCount()).
-setAdminState(PBHelper.convert(di.getAdminState())).
-build();
+return convert(di);
 }

@@ -557,15 +555,20 @@
 public static DatanodeInfoProto convert(DatanodeInfo info) {
 DatanodeInfoProto.Builder builder = DatanodeInfoProto.newBuilder();
-builder.setBlockPoolUsed(info.getBlockPoolUsed());
-builder.setAdminState(PBHelper.convert(info.getAdminState()));
-builder.setCapacity(info.getCapacity())
-.setDfsUsed(info.getDfsUsed())
+if (info.getNetworkLocation() != null) {
+builder.setLocation(info.getNetworkLocation());
+}
+builder
 .setId(PBHelper.convert((DatanodeID)info))
-.setLastUpdate(info.getLastUpdate())
-.setLocation(info.getNetworkLocation())
+.setCapacity(info.getCapacity())
+.setDfsUsed(info.getDfsUsed())
 .setRemaining(info.getRemaining())
+.setBlockPoolUsed(info.getBlockPoolUsed())
+.setCacheCapacity(info.getCacheCapacity())
+.setCacheUsed(info.getCacheUsed())
+.setLastUpdate(info.getLastUpdate())
 .setXceiverCount(info.getXceiverCount())
+.setAdminState(PBHelper.convert(info.getAdminState()))
 .build();
 return builder.build();
 }
@@ -601,6 +604,17 @@
 "Found additional cached replica locations that are not in the set of"
 + " storage-backed locations!");
+StorageType[] storageTypes = b.getStorageTypes();
+if (storageTypes != null) {
+for (int i = 0; i < storageTypes.length; ++i) {
+builder.addStorageTypes(PBHelper.convertStorageType(storageTypes[i]));
+}
+}
+final String[] storageIDs = b.getStorageIDs();
+if (storageIDs != null) {
+builder.addAllStorageIDs(Arrays.asList(storageIDs));
+}
 return builder.setB(PBHelper.convert(b.getBlock()))
 .setBlockToken(PBHelper.convert(b.getBlockToken()))
 .setCorrupt(b.isCorrupt()).setOffset(b.getStartOffset()).build();

@@ -613,6 +627,25 @@
 for (int i = 0; i < locs.size(); i++) {
 targets[i] = PBHelper.convert(locs.get(i));
 }
+final int storageTypesCount = proto.getStorageTypesCount();
+final StorageType[] storageTypes;
+if (storageTypesCount == 0) {
+storageTypes = null;
+} else {
+Preconditions.checkState(storageTypesCount == locs.size());
+storageTypes = convertStorageTypeProtos(proto.getStorageTypesList());
+}
+final int storageIDsCount = proto.getStorageIDsCount();
+final String[] storageIDs;
+if (storageIDsCount == 0) {
+storageIDs = null;
+} else {
+Preconditions.checkState(storageIDsCount == locs.size());
+storageIDs = proto.getStorageIDsList().toArray(new String[storageIDsCount]);
+}
 // Set values from the isCached list, re-using references from loc
 List<DatanodeInfo> cachedLocs = new ArrayList<DatanodeInfo>(locs.size());
 List<Boolean> isCachedList = proto.getIsCachedList();

@@ -623,7 +656,7 @@
 }
 LocatedBlock lb = new LocatedBlock(PBHelper.convert(proto.getB()), targets,
-proto.getOffset(), proto.getCorrupt(),
+storageIDs, storageTypes, proto.getOffset(), proto.getCorrupt(),
 cachedLocs.toArray(new DatanodeInfo[0]));
 lb.setBlockToken(PBHelper.convert(proto.getBlockToken()));
@@ -766,7 +799,8 @@
 for (int i = 0; i < blocks.length; i++) {
 builder.addBlocks(PBHelper.convert(blocks[i]));
 }
-builder.addAllTargets(PBHelper.convert(cmd.getTargets()));
+builder.addAllTargets(convert(cmd.getTargets()))
+.addAllTargetStorageUuids(convert(cmd.getTargetStorageIDs()));
 return builder.build();
 }

@@ -799,6 +833,15 @@
 return Arrays.asList(ret);
 }
+private static List<StorageUuidsProto> convert(String[][] targetStorageUuids) {
+StorageUuidsProto[] ret = new StorageUuidsProto[targetStorageUuids.length];
+for (int i = 0; i < targetStorageUuids.length; i++) {
+ret[i] = StorageUuidsProto.newBuilder()
+.addAllStorageUuids(Arrays.asList(targetStorageUuids[i])).build();
+}
+return Arrays.asList(ret);
+}
 public static DatanodeCommandProto convert(DatanodeCommand datanodeCommand) {
 DatanodeCommandProto.Builder builder = DatanodeCommandProto.newBuilder();
 if (datanodeCommand == null) {

@@ -878,6 +921,14 @@
 for (int i = 0; i < targetList.size(); i++) {
 targets[i] = PBHelper.convert(targetList.get(i));
 }
+List<StorageUuidsProto> targetStorageUuidsList = blkCmd.getTargetStorageUuidsList();
+String[][] targetStorageIDs = new String[targetStorageUuidsList.size()][];
+for(int i = 0; i < targetStorageIDs.length; i++) {
+List<String> storageIDs = targetStorageUuidsList.get(i).getStorageUuidsList();
+targetStorageIDs[i] = storageIDs.toArray(new String[storageIDs.size()]);
+}
 int action = DatanodeProtocol.DNA_UNKNOWN;
 switch (blkCmd.getAction()) {
 case TRANSFER:

@@ -892,7 +943,8 @@
 default:
 throw new AssertionError("Unknown action type: " + blkCmd.getAction());
 }
-return new BlockCommand(action, blkCmd.getBlockPoolId(), blocks, targets);
+return new BlockCommand(action, blkCmd.getBlockPoolId(), blocks, targets,
+targetStorageIDs);
 }
 public static BlockIdCommand convert(BlockIdCommandProto blkIdCmd) {
@@ -1123,7 +1175,7 @@
 return value;
 }
-public static EnumSetWritable<CreateFlag> convert(int flag) {
+public static EnumSetWritable<CreateFlag> convertCreateFlag(int flag) {
 EnumSet<CreateFlag> result =
 EnumSet.noneOf(CreateFlag.class);
 if ((flag & CreateFlagProto.APPEND_VALUE) == CreateFlagProto.APPEND_VALUE) {

@@ -1139,6 +1191,22 @@
 return new EnumSetWritable<CreateFlag>(result);
 }
+public static int convertCacheFlags(EnumSet<CacheFlag> flags) {
+int value = 0;
+if (flags.contains(CacheFlag.FORCE)) {
+value |= CacheFlagProto.FORCE.getNumber();
+}
+return value;
+}
+public static EnumSet<CacheFlag> convertCacheFlags(int flags) {
+EnumSet<CacheFlag> result = EnumSet.noneOf(CacheFlag.class);
+if ((flags & CacheFlagProto.FORCE_VALUE) == CacheFlagProto.FORCE_VALUE) {
+result.add(CacheFlag.FORCE);
+}
+return result;
+}
 public static HdfsFileStatus convert(HdfsFileStatusProto fs) {
 if (fs == null)
 return null;
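Illustrative sketch, not part of this patch: the two convertCacheFlags overloads added above round-trip the flag set through the integer bit mask carried in the protobuf request messages.

    import java.util.EnumSet;
    import org.apache.hadoop.fs.CacheFlag;
    import org.apache.hadoop.hdfs.protocolPB.PBHelper;

    class CacheFlagRoundTrip {
      static void check() {
        EnumSet<CacheFlag> flags = EnumSet.of(CacheFlag.FORCE);
        int wire = PBHelper.convertCacheFlags(flags);          // encode to the proto bit mask
        EnumSet<CacheFlag> decoded = PBHelper.convertCacheFlags(wire);
        if (!decoded.equals(flags)) {
          throw new AssertionError("flags did not survive the round trip");
        }
      }
    }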
@@ -1422,11 +1490,12 @@ public class PBHelper {
   public static DatanodeStorageProto convert(DatanodeStorage s) {
     return DatanodeStorageProto.newBuilder()
-        .setState(PBHelper.convert(s.getState()))
-        .setStorageID(s.getStorageID()).build();
+        .setState(PBHelper.convertState(s.getState()))
+        .setStorageType(PBHelper.convertStorageType(s.getStorageType()))
+        .setStorageUuid(s.getStorageID()).build();
   }

-  private static StorageState convert(State state) {
+  private static StorageState convertState(State state) {
     switch(state) {
     case READ_ONLY:
       return StorageState.READ_ONLY;
@@ -1436,11 +1505,26 @@ public class PBHelper {
     }
   }

-  public static DatanodeStorage convert(DatanodeStorageProto s) {
-    return new DatanodeStorage(s.getStorageID(), PBHelper.convert(s.getState()));
+  private static StorageTypeProto convertStorageType(
+      StorageType type) {
+    switch(type) {
+    case DISK:
+      return StorageTypeProto.DISK;
+    case SSD:
+      return StorageTypeProto.SSD;
+    default:
+      throw new IllegalStateException(
+          "BUG: StorageType not found, type=" + type);
+    }
   }

-  private static State convert(StorageState state) {
+  public static DatanodeStorage convert(DatanodeStorageProto s) {
+    return new DatanodeStorage(s.getStorageUuid(),
+        PBHelper.convertState(s.getState()),
+        PBHelper.convertType(s.getStorageType()));
+  }
+
+  private static State convertState(StorageState state) {
     switch(state) {
     case READ_ONLY:
       return DatanodeStorage.State.READ_ONLY;
@@ -1450,14 +1534,50 @@ public class PBHelper {
     }
   }
private static StorageType convertType(StorageTypeProto type) {
switch(type) {
case DISK:
return StorageType.DISK;
case SSD:
return StorageType.SSD;
default:
throw new IllegalStateException(
"BUG: StorageTypeProto not found, type=" + type);
}
}
private static StorageType[] convertStorageTypeProtos(
List<StorageTypeProto> storageTypesList) {
final StorageType[] storageTypes = new StorageType[storageTypesList.size()];
for (int i = 0; i < storageTypes.length; ++i) {
storageTypes[i] = PBHelper.convertType(storageTypesList.get(i));
}
return storageTypes;
}
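
Illustrative round trip of the enum mapping added above, using local stand-in enums instead of the real StorageType and generated StorageTypeProto; the unknown-value branches throw, mirroring the defensive default cases in the commit:

    import java.util.Arrays;
    import java.util.List;

    public class StorageTypeSketch {
      enum StorageType { DISK, SSD }        // stand-in for the HDFS enum
      enum StorageTypeProto { DISK, SSD }   // stand-in for the generated proto enum

      static StorageTypeProto toProto(StorageType t) {
        switch (t) {
        case DISK: return StorageTypeProto.DISK;
        case SSD:  return StorageTypeProto.SSD;
        default:   throw new IllegalStateException("BUG: StorageType not found, type=" + t);
        }
      }

      static StorageType fromProto(StorageTypeProto t) {
        switch (t) {
        case DISK: return StorageType.DISK;
        case SSD:  return StorageType.SSD;
        default:   throw new IllegalStateException("BUG: StorageTypeProto not found, type=" + t);
        }
      }

      // Same list-of-protos to array pattern as convertStorageTypeProtos.
      static StorageType[] fromProtos(List<StorageTypeProto> protos) {
        StorageType[] out = new StorageType[protos.size()];
        for (int i = 0; i < out.length; i++) {
          out[i] = fromProto(protos.get(i));
        }
        return out;
      }

      public static void main(String[] args) {
        // prints [SSD, DISK]
        System.out.println(Arrays.toString(
            fromProtos(Arrays.asList(StorageTypeProto.SSD, StorageTypeProto.DISK))));
      }
    }
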
   public static StorageReportProto convert(StorageReport r) {
     StorageReportProto.Builder builder = StorageReportProto.newBuilder()
         .setBlockPoolUsed(r.getBlockPoolUsed()).setCapacity(r.getCapacity())
         .setDfsUsed(r.getDfsUsed()).setRemaining(r.getRemaining())
-        .setStorageID(r.getStorageID());
+        .setStorageUuid(r.getStorageID());
     return builder.build();
   }
public static StorageReport convert(StorageReportProto p) {
return new StorageReport(p.getStorageUuid(), p.getFailed(),
p.getCapacity(), p.getDfsUsed(), p.getRemaining(),
p.getBlockPoolUsed());
}
public static StorageReport[] convertStorageReports(
List<StorageReportProto> list) {
final StorageReport[] report = new StorageReport[list.size()];
for (int i = 0; i < report.length; i++) {
report[i] = convert(list.get(i));
}
return report;
}
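
The convert overloads above and the storage-type conversion earlier repeat the same list-of-protos to typed-array loop. A generic helper along these lines would factor it out; this is purely illustrative and not part of PBHelper:

    import java.util.Arrays;
    import java.util.List;
    import java.util.function.Function;
    import java.util.function.IntFunction;

    public class ConvertAllSketch {
      // Converts each proto element and packs the results into a typed array.
      static <P, T> T[] convertAll(List<P> protos, Function<P, T> convert,
          IntFunction<T[]> newArray) {
        T[] out = newArray.apply(protos.size());
        for (int i = 0; i < out.length; i++) {
          out[i] = convert.apply(protos.get(i));
        }
        return out;
      }

      public static void main(String[] args) {
        Integer[] lengths =
            convertAll(Arrays.asList("a", "bb", "ccc"), String::length, Integer[]::new);
        System.out.println(Arrays.toString(lengths)); // prints [1, 2, 3]
      }
    }

Keeping the hand-written loops, as the commit does, avoids the array-constructor plumbing and stays consistent with the surrounding Java 7 style.
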
public static JournalInfo convert(JournalInfoProto info) { public static JournalInfo convert(JournalInfoProto info) {
int lv = info.hasLayoutVersion() ? info.getLayoutVersion() : 0; int lv = info.hasLayoutVersion() ? info.getLayoutVersion() : 0;
int nsID = info.hasNamespaceID() ? info.getNamespaceID() : 0; int nsID = info.hasNamespaceID() ? info.getNamespaceID() : 0;
@ -1684,8 +1804,11 @@ public class PBHelper {
if (info.getMode() != null) { if (info.getMode() != null) {
builder.setMode(info.getMode().toShort()); builder.setMode(info.getMode().toShort());
} }
-    if (info.getWeight() != null) {
-      builder.setWeight(info.getWeight());
+    if (info.getLimit() != null) {
+      builder.setLimit(info.getLimit());
+    }
+    if (info.getMaxRelativeExpiryMs() != null) {
+      builder.setMaxRelativeExpiry(info.getMaxRelativeExpiryMs());
     }
return builder.build(); return builder.build();
} }
@ -1703,8 +1826,11 @@ public class PBHelper {
if (proto.hasMode()) { if (proto.hasMode()) {
info.setMode(new FsPermission((short)proto.getMode())); info.setMode(new FsPermission((short)proto.getMode()));
} }
-    if (proto.hasWeight()) {
-      info.setWeight(proto.getWeight());
+    if (proto.hasLimit()) {
+      info.setLimit(proto.getLimit());
+    }
+    if (proto.hasMaxRelativeExpiry()) {
+      info.setMaxRelativeExpiryMs(proto.getMaxRelativeExpiry());
     }
return info; return info;
} }
@ -1713,6 +1839,7 @@ public class PBHelper {
CachePoolStatsProto.Builder builder = CachePoolStatsProto.newBuilder(); CachePoolStatsProto.Builder builder = CachePoolStatsProto.newBuilder();
builder.setBytesNeeded(stats.getBytesNeeded()); builder.setBytesNeeded(stats.getBytesNeeded());
builder.setBytesCached(stats.getBytesCached()); builder.setBytesCached(stats.getBytesCached());
builder.setBytesOverlimit(stats.getBytesOverlimit());
builder.setFilesNeeded(stats.getFilesNeeded()); builder.setFilesNeeded(stats.getFilesNeeded());
builder.setFilesCached(stats.getFilesCached()); builder.setFilesCached(stats.getFilesCached());
return builder.build(); return builder.build();
@ -1722,6 +1849,7 @@ public class PBHelper {
CachePoolStats.Builder builder = new CachePoolStats.Builder(); CachePoolStats.Builder builder = new CachePoolStats.Builder();
builder.setBytesNeeded(proto.getBytesNeeded()); builder.setBytesNeeded(proto.getBytesNeeded());
builder.setBytesCached(proto.getBytesCached()); builder.setBytesCached(proto.getBytesCached());
builder.setBytesOverlimit(proto.getBytesOverlimit());
builder.setFilesNeeded(proto.getFilesNeeded()); builder.setFilesNeeded(proto.getFilesNeeded());
builder.setFilesCached(proto.getFilesCached()); builder.setFilesCached(proto.getFilesCached());
return builder.build(); return builder.build();
@ -1756,3 +1884,4 @@ public class PBHelper {
return new ExactSizeInputStream(input, size); return new ExactSizeInputStream(input, size);
} }
} }

@ -18,7 +18,6 @@
package org.apache.hadoop.hdfs.server.balancer; package org.apache.hadoop.hdfs.server.balancer;
import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.base.Preconditions.checkArgument;
import static org.apache.hadoop.hdfs.protocolPB.PBHelper.vintPrefixed; import static org.apache.hadoop.hdfs.protocolPB.PBHelper.vintPrefixed;
import java.io.BufferedInputStream; import java.io.BufferedInputStream;
@ -221,8 +220,8 @@ public class Balancer {
private Map<Block, BalancerBlock> globalBlockList private Map<Block, BalancerBlock> globalBlockList
= new HashMap<Block, BalancerBlock>(); = new HashMap<Block, BalancerBlock>();
private MovedBlocks movedBlocks = new MovedBlocks(); private MovedBlocks movedBlocks = new MovedBlocks();
-  // Map storage IDs to BalancerDatanodes
-  private Map<String, BalancerDatanode> datanodes
+  /** Map (datanodeUuid -> BalancerDatanodes) */
+  private final Map<String, BalancerDatanode> datanodeMap
       = new HashMap<String, BalancerDatanode>();
private NetworkTopology cluster; private NetworkTopology cluster;
@ -241,6 +240,14 @@ public class Balancer {
private PendingBlockMove() { private PendingBlockMove() {
} }
@Override
public String toString() {
final Block b = block.getBlock();
return b + " with size=" + b.getNumBytes() + " from "
+ source.getDisplayName() + " to " + target.getDisplayName()
+ " through " + proxySource.getDisplayName();
}
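
The log statements changed in the hunks below all lean on this new toString(); a tiny standalone illustration of the pattern, with hypothetical literal values standing in for the real block, source, target, and proxy objects:

    public class PendingMoveLogSketch {
      static class PendingMove {
        final String block = "blk_1", source = "dn1", target = "dn2", proxy = "dn3";
        final long size = 1024;
        @Override
        public String toString() {
          // One description shared by the "decided", "success", and "failure" logs.
          return block + " with size=" + size + " from " + source
              + " to " + target + " through " + proxy;
        }
      }

      public static void main(String[] args) {
        PendingMove m = new PendingMove();
        System.out.println("Decided to move " + m);
        System.out.println("Successfully moved " + m);
      }
    }
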
/* choose a block & a proxy source for this pendingMove /* choose a block & a proxy source for this pendingMove
* whose source & target have already been chosen. * whose source & target have already been chosen.
* *
@ -272,11 +279,7 @@ public class Balancer {
if ( chooseProxySource() ) { if ( chooseProxySource() ) {
movedBlocks.add(block); movedBlocks.add(block);
if (LOG.isDebugEnabled()) { if (LOG.isDebugEnabled()) {
LOG.debug("Decided to move block "+ block.getBlockId() LOG.debug("Decided to move " + this);
+" with a length of "+StringUtils.byteDesc(block.getNumBytes())
+ " bytes from " + source.getDisplayName()
+ " to " + target.getDisplayName()
+ " using proxy source " + proxySource.getDisplayName() );
} }
return true; return true;
} }
@ -353,17 +356,9 @@ public class Balancer {
sendRequest(out); sendRequest(out);
receiveResponse(in); receiveResponse(in);
bytesMoved.inc(block.getNumBytes()); bytesMoved.inc(block.getNumBytes());
LOG.info( "Moving block " + block.getBlock().getBlockId() + LOG.info("Successfully moved " + this);
" from "+ source.getDisplayName() + " to " +
target.getDisplayName() + " through " +
proxySource.getDisplayName() +
" is succeeded." );
} catch (IOException e) { } catch (IOException e) {
LOG.warn("Error moving block "+block.getBlockId()+ LOG.warn("Failed to move " + this + ": " + e.getMessage());
" from " + source.getDisplayName() + " to " +
target.getDisplayName() + " through " +
proxySource.getDisplayName() +
": "+e.getMessage());
} finally { } finally {
IOUtils.closeStream(out); IOUtils.closeStream(out);
IOUtils.closeStream(in); IOUtils.closeStream(in);
@ -415,9 +410,7 @@ public class Balancer {
@Override @Override
public void run() { public void run() {
if (LOG.isDebugEnabled()) { if (LOG.isDebugEnabled()) {
LOG.debug("Starting moving "+ block.getBlockId() + LOG.debug("Start moving " + PendingBlockMove.this);
" from " + proxySource.getDisplayName() + " to " +
target.getDisplayName());
} }
dispatch(); dispatch();
} }
@ -464,11 +457,6 @@ public class Balancer {
return block; return block;
} }
/* Return the block id */
private long getBlockId() {
return block.getBlockId();
}
/* Return the length of the block */ /* Return the length of the block */
private long getNumBytes() { private long getNumBytes() {
return block.getNumBytes(); return block.getNumBytes();
@ -552,7 +540,7 @@ public class Balancer {
/* Get the storage id of the datanode */ /* Get the storage id of the datanode */
protected String getStorageID() { protected String getStorageID() {
return datanode.getStorageID(); return datanode.getDatanodeUuid();
} }
/** Decide if still need to move more bytes */ /** Decide if still need to move more bytes */
@ -675,10 +663,10 @@ public class Balancer {
synchronized (block) { synchronized (block) {
// update locations // update locations
for ( String storageID : blk.getStorageIDs() ) { for (String datanodeUuid : blk.getDatanodeUuids()) {
BalancerDatanode datanode = datanodes.get(storageID); final BalancerDatanode d = datanodeMap.get(datanodeUuid);
if (datanode != null) { // not an unknown datanode if (datanode != null) { // not an unknown datanode
block.addLocation(datanode); block.addLocation(d);
} }
} }
} }
@ -852,16 +840,6 @@ public class Balancer {
DFSConfigKeys.DFS_BALANCER_DISPATCHERTHREADS_DEFAULT)); DFSConfigKeys.DFS_BALANCER_DISPATCHERTHREADS_DEFAULT));
} }
/* Shuffle datanode array */
static private void shuffleArray(DatanodeInfo[] datanodes) {
for (int i=datanodes.length; i>1; i--) {
int randomIndex = DFSUtil.getRandom().nextInt(i);
DatanodeInfo tmp = datanodes[randomIndex];
datanodes[randomIndex] = datanodes[i-1];
datanodes[i-1] = tmp;
}
}
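
The deleted shuffleArray above is an in-place Fisher-Yates pass; the call site below now uses DFSUtil.shuffle instead. A generic sketch of the same algorithm, assumed equivalent rather than copied from DFSUtil:

    import java.util.Arrays;
    import java.util.Random;

    public class ShuffleSketch {
      // In-place Fisher-Yates shuffle, same idea as the removed shuffleArray.
      static <T> T[] shuffle(T[] array, Random random) {
        for (int i = array.length; i > 1; i--) {
          int j = random.nextInt(i);   // pick from the not-yet-fixed prefix [0, i)
          T tmp = array[j];
          array[j] = array[i - 1];
          array[i - 1] = tmp;
        }
        return array;
      }

      public static void main(String[] args) {
        Integer[] a = {1, 2, 3, 4, 5};
        System.out.println(Arrays.toString(shuffle(a, new Random(42))));
      }
    }
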
/* Given a data node set, build a network topology and decide /* Given a data node set, build a network topology and decide
* over-utilized datanodes, above average utilized datanodes, * over-utilized datanodes, above average utilized datanodes,
* below average utilized datanodes, and underutilized datanodes. * below average utilized datanodes, and underutilized datanodes.
@ -891,8 +869,7 @@ public class Balancer {
* an increasing order or a decreasing order. * an increasing order or a decreasing order.
*/ */
long overLoadedBytes = 0L, underLoadedBytes = 0L; long overLoadedBytes = 0L, underLoadedBytes = 0L;
shuffleArray(datanodes); for (DatanodeInfo datanode : DFSUtil.shuffle(datanodes)) {
for (DatanodeInfo datanode : datanodes) {
if (datanode.isDecommissioned() || datanode.isDecommissionInProgress()) { if (datanode.isDecommissioned() || datanode.isDecommissionInProgress()) {
continue; // ignore decommissioning or decommissioned nodes continue; // ignore decommissioning or decommissioned nodes
} }
@ -923,13 +900,13 @@ public class Balancer {
datanodeS.utilization)*datanodeS.datanode.getCapacity()/100.0); datanodeS.utilization)*datanodeS.datanode.getCapacity()/100.0);
} }
} }
this.datanodes.put(datanode.getStorageID(), datanodeS); datanodeMap.put(datanode.getDatanodeUuid(), datanodeS);
} }
//logging //logging
logNodes(); logNodes();
assert (this.datanodes.size() == assert (this.datanodeMap.size() ==
overUtilizedDatanodes.size()+underUtilizedDatanodes.size()+ overUtilizedDatanodes.size()+underUtilizedDatanodes.size()+
aboveAvgUtilizedDatanodes.size()+belowAvgUtilizedDatanodes.size()) aboveAvgUtilizedDatanodes.size()+belowAvgUtilizedDatanodes.size())
: "Mismatched number of datanodes"; : "Mismatched number of datanodes";
@ -1001,9 +978,9 @@ public class Balancer {
// At last, match all remaining nodes // At last, match all remaining nodes
chooseNodes(ANY_OTHER); chooseNodes(ANY_OTHER);
assert (datanodes.size() >= sources.size()+targets.size()) assert (datanodeMap.size() >= sources.size()+targets.size())
: "Mismatched number of datanodes (" + : "Mismatched number of datanodes (" +
datanodes.size() + " total, " + datanodeMap.size() + " total, " +
sources.size() + " sources, " + sources.size() + " sources, " +
targets.size() + " targets)"; targets.size() + " targets)";
@ -1304,7 +1281,7 @@ public class Balancer {
this.aboveAvgUtilizedDatanodes.clear(); this.aboveAvgUtilizedDatanodes.clear();
this.belowAvgUtilizedDatanodes.clear(); this.belowAvgUtilizedDatanodes.clear();
this.underUtilizedDatanodes.clear(); this.underUtilizedDatanodes.clear();
this.datanodes.clear(); this.datanodeMap.clear();
this.sources.clear(); this.sources.clear();
this.targets.clear(); this.targets.clear();
this.policy.reset(); this.policy.reset();

@@ -75,7 +75,7 @@ public interface BlockCollection {
    * and set the locations.
    */
   public BlockInfoUnderConstruction setLastBlock(BlockInfo lastBlock,
-      DatanodeDescriptor[] locations) throws IOException;
+      DatanodeStorageInfo[] targets) throws IOException;

   /**
    * @return whether the block collection is under construction.

@ -21,6 +21,7 @@ import java.util.LinkedList;
import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.hdfs.protocol.Block; import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.BlockUCState; import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.BlockUCState;
import org.apache.hadoop.util.LightWeightGSet; import org.apache.hadoop.util.LightWeightGSet;
@ -39,11 +40,11 @@ public class BlockInfo extends Block implements LightWeightGSet.LinkedElement {
private LightWeightGSet.LinkedElement nextLinkedElement; private LightWeightGSet.LinkedElement nextLinkedElement;
/** /**
* This array contains triplets of references. For each i-th datanode the * This array contains triplets of references. For each i-th storage, the
* block belongs to triplets[3*i] is the reference to the DatanodeDescriptor * block belongs to triplets[3*i] is the reference to the
* and triplets[3*i+1] and triplets[3*i+2] are references to the previous and * {@link DatanodeStorageInfo} and triplets[3*i+1] and triplets[3*i+2] are
* the next blocks, respectively, in the list of blocks belonging to this * references to the previous and the next blocks, respectively, in the list
* data-node. * of blocks belonging to this storage.
* *
* Using previous and next in Object triplets is done instead of a * Using previous and next in Object triplets is done instead of a
* {@link LinkedList} list to efficiently use memory. With LinkedList the cost * {@link LinkedList} list to efficiently use memory. With LinkedList the cost
@ -86,9 +87,14 @@ public class BlockInfo extends Block implements LightWeightGSet.LinkedElement {
} }
public DatanodeDescriptor getDatanode(int index) { public DatanodeDescriptor getDatanode(int index) {
DatanodeStorageInfo storage = getStorageInfo(index);
return storage == null ? null : storage.getDatanodeDescriptor();
}
DatanodeStorageInfo getStorageInfo(int index) {
assert this.triplets != null : "BlockInfo is not initialized"; assert this.triplets != null : "BlockInfo is not initialized";
assert index >= 0 && index*3 < triplets.length : "Index is out of bound"; assert index >= 0 && index*3 < triplets.length : "Index is out of bound";
return (DatanodeDescriptor)triplets[index*3]; return (DatanodeStorageInfo)triplets[index*3];
} }
private BlockInfo getPrevious(int index) { private BlockInfo getPrevious(int index) {
@ -111,14 +117,10 @@ public class BlockInfo extends Block implements LightWeightGSet.LinkedElement {
return info; return info;
} }
private void setDatanode(int index, DatanodeDescriptor node, BlockInfo previous, private void setStorageInfo(int index, DatanodeStorageInfo storage) {
BlockInfo next) {
assert this.triplets != null : "BlockInfo is not initialized"; assert this.triplets != null : "BlockInfo is not initialized";
int i = index * 3; assert index >= 0 && index*3 < triplets.length : "Index is out of bound";
assert index >= 0 && i+2 < triplets.length : "Index is out of bound"; triplets[index*3] = storage;
triplets[i] = node;
triplets[i+1] = previous;
triplets[i+2] = next;
} }
/** /**
@ -190,22 +192,34 @@ public class BlockInfo extends Block implements LightWeightGSet.LinkedElement {
} }
/** /**
* Add data-node this block belongs to. * Add a {@link DatanodeStorageInfo} location for a block
*/ */
public boolean addNode(DatanodeDescriptor node) { boolean addStorage(DatanodeStorageInfo storage) {
if(findDatanode(node) >= 0) // the node is already there boolean added = true;
int idx = findDatanode(storage.getDatanodeDescriptor());
if(idx >= 0) {
if (getStorageInfo(idx) == storage) { // the storage is already there
return false; return false;
} else {
// The block is on the DN but belongs to a different storage.
// Update our state.
removeStorage(storage);
added = false; // Just updating storage. Return false.
}
}
// find the last null node // find the last null node
int lastNode = ensureCapacity(1); int lastNode = ensureCapacity(1);
setDatanode(lastNode, node, null, null); setStorageInfo(lastNode, storage);
return true; setNext(lastNode, null);
setPrevious(lastNode, null);
return added;
} }
/** /**
* Remove data-node from the block. * Remove {@link DatanodeStorageInfo} location for a block
*/ */
public boolean removeNode(DatanodeDescriptor node) { boolean removeStorage(DatanodeStorageInfo storage) {
int dnIndex = findDatanode(node); int dnIndex = findStorageInfo(storage);
if(dnIndex < 0) // the node is not found if(dnIndex < 0) // the node is not found
return false; return false;
assert getPrevious(dnIndex) == null && getNext(dnIndex) == null : assert getPrevious(dnIndex) == null && getNext(dnIndex) == null :
@ -213,10 +227,13 @@ public class BlockInfo extends Block implements LightWeightGSet.LinkedElement {
// find the last not null node // find the last not null node
int lastNode = numNodes()-1; int lastNode = numNodes()-1;
// replace current node triplet by the lastNode one // replace current node triplet by the lastNode one
setDatanode(dnIndex, getDatanode(lastNode), getPrevious(lastNode), setStorageInfo(dnIndex, getStorageInfo(lastNode));
getNext(lastNode)); setNext(dnIndex, getNext(lastNode));
setPrevious(dnIndex, getPrevious(lastNode));
// set the last triplet to null // set the last triplet to null
setDatanode(lastNode, null, null, null); setStorageInfo(lastNode, null);
setNext(lastNode, null);
setPrevious(lastNode, null);
return true; return true;
} }
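
Illustrative sketch of the triplets layout that addStorage/removeStorage above manipulate: slot 3*i holds the i-th storage and slots 3*i+1 and 3*i+2 hold the previous/next block links for that storage, and removal swaps the last occupied triplet into the vacated slot. The classes below are simplified hypothetical stand-ins; the linked-list fields and capacity growth are omitted.

    public class TripletsSketch {
      // Hypothetical, simplified stand-ins for DatanodeStorageInfo and BlockInfo.
      static class Storage {
        final String id;
        Storage(String id) { this.id = id; }
      }

      static class Block {
        // triplets[3*i] = storage, triplets[3*i+1] = previous block on that storage,
        // triplets[3*i+2] = next block on that storage.
        final Object[] triplets;
        Block(int replication) { triplets = new Object[3 * replication]; }

        int findStorage(Storage s) {
          for (int i = 0; i * 3 < triplets.length && triplets[i * 3] != null; i++) {
            if (triplets[i * 3] == s) return i;
          }
          return -1;
        }

        boolean addStorage(Storage s) {
          if (findStorage(s) >= 0) return false;   // already present
          for (int i = 0; i * 3 < triplets.length; i++) {
            if (triplets[i * 3] == null) { triplets[i * 3] = s; return true; }
          }
          return false;                            // no free slot (growth not modeled)
        }

        boolean removeStorage(Storage s) {
          int idx = findStorage(s);
          if (idx < 0) return false;
          int last = numStorages() - 1;
          // Move the last storage's triplet into the vacated slot, then clear the last slot.
          for (int k = 0; k < 3; k++) {
            triplets[idx * 3 + k] = triplets[last * 3 + k];
            triplets[last * 3 + k] = null;
          }
          return true;
        }

        int numStorages() {
          int n = 0;
          while (n * 3 < triplets.length && triplets[n * 3] != null) n++;
          return n;
        }
      }

      public static void main(String[] args) {
        Block b = new Block(3);
        Storage s1 = new Storage("s1"), s2 = new Storage("s2");
        b.addStorage(s1);
        b.addStorage(s2);
        b.removeStorage(s1);
        System.out.println(b.numStorages()); // prints 1
      }
    }
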
@ -236,37 +253,70 @@ public class BlockInfo extends Block implements LightWeightGSet.LinkedElement {
} }
return -1; return -1;
} }
/**
* Find specified DatanodeStorageInfo.
* @param dn
* @return index or -1 if not found.
*/
int findStorageInfo(DatanodeInfo dn) {
int len = getCapacity();
for(int idx = 0; idx < len; idx++) {
DatanodeStorageInfo cur = getStorageInfo(idx);
if(cur == null)
break;
if(cur.getDatanodeDescriptor() == dn)
return idx;
}
return -1;
}
/**
* Find specified DatanodeStorageInfo.
* @param storageInfo
* @return index or -1 if not found.
*/
int findStorageInfo(DatanodeStorageInfo storageInfo) {
int len = getCapacity();
for(int idx = 0; idx < len; idx++) {
DatanodeStorageInfo cur = getStorageInfo(idx);
if(cur == storageInfo)
return idx;
if(cur == null)
break;
}
return -1;
}
/** /**
* Insert this block into the head of the list of blocks * Insert this block into the head of the list of blocks
* related to the specified DatanodeDescriptor. * related to the specified DatanodeStorageInfo.
* If the head is null then form a new list. * If the head is null then form a new list.
* @return current block as the new head of the list. * @return current block as the new head of the list.
*/ */
public BlockInfo listInsert(BlockInfo head, DatanodeDescriptor dn) { BlockInfo listInsert(BlockInfo head, DatanodeStorageInfo storage) {
int dnIndex = this.findDatanode(dn); int dnIndex = this.findStorageInfo(storage);
assert dnIndex >= 0 : "Data node is not found: current"; assert dnIndex >= 0 : "Data node is not found: current";
assert getPrevious(dnIndex) == null && getNext(dnIndex) == null : assert getPrevious(dnIndex) == null && getNext(dnIndex) == null :
"Block is already in the list and cannot be inserted."; "Block is already in the list and cannot be inserted.";
this.setPrevious(dnIndex, null); this.setPrevious(dnIndex, null);
this.setNext(dnIndex, head); this.setNext(dnIndex, head);
if(head != null) if(head != null)
head.setPrevious(head.findDatanode(dn), this); head.setPrevious(head.findStorageInfo(storage), this);
return this; return this;
} }
   /**
    * Remove this block from the list of blocks
-   * related to the specified DatanodeDescriptor.
+   * related to the specified DatanodeStorageInfo.
    * If this block is the head of the list then return the next block as
    * the new head.
    * @return the new head of the list or null if the list becomes
    * empty after deletion.
    */
-  public BlockInfo listRemove(BlockInfo head, DatanodeDescriptor dn) {
+  BlockInfo listRemove(BlockInfo head, DatanodeStorageInfo storage) {
if(head == null) if(head == null)
return null; return null;
int dnIndex = this.findDatanode(dn); int dnIndex = this.findStorageInfo(storage);
if(dnIndex < 0) // this block is not on the data-node list if(dnIndex < 0) // this block is not on the data-node list
return head; return head;
@ -275,9 +325,9 @@ public class BlockInfo extends Block implements LightWeightGSet.LinkedElement {
this.setNext(dnIndex, null); this.setNext(dnIndex, null);
this.setPrevious(dnIndex, null); this.setPrevious(dnIndex, null);
if(prev != null) if(prev != null)
prev.setNext(prev.findDatanode(dn), next); prev.setNext(prev.findStorageInfo(storage), next);
if(next != null) if(next != null)
next.setPrevious(next.findDatanode(dn), prev); next.setPrevious(next.findStorageInfo(storage), prev);
if(this == head) // removing the head if(this == head) // removing the head
head = next; head = next;
return head; return head;
@ -289,7 +339,7 @@ public class BlockInfo extends Block implements LightWeightGSet.LinkedElement {
* *
* @return the new head of the list. * @return the new head of the list.
*/ */
public BlockInfo moveBlockToHead(BlockInfo head, DatanodeDescriptor dn, public BlockInfo moveBlockToHead(BlockInfo head, DatanodeStorageInfo storage,
int curIndex, int headIndex) { int curIndex, int headIndex) {
if (head == this) { if (head == this) {
return this; return this;
@ -298,9 +348,9 @@ public class BlockInfo extends Block implements LightWeightGSet.LinkedElement {
BlockInfo prev = this.setPrevious(curIndex, null); BlockInfo prev = this.setPrevious(curIndex, null);
head.setPrevious(headIndex, this); head.setPrevious(headIndex, this);
prev.setNext(prev.findDatanode(dn), next); prev.setNext(prev.findStorageInfo(storage), next);
if (next != null) if (next != null)
next.setPrevious(next.findDatanode(dn), prev); next.setPrevious(next.findStorageInfo(storage), prev);
return this; return this;
} }
@ -328,10 +378,10 @@ public class BlockInfo extends Block implements LightWeightGSet.LinkedElement {
* @return BlockInfoUnderConstruction - an under construction block. * @return BlockInfoUnderConstruction - an under construction block.
*/ */
public BlockInfoUnderConstruction convertToBlockUnderConstruction( public BlockInfoUnderConstruction convertToBlockUnderConstruction(
BlockUCState s, DatanodeDescriptor[] targets) { BlockUCState s, DatanodeStorageInfo[] targets) {
if(isComplete()) { if(isComplete()) {
return new BlockInfoUnderConstruction( return new BlockInfoUnderConstruction(this,
this, getBlockCollection().getBlockReplication(), s, targets); getBlockCollection().getBlockReplication(), s, targets);
} }
// the block is already under construction // the block is already under construction
BlockInfoUnderConstruction ucBlock = (BlockInfoUnderConstruction)this; BlockInfoUnderConstruction ucBlock = (BlockInfoUnderConstruction)this;

@ -63,12 +63,12 @@ public class BlockInfoUnderConstruction extends BlockInfo {
* corresponding replicas. * corresponding replicas.
*/ */
static class ReplicaUnderConstruction extends Block { static class ReplicaUnderConstruction extends Block {
private DatanodeDescriptor expectedLocation; private final DatanodeStorageInfo expectedLocation;
private ReplicaState state; private ReplicaState state;
private boolean chosenAsPrimary; private boolean chosenAsPrimary;
ReplicaUnderConstruction(Block block, ReplicaUnderConstruction(Block block,
DatanodeDescriptor target, DatanodeStorageInfo target,
ReplicaState state) { ReplicaState state) {
super(block); super(block);
this.expectedLocation = target; this.expectedLocation = target;
@ -82,7 +82,7 @@ public class BlockInfoUnderConstruction extends BlockInfo {
* It is not guaranteed, but expected, that the data-node actually has * It is not guaranteed, but expected, that the data-node actually has
* the replica. * the replica.
*/ */
DatanodeDescriptor getExpectedLocation() { private DatanodeStorageInfo getExpectedStorageLocation() {
return expectedLocation; return expectedLocation;
} }
@ -118,7 +118,7 @@ public class BlockInfoUnderConstruction extends BlockInfo {
* Is data-node the replica belongs to alive. * Is data-node the replica belongs to alive.
*/ */
boolean isAlive() { boolean isAlive() {
return expectedLocation.isAlive; return expectedLocation.getDatanodeDescriptor().isAlive;
} }
@Override // Block @Override // Block
@ -162,7 +162,7 @@ public class BlockInfoUnderConstruction extends BlockInfo {
*/ */
public BlockInfoUnderConstruction(Block blk, int replication, public BlockInfoUnderConstruction(Block blk, int replication,
BlockUCState state, BlockUCState state,
DatanodeDescriptor[] targets) { DatanodeStorageInfo[] targets) {
super(blk, replication); super(blk, replication);
assert getBlockUCState() != BlockUCState.COMPLETE : assert getBlockUCState() != BlockUCState.COMPLETE :
"BlockInfoUnderConstruction cannot be in COMPLETE state"; "BlockInfoUnderConstruction cannot be in COMPLETE state";
@ -186,7 +186,7 @@ public class BlockInfoUnderConstruction extends BlockInfo {
} }
/** Set expected locations */ /** Set expected locations */
public void setExpectedLocations(DatanodeDescriptor[] targets) { public void setExpectedLocations(DatanodeStorageInfo[] targets) {
int numLocations = targets == null ? 0 : targets.length; int numLocations = targets == null ? 0 : targets.length;
this.replicas = new ArrayList<ReplicaUnderConstruction>(numLocations); this.replicas = new ArrayList<ReplicaUnderConstruction>(numLocations);
for(int i = 0; i < numLocations; i++) for(int i = 0; i < numLocations; i++)
@ -198,12 +198,12 @@ public class BlockInfoUnderConstruction extends BlockInfo {
* Create array of expected replica locations * Create array of expected replica locations
* (as has been assigned by chooseTargets()). * (as has been assigned by chooseTargets()).
*/ */
public DatanodeDescriptor[] getExpectedLocations() { public DatanodeStorageInfo[] getExpectedStorageLocations() {
int numLocations = replicas == null ? 0 : replicas.size(); int numLocations = replicas == null ? 0 : replicas.size();
DatanodeDescriptor[] locations = new DatanodeDescriptor[numLocations]; DatanodeStorageInfo[] storages = new DatanodeStorageInfo[numLocations];
for(int i = 0; i < numLocations; i++) for(int i = 0; i < numLocations; i++)
locations[i] = replicas.get(i).getExpectedLocation(); storages[i] = replicas.get(i).getExpectedStorageLocation();
return locations; return storages;
} }
/** Get the number of expected locations */ /** Get the number of expected locations */
@ -244,9 +244,9 @@ public class BlockInfoUnderConstruction extends BlockInfo {
// The replica list is unchanged. // The replica list is unchanged.
for (ReplicaUnderConstruction r : replicas) { for (ReplicaUnderConstruction r : replicas) {
if (genStamp != r.getGenerationStamp()) { if (genStamp != r.getGenerationStamp()) {
r.getExpectedLocation().removeBlock(this); r.getExpectedStorageLocation().removeBlock(this);
NameNode.blockStateChangeLog.info("BLOCK* Removing stale replica " NameNode.blockStateChangeLog.info("BLOCK* Removing stale replica "
+ "from location: " + r.getExpectedLocation()); + "from location: " + r.getExpectedStorageLocation());
} }
} }
} }
@ -302,31 +302,44 @@ public class BlockInfoUnderConstruction extends BlockInfo {
if (!(replicas.get(i).isAlive() && !replicas.get(i).getChosenAsPrimary())) { if (!(replicas.get(i).isAlive() && !replicas.get(i).getChosenAsPrimary())) {
continue; continue;
} }
if (replicas.get(i).getExpectedLocation().getLastUpdate() > mostRecentLastUpdate) { final ReplicaUnderConstruction ruc = replicas.get(i);
primary = replicas.get(i); final long lastUpdate = ruc.getExpectedStorageLocation().getDatanodeDescriptor().getLastUpdate();
if (lastUpdate > mostRecentLastUpdate) {
primaryNodeIndex = i; primaryNodeIndex = i;
mostRecentLastUpdate = primary.getExpectedLocation().getLastUpdate(); primary = ruc;
mostRecentLastUpdate = lastUpdate;
} }
} }
if (primary != null) { if (primary != null) {
primary.getExpectedLocation().addBlockToBeRecovered(this); primary.getExpectedStorageLocation().getDatanodeDescriptor().addBlockToBeRecovered(this);
primary.setChosenAsPrimary(true); primary.setChosenAsPrimary(true);
NameNode.blockStateChangeLog.info("BLOCK* " + this NameNode.blockStateChangeLog.info("BLOCK* " + this
+ " recovery started, primary=" + primary); + " recovery started, primary=" + primary);
} }
} }
void addReplicaIfNotPresent(DatanodeDescriptor dn, void addReplicaIfNotPresent(DatanodeStorageInfo storage,
Block block, Block block,
ReplicaState rState) { ReplicaState rState) {
for (ReplicaUnderConstruction r : replicas) { Iterator<ReplicaUnderConstruction> it = replicas.iterator();
if (r.getExpectedLocation() == dn) { while (it.hasNext()) {
ReplicaUnderConstruction r = it.next();
if(r.getExpectedStorageLocation() == storage) {
// Record the gen stamp from the report // Record the gen stamp from the report
r.setGenerationStamp(block.getGenerationStamp()); r.setGenerationStamp(block.getGenerationStamp());
return; return;
} else if (r.getExpectedStorageLocation().getDatanodeDescriptor() ==
storage.getDatanodeDescriptor()) {
// The Datanode reported that the block is on a different storage
// than the one chosen by BlockPlacementPolicy. This can occur as
// we allow Datanodes to choose the target storage. Update our
// state by removing the stale entry and adding a new one.
it.remove();
break;
} }
} }
replicas.add(new ReplicaUnderConstruction(block, dn, rState)); replicas.add(new ReplicaUnderConstruction(block, storage, rState));
} }
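
A self-contained sketch of the dedup rule implemented above: a block keeps at most one expected replica per datanode, so a report from the same storage just refreshes the generation stamp, while a report from a different storage on the same datanode replaces the stale entry. Types and field names here are hypothetical simplifications.

    import java.util.ArrayList;
    import java.util.Iterator;
    import java.util.List;

    public class ReplicaDedupSketch {
      // Hypothetical, simplified replica record: one expected location per entry.
      static class Replica {
        final String datanodeUuid;
        String storageUuid;
        long genStamp;
        Replica(String dn, String storage, long gs) {
          datanodeUuid = dn; storageUuid = storage; genStamp = gs;
        }
      }

      // Same storage -> refresh the gen stamp; different storage on the same
      // datanode -> drop the stale entry and fall through to add the new one.
      static void addReplicaIfNotPresent(List<Replica> replicas,
          String dn, String storage, long genStamp) {
        for (Iterator<Replica> it = replicas.iterator(); it.hasNext();) {
          Replica r = it.next();
          if (r.datanodeUuid.equals(dn) && r.storageUuid.equals(storage)) {
            r.genStamp = genStamp;     // already known on this exact storage
            return;
          } else if (r.datanodeUuid.equals(dn)) {
            it.remove();               // datanode reported a different storage
            break;
          }
        }
        replicas.add(new Replica(dn, storage, genStamp));
      }

      public static void main(String[] args) {
        List<Replica> rs = new ArrayList<>();
        addReplicaIfNotPresent(rs, "dn1", "s1", 100);
        addReplicaIfNotPresent(rs, "dn1", "s2", 101);  // replaces the s1 entry
        System.out.println(rs.size() + " " + rs.get(0).storageUuid); // prints 1 s2
      }
    }
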
@Override // BlockInfo @Override // BlockInfo

@ -34,6 +34,7 @@ import java.util.Map;
import java.util.Queue; import java.util.Queue;
import java.util.Set; import java.util.Set;
import java.util.TreeMap; import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.atomic.AtomicLong;
import org.apache.commons.logging.Log; import org.apache.commons.logging.Log;
@ -44,6 +45,7 @@ import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.DFSConfigKeys; import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSUtil; import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.HAUtil; import org.apache.hadoop.hdfs.HAUtil;
import org.apache.hadoop.hdfs.StorageType;
import org.apache.hadoop.hdfs.protocol.Block; import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.BlockListAsLongs; import org.apache.hadoop.hdfs.protocol.BlockListAsLongs;
import org.apache.hadoop.hdfs.protocol.BlockListAsLongs.BlockReportIterator; import org.apache.hadoop.hdfs.protocol.BlockListAsLongs.BlockReportIterator;
@ -70,8 +72,10 @@ import org.apache.hadoop.hdfs.server.protocol.BlockCommand;
import org.apache.hadoop.hdfs.server.protocol.BlocksWithLocations; import org.apache.hadoop.hdfs.server.protocol.BlocksWithLocations;
import org.apache.hadoop.hdfs.server.protocol.BlocksWithLocations.BlockWithLocations; import org.apache.hadoop.hdfs.server.protocol.BlocksWithLocations.BlockWithLocations;
import org.apache.hadoop.hdfs.server.protocol.DatanodeCommand; import org.apache.hadoop.hdfs.server.protocol.DatanodeCommand;
import org.apache.hadoop.hdfs.server.protocol.DatanodeStorage;
import org.apache.hadoop.hdfs.server.protocol.KeyUpdateCommand; import org.apache.hadoop.hdfs.server.protocol.KeyUpdateCommand;
import org.apache.hadoop.hdfs.server.protocol.ReceivedDeletedBlockInfo; import org.apache.hadoop.hdfs.server.protocol.ReceivedDeletedBlockInfo;
import org.apache.hadoop.hdfs.server.protocol.StorageReceivedDeletedBlocks;
import org.apache.hadoop.hdfs.util.LightWeightLinkedSet; import org.apache.hadoop.hdfs.util.LightWeightLinkedSet;
import org.apache.hadoop.net.Node; import org.apache.hadoop.net.Node;
import org.apache.hadoop.security.UserGroupInformation; import org.apache.hadoop.security.UserGroupInformation;
@ -489,8 +493,8 @@ public class BlockManager {
private void dumpBlockMeta(Block block, PrintWriter out) { private void dumpBlockMeta(Block block, PrintWriter out) {
List<DatanodeDescriptor> containingNodes = List<DatanodeDescriptor> containingNodes =
new ArrayList<DatanodeDescriptor>(); new ArrayList<DatanodeDescriptor>();
List<DatanodeDescriptor> containingLiveReplicasNodes = List<DatanodeStorageInfo> containingLiveReplicasNodes =
new ArrayList<DatanodeDescriptor>(); new ArrayList<DatanodeStorageInfo>();
NumberReplicas numReplicas = new NumberReplicas(); NumberReplicas numReplicas = new NumberReplicas();
// source node returned is not used // source node returned is not used
@ -517,9 +521,8 @@ public class BlockManager {
Collection<DatanodeDescriptor> corruptNodes = Collection<DatanodeDescriptor> corruptNodes =
corruptReplicas.getNodes(block); corruptReplicas.getNodes(block);
for (Iterator<DatanodeDescriptor> jt = blocksMap.nodeIterator(block); for (DatanodeStorageInfo storage : blocksMap.getStorages(block)) {
jt.hasNext();) { final DatanodeDescriptor node = storage.getDatanodeDescriptor();
DatanodeDescriptor node = jt.next();
String state = ""; String state = "";
if (corruptNodes != null && corruptNodes.contains(node)) { if (corruptNodes != null && corruptNodes.contains(node)) {
state = "(corrupt)"; state = "(corrupt)";
@ -528,7 +531,7 @@ public class BlockManager {
state = "(decommissioned)"; state = "(decommissioned)";
} }
if (node.areBlockContentsStale()) { if (storage.areBlockContentsStale()) {
state += " (block deletions maybe out of date)"; state += " (block deletions maybe out of date)";
} }
out.print(" " + node + state + " : "); out.print(" " + node + state + " : ");
@ -679,10 +682,9 @@ public class BlockManager {
assert oldBlock == getStoredBlock(oldBlock) : assert oldBlock == getStoredBlock(oldBlock) :
"last block of the file is not in blocksMap"; "last block of the file is not in blocksMap";
DatanodeDescriptor[] targets = getNodes(oldBlock); DatanodeStorageInfo[] targets = getStorages(oldBlock);
BlockInfoUnderConstruction ucBlock = BlockInfoUnderConstruction ucBlock = bc.setLastBlock(oldBlock, targets);
bc.setLastBlock(oldBlock, targets);
blocksMap.replaceBlock(ucBlock); blocksMap.replaceBlock(ucBlock);
// Remove block from replication queue. // Remove block from replication queue.
@ -692,9 +694,8 @@ public class BlockManager {
pendingReplications.remove(ucBlock); pendingReplications.remove(ucBlock);
// remove this block from the list of pending blocks to be deleted. // remove this block from the list of pending blocks to be deleted.
for (DatanodeDescriptor dd : targets) { for (DatanodeStorageInfo storage : targets) {
String datanodeId = dd.getStorageID(); invalidateBlocks.remove(storage.getStorageID(), oldBlock);
invalidateBlocks.remove(datanodeId, oldBlock);
} }
// Adjust safe-mode totals, since under-construction blocks don't // Adjust safe-mode totals, since under-construction blocks don't
@ -713,18 +714,17 @@ public class BlockManager {
/** /**
* Get all valid locations of the block * Get all valid locations of the block
*/ */
private List<String> getValidLocations(Block block) { private List<DatanodeStorageInfo> getValidLocations(Block block) {
ArrayList<String> machineSet = final List<DatanodeStorageInfo> locations
new ArrayList<String>(blocksMap.numNodes(block)); = new ArrayList<DatanodeStorageInfo>(blocksMap.numNodes(block));
for(Iterator<DatanodeDescriptor> it = for(DatanodeStorageInfo storage : blocksMap.getStorages(block)) {
blocksMap.nodeIterator(block); it.hasNext();) { final String storageID = storage.getStorageID();
String storageID = it.next().getStorageID();
// filter invalidate replicas // filter invalidate replicas
if(!invalidateBlocks.contains(storageID, block)) { if(!invalidateBlocks.contains(storageID, block)) {
machineSet.add(storageID); locations.add(storage);
} }
} }
return machineSet; return locations;
} }
private List<LocatedBlock> createLocatedBlockList(final BlockInfo[] blocks, private List<LocatedBlock> createLocatedBlockList(final BlockInfo[] blocks,
@ -792,9 +792,9 @@ public class BlockManager {
+ ", blk=" + blk); + ", blk=" + blk);
} }
final BlockInfoUnderConstruction uc = (BlockInfoUnderConstruction)blk; final BlockInfoUnderConstruction uc = (BlockInfoUnderConstruction)blk;
final DatanodeDescriptor[] locations = uc.getExpectedLocations(); final DatanodeStorageInfo[] storages = uc.getExpectedStorageLocations();
final ExtendedBlock eb = new ExtendedBlock(namesystem.getBlockPoolId(), blk); final ExtendedBlock eb = new ExtendedBlock(namesystem.getBlockPoolId(), blk);
return new LocatedBlock(eb, locations, pos, false); return new LocatedBlock(eb, storages, pos, false);
} }
// get block locations // get block locations
@ -809,15 +809,14 @@ public class BlockManager {
final int numNodes = blocksMap.numNodes(blk); final int numNodes = blocksMap.numNodes(blk);
final boolean isCorrupt = numCorruptNodes == numNodes; final boolean isCorrupt = numCorruptNodes == numNodes;
final int numMachines = isCorrupt ? numNodes: numNodes - numCorruptNodes; final int numMachines = isCorrupt ? numNodes: numNodes - numCorruptNodes;
final DatanodeDescriptor[] machines = new DatanodeDescriptor[numMachines]; final DatanodeStorageInfo[] machines = new DatanodeStorageInfo[numMachines];
int j = 0; int j = 0;
if (numMachines > 0) { if (numMachines > 0) {
for(Iterator<DatanodeDescriptor> it = blocksMap.nodeIterator(blk); for(DatanodeStorageInfo storage : blocksMap.getStorages(blk)) {
it.hasNext();) { final DatanodeDescriptor d = storage.getDatanodeDescriptor();
final DatanodeDescriptor d = it.next();
final boolean replicaCorrupt = corruptReplicas.isReplicaCorrupt(blk, d); final boolean replicaCorrupt = corruptReplicas.isReplicaCorrupt(blk, d);
if (isCorrupt || (!isCorrupt && !replicaCorrupt)) if (isCorrupt || (!isCorrupt && !replicaCorrupt))
machines[j++] = d; machines[j++] = storage;
} }
} }
assert j == machines.length : assert j == machines.length :
@ -1009,13 +1008,20 @@ public class BlockManager {
} }
node.resetBlocks(); node.resetBlocks();
invalidateBlocks.remove(node.getStorageID()); invalidateBlocks.remove(node.getDatanodeUuid());
// If the DN hasn't block-reported since the most recent // If the DN hasn't block-reported since the most recent
// failover, then we may have been holding up on processing // failover, then we may have been holding up on processing
// over-replicated blocks because of it. But we can now // over-replicated blocks because of it. But we can now
// process those blocks. // process those blocks.
if (node.areBlockContentsStale()) { boolean stale = false;
for(DatanodeStorageInfo storage : node.getStorageInfos()) {
if (storage.areBlockContentsStale()) {
stale = true;
break;
}
}
if (stale) {
rescanPostponedMisreplicatedBlocks(); rescanPostponedMisreplicatedBlocks();
} }
} }
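
The loop above turns the old per-node staleness check into an any-match over the node's storages; sketched generically with a hypothetical StorageInfo holder:

    import java.util.Arrays;
    import java.util.List;

    public class StaleCheckSketch {
      static class StorageInfo {
        final boolean blockContentsStale;
        StorageInfo(boolean stale) { blockContentsStale = stale; }
      }

      // A datanode counts as stale if any of its storages still has stale block contents.
      static boolean anyStorageStale(List<StorageInfo> storages) {
        for (StorageInfo s : storages) {
          if (s.blockContentsStale) {
            return true;
          }
        }
        return false;
      }

      public static void main(String[] args) {
        // prints true
        System.out.println(anyStorageStale(Arrays.asList(
            new StorageInfo(false), new StorageInfo(true))));
      }
    }
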
@ -1034,9 +1040,8 @@ public class BlockManager {
*/ */
private void addToInvalidates(Block b) { private void addToInvalidates(Block b) {
StringBuilder datanodes = new StringBuilder(); StringBuilder datanodes = new StringBuilder();
for (Iterator<DatanodeDescriptor> it = blocksMap.nodeIterator(b); it for(DatanodeStorageInfo storage : blocksMap.getStorages(b)) {
.hasNext();) { final DatanodeDescriptor node = storage.getDatanodeDescriptor();
DatanodeDescriptor node = it.next();
invalidateBlocks.add(b, node, false); invalidateBlocks.add(b, node, false);
datanodes.append(node).append(" "); datanodes.append(node).append(" ");
} }
@ -1054,7 +1059,7 @@ public class BlockManager {
* for logging purposes * for logging purposes
*/ */
public void findAndMarkBlockAsCorrupt(final ExtendedBlock blk, public void findAndMarkBlockAsCorrupt(final ExtendedBlock blk,
final DatanodeInfo dn, String reason) throws IOException { final DatanodeInfo dn, String storageID, String reason) throws IOException {
assert namesystem.hasWriteLock(); assert namesystem.hasWriteLock();
final BlockInfo storedBlock = getStoredBlock(blk.getLocalBlock()); final BlockInfo storedBlock = getStoredBlock(blk.getLocalBlock());
if (storedBlock == null) { if (storedBlock == null) {
@ -1067,11 +1072,11 @@ public class BlockManager {
return; return;
} }
markBlockAsCorrupt(new BlockToMarkCorrupt(storedBlock, reason, markBlockAsCorrupt(new BlockToMarkCorrupt(storedBlock, reason,
Reason.CORRUPTION_REPORTED), dn); Reason.CORRUPTION_REPORTED), dn, storageID);
} }
private void markBlockAsCorrupt(BlockToMarkCorrupt b, private void markBlockAsCorrupt(BlockToMarkCorrupt b,
DatanodeInfo dn) throws IOException { DatanodeInfo dn, String storageID) throws IOException {
DatanodeDescriptor node = getDatanodeManager().getDatanode(dn); DatanodeDescriptor node = getDatanodeManager().getDatanode(dn);
if (node == null) { if (node == null) {
throw new IOException("Cannot mark " + b throw new IOException("Cannot mark " + b
@ -1087,7 +1092,7 @@ public class BlockManager {
} }
// Add replica to the data-node if it is not already there // Add replica to the data-node if it is not already there
node.addBlock(b.stored); node.addBlock(storageID, b.stored);
// Add this replica to corruptReplicas Map // Add this replica to corruptReplicas Map
corruptReplicas.addToCorruptReplicasMap(b.corrupted, node, b.reason, corruptReplicas.addToCorruptReplicasMap(b.corrupted, node, b.reason,
@ -1212,7 +1217,7 @@ public class BlockManager {
@VisibleForTesting @VisibleForTesting
int computeReplicationWorkForBlocks(List<List<Block>> blocksToReplicate) { int computeReplicationWorkForBlocks(List<List<Block>> blocksToReplicate) {
int requiredReplication, numEffectiveReplicas; int requiredReplication, numEffectiveReplicas;
List<DatanodeDescriptor> containingNodes, liveReplicaNodes; List<DatanodeDescriptor> containingNodes;
DatanodeDescriptor srcNode; DatanodeDescriptor srcNode;
BlockCollection bc = null; BlockCollection bc = null;
int additionalReplRequired; int additionalReplRequired;
@ -1237,7 +1242,7 @@ public class BlockManager {
// get a source data-node // get a source data-node
containingNodes = new ArrayList<DatanodeDescriptor>(); containingNodes = new ArrayList<DatanodeDescriptor>();
liveReplicaNodes = new ArrayList<DatanodeDescriptor>(); List<DatanodeStorageInfo> liveReplicaNodes = new ArrayList<DatanodeStorageInfo>();
NumberReplicas numReplicas = new NumberReplicas(); NumberReplicas numReplicas = new NumberReplicas();
srcNode = chooseSourceDatanode( srcNode = chooseSourceDatanode(
block, containingNodes, liveReplicaNodes, numReplicas, block, containingNodes, liveReplicaNodes, numReplicas,
@ -1296,7 +1301,7 @@ public class BlockManager {
namesystem.writeLock(); namesystem.writeLock();
try { try {
for(ReplicationWork rw : work){ for(ReplicationWork rw : work){
DatanodeDescriptor[] targets = rw.targets; final DatanodeStorageInfo[] targets = rw.targets;
if(targets == null || targets.length == 0){ if(targets == null || targets.length == 0){
rw.targets = null; rw.targets = null;
continue; continue;
@ -1334,7 +1339,8 @@ public class BlockManager {
if ( (numReplicas.liveReplicas() >= requiredReplication) && if ( (numReplicas.liveReplicas() >= requiredReplication) &&
(!blockHasEnoughRacks(block)) ) { (!blockHasEnoughRacks(block)) ) {
if (rw.srcNode.getNetworkLocation().equals(targets[0].getNetworkLocation())) { if (rw.srcNode.getNetworkLocation().equals(
targets[0].getDatanodeDescriptor().getNetworkLocation())) {
//No use continuing, unless a new rack in this case //No use continuing, unless a new rack in this case
continue; continue;
} }
@ -1343,15 +1349,13 @@ public class BlockManager {
// Add block to the to be replicated list // Add block to the to be replicated list
rw.srcNode.addBlockToBeReplicated(block, targets); rw.srcNode.addBlockToBeReplicated(block, targets);
scheduledWork++; scheduledWork++;
DatanodeStorageInfo.incrementBlocksScheduled(targets);
for (DatanodeDescriptor dn : targets) {
dn.incBlocksScheduled();
}
// Move the block-replication into a "pending" state. // Move the block-replication into a "pending" state.
// The reason we use 'pending' is so we can retry // The reason we use 'pending' is so we can retry
// replications that fail after an appropriate amount of time. // replications that fail after an appropriate amount of time.
pendingReplications.increment(block, targets); pendingReplications.increment(block,
DatanodeStorageInfo.toDatanodeDescriptors(targets));
if(blockLog.isDebugEnabled()) { if(blockLog.isDebugEnabled()) {
blockLog.debug( blockLog.debug(
"BLOCK* block " + block "BLOCK* block " + block
@ -1371,12 +1375,12 @@ public class BlockManager {
if (blockLog.isInfoEnabled()) { if (blockLog.isInfoEnabled()) {
// log which blocks have been scheduled for replication // log which blocks have been scheduled for replication
for(ReplicationWork rw : work){ for(ReplicationWork rw : work){
DatanodeDescriptor[] targets = rw.targets; DatanodeStorageInfo[] targets = rw.targets;
if (targets != null && targets.length != 0) { if (targets != null && targets.length != 0) {
StringBuilder targetList = new StringBuilder("datanode(s)"); StringBuilder targetList = new StringBuilder("datanode(s)");
for (int k = 0; k < targets.length; k++) { for (int k = 0; k < targets.length; k++) {
targetList.append(' '); targetList.append(' ');
targetList.append(targets[k]); targetList.append(targets[k].getDatanodeDescriptor());
} }
blockLog.info("BLOCK* ask " + rw.srcNode blockLog.info("BLOCK* ask " + rw.srcNode
+ " to replicate " + rw.block + " to " + targetList); + " to replicate " + rw.block + " to " + targetList);
@ -1400,15 +1404,16 @@ public class BlockManager {
* @see BlockPlacementPolicy#chooseTarget(String, int, Node, * @see BlockPlacementPolicy#chooseTarget(String, int, Node,
* List, boolean, Set, long) * List, boolean, Set, long)
*/ */
public DatanodeDescriptor[] chooseTarget(final String src, public DatanodeStorageInfo[] chooseTarget(final String src,
final int numOfReplicas, final DatanodeDescriptor client, final int numOfReplicas, final DatanodeDescriptor client,
final Set<Node> excludedNodes, final Set<Node> excludedNodes,
final long blocksize, List<String> favoredNodes) throws IOException { final long blocksize, List<String> favoredNodes) throws IOException {
List<DatanodeDescriptor> favoredDatanodeDescriptors = List<DatanodeDescriptor> favoredDatanodeDescriptors =
getDatanodeDescriptors(favoredNodes); getDatanodeDescriptors(favoredNodes);
final DatanodeDescriptor targets[] = blockplacement.chooseTarget(src, final DatanodeStorageInfo[] targets = blockplacement.chooseTarget(src,
numOfReplicas, client, excludedNodes, blocksize, numOfReplicas, client, excludedNodes, blocksize,
favoredDatanodeDescriptors); // TODO: get storage type from file
favoredDatanodeDescriptors, StorageType.DEFAULT);
if (targets.length < minReplication) { if (targets.length < minReplication) {
throw new IOException("File " + src + " could only be replicated to " throw new IOException("File " + src + " could only be replicated to "
+ targets.length + " nodes instead of minReplication (=" + targets.length + " nodes instead of minReplication (="
@ -1469,10 +1474,9 @@ public class BlockManager {
* the given block * the given block
*/ */
@VisibleForTesting @VisibleForTesting
DatanodeDescriptor chooseSourceDatanode( DatanodeDescriptor chooseSourceDatanode(Block block,
Block block,
List<DatanodeDescriptor> containingNodes, List<DatanodeDescriptor> containingNodes,
List<DatanodeDescriptor> nodesContainingLiveReplicas, List<DatanodeStorageInfo> nodesContainingLiveReplicas,
NumberReplicas numReplicas, NumberReplicas numReplicas,
int priority) { int priority) {
containingNodes.clear(); containingNodes.clear();
@ -1482,12 +1486,12 @@ public class BlockManager {
int decommissioned = 0; int decommissioned = 0;
int corrupt = 0; int corrupt = 0;
int excess = 0; int excess = 0;
Iterator<DatanodeDescriptor> it = blocksMap.nodeIterator(block);
Collection<DatanodeDescriptor> nodesCorrupt = corruptReplicas.getNodes(block); Collection<DatanodeDescriptor> nodesCorrupt = corruptReplicas.getNodes(block);
while(it.hasNext()) { for(DatanodeStorageInfo storage : blocksMap.getStorages(block)) {
DatanodeDescriptor node = it.next(); final DatanodeDescriptor node = storage.getDatanodeDescriptor();
LightWeightLinkedSet<Block> excessBlocks = LightWeightLinkedSet<Block> excessBlocks =
excessReplicateMap.get(node.getStorageID()); excessReplicateMap.get(node.getDatanodeUuid());
if ((nodesCorrupt != null) && (nodesCorrupt.contains(node))) if ((nodesCorrupt != null) && (nodesCorrupt.contains(node)))
corrupt++; corrupt++;
else if (node.isDecommissionInProgress() || node.isDecommissioned()) else if (node.isDecommissionInProgress() || node.isDecommissioned())
@ -1495,7 +1499,7 @@ public class BlockManager {
else if (excessBlocks != null && excessBlocks.contains(block)) { else if (excessBlocks != null && excessBlocks.contains(block)) {
excess++; excess++;
} else { } else {
nodesContainingLiveReplicas.add(node); nodesContainingLiveReplicas.add(storage);
live++; live++;
} }
containingNodes.add(node); containingNodes.add(node);
@ -1627,10 +1631,11 @@ public class BlockManager {
} }
/** /**
* The given datanode is reporting all its blocks. * The given storage is reporting all its blocks.
* Update the (machine-->blocklist) and (block-->machinelist) maps. * Update the (storage-->block list) and (block-->storage list) maps.
*/ */
public void processReport(final DatanodeID nodeID, final String poolId, public void processReport(final DatanodeID nodeID,
final DatanodeStorage storage, final String poolId,
final BlockListAsLongs newReport) throws IOException { final BlockListAsLongs newReport) throws IOException {
namesystem.writeLock(); namesystem.writeLock();
final long startTime = Time.now(); //after acquiring write lock final long startTime = Time.now(); //after acquiring write lock
@ -1644,26 +1649,28 @@ public class BlockManager {
// To minimize startup time, we discard any second (or later) block reports // To minimize startup time, we discard any second (or later) block reports
// that we receive while still in startup phase. // that we receive while still in startup phase.
if (namesystem.isInStartupSafeMode() && !node.isFirstBlockReport()) { final DatanodeStorageInfo storageInfo = node.updateStorage(storage);
if (namesystem.isInStartupSafeMode()
&& storageInfo.getBlockReportCount() > 0) {
blockLog.info("BLOCK* processReport: " blockLog.info("BLOCK* processReport: "
+ "discarded non-initial block report from " + nodeID + "discarded non-initial block report from " + nodeID
+ " because namenode still in startup phase"); + " because namenode still in startup phase");
return; return;
} }
if (node.numBlocks() == 0) { if (storageInfo.numBlocks() == 0) {
// The first block report can be processed a lot more efficiently than // The first block report can be processed a lot more efficiently than
// ordinary block reports. This shortens restart times. // ordinary block reports. This shortens restart times.
processFirstBlockReport(node, newReport); processFirstBlockReport(node, storage.getStorageID(), newReport);
} else { } else {
processReport(node, newReport); processReport(node, storage, newReport);
} }
// Now that we have an up-to-date block report, we know that any // Now that we have an up-to-date block report, we know that any
// deletions from a previous NN iteration have been accounted for. // deletions from a previous NN iteration have been accounted for.
boolean staleBefore = node.areBlockContentsStale(); boolean staleBefore = storageInfo.areBlockContentsStale();
node.receivedBlockReport(); storageInfo.receivedBlockReport();
if (staleBefore && !node.areBlockContentsStale()) { if (staleBefore && !storageInfo.areBlockContentsStale()) {
LOG.info("BLOCK* processReport: Received first block report from " LOG.info("BLOCK* processReport: Received first block report from "
+ node + " after starting up or becoming active. Its block " + node + " after starting up or becoming active. Its block "
+ "contents are no longer considered stale"); + "contents are no longer considered stale");
@ -1717,28 +1724,30 @@ public class BlockManager {
} }
private void processReport(final DatanodeDescriptor node, private void processReport(final DatanodeDescriptor node,
final DatanodeStorage storage,
final BlockListAsLongs report) throws IOException { final BlockListAsLongs report) throws IOException {
// Normal case: // Normal case:
// Modify the (block-->datanode) map, according to the difference // Modify the (block-->datanode) map, according to the difference
// between the old and new block report. // between the old and new block report.
// //
Collection<BlockInfo> toAdd = new LinkedList<BlockInfo>(); Collection<BlockInfo> toAdd = new LinkedList<BlockInfo>();
Collection<Block> toRemove = new LinkedList<Block>(); Collection<Block> toRemove = new TreeSet<Block>();
Collection<Block> toInvalidate = new LinkedList<Block>(); Collection<Block> toInvalidate = new LinkedList<Block>();
Collection<BlockToMarkCorrupt> toCorrupt = new LinkedList<BlockToMarkCorrupt>(); Collection<BlockToMarkCorrupt> toCorrupt = new LinkedList<BlockToMarkCorrupt>();
Collection<StatefulBlockInfo> toUC = new LinkedList<StatefulBlockInfo>(); Collection<StatefulBlockInfo> toUC = new LinkedList<StatefulBlockInfo>();
reportDiff(node, report, toAdd, toRemove, toInvalidate, toCorrupt, toUC); reportDiff(node, storage, report,
toAdd, toRemove, toInvalidate, toCorrupt, toUC);
// Process the blocks on each queue // Process the blocks on each queue
for (StatefulBlockInfo b : toUC) { for (StatefulBlockInfo b : toUC) {
addStoredBlockUnderConstruction(b, node); addStoredBlockUnderConstruction(b, node, storage.getStorageID());
} }
for (Block b : toRemove) { for (Block b : toRemove) {
removeStoredBlock(b, node); removeStoredBlock(b, node);
} }
int numBlocksLogged = 0; int numBlocksLogged = 0;
for (BlockInfo b : toAdd) { for (BlockInfo b : toAdd) {
addStoredBlock(b, node, null, numBlocksLogged < maxNumBlocksToLog); addStoredBlock(b, node, storage.getStorageID(), null, numBlocksLogged < maxNumBlocksToLog);
numBlocksLogged++; numBlocksLogged++;
} }
if (numBlocksLogged > maxNumBlocksToLog) { if (numBlocksLogged > maxNumBlocksToLog) {
@ -1752,7 +1761,7 @@ public class BlockManager {
addToInvalidates(b, node); addToInvalidates(b, node);
} }
for (BlockToMarkCorrupt b : toCorrupt) { for (BlockToMarkCorrupt b : toCorrupt) {
markBlockAsCorrupt(b, node); markBlockAsCorrupt(b, node, storage.getStorageID());
} }
} }
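
A reduced sketch of the report-diff idea driving processReport above: classify the storage's reported blocks against what is already recorded, then drain the resulting queues. Only the add/remove split is shown here; the real reportDiff also produces toInvalidate, toCorrupt, and toUC queues and works over BlockInfo objects rather than bare ids.

    import java.util.Arrays;
    import java.util.HashSet;
    import java.util.Set;
    import java.util.TreeSet;

    public class ReportDiffSketch {
      // Classifies a storage's reported block ids against what the NameNode
      // already has recorded for that storage.
      static void diff(Set<Long> stored, Set<Long> reported,
          Set<Long> toAdd, Set<Long> toRemove) {
        for (long b : reported) {
          if (!stored.contains(b)) {
            toAdd.add(b);        // reported but not yet recorded for this storage
          }
        }
        for (long b : stored) {
          if (!reported.contains(b)) {
            toRemove.add(b);     // recorded but no longer reported
          }
        }
      }

      public static void main(String[] args) {
        Set<Long> toAdd = new HashSet<>();
        Set<Long> toRemove = new TreeSet<>();
        diff(new HashSet<>(Arrays.asList(1L, 2L, 3L)),
            new HashSet<>(Arrays.asList(2L, 3L, 4L)), toAdd, toRemove);
        System.out.println(toAdd + " " + toRemove); // prints [4] [1]
      }
    }
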
@ -1768,10 +1777,11 @@ public class BlockManager {
* @throws IOException * @throws IOException
*/ */
private void processFirstBlockReport(final DatanodeDescriptor node, private void processFirstBlockReport(final DatanodeDescriptor node,
final String storageID,
final BlockListAsLongs report) throws IOException { final BlockListAsLongs report) throws IOException {
if (report == null) return; if (report == null) return;
assert (namesystem.hasWriteLock()); assert (namesystem.hasWriteLock());
assert (node.numBlocks() == 0); assert (node.getStorageInfo(storageID).numBlocks() == 0);
BlockReportIterator itBR = report.getBlockReportIterator(); BlockReportIterator itBR = report.getBlockReportIterator();
while(itBR.hasNext()) { while(itBR.hasNext()) {
@ -1780,7 +1790,7 @@ public class BlockManager {
if (shouldPostponeBlocksFromFuture && if (shouldPostponeBlocksFromFuture &&
namesystem.isGenStampInFuture(iblk)) { namesystem.isGenStampInFuture(iblk)) {
queueReportedBlock(node, iblk, reportedState, queueReportedBlock(node, storageID, iblk, reportedState,
QUEUE_REASON_FUTURE_GENSTAMP); QUEUE_REASON_FUTURE_GENSTAMP);
continue; continue;
} }
@ -1797,10 +1807,10 @@ public class BlockManager {
if (shouldPostponeBlocksFromFuture) { if (shouldPostponeBlocksFromFuture) {
// In the Standby, we may receive a block report for a file that we // In the Standby, we may receive a block report for a file that we
// just have an out-of-date gen-stamp or state for, for example. // just have an out-of-date gen-stamp or state for, for example.
queueReportedBlock(node, iblk, reportedState, queueReportedBlock(node, storageID, iblk, reportedState,
QUEUE_REASON_CORRUPT_STATE); QUEUE_REASON_CORRUPT_STATE);
} else { } else {
markBlockAsCorrupt(c, node); markBlockAsCorrupt(c, node, storageID);
} }
continue; continue;
} }
@ -1808,7 +1818,7 @@ public class BlockManager {
// If block is under construction, add this replica to its list // If block is under construction, add this replica to its list
if (isBlockUnderConstruction(storedBlock, ucState, reportedState)) { if (isBlockUnderConstruction(storedBlock, ucState, reportedState)) {
((BlockInfoUnderConstruction)storedBlock).addReplicaIfNotPresent( ((BlockInfoUnderConstruction)storedBlock).addReplicaIfNotPresent(
node, iblk, reportedState); node.getStorageInfo(storageID), iblk, reportedState);
// OpenFileBlocks only inside snapshots also will be added to safemode // OpenFileBlocks only inside snapshots also will be added to safemode
// threshold. So we need to update such blocks to safemode // threshold. So we need to update such blocks to safemode
// refer HDFS-5283 // refer HDFS-5283
@ -1821,22 +1831,25 @@ public class BlockManager {
} }
//add replica if appropriate //add replica if appropriate
if (reportedState == ReplicaState.FINALIZED) { if (reportedState == ReplicaState.FINALIZED) {
addStoredBlockImmediate(storedBlock, node); addStoredBlockImmediate(storedBlock, node, storageID);
} }
} }
} }
private void reportDiff(DatanodeDescriptor dn, private void reportDiff(DatanodeDescriptor dn, DatanodeStorage storage,
BlockListAsLongs newReport, BlockListAsLongs newReport,
Collection<BlockInfo> toAdd, // add to DatanodeDescriptor Collection<BlockInfo> toAdd, // add to DatanodeDescriptor
Collection<Block> toRemove, // remove from DatanodeDescriptor Collection<Block> toRemove, // remove from DatanodeDescriptor
Collection<Block> toInvalidate, // should be removed from DN Collection<Block> toInvalidate, // should be removed from DN
Collection<BlockToMarkCorrupt> toCorrupt, // add to corrupt replicas list Collection<BlockToMarkCorrupt> toCorrupt, // add to corrupt replicas list
Collection<StatefulBlockInfo> toUC) { // add to under-construction list Collection<StatefulBlockInfo> toUC) { // add to under-construction list
final DatanodeStorageInfo storageInfo = dn.updateStorage(storage);
// place a delimiter in the list which separates blocks // place a delimiter in the list which separates blocks
// that have been reported from those that have not // that have been reported from those that have not
BlockInfo delimiter = new BlockInfo(new Block(), 1); BlockInfo delimiter = new BlockInfo(new Block(), 1);
boolean added = dn.addBlock(delimiter); boolean added = storageInfo.addBlock(delimiter);
assert added : "Delimiting block cannot be present in the node"; assert added : "Delimiting block cannot be present in the node";
int headIndex = 0; //currently the delimiter is in the head of the list int headIndex = 0; //currently the delimiter is in the head of the list
int curIndex; int curIndex;
@ -1848,20 +1861,21 @@ public class BlockManager {
while(itBR.hasNext()) { while(itBR.hasNext()) {
Block iblk = itBR.next(); Block iblk = itBR.next();
ReplicaState iState = itBR.getCurrentReplicaState(); ReplicaState iState = itBR.getCurrentReplicaState();
BlockInfo storedBlock = processReportedBlock(dn, iblk, iState, BlockInfo storedBlock = processReportedBlock(dn, storage.getStorageID(),
toAdd, toInvalidate, toCorrupt, toUC); iblk, iState, toAdd, toInvalidate, toCorrupt, toUC);
// move block to the head of the list // move block to the head of the list
if (storedBlock != null && (curIndex = storedBlock.findDatanode(dn)) >= 0) { if (storedBlock != null && (curIndex = storedBlock.findDatanode(dn)) >= 0) {
headIndex = dn.moveBlockToHead(storedBlock, curIndex, headIndex); headIndex = storageInfo.moveBlockToHead(storedBlock, curIndex, headIndex);
} }
} }
// collect blocks that have not been reported // collect blocks that have not been reported
// all of them are next to the delimiter // all of them are next to the delimiter
Iterator<? extends Block> it = new DatanodeDescriptor.BlockIterator( Iterator<BlockInfo> it = storageInfo.new BlockIterator(delimiter.getNext(0));
delimiter.getNext(0), dn);
while(it.hasNext()) while(it.hasNext())
toRemove.add(it.next()); toRemove.add(it.next());
dn.removeBlock(delimiter); storageInfo.removeBlock(delimiter);
} }
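
The reportDiff above hinges on a move-to-front trick: a dummy delimiter block is placed at the head of the storage's block list, every block named in the report is moved in front of it, and whatever still trails the delimiter afterwards was stored but not reported, so it goes to toRemove. The real code walks an intrusive linked list on DatanodeStorageInfo; the sketch below models the same idea with a plain java.util.LinkedList of String block IDs, which is a simplification, not the actual data structure:

import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;

class ReportDiffSketch {
    static List<String> unreportedBlocks(LinkedList<String> storedBlocks,
                                         Iterable<String> reportedBlocks) {
        final String delimiter = "__DELIMITER__";
        storedBlocks.addFirst(delimiter);             // everything after it is "not yet seen"
        for (String reported : reportedBlocks) {
            if (storedBlocks.remove(reported)) {      // known block: move it to the head,
                storedBlocks.addFirst(reported);      // i.e. in front of the delimiter
            }
        }
        int delimIndex = storedBlocks.indexOf(delimiter);
        List<String> toRemove =                       // blocks never mentioned in the report
            new ArrayList<>(storedBlocks.subList(delimIndex + 1, storedBlocks.size()));
        storedBlocks.remove(delimIndex);              // drop the delimiter again
        return toRemove;
    }
}

In the real list, the delimiter's successors are exactly the replicas the datanode no longer reports, which is what the BlockIterator over delimiter.getNext(0) collects above.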
/** /**
@ -1896,6 +1910,7 @@ public class BlockManager {
* Otherwise, null. * Otherwise, null.
*/ */
private BlockInfo processReportedBlock(final DatanodeDescriptor dn, private BlockInfo processReportedBlock(final DatanodeDescriptor dn,
final String storageID,
final Block block, final ReplicaState reportedState, final Block block, final ReplicaState reportedState,
final Collection<BlockInfo> toAdd, final Collection<BlockInfo> toAdd,
final Collection<Block> toInvalidate, final Collection<Block> toInvalidate,
@ -1910,7 +1925,7 @@ public class BlockManager {
if (shouldPostponeBlocksFromFuture && if (shouldPostponeBlocksFromFuture &&
namesystem.isGenStampInFuture(block)) { namesystem.isGenStampInFuture(block)) {
queueReportedBlock(dn, block, reportedState, queueReportedBlock(dn, storageID, block, reportedState,
QUEUE_REASON_FUTURE_GENSTAMP); QUEUE_REASON_FUTURE_GENSTAMP);
return null; return null;
} }
@ -1931,7 +1946,7 @@ public class BlockManager {
} }
// Ignore replicas already scheduled to be removed from the DN // Ignore replicas already scheduled to be removed from the DN
if(invalidateBlocks.contains(dn.getStorageID(), block)) { if(invalidateBlocks.contains(dn.getDatanodeUuid(), block)) {
/* TODO: following assertion is incorrect, see HDFS-2668 /* TODO: following assertion is incorrect, see HDFS-2668
assert storedBlock.findDatanode(dn) < 0 : "Block " + block assert storedBlock.findDatanode(dn) < 0 : "Block " + block
+ " in recentInvalidatesSet should not appear in DN " + dn; */ + " in recentInvalidatesSet should not appear in DN " + dn; */
@ -1945,7 +1960,7 @@ assert storedBlock.findDatanode(dn) < 0 : "Block " + block
// If the block is an out-of-date generation stamp or state, // If the block is an out-of-date generation stamp or state,
// but we're the standby, we shouldn't treat it as corrupt, // but we're the standby, we shouldn't treat it as corrupt,
// but instead just queue it for later processing. // but instead just queue it for later processing.
queueReportedBlock(dn, storedBlock, reportedState, queueReportedBlock(dn, storageID, storedBlock, reportedState,
QUEUE_REASON_CORRUPT_STATE); QUEUE_REASON_CORRUPT_STATE);
} else { } else {
toCorrupt.add(c); toCorrupt.add(c);
@ -1974,7 +1989,7 @@ assert storedBlock.findDatanode(dn) < 0 : "Block " + block
* standby node. @see PendingDataNodeMessages. * standby node. @see PendingDataNodeMessages.
* @param reason a textual reason to report in the debug logs * @param reason a textual reason to report in the debug logs
*/ */
private void queueReportedBlock(DatanodeDescriptor dn, Block block, private void queueReportedBlock(DatanodeDescriptor dn, String storageID, Block block,
ReplicaState reportedState, String reason) { ReplicaState reportedState, String reason) {
assert shouldPostponeBlocksFromFuture; assert shouldPostponeBlocksFromFuture;
@ -1984,7 +1999,7 @@ assert storedBlock.findDatanode(dn) < 0 : "Block " + block
" from datanode " + dn + " for later processing " + " from datanode " + dn + " for later processing " +
"because " + reason + "."); "because " + reason + ".");
} }
pendingDNMessages.enqueueReportedBlock(dn, block, reportedState); pendingDNMessages.enqueueReportedBlock(dn, storageID, block, reportedState);
} }
/** /**
@ -2007,8 +2022,8 @@ assert storedBlock.findDatanode(dn) < 0 : "Block " + block
if (LOG.isDebugEnabled()) { if (LOG.isDebugEnabled()) {
LOG.debug("Processing previouly queued message " + rbi); LOG.debug("Processing previouly queued message " + rbi);
} }
processAndHandleReportedBlock( processAndHandleReportedBlock(rbi.getNode(), rbi.getStorageID(),
rbi.getNode(), rbi.getBlock(), rbi.getReportedState(), null); rbi.getBlock(), rbi.getReportedState(), null);
} }
} }
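
On the standby, blocks whose generation stamp or state is ahead of the loaded edits are not processed right away; queueReportedBlock parks them in pendingDNMessages together with the reporting node and, after this change, the storage ID, and the loop above replays them later through processAndHandleReportedBlock. A rough sketch of that park-and-replay pattern, with a simplified record standing in for PendingDataNodeMessages and its entries:

import java.util.ArrayDeque;
import java.util.Queue;
import java.util.function.Consumer;

class PendingBlockMessagesSketch {
    record ReportedBlock(String node, String storageId, String block, String state) {}

    private final Queue<ReportedBlock> pending = new ArrayDeque<>();

    void enqueue(String node, String storageId, String block, String state, String reason) {
        System.out.println("Queueing " + block + " from " + node + " because " + reason);
        pending.add(new ReportedBlock(node, storageId, block, state));
    }

    // Later, once the namespace has caught up, replay every parked message.
    void replay(Consumer<ReportedBlock> processAndHandle) {
        ReportedBlock rbi;
        while ((rbi = pending.poll()) != null) {
            processAndHandle.accept(rbi);
        }
    }
}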
@ -2127,17 +2142,19 @@ assert storedBlock.findDatanode(dn) < 0 : "Block " + block
} }
void addStoredBlockUnderConstruction(StatefulBlockInfo ucBlock, void addStoredBlockUnderConstruction(StatefulBlockInfo ucBlock,
DatanodeDescriptor node) throws IOException { DatanodeDescriptor node, String storageID) throws IOException {
BlockInfoUnderConstruction block = ucBlock.storedBlock; BlockInfoUnderConstruction block = ucBlock.storedBlock;
block.addReplicaIfNotPresent(node, ucBlock.reportedBlock, ucBlock.reportedState); block.addReplicaIfNotPresent(node.getStorageInfo(storageID),
ucBlock.reportedBlock, ucBlock.reportedState);
if (ucBlock.reportedState == ReplicaState.FINALIZED && block.findDatanode(node) < 0) { if (ucBlock.reportedState == ReplicaState.FINALIZED && block.findDatanode(node) < 0) {
addStoredBlock(block, node, null, true); addStoredBlock(block, node, storageID, null, true);
} }
} }
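
addStoredBlockUnderConstruction now attaches the reported replica to one specific storage of the datanode (node.getStorageInfo(storageID)) and only promotes it through addStoredBlock when the datanode reported the replica as FINALIZED and the node is not yet recorded for the block. A compact sketch of that decision, with hypothetical names and a plain map in place of the blocks map:

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

class UcReplicaSketch {
    // block id -> datanode UUIDs that already hold a stored (finalized) copy
    private final Map<String, Set<String>> storedLocations = new HashMap<>();

    void addStoredBlockUnderConstruction(String block, String datanodeUuid,
                                         String storageId, String reportedState) {
        // Always remember the expected replica on this particular storage.
        System.out.println("expect replica of " + block + " on " + datanodeUuid + "/" + storageId);
        // Promote to a stored replica only if the DN says it is finalized
        // and we have not recorded this datanode for the block yet.
        if ("FINALIZED".equals(reportedState)
                && !storedLocations.getOrDefault(block, Set.of()).contains(datanodeUuid)) {
            storedLocations.computeIfAbsent(block, b -> new HashSet<>()).add(datanodeUuid);
        }
    }
}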
/** /**
* Faster version of * Faster version of
* {@link #addStoredBlock(BlockInfo, DatanodeDescriptor, DatanodeDescriptor, boolean)} * {@link #addStoredBlock(BlockInfo, DatanodeDescriptor, String, DatanodeDescriptor, boolean)}
* , intended for use with initial block report at startup. If not in startup * , intended for use with initial block report at startup. If not in startup
* safe mode, will call standard addStoredBlock(). Assumes this method is * safe mode, will call standard addStoredBlock(). Assumes this method is
* called "immediately" so there is no need to refresh the storedBlock from * called "immediately" so there is no need to refresh the storedBlock from
@ -2148,17 +2165,17 @@ assert storedBlock.findDatanode(dn) < 0 : "Block " + block
* @throws IOException * @throws IOException
*/ */
private void addStoredBlockImmediate(BlockInfo storedBlock, private void addStoredBlockImmediate(BlockInfo storedBlock,
DatanodeDescriptor node) DatanodeDescriptor node, String storageID)
throws IOException { throws IOException {
assert (storedBlock != null && namesystem.hasWriteLock()); assert (storedBlock != null && namesystem.hasWriteLock());
if (!namesystem.isInStartupSafeMode() if (!namesystem.isInStartupSafeMode()
|| namesystem.isPopulatingReplQueues()) { || namesystem.isPopulatingReplQueues()) {
addStoredBlock(storedBlock, node, null, false); addStoredBlock(storedBlock, node, storageID, null, false);
return; return;
} }
// just add it // just add it
node.addBlock(storedBlock); node.addBlock(storageID, storedBlock);
// Now check for completion of blocks and safe block count // Now check for completion of blocks and safe block count
int numCurrentReplica = countLiveNodes(storedBlock); int numCurrentReplica = countLiveNodes(storedBlock);
@ -2181,6 +2198,7 @@ assert storedBlock.findDatanode(dn) < 0 : "Block " + block
*/ */
private Block addStoredBlock(final BlockInfo block, private Block addStoredBlock(final BlockInfo block,
DatanodeDescriptor node, DatanodeDescriptor node,
String storageID,
DatanodeDescriptor delNodeHint, DatanodeDescriptor delNodeHint,
boolean logEveryBlock) boolean logEveryBlock)
throws IOException { throws IOException {
@ -2206,7 +2224,7 @@ assert storedBlock.findDatanode(dn) < 0 : "Block " + block
assert bc != null : "Block must belong to a file"; assert bc != null : "Block must belong to a file";
// add block to the datanode // add block to the datanode
boolean added = node.addBlock(storedBlock); boolean added = node.addBlock(storageID, storedBlock);
int curReplicaDelta; int curReplicaDelta;
if (added) { if (added) {
@ -2548,19 +2566,19 @@ assert storedBlock.findDatanode(dn) < 0 : "Block " + block
Collection<DatanodeDescriptor> nonExcess = new ArrayList<DatanodeDescriptor>(); Collection<DatanodeDescriptor> nonExcess = new ArrayList<DatanodeDescriptor>();
Collection<DatanodeDescriptor> corruptNodes = corruptReplicas Collection<DatanodeDescriptor> corruptNodes = corruptReplicas
.getNodes(block); .getNodes(block);
for (Iterator<DatanodeDescriptor> it = blocksMap.nodeIterator(block); for(DatanodeStorageInfo storage : blocksMap.getStorages(block)) {
it.hasNext();) { final DatanodeDescriptor cur = storage.getDatanodeDescriptor();
DatanodeDescriptor cur = it.next(); if (storage.areBlockContentsStale()) {
if (cur.areBlockContentsStale()) {
LOG.info("BLOCK* processOverReplicatedBlock: " + LOG.info("BLOCK* processOverReplicatedBlock: " +
"Postponing processing of over-replicated " + "Postponing processing of over-replicated " +
block + " since datanode " + cur + " does not yet have up-to-date " + block + " since storage + " + storage
+ "datanode " + cur + " does not yet have up-to-date " +
"block information."); "block information.");
postponeBlock(block); postponeBlock(block);
return; return;
} }
LightWeightLinkedSet<Block> excessBlocks = excessReplicateMap.get(cur LightWeightLinkedSet<Block> excessBlocks = excessReplicateMap.get(cur
.getStorageID()); .getDatanodeUuid());
if (excessBlocks == null || !excessBlocks.contains(block)) { if (excessBlocks == null || !excessBlocks.contains(block)) {
if (!cur.isDecommissionInProgress() && !cur.isDecommissioned()) { if (!cur.isDecommissionInProgress() && !cur.isDecommissioned()) {
// exclude corrupt replicas // exclude corrupt replicas
@ -2649,10 +2667,10 @@ assert storedBlock.findDatanode(dn) < 0 : "Block " + block
private void addToExcessReplicate(DatanodeInfo dn, Block block) { private void addToExcessReplicate(DatanodeInfo dn, Block block) {
assert namesystem.hasWriteLock(); assert namesystem.hasWriteLock();
LightWeightLinkedSet<Block> excessBlocks = excessReplicateMap.get(dn.getStorageID()); LightWeightLinkedSet<Block> excessBlocks = excessReplicateMap.get(dn.getDatanodeUuid());
if (excessBlocks == null) { if (excessBlocks == null) {
excessBlocks = new LightWeightLinkedSet<Block>(); excessBlocks = new LightWeightLinkedSet<Block>();
excessReplicateMap.put(dn.getStorageID(), excessBlocks); excessReplicateMap.put(dn.getDatanodeUuid(), excessBlocks);
} }
if (excessBlocks.add(block)) { if (excessBlocks.add(block)) {
excessBlocksCount.incrementAndGet(); excessBlocksCount.incrementAndGet();
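
With per-storage reports, the excess-replica bookkeeping above is keyed by the datanode UUID instead of a single storage ID. The get-or-create-then-add shape is conventional; a sketch of both directions (the add above and the removal in the next hunk) using computeIfAbsent on ordinary collections rather than LightWeightLinkedSet:

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicLong;

class ExcessReplicaMapSketch {
    private final Map<String, Set<String>> excessReplicateMap = new HashMap<>();
    private final AtomicLong excessBlocksCount = new AtomicLong();

    void addToExcessReplicate(String datanodeUuid, String block) {
        Set<String> excessBlocks =
            excessReplicateMap.computeIfAbsent(datanodeUuid, uuid -> new HashSet<>());
        if (excessBlocks.add(block)) {                 // count each (node, block) pair once
            excessBlocksCount.incrementAndGet();
        }
    }

    void removeFromExcessReplicate(String datanodeUuid, String block) {
        Set<String> excessBlocks = excessReplicateMap.get(datanodeUuid);
        if (excessBlocks != null && excessBlocks.remove(block)) {
            excessBlocksCount.decrementAndGet();
            if (excessBlocks.isEmpty()) {
                excessReplicateMap.remove(datanodeUuid);   // drop empty per-node sets
            }
        }
    }
}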
@ -2700,7 +2718,7 @@ assert storedBlock.findDatanode(dn) < 0 : "Block " + block
// in "excess" there. // in "excess" there.
// //
LightWeightLinkedSet<Block> excessBlocks = excessReplicateMap.get(node LightWeightLinkedSet<Block> excessBlocks = excessReplicateMap.get(node
.getStorageID()); .getDatanodeUuid());
if (excessBlocks != null) { if (excessBlocks != null) {
if (excessBlocks.remove(block)) { if (excessBlocks.remove(block)) {
excessBlocksCount.decrementAndGet(); excessBlocksCount.decrementAndGet();
@ -2709,7 +2727,7 @@ assert storedBlock.findDatanode(dn) < 0 : "Block " + block
+ block + " is removed from excessBlocks"); + block + " is removed from excessBlocks");
} }
if (excessBlocks.size() == 0) { if (excessBlocks.size() == 0) {
excessReplicateMap.remove(node.getStorageID()); excessReplicateMap.remove(node.getDatanodeUuid());
} }
} }
} }
@ -2724,12 +2742,18 @@ assert storedBlock.findDatanode(dn) < 0 : "Block " + block
* return the length of the added block; 0 if the block is not added * return the length of the added block; 0 if the block is not added
*/ */
private long addBlock(Block block, List<BlockWithLocations> results) { private long addBlock(Block block, List<BlockWithLocations> results) {
final List<String> machineSet = getValidLocations(block); final List<DatanodeStorageInfo> locations = getValidLocations(block);
if(machineSet.size() == 0) { if(locations.size() == 0) {
return 0; return 0;
} else { } else {
results.add(new BlockWithLocations(block, final String[] datanodeUuids = new String[locations.size()];
machineSet.toArray(new String[machineSet.size()]))); final String[] storageIDs = new String[datanodeUuids.length];
for(int i = 0; i < locations.size(); i++) {
final DatanodeStorageInfo s = locations.get(i);
datanodeUuids[i] = s.getDatanodeDescriptor().getDatanodeUuid();
storageIDs[i] = s.getStorageID();
}
results.add(new BlockWithLocations(block, datanodeUuids, storageIDs));
return block.getNumBytes(); return block.getNumBytes();
} }
} }
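
The Balancer-facing addBlock above now returns, for every valid location of the block, the datanode UUID and the storage ID as two index-aligned arrays inside BlockWithLocations. A small sketch of building those parallel arrays from a list of location pairs; the Location record and the println are stand-ins for DatanodeStorageInfo and the real constructor:

import java.util.List;

class BlockWithLocationsSketch {
    record Location(String datanodeUuid, String storageId) {}

    static void addBlock(String block, List<Location> locations) {
        if (locations.isEmpty()) {
            return;                                    // nothing to report for this block
        }
        String[] datanodeUuids = new String[locations.size()];
        String[] storageIDs = new String[datanodeUuids.length];
        for (int i = 0; i < locations.size(); i++) {   // keep the two arrays index-aligned
            datanodeUuids[i] = locations.get(i).datanodeUuid();
            storageIDs[i] = locations.get(i).storageId();
        }
        System.out.println(block + " on " + String.join(",", datanodeUuids)
            + " storages " + String.join(",", storageIDs));
    }
}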
@ -2738,12 +2762,12 @@ assert storedBlock.findDatanode(dn) < 0 : "Block " + block
* The given node is reporting that it received a certain block. * The given node is reporting that it received a certain block.
*/ */
@VisibleForTesting @VisibleForTesting
void addBlock(DatanodeDescriptor node, Block block, String delHint) void addBlock(DatanodeDescriptor node, String storageID, Block block, String delHint)
throws IOException { throws IOException {
// decrement number of blocks scheduled to this datanode. // Decrement number of blocks scheduled to this datanode.
// for a retry request (of DatanodeProtocol#blockReceivedAndDeleted with // for a retry request (of DatanodeProtocol#blockReceivedAndDeleted with
// RECEIVED_BLOCK), we currently also decrease the approximate number. // RECEIVED_BLOCK), we currently also decrease the approximate number.
node.decBlocksScheduled(); node.decrementBlocksScheduled();
// get the deletion hint node // get the deletion hint node
DatanodeDescriptor delHintNode = null; DatanodeDescriptor delHintNode = null;
@ -2759,11 +2783,12 @@ assert storedBlock.findDatanode(dn) < 0 : "Block " + block
// Modify the blocks->datanode map and node's map. // Modify the blocks->datanode map and node's map.
// //
pendingReplications.decrement(block, node); pendingReplications.decrement(block, node);
processAndHandleReportedBlock(node, block, ReplicaState.FINALIZED, processAndHandleReportedBlock(node, storageID, block, ReplicaState.FINALIZED,
delHintNode); delHintNode);
} }
private void processAndHandleReportedBlock(DatanodeDescriptor node, Block block, private void processAndHandleReportedBlock(DatanodeDescriptor node,
String storageID, Block block,
ReplicaState reportedState, DatanodeDescriptor delHintNode) ReplicaState reportedState, DatanodeDescriptor delHintNode)
throws IOException { throws IOException {
// blockReceived reports a finalized block // blockReceived reports a finalized block
@ -2771,7 +2796,7 @@ assert storedBlock.findDatanode(dn) < 0 : "Block " + block
Collection<Block> toInvalidate = new LinkedList<Block>(); Collection<Block> toInvalidate = new LinkedList<Block>();
Collection<BlockToMarkCorrupt> toCorrupt = new LinkedList<BlockToMarkCorrupt>(); Collection<BlockToMarkCorrupt> toCorrupt = new LinkedList<BlockToMarkCorrupt>();
Collection<StatefulBlockInfo> toUC = new LinkedList<StatefulBlockInfo>(); Collection<StatefulBlockInfo> toUC = new LinkedList<StatefulBlockInfo>();
processReportedBlock(node, block, reportedState, processReportedBlock(node, storageID, block, reportedState,
toAdd, toInvalidate, toCorrupt, toUC); toAdd, toInvalidate, toCorrupt, toUC);
// the block is only in one of the to-do lists // the block is only in one of the to-do lists
// if it is in none then data-node already has it // if it is in none then data-node already has it
@ -2779,11 +2804,11 @@ assert storedBlock.findDatanode(dn) < 0 : "Block " + block
: "The block should be only in one of the lists."; : "The block should be only in one of the lists.";
for (StatefulBlockInfo b : toUC) { for (StatefulBlockInfo b : toUC) {
addStoredBlockUnderConstruction(b, node); addStoredBlockUnderConstruction(b, node, storageID);
} }
long numBlocksLogged = 0; long numBlocksLogged = 0;
for (BlockInfo b : toAdd) { for (BlockInfo b : toAdd) {
addStoredBlock(b, node, delHintNode, numBlocksLogged < maxNumBlocksToLog); addStoredBlock(b, node, storageID, delHintNode, numBlocksLogged < maxNumBlocksToLog);
numBlocksLogged++; numBlocksLogged++;
} }
if (numBlocksLogged > maxNumBlocksToLog) { if (numBlocksLogged > maxNumBlocksToLog) {
@ -2797,7 +2822,7 @@ assert storedBlock.findDatanode(dn) < 0 : "Block " + block
addToInvalidates(b, node); addToInvalidates(b, node);
} }
for (BlockToMarkCorrupt b : toCorrupt) { for (BlockToMarkCorrupt b : toCorrupt) {
markBlockAsCorrupt(b, node); markBlockAsCorrupt(b, node, storageID);
} }
} }
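
processAndHandleReportedBlock funnels a single reported block (from an incremental report or a replayed queued message) through the same classifier as a full report and relies on the invariant that the block ends up in at most one of the four queues; if it is in none, the namenode already agrees with the datanode. A sketch of just that invariant check, with the queues as plain lists:

import java.util.List;

class SingleReportSketch {
    // After classifying one reported block, it must appear in at most one queue.
    static void checkClassified(List<String> toAdd, List<String> toInvalidate,
                                List<String> toCorrupt, List<String> toUC) {
        int classified = toAdd.size() + toInvalidate.size()
                       + toCorrupt.size() + toUC.size();
        assert classified <= 1 : "The block should be only in one of the lists.";
    }
}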
@ -2809,7 +2834,7 @@ assert storedBlock.findDatanode(dn) < 0 : "Block " + block
* This method must be called with FSNamesystem lock held. * This method must be called with FSNamesystem lock held.
*/ */
public void processIncrementalBlockReport(final DatanodeID nodeID, public void processIncrementalBlockReport(final DatanodeID nodeID,
final String poolId, final ReceivedDeletedBlockInfo blockInfos[]) final String poolId, final StorageReceivedDeletedBlocks srdb)
throws IOException { throws IOException {
assert namesystem.hasWriteLock(); assert namesystem.hasWriteLock();
int received = 0; int received = 0;
@ -2825,19 +2850,19 @@ assert storedBlock.findDatanode(dn) < 0 : "Block " + block
"Got incremental block report from unregistered or dead node"); "Got incremental block report from unregistered or dead node");
} }
for (ReceivedDeletedBlockInfo rdbi : blockInfos) { for (ReceivedDeletedBlockInfo rdbi : srdb.getBlocks()) {
switch (rdbi.getStatus()) { switch (rdbi.getStatus()) {
case DELETED_BLOCK: case DELETED_BLOCK:
removeStoredBlock(rdbi.getBlock(), node); removeStoredBlock(rdbi.getBlock(), node);
deleted++; deleted++;
break; break;
case RECEIVED_BLOCK: case RECEIVED_BLOCK:
addBlock(node, rdbi.getBlock(), rdbi.getDelHints()); addBlock(node, srdb.getStorageID(), rdbi.getBlock(), rdbi.getDelHints());
received++; received++;
break; break;
case RECEIVING_BLOCK: case RECEIVING_BLOCK:
receiving++; receiving++;
processAndHandleReportedBlock(node, rdbi.getBlock(), processAndHandleReportedBlock(node, srdb.getStorageID(), rdbi.getBlock(),
ReplicaState.RBW, null); ReplicaState.RBW, null);
break; break;
default: default:
@ -2869,24 +2894,23 @@ assert storedBlock.findDatanode(dn) < 0 : "Block " + block
int corrupt = 0; int corrupt = 0;
int excess = 0; int excess = 0;
int stale = 0; int stale = 0;
Iterator<DatanodeDescriptor> nodeIter = blocksMap.nodeIterator(b);
Collection<DatanodeDescriptor> nodesCorrupt = corruptReplicas.getNodes(b); Collection<DatanodeDescriptor> nodesCorrupt = corruptReplicas.getNodes(b);
while (nodeIter.hasNext()) { for(DatanodeStorageInfo storage : blocksMap.getStorages(b)) {
DatanodeDescriptor node = nodeIter.next(); final DatanodeDescriptor node = storage.getDatanodeDescriptor();
if ((nodesCorrupt != null) && (nodesCorrupt.contains(node))) { if ((nodesCorrupt != null) && (nodesCorrupt.contains(node))) {
corrupt++; corrupt++;
} else if (node.isDecommissionInProgress() || node.isDecommissioned()) { } else if (node.isDecommissionInProgress() || node.isDecommissioned()) {
decommissioned++; decommissioned++;
} else { } else {
LightWeightLinkedSet<Block> blocksExcess = excessReplicateMap.get(node LightWeightLinkedSet<Block> blocksExcess = excessReplicateMap.get(node
.getStorageID()); .getDatanodeUuid());
if (blocksExcess != null && blocksExcess.contains(b)) { if (blocksExcess != null && blocksExcess.contains(b)) {
excess++; excess++;
} else { } else {
live++; live++;
} }
} }
if (node.areBlockContentsStale()) { if (storage.areBlockContentsStale()) {
stale++; stale++;
} }
} }
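
The counting loop above walks the block's storages rather than its datanodes: corrupt and decommission status are still judged per node, excess membership comes from the UUID-keyed map, and staleness is now a per-storage property. A condensed sketch of that classification with a simplified record standing in for DatanodeStorageInfo (decommission-in-progress and decommissioned are folded into one flag for brevity):

import java.util.List;
import java.util.Map;
import java.util.Set;

class ReplicaCountSketch {
    record Storage(String datanodeUuid, boolean decommissioned, boolean contentsStale) {}

    // Returns { live, decommissioned, corrupt, excess, stale } for one block.
    static int[] count(String block, List<Storage> storages,
                       Set<String> corruptNodes, Map<String, Set<String>> excessReplicateMap) {
        int live = 0, decommissioned = 0, corrupt = 0, excess = 0, stale = 0;
        for (Storage storage : storages) {
            String node = storage.datanodeUuid();
            if (corruptNodes.contains(node)) {
                corrupt++;
            } else if (storage.decommissioned()) {
                decommissioned++;
            } else if (excessReplicateMap.getOrDefault(node, Set.of()).contains(block)) {
                excess++;
            } else {
                live++;
            }
            if (storage.contentsStale()) {             // staleness is tracked per storage now
                stale++;
            }
        }
        return new int[] { live, decommissioned, corrupt, excess, stale };
    }
}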
@ -2909,10 +2933,9 @@ assert storedBlock.findDatanode(dn) < 0 : "Block " + block
} }
// else proceed with fast case // else proceed with fast case
int live = 0; int live = 0;
Iterator<DatanodeDescriptor> nodeIter = blocksMap.nodeIterator(b);
Collection<DatanodeDescriptor> nodesCorrupt = corruptReplicas.getNodes(b); Collection<DatanodeDescriptor> nodesCorrupt = corruptReplicas.getNodes(b);
while (nodeIter.hasNext()) { for(DatanodeStorageInfo storage : blocksMap.getStorages(b)) {
DatanodeDescriptor node = nodeIter.next(); final DatanodeDescriptor node = storage.getDatanodeDescriptor();
if ((nodesCorrupt == null) || (!nodesCorrupt.contains(node))) if ((nodesCorrupt == null) || (!nodesCorrupt.contains(node)))
live++; live++;
} }
@ -2924,10 +2947,9 @@ assert storedBlock.findDatanode(dn) < 0 : "Block " + block
int curReplicas = num.liveReplicas(); int curReplicas = num.liveReplicas();
int curExpectedReplicas = getReplication(block); int curExpectedReplicas = getReplication(block);
BlockCollection bc = blocksMap.getBlockCollection(block); BlockCollection bc = blocksMap.getBlockCollection(block);
Iterator<DatanodeDescriptor> nodeIter = blocksMap.nodeIterator(block);
StringBuilder nodeList = new StringBuilder(); StringBuilder nodeList = new StringBuilder();
while (nodeIter.hasNext()) { for(DatanodeStorageInfo storage : blocksMap.getStorages(block)) {
DatanodeDescriptor node = nodeIter.next(); final DatanodeDescriptor node = storage.getDatanodeDescriptor();
nodeList.append(node); nodeList.append(node);
nodeList.append(" "); nodeList.append(" ");
} }
@ -2972,6 +2994,7 @@ assert storedBlock.findDatanode(dn) < 0 : "Block " + block
*/ */
boolean isReplicationInProgress(DatanodeDescriptor srcNode) { boolean isReplicationInProgress(DatanodeDescriptor srcNode) {
boolean status = false; boolean status = false;
boolean firstReplicationLog = true;
int underReplicatedBlocks = 0; int underReplicatedBlocks = 0;
int decommissionOnlyReplicas = 0; int decommissionOnlyReplicas = 0;
int underReplicatedInOpenFiles = 0; int underReplicatedInOpenFiles = 0;
@ -2986,11 +3009,18 @@ assert storedBlock.findDatanode(dn) < 0 : "Block " + block
int curExpectedReplicas = getReplication(block); int curExpectedReplicas = getReplication(block);
if (isNeededReplication(block, curExpectedReplicas, curReplicas)) { if (isNeededReplication(block, curExpectedReplicas, curReplicas)) {
if (curExpectedReplicas > curReplicas) { if (curExpectedReplicas > curReplicas) {
//Log info about one block for this node which needs replication // Log info about one block for this node which needs replication
if (!status) { if (!status) {
status = true; status = true;
if (firstReplicationLog) {
logBlockReplicationInfo(block, srcNode, num); logBlockReplicationInfo(block, srcNode, num);
} }
// Allowing decommission as long as default replication is met
if (curReplicas >= defaultReplication) {
status = false;
firstReplicationLog = false;
}
}
underReplicatedBlocks++; underReplicatedBlocks++;
if ((curReplicas == 0) && (num.decommissionedReplicas() > 0)) { if ((curReplicas == 0) && (num.decommissionedReplicas() > 0)) {
decommissionOnlyReplicas++; decommissionOnlyReplicas++;
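
The added firstReplicationLog and defaultReplication checks change when an under-replicated block keeps a node in decommission-in-progress: once the block already has at least the default replication, it no longer blocks decommissioning, and the detailed replication info is logged at most once per scan. A rough sketch of just that per-block decision, assuming the surrounding counters are available:

class DecommissionCheckSketch {
    // Returns true if this block should keep the datanode in DECOMMISSION_IN_PROGRESS.
    static boolean blocksDecommission(int curReplicas, int curExpectedReplicas,
                                      int defaultReplication) {
        if (curExpectedReplicas <= curReplicas) {
            return false;                              // not under-replicated at all
        }
        // Under-replicated, but decommission may still proceed once the block
        // has reached the cluster's default replication.
        return curReplicas < defaultReplication;
    }
}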
@ -3024,14 +3054,13 @@ assert storedBlock.findDatanode(dn) < 0 : "Block " + block
return blocksMap.size(); return blocksMap.size();
} }
public DatanodeDescriptor[] getNodes(BlockInfo block) { public DatanodeStorageInfo[] getStorages(BlockInfo block) {
DatanodeDescriptor[] nodes = final DatanodeStorageInfo[] storages = new DatanodeStorageInfo[block.numNodes()];
new DatanodeDescriptor[block.numNodes()]; int i = 0;
Iterator<DatanodeDescriptor> it = blocksMap.nodeIterator(block); for(DatanodeStorageInfo s : blocksMap.getStorages(block)) {
for (int i = 0; it != null && it.hasNext(); i++) { storages[i++] = s;
nodes[i] = it.next();
} }
return nodes; return storages;
} }
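
getStorages above is a straightforward Iterable-to-array conversion sized by block.numNodes(). Callers that still want plain datanodes can derive them from the storages; a sketch of both conversions with a minimal Storage record as a stand-in:

import java.util.ArrayList;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;

class StoragesToNodesSketch {
    record Storage(String datanodeUuid, String storageId) {}

    static Storage[] toArray(Iterable<Storage> storages, int numNodes) {
        Storage[] result = new Storage[numNodes];
        int i = 0;
        for (Storage s : storages) {
            result[i++] = s;
        }
        return result;
    }

    // De-duplicate by datanode UUID when a caller only cares about nodes.
    static List<String> toDatanodes(Iterable<Storage> storages) {
        Set<String> uuids = new LinkedHashSet<>();
        for (Storage s : storages) {
            uuids.add(s.datanodeUuid());
        }
        return new ArrayList<>(uuids);
    }
}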
public int getTotalBlocks() { public int getTotalBlocks() {
@ -3160,9 +3189,8 @@ assert storedBlock.findDatanode(dn) < 0 : "Block " + block
corruptReplicas.getNodes(b); corruptReplicas.getNodes(b);
int numExpectedReplicas = getReplication(b); int numExpectedReplicas = getReplication(b);
String rackName = null; String rackName = null;
for (Iterator<DatanodeDescriptor> it = blocksMap.nodeIterator(b); for(DatanodeStorageInfo storage : blocksMap.getStorages(b)) {
it.hasNext();) { final DatanodeDescriptor cur = storage.getDatanodeDescriptor();
DatanodeDescriptor cur = it.next();
if (!cur.isDecommissionInProgress() && !cur.isDecommissioned()) { if (!cur.isDecommissionInProgress() && !cur.isDecommissioned()) {
if ((corruptNodes == null ) || !corruptNodes.contains(cur)) { if ((corruptNodes == null ) || !corruptNodes.contains(cur)) {
if (numExpectedReplicas == 1 || if (numExpectedReplicas == 1 ||
@ -3206,8 +3234,8 @@ assert storedBlock.findDatanode(dn) < 0 : "Block " + block
} }
/** @return an iterator of the datanodes. */ /** @return an iterator of the datanodes. */
public Iterator<DatanodeDescriptor> datanodeIterator(final Block block) { public Iterable<DatanodeStorageInfo> getStorages(final Block block) {
return blocksMap.nodeIterator(block); return blocksMap.getStorages(block);
} }
public int numCorruptReplicas(Block block) { public int numCorruptReplicas(Block block) {
@ -3358,24 +3386,24 @@ assert storedBlock.findDatanode(dn) < 0 : "Block " + block
private DatanodeDescriptor srcNode; private DatanodeDescriptor srcNode;
private List<DatanodeDescriptor> containingNodes; private List<DatanodeDescriptor> containingNodes;
private List<DatanodeDescriptor> liveReplicaNodes; private List<DatanodeStorageInfo> liveReplicaStorages;
private int additionalReplRequired; private int additionalReplRequired;
private DatanodeDescriptor targets[]; private DatanodeStorageInfo targets[];
private int priority; private int priority;
public ReplicationWork(Block block, public ReplicationWork(Block block,
BlockCollection bc, BlockCollection bc,
DatanodeDescriptor srcNode, DatanodeDescriptor srcNode,
List<DatanodeDescriptor> containingNodes, List<DatanodeDescriptor> containingNodes,
List<DatanodeDescriptor> liveReplicaNodes, List<DatanodeStorageInfo> liveReplicaStorages,
int additionalReplRequired, int additionalReplRequired,
int priority) { int priority) {
this.block = block; this.block = block;
this.bc = bc; this.bc = bc;
this.srcNode = srcNode; this.srcNode = srcNode;
this.containingNodes = containingNodes; this.containingNodes = containingNodes;
this.liveReplicaNodes = liveReplicaNodes; this.liveReplicaStorages = liveReplicaStorages;
this.additionalReplRequired = additionalReplRequired; this.additionalReplRequired = additionalReplRequired;
this.priority = priority; this.priority = priority;
this.targets = null; this.targets = null;
@ -3384,8 +3412,8 @@ assert storedBlock.findDatanode(dn) < 0 : "Block " + block
private void chooseTargets(BlockPlacementPolicy blockplacement, private void chooseTargets(BlockPlacementPolicy blockplacement,
Set<Node> excludedNodes) { Set<Node> excludedNodes) {
targets = blockplacement.chooseTarget(bc.getName(), targets = blockplacement.chooseTarget(bc.getName(),
additionalReplRequired, srcNode, liveReplicaNodes, false, additionalReplRequired, srcNode, liveReplicaStorages, false,
excludedNodes, block.getNumBytes()); excludedNodes, block.getNumBytes(), StorageType.DEFAULT);
} }
} }

View File

@ -28,6 +28,7 @@ import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.DFSConfigKeys; import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.StorageType;
import org.apache.hadoop.hdfs.protocol.Block; import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo; import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.LocatedBlock; import org.apache.hadoop.hdfs.protocol.LocatedBlock;
@ -67,13 +68,14 @@ public abstract class BlockPlacementPolicy {
* @return array of DatanodeDescriptor instances chosen as target * @return array of DatanodeDescriptor instances chosen as target
* and sorted as a pipeline. * and sorted as a pipeline.
*/ */
public abstract DatanodeDescriptor[] chooseTarget(String srcPath, public abstract DatanodeStorageInfo[] chooseTarget(String srcPath,
int numOfReplicas, int numOfReplicas,
Node writer, Node writer,
List<DatanodeDescriptor> chosenNodes, List<DatanodeStorageInfo> chosen,
boolean returnChosenNodes, boolean returnChosenNodes,
Set<Node> excludedNodes, Set<Node> excludedNodes,
long blocksize); long blocksize,
StorageType storageType);
/** /**
* Same as {@link #chooseTarget(String, int, Node, List, boolean, * Same as {@link #chooseTarget(String, int, Node, List, boolean,
@ -82,16 +84,19 @@ public abstract class BlockPlacementPolicy {
* is only a hint and due to cluster state, namenode may not be * is only a hint and due to cluster state, namenode may not be
* able to place the blocks on these datanodes. * able to place the blocks on these datanodes.
*/ */
DatanodeDescriptor[] chooseTarget(String src, DatanodeStorageInfo[] chooseTarget(String src,
int numOfReplicas, Node writer, int numOfReplicas, Node writer,
Set<Node> excludedNodes, Set<Node> excludedNodes,
long blocksize, List<DatanodeDescriptor> favoredNodes) { long blocksize,
List<DatanodeDescriptor> favoredNodes,
StorageType storageType) {
// This class does not provide the functionality of placing // This class does not provide the functionality of placing
// a block in favored datanodes. The implementations of this class // a block in favored datanodes. The implementations of this class
// are expected to provide this functionality // are expected to provide this functionality
return chooseTarget(src, numOfReplicas, writer, return chooseTarget(src, numOfReplicas, writer,
new ArrayList<DatanodeDescriptor>(numOfReplicas), false, excludedNodes, new ArrayList<DatanodeStorageInfo>(numOfReplicas), false,
blocksize); excludedNodes, blocksize, storageType);
} }
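
The package-private overload above ignores the favored-nodes hint in the base class and simply delegates to the abstract chooseTarget with an empty already-chosen list, now passing the requested StorageType through. A sketch of that delegation shape against a trimmed-down interface; the names are stand-ins, not the real BlockPlacementPolicy signature:

import java.util.ArrayList;
import java.util.List;
import java.util.Set;

interface PlacementPolicySketch {
    enum StorageType { DEFAULT, SSD }
    record StorageTarget(String datanodeUuid, String storageId) {}

    // Primary entry point: implementations fill a pipeline of storage targets.
    StorageTarget[] chooseTarget(String srcPath, int numOfReplicas, String writer,
                                 List<StorageTarget> chosen, boolean returnChosenNodes,
                                 Set<String> excludedNodes, long blockSize, StorageType type);

    // Favored-nodes variant: the base class ignores the hint and falls back.
    default StorageTarget[] chooseTarget(String srcPath, int numOfReplicas, String writer,
                                         Set<String> excludedNodes, long blockSize,
                                         List<String> favoredNodes, StorageType type) {
        return chooseTarget(srcPath, numOfReplicas, writer,
            new ArrayList<>(numOfReplicas), false, excludedNodes, blockSize, type);
    }
}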
/** /**

View File

@ -29,11 +29,14 @@ import java.util.TreeSet;
import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.DFSConfigKeys; import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.StorageType;
import org.apache.hadoop.hdfs.protocol.Block; import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo; import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.HdfsConstants; import org.apache.hadoop.hdfs.protocol.HdfsConstants;
import org.apache.hadoop.hdfs.protocol.LocatedBlock; import org.apache.hadoop.hdfs.protocol.LocatedBlock;
import org.apache.hadoop.hdfs.server.namenode.FSClusterStats; import org.apache.hadoop.hdfs.server.namenode.FSClusterStats;
import org.apache.hadoop.hdfs.server.protocol.DatanodeStorage.State;
import org.apache.hadoop.net.NetworkTopology; import org.apache.hadoop.net.NetworkTopology;
import org.apache.hadoop.net.Node; import org.apache.hadoop.net.Node;
import org.apache.hadoop.net.NodeBase; import org.apache.hadoop.net.NodeBase;
@ -103,99 +106,101 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
} }
@Override @Override
public DatanodeDescriptor[] chooseTarget(String srcPath, public DatanodeStorageInfo[] chooseTarget(String srcPath,
int numOfReplicas, int numOfReplicas,
Node writer, Node writer,
List<DatanodeDescriptor> chosenNodes, List<DatanodeStorageInfo> chosenNodes,
boolean returnChosenNodes, boolean returnChosenNodes,
Set<Node> excludedNodes, Set<Node> excludedNodes,
long blocksize) { long blocksize,
StorageType storageType) {
return chooseTarget(numOfReplicas, writer, chosenNodes, returnChosenNodes, return chooseTarget(numOfReplicas, writer, chosenNodes, returnChosenNodes,
excludedNodes, blocksize); excludedNodes, blocksize, storageType);
} }
@Override @Override
DatanodeDescriptor[] chooseTarget(String src, DatanodeStorageInfo[] chooseTarget(String src,
int numOfReplicas, int numOfReplicas,
Node writer, Node writer,
Set<Node> excludedNodes, Set<Node> excludedNodes,
long blocksize, long blocksize,
List<DatanodeDescriptor> favoredNodes) { List<DatanodeDescriptor> favoredNodes,
StorageType storageType) {
try { try {
if (favoredNodes == null || favoredNodes.size() == 0) { if (favoredNodes == null || favoredNodes.size() == 0) {
// Favored nodes not specified, fall back to regular block placement. // Favored nodes not specified, fall back to regular block placement.
return chooseTarget(src, numOfReplicas, writer, return chooseTarget(src, numOfReplicas, writer,
new ArrayList<DatanodeDescriptor>(numOfReplicas), false, new ArrayList<DatanodeStorageInfo>(numOfReplicas), false,
excludedNodes, blocksize); excludedNodes, blocksize, storageType);
} }
Set<Node> favoriteAndExcludedNodes = excludedNodes == null ? Set<Node> favoriteAndExcludedNodes = excludedNodes == null ?
new HashSet<Node>() : new HashSet<Node>(excludedNodes); new HashSet<Node>() : new HashSet<Node>(excludedNodes);
// Choose favored nodes // Choose favored nodes
List<DatanodeDescriptor> results = new ArrayList<DatanodeDescriptor>(); List<DatanodeStorageInfo> results = new ArrayList<DatanodeStorageInfo>();
boolean avoidStaleNodes = stats != null boolean avoidStaleNodes = stats != null
&& stats.isAvoidingStaleDataNodesForWrite(); && stats.isAvoidingStaleDataNodesForWrite();
for (int i = 0; i < Math.min(favoredNodes.size(), numOfReplicas); i++) { for (int i = 0; i < Math.min(favoredNodes.size(), numOfReplicas); i++) {
DatanodeDescriptor favoredNode = favoredNodes.get(i); DatanodeDescriptor favoredNode = favoredNodes.get(i);
// Choose a single node which is local to favoredNode. // Choose a single node which is local to favoredNode.
// 'results' is updated within chooseLocalNode // 'results' is updated within chooseLocalStorage
DatanodeDescriptor target = chooseLocalNode(favoredNode, final DatanodeStorageInfo target = chooseLocalStorage(favoredNode,
favoriteAndExcludedNodes, blocksize, favoriteAndExcludedNodes, blocksize,
getMaxNodesPerRack(results, getMaxNodesPerRack(results.size(), numOfReplicas)[1],
numOfReplicas)[1], results, avoidStaleNodes); results, avoidStaleNodes, storageType);
if (target == null) { if (target == null) {
LOG.warn("Could not find a target for file " + src LOG.warn("Could not find a target for file " + src
+ " with favored node " + favoredNode); + " with favored node " + favoredNode);
continue; continue;
} }
favoriteAndExcludedNodes.add(target); favoriteAndExcludedNodes.add(target.getDatanodeDescriptor());
} }
if (results.size() < numOfReplicas) { if (results.size() < numOfReplicas) {
// Not enough favored nodes, choose other nodes. // Not enough favored nodes, choose other nodes.
numOfReplicas -= results.size(); numOfReplicas -= results.size();
DatanodeDescriptor[] remainingTargets = DatanodeStorageInfo[] remainingTargets =
chooseTarget(src, numOfReplicas, writer, results, chooseTarget(src, numOfReplicas, writer, results,
false, favoriteAndExcludedNodes, blocksize); false, favoriteAndExcludedNodes, blocksize, storageType);
for (int i = 0; i < remainingTargets.length; i++) { for (int i = 0; i < remainingTargets.length; i++) {
results.add(remainingTargets[i]); results.add(remainingTargets[i]);
} }
} }
return getPipeline(writer, return getPipeline(writer,
results.toArray(new DatanodeDescriptor[results.size()])); results.toArray(new DatanodeStorageInfo[results.size()]));
} catch (NotEnoughReplicasException nr) { } catch (NotEnoughReplicasException nr) {
// Fall back to regular block placement disregarding favored nodes hint // Fall back to regular block placement disregarding favored nodes hint
return chooseTarget(src, numOfReplicas, writer, return chooseTarget(src, numOfReplicas, writer,
new ArrayList<DatanodeDescriptor>(numOfReplicas), false, new ArrayList<DatanodeStorageInfo>(numOfReplicas), false,
excludedNodes, blocksize); excludedNodes, blocksize, storageType);
} }
} }
/** This is the implementation. */ /** This is the implementation. */
private DatanodeDescriptor[] chooseTarget(int numOfReplicas, private DatanodeStorageInfo[] chooseTarget(int numOfReplicas,
Node writer, Node writer,
List<DatanodeDescriptor> chosenNodes, List<DatanodeStorageInfo> chosenStorage,
boolean returnChosenNodes, boolean returnChosenNodes,
Set<Node> excludedNodes, Set<Node> excludedNodes,
long blocksize) { long blocksize,
StorageType storageType) {
if (numOfReplicas == 0 || clusterMap.getNumOfLeaves()==0) { if (numOfReplicas == 0 || clusterMap.getNumOfLeaves()==0) {
return DatanodeDescriptor.EMPTY_ARRAY; return DatanodeStorageInfo.EMPTY_ARRAY;
} }
if (excludedNodes == null) { if (excludedNodes == null) {
excludedNodes = new HashSet<Node>(); excludedNodes = new HashSet<Node>();
} }
int[] result = getMaxNodesPerRack(chosenNodes, numOfReplicas); int[] result = getMaxNodesPerRack(chosenStorage.size(), numOfReplicas);
numOfReplicas = result[0]; numOfReplicas = result[0];
int maxNodesPerRack = result[1]; int maxNodesPerRack = result[1];
List<DatanodeDescriptor> results = final List<DatanodeStorageInfo> results = new ArrayList<DatanodeStorageInfo>(chosenStorage);
new ArrayList<DatanodeDescriptor>(chosenNodes); for (DatanodeStorageInfo storage : chosenStorage) {
for (DatanodeDescriptor node:chosenNodes) {
// add localMachine and related nodes to excludedNodes // add localMachine and related nodes to excludedNodes
addToExcludedNodes(node, excludedNodes); addToExcludedNodes(storage.getDatanodeDescriptor(), excludedNodes);
} }
if (!clusterMap.contains(writer)) { if (!clusterMap.contains(writer)) {
@ -205,20 +210,19 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
boolean avoidStaleNodes = (stats != null boolean avoidStaleNodes = (stats != null
&& stats.isAvoidingStaleDataNodesForWrite()); && stats.isAvoidingStaleDataNodesForWrite());
Node localNode = chooseTarget(numOfReplicas, writer, Node localNode = chooseTarget(numOfReplicas, writer,
excludedNodes, blocksize, maxNodesPerRack, results, avoidStaleNodes); excludedNodes, blocksize, maxNodesPerRack, results, avoidStaleNodes, storageType);
if (!returnChosenNodes) { if (!returnChosenNodes) {
results.removeAll(chosenNodes); results.removeAll(chosenStorage);
} }
// sorting nodes to form a pipeline // sorting nodes to form a pipeline
return getPipeline((writer==null)?localNode:writer, return getPipeline((writer==null)?localNode:writer,
results.toArray(new DatanodeDescriptor[results.size()])); results.toArray(new DatanodeStorageInfo[results.size()]));
} }
private int[] getMaxNodesPerRack(List<DatanodeDescriptor> chosenNodes, private int[] getMaxNodesPerRack(int numOfChosen, int numOfReplicas) {
int numOfReplicas) {
int clusterSize = clusterMap.getNumOfLeaves(); int clusterSize = clusterMap.getNumOfLeaves();
int totalNumOfReplicas = chosenNodes.size()+numOfReplicas; int totalNumOfReplicas = numOfChosen + numOfReplicas;
if (totalNumOfReplicas > clusterSize) { if (totalNumOfReplicas > clusterSize) {
numOfReplicas -= (totalNumOfReplicas-clusterSize); numOfReplicas -= (totalNumOfReplicas-clusterSize);
totalNumOfReplicas = clusterSize; totalNumOfReplicas = clusterSize;
@ -243,8 +247,9 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
Set<Node> excludedNodes, Set<Node> excludedNodes,
long blocksize, long blocksize,
int maxNodesPerRack, int maxNodesPerRack,
List<DatanodeDescriptor> results, List<DatanodeStorageInfo> results,
final boolean avoidStaleNodes) { final boolean avoidStaleNodes,
StorageType storageType) {
if (numOfReplicas == 0 || clusterMap.getNumOfLeaves()==0) { if (numOfReplicas == 0 || clusterMap.getNumOfLeaves()==0) {
return writer; return writer;
} }
@ -253,7 +258,7 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
int numOfResults = results.size(); int numOfResults = results.size();
boolean newBlock = (numOfResults==0); boolean newBlock = (numOfResults==0);
if ((writer == null || !(writer instanceof DatanodeDescriptor)) && !newBlock) { if ((writer == null || !(writer instanceof DatanodeDescriptor)) && !newBlock) {
writer = results.get(0); writer = results.get(0).getDatanodeDescriptor();
} }
// Keep a copy of original excludedNodes // Keep a copy of original excludedNodes
@ -261,42 +266,49 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
new HashSet<Node>(excludedNodes) : null; new HashSet<Node>(excludedNodes) : null;
try { try {
if (numOfResults == 0) { if (numOfResults == 0) {
writer = chooseLocalNode(writer, excludedNodes, blocksize, writer = chooseLocalStorage(writer, excludedNodes, blocksize,
maxNodesPerRack, results, avoidStaleNodes); maxNodesPerRack, results, avoidStaleNodes, storageType)
.getDatanodeDescriptor();
if (--numOfReplicas == 0) { if (--numOfReplicas == 0) {
return writer; return writer;
} }
} }
final DatanodeDescriptor dn0 = results.get(0).getDatanodeDescriptor();
if (numOfResults <= 1) { if (numOfResults <= 1) {
chooseRemoteRack(1, results.get(0), excludedNodes, blocksize, chooseRemoteRack(1, dn0, excludedNodes, blocksize, maxNodesPerRack,
maxNodesPerRack, results, avoidStaleNodes); results, avoidStaleNodes, storageType);
if (--numOfReplicas == 0) { if (--numOfReplicas == 0) {
return writer; return writer;
} }
} }
if (numOfResults <= 2) { if (numOfResults <= 2) {
if (clusterMap.isOnSameRack(results.get(0), results.get(1))) { final DatanodeDescriptor dn1 = results.get(1).getDatanodeDescriptor();
chooseRemoteRack(1, results.get(0), excludedNodes, if (clusterMap.isOnSameRack(dn0, dn1)) {
blocksize, maxNodesPerRack, chooseRemoteRack(1, dn0, excludedNodes, blocksize, maxNodesPerRack,
results, avoidStaleNodes); results, avoidStaleNodes, storageType);
} else if (newBlock){ } else if (newBlock){
chooseLocalRack(results.get(1), excludedNodes, blocksize, chooseLocalRack(dn1, excludedNodes, blocksize, maxNodesPerRack,
maxNodesPerRack, results, avoidStaleNodes); results, avoidStaleNodes, storageType);
} else { } else {
chooseLocalRack(writer, excludedNodes, blocksize, maxNodesPerRack, chooseLocalRack(writer, excludedNodes, blocksize, maxNodesPerRack,
results, avoidStaleNodes); results, avoidStaleNodes, storageType);
} }
if (--numOfReplicas == 0) { if (--numOfReplicas == 0) {
return writer; return writer;
} }
} }
chooseRandom(numOfReplicas, NodeBase.ROOT, excludedNodes, blocksize, chooseRandom(numOfReplicas, NodeBase.ROOT, excludedNodes, blocksize,
maxNodesPerRack, results, avoidStaleNodes); maxNodesPerRack, results, avoidStaleNodes, storageType);
} catch (NotEnoughReplicasException e) { } catch (NotEnoughReplicasException e) {
LOG.warn("Not able to place enough replicas, still in need of " final String message = "Failed to place enough replicas, still in need of "
+ (totalReplicasExpected - results.size()) + " to reach " + (totalReplicasExpected - results.size()) + " to reach "
+ totalReplicasExpected + "\n" + totalReplicasExpected + ".";
+ e.getMessage()); if (LOG.isTraceEnabled()) {
LOG.trace(message, e);
} else {
LOG.warn(message + " " + e.getMessage());
}
if (avoidStaleNodes) { if (avoidStaleNodes) {
// Retry chooseTarget again, this time not avoiding stale nodes. // Retry chooseTarget again, this time not avoiding stale nodes.
@ -304,14 +316,14 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
// not chosen because they were stale, decommissioned, etc. // not chosen because they were stale, decommissioned, etc.
// We need to additionally exclude the nodes that were added to the // We need to additionally exclude the nodes that were added to the
// result list in the successful calls to choose*() above. // result list in the successful calls to choose*() above.
for (Node node : results) { for (DatanodeStorageInfo resultStorage : results) {
oldExcludedNodes.add(node); oldExcludedNodes.add(resultStorage.getDatanodeDescriptor());
} }
// Set numOfReplicas, since it can get out of sync with the result list // Set numOfReplicas, since it can get out of sync with the result list
// if the NotEnoughReplicasException was thrown in chooseRandom(). // if the NotEnoughReplicasException was thrown in chooseRandom().
numOfReplicas = totalReplicasExpected - results.size(); numOfReplicas = totalReplicasExpected - results.size();
return chooseTarget(numOfReplicas, writer, oldExcludedNodes, blocksize, return chooseTarget(numOfReplicas, writer, oldExcludedNodes, blocksize,
maxNodesPerRack, results, false); maxNodesPerRack, results, false, storageType);
} }
} }
return writer; return writer;
@ -321,32 +333,36 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
* Choose <i>localMachine</i> as the target. * Choose <i>localMachine</i> as the target.
* if <i>localMachine</i> is not available, * if <i>localMachine</i> is not available,
* choose a node on the same rack * choose a node on the same rack
* @return the chosen node * @return the chosen storage
*/ */
protected DatanodeDescriptor chooseLocalNode(Node localMachine, protected DatanodeStorageInfo chooseLocalStorage(Node localMachine,
Set<Node> excludedNodes, Set<Node> excludedNodes,
long blocksize, long blocksize,
int maxNodesPerRack, int maxNodesPerRack,
List<DatanodeDescriptor> results, List<DatanodeStorageInfo> results,
boolean avoidStaleNodes) boolean avoidStaleNodes,
StorageType storageType)
throws NotEnoughReplicasException { throws NotEnoughReplicasException {
// if no local machine, randomly choose one node // if no local machine, randomly choose one node
if (localMachine == null) if (localMachine == null)
return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize, return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
maxNodesPerRack, results, avoidStaleNodes); maxNodesPerRack, results, avoidStaleNodes, storageType);
if (preferLocalNode && localMachine instanceof DatanodeDescriptor) { if (preferLocalNode && localMachine instanceof DatanodeDescriptor) {
DatanodeDescriptor localDatanode = (DatanodeDescriptor) localMachine; DatanodeDescriptor localDatanode = (DatanodeDescriptor) localMachine;
// otherwise try local machine first // otherwise try local machine first
if (excludedNodes.add(localMachine)) { // was not in the excluded list if (excludedNodes.add(localMachine)) { // was not in the excluded list
if (addIfIsGoodTarget(localDatanode, excludedNodes, blocksize, for(DatanodeStorageInfo localStorage : DFSUtil.shuffle(
maxNodesPerRack, false, results, avoidStaleNodes) >= 0) { localDatanode.getStorageInfos())) {
return localDatanode; if (addIfIsGoodTarget(localStorage, excludedNodes, blocksize,
maxNodesPerRack, false, results, avoidStaleNodes, storageType) >= 0) {
return localStorage;
}
} }
} }
} }
// try a node on local rack // try a node on local rack
return chooseLocalRack(localMachine, excludedNodes, blocksize, return chooseLocalRack(localMachine, excludedNodes, blocksize,
maxNodesPerRack, results, avoidStaleNodes); maxNodesPerRack, results, avoidStaleNodes, storageType);
} }
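
chooseLocalStorage walks the writer node's storages in random order (DFSUtil.shuffle above) and takes the first one that addIfIsGoodTarget accepts, before giving up on the local node and trying the local rack. A simplified sketch of that loop with a hypothetical isGoodTarget predicate and an Optional for the no-local-storage case:

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Optional;
import java.util.function.Predicate;

class LocalStorageSketch {
    record Storage(String datanodeUuid, String storageId, long remainingBytes) {}

    static Optional<Storage> chooseLocalStorage(List<Storage> localStorages,
                                                Predicate<Storage> isGoodTarget) {
        List<Storage> shuffled = new ArrayList<>(localStorages);
        Collections.shuffle(shuffled);                 // spread new blocks across local disks
        for (Storage s : shuffled) {
            if (isGoodTarget.test(s)) {
                return Optional.of(s);                 // first acceptable local storage wins
            }
        }
        return Optional.empty();                       // caller falls back to the local rack
    }
}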
/** /**
@ -368,27 +384,29 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
* in the cluster. * in the cluster.
* @return the chosen node * @return the chosen node
*/ */
protected DatanodeDescriptor chooseLocalRack(Node localMachine, protected DatanodeStorageInfo chooseLocalRack(Node localMachine,
Set<Node> excludedNodes, Set<Node> excludedNodes,
long blocksize, long blocksize,
int maxNodesPerRack, int maxNodesPerRack,
List<DatanodeDescriptor> results, List<DatanodeStorageInfo> results,
boolean avoidStaleNodes) boolean avoidStaleNodes,
StorageType storageType)
throws NotEnoughReplicasException { throws NotEnoughReplicasException {
// no local machine, so choose a random machine // no local machine, so choose a random machine
if (localMachine == null) { if (localMachine == null) {
return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize, return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
maxNodesPerRack, results, avoidStaleNodes); maxNodesPerRack, results, avoidStaleNodes, storageType);
} }
// choose one from the local rack // choose one from the local rack
try { try {
return chooseRandom(localMachine.getNetworkLocation(), excludedNodes, return chooseRandom(localMachine.getNetworkLocation(), excludedNodes,
blocksize, maxNodesPerRack, results, avoidStaleNodes); blocksize, maxNodesPerRack, results, avoidStaleNodes, storageType);
} catch (NotEnoughReplicasException e1) { } catch (NotEnoughReplicasException e1) {
// find the second replica // find the second replica
DatanodeDescriptor newLocal=null; DatanodeDescriptor newLocal=null;
for(DatanodeDescriptor nextNode : results) { for(DatanodeStorageInfo resultStorage : results) {
DatanodeDescriptor nextNode = resultStorage.getDatanodeDescriptor();
if (nextNode != localMachine) { if (nextNode != localMachine) {
newLocal = nextNode; newLocal = nextNode;
break; break;
@ -397,16 +415,16 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
if (newLocal != null) { if (newLocal != null) {
try { try {
return chooseRandom(newLocal.getNetworkLocation(), excludedNodes, return chooseRandom(newLocal.getNetworkLocation(), excludedNodes,
blocksize, maxNodesPerRack, results, avoidStaleNodes); blocksize, maxNodesPerRack, results, avoidStaleNodes, storageType);
} catch(NotEnoughReplicasException e2) { } catch(NotEnoughReplicasException e2) {
//otherwise randomly choose one from the network //otherwise randomly choose one from the network
return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize, return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
maxNodesPerRack, results, avoidStaleNodes); maxNodesPerRack, results, avoidStaleNodes, storageType);
} }
} else { } else {
//otherwise randomly choose one from the network //otherwise randomly choose one from the network
return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize, return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
maxNodesPerRack, results, avoidStaleNodes); maxNodesPerRack, results, avoidStaleNodes, storageType);
} }
} }
} }
@ -423,48 +441,51 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
Set<Node> excludedNodes, Set<Node> excludedNodes,
long blocksize, long blocksize,
int maxReplicasPerRack, int maxReplicasPerRack,
List<DatanodeDescriptor> results, List<DatanodeStorageInfo> results,
boolean avoidStaleNodes) boolean avoidStaleNodes,
StorageType storageType)
throws NotEnoughReplicasException { throws NotEnoughReplicasException {
int oldNumOfReplicas = results.size(); int oldNumOfReplicas = results.size();
// randomly choose one node from remote racks // randomly choose one node from remote racks
try { try {
chooseRandom(numOfReplicas, "~" + localMachine.getNetworkLocation(), chooseRandom(numOfReplicas, "~" + localMachine.getNetworkLocation(),
excludedNodes, blocksize, maxReplicasPerRack, results, excludedNodes, blocksize, maxReplicasPerRack, results,
avoidStaleNodes); avoidStaleNodes, storageType);
} catch (NotEnoughReplicasException e) { } catch (NotEnoughReplicasException e) {
chooseRandom(numOfReplicas-(results.size()-oldNumOfReplicas), chooseRandom(numOfReplicas-(results.size()-oldNumOfReplicas),
localMachine.getNetworkLocation(), excludedNodes, blocksize, localMachine.getNetworkLocation(), excludedNodes, blocksize,
maxReplicasPerRack, results, avoidStaleNodes); maxReplicasPerRack, results, avoidStaleNodes, storageType);
} }
} }
/** /**
* Randomly choose one target from the given <i>scope</i>. * Randomly choose one target from the given <i>scope</i>.
* @return the chosen node, if there is any. * @return the chosen storage, if there is any.
*/ */
protected DatanodeDescriptor chooseRandom(String scope, protected DatanodeStorageInfo chooseRandom(String scope,
Set<Node> excludedNodes, Set<Node> excludedNodes,
long blocksize, long blocksize,
int maxNodesPerRack, int maxNodesPerRack,
List<DatanodeDescriptor> results, List<DatanodeStorageInfo> results,
boolean avoidStaleNodes) boolean avoidStaleNodes,
StorageType storageType)
throws NotEnoughReplicasException { throws NotEnoughReplicasException {
return chooseRandom(1, scope, excludedNodes, blocksize, maxNodesPerRack, return chooseRandom(1, scope, excludedNodes, blocksize, maxNodesPerRack,
results, avoidStaleNodes); results, avoidStaleNodes, storageType);
} }
/** /**
* Randomly choose <i>numOfReplicas</i> targets from the given <i>scope</i>. * Randomly choose <i>numOfReplicas</i> targets from the given <i>scope</i>.
* @return the first chosen node, if there is any. * @return the first chosen node, if there is any.
*/ */
protected DatanodeDescriptor chooseRandom(int numOfReplicas, protected DatanodeStorageInfo chooseRandom(int numOfReplicas,
String scope, String scope,
Set<Node> excludedNodes, Set<Node> excludedNodes,
long blocksize, long blocksize,
int maxNodesPerRack, int maxNodesPerRack,
List<DatanodeDescriptor> results, List<DatanodeStorageInfo> results,
boolean avoidStaleNodes) boolean avoidStaleNodes,
StorageType storageType)
throws NotEnoughReplicasException { throws NotEnoughReplicasException {
int numOfAvailableNodes = clusterMap.countNumOfAvailableNodes( int numOfAvailableNodes = clusterMap.countNumOfAvailableNodes(
@ -476,25 +497,33 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
builder.append("["); builder.append("[");
} }
boolean badTarget = false; boolean badTarget = false;
DatanodeDescriptor firstChosen = null; DatanodeStorageInfo firstChosen = null;
while(numOfReplicas > 0 && numOfAvailableNodes > 0) { while(numOfReplicas > 0 && numOfAvailableNodes > 0) {
DatanodeDescriptor chosenNode = DatanodeDescriptor chosenNode =
(DatanodeDescriptor)clusterMap.chooseRandom(scope); (DatanodeDescriptor)clusterMap.chooseRandom(scope);
if (excludedNodes.add(chosenNode)) { //was not in the excluded list if (excludedNodes.add(chosenNode)) { //was not in the excluded list
numOfAvailableNodes--; numOfAvailableNodes--;
int newExcludedNodes = addIfIsGoodTarget(chosenNode, excludedNodes, final DatanodeStorageInfo[] storages = DFSUtil.shuffle(
blocksize, maxNodesPerRack, considerLoad, results, avoidStaleNodes); chosenNode.getStorageInfos());
int i;
for(i = 0; i < storages.length; i++) {
final int newExcludedNodes = addIfIsGoodTarget(storages[i],
excludedNodes, blocksize, maxNodesPerRack, considerLoad, results,
avoidStaleNodes, storageType);
if (newExcludedNodes >= 0) { if (newExcludedNodes >= 0) {
numOfReplicas--; numOfReplicas--;
if (firstChosen == null) { if (firstChosen == null) {
firstChosen = chosenNode; firstChosen = storages[i];
} }
numOfAvailableNodes -= newExcludedNodes; numOfAvailableNodes -= newExcludedNodes;
} else { break;
badTarget = true;
} }
} }
// If no candidate storage was found on this DN then set badTarget.
badTarget = (i == storages.length);
}
} }
if (numOfReplicas>0) { if (numOfReplicas>0) {
@ -512,43 +541,46 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
} }
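The storage selection in chooseRandom above boils down to shuffling the storages of a randomly chosen node and taking the first one that passes the suitability checks. A minimal, self-contained sketch of that pattern follows; the class name, the Predicate-based isGood check, and the shuffle helper are illustrative stand-ins, not the HDFS API (DFSUtil.shuffle is only mirrored in spirit here).

import java.util.Random;
import java.util.function.Predicate;

public class FirstGoodPicker {
  private static final Random RAND = new Random();

  /** Fisher-Yates shuffle, analogous in spirit to DFSUtil.shuffle. */
  static <T> T[] shuffle(T[] arr) {
    for (int i = arr.length - 1; i > 0; i--) {
      int j = RAND.nextInt(i + 1);
      T tmp = arr[i]; arr[i] = arr[j]; arr[j] = tmp;
    }
    return arr;
  }

  /** Return the first shuffled element accepted by the predicate, or null. */
  static <T> T pickFirstGood(T[] candidates, Predicate<T> isGood) {
    for (T c : shuffle(candidates)) {
      if (isGood.test(c)) {
        return c;
      }
    }
    return null;  // the caller treats this as "bad target", like badTarget above
  }

  public static void main(String[] args) {
    Integer[] storages = {1, 2, 3, 4};
    // Accept only "storages" with even ids; prints 2 or 4 depending on the shuffle.
    System.out.println(pickFirstGood(storages, s -> s % 2 == 0));
  }
}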
/** /**
* If the given node is a good target, add it to the result list and * If the given storage is a good target, add it to the result list and
* update the set of excluded nodes. * update the set of excluded nodes.
* @return -1 if the given is not a good target; * @return -1 if the given is not a good target;
* otherwise, return the number of nodes added to excludedNodes set. * otherwise, return the number of nodes added to excludedNodes set.
*/ */
int addIfIsGoodTarget(DatanodeDescriptor node, int addIfIsGoodTarget(DatanodeStorageInfo storage,
Set<Node> excludedNodes, Set<Node> excludedNodes,
long blockSize, long blockSize,
int maxNodesPerRack, int maxNodesPerRack,
boolean considerLoad, boolean considerLoad,
List<DatanodeDescriptor> results, List<DatanodeStorageInfo> results,
boolean avoidStaleNodes) { boolean avoidStaleNodes,
if (isGoodTarget(node, blockSize, maxNodesPerRack, considerLoad, StorageType storageType) {
results, avoidStaleNodes)) { if (isGoodTarget(storage, blockSize, maxNodesPerRack, considerLoad,
results.add(node); results, avoidStaleNodes, storageType)) {
results.add(storage);
// add node and related nodes to excludedNode // add node and related nodes to excludedNode
return addToExcludedNodes(node, excludedNodes); return addToExcludedNodes(storage.getDatanodeDescriptor(), excludedNodes);
} else { } else {
return -1; return -1;
} }
} }
private static void logNodeIsNotChosen(DatanodeDescriptor node, String reason) { private static void logNodeIsNotChosen(DatanodeStorageInfo storage, String reason) {
if (LOG.isDebugEnabled()) { if (LOG.isDebugEnabled()) {
final DatanodeDescriptor node = storage.getDatanodeDescriptor();
// build the error message for later use. // build the error message for later use.
debugLoggingBuilder.get() debugLoggingBuilder.get()
.append(node).append(": ") .append(node).append(": ")
.append("Node ").append(NodeBase.getPath(node)) .append("Storage ").append(storage)
.append("at node ").append(NodeBase.getPath(node))
.append(" is not chosen because ") .append(" is not chosen because ")
.append(reason); .append(reason);
} }
} }
/** /**
* Determine if a node is a good target. * Determine if a storage is a good target.
* *
* @param node The target node * @param storage The target storage
* @param blockSize Size of block * @param blockSize Size of block
* @param maxTargetPerRack Maximum number of targets per rack. The value of * @param maxTargetPerRack Maximum number of targets per rack. The value of
* this parameter depends on the number of racks in * this parameter depends on the number of racks in
@ -561,29 +593,40 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
* does not have too much load, * does not have too much load,
* and the rack does not have too many nodes. * and the rack does not have too many nodes.
*/ */
private boolean isGoodTarget(DatanodeDescriptor node, private boolean isGoodTarget(DatanodeStorageInfo storage,
long blockSize, int maxTargetPerRack, long blockSize, int maxTargetPerRack,
boolean considerLoad, boolean considerLoad,
List<DatanodeDescriptor> results, List<DatanodeStorageInfo> results,
boolean avoidStaleNodes) { boolean avoidStaleNodes,
// check if the node is (being) decommissed StorageType storageType) {
if (storage.getStorageType() != storageType) {
logNodeIsNotChosen(storage,
"storage types do not match, where the expected storage type is "
+ storageType);
return false;
}
if (storage.getState() == State.READ_ONLY) {
logNodeIsNotChosen(storage, "storage is read-only");
return false;
}
DatanodeDescriptor node = storage.getDatanodeDescriptor();
// check if the node is (being) decommissioned
if (node.isDecommissionInProgress() || node.isDecommissioned()) { if (node.isDecommissionInProgress() || node.isDecommissioned()) {
logNodeIsNotChosen(node, "the node is (being) decommissioned "); logNodeIsNotChosen(storage, "the node is (being) decommissioned ");
return false; return false;
} }
if (avoidStaleNodes) { if (avoidStaleNodes) {
if (node.isStale(this.staleInterval)) { if (node.isStale(this.staleInterval)) {
logNodeIsNotChosen(node, "the node is stale "); logNodeIsNotChosen(storage, "the node is stale ");
return false; return false;
} }
} }
long remaining = node.getRemaining() - final long requiredSize = blockSize * HdfsConstants.MIN_BLOCKS_FOR_WRITE;
(node.getBlocksScheduled() * blockSize); final long scheduledSize = blockSize * node.getBlocksScheduled();
// check the remaining capacity of the target machine if (requiredSize > node.getRemaining() - scheduledSize) {
if (blockSize* HdfsConstants.MIN_BLOCKS_FOR_WRITE>remaining) { logNodeIsNotChosen(storage, "the node does not have enough space ");
logNodeIsNotChosen(node, "the node does not have enough space ");
return false; return false;
} }
@ -595,7 +638,7 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
avgLoad = (double)stats.getTotalLoad()/size; avgLoad = (double)stats.getTotalLoad()/size;
} }
if (node.getXceiverCount() > (2.0 * avgLoad)) { if (node.getXceiverCount() > (2.0 * avgLoad)) {
logNodeIsNotChosen(node, "the node is too busy "); logNodeIsNotChosen(storage, "the node is too busy ");
return false; return false;
} }
} }
@ -603,13 +646,14 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
// check if the target rack has chosen too many nodes // check if the target rack has chosen too many nodes
String rackname = node.getNetworkLocation(); String rackname = node.getNetworkLocation();
int counter=1; int counter=1;
for(Node result : results) { for(DatanodeStorageInfo resultStorage : results) {
if (rackname.equals(result.getNetworkLocation())) { if (rackname.equals(
resultStorage.getDatanodeDescriptor().getNetworkLocation())) {
counter++; counter++;
} }
} }
if (counter>maxTargetPerRack) { if (counter>maxTargetPerRack) {
logNodeIsNotChosen(node, "the rack has too many chosen nodes "); logNodeIsNotChosen(storage, "the rack has too many chosen nodes ");
return false; return false;
} }
return true; return true;
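The rewritten free-space test reads as simple arithmetic: a storage's node is rejected unless its remaining capacity, minus the bytes already promised to in-flight writes, still leaves room for MIN_BLOCKS_FOR_WRITE full blocks. A hedged, stand-alone illustration with made-up numbers; the constant below is only a placeholder for HdfsConstants.MIN_BLOCKS_FOR_WRITE, whose real value is not assumed here.

public class SpaceCheckSketch {
  // Illustrative stand-in; the real value lives in HdfsConstants.MIN_BLOCKS_FOR_WRITE.
  static final int MIN_BLOCKS_FOR_WRITE = 1;

  /** Mirrors the check above: reject when required > remaining - scheduled. */
  static boolean hasEnoughSpace(long blockSize, long remaining, long blocksScheduled) {
    final long requiredSize = blockSize * MIN_BLOCKS_FOR_WRITE;
    final long scheduledSize = blockSize * blocksScheduled;
    return requiredSize <= remaining - scheduledSize;
  }

  public static void main(String[] args) {
    long blockSize = 128L * 1024 * 1024;   // 128 MB block
    long remaining = 300L * 1024 * 1024;   // 300 MB free on the node
    // Two scheduled blocks already claim 256 MB, leaving 44 MB: rejected.
    System.out.println(hasEnoughSpace(blockSize, remaining, 2));   // false
    // One scheduled block leaves 172 MB, enough for another block: accepted.
    System.out.println(hasEnoughSpace(blockSize, remaining, 1));   // true
  }
}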
@ -621,37 +665,40 @@ public class BlockPlacementPolicyDefault extends BlockPlacementPolicy {
* starts from the writer and traverses all <i>nodes</i> * starts from the writer and traverses all <i>nodes</i>
* This is basically a traveling salesman problem. * This is basically a traveling salesman problem.
*/ */
private DatanodeDescriptor[] getPipeline(Node writer, private DatanodeStorageInfo[] getPipeline(Node writer,
DatanodeDescriptor[] nodes) { DatanodeStorageInfo[] storages) {
if (nodes.length==0) return nodes; if (storages.length == 0) {
return storages;
}
synchronized(clusterMap) { synchronized(clusterMap) {
int index=0; int index=0;
if (writer == null || !clusterMap.contains(writer)) { if (writer == null || !clusterMap.contains(writer)) {
writer = nodes[0]; writer = storages[0].getDatanodeDescriptor();
} }
for(;index<nodes.length; index++) { for(; index < storages.length; index++) {
DatanodeDescriptor shortestNode = nodes[index]; DatanodeStorageInfo shortestStorage = storages[index];
int shortestDistance = clusterMap.getDistance(writer, shortestNode); int shortestDistance = clusterMap.getDistance(writer,
shortestStorage.getDatanodeDescriptor());
int shortestIndex = index; int shortestIndex = index;
for(int i=index+1; i<nodes.length; i++) { for(int i = index + 1; i < storages.length; i++) {
DatanodeDescriptor currentNode = nodes[i]; int currentDistance = clusterMap.getDistance(writer,
int currentDistance = clusterMap.getDistance(writer, currentNode); storages[i].getDatanodeDescriptor());
if (shortestDistance>currentDistance) { if (shortestDistance>currentDistance) {
shortestDistance = currentDistance; shortestDistance = currentDistance;
shortestNode = currentNode; shortestStorage = storages[i];
shortestIndex = i; shortestIndex = i;
} }
} }
//switch position index & shortestIndex //switch position index & shortestIndex
if (index != shortestIndex) { if (index != shortestIndex) {
nodes[shortestIndex] = nodes[index]; storages[shortestIndex] = storages[index];
nodes[index] = shortestNode; storages[index] = shortestStorage;
} }
writer = shortestNode; writer = shortestStorage.getDatanodeDescriptor();
} }
} }
return nodes; return storages;
} }
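As the comment notes, ordering the pipeline is a travelling-salesman-style problem, and the method settles for a greedy nearest-neighbour pass: repeatedly pick the unplaced target closest to the current writer, swap it into position, and make it the new writer. A self-contained sketch of that ordering over plain integers; the distance function stands in for NetworkTopology.getDistance and is an assumption of this example.

import java.util.Arrays;
import java.util.function.ToIntBiFunction;

public class GreedyPipeline {
  /**
   * Reorder targets in place so each element is the nearest remaining target
   * to the previously chosen one, starting from the writer.
   */
  static <N> N[] order(N writer, N[] targets, ToIntBiFunction<N, N> distance) {
    for (int index = 0; index < targets.length; index++) {
      int shortestIndex = index;
      int shortestDistance = distance.applyAsInt(writer, targets[index]);
      for (int i = index + 1; i < targets.length; i++) {
        int d = distance.applyAsInt(writer, targets[i]);
        if (d < shortestDistance) {
          shortestDistance = d;
          shortestIndex = i;
        }
      }
      // Swap the nearest target into the current slot and advance the writer.
      N tmp = targets[index];
      targets[index] = targets[shortestIndex];
      targets[shortestIndex] = tmp;
      writer = targets[index];
    }
    return targets;
  }

  public static void main(String[] args) {
    // One-dimensional "topology": distance is just the absolute difference.
    Integer[] targets = {9, 2, 7};
    System.out.println(Arrays.toString(
        order(0, targets, (a, b) -> Math.abs(a - b))));  // [2, 7, 9]
  }
}

The result is not guaranteed to be the globally shortest tour, only a cheap approximation that keeps the first hop close to the writer.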
@Override @Override
View File
@ -25,6 +25,8 @@ import java.util.Map;
import java.util.Set; import java.util.Set;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.StorageType;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo; import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.server.namenode.FSClusterStats; import org.apache.hadoop.hdfs.server.namenode.FSClusterStats;
import org.apache.hadoop.net.NetworkTopology; import org.apache.hadoop.net.NetworkTopology;
@ -64,81 +66,87 @@ public class BlockPlacementPolicyWithNodeGroup extends BlockPlacementPolicyDefau
* @return the chosen node * @return the chosen node
*/ */
@Override @Override
protected DatanodeDescriptor chooseLocalNode(Node localMachine, protected DatanodeStorageInfo chooseLocalStorage(Node localMachine,
Set<Node> excludedNodes, long blocksize, int maxNodesPerRack, Set<Node> excludedNodes, long blocksize, int maxNodesPerRack,
List<DatanodeDescriptor> results, boolean avoidStaleNodes) List<DatanodeStorageInfo> results, boolean avoidStaleNodes,
throws NotEnoughReplicasException { StorageType storageType) throws NotEnoughReplicasException {
// if no local machine, randomly choose one node // if no local machine, randomly choose one node
if (localMachine == null) if (localMachine == null)
return chooseRandom(NodeBase.ROOT, excludedNodes, return chooseRandom(NodeBase.ROOT, excludedNodes,
blocksize, maxNodesPerRack, results, avoidStaleNodes); blocksize, maxNodesPerRack, results, avoidStaleNodes, storageType);
// otherwise try local machine first
if (localMachine instanceof DatanodeDescriptor) { if (localMachine instanceof DatanodeDescriptor) {
DatanodeDescriptor localDataNode = (DatanodeDescriptor)localMachine; DatanodeDescriptor localDataNode = (DatanodeDescriptor)localMachine;
// otherwise try local machine first
if (excludedNodes.add(localMachine)) { // was not in the excluded list if (excludedNodes.add(localMachine)) { // was not in the excluded list
if (addIfIsGoodTarget(localDataNode, excludedNodes, blocksize, for(DatanodeStorageInfo localStorage : DFSUtil.shuffle(
maxNodesPerRack, false, results, avoidStaleNodes) >= 0) { localDataNode.getStorageInfos())) {
return localDataNode; if (addIfIsGoodTarget(localStorage, excludedNodes, blocksize,
maxNodesPerRack, false, results, avoidStaleNodes, storageType) >= 0) {
return localStorage;
}
} }
} }
} }
// try a node on local node group // try a node on local node group
DatanodeDescriptor chosenNode = chooseLocalNodeGroup( DatanodeStorageInfo chosenStorage = chooseLocalNodeGroup(
(NetworkTopologyWithNodeGroup)clusterMap, localMachine, excludedNodes, (NetworkTopologyWithNodeGroup)clusterMap, localMachine, excludedNodes,
blocksize, maxNodesPerRack, results, avoidStaleNodes); blocksize, maxNodesPerRack, results, avoidStaleNodes, storageType);
if (chosenNode != null) { if (chosenStorage != null) {
return chosenNode; return chosenStorage;
} }
// try a node on local rack // try a node on local rack
return chooseLocalRack(localMachine, excludedNodes, return chooseLocalRack(localMachine, excludedNodes,
blocksize, maxNodesPerRack, results, avoidStaleNodes); blocksize, maxNodesPerRack, results, avoidStaleNodes, storageType);
} }
/** @return the node of the second replica */
private static DatanodeDescriptor secondNode(Node localMachine,
List<DatanodeStorageInfo> results) {
// find the second replica
for(DatanodeStorageInfo nextStorage : results) {
DatanodeDescriptor nextNode = nextStorage.getDatanodeDescriptor();
if (nextNode != localMachine) {
return nextNode;
}
}
return null;
}
@Override @Override
protected DatanodeDescriptor chooseLocalRack(Node localMachine, protected DatanodeStorageInfo chooseLocalRack(Node localMachine,
Set<Node> excludedNodes, long blocksize, int maxNodesPerRack, Set<Node> excludedNodes, long blocksize, int maxNodesPerRack,
List<DatanodeDescriptor> results, boolean avoidStaleNodes) List<DatanodeStorageInfo> results, boolean avoidStaleNodes,
throws NotEnoughReplicasException { StorageType storageType) throws NotEnoughReplicasException {
// no local machine, so choose a random machine // no local machine, so choose a random machine
if (localMachine == null) { if (localMachine == null) {
return chooseRandom(NodeBase.ROOT, excludedNodes, return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
blocksize, maxNodesPerRack, results, maxNodesPerRack, results, avoidStaleNodes, storageType);
avoidStaleNodes);
} }
// choose one from the local rack, but off-nodegroup // choose one from the local rack, but off-nodegroup
try { try {
return chooseRandom(NetworkTopology.getFirstHalf( final String scope = NetworkTopology.getFirstHalf(localMachine.getNetworkLocation());
localMachine.getNetworkLocation()), return chooseRandom(scope, excludedNodes, blocksize, maxNodesPerRack,
excludedNodes, blocksize, results, avoidStaleNodes, storageType);
maxNodesPerRack, results,
avoidStaleNodes);
} catch (NotEnoughReplicasException e1) { } catch (NotEnoughReplicasException e1) {
// find the second replica // find the second replica
DatanodeDescriptor newLocal=null; final DatanodeDescriptor newLocal = secondNode(localMachine, results);
for(DatanodeDescriptor nextNode : results) {
if (nextNode != localMachine) {
newLocal = nextNode;
break;
}
}
if (newLocal != null) { if (newLocal != null) {
try { try {
return chooseRandom( return chooseRandom(
clusterMap.getRack(newLocal.getNetworkLocation()), excludedNodes, clusterMap.getRack(newLocal.getNetworkLocation()), excludedNodes,
blocksize, maxNodesPerRack, results, avoidStaleNodes); blocksize, maxNodesPerRack, results, avoidStaleNodes, storageType);
} catch(NotEnoughReplicasException e2) { } catch(NotEnoughReplicasException e2) {
//otherwise randomly choose one from the network //otherwise randomly choose one from the network
return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize, return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
maxNodesPerRack, results, avoidStaleNodes); maxNodesPerRack, results, avoidStaleNodes, storageType);
} }
} else { } else {
//otherwise randomly choose one from the network //otherwise randomly choose one from the network
return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize, return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
maxNodesPerRack, results, avoidStaleNodes); maxNodesPerRack, results, avoidStaleNodes, storageType);
} }
} }
} }
@ -146,8 +154,9 @@ public class BlockPlacementPolicyWithNodeGroup extends BlockPlacementPolicyDefau
@Override @Override
protected void chooseRemoteRack(int numOfReplicas, protected void chooseRemoteRack(int numOfReplicas,
DatanodeDescriptor localMachine, Set<Node> excludedNodes, DatanodeDescriptor localMachine, Set<Node> excludedNodes,
long blocksize, int maxReplicasPerRack, List<DatanodeDescriptor> results, long blocksize, int maxReplicasPerRack, List<DatanodeStorageInfo> results,
boolean avoidStaleNodes) throws NotEnoughReplicasException { boolean avoidStaleNodes, StorageType storageType)
throws NotEnoughReplicasException {
int oldNumOfReplicas = results.size(); int oldNumOfReplicas = results.size();
final String rackLocation = NetworkTopology.getFirstHalf( final String rackLocation = NetworkTopology.getFirstHalf(
@ -155,12 +164,12 @@ public class BlockPlacementPolicyWithNodeGroup extends BlockPlacementPolicyDefau
try { try {
// randomly choose from remote racks // randomly choose from remote racks
chooseRandom(numOfReplicas, "~" + rackLocation, excludedNodes, blocksize, chooseRandom(numOfReplicas, "~" + rackLocation, excludedNodes, blocksize,
maxReplicasPerRack, results, avoidStaleNodes); maxReplicasPerRack, results, avoidStaleNodes, storageType);
} catch (NotEnoughReplicasException e) { } catch (NotEnoughReplicasException e) {
// fall back to the local rack // fall back to the local rack
chooseRandom(numOfReplicas - (results.size() - oldNumOfReplicas), chooseRandom(numOfReplicas - (results.size() - oldNumOfReplicas),
rackLocation, excludedNodes, blocksize, rackLocation, excludedNodes, blocksize,
maxReplicasPerRack, results, avoidStaleNodes); maxReplicasPerRack, results, avoidStaleNodes, storageType);
} }
} }
@ -170,46 +179,40 @@ public class BlockPlacementPolicyWithNodeGroup extends BlockPlacementPolicyDefau
* if still no such node is available, choose a random node in the cluster. * if still no such node is available, choose a random node in the cluster.
* @return the chosen node * @return the chosen node
*/ */
private DatanodeDescriptor chooseLocalNodeGroup( private DatanodeStorageInfo chooseLocalNodeGroup(
NetworkTopologyWithNodeGroup clusterMap, Node localMachine, NetworkTopologyWithNodeGroup clusterMap, Node localMachine,
Set<Node> excludedNodes, long blocksize, int maxNodesPerRack, Set<Node> excludedNodes, long blocksize, int maxNodesPerRack,
List<DatanodeDescriptor> results, boolean avoidStaleNodes) List<DatanodeStorageInfo> results, boolean avoidStaleNodes,
throws NotEnoughReplicasException { StorageType storageType) throws NotEnoughReplicasException {
// no local machine, so choose a random machine // no local machine, so choose a random machine
if (localMachine == null) { if (localMachine == null) {
return chooseRandom(NodeBase.ROOT, excludedNodes, return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
blocksize, maxNodesPerRack, results, avoidStaleNodes); maxNodesPerRack, results, avoidStaleNodes, storageType);
} }
// choose one from the local node group // choose one from the local node group
try { try {
return chooseRandom( return chooseRandom(
clusterMap.getNodeGroup(localMachine.getNetworkLocation()), clusterMap.getNodeGroup(localMachine.getNetworkLocation()),
excludedNodes, blocksize, maxNodesPerRack, results, avoidStaleNodes); excludedNodes, blocksize, maxNodesPerRack, results, avoidStaleNodes,
storageType);
} catch (NotEnoughReplicasException e1) { } catch (NotEnoughReplicasException e1) {
// find the second replica final DatanodeDescriptor newLocal = secondNode(localMachine, results);
DatanodeDescriptor newLocal=null;
for(DatanodeDescriptor nextNode : results) {
if (nextNode != localMachine) {
newLocal = nextNode;
break;
}
}
if (newLocal != null) { if (newLocal != null) {
try { try {
return chooseRandom( return chooseRandom(
clusterMap.getNodeGroup(newLocal.getNetworkLocation()), clusterMap.getNodeGroup(newLocal.getNetworkLocation()),
excludedNodes, blocksize, maxNodesPerRack, results, excludedNodes, blocksize, maxNodesPerRack, results,
avoidStaleNodes); avoidStaleNodes, storageType);
} catch(NotEnoughReplicasException e2) { } catch(NotEnoughReplicasException e2) {
//otherwise randomly choose one from the network //otherwise randomly choose one from the network
return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize, return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
maxNodesPerRack, results, avoidStaleNodes); maxNodesPerRack, results, avoidStaleNodes, storageType);
} }
} else { } else {
//otherwise randomly choose one from the network //otherwise randomly choose one from the network
return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize, return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize,
maxNodesPerRack, results, avoidStaleNodes); maxNodesPerRack, results, avoidStaleNodes, storageType);
} }
} }
} }
View File
@ -30,11 +30,11 @@ import org.apache.hadoop.util.LightWeightGSet.SetIterator;
* the datanodes that store the block. * the datanodes that store the block.
*/ */
class BlocksMap { class BlocksMap {
private static class NodeIterator implements Iterator<DatanodeDescriptor> { private static class StorageIterator implements Iterator<DatanodeStorageInfo> {
private BlockInfo blockInfo; private BlockInfo blockInfo;
private int nextIdx = 0; private int nextIdx = 0;
NodeIterator(BlockInfo blkInfo) { StorageIterator(BlockInfo blkInfo) {
this.blockInfo = blkInfo; this.blockInfo = blkInfo;
} }
@ -45,8 +45,8 @@ class BlocksMap {
} }
@Override @Override
public DatanodeDescriptor next() { public DatanodeStorageInfo next() {
return blockInfo.getDatanode(nextIdx++); return blockInfo.getStorageInfo(nextIdx++);
} }
@Override @Override
@ -129,18 +129,23 @@ class BlocksMap {
/** /**
* Searches for the block in the BlocksMap and * Searches for the block in the BlocksMap and
* returns Iterator that iterates through the nodes the block belongs to. * returns {@link Iterable} of the storages the block belongs to.
*/ */
Iterator<DatanodeDescriptor> nodeIterator(Block b) { Iterable<DatanodeStorageInfo> getStorages(Block b) {
return nodeIterator(blocks.get(b)); return getStorages(blocks.get(b));
} }
/** /**
* For a block that has already been retrieved from the BlocksMap * For a block that has already been retrieved from the BlocksMap
* returns Iterator that iterates through the nodes the block belongs to. * returns {@link Iterable} of the storages the block belongs to.
*/ */
Iterator<DatanodeDescriptor> nodeIterator(BlockInfo storedBlock) { Iterable<DatanodeStorageInfo> getStorages(final BlockInfo storedBlock) {
return new NodeIterator(storedBlock); return new Iterable<DatanodeStorageInfo>() {
@Override
public Iterator<DatanodeStorageInfo> iterator() {
return new StorageIterator(storedBlock);
}
};
} }
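Wrapping the index-based StorageIterator in an anonymous Iterable is what lets callers iterate a block's storages with a plain enhanced for-loop. A minimal, self-contained sketch of the same wrapping technique; the StorageList class and its string contents are invented for illustration and stand in for BlockInfo's storage slots.

import java.util.Iterator;
import java.util.NoSuchElementException;

public class IterableWrapperSketch {
  /** A tiny index-based container, standing in for BlockInfo's storage slots. */
  static class StorageList {
    private final String[] storages;
    StorageList(String... storages) { this.storages = storages; }

    /** Expose the slots as an Iterable so call sites can use for-each. */
    Iterable<String> asIterable() {
      return new Iterable<String>() {
        @Override
        public Iterator<String> iterator() {
          return new Iterator<String>() {
            private int nextIdx = 0;
            @Override public boolean hasNext() { return nextIdx < storages.length; }
            @Override public String next() {
              if (!hasNext()) throw new NoSuchElementException();
              return storages[nextIdx++];
            }
            @Override public void remove() {
              throw new UnsupportedOperationException();
            }
          };
        }
      };
    }
  }

  public static void main(String[] args) {
    StorageList list = new StorageList("storage-1", "storage-2");
    for (String s : list.asIterable()) {  // mirrors "for (DatanodeStorageInfo s : getStorages(b))"
      System.out.println(s);
    }
  }
}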
/** counts number of containing nodes. Better than using iterator. */ /** counts number of containing nodes. Better than using iterator. */
View File
@ -27,6 +27,9 @@ import java.util.Iterator;
import java.util.LinkedList; import java.util.LinkedList;
import java.util.List; import java.util.List;
import java.util.Random; import java.util.Random;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;
import org.apache.commons.logging.Log; import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.LogFactory;
@ -48,6 +51,8 @@ import org.apache.hadoop.hdfs.util.ReadOnlyList;
import org.apache.hadoop.util.GSet; import org.apache.hadoop.util.GSet;
import org.apache.hadoop.util.Time; import org.apache.hadoop.util.Time;
import com.google.common.base.Preconditions;
/** /**
* Scans the namesystem, scheduling blocks to be cached as appropriate. * Scans the namesystem, scheduling blocks to be cached as appropriate.
* *
@ -79,26 +84,48 @@ public class CacheReplicationMonitor extends Thread implements Closeable {
private final long intervalMs; private final long intervalMs;
/** /**
* True if we should rescan immediately, regardless of how much time * The CacheReplicationMonitor (CRM) lock. Used to synchronize starting and
* elapsed since the previous scan. * waiting for rescan operations.
*/ */
private boolean rescanImmediately; private final ReentrantLock lock;
/** /**
* The monotonic time at which the current scan started. * Notifies the scan thread that an immediate rescan is needed.
*/ */
private long scanTimeMs; private final Condition doRescan;
/**
* Notifies waiting threads that a rescan has finished.
*/
private final Condition scanFinished;
/**
* Whether there are pending CacheManager operations that necessitate a
* CacheReplicationMonitor rescan. Protected by the CRM lock.
*/
private boolean needsRescan = true;
/**
* Whether we are currently doing a rescan. Protected by the CRM lock.
*/
private boolean isScanning = false;
/**
* The number of rescans completed. Used to wait for scans to finish.
* Protected by the CacheReplicationMonitor lock.
*/
private long scanCount = 0;
/**
* True if this monitor should terminate. Protected by the CRM lock.
*/
private boolean shutdown = false;
/** /**
* Mark status of the current scan. * Mark status of the current scan.
*/ */
private boolean mark = false; private boolean mark = false;
/**
* True if this monitor should terminate.
*/
private boolean shutdown;
/** /**
* Cache directives found in the previous scan. * Cache directives found in the previous scan.
*/ */
@ -110,53 +137,72 @@ public class CacheReplicationMonitor extends Thread implements Closeable {
private long scannedBlocks; private long scannedBlocks;
public CacheReplicationMonitor(FSNamesystem namesystem, public CacheReplicationMonitor(FSNamesystem namesystem,
CacheManager cacheManager, long intervalMs) { CacheManager cacheManager, long intervalMs, ReentrantLock lock) {
this.namesystem = namesystem; this.namesystem = namesystem;
this.blockManager = namesystem.getBlockManager(); this.blockManager = namesystem.getBlockManager();
this.cacheManager = cacheManager; this.cacheManager = cacheManager;
this.cachedBlocks = cacheManager.getCachedBlocks(); this.cachedBlocks = cacheManager.getCachedBlocks();
this.intervalMs = intervalMs; this.intervalMs = intervalMs;
this.lock = lock;
this.doRescan = this.lock.newCondition();
this.scanFinished = this.lock.newCondition();
} }
@Override @Override
public void run() { public void run() {
shutdown = false; long startTimeMs = 0;
rescanImmediately = true; Thread.currentThread().setName("CacheReplicationMonitor(" +
scanTimeMs = 0; System.identityHashCode(this) + ")");
LOG.info("Starting CacheReplicationMonitor with interval " + LOG.info("Starting CacheReplicationMonitor with interval " +
intervalMs + " milliseconds"); intervalMs + " milliseconds");
try { try {
long curTimeMs = Time.monotonicNow(); long curTimeMs = Time.monotonicNow();
while (true) { while (true) {
synchronized(this) { lock.lock();
try {
while (true) { while (true) {
if (shutdown) { if (shutdown) {
LOG.info("Shutting down CacheReplicationMonitor"); LOG.info("Shutting down CacheReplicationMonitor");
return; return;
} }
if (rescanImmediately) { if (needsRescan) {
LOG.info("Rescanning on request"); LOG.info("Rescanning because of pending operations");
rescanImmediately = false;
break; break;
} }
long delta = (scanTimeMs + intervalMs) - curTimeMs; long delta = (startTimeMs + intervalMs) - curTimeMs;
if (delta <= 0) { if (delta <= 0) {
LOG.info("Rescanning after " + (curTimeMs - scanTimeMs) + LOG.info("Rescanning after " + (curTimeMs - startTimeMs) +
" milliseconds"); " milliseconds");
break; break;
} }
this.wait(delta); doRescan.await(delta, TimeUnit.MILLISECONDS);
curTimeMs = Time.monotonicNow(); curTimeMs = Time.monotonicNow();
} }
isScanning = true;
needsRescan = false;
} finally {
lock.unlock();
} }
scanTimeMs = curTimeMs; startTimeMs = curTimeMs;
mark = !mark; mark = !mark;
rescan(); rescan();
curTimeMs = Time.monotonicNow(); curTimeMs = Time.monotonicNow();
// Update synchronization-related variables.
lock.lock();
try {
isScanning = false;
scanCount++;
scanFinished.signalAll();
} finally {
lock.unlock();
}
LOG.info("Scanned " + scannedDirectives + " directive(s) and " + LOG.info("Scanned " + scannedDirectives + " directive(s) and " +
scannedBlocks + " block(s) in " + (curTimeMs - scanTimeMs) + " " + scannedBlocks + " block(s) in " + (curTimeMs - startTimeMs) + " " +
"millisecond(s)."); "millisecond(s).");
} }
} catch (InterruptedException e) {
LOG.info("Shutting down CacheReplicationMonitor.");
return;
} catch (Throwable t) { } catch (Throwable t) {
LOG.fatal("Thread exiting", t); LOG.fatal("Thread exiting", t);
terminate(1, t); terminate(1, t);
@ -164,41 +210,80 @@ public class CacheReplicationMonitor extends Thread implements Closeable {
} }
/** /**
* Kick the monitor thread. * Waits for a rescan to complete. This doesn't guarantee consistency with
* * pending operations, only relative recency, since it will not force a new
* If it is sleeping, it will wake up and start scanning. * rescan if a rescan is already underway.
* If it is currently scanning, it will finish the scan and immediately do * <p>
* another one. * Note that this call will release the FSN lock, so operations before and
* after are not atomic.
*/ */
public synchronized void kick() { public void waitForRescanIfNeeded() {
rescanImmediately = true; Preconditions.checkArgument(!namesystem.hasWriteLock(),
this.notifyAll(); "Must not hold the FSN write lock when waiting for a rescan.");
Preconditions.checkArgument(lock.isHeldByCurrentThread(),
"Must hold the CRM lock when waiting for a rescan.");
if (!needsRescan) {
return;
}
// If no scan is already ongoing, mark the CRM as dirty and kick
if (!isScanning) {
doRescan.signal();
}
// Wait until the scan finishes and the count advances
final long startCount = scanCount;
while ((!shutdown) && (startCount >= scanCount)) {
try {
scanFinished.await();
} catch (InterruptedException e) {
LOG.warn("Interrupted while waiting for CacheReplicationMonitor"
+ " rescan", e);
break;
}
}
} }
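The intended calling pattern is: take the CRM lock, call setNeedsRescan() while mutating CacheManager state, release the lock, and later call waitForRescanIfNeeded() (again under the CRM lock, never under the FSN write lock) before reading back statistics. A condensed, self-contained sketch of that lock/condition handshake follows; the class and method bodies are invented for illustration and omit the FSN-lock preconditions and interrupt handling of the real monitor.

import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;

public class RescanCoordinatorSketch {
  private final ReentrantLock lock = new ReentrantLock();
  private final Condition doRescan = lock.newCondition();
  private final Condition scanFinished = lock.newCondition();
  private boolean needsRescan = true;
  private boolean isScanning = false;
  private long scanCount = 0;
  private volatile boolean shutdown = false;

  /** Mutating operations call this while holding the lock. */
  void setNeedsRescan() {
    needsRescan = true;
  }

  /** Wait (holding the lock) until a rescan covering pending changes finishes. */
  void waitForRescanIfNeeded() throws InterruptedException {
    if (!needsRescan) {
      return;
    }
    if (!isScanning) {
      doRescan.signal();                 // kick an idle monitor
    }
    final long startCount = scanCount;
    while (!shutdown && startCount >= scanCount) {
      scanFinished.await();              // signalled after each completed scan
    }
  }

  /** Simplified monitor loop: sleep until kicked or the interval elapses. */
  void runMonitor(long intervalMs) throws InterruptedException {
    while (!shutdown) {
      lock.lock();
      try {
        if (!needsRescan) {
          doRescan.await(intervalMs, TimeUnit.MILLISECONDS);
        }
        if (shutdown) {
          return;
        }
        isScanning = true;
        needsRescan = false;
      } finally {
        lock.unlock();
      }
      // ... the actual rescan work happens here, outside the lock ...
      lock.lock();
      try {
        isScanning = false;
        scanCount++;
        scanFinished.signalAll();        // wake waitForRescanIfNeeded() callers
      } finally {
        lock.unlock();
      }
    }
  }

  public static void main(String[] args) throws InterruptedException {
    final RescanCoordinatorSketch crm = new RescanCoordinatorSketch();
    Thread monitor = new Thread(new Runnable() {
      public void run() {
        try {
          crm.runMonitor(30000);
        } catch (InterruptedException ignored) {
        }
      }
    });
    monitor.setDaemon(true);
    monitor.start();
    crm.lock.lock();
    try {
      crm.setNeedsRescan();              // e.g. a cache directive was added
      crm.waitForRescanIfNeeded();       // returns once a rescan has completed
    } finally {
      crm.lock.unlock();
    }
    System.out.println("observed scanCount=" + crm.scanCount);
  }
}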
/** /**
* Shut down and join the monitor thread. * Indicates to the CacheReplicationMonitor that there have been CacheManager
* changes that require a rescan.
*/
public void setNeedsRescan() {
Preconditions.checkArgument(lock.isHeldByCurrentThread(),
"Must hold the CRM lock when setting the needsRescan bit.");
this.needsRescan = true;
}
/**
* Shut down the monitor thread.
*/ */
@Override @Override
public void close() throws IOException { public void close() throws IOException {
synchronized(this) { Preconditions.checkArgument(namesystem.hasWriteLock());
if (shutdown) return; lock.lock();
shutdown = true;
this.notifyAll();
}
try { try {
if (this.isAlive()) { if (shutdown) return;
this.join(60000); // Since we hold both the FSN write lock and the CRM lock here,
} // we know that the CRM thread cannot be currently modifying
} catch (InterruptedException e) { // the cache manager state while we're closing it.
Thread.currentThread().interrupt(); // Since the CRM thread checks the value of 'shutdown' after waiting
// for a lock, we know that the thread will not modify the cache
// manager state after this point.
shutdown = true;
doRescan.signalAll();
scanFinished.signalAll();
} finally {
lock.unlock();
} }
} }
private void rescan() { private void rescan() throws InterruptedException {
scannedDirectives = 0; scannedDirectives = 0;
scannedBlocks = 0; scannedBlocks = 0;
namesystem.writeLock(); namesystem.writeLock();
try { try {
if (shutdown) {
throw new InterruptedException("CacheReplicationMonitor was " +
"shut down.");
}
resetStatistics(); resetStatistics();
rescanCacheDirectives(); rescanCacheDirectives();
rescanCachedBlockMap(); rescanCachedBlockMap();
@ -228,12 +313,14 @@ public class CacheReplicationMonitor extends Thread implements Closeable {
// Reset the directive's statistics // Reset the directive's statistics
directive.resetStatistics(); directive.resetStatistics();
// Skip processing this entry if it has expired // Skip processing this entry if it has expired
LOG.info("Directive expiry is at " + directive.getExpiryTime()); if (LOG.isTraceEnabled()) {
LOG.trace("Directive expiry is at " + directive.getExpiryTime());
}
if (directive.getExpiryTime() > 0 && directive.getExpiryTime() <= now) { if (directive.getExpiryTime() > 0 && directive.getExpiryTime() <= now) {
if (LOG.isDebugEnabled()) { if (LOG.isDebugEnabled()) {
LOG.debug("Skipping directive id " + directive.getId() LOG.debug("Skipping directive id " + directive.getId()
+ " because it has expired (" + directive.getExpiryTime() + ">=" + " because it has expired (" + directive.getExpiryTime() + "<="
+ now); + now + ")");
} }
continue; continue;
} }
@ -280,15 +367,27 @@ public class CacheReplicationMonitor extends Thread implements Closeable {
// Increment the "needed" statistics // Increment the "needed" statistics
directive.addFilesNeeded(1); directive.addFilesNeeded(1);
long neededTotal = 0; // We don't cache UC blocks, don't add them to the total here
for (BlockInfo blockInfo : blockInfos) { long neededTotal = file.computeFileSizeNotIncludingLastUcBlock() *
long neededByBlock = directive.getReplication();
directive.getReplication() * blockInfo.getNumBytes();
neededTotal += neededByBlock;
}
directive.addBytesNeeded(neededTotal); directive.addBytesNeeded(neededTotal);
// TODO: Enforce per-pool quotas // The pool's bytesNeeded is incremented as we scan. If the demand
// thus far plus the demand of this file would exceed the pool's limit,
// do not cache this file.
CachePool pool = directive.getPool();
if (pool.getBytesNeeded() > pool.getLimit()) {
if (LOG.isDebugEnabled()) {
LOG.debug(String.format("Skipping directive id %d file %s because "
+ "limit of pool %s would be exceeded (%d > %d)",
directive.getId(),
file.getFullPathName(),
pool.getPoolName(),
pool.getBytesNeeded(),
pool.getLimit()));
}
return;
}
long cachedTotal = 0; long cachedTotal = 0;
for (BlockInfo blockInfo : blockInfos) { for (BlockInfo blockInfo : blockInfos) {
@ -315,14 +414,21 @@ public class CacheReplicationMonitor extends Thread implements Closeable {
directive.getReplication()) * blockInfo.getNumBytes(); directive.getReplication()) * blockInfo.getNumBytes();
cachedTotal += cachedByBlock; cachedTotal += cachedByBlock;
if (mark != ocblock.getMark()) { if ((mark != ocblock.getMark()) ||
// Mark hasn't been set in this scan, so update replication and mark. (ocblock.getReplication() < directive.getReplication())) {
//
// Overwrite the block's replication and mark in two cases:
//
// 1. If the mark on the CachedBlock is different from the mark for
// this scan, that means the block hasn't been updated during this
// scan, and we should overwrite whatever is there, since it is no
// longer valid.
//
// 2. If the replication in the CachedBlock is less than what the
// directive asks for, we want to increase the block's replication
// field to what the directive asks for.
//
ocblock.setReplicationAndMark(directive.getReplication(), mark); ocblock.setReplicationAndMark(directive.getReplication(), mark);
} else {
// Mark already set in this scan. Set replication to highest value in
// any CacheDirective that covers this file.
ocblock.setReplicationAndMark((short)Math.max(
directive.getReplication(), ocblock.getReplication()), mark);
} }
} }
} }
@ -338,6 +444,36 @@ public class CacheReplicationMonitor extends Thread implements Closeable {
} }
} }
private String findReasonForNotCaching(CachedBlock cblock,
BlockInfo blockInfo) {
if (blockInfo == null) {
// Somehow, a cache report with the block arrived, but the block
// reports from the DataNode haven't (yet?) described such a block.
// Alternately, the NameNode might have invalidated the block, but the
// DataNode hasn't caught up. In any case, we want to tell the DN
// to uncache this.
return "not tracked by the BlockManager";
} else if (!blockInfo.isComplete()) {
// When a cached block changes state from complete to some other state
// on the DataNode (perhaps because of append), it will begin the
// uncaching process. However, the uncaching process is not
// instantaneous, especially if clients have pinned the block. So
// there may be a period of time when incomplete blocks remain cached
// on the DataNodes.
return "not complete";
} else if (cblock.getReplication() == 0) {
// Since 0 is not a valid value for a cache directive's replication
// field, seeing a replication of 0 on a CacheBlock means that it
// has never been reached by any sweep.
return "not needed by any directives";
} else if (cblock.getMark() != mark) {
// Although the block was needed in the past, we didn't reach it during
// the current sweep. Therefore, it doesn't need to be cached any more.
return "no longer needed by any directives";
}
return null;
}
/** /**
* Scan through the cached block map. * Scan through the cached block map.
* Any blocks which are under-replicated should be assigned new Datanodes. * Any blocks which are under-replicated should be assigned new Datanodes.
@ -363,11 +499,17 @@ public class CacheReplicationMonitor extends Thread implements Closeable {
iter.remove(); iter.remove();
} }
} }
// If the block's mark doesn't match with the mark of this scan, that BlockInfo blockInfo = blockManager.
// means that this block couldn't be reached during this scan. That means getStoredBlock(new Block(cblock.getBlockId()));
// it doesn't need to be cached any more. String reason = findReasonForNotCaching(cblock, blockInfo);
int neededCached = (cblock.getMark() != mark) ? int neededCached = 0;
0 : cblock.getReplication(); if (reason != null) {
if (LOG.isDebugEnabled()) {
LOG.debug("not caching " + cblock + " because it is " + reason);
}
} else {
neededCached = cblock.getReplication();
}
int numCached = cached.size(); int numCached = cached.size();
if (numCached >= neededCached) { if (numCached >= neededCached) {
// If we have enough replicas, drop all pending cached. // If we have enough replicas, drop all pending cached.
@ -421,9 +563,6 @@ public class CacheReplicationMonitor extends Thread implements Closeable {
private void addNewPendingUncached(int neededUncached, private void addNewPendingUncached(int neededUncached,
CachedBlock cachedBlock, List<DatanodeDescriptor> cached, CachedBlock cachedBlock, List<DatanodeDescriptor> cached,
List<DatanodeDescriptor> pendingUncached) { List<DatanodeDescriptor> pendingUncached) {
if (!cacheManager.isActive()) {
return;
}
// Figure out which replicas can be uncached. // Figure out which replicas can be uncached.
LinkedList<DatanodeDescriptor> possibilities = LinkedList<DatanodeDescriptor> possibilities =
new LinkedList<DatanodeDescriptor>(); new LinkedList<DatanodeDescriptor>();
@ -459,16 +598,15 @@ public class CacheReplicationMonitor extends Thread implements Closeable {
private void addNewPendingCached(int neededCached, private void addNewPendingCached(int neededCached,
CachedBlock cachedBlock, List<DatanodeDescriptor> cached, CachedBlock cachedBlock, List<DatanodeDescriptor> cached,
List<DatanodeDescriptor> pendingCached) { List<DatanodeDescriptor> pendingCached) {
if (!cacheManager.isActive()) {
return;
}
// To figure out which replicas can be cached, we consult the // To figure out which replicas can be cached, we consult the
// blocksMap. We don't want to try to cache a corrupt replica, though. // blocksMap. We don't want to try to cache a corrupt replica, though.
BlockInfo blockInfo = blockManager. BlockInfo blockInfo = blockManager.
getStoredBlock(new Block(cachedBlock.getBlockId())); getStoredBlock(new Block(cachedBlock.getBlockId()));
if (blockInfo == null) { if (blockInfo == null) {
LOG.debug("Not caching block " + cachedBlock + " because it " + if (LOG.isDebugEnabled()) {
"was deleted from all DataNodes."); LOG.debug("Not caching block " + cachedBlock + " because there " +
"is no record of it on the NameNode.");
}
return; return;
} }
if (!blockInfo.isComplete()) { if (!blockInfo.isComplete()) {
View File
@ -18,23 +18,29 @@
package org.apache.hadoop.hdfs.server.blockmanagement; package org.apache.hadoop.hdfs.server.blockmanagement;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator; import java.util.Iterator;
import java.util.LinkedList; import java.util.LinkedList;
import java.util.List; import java.util.List;
import java.util.Map;
import java.util.Queue; import java.util.Queue;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.hdfs.protocol.Block; import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.DatanodeID; import org.apache.hadoop.hdfs.protocol.DatanodeID;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo; import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.server.namenode.CachedBlock; import org.apache.hadoop.hdfs.server.namenode.CachedBlock;
import org.apache.hadoop.hdfs.server.protocol.DatanodeStorage;
import org.apache.hadoop.hdfs.server.protocol.StorageReport;
import org.apache.hadoop.hdfs.util.LightWeightHashSet; import org.apache.hadoop.hdfs.util.LightWeightHashSet;
import org.apache.hadoop.util.IntrusiveCollection; import org.apache.hadoop.util.IntrusiveCollection;
import org.apache.hadoop.util.Time; import org.apache.hadoop.util.Time;
import com.google.common.annotations.VisibleForTesting;
/** /**
* This class extends the DatanodeInfo class with ephemeral information (eg * This class extends the DatanodeInfo class with ephemeral information (eg
* health, capacity, what blocks are associated with the Datanode) that is * health, capacity, what blocks are associated with the Datanode) that is
@ -43,6 +49,7 @@ import com.google.common.annotations.VisibleForTesting;
@InterfaceAudience.Private @InterfaceAudience.Private
@InterfaceStability.Evolving @InterfaceStability.Evolving
public class DatanodeDescriptor extends DatanodeInfo { public class DatanodeDescriptor extends DatanodeInfo {
public static final Log LOG = LogFactory.getLog(DatanodeDescriptor.class);
public static final DatanodeDescriptor[] EMPTY_ARRAY = {}; public static final DatanodeDescriptor[] EMPTY_ARRAY = {};
// Stores status of decommissioning. // Stores status of decommissioning.
@ -54,9 +61,9 @@ public class DatanodeDescriptor extends DatanodeInfo {
@InterfaceStability.Evolving @InterfaceStability.Evolving
public static class BlockTargetPair { public static class BlockTargetPair {
public final Block block; public final Block block;
public final DatanodeDescriptor[] targets; public final DatanodeStorageInfo[] targets;
BlockTargetPair(Block block, DatanodeDescriptor[] targets) { BlockTargetPair(Block block, DatanodeStorageInfo[] targets) {
this.block = block; this.block = block;
this.targets = targets; this.targets = targets;
} }
@ -99,6 +106,9 @@ public class DatanodeDescriptor extends DatanodeInfo {
} }
} }
private final Map<String, DatanodeStorageInfo> storageMap =
new HashMap<String, DatanodeStorageInfo>();
/** /**
* A list of CachedBlock objects on this datanode. * A list of CachedBlock objects on this datanode.
*/ */
@ -164,37 +174,11 @@ public class DatanodeDescriptor extends DatanodeInfo {
*/ */
private long lastCachingDirectiveSentTimeMs; private long lastCachingDirectiveSentTimeMs;
/**
* Head of the list of blocks on the datanode
*/
private volatile BlockInfo blockList = null;
/**
* Number of blocks on the datanode
*/
private int numBlocks = 0;
// isAlive == heartbeats.contains(this) // isAlive == heartbeats.contains(this)
// This is an optimization, because contains takes O(n) time on Arraylist // This is an optimization, because contains takes O(n) time on Arraylist
public boolean isAlive = false; public boolean isAlive = false;
public boolean needKeyUpdate = false; public boolean needKeyUpdate = false;
/**
* Set to false on any NN failover, and reset to true
* whenever a block report is received.
*/
private boolean heartbeatedSinceFailover = false;
/**
* At startup or at any failover, the DNs in the cluster may
* have pending block deletions from a previous incarnation
* of the NameNode. Thus, we consider their block contents
* stale until we have received a block report. When a DN
* is considered stale, any replicas on it are transitively
* considered stale. If any block has at least one stale replica,
* then no invalidations will be processed for this block.
* See HDFS-1972.
*/
private boolean blockContentsStale = true;
// A system administrator can tune the balancer bandwidth parameter // A system administrator can tune the balancer bandwidth parameter
// (dfs.balance.bandwidthPerSec) dynamically by calling // (dfs.balance.bandwidthPerSec) dynamically by calling
@ -213,7 +197,7 @@ public class DatanodeDescriptor extends DatanodeInfo {
private LightWeightHashSet<Block> invalidateBlocks = new LightWeightHashSet<Block>(); private LightWeightHashSet<Block> invalidateBlocks = new LightWeightHashSet<Block>();
/* Variables for maintaining number of blocks scheduled to be written to /* Variables for maintaining number of blocks scheduled to be written to
* this datanode. This count is approximate and might be slightly bigger * this storage. This count is approximate and might be slightly bigger
* in case of errors (e.g. datanode does not report if an error occurs * in case of errors (e.g. datanode does not report if an error occurs
* while writing the block). * while writing the block).
*/ */
@ -223,9 +207,6 @@ public class DatanodeDescriptor extends DatanodeInfo {
private static final int BLOCKS_SCHEDULED_ROLL_INTERVAL = 600*1000; //10min private static final int BLOCKS_SCHEDULED_ROLL_INTERVAL = 600*1000; //10min
private int volumeFailures = 0; private int volumeFailures = 0;
/** Set to false after processing first block report */
private boolean firstBlockReport = true;
/** /**
* When set to true, the node is not in include list and is not allowed * When set to true, the node is not in include list and is not allowed
* to communicate with the namenode * to communicate with the namenode
@ -237,7 +218,8 @@ public class DatanodeDescriptor extends DatanodeInfo {
* @param nodeID id of the data node * @param nodeID id of the data node
*/ */
public DatanodeDescriptor(DatanodeID nodeID) { public DatanodeDescriptor(DatanodeID nodeID) {
this(nodeID, 0L, 0L, 0L, 0L, 0L, 0L, 0, 0); super(nodeID);
updateHeartbeat(StorageReport.EMPTY_ARRAY, 0L, 0L, 0, 0);
} }
/** /**
@ -247,104 +229,60 @@ public class DatanodeDescriptor extends DatanodeInfo {
*/ */
public DatanodeDescriptor(DatanodeID nodeID, public DatanodeDescriptor(DatanodeID nodeID,
String networkLocation) { String networkLocation) {
this(nodeID, networkLocation, 0L, 0L, 0L, 0L, 0L, 0L, 0, 0);
}
/**
* DatanodeDescriptor constructor
* @param nodeID id of the data node
* @param capacity capacity of the data node
* @param dfsUsed space used by the data node
* @param remaining remaining capacity of the data node
* @param bpused space used by the block pool corresponding to this namenode
* @param cacheCapacity cache capacity of the data node
* @param cacheUsed cache used on the data node
* @param xceiverCount # of data transfers at the data node
*/
public DatanodeDescriptor(DatanodeID nodeID,
long capacity,
long dfsUsed,
long remaining,
long bpused,
long cacheCapacity,
long cacheUsed,
int xceiverCount,
int failedVolumes) {
super(nodeID);
updateHeartbeat(capacity, dfsUsed, remaining, bpused, cacheCapacity,
cacheUsed, xceiverCount, failedVolumes);
}
/**
* DatanodeDescriptor constructor
* @param nodeID id of the data node
* @param networkLocation location of the data node in network
* @param capacity capacity of the data node, including space used by non-dfs
* @param dfsUsed the used space by dfs datanode
* @param remaining remaining capacity of the data node
* @param bpused space used by the block pool corresponding to this namenode
* @param cacheCapacity cache capacity of the data node
* @param cacheUsed cache used on the data node
* @param xceiverCount # of data transfers at the data node
*/
public DatanodeDescriptor(DatanodeID nodeID,
String networkLocation,
long capacity,
long dfsUsed,
long remaining,
long bpused,
long cacheCapacity,
long cacheUsed,
int xceiverCount,
int failedVolumes) {
super(nodeID, networkLocation); super(nodeID, networkLocation);
updateHeartbeat(capacity, dfsUsed, remaining, bpused, cacheCapacity, updateHeartbeat(StorageReport.EMPTY_ARRAY, 0L, 0L, 0, 0);
cacheUsed, xceiverCount, failedVolumes);
} }
/** /**
* Add datanode to the block. * Add data-node to the block. Add block to the head of the list of blocks
* Add block to the head of the list of blocks belonging to the data-node. * belonging to the data-node.
*/ */
public boolean addBlock(BlockInfo b) { public boolean addBlock(String storageID, BlockInfo b) {
if(!b.addNode(this)) DatanodeStorageInfo s = getStorageInfo(storageID);
return false; if (s != null) {
// add to the head of the data-node list return s.addBlock(b);
blockList = b.listInsert(blockList, this);
numBlocks++;
return true;
} }
/**
* Remove block from the list of blocks belonging to the data-node.
* Remove datanode from the block.
*/
public boolean removeBlock(BlockInfo b) {
blockList = b.listRemove(blockList, this);
if ( b.removeNode(this) ) {
numBlocks--;
return true;
} else {
return false; return false;
} }
DatanodeStorageInfo getStorageInfo(String storageID) {
synchronized (storageMap) {
return storageMap.get(storageID);
}
}
DatanodeStorageInfo[] getStorageInfos() {
synchronized (storageMap) {
final Collection<DatanodeStorageInfo> storages = storageMap.values();
return storages.toArray(new DatanodeStorageInfo[storages.size()]);
}
} }
/** /**
* Move block to the head of the list of blocks belonging to the data-node. * Remove block from the list of blocks belonging to the data-node. Remove
* @return the index of the head of the blockList * data-node from the block.
*/ */
int moveBlockToHead(BlockInfo b, int curIndex, int headIndex) { boolean removeBlock(BlockInfo b) {
blockList = b.moveBlockToHead(blockList, this, curIndex, headIndex); int index = b.findStorageInfo(this);
return curIndex; // if block exists on this datanode
if (index >= 0) {
DatanodeStorageInfo s = b.getStorageInfo(index);
if (s != null) {
return s.removeBlock(b);
}
}
return false;
} }
/** /**
* Used for testing only * Remove block from the list of blocks belonging to the data-node. Remove
* @return the head of the blockList * data-node from the block.
*/ */
@VisibleForTesting boolean removeBlock(String storageID, BlockInfo b) {
protected BlockInfo getHead(){ DatanodeStorageInfo s = getStorageInfo(storageID);
return blockList; if (s != null) {
return s.removeBlock(b);
}
return false;
} }
/** /**
@ -355,9 +293,12 @@ public class DatanodeDescriptor extends DatanodeInfo {
* @return the new block * @return the new block
*/ */
public BlockInfo replaceBlock(BlockInfo oldBlock, BlockInfo newBlock) { public BlockInfo replaceBlock(BlockInfo oldBlock, BlockInfo newBlock) {
boolean done = removeBlock(oldBlock); int index = oldBlock.findStorageInfo(this);
DatanodeStorageInfo s = oldBlock.getStorageInfo(index);
boolean done = s.removeBlock(oldBlock);
assert done : "Old block should belong to the data-node when replacing"; assert done : "Old block should belong to the data-node when replacing";
done = addBlock(newBlock);
done = s.addBlock(newBlock);
assert done : "New block should not belong to the data-node when replacing"; assert done : "New block should not belong to the data-node when replacing";
return newBlock; return newBlock;
} }
@ -368,7 +309,6 @@ public class DatanodeDescriptor extends DatanodeInfo {
setBlockPoolUsed(0); setBlockPoolUsed(0);
setDfsUsed(0); setDfsUsed(0);
setXceiverCount(0); setXceiverCount(0);
this.blockList = null;
this.invalidateBlocks.clear(); this.invalidateBlocks.clear();
this.volumeFailures = 0; this.volumeFailures = 0;
// pendingCached, cached, and pendingUncached are protected by the // pendingCached, cached, and pendingUncached are protected by the
@ -392,66 +332,97 @@ public class DatanodeDescriptor extends DatanodeInfo {
} }
public int numBlocks() { public int numBlocks() {
return numBlocks; int blocks = 0;
for (DatanodeStorageInfo entry : getStorageInfos()) {
blocks += entry.numBlocks();
}
return blocks;
} }
/** /**
* Updates stats from datanode heartbeat. * Updates stats from datanode heartbeat.
*/ */
public void updateHeartbeat(long capacity, long dfsUsed, long remaining, public void updateHeartbeat(StorageReport[] reports, long cacheCapacity,
long blockPoolUsed, long cacheCapacity, long cacheUsed, int xceiverCount, long cacheUsed, int xceiverCount, int volFailures) {
int volFailures) { long totalCapacity = 0;
setCapacity(capacity); long totalRemaining = 0;
setRemaining(remaining); long totalBlockPoolUsed = 0;
setBlockPoolUsed(blockPoolUsed); long totalDfsUsed = 0;
setDfsUsed(dfsUsed);
setCacheCapacity(cacheCapacity); setCacheCapacity(cacheCapacity);
setCacheUsed(cacheUsed); setCacheUsed(cacheUsed);
setXceiverCount(xceiverCount); setXceiverCount(xceiverCount);
setLastUpdate(Time.now()); setLastUpdate(Time.now());
this.volumeFailures = volFailures; this.volumeFailures = volFailures;
this.heartbeatedSinceFailover = true; for (StorageReport report : reports) {
DatanodeStorageInfo storage = storageMap.get(report.getStorageID());
if (storage == null) {
// This is seen during cluster initialization when the heartbeat
// is received before the initial block reports from each storage.
storage = updateStorage(new DatanodeStorage(report.getStorageID()));
}
storage.receivedHeartbeat(report);
totalCapacity += report.getCapacity();
totalRemaining += report.getRemaining();
totalBlockPoolUsed += report.getBlockPoolUsed();
totalDfsUsed += report.getDfsUsed();
}
rollBlocksScheduled(getLastUpdate()); rollBlocksScheduled(getLastUpdate());
// Update total metrics for the node.
setCapacity(totalCapacity);
setRemaining(totalRemaining);
setBlockPoolUsed(totalBlockPoolUsed);
setDfsUsed(totalDfsUsed);
} }
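With heterogeneous storages, the node-level capacity figures are now just sums over the per-storage reports carried by each heartbeat, with unknown storage IDs created on first sight. A simplified, self-contained sketch of that aggregation; the Report class below is an invented stand-in for StorageReport and carries only the fields needed here.

import java.util.HashMap;
import java.util.Map;

public class HeartbeatAggregationSketch {
  /** Invented stand-in for a per-storage heartbeat report. */
  static class Report {
    final String storageId;
    final long capacity, dfsUsed, remaining;
    Report(String storageId, long capacity, long dfsUsed, long remaining) {
      this.storageId = storageId;
      this.capacity = capacity;
      this.dfsUsed = dfsUsed;
      this.remaining = remaining;
    }
  }

  private final Map<String, Report> storageMap = new HashMap<>();
  private long totalCapacity, totalDfsUsed, totalRemaining;

  /** Record each storage (get-or-create in the real code) and roll up totals. */
  void updateHeartbeat(Report[] reports) {
    long capacity = 0, dfsUsed = 0, remaining = 0;
    for (Report r : reports) {
      storageMap.put(r.storageId, r);
      capacity += r.capacity;
      dfsUsed += r.dfsUsed;
      remaining += r.remaining;
    }
    totalCapacity = capacity;
    totalDfsUsed = dfsUsed;
    totalRemaining = remaining;
  }

  public static void main(String[] args) {
    HeartbeatAggregationSketch dn = new HeartbeatAggregationSketch();
    dn.updateHeartbeat(new Report[] {
        new Report("DS-1", 100, 40, 60),
        new Report("DS-2", 200, 50, 150),
    });
    System.out.println(dn.totalCapacity + " " + dn.totalDfsUsed + " "
        + dn.totalRemaining);   // 300 90 210
  }
}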
/** private static class BlockIterator implements Iterator<BlockInfo> {
* Iterates over the list of blocks belonging to the datanode. private int index = 0;
*/ private final List<Iterator<BlockInfo>> iterators;
public static class BlockIterator implements Iterator<BlockInfo> {
private BlockInfo current;
private DatanodeDescriptor node;
BlockIterator(BlockInfo head, DatanodeDescriptor dn) { private BlockIterator(final DatanodeStorageInfo... storages) {
this.current = head; List<Iterator<BlockInfo>> iterators = new ArrayList<Iterator<BlockInfo>>();
this.node = dn; for (DatanodeStorageInfo e : storages) {
iterators.add(e.getBlockIterator());
}
this.iterators = Collections.unmodifiableList(iterators);
} }
@Override @Override
public boolean hasNext() { public boolean hasNext() {
return current != null; update();
return !iterators.isEmpty() && iterators.get(index).hasNext();
} }
@Override @Override
public BlockInfo next() { public BlockInfo next() {
BlockInfo res = current; update();
current = current.getNext(current.findDatanode(node)); return iterators.get(index).next();
return res;
} }
@Override @Override
public void remove() { public void remove() {
throw new UnsupportedOperationException("Sorry. can't remove."); throw new UnsupportedOperationException("Remove unsupported.");
}
private void update() {
while(index < iterators.size() - 1 && !iterators.get(index).hasNext()) {
index++;
}
} }
} }
public Iterator<BlockInfo> getBlockIterator() { Iterator<BlockInfo> getBlockIterator() {
return new BlockIterator(this.blockList, this); return new BlockIterator(getStorageInfos());
}
Iterator<BlockInfo> getBlockIterator(final String storageID) {
return new BlockIterator(getStorageInfo(storageID));
} }
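Because blocks now live under individual storages, iterating a whole DataNode means chaining the per-storage iterators and skipping ahead to the next non-empty one on demand, exactly as the update() helper above does. A self-contained sketch of that chaining; lists of strings stand in for the per-storage block lists.

import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

public class ChainedIteratorSketch {
  /** Iterate several iterators back to back, skipping exhausted ones. */
  static class Chained<T> implements Iterator<T> {
    private final List<Iterator<T>> iterators;
    private int index = 0;

    Chained(List<Iterator<T>> iterators) { this.iterators = iterators; }

    private void update() {
      while (index < iterators.size() - 1 && !iterators.get(index).hasNext()) {
        index++;
      }
    }

    @Override public boolean hasNext() {
      update();
      return !iterators.isEmpty() && iterators.get(index).hasNext();
    }

    @Override public T next() {
      update();
      return iterators.get(index).next();
    }

    @Override public void remove() {
      throw new UnsupportedOperationException();
    }
  }

  public static void main(String[] args) {
    List<String> storage1 = Arrays.asList("blk_1", "blk_2");
    List<String> storage2 = Arrays.asList();           // empty storage is skipped
    List<String> storage3 = Arrays.asList("blk_3");
    Chained<String> it = new Chained<>(Arrays.asList(
        storage1.iterator(), storage2.iterator(), storage3.iterator()));
    while (it.hasNext()) {
      System.out.println(it.next());                   // blk_1, blk_2, blk_3
    }
  }
}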
  /**
   * Store block replication work.
   */
  void addBlockToBeReplicated(Block block, DatanodeStorageInfo[] targets) {
    assert(block != null && targets != null && targets.length > 0);
    replicateBlocks.offer(new BlockTargetPair(block, targets));
  }

@@ -527,17 +498,13 @@ public class DatanodeDescriptor extends DatanodeInfo {
    return currApproxBlocksScheduled + prevApproxBlocksScheduled;
  }

  /** Increment the number of blocks scheduled. */
  void incrementBlocksScheduled() {
    currApproxBlocksScheduled++;
  }

  /** Decrement the number of blocks scheduled. */
  void decrementBlocksScheduled() {
    if (prevApproxBlocksScheduled > 0) {
      prevApproxBlocksScheduled--;
    } else if (currApproxBlocksScheduled > 0) {
@@ -546,12 +513,9 @@ public class DatanodeDescriptor extends DatanodeInfo {
      // its ok if both counters are zero.
  }

  /** Adjusts curr and prev number of blocks scheduled every few minutes. */
  private void rollBlocksScheduled(long now) {
    if (now - lastBlocksScheduledRollTime > BLOCKS_SCHEDULED_ROLL_INTERVAL) {
      prevApproxBlocksScheduled = currApproxBlocksScheduled;
      currApproxBlocksScheduled = 0;
      lastBlocksScheduledRollTime = now;
@@ -647,7 +611,11 @@ public class DatanodeDescriptor extends DatanodeInfo {
  @Override
  public void updateRegInfo(DatanodeID nodeReg) {
    super.updateRegInfo(nodeReg);

    // must re-process IBR after re-registration
    for(DatanodeStorageInfo storage : getStorageInfos()) {
      storage.setBlockReportCount(0);
    }
  }

  /**
@@ -664,26 +632,6 @@ public class DatanodeDescriptor extends DatanodeInfo {
    this.bandwidth = bandwidth;
  }
public boolean areBlockContentsStale() {
return blockContentsStale;
}
public void markStaleAfterFailover() {
heartbeatedSinceFailover = false;
blockContentsStale = true;
}
public void receivedBlockReport() {
if (heartbeatedSinceFailover) {
blockContentsStale = false;
}
firstBlockReport = false;
}
boolean isFirstBlockReport() {
return firstBlockReport;
}
  @Override
  public String dumpDatanode() {
    StringBuilder sb = new StringBuilder(super.dumpDatanode());
@@ -702,6 +650,19 @@ public class DatanodeDescriptor extends DatanodeInfo {
    return sb.toString();
  }
DatanodeStorageInfo updateStorage(DatanodeStorage s) {
synchronized (storageMap) {
DatanodeStorageInfo storage = storageMap.get(s.getStorageID());
if (storage == null) {
LOG.info("Adding new storage ID " + s.getStorageID() +
" for DN " + getXferAddr());
storage = new DatanodeStorageInfo(this, s);
storageMap.put(s.getStorageID(), storage);
}
return storage;
}
}
  /**
   * @return The time at which we last sent caching directives to this
   *         DataNode, in monotonic milliseconds.
@@ -718,3 +679,4 @@ public class DatanodeDescriptor extends DatanodeInfo {
    this.lastCachingDirectiveSentTimeMs = time;
  }
}
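The updateStorage method added above is a create-on-first-use lookup guarded by a lock on the map. A stripped-down sketch of the same idiom follows; Registry and Entry are invented names for illustration, not types from this patch.

import java.util.*;

// Illustrative only: create-on-first-use under a lock, as updateStorage(...)
// does for storageMap above.
class Registry {
  static class Entry {
    final String id;
    Entry(String id) { this.id = id; }
  }

  private final Map<String, Entry> entries = new HashMap<String, Entry>();

  Entry getOrCreate(String id) {
    synchronized (entries) {
      Entry e = entries.get(id);
      if (e == null) {
        // First time this ID is seen; register it.
        e = new Entry(id);
        entries.put(id, e);
      }
      return e;
    }
  }

  public static void main(String[] args) {
    Registry r = new Registry();
    Entry a = r.getOrCreate("DS-1");
    Entry b = r.getOrCreate("DS-1");
    System.out.println(a == b); // true: the same entry is reused
  }
}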
@@ -424,9 +424,13 @@ public class DatanodeManager {
  }

  /** Get a datanode descriptor given corresponding DatanodeUUID */
  DatanodeDescriptor getDatanode(final String datanodeUuid) {
    if (datanodeUuid == null) {
      return null;
    }
    return datanodeMap.get(datanodeUuid);
  }

  /**
@@ -438,7 +442,7 @@
   */
  public DatanodeDescriptor getDatanode(DatanodeID nodeID
      ) throws UnregisteredNodeException {
    final DatanodeDescriptor node = getDatanode(nodeID.getDatanodeUuid());
    if (node == null)
      return null;
    if (!node.getXferAddr().equals(nodeID.getXferAddr())) {
@@ -451,6 +455,20 @@
    return node;
  }
public DatanodeStorageInfo[] getDatanodeStorageInfos(
DatanodeID[] datanodeID, String[] storageIDs)
throws UnregisteredNodeException {
if (datanodeID.length == 0) {
return null;
}
final DatanodeStorageInfo[] storages = new DatanodeStorageInfo[datanodeID.length];
for(int i = 0; i < datanodeID.length; i++) {
final DatanodeDescriptor dd = getDatanode(datanodeID[i]);
storages[i] = dd.getStorageInfo(storageIDs[i]);
}
return storages;
}
  /** Prints information about all datanodes. */
  void datanodeDump(final PrintWriter out) {
    synchronized (datanodeMap) {
@@ -528,7 +546,7 @@
      // remove from host2DatanodeMap the datanodeDescriptor removed
      // from datanodeMap before adding node to host2DatanodeMap.
      synchronized(datanodeMap) {
        host2DatanodeMap.remove(datanodeMap.put(node.getDatanodeUuid(), node));
      }

      networktopology.add(node); // may throw InvalidTopologyException
@@ -543,7 +561,7 @@
  /** Physically remove node from datanodeMap. */
  private void wipeDatanode(final DatanodeID node) {
    final String key = node.getDatanodeUuid();
    synchronized (datanodeMap) {
      host2DatanodeMap.remove(datanodeMap.remove(key));
    }
@@ -705,8 +723,10 @@
  /** Start decommissioning the specified datanode. */
  private void startDecommission(DatanodeDescriptor node) {
    if (!node.isDecommissionInProgress() && !node.isDecommissioned()) {
      for (DatanodeStorageInfo storage : node.getStorageInfos()) {
        LOG.info("Start Decommissioning " + node + " " + storage
            + " with " + storage.numBlocks() + " blocks");
      }
      heartbeatManager.startDecommission(node);
      node.decommissioningStatus.setStartTime(now());
@@ -728,24 +748,6 @@
    }
  }
/**
* Generate new storage ID.
*
* @return unique storage ID
*
* Note: that collisions are still possible if somebody will try
* to bring in a data storage from a different cluster.
*/
private String newStorageID() {
String newID = null;
while(newID == null) {
newID = "DS" + Integer.toString(DFSUtil.getRandom().nextInt());
if (datanodeMap.get(newID) != null)
newID = null;
}
return newID;
}
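The generator above is removed by this change: the NameNode no longer hands out "DS<random int>" storage IDs, and nodes are instead addressed by a DatanodeUuid they report themselves. The one-liner below only illustrates the kind of globally unique, self-assigned identifier the new naming suggests (the java.util.UUID import added to DataNode.java later in this diff hints at this); it is not the patch's actual call site.

import java.util.UUID;

// Illustrative only: a self-assigned, collision-resistant identifier,
// in contrast to the NN-generated "DS<random int>" IDs removed above.
class UuidExample {
  public static void main(String[] args) {
    String datanodeUuid = UUID.randomUUID().toString();
    System.out.println("Generated identifier: " + datanodeUuid);
  }
}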
  /**
   * Register the given datanode with the namenode. NB: the given
   * registration is mutated and given back to the datanode.
@@ -784,9 +786,9 @@
    }

    NameNode.stateChangeLog.info("BLOCK* registerDatanode: from "
        + nodeReg + " storage " + nodeReg.getDatanodeUuid());

    DatanodeDescriptor nodeS = getDatanode(nodeReg.getDatanodeUuid());
    DatanodeDescriptor nodeN = host2DatanodeMap.getDatanodeByXferAddr(
        nodeReg.getIpAddr(), nodeReg.getXferPort());
@@ -821,7 +823,7 @@
       */
      NameNode.stateChangeLog.info("BLOCK* registerDatanode: " + nodeS
          + " is replaced by " + nodeReg + " with the same storageID "
          + nodeReg.getDatanodeUuid());
    }

    boolean success = false;
@@ -855,18 +857,6 @@
      return;
    }
// this is a new datanode serving a new data storage
if ("".equals(nodeReg.getStorageID())) {
// this data storage has never been registered
// it is either empty or was created by pre-storageID version of DFS
nodeReg.setStorageID(newStorageID());
if (NameNode.stateChangeLog.isDebugEnabled()) {
NameNode.stateChangeLog.debug(
"BLOCK* NameSystem.registerDatanode: "
+ "new storageID " + nodeReg.getStorageID() + " assigned.");
}
}
    DatanodeDescriptor nodeDescr
        = new DatanodeDescriptor(nodeReg, NetworkTopology.DEFAULT_RACK);
    boolean success = false;
@@ -1234,10 +1224,10 @@

  /** Handle heartbeat from datanodes. */
  public DatanodeCommand[] handleHeartbeat(DatanodeRegistration nodeReg,
      StorageReport[] reports, final String blockPoolId,
      long cacheCapacity, long cacheUsed, int xceiverCount,
      int maxTransfers, int failedVolumes
      ) throws IOException {
    synchronized (heartbeatManager) {
      synchronized (datanodeMap) {
        DatanodeDescriptor nodeinfo = null;
@@ -1257,9 +1247,9 @@
          return new DatanodeCommand[]{RegisterCommand.REGISTER};
        }

        heartbeatManager.updateHeartbeat(nodeinfo, reports,
                                         cacheCapacity, cacheUsed,
                                         xceiverCount, failedVolumes);

        // If we are in safemode, do not send back any recovery / replication
        // requests. Don't even drain the existing queue of work.
@@ -1274,32 +1264,32 @@
          BlockRecoveryCommand brCommand = new BlockRecoveryCommand(
              blocks.length);
          for (BlockInfoUnderConstruction b : blocks) {
            final DatanodeStorageInfo[] storages = b.getExpectedStorageLocations();
            // Skip stale nodes during recovery - not heart beated for some time (30s by default).
            final List<DatanodeStorageInfo> recoveryLocations =
                new ArrayList<DatanodeStorageInfo>(storages.length);
            for (int i = 0; i < storages.length; i++) {
              if (!storages[i].getDatanodeDescriptor().isStale(staleInterval)) {
                recoveryLocations.add(storages[i]);
              }
            }
            // If we only get 1 replica after eliminating stale nodes, then choose all
            // replicas for recovery and let the primary data node handle failures.
            if (recoveryLocations.size() > 1) {
              if (recoveryLocations.size() != storages.length) {
                LOG.info("Skipped stale nodes for recovery : " +
                    (storages.length - recoveryLocations.size()));
              }
              brCommand.add(new RecoveringBlock(
                  new ExtendedBlock(blockPoolId, b),
                  DatanodeStorageInfo.toDatanodeInfos(recoveryLocations),
                  b.getBlockRecoveryId()));
            } else {
              // If too many replicas are stale, then choose all replicas to participate
              // in block recovery.
              brCommand.add(new RecoveringBlock(
                  new ExtendedBlock(blockPoolId, b),
                  DatanodeStorageInfo.toDatanodeInfos(storages),
                  b.getBlockRecoveryId()));
            }
          }
@@ -1416,7 +1406,9 @@
    LOG.info("Marking all datandoes as stale");
    synchronized (datanodeMap) {
      for (DatanodeDescriptor dn : datanodeMap.values()) {
        for(DatanodeStorageInfo storage : dn.getStorageInfos()) {
          storage.markStaleAfterFailover();
        }
      }
    }
  }
@@ -1451,7 +1443,15 @@
    return getClass().getSimpleName() + ": " + host2DatanodeMap;
  }
public void clearPendingCachingCommands() {
for (DatanodeDescriptor dn : datanodeMap.values()) {
dn.getPendingCached().clear();
dn.getPendingUncached().clear();
}
}
  public void setShouldSendCachingCommands(boolean shouldSendCachingCommands) {
    this.shouldSendCachingCommands = shouldSendCachingCommands;
  }
}
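The recovery-location selection in handleHeartbeat above keeps only non-stale storages, but falls back to every replica when fewer than two survive the filter. The standalone sketch below shows that selection rule in isolation; Replica and the "stale" flag are stand-ins invented for the example.

import java.util.*;

// Illustrative only: "filter stale replicas, but fall back to all of them"
// as done when building the block-recovery command above.
class RecoverySelection {
  static class Replica {
    final String name;
    final boolean stale;
    Replica(String name, boolean stale) { this.name = name; this.stale = stale; }
  }

  static List<Replica> chooseRecoveryLocations(List<Replica> replicas) {
    List<Replica> fresh = new ArrayList<Replica>();
    for (Replica r : replicas) {
      if (!r.stale) {
        fresh.add(r);
      }
    }
    // With fewer than two fresh replicas, use all of them and let the
    // primary datanode deal with individual failures.
    return fresh.size() > 1 ? fresh : replicas;
  }

  public static void main(String[] args) {
    List<Replica> all = Arrays.asList(
        new Replica("dn1", false), new Replica("dn2", true), new Replica("dn3", false));
    System.out.println(chooseRecoveryLocations(all).size()); // 2
  }
}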
@@ -0,0 +1,288 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.server.blockmanagement;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import com.google.common.annotations.VisibleForTesting;
import org.apache.hadoop.hdfs.StorageType;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.server.protocol.DatanodeStorage;
import org.apache.hadoop.hdfs.server.protocol.DatanodeStorage.State;
import org.apache.hadoop.hdfs.server.protocol.StorageReport;
/**
* A Datanode has one or more storages. A storage in the Datanode is represented
* by this class.
*/
public class DatanodeStorageInfo {
public static final DatanodeStorageInfo[] EMPTY_ARRAY = {};
public static DatanodeInfo[] toDatanodeInfos(DatanodeStorageInfo[] storages) {
return toDatanodeInfos(Arrays.asList(storages));
}
static DatanodeInfo[] toDatanodeInfos(List<DatanodeStorageInfo> storages) {
final DatanodeInfo[] datanodes = new DatanodeInfo[storages.size()];
for(int i = 0; i < storages.size(); i++) {
datanodes[i] = storages.get(i).getDatanodeDescriptor();
}
return datanodes;
}
static DatanodeDescriptor[] toDatanodeDescriptors(
DatanodeStorageInfo[] storages) {
DatanodeDescriptor[] datanodes = new DatanodeDescriptor[storages.length];
for (int i = 0; i < storages.length; ++i) {
datanodes[i] = storages[i].getDatanodeDescriptor();
}
return datanodes;
}
public static String[] toStorageIDs(DatanodeStorageInfo[] storages) {
String[] storageIDs = new String[storages.length];
for(int i = 0; i < storageIDs.length; i++) {
storageIDs[i] = storages[i].getStorageID();
}
return storageIDs;
}
public static StorageType[] toStorageTypes(DatanodeStorageInfo[] storages) {
StorageType[] storageTypes = new StorageType[storages.length];
for(int i = 0; i < storageTypes.length; i++) {
storageTypes[i] = storages[i].getStorageType();
}
return storageTypes;
}
/**
* Iterates over the list of blocks belonging to the data-node.
*/
class BlockIterator implements Iterator<BlockInfo> {
private BlockInfo current;
BlockIterator(BlockInfo head) {
this.current = head;
}
public boolean hasNext() {
return current != null;
}
public BlockInfo next() {
BlockInfo res = current;
current = current.getNext(current.findStorageInfo(DatanodeStorageInfo.this));
return res;
}
public void remove() {
throw new UnsupportedOperationException("Sorry. can't remove.");
}
}
private final DatanodeDescriptor dn;
private final String storageID;
private final StorageType storageType;
private final State state;
private long capacity;
private long dfsUsed;
private long remaining;
private long blockPoolUsed;
private volatile BlockInfo blockList = null;
private int numBlocks = 0;
/** The number of block reports received */
private int blockReportCount = 0;
/**
* Set to false on any NN failover, and reset to true
* whenever a block report is received.
*/
private boolean heartbeatedSinceFailover = false;
/**
* At startup or at failover, the storages in the cluster may have pending
* block deletions from a previous incarnation of the NameNode. The block
* contents are considered as stale until a block report is received. When a
* storage is considered as stale, the replicas on it are also considered as
* stale. If any block has at least one stale replica, then no invalidations
* will be processed for this block. See HDFS-1972.
*/
private boolean blockContentsStale = true;
DatanodeStorageInfo(DatanodeDescriptor dn, DatanodeStorage s) {
this.dn = dn;
this.storageID = s.getStorageID();
this.storageType = s.getStorageType();
this.state = s.getState();
}
int getBlockReportCount() {
return blockReportCount;
}
void setBlockReportCount(int blockReportCount) {
this.blockReportCount = blockReportCount;
}
boolean areBlockContentsStale() {
return blockContentsStale;
}
void markStaleAfterFailover() {
heartbeatedSinceFailover = false;
blockContentsStale = true;
}
void receivedHeartbeat(StorageReport report) {
updateState(report);
heartbeatedSinceFailover = true;
}
void receivedBlockReport() {
if (heartbeatedSinceFailover) {
blockContentsStale = false;
}
blockReportCount++;
}
@VisibleForTesting
public void setUtilizationForTesting(long capacity, long dfsUsed,
long remaining, long blockPoolUsed) {
this.capacity = capacity;
this.dfsUsed = dfsUsed;
this.remaining = remaining;
this.blockPoolUsed = blockPoolUsed;
}
State getState() {
return this.state;
}
String getStorageID() {
return storageID;
}
StorageType getStorageType() {
return storageType;
}
long getCapacity() {
return capacity;
}
long getDfsUsed() {
return dfsUsed;
}
long getRemaining() {
return remaining;
}
long getBlockPoolUsed() {
return blockPoolUsed;
}
boolean addBlock(BlockInfo b) {
if(!b.addStorage(this))
return false;
// add to the head of the data-node list
blockList = b.listInsert(blockList, this);
numBlocks++;
return true;
}
boolean removeBlock(BlockInfo b) {
blockList = b.listRemove(blockList, this);
if (b.removeStorage(this)) {
numBlocks--;
return true;
} else {
return false;
}
}
int numBlocks() {
return numBlocks;
}
Iterator<BlockInfo> getBlockIterator() {
return new BlockIterator(blockList);
}
/**
* Move block to the head of the list of blocks belonging to the data-node.
* @return the index of the head of the blockList
*/
int moveBlockToHead(BlockInfo b, int curIndex, int headIndex) {
blockList = b.moveBlockToHead(blockList, this, curIndex, headIndex);
return curIndex;
}
/**
* Used for testing only
* @return the head of the blockList
*/
@VisibleForTesting
BlockInfo getBlockListHeadForTesting(){
return blockList;
}
void updateState(StorageReport r) {
capacity = r.getCapacity();
dfsUsed = r.getDfsUsed();
remaining = r.getRemaining();
blockPoolUsed = r.getBlockPoolUsed();
}
public DatanodeDescriptor getDatanodeDescriptor() {
return dn;
}
/** Increment the number of blocks scheduled for each given storage */
public static void incrementBlocksScheduled(DatanodeStorageInfo... storages) {
for (DatanodeStorageInfo s : storages) {
s.getDatanodeDescriptor().incrementBlocksScheduled();
}
}
@Override
public boolean equals(Object obj) {
if (this == obj) {
return true;
} else if (obj == null || !(obj instanceof DatanodeStorageInfo)) {
return false;
}
final DatanodeStorageInfo that = (DatanodeStorageInfo)obj;
return this.storageID.equals(that.storageID);
}
@Override
public int hashCode() {
return storageID.hashCode();
}
@Override
public String toString() {
return "[" + storageType + "]" + storageID + ":" + state;
}
}
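One detail of the new class worth calling out: equals() and hashCode() key purely on the storage ID, so instances behave as stable map/set keys even while their usage counters change. The self-contained illustration below uses StorageKey, an invented stand-in, not the real class.

import java.util.*;

// Illustrative only: identity keyed on the storage ID, mirroring
// DatanodeStorageInfo.equals()/hashCode() above.
class StorageKey {
  private final String storageID;
  private long dfsUsed; // mutable state, deliberately excluded from identity

  StorageKey(String storageID, long dfsUsed) {
    this.storageID = storageID;
    this.dfsUsed = dfsUsed;
  }

  @Override
  public boolean equals(Object obj) {
    if (this == obj) {
      return true;
    } else if (obj == null || !(obj instanceof StorageKey)) {
      return false;
    }
    return this.storageID.equals(((StorageKey) obj).storageID);
  }

  @Override
  public int hashCode() {
    return storageID.hashCode();
  }

  public static void main(String[] args) {
    Set<StorageKey> set = new HashSet<StorageKey>();
    set.add(new StorageKey("DS-1234", 100L));
    // Same ID, different usage figure: still treated as the same storage.
    System.out.println(set.contains(new StorageKey("DS-1234", 999L))); // true
  }
}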
@@ -27,6 +27,7 @@ import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.protocol.DatanodeID;
import org.apache.hadoop.hdfs.server.namenode.Namesystem;
import org.apache.hadoop.hdfs.server.protocol.StorageReport;
import org.apache.hadoop.util.Daemon;
import org.apache.hadoop.util.Time;
@@ -181,7 +182,7 @@ class HeartbeatManager implements DatanodeStatistics {
      addDatanode(d);
      //update its timestamp
      d.updateHeartbeat(StorageReport.EMPTY_ARRAY, 0L, 0L, 0, 0);
    }
  }
@@ -203,11 +204,11 @@
  }

  synchronized void updateHeartbeat(final DatanodeDescriptor node,
      StorageReport[] reports, long cacheCapacity, long cacheUsed,
      int xceiverCount, int failedVolumes) {
    stats.subtract(node);
    node.updateHeartbeat(reports, cacheCapacity, cacheUsed,
        xceiverCount, failedVolumes);
    stats.add(node);
  }
@@ -358,3 +359,4 @@ class HeartbeatManager implements DatanodeStatistics {
    }
  }
}
@@ -78,10 +78,10 @@ class InvalidateBlocks {
   */
  synchronized void add(final Block block, final DatanodeInfo datanode,
      final boolean log) {
    LightWeightHashSet<Block> set = node2blocks.get(datanode.getDatanodeUuid());
    if (set == null) {
      set = new LightWeightHashSet<Block>();
      node2blocks.put(datanode.getDatanodeUuid(), set);
    }
    if (set.add(block)) {
      numBlocks++;
@@ -34,5 +34,5 @@ public interface MutableBlockCollection extends BlockCollection {
   * and set the locations.
   */
  public BlockInfoUnderConstruction setLastBlock(BlockInfo lastBlock,
      DatanodeStorageInfo[] storages) throws IOException;
}
@@ -42,11 +42,13 @@ class PendingDataNodeMessages {
  static class ReportedBlockInfo {
    private final Block block;
    private final DatanodeDescriptor dn;
    private final String storageID;
    private final ReplicaState reportedState;

    ReportedBlockInfo(DatanodeDescriptor dn, String storageID, Block block,
        ReplicaState reportedState) {
      this.dn = dn;
      this.storageID = storageID;
      this.block = block;
      this.reportedState = reportedState;
    }
@@ -59,6 +61,10 @@ class PendingDataNodeMessages {
      return dn;
    }

    String getStorageID() {
      return storageID;
    }

    ReplicaState getReportedState() {
      return reportedState;
    }
@@ -70,11 +76,11 @@
    }
  }

  void enqueueReportedBlock(DatanodeDescriptor dn, String storageID, Block block,
      ReplicaState reportedState) {
    block = new Block(block);
    getBlockQueue(block).add(
        new ReportedBlockInfo(dn, storageID, block, reportedState));
    count++;
  }
@@ -118,16 +118,29 @@ public class JspHelper {
    }
  }
/**
* convenience method for canonicalizing host name.
* @param addr name:port or name
* @return canonicalized host name
*/
public static String canonicalize(String addr) {
// default port 1 is supplied to allow addr without port.
// the port will be ignored.
return NetUtils.createSocketAddr(addr, 1).getAddress()
.getCanonicalHostName();
}
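A brief usage sketch of the helper just added, assuming JspHelper's usual package (org.apache.hadoop.hdfs.server.common). The host names are placeholders, and the printed canonical form depends entirely on the resolver configuration of the machine running it.

// Illustrative usage of JspHelper.canonicalize(...) as added above.
import org.apache.hadoop.hdfs.server.common.JspHelper;

class CanonicalizeExample {
  public static void main(String[] args) {
    // A port, if present, is parsed and then ignored by canonicalize().
    System.out.println(JspHelper.canonicalize("node1.example.com:50010"));
    System.out.println(JspHelper.canonicalize("node1.example.com"));
  }
}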
  /**
   * A helper class that generates the correct URL for different schema.
   *
   */
  public static final class Url {
    public static String authority(String scheme, DatanodeID d) {
      String fqdn = canonicalize(d.getIpAddr());
      if (scheme.equals("http")) {
        return fqdn + ":" + d.getInfoPort();
      } else if (scheme.equals("https")) {
        return fqdn + ":" + d.getInfoSecurePort();
      } else {
        throw new IllegalArgumentException("Unknown scheme:" + scheme);
      }
@@ -237,6 +237,8 @@ public abstract class Storage extends StorageInfo {
    final StorageDirType dirType; // storage dir type
    FileLock lock;                // storage lock

    private String storageUuid = null;      // Storage directory identifier.

    public StorageDirectory(File dir) {
      // default dirType is null
      this(dir, null, true);
@@ -246,6 +248,14 @@
      this(dir, dirType, true);
    }

    public void setStorageUuid(String storageUuid) {
      this.storageUuid = storageUuid;
    }

    public String getStorageUuid() {
      return storageUuid;
    }

    /**
     * Constructor
     * @param dir directory corresponding to the storage
@@ -27,6 +27,7 @@ import java.util.concurrent.CopyOnWriteArrayList;
import org.apache.commons.logging.Log;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
import org.apache.hadoop.hdfs.StorageType;
import org.apache.hadoop.hdfs.protocol.Block;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
@@ -147,7 +148,7 @@ class BPOfferService {
    return false;
  }

  synchronized String getBlockPoolId() {
    if (bpNSInfo != null) {
      return bpNSInfo.getBlockPoolID();
    } else {
@@ -162,29 +163,30 @@ class BPOfferService {
  }

  @Override
  public synchronized String toString() {
    if (bpNSInfo == null) {
      // If we haven't yet connected to our NN, we don't yet know our
      // own block pool ID.
      // If _none_ of the block pools have connected yet, we don't even
      // know the DatanodeID ID of this DN.
      String datanodeUuid = dn.getDatanodeUuid();

      if (datanodeUuid == null || datanodeUuid.isEmpty()) {
        datanodeUuid = "unassigned";
      }
      return "Block pool <registering> (Datanode Uuid " + datanodeUuid + ")";
    } else {
      return "Block pool " + getBlockPoolId() +
          " (Datanode Uuid " + dn.getDatanodeUuid() +
          ")";
    }
  }

  void reportBadBlocks(ExtendedBlock block,
      String storageUuid, StorageType storageType) {
    checkBlock(block);
    for (BPServiceActor actor : bpServices) {
      actor.reportBadBlocks(block, storageUuid, storageType);
    }
  }
@@ -193,7 +195,8 @@ class BPOfferService {
   * till namenode is informed before responding with success to the
   * client? For now we don't.
   */
  void notifyNamenodeReceivedBlock(
      ExtendedBlock block, String delHint, String storageUuid) {
    checkBlock(block);
    checkDelHint(delHint);
    ReceivedDeletedBlockInfo bInfo = new ReceivedDeletedBlockInfo(
@@ -202,7 +205,7 @@
        delHint);

    for (BPServiceActor actor : bpServices) {
      actor.notifyNamenodeBlockImmediately(bInfo, storageUuid);
    }
  }
@@ -219,23 +222,23 @@
        "delHint is null");
  }

  void notifyNamenodeDeletedBlock(ExtendedBlock block, String storageUuid) {
    checkBlock(block);
    ReceivedDeletedBlockInfo bInfo = new ReceivedDeletedBlockInfo(
        block.getLocalBlock(), BlockStatus.DELETED_BLOCK, null);

    for (BPServiceActor actor : bpServices) {
      actor.notifyNamenodeDeletedBlock(bInfo, storageUuid);
    }
  }

  void notifyNamenodeReceivingBlock(ExtendedBlock block, String storageUuid) {
    checkBlock(block);
    ReceivedDeletedBlockInfo bInfo = new ReceivedDeletedBlockInfo(
        block.getLocalBlock(), BlockStatus.RECEIVING_BLOCK, null);

    for (BPServiceActor actor : bpServices) {
      actor.notifyNamenodeBlockImmediately(bInfo, storageUuid);
    }
  }
@@ -274,12 +277,22 @@
  synchronized void verifyAndSetNamespaceInfo(NamespaceInfo nsInfo) throws IOException {
    if (this.bpNSInfo == null) {
      this.bpNSInfo = nsInfo;
      boolean success = false;

      // Now that we know the namespace ID, etc, we can pass this to the DN.
      // The DN can now initialize its local storage if we are the
      // first BP to handshake, etc.
      try {
        dn.initBlockPool(this);
        success = true;
      } finally {
        if (!success) {
          // The datanode failed to initialize the BP. We need to reset
          // the namespace info so that other BPService actors still have
          // a chance to set it, and re-initialize the datanode.
          this.bpNSInfo = null;
        }
      }
    } else {
      checkNSEquality(bpNSInfo.getBlockPoolID(), nsInfo.getBlockPoolID(),
          "Blockpool ID");
@@ -328,7 +341,7 @@
    }
  }

  synchronized DatanodeRegistration createRegistration() throws IOException {
    Preconditions.checkState(bpNSInfo != null,
        "getRegistration() can only be called after initial handshake");
    return dn.createBPRegistration(bpNSInfo);
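The try/finally added to verifyAndSetNamespaceInfo above is a set-then-roll-back-on-failure pattern: assign the shared field optimistically, then undo the assignment if initialization did not complete. A stripped-down sketch of the same idea follows; the Handshake class and its init() call are invented for illustration only.

// Illustrative only: mirror of the rollback-on-failure shape used in
// verifyAndSetNamespaceInfo(...) above.
class Handshake {
  private Object namespaceInfo;

  synchronized void verifyAndSet(Object nsInfo) {
    if (namespaceInfo == null) {
      namespaceInfo = nsInfo;
      boolean success = false;
      try {
        init();          // stand-in for dn.initBlockPool(this); may throw
        success = true;
      } finally {
        if (!success) {
          // Leave the field unset so a later caller can retry the handshake.
          namespaceInfo = null;
        }
      }
    }
  }

  private void init() {
    // No-op in this sketch.
  }

  public static void main(String[] args) {
    new Handshake().verifyAndSet(new Object());
    System.out.println("handshake completed");
  }
}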
@@ -22,7 +22,7 @@ import static org.apache.hadoop.util.Time.now;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.SocketTimeoutException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
@@ -31,6 +31,7 @@ import org.apache.commons.logging.Log;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.StorageType;
import org.apache.hadoop.hdfs.protocol.BlockListAsLongs;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
@@ -52,7 +53,6 @@ import org.apache.hadoop.hdfs.server.protocol.StorageReceivedDeletedBlocks;
import org.apache.hadoop.hdfs.server.protocol.StorageReport;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.ipc.RemoteException;
import org.apache.hadoop.util.Time;
import org.apache.hadoop.util.VersionInfo;
import org.apache.hadoop.util.VersionUtil;
@@ -100,8 +100,8 @@ class BPServiceActor implements Runnable {
   * keyed by block ID, contains the pending changes which have yet to be
   * reported to the NN. Access should be synchronized on this object.
   */
  private final Map<String, PerStoragePendingIncrementalBR>
      pendingIncrementalBRperStorage = Maps.newHashMap();

  private volatile int pendingReceivedRequests = 0;
  private volatile boolean shouldServiceRun = true;
@@ -244,12 +244,15 @@
    resetBlockReportTime = true; // reset future BRs for randomness
  }

  void reportBadBlocks(ExtendedBlock block,
      String storageUuid, StorageType storageType) {
    if (bpRegistration == null) {
      return;
    }
    DatanodeInfo[] dnArr = { new DatanodeInfo(bpRegistration) };
    String[] uuids = { storageUuid };
    StorageType[] types = { storageType };
    LocatedBlock[] blocks = { new LocatedBlock(block, dnArr, uuids, types) };

    try {
      bpNamenode.reportBadBlocks(blocks);
@@ -263,49 +266,102 @@
  }

  /**
   * Report received blocks and delete hints to the Namenode for each
   * storage.
   *
   * @throws IOException
   */
  private void reportReceivedDeletedBlocks() throws IOException {
    // Generate a list of the pending reports for each storage under the lock
    ArrayList<StorageReceivedDeletedBlocks> reports =
        new ArrayList<StorageReceivedDeletedBlocks>(pendingIncrementalBRperStorage.size());
    synchronized (pendingIncrementalBRperStorage) {
      for (Map.Entry<String, PerStoragePendingIncrementalBR> entry :
           pendingIncrementalBRperStorage.entrySet()) {
        final String storageUuid = entry.getKey();
        final PerStoragePendingIncrementalBR perStorageMap = entry.getValue();

        if (perStorageMap.getBlockInfoCount() > 0) {
          // Send newly-received and deleted blockids to namenode
          ReceivedDeletedBlockInfo[] rdbi = perStorageMap.dequeueBlockInfos();
          pendingReceivedRequests =
              (pendingReceivedRequests > rdbi.length ?
                  (pendingReceivedRequests - rdbi.length) : 0);
          reports.add(new StorageReceivedDeletedBlocks(storageUuid, rdbi));
        }
      }
    }

    if (reports.size() == 0) {
      // Nothing new to report.
      return;
    }

    // Send incremental block reports to the Namenode outside the lock
    boolean success = false;
    try {
      bpNamenode.blockReceivedAndDeleted(bpRegistration,
          bpos.getBlockPoolId(),
          reports.toArray(new StorageReceivedDeletedBlocks[reports.size()]));
      success = true;
    } finally {
      if (!success) {
        synchronized (pendingIncrementalBRperStorage) {
          for (StorageReceivedDeletedBlocks report : reports) {
            // If we didn't succeed in sending the report, put all of the
            // blocks back onto our queue, but only in the case where we
            // didn't put something newer in the meantime.
            PerStoragePendingIncrementalBR perStorageMap =
                pendingIncrementalBRperStorage.get(report.getStorageID());
            pendingReceivedRequests +=
                perStorageMap.putMissingBlockInfos(report.getBlocks());
          }
        }
      }
    }
  }
/**
* Retrieve the incremental BR state for a given storage UUID
* @param storageUuid
* @return
*/
private PerStoragePendingIncrementalBR getIncrementalBRMapForStorage(
String storageUuid) {
PerStoragePendingIncrementalBR mapForStorage =
pendingIncrementalBRperStorage.get(storageUuid);
if (mapForStorage == null) {
// This is the first time we are adding incremental BR state for
// this storage so create a new map. This is required once per
// storage, per service actor.
mapForStorage = new PerStoragePendingIncrementalBR();
pendingIncrementalBRperStorage.put(storageUuid, mapForStorage);
    }
return mapForStorage;
}
/**
* Add a blockInfo for notification to NameNode. If another entry
* exists for the same block it is removed.
*
* Caller must synchronize access using pendingIncrementalBRperStorage.
* @param bInfo
* @param storageUuid
*/
void addPendingReplicationBlockInfo(ReceivedDeletedBlockInfo bInfo,
String storageUuid) {
// Make sure another entry for the same block is first removed.
// There may only be one such entry.
for (Map.Entry<String, PerStoragePendingIncrementalBR> entry :
pendingIncrementalBRperStorage.entrySet()) {
if (entry.getValue().removeBlockInfo(bInfo)) {
break;
}
}
getIncrementalBRMapForStorage(storageUuid).putBlockInfo(bInfo);
  }
  /*
@@ -313,19 +369,19 @@
   * till namenode is informed before responding with success to the
   * client? For now we don't.
   */
  void notifyNamenodeBlockImmediately(
      ReceivedDeletedBlockInfo bInfo, String storageUuid) {
    synchronized (pendingIncrementalBRperStorage) {
      addPendingReplicationBlockInfo(bInfo, storageUuid);
      pendingReceivedRequests++;
      pendingIncrementalBRperStorage.notifyAll();
    }
  }

  void notifyNamenodeDeletedBlock(
      ReceivedDeletedBlockInfo bInfo, String storageUuid) {
    synchronized (pendingIncrementalBRperStorage) {
      addPendingReplicationBlockInfo(bInfo, storageUuid);
    }
  }
@@ -334,13 +390,13 @@
   */
  @VisibleForTesting
  void triggerBlockReportForTests() {
    synchronized (pendingIncrementalBRperStorage) {
      lastBlockReport = 0;
      lastHeartbeat = 0;
      pendingIncrementalBRperStorage.notifyAll();
      while (lastBlockReport == 0) {
        try {
          pendingIncrementalBRperStorage.wait(100);
        } catch (InterruptedException e) {
          return;
        }
@@ -350,12 +406,12 @@

  @VisibleForTesting
  void triggerHeartbeatForTests() {
    synchronized (pendingIncrementalBRperStorage) {
      lastHeartbeat = 0;
      pendingIncrementalBRperStorage.notifyAll();
      while (lastHeartbeat == 0) {
        try {
          pendingIncrementalBRperStorage.wait(100);
        } catch (InterruptedException e) {
          return;
        }
@@ -365,13 +421,13 @@

  @VisibleForTesting
  void triggerDeletionReportForTests() {
    synchronized (pendingIncrementalBRperStorage) {
      lastDeletedReport = 0;
      pendingIncrementalBRperStorage.notifyAll();
      while (lastDeletedReport == 0) {
        try {
          pendingIncrementalBRperStorage.wait(100);
        } catch (InterruptedException e) {
          return;
        }
@@ -395,23 +451,38 @@
    // a FINALIZED one.
    reportReceivedDeletedBlocks();

    // Send one block report per known storage.

    // Create block report
    long brCreateStartTime = now();
    long totalBlockCount = 0;

    Map<DatanodeStorage, BlockListAsLongs> perVolumeBlockLists =
        dn.getFSDataset().getBlockReports(bpos.getBlockPoolId());

    // Send block report
    long brSendStartTime = now();
    StorageBlockReport[] reports =
        new StorageBlockReport[perVolumeBlockLists.size()];

    int i = 0;
    for(Map.Entry<DatanodeStorage, BlockListAsLongs> kvPair : perVolumeBlockLists.entrySet()) {
      DatanodeStorage dnStorage = kvPair.getKey();
      BlockListAsLongs blockList = kvPair.getValue();
      totalBlockCount += blockList.getNumberOfBlocks();

      reports[i++] =
          new StorageBlockReport(
              dnStorage, blockList.getBlockListAsLongs());
    }

    cmd = bpNamenode.blockReport(bpRegistration, bpos.getBlockPoolId(), reports);

    // Log the block report processing stats from Datanode perspective
    long brSendCost = now() - brSendStartTime;
    long brCreateCost = brSendStartTime - brCreateStartTime;
    dn.getMetrics().addBlockReport(brSendCost);
    LOG.info("BlockReport of " + totalBlockCount
        + " blocks took " + brCreateCost + " msec to generate and "
        + brSendCost + " msecs for RPC and NN processing");
@@ -466,17 +537,15 @@
  }

  HeartbeatResponse sendHeartBeat() throws IOException {
    StorageReport[] reports =
        dn.getFSDataset().getStorageReports(bpos.getBlockPoolId());
    if (LOG.isDebugEnabled()) {
      LOG.debug("Sending heartbeat with " + reports.length +
                " storage reports from service actor: " + this);
    }

    return bpNamenode.sendHeartbeat(bpRegistration,
        reports,
        dn.getFSDataset().getCacheCapacity(),
        dn.getFSDataset().getCacheUsed(),
        dn.getXmitsInProgress(),
@@ -496,9 +565,9 @@
  }

  private String formatThreadName() {
    Collection<StorageLocation> dataDirs =
        DataNode.getStorageLocations(dn.getConf());
    return "DataNode: [" + dataDirs.toString() + "] " +
        " heartbeating to " + nnAddr;
  }
@@ -608,10 +677,10 @@
          //
          long waitTime = dnConf.heartBeatInterval -
              (Time.now() - lastHeartbeat);
          synchronized(pendingIncrementalBRperStorage) {
            if (waitTime > 0 && pendingReceivedRequests == 0) {
              try {
                pendingIncrementalBRperStorage.wait(waitTime);
              } catch (InterruptedException ie) {
                LOG.warn("BPOfferService for " + this + " interrupted");
              }
@@ -782,4 +851,68 @@
    }
  }
private static class PerStoragePendingIncrementalBR {
private Map<Long, ReceivedDeletedBlockInfo> pendingIncrementalBR =
Maps.newHashMap();
/**
* Return the number of blocks on this storage that have pending
* incremental block reports.
* @return
*/
int getBlockInfoCount() {
return pendingIncrementalBR.size();
}
/**
* Dequeue and return all pending incremental block report state.
* @return
*/
ReceivedDeletedBlockInfo[] dequeueBlockInfos() {
ReceivedDeletedBlockInfo[] blockInfos =
pendingIncrementalBR.values().toArray(
new ReceivedDeletedBlockInfo[getBlockInfoCount()]);
pendingIncrementalBR.clear();
return blockInfos;
}
/**
* Add blocks from blockArray to pendingIncrementalBR, unless the
* block already exists in pendingIncrementalBR.
* @param blockArray list of blocks to add.
* @return the number of missing blocks that we added.
*/
int putMissingBlockInfos(ReceivedDeletedBlockInfo[] blockArray) {
int blocksPut = 0;
for (ReceivedDeletedBlockInfo rdbi : blockArray) {
if (!pendingIncrementalBR.containsKey(rdbi.getBlock().getBlockId())) {
pendingIncrementalBR.put(rdbi.getBlock().getBlockId(), rdbi);
++blocksPut;
}
}
return blocksPut;
}
/**
* Add pending incremental block report for a single block.
* @param blockID
* @param blockInfo
*/
void putBlockInfo(ReceivedDeletedBlockInfo blockInfo) {
pendingIncrementalBR.put(blockInfo.getBlock().getBlockId(), blockInfo);
}
/**
* Remove pending incremental block report for a single block if it
* exists.
*
* @param blockInfo
* @return true if a report was removed, false if no report existed for
* the given block.
*/
boolean removeBlockInfo(ReceivedDeletedBlockInfo blockInfo) {
return (pendingIncrementalBR.remove(blockInfo.getBlock().getBlockId()) != null);
}
}
}
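PerStoragePendingIncrementalBR above keeps, per storage, at most one pending notification per block and drains them atomically when a report is built. The compact sketch below shows the same bookkeeping in isolation; PendingQueue and the Long/String stand-ins are invented for the example.

import java.util.*;

// Illustrative only: one pending entry per block ID, drained atomically,
// mirroring PerStoragePendingIncrementalBR above.
class PendingQueue {
  private final Map<Long, String> pending = new HashMap<Long, String>();

  int count() {
    return pending.size();
  }

  // Keep only the newest notification for a block.
  void put(long blockId, String info) {
    pending.put(blockId, info);
  }

  // Drain everything currently queued.
  List<String> dequeueAll() {
    List<String> out = new ArrayList<String>(pending.values());
    pending.clear();
    return out;
  }

  // Re-queue entries that failed to send, unless something newer arrived.
  int putMissing(Map<Long, String> failed) {
    int added = 0;
    for (Map.Entry<Long, String> e : failed.entrySet()) {
      if (!pending.containsKey(e.getKey())) {
        pending.put(e.getKey(), e.getValue());
        added++;
      }
    }
    return added;
  }

  public static void main(String[] args) {
    PendingQueue q = new PendingQueue();
    q.put(1L, "RECEIVING");
    q.put(1L, "RECEIVED");              // supersedes the first entry
    System.out.println(q.dequeueAll()); // [RECEIVED]
  }
}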
@@ -21,10 +21,13 @@ import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;

import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.util.DataChecksum;
@@ -67,6 +70,28 @@ public class BlockMetadataHeader {
    return checksum;
  }
/**
* Read the header without changing the position of the FileChannel.
*
* @param fc The FileChannel to read.
* @return the Metadata Header.
* @throws IOException on error.
*/
public static BlockMetadataHeader preadHeader(FileChannel fc)
throws IOException {
byte arr[] = new byte[2 + DataChecksum.HEADER_LEN];
ByteBuffer buf = ByteBuffer.wrap(arr);
while (buf.hasRemaining()) {
if (fc.read(buf, 0) <= 0) {
throw new EOFException("unexpected EOF while reading " +
"metadata file header");
}
}
short version = (short)((arr[0] << 8) | (arr[1] & 0xff));
DataChecksum dataChecksum = DataChecksum.newDataChecksum(arr, 2);
return new BlockMetadataHeader(version, dataChecksum);
}
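A short usage sketch of the new helper, assuming BlockMetadataHeader's usual package (org.apache.hadoop.hdfs.server.datanode): because preadHeader uses positional reads, the channel's position is left where it was. The file path below is a placeholder, not a real block metadata file.

// Illustrative usage of BlockMetadataHeader.preadHeader(...) as added above.
import java.io.RandomAccessFile;
import java.nio.channels.FileChannel;
import org.apache.hadoop.hdfs.server.datanode.BlockMetadataHeader;

class PreadHeaderExample {
  public static void main(String[] args) throws Exception {
    RandomAccessFile raf = new RandomAccessFile("/tmp/blk_123.meta", "r");
    try {
      FileChannel fc = raf.getChannel();
      BlockMetadataHeader header = BlockMetadataHeader.preadHeader(fc);
      // Positional reads were used, so fc.position() is still 0 here.
      System.out.println("header=" + header + " position=" + fc.position());
    } finally {
      raf.close();
    }
  }
}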
  /**
   * This reads all the fields till the beginning of checksum.
@@ -187,7 +187,7 @@ class BlockPoolSliceScanner {
        + hours + " hours for block pool " + bpid);

    // get the list of blocks and arrange them in random order
    List<FinalizedReplica> arr = dataset.getFinalizedBlocks(blockPoolId);
    Collections.shuffle(arr);

    long scanTime = -1;
@@ -162,7 +162,8 @@ class BlockReceiver implements Closeable {
      switch (stage) {
      case PIPELINE_SETUP_CREATE:
        replicaInfo = datanode.data.createRbw(block);
        datanode.notifyNamenodeReceivingBlock(
            block, replicaInfo.getStorageUuid());
        break;
      case PIPELINE_SETUP_STREAMING_RECOVERY:
        replicaInfo = datanode.data.recoverRbw(
@@ -176,7 +177,8 @@
              block.getLocalBlock());
        }
        block.setGenerationStamp(newGs);
        datanode.notifyNamenodeReceivingBlock(
            block, replicaInfo.getStorageUuid());
        break;
      case PIPELINE_SETUP_APPEND_RECOVERY:
        replicaInfo = datanode.data.recoverAppend(block, newGs, minBytesRcvd);
@@ -185,7 +187,8 @@
              block.getLocalBlock());
        }
        block.setGenerationStamp(newGs);
        datanode.notifyNamenodeReceivingBlock(
            block, replicaInfo.getStorageUuid());
        break;
      case TRANSFER_RBW:
      case TRANSFER_FINALIZED:
@@ -252,6 +255,10 @@
  /** Return the datanode object. */
  DataNode getDataNode() {return datanode;}

  String getStorageUuid() {
    return replicaInfo.getStorageUuid();
  }

  /**
   * close files.
   */
@@ -1073,14 +1080,15 @@
              : 0;
          block.setNumBytes(replicaInfo.getNumBytes());
          datanode.data.finalizeBlock(block);
          datanode.closeBlock(
              block, DataNode.EMPTY_DEL_HINT, replicaInfo.getStorageUuid());
          if (ClientTraceLog.isInfoEnabled() && isClient) {
            long offset = 0;
            DatanodeRegistration dnR = datanode.getDNRegistrationForBP(block
                .getBlockPoolId());
            ClientTraceLog.info(String.format(DN_CLIENTTRACE_FORMAT, inAddr,
                myAddr, block.getNumBytes(), "HDFS_WRITE", clientname, offset,
                dnR.getDatanodeUuid(), block, endTime - startTime));
          } else {
            LOG.info("Received " + block + " size " + block.getNumBytes()
                + " from " + inAddr);


@@ -21,8 +21,8 @@ package org.apache.hadoop.hdfs.server.datanode;
  * The caching strategy we should use for an HDFS read or write operation.
  */
 public class CachingStrategy {
-  private Boolean dropBehind; // null = use server defaults
-  private Long readahead; // null = use server defaults
+  private final Boolean dropBehind; // null = use server defaults
+  private final Long readahead; // null = use server defaults
   public static CachingStrategy newDefaultStrategy() {
     return new CachingStrategy(null, null);
@@ -32,8 +32,28 @@
     return new CachingStrategy(true, null);
   }
-  public CachingStrategy duplicate() {
-    return new CachingStrategy(this.dropBehind, this.readahead);
+  public static class Builder {
+    private Boolean dropBehind;
+    private Long readahead;
+    public Builder(CachingStrategy prev) {
+      this.dropBehind = prev.dropBehind;
+      this.readahead = prev.readahead;
+    }
+    public Builder setDropBehind(Boolean dropBehind) {
+      this.dropBehind = dropBehind;
+      return this;
+    }
+    public Builder setReadahead(Long readahead) {
+      this.readahead = readahead;
+      return this;
+    }
+    public CachingStrategy build() {
+      return new CachingStrategy(dropBehind, readahead);
+    }
   }
   public CachingStrategy(Boolean dropBehind, Long readahead) {
@@ -45,18 +65,10 @@
     return dropBehind;
   }
-  public void setDropBehind(Boolean dropBehind) {
-    this.dropBehind = dropBehind;
-  }
   public Long getReadahead() {
     return readahead;
   }
-  public void setReadahead(Long readahead) {
-    this.readahead = readahead;
-  }
   public String toString() {
     return "CachingStrategy(dropBehind=" + dropBehind +
         ", readahead=" + readahead + ")";


@@ -17,10 +17,40 @@
  */
 package org.apache.hadoop.hdfs.server.datanode;
-import com.google.common.annotations.VisibleForTesting;
-import com.google.common.base.Joiner;
-import com.google.common.base.Preconditions;
-import com.google.protobuf.BlockingService;
+import static org.apache.hadoop.hdfs.DFSConfigKeys.*;
+import static org.apache.hadoop.util.ExitUtil.terminate;
+import java.io.BufferedOutputStream;
+import java.io.ByteArrayInputStream;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.PrintStream;
+import java.net.InetSocketAddress;
+import java.net.Socket;
+import java.net.SocketException;
+import java.net.SocketTimeoutException;
+import java.net.URI;
+import java.net.UnknownHostException;
+import java.nio.channels.ClosedByInterruptException;
+import java.nio.channels.SocketChannel;
+import java.security.PrivilegedExceptionAction;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.EnumSet;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.UUID;
+import java.util.concurrent.atomic.AtomicInteger;
+import javax.management.ObjectName;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -38,21 +68,42 @@ import org.apache.hadoop.hdfs.HDFSPolicyProvider;
 import org.apache.hadoop.hdfs.HdfsConfiguration;
 import org.apache.hadoop.hdfs.net.DomainPeerServer;
 import org.apache.hadoop.hdfs.net.TcpPeerServer;
-import org.apache.hadoop.hdfs.protocol.*;
-import org.apache.hadoop.hdfs.protocol.datatransfer.*;
+import org.apache.hadoop.hdfs.protocol.Block;
+import org.apache.hadoop.hdfs.protocol.BlockLocalPathInfo;
+import org.apache.hadoop.hdfs.protocol.ClientDatanodeProtocol;
+import org.apache.hadoop.hdfs.protocol.DatanodeID;
+import org.apache.hadoop.hdfs.protocol.DatanodeInfo;
+import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
+import org.apache.hadoop.hdfs.protocol.HdfsBlocksMetadata;
+import org.apache.hadoop.hdfs.protocol.HdfsConstants;
+import org.apache.hadoop.hdfs.protocol.RecoveryInProgressException;
+import org.apache.hadoop.hdfs.protocol.datatransfer.BlockConstructionStage;
+import org.apache.hadoop.hdfs.protocol.datatransfer.DataTransferEncryptor;
+import org.apache.hadoop.hdfs.protocol.datatransfer.DataTransferProtocol;
+import org.apache.hadoop.hdfs.protocol.datatransfer.IOStreamPair;
+import org.apache.hadoop.hdfs.protocol.datatransfer.Sender;
 import org.apache.hadoop.hdfs.protocol.proto.ClientDatanodeProtocolProtos.ClientDatanodeProtocolService;
 import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.DNTransferAckProto;
 import org.apache.hadoop.hdfs.protocol.proto.DataTransferProtos.Status;
 import org.apache.hadoop.hdfs.protocol.proto.InterDatanodeProtocolProtos.InterDatanodeProtocolService;
-import org.apache.hadoop.hdfs.protocolPB.*;
-import org.apache.hadoop.hdfs.security.token.block.*;
+import org.apache.hadoop.hdfs.protocolPB.ClientDatanodeProtocolPB;
+import org.apache.hadoop.hdfs.protocolPB.ClientDatanodeProtocolServerSideTranslatorPB;
+import org.apache.hadoop.hdfs.protocolPB.DatanodeProtocolClientSideTranslatorPB;
+import org.apache.hadoop.hdfs.protocolPB.InterDatanodeProtocolPB;
+import org.apache.hadoop.hdfs.protocolPB.InterDatanodeProtocolServerSideTranslatorPB;
+import org.apache.hadoop.hdfs.protocolPB.InterDatanodeProtocolTranslatorPB;
+import org.apache.hadoop.hdfs.protocolPB.PBHelper;
+import org.apache.hadoop.hdfs.security.token.block.BlockPoolTokenSecretManager;
+import org.apache.hadoop.hdfs.security.token.block.BlockTokenIdentifier;
+import org.apache.hadoop.hdfs.security.token.block.BlockTokenSecretManager;
 import org.apache.hadoop.hdfs.security.token.block.BlockTokenSecretManager.AccessMode;
+import org.apache.hadoop.hdfs.security.token.block.ExportedBlockKeys;
+import org.apache.hadoop.hdfs.security.token.block.InvalidBlockTokenException;
 import org.apache.hadoop.hdfs.server.common.HdfsServerConstants;
 import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.ReplicaState;
 import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption;
 import org.apache.hadoop.hdfs.server.common.JspHelper;
 import org.apache.hadoop.hdfs.server.common.StorageInfo;
-import org.apache.hadoop.hdfs.server.common.Util;
 import org.apache.hadoop.hdfs.server.datanode.SecureDataNodeStarter.SecureResources;
 import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsDatasetSpi;
 import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsVolumeSpi;
@@ -61,7 +112,11 @@ import org.apache.hadoop.hdfs.server.datanode.web.resources.DatanodeWebHdfsMetho
 import org.apache.hadoop.hdfs.server.namenode.FileChecksumServlets;
 import org.apache.hadoop.hdfs.server.namenode.StreamFile;
 import org.apache.hadoop.hdfs.server.protocol.BlockRecoveryCommand.RecoveringBlock;
-import org.apache.hadoop.hdfs.server.protocol.*;
+import org.apache.hadoop.hdfs.server.protocol.DatanodeProtocol;
+import org.apache.hadoop.hdfs.server.protocol.DatanodeRegistration;
+import org.apache.hadoop.hdfs.server.protocol.InterDatanodeProtocol;
+import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
+import org.apache.hadoop.hdfs.server.protocol.ReplicaRecoveryInfo;
 import org.apache.hadoop.hdfs.web.WebHdfsFileSystem;
 import org.apache.hadoop.hdfs.web.resources.Param;
 import org.apache.hadoop.http.HttpConfig;
@@ -84,23 +139,21 @@ import org.apache.hadoop.security.UserGroupInformation.AuthenticationMethod;
 import org.apache.hadoop.security.authorize.AccessControlList;
 import org.apache.hadoop.security.token.Token;
 import org.apache.hadoop.security.token.TokenIdentifier;
-import org.apache.hadoop.util.*;
+import org.apache.hadoop.util.Daemon;
+import org.apache.hadoop.util.DiskChecker;
 import org.apache.hadoop.util.DiskChecker.DiskErrorException;
 import org.apache.hadoop.util.DiskChecker.DiskOutOfSpaceException;
+import org.apache.hadoop.util.GenericOptionsParser;
+import org.apache.hadoop.util.JvmPauseMonitor;
+import org.apache.hadoop.util.ServicePlugin;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.VersionInfo;
 import org.mortbay.util.ajax.JSON;
-import java.io.*;
-import java.net.*;
-import java.nio.channels.ClosedByInterruptException;
-import java.nio.channels.SocketChannel;
-import java.security.PrivilegedExceptionAction;
-import java.util.*;
-import java.util.concurrent.atomic.AtomicInteger;
-import javax.management.ObjectName;
-import static org.apache.hadoop.hdfs.DFSConfigKeys.*;
-import static org.apache.hadoop.util.ExitUtil.terminate;
+import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Joiner;
+import com.google.common.base.Preconditions;
+import com.google.protobuf.BlockingService;
 /**********************************************************
  * DataNode is a class (and program) that stores a set of
@@ -209,7 +262,7 @@ public class DataNode extends Configured
   private JvmPauseMonitor pauseMonitor;
   private SecureResources secureResources = null;
-  private AbstractList<File> dataDirs;
+  private List<StorageLocation> dataDirs;
   private Configuration conf;
   private final long maxNumberOfBlocksToLog;
@@ -219,21 +272,12 @@
   private final boolean getHdfsBlockLocationsEnabled;
   private ObjectName dataNodeInfoBeanName;
-  /**
-   * Create the DataNode given a configuration and an array of dataDirs.
-   * 'dataDirs' is where the blocks are stored.
-   */
-  DataNode(final Configuration conf,
-      final AbstractList<File> dataDirs) throws IOException {
-    this(conf, dataDirs, null);
-  }
   /**
    * Create the DataNode given a configuration, an array of dataDirs,
    * and a namenode proxy
    */
   DataNode(final Configuration conf,
-      final AbstractList<File> dataDirs,
+      final List<StorageLocation> dataDirs,
       final SecureResources resources) throws IOException {
     super(conf);
     this.maxNumberOfBlocksToLog = conf.getLong(DFS_MAX_NUM_BLOCKS_TO_LOG_KEY,
@@ -566,10 +610,11 @@
   }
   // calls specific to BP
-  protected void notifyNamenodeReceivedBlock(ExtendedBlock block, String delHint) {
+  protected void notifyNamenodeReceivedBlock(
+      ExtendedBlock block, String delHint, String storageUuid) {
     BPOfferService bpos = blockPoolManager.get(block.getBlockPoolId());
     if(bpos != null) {
-      bpos.notifyNamenodeReceivedBlock(block, delHint);
+      bpos.notifyNamenodeReceivedBlock(block, delHint, storageUuid);
     } else {
       LOG.error("Cannot find BPOfferService for reporting block received for bpid="
           + block.getBlockPoolId());
@@ -577,10 +622,11 @@
   }
   // calls specific to BP
-  protected void notifyNamenodeReceivingBlock(ExtendedBlock block) {
+  protected void notifyNamenodeReceivingBlock(
+      ExtendedBlock block, String storageUuid) {
     BPOfferService bpos = blockPoolManager.get(block.getBlockPoolId());
     if(bpos != null) {
-      bpos.notifyNamenodeReceivingBlock(block);
+      bpos.notifyNamenodeReceivingBlock(block, storageUuid);
     } else {
       LOG.error("Cannot find BPOfferService for reporting block receiving for bpid="
           + block.getBlockPoolId());
@@ -588,10 +634,10 @@
   }
   /** Notify the corresponding namenode to delete the block. */
-  public void notifyNamenodeDeletedBlock(ExtendedBlock block) {
+  public void notifyNamenodeDeletedBlock(ExtendedBlock block, String storageUuid) {
     BPOfferService bpos = blockPoolManager.get(block.getBlockPoolId());
     if (bpos != null) {
-      bpos.notifyNamenodeDeletedBlock(block);
+      bpos.notifyNamenodeDeletedBlock(block, storageUuid);
     } else {
       LOG.error("Cannot find BPOfferService for reporting block deleted for bpid="
           + block.getBlockPoolId());
@@ -603,7 +649,9 @@
    */
   public void reportBadBlocks(ExtendedBlock block) throws IOException{
     BPOfferService bpos = getBPOSForBlock(block);
-    bpos.reportBadBlocks(block);
+    FsVolumeSpi volume = getFSDataset().getVolume(block);
+    bpos.reportBadBlocks(
+        block, volume.getStorageID(), volume.getStorageType());
   }
   /**
@@ -675,7 +723,7 @@
    * @throws IOException
    */
   void startDataNode(Configuration conf,
-                     AbstractList<File> dataDirs,
+                     List<StorageLocation> dataDirs,
                     // DatanodeProtocol namenode,
                      SecureResources resources
                      ) throws IOException {
@@ -736,19 +784,40 @@
     readaheadPool = ReadaheadPool.getInstance();
   }
+  public static String generateUuid() {
+    return UUID.randomUUID().toString();
+  }
+  /**
+   * Verify that the DatanodeUuid has been initialized. If this is a new
+   * datanode then we generate a new Datanode Uuid and persist it to disk.
+   *
+   * @throws IOException
+   */
+  private synchronized void checkDatanodeUuid() throws IOException {
+    if (storage.getDatanodeUuid() == null) {
+      storage.setDatanodeUuid(generateUuid());
+      storage.writeAll();
+      LOG.info("Generated and persisted new Datanode UUID " +
+          storage.getDatanodeUuid());
+    }
+  }
   /**
    * Create a DatanodeRegistration for a specific block pool.
    * @param nsInfo the namespace info from the first part of the NN handshake
    */
-  DatanodeRegistration createBPRegistration(NamespaceInfo nsInfo) {
+  DatanodeRegistration createBPRegistration(NamespaceInfo nsInfo)
+      throws IOException {
     StorageInfo storageInfo = storage.getBPStorage(nsInfo.getBlockPoolID());
     if (storageInfo == null) {
       // it's null in the case of SimulatedDataSet
       storageInfo = new StorageInfo(nsInfo);
     }
     DatanodeID dnId = new DatanodeID(
         streamingAddr.getAddress().getHostAddress(), hostName,
-        getStorageId(), getXferPort(), getInfoPort(),
+        storage.getDatanodeUuid(), getXferPort(), getInfoPort(),
         infoSecurePort, getIpcPort());
     return new DatanodeRegistration(dnId, storageInfo,
         new ExportedBlockKeys(), VersionInfo.getVersion());
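The new generateUuid()/checkDatanodeUuid() pair above follows an assign-once-then-persist pattern. A standalone sketch of that idea using only the JDK; the class and method names here are illustrative, not part of the patch.

import java.util.UUID;

public class AssignOnceId {
  private String datanodeUuid;          // null until first assignment

  // Mirrors checkDatanodeUuid(): generate only when missing, then persist.
  synchronized String getOrCreateUuid() {
    if (datanodeUuid == null) {
      datanodeUuid = UUID.randomUUID().toString();
      persist(datanodeUuid);            // stand-in for storage.writeAll()
    }
    return datanodeUuid;
  }

  private void persist(String uuid) {
    // Illustrative no-op; the real code rewrites the VERSION file.
  }
}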
@@ -767,16 +836,10 @@
       id = bpRegistration;
     }
-    if (storage.getStorageID().equals("")) {
-      // This is a fresh datanode, persist the NN-provided storage ID
-      storage.setStorageID(bpRegistration.getStorageID());
-      storage.writeAll();
-      LOG.info("New storage id " + bpRegistration.getStorageID()
-          + " is assigned to data-node " + bpRegistration);
-    } else if(!storage.getStorageID().equals(bpRegistration.getStorageID())) {
-      throw new IOException("Inconsistent storage IDs. Name-node returned "
-          + bpRegistration.getStorageID()
-          + ". Expecting " + storage.getStorageID());
+    if(!storage.getDatanodeUuid().equals(bpRegistration.getDatanodeUuid())) {
+      throw new IOException("Inconsistent Datanode IDs. Name-node returned "
+          + bpRegistration.getDatanodeUuid()
+          + ". Expecting " + storage.getDatanodeUuid());
     }
     registerBlockPoolWithSecretManager(bpRegistration, blockPoolId);
@@ -897,9 +960,12 @@
       final StorageInfo bpStorage = storage.getBPStorage(bpid);
       LOG.info("Setting up storage: nsid=" + bpStorage.getNamespaceID()
           + ";bpid=" + bpid + ";lv=" + storage.getLayoutVersion()
-          + ";nsInfo=" + nsInfo);
+          + ";nsInfo=" + nsInfo + ";dnuuid=" + storage.getDatanodeUuid());
     }
+    // If this is a newly formatted DataNode then assign a new DatanodeUuid.
+    checkDatanodeUuid();
     synchronized(this) {
       if (data == null) {
         data = factory.newInstance(this, storage, conf);
@@ -924,10 +990,6 @@
     return streamingAddr.getPort();
   }
-  String getStorageId() {
-    return storage.getStorageID();
-  }
   /**
    * @return name useful for logging
    */
@@ -1013,34 +1075,6 @@
     return metrics;
   }
-  public static void setNewStorageID(DatanodeID dnId) {
-    LOG.info("Datanode is " + dnId);
-    dnId.setStorageID(createNewStorageId(dnId.getXferPort()));
-  }
-  /**
-   * @return a unique storage ID of form "DS-randInt-ipaddr-port-timestamp"
-   */
-  static String createNewStorageId(int port) {
-    // It is unlikely that we will create a non-unique storage ID
-    // for the following reasons:
-    // a) SecureRandom is a cryptographically strong random number generator
-    // b) IP addresses will likely differ on different hosts
-    // c) DataNode xfer ports will differ on the same host
-    // d) StorageIDs will likely be generated at different times (in ms)
-    // A conflict requires that all four conditions are violated.
-    // NB: The format of this string can be changed in the future without
-    // requiring that old SotrageIDs be updated.
-    String ip = "unknownIP";
-    try {
-      ip = DNS.getDefaultIP("default");
-    } catch (UnknownHostException ignored) {
-      LOG.warn("Could not find an IP address for the \"default\" inteface.");
-    }
-    int rand = DFSUtil.getSecureRandom().nextInt(Integer.MAX_VALUE);
-    return "DS-" + rand + "-" + ip + "-" + port + "-" + Time.now();
-  }
   /** Ensure the authentication method is kerberos */
   private void checkKerberosAuthMethod(String msg) throws IOException {
     // User invoking the call must be same as the datanode user
@@ -1370,8 +1404,10 @@
         // Check if NN recorded length matches on-disk length
         long onDiskLength = data.getLength(block);
         if (block.getNumBytes() > onDiskLength) {
+          FsVolumeSpi volume = getFSDataset().getVolume(block);
           // Shorter on-disk len indicates corruption so report NN the corrupt block
-          bpos.reportBadBlocks(block);
+          bpos.reportBadBlocks(
+              block, volume.getStorageID(), volume.getStorageType());
           LOG.warn("Can't replicate block " + block
               + " because on-disk length " + onDiskLength
               + " is shorter than NameNode recorded length " + block.getNumBytes());
@@ -1635,11 +1671,11 @@
    * @param block
    * @param delHint
    */
-  void closeBlock(ExtendedBlock block, String delHint) {
+  void closeBlock(ExtendedBlock block, String delHint, String storageUuid) {
     metrics.incrBlocksWritten();
     BPOfferService bpos = blockPoolManager.get(block.getBlockPoolId());
     if(bpos != null) {
-      bpos.notifyNamenodeReceivedBlock(block, delHint);
+      bpos.notifyNamenodeReceivedBlock(block, delHint, storageUuid);
     } else {
       LOG.warn("Cannot find BPOfferService for reporting block received for bpid="
           + block.getBlockPoolId());
@@ -1703,17 +1739,32 @@
       printUsage(System.err);
       return null;
     }
-    Collection<URI> dataDirs = getStorageDirs(conf);
+    Collection<StorageLocation> dataLocations = getStorageLocations(conf);
     UserGroupInformation.setConfiguration(conf);
     SecurityUtil.login(conf, DFS_DATANODE_KEYTAB_FILE_KEY,
         DFS_DATANODE_USER_NAME_KEY);
-    return makeInstance(dataDirs, conf, resources);
+    return makeInstance(dataLocations, conf, resources);
   }
-  static Collection<URI> getStorageDirs(Configuration conf) {
-    Collection<String> dirNames =
+  public static List<StorageLocation> getStorageLocations(Configuration conf) {
+    Collection<String> rawLocations =
         conf.getTrimmedStringCollection(DFS_DATANODE_DATA_DIR_KEY);
-    return Util.stringCollectionAsURIs(dirNames);
+    List<StorageLocation> locations =
+        new ArrayList<StorageLocation>(rawLocations.size());
+    for(String locationString : rawLocations) {
+      final StorageLocation location;
+      try {
+        location = StorageLocation.parse(locationString);
+      } catch (IOException ioe) {
+        throw new IllegalArgumentException("Failed to parse conf property "
+            + DFS_DATANODE_DATA_DIR_KEY + ": " + locationString, ioe);
+      }
+      locations.add(location);
+    }
+    return locations;
   }
   /** Instantiate & Start a single datanode daemon and wait for it to finish.
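A small sketch of what the new getStorageLocations() above accepts; the directory names and the second config value are invented for illustration. The key is dfs.datanode.data.dir (DFS_DATANODE_DATA_DIR_KEY), and each entry may carry an optional [STORAGETYPE] prefix that falls back to DISK when absent.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.server.datanode.DataNode;
import org.apache.hadoop.hdfs.server.datanode.StorageLocation;

public class StorageLocationsFromConf {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // Hypothetical value: one typed entry, one plain path.
    conf.set("dfs.datanode.data.dir", "[DISK]/data/dn1,/data/dn2");
    for (StorageLocation loc : DataNode.getStorageLocations(conf)) {
      System.out.println(loc);   // e.g. [DISK]file:/data/dn1
    }
  }
}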
@@ -1779,57 +1830,52 @@
    * no directory from this directory list can be created.
    * @throws IOException
    */
-  static DataNode makeInstance(Collection<URI> dataDirs, Configuration conf,
-      SecureResources resources) throws IOException {
+  static DataNode makeInstance(Collection<StorageLocation> dataDirs,
+      Configuration conf, SecureResources resources) throws IOException {
     LocalFileSystem localFS = FileSystem.getLocal(conf);
     FsPermission permission = new FsPermission(
         conf.get(DFS_DATANODE_DATA_DIR_PERMISSION_KEY,
                  DFS_DATANODE_DATA_DIR_PERMISSION_DEFAULT));
     DataNodeDiskChecker dataNodeDiskChecker =
         new DataNodeDiskChecker(permission);
-    ArrayList<File> dirs =
-        getDataDirsFromURIs(dataDirs, localFS, dataNodeDiskChecker);
+    List<StorageLocation> locations =
+        checkStorageLocations(dataDirs, localFS, dataNodeDiskChecker);
     DefaultMetricsSystem.initialize("DataNode");
-    assert dirs.size() > 0 : "number of data directories should be > 0";
-    return new DataNode(conf, dirs, resources);
+    assert locations.size() > 0 : "number of data directories should be > 0";
+    return new DataNode(conf, locations, resources);
   }
-  // DataNode ctor expects AbstractList instead of List or Collection...
-  static ArrayList<File> getDataDirsFromURIs(Collection<URI> dataDirs,
+  static List<StorageLocation> checkStorageLocations(
+      Collection<StorageLocation> dataDirs,
       LocalFileSystem localFS, DataNodeDiskChecker dataNodeDiskChecker)
           throws IOException {
-    ArrayList<File> dirs = new ArrayList<File>();
+    ArrayList<StorageLocation> locations = new ArrayList<StorageLocation>();
     StringBuilder invalidDirs = new StringBuilder();
-    for (URI dirURI : dataDirs) {
-      if (!"file".equalsIgnoreCase(dirURI.getScheme())) {
-        LOG.warn("Unsupported URI schema in " + dirURI + ". Ignoring ...");
-        invalidDirs.append("\"").append(dirURI).append("\" ");
-        continue;
-      }
-      // drop any (illegal) authority in the URI for backwards compatibility
-      File dir = new File(dirURI.getPath());
+    for (StorageLocation location : dataDirs) {
+      final URI uri = location.getUri();
       try {
-        dataNodeDiskChecker.checkDir(localFS, new Path(dir.toURI()));
-        dirs.add(dir);
+        dataNodeDiskChecker.checkDir(localFS, new Path(uri));
+        locations.add(location);
       } catch (IOException ioe) {
         LOG.warn("Invalid " + DFS_DATANODE_DATA_DIR_KEY + " "
-            + dir + " : ", ioe);
-        invalidDirs.append("\"").append(dirURI.getPath()).append("\" ");
+            + location.getFile() + " : ", ioe);
+        invalidDirs.append("\"").append(uri.getPath()).append("\" ");
       }
     }
-    if (dirs.size() == 0) {
+    if (locations.size() == 0) {
       throw new IOException("All directories in "
           + DFS_DATANODE_DATA_DIR_KEY + " are invalid: "
          + invalidDirs);
     }
-    return dirs;
+    return locations;
   }
   @Override
   public String toString() {
     return "DataNode{data=" + data + ", localName='" + getDisplayName()
-        + "', storageID='" + getStorageId() + "', xmitsInProgress="
+        + "', datanodeUuid='" + storage.getDatanodeUuid() + "', xmitsInProgress="
        + xmitsInProgress.get() + "}";
   }
@@ -1883,7 +1929,6 @@
   }
   /**
-   * This method is used for testing.
    * Examples are adding and deleting blocks directly.
    * The most common usage will be when the data node's storage is simulated.
    *
@@ -1983,7 +2028,7 @@
     ExtendedBlock newBlock = new ExtendedBlock(oldBlock);
     newBlock.setGenerationStamp(recoveryId);
     newBlock.setNumBytes(newLength);
-    notifyNamenodeReceivedBlock(newBlock, "");
+    notifyNamenodeReceivedBlock(newBlock, "", storageID);
     return storageID;
   }
@@ -2443,6 +2488,10 @@
     return dnConf;
   }
+  public String getDatanodeUuid() {
+    return id == null ? null : id.getDatanodeUuid();
+  }
   boolean shouldRun() {
     return shouldRun;
   }


@@ -24,13 +24,7 @@ import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.RandomAccessFile;
 import java.nio.channels.FileLock;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.Map;
-import java.util.Properties;
+import java.util.*;
 import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.conf.Configuration;
@@ -50,6 +44,7 @@ import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption;
 import org.apache.hadoop.hdfs.server.common.InconsistentFSStateException;
 import org.apache.hadoop.hdfs.server.common.Storage;
 import org.apache.hadoop.hdfs.server.common.StorageInfo;
+import org.apache.hadoop.hdfs.server.protocol.DatanodeStorage;
 import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
 import org.apache.hadoop.io.IOUtils;
 import org.apache.hadoop.util.Daemon;
@@ -71,8 +66,13 @@ public class DataStorage extends Storage {
   public final static String STORAGE_DIR_FINALIZED = "finalized";
   public final static String STORAGE_DIR_TMP = "tmp";
-  /** Unique storage ID. {@see DataNode#createNewStorageId(int)} for details */
-  private String storageID;
+  /**
+   * Datanode UUID that this storage is currently attached to. This
+   * is the same as the legacy StorageID for datanodes that were
+   * upgraded from a pre-UUID version. For compatibility with prior
+   * versions of Datanodes we cannot make this field a UUID.
+   */
+  private String datanodeUuid = null;
   // Flag to ensure we only initialize storage once
   private boolean initialized = false;
@@ -84,33 +84,29 @@
   DataStorage() {
     super(NodeType.DATA_NODE);
-    storageID = "";
   }
   public StorageInfo getBPStorage(String bpid) {
     return bpStorageMap.get(bpid);
   }
-  public DataStorage(StorageInfo storageInfo, String strgID) {
+  public DataStorage(StorageInfo storageInfo) {
     super(NodeType.DATA_NODE, storageInfo);
-    this.storageID = strgID;
   }
-  /** @return storage ID. */
-  public synchronized String getStorageID() {
-    return storageID;
+  public synchronized String getDatanodeUuid() {
+    return datanodeUuid;
   }
-  synchronized void setStorageID(String newStorageID) {
-    this.storageID = newStorageID;
+  public synchronized void setDatanodeUuid(String newDatanodeUuid) {
+    this.datanodeUuid = newDatanodeUuid;
   }
   /** Create an ID for this storage. */
-  public synchronized void createStorageID(int datanodePort) {
-    if (storageID != null && !storageID.isEmpty()) {
-      return;
+  public synchronized void createStorageID(StorageDirectory sd) {
+    if (sd.getStorageUuid() == null) {
+      sd.setStorageUuid(DatanodeStorage.generateUuid());
     }
-    storageID = DataNode.createNewStorageId(datanodePort);
   }
   /**
@@ -128,7 +124,8 @@
    * @throws IOException
    */
   synchronized void recoverTransitionRead(DataNode datanode,
-      NamespaceInfo nsInfo, Collection<File> dataDirs, StartupOption startOpt)
+      NamespaceInfo nsInfo, Collection<StorageLocation> dataDirs,
+      StartupOption startOpt)
       throws IOException {
     if (initialized) {
       // DN storage has been initialized, no need to do anything
@@ -144,8 +141,8 @@
     // Format and recover.
     this.storageDirs = new ArrayList<StorageDirectory>(dataDirs.size());
     ArrayList<StorageState> dataDirStates = new ArrayList<StorageState>(dataDirs.size());
-    for(Iterator<File> it = dataDirs.iterator(); it.hasNext();) {
-      File dataDir = it.next();
+    for(Iterator<StorageLocation> it = dataDirs.iterator(); it.hasNext();) {
+      File dataDir = it.next().getFile();
       StorageDirectory sd = new StorageDirectory(dataDir);
       StorageState curState;
       try {
@@ -162,7 +159,7 @@
       case NOT_FORMATTED: // format
         LOG.info("Storage directory " + dataDir + " is not formatted");
         LOG.info("Formatting ...");
-        format(sd, nsInfo);
+        format(sd, nsInfo, datanode.getDatanodeUuid());
         break;
       default:  // recovery part is common
         sd.doRecover(curState);
@@ -191,11 +188,9 @@
       doTransition(datanode, getStorageDir(idx), nsInfo, startOpt);
       assert this.getLayoutVersion() == nsInfo.getLayoutVersion() :
         "Data-node and name-node layout versions must be the same.";
+      createStorageID(getStorageDir(idx));
     }
-    // make sure we have storage id set - if not - generate new one
-    createStorageID(datanode.getXferPort());
     // 3. Update all storages. Some of them might have just been formatted.
     this.writeAll();
@@ -214,14 +209,14 @@
    * @throws IOException on error
    */
   void recoverTransitionRead(DataNode datanode, String bpID, NamespaceInfo nsInfo,
-      Collection<File> dataDirs, StartupOption startOpt) throws IOException {
+      Collection<StorageLocation> dataDirs, StartupOption startOpt) throws IOException {
     // First ensure datanode level format/snapshot/rollback is completed
     recoverTransitionRead(datanode, nsInfo, dataDirs, startOpt);
     // Create list of storage directories for the block pool
     Collection<File> bpDataDirs = new ArrayList<File>();
-    for(Iterator<File> it = dataDirs.iterator(); it.hasNext();) {
-      File dnRoot = it.next();
+    for(StorageLocation dir : dataDirs) {
+      File dnRoot = dir.getFile();
       File bpRoot = BlockPoolSliceStorage.getBpRoot(bpID, new File(dnRoot,
           STORAGE_DIR_CURRENT));
       bpDataDirs.add(bpRoot);
@@ -263,19 +258,28 @@
     }
   }
-  void format(StorageDirectory sd, NamespaceInfo nsInfo) throws IOException {
+  void format(StorageDirectory sd, NamespaceInfo nsInfo,
+              String datanodeUuid) throws IOException {
     sd.clearDirectory(); // create directory
     this.layoutVersion = HdfsConstants.LAYOUT_VERSION;
     this.clusterID = nsInfo.getClusterID();
     this.namespaceID = nsInfo.getNamespaceID();
     this.cTime = 0;
-    // store storageID as it currently is
+    this.datanodeUuid = datanodeUuid;
+    if (sd.getStorageUuid() == null) {
+      // Assign a new Storage UUID.
+      sd.setStorageUuid(DatanodeStorage.generateUuid());
+    }
     writeProperties(sd);
   }
   /*
    * Set ClusterID, StorageID, StorageType, CTime into
-   * DataStorage VERSION file
+   * DataStorage VERSION file.
+   * Always called just before writing the properties to
+   * the VERSION file.
    */
   @Override
   protected void setPropertiesFromFields(Properties props,
@@ -285,7 +289,13 @@
     props.setProperty("clusterID", clusterID);
     props.setProperty("cTime", String.valueOf(cTime));
     props.setProperty("layoutVersion", String.valueOf(layoutVersion));
-    props.setProperty("storageID", getStorageID());
+    props.setProperty("storageID", sd.getStorageUuid());
+    String datanodeUuid = getDatanodeUuid();
+    if (datanodeUuid != null) {
+      props.setProperty("datanodeUuid", datanodeUuid);
+    }
     // Set NamespaceID in version before federation
     if (!LayoutVersion.supports(Feature.FEDERATION, layoutVersion)) {
       props.setProperty("namespaceID", String.valueOf(namespaceID));
@@ -295,6 +305,7 @@
   /*
    * Read ClusterID, StorageID, StorageType, CTime from
    * DataStorage VERSION file and verify them.
+   * Always called just after reading the properties from the VERSION file.
    */
   @Override
   protected void setFieldsFromProperties(Properties props, StorageDirectory sd)
@@ -318,20 +329,36 @@
       setNamespaceID(props, sd);
     }
     // valid storage id, storage id may be empty
     String ssid = props.getProperty("storageID");
     if (ssid == null) {
       throw new InconsistentFSStateException(sd.getRoot(), "file "
           + STORAGE_FILE_VERSION + " is invalid.");
     }
-    String sid = getStorageID();
-    if (!(sid.equals("") || ssid.equals("") || sid.equals(ssid))) {
+    String sid = sd.getStorageUuid();
+    if (!(sid == null || sid.equals("") ||
+          ssid.equals("") || sid.equals(ssid))) {
       throw new InconsistentFSStateException(sd.getRoot(),
           "has incompatible storage Id.");
     }
-    if (sid.equals("")) { // update id only if it was empty
-      setStorageID(ssid);
+    if (sid == null) { // update id only if it was null
+      sd.setStorageUuid(ssid);
+    }
+    // Update the datanode UUID if present.
+    if (props.getProperty("datanodeUuid") != null) {
+      String dnUuid = props.getProperty("datanodeUuid");
+      if (getDatanodeUuid() == null) {
+        setDatanodeUuid(dnUuid);
+      } else if (getDatanodeUuid().compareTo(dnUuid) != 0) {
+        throw new InconsistentFSStateException(sd.getRoot(),
+            "Root " + sd.getRoot() + ": DatanodeUuid=" + dnUuid +
+            ", does not match " + getDatanodeUuid() + " from other" +
+            " StorageDirectory.");
+      }
     }
   }
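The two property hooks above read and write a plain java.util.Properties view of the per-directory VERSION file. A toy round trip of the two keys involved (the values are invented for illustration):

import java.io.StringReader;
import java.io.StringWriter;
import java.util.Properties;

public class VersionPropsExample {
  public static void main(String[] args) throws Exception {
    Properties out = new Properties();
    // Per-directory storage id plus the node-wide datanodeUuid written above.
    out.setProperty("storageID", "DS-illustrative-storage-uuid");
    out.setProperty("datanodeUuid", "illustrative-datanode-uuid");
    StringWriter w = new StringWriter();
    out.store(w, "VERSION");

    Properties in = new Properties();
    in.load(new StringReader(w.toString()));
    System.out.println(in.getProperty("storageID") + " / " + in.getProperty("datanodeUuid"));
  }
}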
} }

View File

@@ -284,7 +284,7 @@ class DataXceiver extends Receiver implements Runnable {
         BlockSender.ClientTraceLog.info(String.format(
             "src: 127.0.0.1, dest: 127.0.0.1, op: REQUEST_SHORT_CIRCUIT_FDS," +
             " blockid: %s, srvID: %s, success: %b",
-            blk.getBlockId(), dnR.getStorageID(), (fis != null)
+            blk.getBlockId(), dnR.getDatanodeUuid(), (fis != null)
             ));
       }
       if (fis != null) {
@@ -317,7 +317,7 @@
         clientName.length() > 0 && ClientTraceLog.isInfoEnabled()
           ? String.format(DN_CLIENTTRACE_FORMAT, localAddress, remoteAddress,
               "%d", "HDFS_READ", clientName, "%d",
-              dnR.getStorageID(), block, "%d")
+              dnR.getDatanodeUuid(), block, "%d")
           : dnR + " Served block " + block + " to " +
               remoteAddress;
@@ -447,6 +447,7 @@
     String mirrorNode = null;           // the name:port of next target
     String firstBadLink = "";           // first datanode that failed in connection setup
     Status mirrorInStatus = SUCCESS;
+    final String storageUuid;
     try {
       if (isDatanode ||
           stage != BlockConstructionStage.PIPELINE_CLOSE_RECOVERY) {
@@ -457,8 +458,10 @@
             stage, latestGenerationStamp, minBytesRcvd, maxBytesRcvd,
             clientname, srcDataNode, datanode, requestedChecksum,
             cachingStrategy);
+        storageUuid = blockReceiver.getStorageUuid();
       } else {
-        datanode.data.recoverClose(block, latestGenerationStamp, minBytesRcvd);
+        storageUuid = datanode.data.recoverClose(
+            block, latestGenerationStamp, minBytesRcvd);
       }
       //
@@ -590,7 +593,7 @@
       // the block is finalized in the PacketResponder.
       if (isDatanode ||
           stage == BlockConstructionStage.PIPELINE_CLOSE_RECOVERY) {
-        datanode.closeBlock(block, DataNode.EMPTY_DEL_HINT);
+        datanode.closeBlock(block, DataNode.EMPTY_DEL_HINT, storageUuid);
         LOG.info("Received " + block + " src: " + remoteAddress + " dest: "
             + localAddress + " of size " + block.getNumBytes());
       }
@@ -859,9 +862,11 @@
           dataXceiverServer.balanceThrottler, null);
       // notify name node
-      datanode.notifyNamenodeReceivedBlock(block, delHint);
+      datanode.notifyNamenodeReceivedBlock(
+          block, delHint, blockReceiver.getStorageUuid());
-      LOG.info("Moved " + block + " from " + peer.getRemoteAddressString());
+      LOG.info("Moved " + block + " from " + peer.getRemoteAddressString()
+          + ", delHint=" + delHint);
     } catch (IOException ioe) {
       opStatus = ERROR;


@@ -77,18 +77,6 @@ public class DatanodeJspHelper {
       });
   }
-  /**
-   * Internal convenience method for canonicalizing host name.
-   * @param addr name:port or name
-   * @return canonicalized host name
-   */
-  private static String canonicalize(String addr) {
-    // default port 1 is supplied to allow addr without port.
-    // the port will be ignored.
-    return NetUtils.createSocketAddr(addr, 1).getAddress()
-        .getCanonicalHostName();
-  }
   /**
    * Get the default chunk size.
    * @param conf the configuration
@@ -228,7 +216,7 @@
       }
     }
     out.print("<br><a href=\"///"
-        + canonicalize(nnAddr) + ":"
+        + JspHelper.canonicalize(nnAddr) + ":"
        + namenodeInfoPort + "/dfshealth.jsp\">Go back to DFS home</a>");
     dfs.close();
   }
@@ -359,7 +347,7 @@
     // generate a table and dump the info
     out.println("\n<table>");
-    String nnCanonicalName = canonicalize(nnAddr);
+    String nnCanonicalName = JspHelper.canonicalize(nnAddr);
     for (LocatedBlock cur : blocks) {
       out.print("<tr>");
       final String blockidstring = Long.toString(cur.getBlock().getBlockId());


@@ -19,7 +19,6 @@ package org.apache.hadoop.hdfs.server.datanode;
 import java.io.File;
 import java.io.IOException;
-import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashMap;
 import java.util.LinkedList;
@@ -230,10 +229,6 @@ public class DirectoryScanner implements Runnable {
         throw new RuntimeException(prefix + " is not a prefix of " + fullPath);
     }
-    ScanInfo(long blockId) {
-      this(blockId, null, null, null);
-    }
     ScanInfo(long blockId, File blockFile, File metaFile, FsVolumeSpi vol) {
       this.blockId = blockId;
       String condensedVolPath = vol == null ? null :
@@ -439,8 +434,8 @@
     diffs.put(bpid, diffRecord);
     statsRecord.totalBlocks = blockpoolReport.length;
-    List<Block> bl = dataset.getFinalizedBlocks(bpid);
-    Block[] memReport = bl.toArray(new Block[bl.size()]);
+    List<FinalizedReplica> bl = dataset.getFinalizedBlocks(bpid);
+    FinalizedReplica[] memReport = bl.toArray(new FinalizedReplica[bl.size()]);
     Arrays.sort(memReport); // Sort based on blockId
     int d = 0; // index for blockpoolReport
@@ -458,7 +453,8 @@
       }
       if (info.getBlockId() > memBlock.getBlockId()) {
         // Block is missing on the disk
-        addDifference(diffRecord, statsRecord, memBlock.getBlockId());
+        addDifference(diffRecord, statsRecord,
+            memBlock.getBlockId(), info.getVolume());
         m++;
         continue;
       }
@@ -478,7 +474,9 @@
       m++;
     }
     while (m < memReport.length) {
-      addDifference(diffRecord, statsRecord, memReport[m++].getBlockId());
+      FinalizedReplica current = memReport[m++];
+      addDifference(diffRecord, statsRecord,
+          current.getBlockId(), current.getVolume());
     }
     while (d < blockpoolReport.length) {
       statsRecord.missingMemoryBlocks++;
@@ -502,10 +500,11 @@
   /** Block is not found on the disk */
   private void addDifference(LinkedList<ScanInfo> diffRecord,
-      Stats statsRecord, long blockId) {
+      Stats statsRecord, long blockId,
+      FsVolumeSpi vol) {
     statsRecord.missingBlockFile++;
     statsRecord.missingMetaFile++;
-    diffRecord.add(new ScanInfo(blockId));
+    diffRecord.add(new ScanInfo(blockId, null, null, vol));
   }
   /** Is the given volume still valid in the dataset? */


@@ -54,4 +54,9 @@ public interface Replica {
    * @return the number of bytes that are visible to readers
    */
   public long getVisibleLength();
+
+  /**
+   * Return the storageUuid of the volume that stores this replica.
+   */
+  public String getStorageUuid();
 }


@@ -138,6 +138,14 @@ abstract public class ReplicaInfo extends Block implements Replica {
     this.volume = vol;
   }
+  /**
+   * Get the storageUuid of the volume that stores this replica.
+   */
+  @Override
+  public String getStorageUuid() {
+    return volume.getStorageID();
+  }
   /**
    * Return the parent directory path where this replica is located
    * @return the parent directory path where this replica is located


@@ -87,6 +87,7 @@ public class SecureDataNodeStarter implements Daemon {
   public static SecureResources getSecureResources(Configuration conf)
       throws Exception {
     HttpConfig.Policy policy = DFSUtil.getHttpPolicy(conf);
+    boolean isSecure = UserGroupInformation.isSecurityEnabled();
     // Obtain secure port for data streaming to datanode
     InetSocketAddress streamingAddr = DataNode.getStreamingAddr(conf);
@@ -106,6 +107,11 @@
           + ss.getLocalPort());
     }
+    if (ss.getLocalPort() > 1023 && isSecure) {
+      throw new RuntimeException(
+          "Cannot start secure datanode with unprivileged RPC ports");
+    }
     System.err.println("Opened streaming server at " + streamingAddr);
     // Bind a port for the web server. The code intends to bind HTTP server to
@@ -126,9 +132,9 @@
       System.err.println("Successfully obtained privileged resources (streaming port = "
           + ss + " ) (http listener port = " + listener.getConnection() +")");
-      if ((ss.getLocalPort() > 1023 || listener.getPort() > 1023) &&
-          UserGroupInformation.isSecurityEnabled()) {
-        throw new RuntimeException("Cannot start secure datanode with unprivileged ports");
+      if (listener.getPort() > 1023 && isSecure) {
+        throw new RuntimeException(
+            "Cannot start secure datanode with unprivileged HTTP ports");
       }
       System.err.println("Opened info server at " + infoSocAddr);
     }
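The two checks above enforce the same rule for the RPC and HTTP listeners separately. A tiny standalone sketch of that rule (helper and port numbers are illustrative only): ports below 1024 require root to bind on Unix, so a secure DataNode refuses to start on anything higher.

public class PrivilegedPortCheck {
  static void check(String what, int port, boolean isSecure) {
    if (isSecure && port > 1023) {
      throw new RuntimeException(
          "Cannot start secure datanode with unprivileged " + what + " port " + port);
    }
  }

  public static void main(String[] args) {
    check("RPC", 1004, true);    // passes: privileged port
    check("HTTP", 50075, true);  // throws: unprivileged port in secure mode
  }
}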


@@ -0,0 +1,101 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.server.datanode;
import java.util.regex.Pattern;
import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.util.regex.Matcher;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.hdfs.StorageType;
import org.apache.hadoop.hdfs.server.common.Util;
/**
* Encapsulates the URI and storage medium that together describe a
* storage directory.
* The default storage medium is assumed to be DISK, if none is specified.
*
*/
@InterfaceAudience.Private
public class StorageLocation {
final StorageType storageType;
final File file;
/** Regular expression that describes a storage uri with a storage type.
* e.g. [Disk]/storages/storage1/
*/
private static final Pattern regex = Pattern.compile("^\\[(\\w*)\\](.+)$");
private StorageLocation(StorageType storageType, URI uri) {
this.storageType = storageType;
if (uri.getScheme() == null ||
"file".equalsIgnoreCase(uri.getScheme())) {
// drop any (illegal) authority in the URI for backwards compatibility
this.file = new File(uri.getPath());
} else {
throw new IllegalArgumentException("Unsupported URI schema in " + uri);
}
}
public StorageType getStorageType() {
return this.storageType;
}
URI getUri() {
return file.toURI();
}
public File getFile() {
return this.file;
}
/**
* Attempt to parse a storage uri with storage class and URI. The storage
* class component of the uri is case-insensitive.
*
* @param rawLocation Location string of the format [type]uri, where [type] is
* optional.
* @return A StorageLocation object if successfully parsed, null otherwise.
* Does not throw any exceptions.
*/
static StorageLocation parse(String rawLocation) throws IOException {
Matcher matcher = regex.matcher(rawLocation);
StorageType storageType = StorageType.DEFAULT;
String location = rawLocation;
if (matcher.matches()) {
String classString = matcher.group(1);
location = matcher.group(2);
if (!classString.isEmpty()) {
storageType = StorageType.valueOf(classString.toUpperCase());
}
}
return new StorageLocation(storageType, Util.stringAsURI(location));
}
@Override
public String toString() {
return "[" + storageType + "]" + file.toURI();
}
}
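A brief usage sketch of the new class above. Because parse() is package-private, the example lives in the same package; the input strings follow the [type]uri format documented in the class and are invented for illustration.

package org.apache.hadoop.hdfs.server.datanode;

public class StorageLocationParseExample {
  public static void main(String[] args) throws Exception {
    // The [TYPE] prefix is optional and case-insensitive; DISK is the default.
    StorageLocation typed = StorageLocation.parse("[DISK]/storages/storage1/");
    StorageLocation plain = StorageLocation.parse("/storages/storage2");
    System.out.println(typed.getStorageType() + " " + typed.getFile());
    System.out.println(plain.getStorageType() + " " + plain.getFile());
  }
}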


@@ -34,12 +34,15 @@ import org.apache.hadoop.hdfs.protocol.ExtendedBlock;
 import org.apache.hadoop.hdfs.protocol.HdfsBlocksMetadata;
 import org.apache.hadoop.hdfs.server.datanode.DataNode;
 import org.apache.hadoop.hdfs.server.datanode.DataStorage;
+import org.apache.hadoop.hdfs.server.datanode.FinalizedReplica;
 import org.apache.hadoop.hdfs.server.datanode.Replica;
 import org.apache.hadoop.hdfs.server.datanode.ReplicaInPipelineInterface;
 import org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.FsDatasetFactory;
 import org.apache.hadoop.hdfs.server.datanode.metrics.FSDatasetMBean;
 import org.apache.hadoop.hdfs.server.protocol.BlockRecoveryCommand.RecoveringBlock;
+import org.apache.hadoop.hdfs.server.protocol.DatanodeStorage;
 import org.apache.hadoop.hdfs.server.protocol.ReplicaRecoveryInfo;
+import org.apache.hadoop.hdfs.server.protocol.StorageReport;
 import org.apache.hadoop.util.DiskChecker.DiskErrorException;
 import org.apache.hadoop.util.ReflectionUtils;
@@ -86,17 +89,18 @@ public interface FsDatasetSpi<V extends FsVolumeSpi> extends FSDatasetMBean {
   /** @return a list of volumes. */
   public List<V> getVolumes();
+  /** @return one or more storage reports for attached volumes. */
+  public StorageReport[] getStorageReports(String bpid)
+      throws IOException;
   /** @return the volume that contains a replica of the block. */
   public V getVolume(ExtendedBlock b);
   /** @return a volume information map (name => info). */
   public Map<String, Object> getVolumeInfoMap();
-  /** @return a list of block pools. */
-  public String[] getBlockPoolList();
   /** @return a list of finalized blocks for the given block pool. */
-  public List<Block> getFinalizedBlocks(String bpid);
+  public List<FinalizedReplica> getFinalizedBlocks(String bpid);
   /**
    * Check whether the in-memory block record matches the block on the disk,
@@ -239,9 +243,10 @@
    * @param b block
    * @param newGS the new generation stamp for the replica
    * @param expectedBlockLen the number of bytes the replica is expected to have
+   * @return the storage uuid of the replica.
    * @throws IOException
    */
-  public void recoverClose(ExtendedBlock b, long newGS, long expectedBlockLen
+  public String recoverClose(ExtendedBlock b, long newGS, long expectedBlockLen
       ) throws IOException;
   /**
@@ -262,12 +267,11 @@
   public void unfinalizeBlock(ExtendedBlock b) throws IOException;
   /**
-   * Returns the block report - the full list of blocks stored under a
-   * block pool
+   * Returns one block report per volume.
    * @param bpid Block Pool Id
-   * @return - the block report - the full list of blocks stored
+   * @return - a map of DatanodeStorage to block report for the volume.
    */
-  public BlockListAsLongs getBlockReport(String bpid);
+  public Map<DatanodeStorage, BlockListAsLongs> getBlockReports(String bpid);
   /**
    * Returns the cache report - the full list of cached block IDs of a
* Returns the cache report - the full list of cached block IDs of a * Returns the cache report - the full list of cached block IDs of a
@ -408,3 +412,4 @@ public interface FsDatasetSpi<V extends FsVolumeSpi> extends FSDatasetMBean {
public HdfsBlocksMetadata getHdfsBlocksMetadata(List<ExtendedBlock> blocks) public HdfsBlocksMetadata getHdfsBlocksMetadata(List<ExtendedBlock> blocks)
throws IOException; throws IOException;
} }
Some files were not shown because too many files have changed in this diff.