HADOOP-12666. Support Microsoft Azure Data Lake - as a file system in Hadoop. Contributed by Vishwajeet Dusane.

This commit is contained in:
Chris Nauroth 2016-06-09 14:33:31 -07:00
parent e383b732c5
commit 9581fb715c
30 changed files with 2954 additions and 0 deletions

View File

@ -2213,4 +2213,64 @@
needs to be specified in net.topology.script.file.name. needs to be specified in net.topology.script.file.name.
</description> </description>
</property> </property>
<!-- Azure Data Lake File System Configurations -->
<property>
<name>adl.feature.override.readahead</name>
<value>true</value>
<description>
Enables read aheads in the ADL client, the feature is used to
improve read throughput.
This works in conjunction with the value set in
adl.feature.override.readahead.max.buffersize.
When set to false the read ahead feature is turned off.
Default : True if not configured.
</description>
</property>
<property>
<name>adl.feature.override.readahead.max.buffersize</name>
<value>8388608</value>
<description>
Defines the maximum buffer size used to cache read ahead data. The
buffer is allocated once per process. Applicable only when
adl.feature.override.readahead is set to true.
Default : 8388608 bytes, i.e. 8MB, if not configured.
</description>
</property>
<property>
<name>adl.feature.override.readahead.max.concurrent.connection</name>
<value>2</value>
<description>
Defines the maximum number of concurrent connections that can be
established for read ahead. If the data size is less than 4MB then
only 1 read network connection is used. If the data size is greater
than 4MB but less than 8MB then 2 read network connections are used.
For data greater than 8MB the value set for this property takes
effect. Applicable only when adl.feature.override.readahead is set
to true and the buffer size is greater than 8MB.
It is recommended to reset this property if
adl.feature.override.readahead.max.buffersize
is less than 8MB to gain performance. Applications also have to
consider the throttling limit for the account before configuring a
large buffer size.
</description>
</property>
<property>
<name>fs.adl.impl</name>
<value>org.apache.hadoop.fs.adl.AdlFileSystem</value>
</property>
<property>
<name>fs.AbstractFileSystem.adl.impl</name>
<value>org.apache.hadoop.fs.adl.Adl</value>
</property>
</configuration> </configuration>

View File

@ -102,6 +102,12 @@ public class TestCommonConfigurationFields extends TestConfigurationFieldsBase {
xmlPrefixToSkipCompare.add("s3."); xmlPrefixToSkipCompare.add("s3.");
xmlPrefixToSkipCompare.add("s3native."); xmlPrefixToSkipCompare.add("s3native.");
// ADL properties are in a different subtree
// - org.apache.hadoop.hdfs.web.ADLConfKeys
xmlPrefixToSkipCompare.add("adl.");
xmlPropsToSkipCompare.add("fs.adl.impl");
xmlPropsToSkipCompare.add("fs.AbstractFileSystem.adl.impl");
// Deprecated properties. These should eventually be removed from the // Deprecated properties. These should eventually be removed from the
// class. // class.
configurationPropsToSkipCompare configurationPropsToSkipCompare

View File

@ -146,6 +146,8 @@
<menu name="Hadoop Compatible File Systems" inherit="top"> <menu name="Hadoop Compatible File Systems" inherit="top">
<item name="Amazon S3" href="hadoop-aws/tools/hadoop-aws/index.html"/> <item name="Amazon S3" href="hadoop-aws/tools/hadoop-aws/index.html"/>
<item name="Azure Blob Storage" href="hadoop-azure/index.html"/> <item name="Azure Blob Storage" href="hadoop-azure/index.html"/>
<item name="Azure Data Lake Storage"
href="hadoop-azure-datalake/index.html"/>
<item name="OpenStack Swift" href="hadoop-openstack/index.html"/> <item name="OpenStack Swift" href="hadoop-openstack/index.html"/>
</menu> </menu>

View File

@ -0,0 +1,24 @@
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<FindBugsFilter>
<!-- Buffer object is accessed within trusted code and is intentionally assigned instead of being copied -->
<Match>
<Class name="org.apache.hadoop.hdfs.web.PrivateAzureDataLakeFileSystem$BatchAppendOutputStream$CommitTask"/>
<Bug pattern="EI_EXPOSE_REP2"/>
<Priority value="2"/>
</Match>
</FindBugsFilter>

View File

@ -0,0 +1,180 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-project</artifactId>
<version>3.0.0-alpha1-SNAPSHOT</version>
<relativePath>../../hadoop-project</relativePath>
</parent>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-azure-datalake</artifactId>
<name>Apache Hadoop Azure Data Lake support</name>
<description>
This module contains code to support integration with Azure Data Lake.
</description>
<packaging>jar</packaging>
<properties>
<okHttpVersion>2.4.0</okHttpVersion>
<minimalJsonVersion>0.9.1</minimalJsonVersion>
<file.encoding>UTF-8</file.encoding>
<downloadSources>true</downloadSources>
</properties>
<build>
<plugins>
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>findbugs-maven-plugin</artifactId>
<configuration>
<findbugsXmlOutput>true</findbugsXmlOutput>
<xmlOutput>true</xmlOutput>
<excludeFilterFile>
${basedir}/dev-support/findbugs-exclude.xml
</excludeFilterFile>
<effort>Max</effort>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-project-info-reports-plugin</artifactId>
<configuration>
<dependencyDetailsEnabled>false</dependencyDetailsEnabled>
<dependencyLocationsEnabled>false
</dependencyLocationsEnabled>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-jar-plugin</artifactId>
<executions>
<execution>
<goals>
<goal>test-jar</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-dependency-plugin</artifactId>
<executions>
<execution>
<id>deplist</id>
<phase>compile</phase>
<goals>
<goal>list</goal>
</goals>
<configuration>
<!-- build a shellprofile -->
<outputFile>${project.basedir}/target/hadoop-tools-deps/${project.artifactId}.tools-optional.txt</outputFile>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
<!--
The following is to suppress a m2e warning in eclipse
(m2e doesn't know how to handle maven-enforcer:enforce, so we have to tell m2e to ignore it)
see: http://stackoverflow.com/questions/13040788/how-to-elimate-the-maven-enforcer-plugin-goal-enforce-is-ignored-by-m2e-wa
-->
<pluginManagement>
<plugins>
<plugin>
<groupId>org.eclipse.m2e</groupId>
<artifactId>lifecycle-mapping</artifactId>
<version>1.0.0</version>
<configuration>
<lifecycleMappingMetadata>
<pluginExecutions>
<pluginExecution>
<pluginExecutionFilter>
<groupId>org.apache.maven.plugins
</groupId>
<artifactId>maven-enforcer-plugin
</artifactId>
<versionRange>[1.0.0,)</versionRange>
<goals>
<goal>enforce</goal>
</goals>
</pluginExecutionFilter>
<action>
<ignore/>
</action>
</pluginExecution>
</pluginExecutions>
</lifecycleMappingMetadata>
</configuration>
</plugin>
</plugins>
</pluginManagement>
</build>
<dependencies>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<exclusions>
<exclusion>
<artifactId>servlet-api</artifactId>
<groupId>javax.servlet</groupId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs-client</artifactId>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-all</artifactId>
<scope>test</scope>
</dependency>
<dependency>
  <groupId>com.eclipsesource.minimal-json</groupId>
  <artifactId>minimal-json</artifactId>
  <!-- Version is managed through the minimalJsonVersion property declared
       in this POM's <properties> section instead of being duplicated. -->
  <version>${minimalJsonVersion}</version>
  <scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<scope>test</scope>
<type>test-jar</type>
</dependency>
<dependency>
  <groupId>com.squareup.okhttp</groupId>
  <artifactId>mockwebserver</artifactId>
  <!-- Version is managed through the okHttpVersion property declared
       in this POM's <properties> section instead of being duplicated. -->
  <version>${okHttpVersion}</version>
  <scope>test</scope>
</dependency>
</dependencies>
</project>

View File

@ -0,0 +1,52 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package org.apache.hadoop.fs.adl;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.DelegateToFileSystem;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
/**
 * Binding for the {@code adl://} scheme that forwards every call to an
 * {@link AdlFileSystem}. This is the class configured under
 * {@code fs.AbstractFileSystem.adl.impl}.
 */
public class Adl extends DelegateToFileSystem {

  /**
   * Builds the delegating file system.
   *
   * @param theUri URI of the ADL store being accessed
   * @param conf configuration handed to both the delegate and the superclass
   * @throws IOException propagated from the superclass constructor
   * @throws URISyntaxException propagated from the superclass constructor
   */
  Adl(URI theUri, Configuration conf) throws IOException, URISyntaxException {
    super(theUri, newConfiguredFileSystem(conf), conf, AdlFileSystem.SCHEME,
        false);
  }

  /**
   * Reports the port used when the URI does not carry one.
   *
   * @return {@link AdlFileSystem#DEFAULT_PORT}
   */
  @Override
  public final int getUriDefaultPort() {
    return AdlFileSystem.DEFAULT_PORT;
  }

  /**
   * Instantiates the delegate and attaches the configuration to it.
   *
   * @param conf configuration to set on the new file system
   * @return a new {@link AdlFileSystem} carrying {@code conf}
   */
  private static AdlFileSystem newConfiguredFileSystem(Configuration conf) {
    final AdlFileSystem delegate = new AdlFileSystem();
    delegate.setConf(conf);
    return delegate;
  }
}

View File

@ -0,0 +1,41 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package org.apache.hadoop.fs.adl;
import org.apache.hadoop.hdfs.web.PrivateAzureDataLakeFileSystem;
/**
 * File system implementation published for the {@code adl} scheme. All
 * behaviour is inherited from {@link PrivateAzureDataLakeFileSystem}; this
 * class only pins the scheme name and the default port.
 */
public class AdlFileSystem extends PrivateAzureDataLakeFileSystem {

  /** URI scheme served by this file system. */
  public static final String SCHEME = "adl";

  /** Port reported when the URI does not specify one. */
  public static final int DEFAULT_PORT = 443;

  /**
   * @return {@link #DEFAULT_PORT}
   */
  @Override
  public int getDefaultPort() {
    return DEFAULT_PORT;
  }

  /**
   * @return {@link #SCHEME}
   */
  @Override
  public String getScheme() {
    return SCHEME;
  }
}

View File

@ -0,0 +1,135 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package org.apache.hadoop.fs.adl.oauth2;
import java.io.IOException;
import java.util.Map;
import java.util.LinkedHashMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.web.oauth2.AccessTokenProvider;
import org.apache.hadoop.hdfs.web.oauth2.ConfRefreshTokenBasedAccessTokenProvider;
import org.apache.hadoop.hdfs.web.oauth2.PrivateCachedRefreshTokenBasedAccessTokenProvider;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import static org.apache.hadoop.hdfs.client.HdfsClientConfigKeys.OAUTH_CLIENT_ID_KEY;
import static org.apache.hadoop.hdfs.client.HdfsClientConfigKeys.OAUTH_REFRESH_URL_KEY;
import static org.apache.hadoop.hdfs.web.oauth2.ConfRefreshTokenBasedAccessTokenProvider.OAUTH_REFRESH_TOKEN_KEY;
/**
 * Access token provider that reuses one underlying {@link AccessTokenProvider}
 * per OAuth2 client ID. Instances of this class are thin handles: the real
 * provider lives in a static, size-bounded cache shared by the whole process,
 * so the cost of refreshing tokens is amortized across ADLS instances that
 * use the same client ID.
 */
public class CachedRefreshTokenBasedAccessTokenProvider
    extends PrivateCachedRefreshTokenBasedAccessTokenProvider {

  /** Boolean configuration key; when true, the cached entry is replaced. */
  public static final String FORCE_REFRESH = "adl.force.token.refresh";

  private static final Logger LOG =
      LoggerFactory.getLogger(CachedRefreshTokenBasedAccessTokenProvider.class);

  /** Limit size of provider cache. */
  static final int MAX_PROVIDERS = 10;

  /**
   * Client ID to provider map. It is guarded by its own monitor and drops its
   * eldest entry once it grows beyond {@link #MAX_PROVIDERS}.
   */
  @SuppressWarnings("serial")
  private static final Map<String, AccessTokenProvider> CACHE =
      new LinkedHashMap<String, AccessTokenProvider>() {
        @Override
        public boolean removeEldestEntry(
            Map.Entry<String, AccessTokenProvider> eldest) {
          return size() > MAX_PROVIDERS;
        }
      };

  /** Provider selected by the last {@link #setConf} call; null before it. */
  private AccessTokenProvider instance = null;

  /**
   * Create handle for cached instance.
   */
  public CachedRefreshTokenBasedAccessTokenProvider() {
  }

  /**
   * Delegates to the provider chosen by {@link #setConf}.
   *
   * @return Valid OAuth2 access token for the user.
   * @throws IOException when system error, internal server error or user error
   */
  @Override
  public synchronized String getAccessToken() throws IOException {
    return instance.getAccessToken();
  }

  /**
   * @return The Configuration held by the underlying provider, which may
   *         come from an earlier, cached entry.
   */
  @Override
  public synchronized Configuration getConf() {
    return instance.getConf();
  }

  /**
   * Binds this handle to the provider cached for the configured client ID,
   * creating and caching a new provider when there is none, when
   * {@link #FORCE_REFRESH} is set, or when the refresh token or refresh URL
   * differs from the cached provider's configuration.
   *
   * @param conf Configuration instance
   * @throws IllegalArgumentException if no client ID is configured
   */
  @Override
  public synchronized void setConf(Configuration conf) {
    final String clientId = conf.get(OAUTH_CLIENT_ID_KEY);
    if (clientId == null) {
      throw new IllegalArgumentException("Missing client ID");
    }
    synchronized (CACHE) {
      instance = CACHE.get(clientId);
      final boolean needsFreshProvider = instance == null
          || conf.getBoolean(FORCE_REFRESH, false)
          || replace(instance, conf);
      if (!needsFreshProvider) {
        return;
      }
      instance = newInstance();
      // the provider gets its own copy of the configuration
      instance.setConf(new Configuration(conf));
      CACHE.put(clientId, instance);
      LOG.debug("Created new client {}", clientId);
    }
  }

  /**
   * Factory for the underlying provider.
   *
   * @return a new, unconfigured provider
   */
  AccessTokenProvider newInstance() {
    return new ConfRefreshTokenBasedAccessTokenProvider();
  }

  /**
   * Decides whether a cached provider is stale for the given configuration.
   *
   * @param cached provider currently stored for the client ID
   * @param c2 configuration being applied
   * @return true if the refresh token or the refresh URL differ
   */
  private static boolean replace(AccessTokenProvider cached, Configuration c2) {
    // ConfRefreshTokenBasedAccessTokenProvider::setConf asserts !null
    final Configuration c1 = cached.getConf();
    final String[] identityKeys = {
        OAUTH_REFRESH_TOKEN_KEY, OAUTH_REFRESH_URL_KEY };
    for (int i = 0; i < identityKeys.length; i++) {
      final String cachedValue = c1.get(identityKeys[i]);
      final String requestedValue = c2.get(identityKeys[i]);
      if (!cachedValue.equals(requestedValue)) {
        // replace cached instance for this clientID
        return true;
      }
    }
    return false;
  }
}

View File

@ -0,0 +1,23 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
/**
 * Public interface that exposes OAuth2 authentication related features.
*/
package org.apache.hadoop.fs.adl.oauth2;

View File

@ -0,0 +1,23 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
/**
 * Public classes that expose the Azure Data Lake file system under the
 * adl:// scheme.
*/
package org.apache.hadoop.fs.adl;

View File

@ -0,0 +1,61 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package org.apache.hadoop.hdfs.web;
/**
 * Configuration key names and default values used by the ADL client.
 * This is a constants holder and cannot be instantiated.
 */
public final class ADLConfKeys {

  // ---- Size units and fixed defaults ----

  /** Number of bytes in a kilobyte. */
  static final int KB = 1024;
  /** Number of bytes in a megabyte. */
  static final int MB = KB * KB;
  /** Default block size: 4MB. */
  static final int DEFAULT_BLOCK_SIZE = 4 * MB;
  /** Default extent size: 256MB. */
  static final int DEFAULT_EXTENT_SIZE = 256 * MB;
  /** Default timeout, expressed in seconds. */
  static final int DEFAULT_TIMEOUT_IN_SECONDS = 120;

  // ---- Read ahead ----

  /** Key that switches the read ahead feature on or off. */
  static final String ADL_FEATURE_CONCURRENT_READ_WITH_READ_AHEAD =
      "adl.feature.override.readahead";
  /** Read ahead is enabled unless configured otherwise. */
  static final boolean ADL_FEATURE_CONCURRENT_READ_WITH_READ_AHEAD_DEFAULT =
      true;

  /** Key for the maximum buffer size used to cache read ahead data. */
  static final String ADL_FEATURE_CONCURRENT_READ_WITH_READ_AHEAD_BUFFER_SIZE =
      "adl.feature.override.readahead.max.buffersize";
  /** Default read ahead buffer size: 8MB. */
  static final int
      ADL_FEATURE_CONCURRENT_READ_WITH_READ_AHEAD_BUFFER_SIZE_DEFAULT =
      8 * MB;

  /** Key for the maximum number of concurrent read ahead connections. */
  public static final String
      ADL_FEATURE_CONCURRENT_READ_AHEAD_MAX_CONCURRENT_CONN =
      "adl.feature.override.readahead.max.concurrent.connection";
  /** Default number of concurrent read ahead connections. */
  public static final int
      ADL_FEATURE_CONCURRENT_READ_AHEAD_MAX_CONCURRENT_CONN_DEFAULT = 2;

  // ---- Other feature overrides ----

  /** Key for the redirection-off override. */
  static final String ADL_FEATURE_REDIRECT_OFF =
      "adl.feature.override.redirection.off";
  /** Default value for {@link #ADL_FEATURE_REDIRECT_OFF}. */
  static final boolean ADL_FEATURE_REDIRECT_OFF_DEFAULT = true;

  /** Key for the locally bundled get-block-location override. */
  static final String ADL_FEATURE_GET_BLOCK_LOCATION_LOCALLY_BUNDLED =
      "adl.feature.override.getblocklocation.locally.bundled";
  /** Default for {@link #ADL_FEATURE_GET_BLOCK_LOCATION_LOCALLY_BUNDLED}. */
  static final boolean ADL_FEATURE_GET_BLOCK_LOCATION_LOCALLY_BUNDLED_DEFAULT
      = true;

  // ---- Debug and versioning ----

  /** Debug key; by its name, treats the local user as the file owner. */
  static final String ADL_DEBUG_OVERRIDE_LOCAL_USER_AS_OWNER =
      "adl.debug.override.localuserasfileowner";
  /** Default for {@link #ADL_DEBUG_OVERRIDE_LOCAL_USER_AS_OWNER}. */
  static final boolean ADL_DEBUG_SET_LOCAL_USER_AS_OWNER_DEFAULT = false;

  /** Name under which the web SDK version is keyed. */
  public static final String ADL_WEBSDK_VERSION_KEY = "ADLFeatureSet";

  private ADLConfKeys() {
    // constants only
  }
}

View File

@ -0,0 +1,180 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package org.apache.hadoop.hdfs.web;
/**
 * Process-wide holder for a single buffered block of stream data.
 *
 * Exactly one block is kept at a time, identified by the path of the stream
 * it was read from and the stream offset at which it starts. Storing a new
 * block replaces the previous one. Sharing the block across file system
 * instances lets a reader skip a backend round trip when the data it needs
 * is already buffered.
 *
 * The methods of this class do no locking of their own. {@link #getLock()}
 * exposes one shared monitor object, presumably for callers to synchronize
 * on around compound check-then-read sequences (confirm against callers).
 */
final class BufferManager {
  private static final BufferManager BUFFER_MANAGER_INSTANCE =
      new BufferManager();

  // Must be final: a reassignable monitor would let two threads synchronize
  // on different objects and so defeat mutual exclusion.
  private static final Object LOCK = new Object();

  private Buffer buffer = null;
  private String fileName;

  /**
   * Constructor. Private: use {@link #getInstance()}.
   */
  private BufferManager() {
  }

  /**
   * @return The single monitor object shared by all users of this class.
   */
  public static Object getLock() {
    return LOCK;
  }

  /**
   * @return The process-wide instance.
   */
  public static BufferManager getInstance() {
    return BUFFER_MANAGER_INSTANCE;
  }

  /**
   * Validate if the current buffer block is of given stream.
   *
   * @param path ADL stream path
   * @param offset Stream offset that caller is interested in
   * @return True if a block is held for {@code path} and {@code offset}
   *         falls inside it, otherwise false
   */
  boolean hasValidDataForOffset(String path, long offset) {
    if (this.fileName == null) {
      return false;
    }

    if (!this.fileName.equals(path)) {
      return false;
    }

    if (buffer == null) {
      return false;
    }

    if ((offset < buffer.offset) || (offset >= (buffer.offset
        + buffer.data.length))) {
      return false;
    }

    return true;
  }

  /**
   * Clean buffer block. The stream path is dropped with it so that no stale
   * reference outlives the data it described.
   */
  void clear() {
    buffer = null;
    fileName = null;
  }

  /**
   * Validate if the current buffer block is of given stream and fully covers
   * the requested range. Partially available data is not supported: the
   * whole range {@code [offset, offset + size)} must lie inside the block.
   *
   * @param path Stream path
   * @param offset Offset of the stream
   * @param size Size of the data from the offset of the stream caller
   *             interested in
   * @return True if the data is available from the given offset and of the
   *         size caller is interested in.
   */
  boolean hasData(String path, long offset, int size) {
    if (!hasValidDataForOffset(path, offset)) {
      return false;
    }

    // Both sides are computed as long, so the sums cannot overflow int.
    if ((size + offset) > (buffer.data.length + buffer.offset)) {
      return false;
    }

    return true;
  }

  /**
   * Copy {@code data.length} bytes out of the buffer block, starting at the
   * requested stream offset. It is the caller's responsibility to check
   * first (see {@link #hasData}) that the block belongs to the stream of
   * interest and covers the range; no validation is done here.
   *
   * @param data Byte array to be filled from the buffer block
   * @param offset Stream offset from which data is to be fetched.
   */
  void get(byte[] data, long offset) {
    System.arraycopy(buffer.data, (int) (offset - buffer.offset), data, 0,
        data.length);
  }

  /**
   * Create new empty buffer block of the given size.
   *
   * @param len Size of the buffer block.
   * @return Empty byte array.
   */
  byte[] getEmpty(int len) {
    return new byte[len];
  }

  /**
   * Replace the held block with one freshly pulled from the backend. The
   * array is stored as is, not copied. A null {@code data} is ignored and
   * leaves the current block untouched.
   *
   * @param data Buffer
   * @param path Stream path to which buffer belongs to
   * @param offset Stream offset where buffer start with
   */
  void add(byte[] data, String path, long offset) {
    if (data == null) {
      return;
    }

    buffer = new Buffer();
    buffer.data = data;
    buffer.offset = offset;
    this.fileName = path;
  }

  /**
   * Caller must ensure a block is held; there is no null check here.
   *
   * @return Size of the buffer.
   */
  int getBufferSize() {
    return buffer.data.length;
  }

  /**
   * Caller must ensure a block is held; there is no null check here.
   *
   * @return Stream offset where buffer start with
   */
  long getBufferOffset() {
    return buffer.offset;
  }

  /**
   * Buffer container: the bytes plus the stream offset of the first byte.
   */
  static class Buffer {
    private byte[] data;
    private long offset;
  }
}

View File

@ -0,0 +1,37 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package org.apache.hadoop.hdfs.web.oauth2;
import org.apache.hadoop.classification.InterfaceAudience.Private;
import org.apache.hadoop.classification.InterfaceStability.Unstable;
/**
 * Public, abstract re-export of {@link AccessTokenProvider} so that it can
 * be extended from outside this package. In this module it is extended by
 * {@code org.apache.hadoop.fs.adl.oauth2
 * .CachedRefreshTokenBasedAccessTokenProvider}, which caches token providers
 * for the process. This class adds no members of its own.
 */
@Private
@Unstable
public abstract class PrivateCachedRefreshTokenBasedAccessTokenProvider
extends AccessTokenProvider {
// visibility workaround: exists only to widen access to the superclass
// NOTE(review): presumably AccessTokenProvider cannot be extended directly
// from other packages - confirm against its declaration.
}

View File

@ -0,0 +1,24 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
/**
* A distributed implementation of {@link
* org.apache.hadoop.hdfs.web.oauth2} for oauth2 token management support.
*/
package org.apache.hadoop.hdfs.web.oauth2;

View File

@ -0,0 +1,25 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
/**
* A distributed implementation of {@link org.apache.hadoop.hdfs.web} for
* reading and writing files on Azure data lake file system. This
* implementation is derivation from the webhdfs specification.
*/
package org.apache.hadoop.hdfs.web;

View File

@ -0,0 +1,49 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package org.apache.hadoop.hdfs.web.resources;
/**
 * Boolean query parameter named {@code flush}. It tells the backend server
 * that all the data has been pushed over the stream.
 *
 * Used with the Create and Append operations.
 */
public class ADLFlush extends BooleanParam {

  /**
   * Name of the query parameter.
   */
  public static final String NAME = "flush";

  /** Domain shared by every instance of this parameter. */
  private static final Domain DOMAIN = new Domain(NAME);

  /**
   * Creates the parameter.
   *
   * @param value the parameter value.
   */
  public ADLFlush(final Boolean value) {
    super(DOMAIN, value);
  }

  /**
   * @return {@link #NAME}
   */
  @Override
  public final String getName() {
    return NAME;
  }
}

View File

@ -0,0 +1,96 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package org.apache.hadoop.hdfs.web.resources;
import java.net.HttpURLConnection;
/**
 * GET operation parameter for Azure Data Lake storage. It is a variant of
 * the WebHDFS GetOpParam whose operations are declared without redirection.
 */
public class ADLGetOpParam extends HttpOpParam<ADLGetOpParam.Op> {

  /** Domain used to parse operation names into {@link Op} values. */
  private static final Domain<Op> DOMAIN = new Domain<Op>(NAME, Op.class);

  /**
   * Creates the parameter from its textual form.
   *
   * @param str a string representation of the parameter value.
   */
  public ADLGetOpParam(final String str) {
    super(DOMAIN, DOMAIN.parse(str));
  }

  /**
   * @return the parameter name inherited from {@link HttpOpParam}
   */
  @Override
  public final String getName() {
    return NAME;
  }

  /**
   * Supported GET operations.
   */
  public enum Op implements HttpOpParam.Op {
    /** Open a file: no redirect, expects HTTP 200. */
    OPEN(false, HttpURLConnection.HTTP_OK);

    private final boolean redirect;
    private final int expectedHttpResponseCode;
    private final boolean requireAuth;

    /**
     * Declares an operation that does not require authentication.
     */
    Op(final boolean redirected, final int expectedCode) {
      this(redirected, expectedCode, false);
    }

    /**
     * Declares an operation with every attribute given explicitly.
     */
    Op(final boolean redirected, final int expectedCode,
        final boolean authRequired) {
      this.requireAuth = authRequired;
      this.expectedHttpResponseCode = expectedCode;
      this.redirect = redirected;
    }

    /** @return always {@link HttpOpParam.Type#GET} */
    @Override
    public HttpOpParam.Type getType() {
      return HttpOpParam.Type.GET;
    }

    /** @return always false: GET operations send no request body */
    @Override
    public boolean getDoOutput() {
      return false;
    }

    /** @return whether the operation is declared as redirecting */
    @Override
    public boolean getRedirect() {
      return redirect;
    }

    /** @return whether the operation is declared as requiring auth */
    @Override
    public boolean getRequireAuth() {
      return requireAuth;
    }

    /** @return the HTTP status code expected on success */
    @Override
    public int getExpectedHttpResponseCode() {
      return expectedHttpResponseCode;
    }

    /** @return the operation as a {@code name=value} query fragment */
    @Override
    public String toQueryString() {
      return NAME + "=" + this;
    }
  }
}

View File

@ -0,0 +1,97 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package org.apache.hadoop.hdfs.web.resources;
import java.net.HttpURLConnection;
/**
 * WebHDFS-style POST operation parameter for Azure Data Lake storage. Its
 * only operation, {@code APPEND}, is declared with redirect disabled.
 */
public class ADLPostOpParam extends HttpOpParam<ADLPostOpParam.Op> {
  private static final Domain<Op> DOMAIN =
      new Domain<ADLPostOpParam.Op>(NAME, Op.class);

  /**
   * Builds the parameter by parsing its textual form.
   *
   * @param str a string representation of the parameter value.
   */
  public ADLPostOpParam(final String str) {
    super(DOMAIN, DOMAIN.parse(str));
  }

  @Override
  public final String getName() {
    return NAME;
  }

  /**
   * POST operations available for ADL.
   */
  public enum Op implements HttpOpParam.Op {
    APPEND(true, false, HttpURLConnection.HTTP_OK);

    private final boolean doOutput;
    private final boolean redirect;
    private final int expectedHttpResponseCode;

    /**
     * @param sendsBody value reported by {@link #getDoOutput()}
     * @param followRedirect value reported by {@link #getRedirect()}
     * @param responseCode value reported by
     *                     {@link #getExpectedHttpResponseCode()}
     */
    Op(final boolean sendsBody, final boolean followRedirect,
        final int responseCode) {
      this.expectedHttpResponseCode = responseCode;
      this.redirect = followRedirect;
      this.doOutput = sendsBody;
    }

    @Override
    public HttpOpParam.Type getType() {
      return HttpOpParam.Type.POST;
    }

    @Override
    public boolean getDoOutput() {
      return doOutput;
    }

    @Override
    public boolean getRedirect() {
      return redirect;
    }

    /** POST operations declared here never require authentication. */
    @Override
    public boolean getRequireAuth() {
      return false;
    }

    @Override
    public int getExpectedHttpResponseCode() {
      return expectedHttpResponseCode;
    }

    /**
     * @return a URI query string.
     */
    @Override
    public String toQueryString() {
      return NAME + "=" + this;
    }
  }
}

View File

@ -0,0 +1,94 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package org.apache.hadoop.hdfs.web.resources;
import java.net.HttpURLConnection;
/**
 * WebHDFS-style PUT operation parameter for Azure Data Lake storage. Its only
 * operation, {@code CREATE}, is declared with redirect disabled.
 */
public class ADLPutOpParam extends HttpOpParam<ADLPutOpParam.Op> {
  private static final Domain<Op> DOMAIN = new Domain<Op>(NAME, Op.class);

  /**
   * Builds the parameter by parsing its textual form.
   *
   * @param str a string representation of the parameter value.
   */
  public ADLPutOpParam(final String str) {
    super(DOMAIN, DOMAIN.parse(str));
  }

  @Override
  public final String getName() {
    return NAME;
  }

  /**
   * PUT operations available for ADL.
   */
  public enum Op implements HttpOpParam.Op {
    CREATE(true, false, HttpURLConnection.HTTP_CREATED);

    private final boolean doOutput;
    private final boolean redirect;
    private final boolean requireAuth;
    private final int expectedHttpResponseCode;

    /**
     * Operations built through this constructor never require
     * authentication.
     *
     * @param sendsBody value reported by {@link #getDoOutput()}
     * @param followRedirect value reported by {@link #getRedirect()}
     * @param responseCode value reported by
     *                     {@link #getExpectedHttpResponseCode()}
     */
    Op(final boolean sendsBody, final boolean followRedirect,
        final int responseCode) {
      this.requireAuth = false;
      this.expectedHttpResponseCode = responseCode;
      this.redirect = followRedirect;
      this.doOutput = sendsBody;
    }

    @Override
    public HttpOpParam.Type getType() {
      return HttpOpParam.Type.PUT;
    }

    @Override
    public boolean getDoOutput() {
      return doOutput;
    }

    @Override
    public boolean getRedirect() {
      return redirect;
    }

    @Override
    public boolean getRequireAuth() {
      return requireAuth;
    }

    @Override
    public int getExpectedHttpResponseCode() {
      return expectedHttpResponseCode;
    }

    /**
     * @return the operation as a URI query string fragment.
     */
    @Override
    public String toQueryString() {
      return NAME + "=" + this;
    }
  }
}

View File

@ -0,0 +1,51 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package org.apache.hadoop.hdfs.web.resources;
import org.apache.hadoop.hdfs.web.ADLConfKeys;
import java.util.regex.Pattern;
/**
 * Query parameter carrying the ADL jar version information, which the
 * backend needs for debugging and analysis.
 */
public class ADLVersionInfo extends StringParam {
  /**
   * Parameter name.
   */
  public static final String NAME = ADLConfKeys.ADL_WEBSDK_VERSION_KEY;

  /** Accepts any value that contains at least one character. */
  private static final StringParam.Domain DOMAIN =
      new StringParam.Domain(NAME, Pattern.compile(".+"));

  /**
   * Constructor.
   *
   * @param featureSetVersion information about the enabled feature set
   */
  public ADLVersionInfo(String featureSetVersion) {
    super(DOMAIN, featureSetVersion);
  }

  @Override
  public final String getName() {
    return NAME;
  }
}

View File

@ -0,0 +1,45 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package org.apache.hadoop.hdfs.web.resources;
/**
 * Boolean query parameter named {@code append}.
 *
 * NOTE(review): judging by the class name it marks ADL append requests that
 * must not go through the WebHDFS redirect step; confirm against the caller.
 */
public class AppendADLNoRedirectParam extends BooleanParam {
  /**
   * Parameter name.
   */
  public static final String NAME = "append";

  private static final Domain DOMAIN = new Domain(NAME);

  /**
   * Constructor.
   *
   * @param value the parameter value.
   */
  public AppendADLNoRedirectParam(final Boolean value) {
    super(DOMAIN, value);
  }

  /**
   * @return the parameter name, {@link #NAME}.
   */
  @Override
  public final String getName() {
    return NAME;
  }
}

View File

@ -0,0 +1,44 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.web.resources;
/**
 * Boolean query parameter named {@code write}.
 *
 * NOTE(review): judging by the class name it marks ADL create requests that
 * must not go through the WebHDFS redirect step; confirm against the caller.
 */
public class CreateADLNoRedirectParam extends BooleanParam {
  /**
   * Parameter name.
   */
  public static final String NAME = "write";

  private static final Domain DOMAIN = new Domain(NAME);

  /**
   * Constructor.
   *
   * @param value the parameter value.
   */
  public CreateADLNoRedirectParam(final Boolean value) {
    super(DOMAIN, value);
  }

  /**
   * @return the parameter name, {@link #NAME}.
   */
  @Override
  public final String getName() {
    return NAME;
  }
}

View File

@ -0,0 +1,53 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package org.apache.hadoop.hdfs.web.resources;
/**
 * Lease id query parameter supporting single writer semantics: it notifies
 * the ADL backend when the stream needs to be locked to protect it from
 * concurrent write operations on the same stream.
 *
 * Used in append operation.
 */
public class LeaseParam extends StringParam {
  /**
   * Parameter name.
   */
  public static final String NAME = "leaseId";

  /**
   * Default parameter value.
   */
  public static final String DEFAULT = NULL;

  /** No pattern is supplied, so values are not validated against a regex. */
  private static final StringParam.Domain DOMAIN =
      new StringParam.Domain(NAME, null);

  /**
   * Constructor. A {@code null} argument or one equal to {@link #DEFAULT}
   * is stored as {@code null}; any other string is kept as given.
   *
   * @param str a string representation of the parameter value.
   */
  public LeaseParam(final String str) {
    super(DOMAIN, (str != null && !str.equals(DEFAULT)) ? str : null);
  }

  @Override
  public final String getName() {
    return NAME;
  }
}

View File

@ -0,0 +1,44 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.web.resources;
/**
 * Boolean query parameter named {@code read}.
 *
 * NOTE(review): judging by the class name it marks ADL read requests that
 * must not go through the WebHDFS redirect step; confirm against the caller.
 */
public class ReadADLNoRedirectParam extends BooleanParam {
  /**
   * Parameter name.
   */
  public static final String NAME = "read";

  private static final Domain DOMAIN = new Domain(NAME);

  /**
   * Constructor.
   *
   * @param value the parameter value.
   */
  public ReadADLNoRedirectParam(final Boolean value) {
    super(DOMAIN, value);
  }

  /**
   * @return the parameter name, {@link #NAME}.
   */
  @Override
  public final String getName() {
    return NAME;
  }
}

View File

@ -0,0 +1,27 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
/**
* Extensions to {@link org.apache.hadoop.hdfs.web.resources} for reading or
* extending the query parameters of the webhdfs specification. ADL specific
* query parameters also go in this package.
*/
package org.apache.hadoop.hdfs.web.resources;

View File

@ -0,0 +1,219 @@
<!---
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. See accompanying LICENSE file.
-->
# Hadoop Azure Data Lake Support
* [Introduction](#Introduction)
* [Features](#Features)
* [Limitations](#Limitations)
* [Usage](#Usage)
* [Concepts](#Concepts)
* [Webhdfs Compliance](#Webhdfs_Specification_Compliance)
* [OAuth2 Support](#OAuth2_Support)
* [Read Ahead Buffer Management](#Read_Ahead_Buffer_Management)
* [Configuring Credentials & FileSystem](#Configuring_Credentials)
* [Accessing adl URLs](#Accessing_adl_URLs)
* [Testing the azure-datalake-store Module](#Testing_the_hadoop-azure_Module)
## <a name="Introduction" />Introduction
The hadoop-azure-datalake module provides support for integration with
[Azure Data Lake Store](https://azure.microsoft.com/en-in/documentation/services/data-lake-store/).
The jar file is named azure-datalake-store.jar.
## <a name="Features" />Features
* Read and write data stored in an Azure Data Lake Storage account.
* Partial support for [Webhdfs Specification 2.7.0](https://hadoop.apache.org/docs/r2.7.0/hadoop-project-dist/hadoop-hdfs/WebHDFS.html)
* Reference file system paths using URLs with the `adl` scheme for Secure Webhdfs, i.e. SSL
encrypted access.
* Can act as a source of data in a MapReduce job, or a sink.
* Tested on both Linux and Windows.
* Tested for scale.
## <a name="Limitations" />Limitations
Partial or no support for the following operations in [Webhdfs Specification 2.7.0](https://hadoop.apache.org/docs/r2.7.0/hadoop-project-dist/hadoop-hdfs/WebHDFS.html):
* Operation on Symbolic Link
* Proxy Users
* File Truncate
* File Checksum
* File replication factor
* Home Directory: partially supported, based on the OAuth2 token information and not on the active user of the Hadoop cluster.
* Extended Attributes(XAttrs) Operations
* Snapshot Operations
* Delegation Token Operations
* User and group information returned by ListStatus and GetFileStatus is in the form of the GUID associated with the user or group in Azure Active Directory.
## <a name="Usage" />Usage
### <a name="Concepts" />Concepts
Azure Data Lake Storage access path syntax is
adl://<Account Name>.azuredatalakestore.net/
Get started with an Azure Data Lake account at [https://azure.microsoft.com/en-in/documentation/articles/data-lake-store-get-started-portal/](https://azure.microsoft.com/en-in/documentation/articles/data-lake-store-get-started-portal/)
#### <a name="Webhdfs_Specification_Compliance" />Webhdfs Compliance
Azure Data Lake Storage exposes a public REST endpoint as per [Webhdfs Specification 2.7.0](https://hadoop.apache.org/docs/r2.7.0/hadoop-project-dist/hadoop-hdfs/WebHDFS.html) to access storage file system.
Syntax to access Azure data lake storage account over [Webhdfs Specification 2.7.0](https://hadoop.apache.org/docs/r2.7.0/hadoop-project-dist/hadoop-hdfs/WebHDFS.html) is
https://<Account Name>.azuredatalakestore.net/webhdfs/v1/<File System Path>?<Query parameters>
#### <a name="OAuth2_Support" />OAuth2 Support
Usage of Azure Data Lake Storage requires OAuth2 bearer token to be present as part of the HTTPS header as per OAuth2 specification. Valid OAuth2 bearer token should be obtained from Azure Active Directory for valid users who have access to Azure Data Lake Storage Account.
Azure Active Directory (Azure AD) is Microsoft's multi-tenant cloud based directory and identity management service. See [https://azure.microsoft.com/en-in/documentation/articles/active-directory-whatis/](https://azure.microsoft.com/en-in/documentation/articles/active-directory-whatis/)
The following sections describe the OAuth2 configuration in core-site.xml.
#### <a name="Read_Ahead_Buffer_Management" />Read Ahead Buffer Management
Azure Data Lake Storage offers high throughput. To maximize throughput, applications can use this feature to buffer data concurrently, in memory during read operation. This data is cached in memory per process per stream.
To enable or disable the read ahead feature:
<property>
<name>adl.feature.override.readahead</name>
<value>true</value>
<description>
Enables read aheads in the ADL client, the feature is used to improve read throughput.
This works in conjunction with the value set in adl.feature.override.readahead.max.buffersize.
When set to false the read ahead feature is turned off.
Default : True if not configured.
</description>
</property>
To configure read ahead buffer size.
<property>
<name>adl.feature.override.readahead.max.buffersize</name>
<value>8388608</value>
<description>
Define maximum buffer size to cache read ahead data, this is allocated per process to
cache read ahead data. Applicable only when adl.feature.override.readahead is set to true.
Default : 8388608 Byte i.e. 8MB if not configured.
</description>
</property>
To configure the number of concurrent connections to the Azure Data Lake Storage account:
<property>
<name>adl.feature.override.readahead.max.concurrent.connection</name>
<value>2</value>
<description>
Define maximum concurrent connection can be established to
read ahead. If the data size is<4MB then only 1 read n/w connection
is set. If the data size is >4MB but<8MB then 2 read n/w
connection
is set. Data >8MB then value set under the property would
take
effect. Applicable only when adl.feature.override.readahead is set
to true and buffer size is >8MB.
It is recommended to reset this property if the adl.feature.override.readahead.max.buffersize
is < 8MB to gain performance. Application has to consider
throttling
limit for the account as well before configuring large buffer size.
</description>
</property>
## <a name="Configuring_Credentials" />Configuring Credentials & FileSystem
Update core-site.xml for OAuth2 configuration
<property>
<name>dfs.webhdfs.oauth2.refresh.token.expires.ms.since.epoch</name>
<value>0</value>
</property>
<property>
<name>dfs.webhdfs.oauth2.credential</name>
<value>bearer.and.refresh.token</value>
</property>
<property>
<name>dfs.webhdfs.oauth2.access.token</name>
<value>NOT_SET</value>
</property>
<property>
<name>dfs.webhdfs.oauth2.refresh.url</name>
<value>https://login.windows.net/common/oauth2/token/</value>
</property>
<property>
<name>dfs.webhdfs.oauth2.access.token.provider</name>
<value>org.apache.hadoop.fs.adl.oauth2.CachedRefreshTokenBasedAccessTokenProvider</value>
</property>
Applications are required to set the client id and the OAuth2 refresh token obtained from Azure Active Directory for that client id. See [https://github.com/AzureAD/azure-activedirectory-library-for-java](https://github.com/AzureAD/azure-activedirectory-library-for-java).
**Do not share the client id and refresh token; they must be kept secret.**
<property>
<name>dfs.webhdfs.oauth2.client.id</name>
<value></value>
</property>
<property>
<name>dfs.webhdfs.oauth2.refresh.token</name>
<value></value>
</property>
For the ADL FileSystem to take effect, update core-site.xml with
<property>
<name>fs.adl.impl</name>
<value>org.apache.hadoop.fs.adl.AdlFileSystem</value>
</property>
<property>
<name>fs.AbstractFileSystem.adl.impl</name>
<value>org.apache.hadoop.fs.adl.Adl</value>
</property>
### <a name="Accessing_adl_URLs" />Accessing adl URLs
After credentials are configured in core-site.xml, any Hadoop component may
reference files in that Azure Data Lake Storage account by using URLs of the following
format:
adl://<Account Name>.azuredatalakestore.net/<path>
The `adl` scheme identifies a URL on a file system backed by Azure
Data Lake Storage. `adl` utilizes encrypted HTTPS access for all interaction with
the Azure Data Lake Storage API.
For example, the following
[FileSystem Shell](../hadoop-project-dist/hadoop-common/FileSystemShell.html)
commands demonstrate access to a storage account named `youraccount`.
> hadoop fs -mkdir adl://youraccount.azuredatalakestore.net/testDir
> hadoop fs -put testFile adl://youraccount.azuredatalakestore.net/testDir/testFile
> hadoop fs -cat adl://youraccount.azuredatalakestore.net/testDir/testFile
test file content
## <a name="Testing_the_hadoop-azure_Module" />Testing the azure-datalake-store Module
The hadoop-azure-datalake module includes a full suite of unit tests. Most of the tests will run without additional configuration by running `mvn test`. This includes tests against mocked storage, which is an in-memory emulation of Azure Data Lake Storage.
A selection of tests can run against the Azure Data Lake Storage. To run these tests against ADL storage, configure contract-test-options.xml with the ADL account information mentioned in the above sections. Also turn on the contract test execution flag to trigger tests against Azure Data Lake Storage.
<property>
<name>dfs.adl.test.contract.enable</name>
<value>true</value>
</property>

View File

@ -0,0 +1,147 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package org.apache.hadoop.fs.adl.oauth2;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.web.oauth2.AccessTokenProvider;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TestName;
import static org.junit.Assert.*;
import static org.mockito.Mockito.*;
import static org.apache.hadoop.hdfs.client.HdfsClientConfigKeys.OAUTH_CLIENT_ID_KEY;
import static org.apache.hadoop.hdfs.client.HdfsClientConfigKeys.OAUTH_REFRESH_URL_KEY;
import static org.apache.hadoop.hdfs.web.oauth2.ConfRefreshTokenBasedAccessTokenProvider.OAUTH_REFRESH_TOKEN_KEY;
/**
* Verify cache behavior of ConfRefreshTokenBasedAccessTokenProvider instances.
* <p>
* Each step builds a CachedRefreshTokenBasedAccessTokenProvider whose
* {@code newInstance()} is overridden, then calls {@code setConf(conf)}:
* a {@link MockProvider} hands back a Mockito mock and the step verifies that
* the mock received {@code setConf} (a new delegate was expected), while an
* anonymous subclass that fails inside {@code newInstance()} asserts that no
* new delegate is requested (a cached one was expected).
* NOTE(review): the provider under test is not visible here; the hit/miss
* semantics above are inferred from these assertions.
*/
public class TestCachedRefreshTokenBasedAccessTokenProvider {
// Configuration shared by every step of a test; recreated by initConfig().
private Configuration conf;
// Gives access to the running test's method name, used to build client ids.
@Rule public TestName name = new TestName();
/**
* Builds a client id from the current test method name and an index, so ids
* differ between test methods (presumably to keep one test's cached entries
* from matching another's - confirm against the provider's cache key).
*
* @param id index appended to the client id
* @return the test method name followed by {@code _clientID} and {@code id}
*/
String clientId(int id) {
return name.getMethodName() + "_clientID" + id;
}
/**
* Creates a fresh configuration holding a client id, a refresh token and a
* refresh URL before each test.
*/
@Before
public void initConfig() {
conf = new Configuration(false);
conf.set(OAUTH_CLIENT_ID_KEY, clientId(0));
conf.set(OAUTH_REFRESH_TOKEN_KEY, "01234567890abcdef");
conf.set(OAUTH_REFRESH_URL_KEY, "http://dingo.invalid:80");
}
/**
* Walks through four steps against the same client id: first configuration,
* repeated configuration (cache hit expected), configuration with
* FORCE_REFRESH enabled, and configuration with a different refresh URL
* (cache miss expected).
*/
@Test
public void testCacheInstance() throws Exception {
final AccessTokenProvider inst0 = mock(AccessTokenProvider.class);
when(inst0.getConf()).thenReturn(conf);
// verify config
CachedRefreshTokenBasedAccessTokenProvider t1 = new MockProvider(inst0);
t1.setConf(conf);
verify(inst0).setConf(any(Configuration.class)); // cloned, not exact match
// verify cache hit
// newInstance() fails the test, so setConf must not ask for a new delegate.
CachedRefreshTokenBasedAccessTokenProvider t2 =
new CachedRefreshTokenBasedAccessTokenProvider() {
@Override
AccessTokenProvider newInstance() {
fail("Failed to return cached instance");
return null;
}
};
t2.setConf(conf);
// verify force refresh
// With FORCE_REFRESH set, the delegate from newInstance() must be configured
// even though the same settings were configured above.
conf.setBoolean(
CachedRefreshTokenBasedAccessTokenProvider.FORCE_REFRESH, true);
final AccessTokenProvider inst1 = mock(AccessTokenProvider.class);
when(inst1.getConf()).thenReturn(conf);
CachedRefreshTokenBasedAccessTokenProvider t3 = new MockProvider(inst1);
t3.setConf(conf);
verify(inst1).setConf(any(Configuration.class));
// verify cache miss
// Changing the refresh URL must lead to a new delegate being configured.
// NOTE(review): FORCE_REFRESH is still true at this point, so this step does
// not isolate the URL change from the forced refresh - confirm intent.
conf.set(OAUTH_REFRESH_URL_KEY, "http://yak.invalid:80");
final AccessTokenProvider inst2 = mock(AccessTokenProvider.class);
when(inst2.getConf()).thenReturn(conf);
CachedRefreshTokenBasedAccessTokenProvider t4 = new MockProvider(inst2);
t4.setConf(conf);
verify(inst2).setConf(any(Configuration.class));
}
/**
* Fills the cache with MAX_PROVIDERS distinct client ids, checks that all of
* them are served from the cache, then adds one more id and expects the
* entry for client id 0 to be gone afterwards.
*/
@Test
public void testCacheLimit() throws Exception {
final int iter = CachedRefreshTokenBasedAccessTokenProvider.MAX_PROVIDERS;
// Populate: every new client id must configure the delegate it is given.
for (int i = 0; i < iter; ++i) {
conf.set(OAUTH_CLIENT_ID_KEY, clientId(i));
AccessTokenProvider inst = mock(AccessTokenProvider.class);
when(inst.getConf()).thenReturn(conf);
CachedRefreshTokenBasedAccessTokenProvider t = new MockProvider(inst);
t.setConf(conf);
verify(inst).setConf(any(Configuration.class));
}
// verify cache hit
// Revisit the ids in the same order; none may trigger newInstance().
for (int i = 0; i < iter; ++i) {
conf.set(OAUTH_CLIENT_ID_KEY, clientId(i));
CachedRefreshTokenBasedAccessTokenProvider t =
new CachedRefreshTokenBasedAccessTokenProvider() {
@Override
AccessTokenProvider newInstance() {
fail("Failed to return cached instance");
return null;
}
};
t.setConf(conf);
}
// verify miss, evict 0
// One id beyond MAX_PROVIDERS: a new delegate is configured for it.
conf.set(OAUTH_CLIENT_ID_KEY, clientId(iter));
final AccessTokenProvider inst = mock(AccessTokenProvider.class);
when(inst.getConf()).thenReturn(conf);
CachedRefreshTokenBasedAccessTokenProvider t = new MockProvider(inst);
t.setConf(conf);
verify(inst).setConf(any(Configuration.class));
// verify miss
// Client id 0 is expected to have been evicted by the step above, so it
// must be configured again through a new delegate.
conf.set(OAUTH_CLIENT_ID_KEY, clientId(0));
final AccessTokenProvider inst0 = mock(AccessTokenProvider.class);
when(inst0.getConf()).thenReturn(conf);
CachedRefreshTokenBasedAccessTokenProvider t0 = new MockProvider(inst0);
t0.setConf(conf);
verify(inst0).setConf(any(Configuration.class));
}
/**
* Provider whose {@code newInstance()} returns the AccessTokenProvider passed
* to the constructor, letting a test observe calls made on that delegate.
*/
static class MockProvider extends CachedRefreshTokenBasedAccessTokenProvider {
// Delegate returned from newInstance(); a Mockito mock in these tests.
private final AccessTokenProvider inst;
MockProvider(AccessTokenProvider inst) {
this.inst = inst;
}
@Override
AccessTokenProvider newInstance() {
return inst;
}
}
}

View File

@ -105,6 +105,12 @@
<artifactId>hadoop-sls</artifactId> <artifactId>hadoop-sls</artifactId>
<scope>compile</scope> <scope>compile</scope>
</dependency> </dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-azure-datalake</artifactId>
<scope>compile</scope>
<version>${project.version}</version>
</dependency>
</dependencies> </dependencies>
<build> <build>

View File

@ -46,6 +46,7 @@
<module>hadoop-azure</module> <module>hadoop-azure</module>
<module>hadoop-aws</module> <module>hadoop-aws</module>
<module>hadoop-kafka</module> <module>hadoop-kafka</module>
<module>hadoop-azure-datalake</module>
</modules> </modules>
<build> <build>