HADOOP-11009. Add Timestamp Preservation to DistCp (Gary Steelman via aw)

This commit is contained in:
Allen Wittenauer 2014-09-24 15:39:55 -07:00
parent 6e5194099c
commit ea1fb1461a
8 changed files with 990 additions and 110 deletions

View File

@ -204,6 +204,8 @@ Release 2.6.0 - UNRELEASED
HADOOP-11017. KMS delegation token secret manager should be able to use
zookeeper as store. (asuresh via tucu)
HADOOP-11009. Add Timestamp Preservation to DistCp (Gary Steelman via aw)
OPTIMIZATIONS
HADOOP-10838. Byte array native checksumming. (James Thomas via todd)

View File

@ -101,7 +101,7 @@ public final class CopyListingFileStatus extends FileStatus {
* @return Map<String, byte[]> containing all xAttrs
*/
public Map<String, byte[]> getXAttrs() {
return xAttrs;
return xAttrs != null ? xAttrs : Collections.<String, byte[]>emptyMap();
}
/**

View File

@ -37,18 +37,21 @@ public enum DistCpOptionSwitch {
/**
* Preserves status of file/path in the target.
* Default behavior with -p, is to preserve replication,
* block size, user, group, permission and checksum type on the target file.
* Note that when preserving checksum type, block size is also preserved.
* block size, user, group, permission, checksum type and timestamps on the
* target file. Note that when preserving checksum type, block size is also
* preserved.
*
* If any of the optional switches are present among rbugpc, then
* @see PRESERVE_STATUS_DEFAULT
*
* If any of the optional switches are present among rbugpcaxt, then
* only the corresponding file attribute is preserved.
*
*/
PRESERVE_STATUS(DistCpConstants.CONF_LABEL_PRESERVE_STATUS,
new Option("p", true, "preserve status (rbugpcax)(replication, " +
"block-size, user, group, permission, checksum-type, ACL, XATTR). " +
"If -p is specified with no <arg>, then preserves replication, " +
"block size, user, group, permission and checksum type." +
new Option("p", true, "preserve status (rbugpcaxt)(replication, " +
"block-size, user, group, permission, checksum-type, ACL, XATTR, " +
"timestamps). If -p is specified with no <arg>, then preserves " +
"replication, block size, user, group, permission, checksum type " +
"and timestamps. " +
"raw.* xattrs are preserved when both the source and destination " +
"paths are in the /.reserved/raw hierarchy (HDFS only). raw.* xattr" +
"preservation is independent of the -p flag." +
@ -166,7 +169,7 @@ public enum DistCpOptionSwitch {
BANDWIDTH(DistCpConstants.CONF_LABEL_BANDWIDTH_MB,
new Option("bandwidth", true, "Specify bandwidth per map in MB"));
static final String PRESERVE_STATUS_DEFAULT = "-prbugpc";
public static final String PRESERVE_STATUS_DEFAULT = "-prbugpct";
private final String confLabel;
private final Option option;

View File

@ -68,7 +68,7 @@ public class DistCpOptions {
private boolean targetPathExists = true;
public static enum FileAttribute{
REPLICATION, BLOCKSIZE, USER, GROUP, PERMISSION, CHECKSUMTYPE, ACL, XATTR;
REPLICATION, BLOCKSIZE, USER, GROUP, PERMISSION, CHECKSUMTYPE, ACL, XATTR, TIMES;
public static FileAttribute getAttribute(char symbol) {
for (FileAttribute attribute : values()) {

View File

@ -18,39 +18,39 @@
package org.apache.hadoop.tools.util;
import com.google.common.collect.Maps;
import java.io.IOException;
import java.net.InetAddress;
import java.net.URI;
import java.net.UnknownHostException;
import java.text.DecimalFormat;
import java.util.EnumSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileChecksum;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileChecksum;
import org.apache.hadoop.fs.XAttr;
import org.apache.hadoop.fs.permission.AclEntry;
import org.apache.hadoop.fs.permission.AclUtil;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.tools.CopyListing.AclsNotSupportedException;
import org.apache.hadoop.tools.CopyListing.XAttrsNotSupportedException;
import org.apache.hadoop.tools.CopyListingFileStatus;
import org.apache.hadoop.tools.DistCpOptions;
import org.apache.hadoop.tools.DistCpOptions.FileAttribute;
import org.apache.hadoop.tools.mapred.UniformSizeInputFormat;
import org.apache.hadoop.tools.CopyListing.AclsNotSupportedException;
import org.apache.hadoop.tools.DistCpOptions;
import org.apache.hadoop.mapreduce.InputFormat;
import java.io.IOException;
import java.util.EnumSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.text.DecimalFormat;
import java.net.URI;
import java.net.InetAddress;
import java.net.UnknownHostException;
import com.google.common.collect.Maps;
/**
* Utility functions used in DistCp.
@ -163,7 +163,7 @@ public class DistCpUtils {
}
/**
* Un packs preservation attribute string containing the first character of
* Unpacks preservation attribute string containing the first character of
* each preservation attribute back to a set of attributes to preserve
* @param attributes - Attribute string
* @return - Attribute set
@ -209,7 +209,7 @@ public class DistCpUtils {
if (!srcAcl.equals(targetAcl)) {
targetFS.setAcl(path, srcAcl);
}
// setAcl can't preserve sticky bit, so also call setPermission if needed.
// setAcl doesn't preserve sticky bit, so also call setPermission if needed.
if (srcFileStatus.getPermission().getStickyBit() !=
targetFileStatus.getPermission().getStickyBit()) {
targetFS.setPermission(path, srcFileStatus.getPermission());
@ -225,30 +225,28 @@ public class DistCpUtils {
Map<String, byte[]> srcXAttrs = srcFileStatus.getXAttrs();
Map<String, byte[]> targetXAttrs = getXAttrs(targetFS, path);
if (srcXAttrs != null && !srcXAttrs.equals(targetXAttrs)) {
Iterator<Entry<String, byte[]>> iter = srcXAttrs.entrySet().iterator();
while (iter.hasNext()) {
Entry<String, byte[]> entry = iter.next();
final String xattrName = entry.getKey();
for (Entry<String, byte[]> entry : srcXAttrs.entrySet()) {
String xattrName = entry.getKey();
if (xattrName.startsWith(rawNS) || preserveXAttrs) {
targetFS.setXAttr(path, entry.getKey(), entry.getValue());
targetFS.setXAttr(path, xattrName, entry.getValue());
}
}
}
}
if (attributes.contains(FileAttribute.REPLICATION) && ! targetFileStatus.isDirectory() &&
srcFileStatus.getReplication() != targetFileStatus.getReplication()) {
if (attributes.contains(FileAttribute.REPLICATION) && !targetFileStatus.isDirectory() &&
(srcFileStatus.getReplication() != targetFileStatus.getReplication())) {
targetFS.setReplication(path, srcFileStatus.getReplication());
}
if (attributes.contains(FileAttribute.GROUP) &&
!group.equals(srcFileStatus.getGroup())) {
!group.equals(srcFileStatus.getGroup())) {
group = srcFileStatus.getGroup();
chown = true;
}
if (attributes.contains(FileAttribute.USER) &&
!user.equals(srcFileStatus.getOwner())) {
!user.equals(srcFileStatus.getOwner())) {
user = srcFileStatus.getOwner();
chown = true;
}
@ -256,6 +254,12 @@ public class DistCpUtils {
if (chown) {
targetFS.setOwner(path, user, group);
}
if (attributes.contains(FileAttribute.TIMES)) {
targetFS.setTimes(path,
srcFileStatus.getModificationTime(),
srcFileStatus.getAccessTime());
}
}
/**

View File

@ -497,7 +497,7 @@ public class TestOptionsParser {
attribIterator.next();
i++;
}
Assert.assertEquals(i, 6);
Assert.assertEquals(i, DistCpOptionSwitch.PRESERVE_STATUS_DEFAULT.length() - 2);
try {
OptionsParser.parse(new String[] {

View File

@ -590,6 +590,7 @@ public class TestCopyMapper {
EnumSet.allOf(DistCpOptions.FileAttribute.class);
preserveStatus.remove(DistCpOptions.FileAttribute.ACL);
preserveStatus.remove(DistCpOptions.FileAttribute.XATTR);
preserveStatus.remove(DistCpOptions.FileAttribute.TIMES);
context.getConfiguration().set(DistCpConstants.CONF_LABEL_PRESERVE_STATUS,
DistCpUtils.packAttributes(preserveStatus));