Fix potential FSImage corruption. Contributed by Daryn Sharp and Vinayakumar B.

(cherry picked from commit f1996ccbaee734d423caa9d47a571cfff98ef42c)
This commit is contained in:
Xiao Chen 2018-10-24 15:49:27 -07:00
parent 4167275e89
commit 96cedb87b9
16 changed files with 443 additions and 256 deletions

View File

@ -26,6 +26,10 @@ import java.io.Serializable;
public class LongBitFormat implements Serializable {
private static final long serialVersionUID = 1L;
/**
 * Implemented by enums whose constants each describe one bit field, so that
 * the length of a field can be queried through a common type.
 */
public interface Enum {
/** @return the length of the bit field described by this constant. */
int getLength();
}
private final String NAME;
/** Bit offset */
private final int OFFSET;
@ -69,4 +73,8 @@ public class LongBitFormat implements Serializable {
public long getMin() {
return MIN;
}
/** @return the LENGTH of this bit format. */
public int getLength() {
return LENGTH;
}
}

View File

@ -794,6 +794,12 @@ public class DFSConfigKeys extends CommonConfigurationKeys {
public static final long DFS_IMAGE_TRANSFER_BOOTSTRAP_STANDBY_RATE_DEFAULT =
0; //no throttling
// String table in the fsimage utilizes an expanded bit range.
// Read by SerialNumberManager.initialize(): when true every serial number
// manager except GLOBAL gets its own serial map; when false (the default)
// all managers share the single GLOBAL serial map.
public static final String DFS_IMAGE_EXPANDED_STRING_TABLES_KEY =
"dfs.image.string-tables.expanded";
public static final boolean DFS_IMAGE_EXPANDED_STRING_TABLES_DEFAULT =
false;
// Image transfer timeout
public static final String DFS_IMAGE_TRANSFER_TIMEOUT_KEY = "dfs.image.transfer.timeout";
public static final int DFS_IMAGE_TRANSFER_TIMEOUT_DEFAULT = 60 * 1000;

View File

@ -31,25 +31,23 @@ import com.google.common.collect.ImmutableList;
/**
* Class to pack an AclEntry into an integer. <br>
* An ACL entry is represented by a 32-bit integer in Big Endian format. <br>
* The bits can be divided in four segments: <br>
* [0:1) || [1:3) || [3:6) || [6:7) || [7:32) <br>
* <br>
* [0:1) -- the scope of the entry (AclEntryScope) <br>
* [1:3) -- the type of the entry (AclEntryType) <br>
* [3:6) -- the permission of the entry (FsAction) <br>
* [6:7) -- A flag to indicate whether Named entry or not <br>
* [7:8) -- Reserved <br>
* [8:32) -- the name of the entry, which is an ID that points to a <br>
* string in the StringTableSection. <br>
*
* Note: this format is used both in-memory and on-disk. Changes will be
* incompatible.
*
*/
public enum AclEntryStatusFormat {
public enum AclEntryStatusFormat implements LongBitFormat.Enum {
SCOPE(null, 1),
TYPE(SCOPE.BITS, 2),
PERMISSION(TYPE.BITS, 3),
NAMED_ENTRY_CHECK(PERMISSION.BITS, 1),
RESERVED(NAMED_ENTRY_CHECK.BITS, 1),
NAME(RESERVED.BITS, 24);
PERMISSION(null, 3),
TYPE(PERMISSION.BITS, 2),
SCOPE(TYPE.BITS, 1),
NAME(SCOPE.BITS, 24);
private static final FsAction[] FSACTION_VALUES = FsAction.values();
private static final AclEntryScope[] ACL_ENTRY_SCOPE_VALUES =
AclEntryScope.values();
private static final AclEntryType[] ACL_ENTRY_TYPE_VALUES =
AclEntryType.values();
private final LongBitFormat BITS;
@ -59,30 +57,29 @@ public enum AclEntryStatusFormat {
static AclEntryScope getScope(int aclEntry) {
int ordinal = (int) SCOPE.BITS.retrieve(aclEntry);
return AclEntryScope.values()[ordinal];
return ACL_ENTRY_SCOPE_VALUES[ordinal];
}
static AclEntryType getType(int aclEntry) {
int ordinal = (int) TYPE.BITS.retrieve(aclEntry);
return AclEntryType.values()[ordinal];
return ACL_ENTRY_TYPE_VALUES[ordinal];
}
static FsAction getPermission(int aclEntry) {
int ordinal = (int) PERMISSION.BITS.retrieve(aclEntry);
return FsAction.values()[ordinal];
return FSACTION_VALUES[ordinal];
}
static String getName(int aclEntry) {
int nameExists = (int) NAMED_ENTRY_CHECK.BITS.retrieve(aclEntry);
if (nameExists == 0) {
return null;
return getName(aclEntry, null);
}
int id = (int) NAME.BITS.retrieve(aclEntry);
AclEntryType type = getType(aclEntry);
if (type == AclEntryType.USER) {
return SerialNumberManager.INSTANCE.getUser(id);
} else if (type == AclEntryType.GROUP) {
return SerialNumberManager.INSTANCE.getGroup(id);
static String getName(int aclEntry,
SerialNumberManager.StringTable stringTable) {
SerialNumberManager snm = getSerialNumberManager(getType(aclEntry));
if (snm != null) {
int nid = (int)NAME.BITS.retrieve(aclEntry);
return snm.getString(nid, stringTable);
}
return null;
}
@ -94,29 +91,26 @@ public enum AclEntryStatusFormat {
aclEntryInt = TYPE.BITS.combine(aclEntry.getType().ordinal(), aclEntryInt);
aclEntryInt = PERMISSION.BITS.combine(aclEntry.getPermission().ordinal(),
aclEntryInt);
if (aclEntry.getName() != null) {
aclEntryInt = NAMED_ENTRY_CHECK.BITS.combine(1, aclEntryInt);
if (aclEntry.getType() == AclEntryType.USER) {
int userId = SerialNumberManager.INSTANCE.getUserSerialNumber(aclEntry
.getName());
aclEntryInt = NAME.BITS.combine(userId, aclEntryInt);
} else if (aclEntry.getType() == AclEntryType.GROUP) {
int groupId = SerialNumberManager.INSTANCE
.getGroupSerialNumber(aclEntry.getName());
aclEntryInt = NAME.BITS.combine(groupId, aclEntryInt);
}
SerialNumberManager snm = getSerialNumberManager(aclEntry.getType());
if (snm != null) {
int nid = snm.getSerialNumber(aclEntry.getName());
aclEntryInt = NAME.BITS.combine(nid, aclEntryInt);
}
return (int) aclEntryInt;
}
static AclEntry toAclEntry(int aclEntry) {
AclEntry.Builder builder = new AclEntry.Builder();
builder.setScope(getScope(aclEntry)).setType(getType(aclEntry))
.setPermission(getPermission(aclEntry));
if (getName(aclEntry) != null) {
builder.setName(getName(aclEntry));
return toAclEntry(aclEntry, null);
}
return builder.build();
static AclEntry toAclEntry(int aclEntry,
SerialNumberManager.StringTable stringTable) {
return new AclEntry.Builder()
.setScope(getScope(aclEntry))
.setType(getType(aclEntry))
.setPermission(getPermission(aclEntry))
.setName(getName(aclEntry, stringTable))
.build();
}
public static int[] toInt(List<AclEntry> aclEntries) {
@ -127,12 +121,19 @@ public enum AclEntryStatusFormat {
return entries;
}
public static ImmutableList<AclEntry> toAclEntries(int[] entries) {
ImmutableList.Builder<AclEntry> b = new ImmutableList.Builder<AclEntry>();
for (int entry : entries) {
AclEntry aclEntry = toAclEntry(entry);
b.add(aclEntry);
}
return b.build();
/**
 * Pick the serial number manager that stores names for an ACL entry type.
 *
 * @param type the ACL entry type
 * @return {@link SerialNumberManager#USER} for USER entries,
 *         {@link SerialNumberManager#GROUP} for GROUP entries, and
 *         {@code null} for every other type (such entries carry no name id)
 */
private static SerialNumberManager getSerialNumberManager(AclEntryType type) {
  final SerialNumberManager manager;
  switch (type) {
  case USER:
    manager = SerialNumberManager.USER;
    break;
  case GROUP:
    manager = SerialNumberManager.GROUP;
    break;
  default:
    manager = null;
    break;
  }
  return manager;
}
/** @return the length reported by this field's {@link LongBitFormat}. */
@Override
public int getLength() {
return BITS.getLength();
}
}

View File

@ -265,6 +265,8 @@ public class FSDirectory implements Closeable {
};
FSDirectory(FSNamesystem ns, Configuration conf) throws IOException {
// used to enable/disable the use of expanded string tables.
SerialNumberManager.initialize(conf);
this.dirLock = new ReentrantReadWriteLock(true); // fair
this.inodeId = new INodeId();
rootDir = createRoot(ns);

View File

@ -31,10 +31,6 @@ import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.HadoopIllegalArgumentException;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.fs.permission.AclEntry;
import org.apache.hadoop.fs.permission.AclEntryScope;
import org.apache.hadoop.fs.permission.AclEntryType;
import org.apache.hadoop.fs.permission.FsAction;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.fs.permission.PermissionStatus;
import org.apache.hadoop.fs.StorageType;
import org.apache.hadoop.fs.XAttr;
@ -60,6 +56,8 @@ import org.apache.hadoop.hdfs.server.namenode.FsImageProto.INodeSection.XAttrCom
import org.apache.hadoop.hdfs.server.namenode.FsImageProto.INodeSection.XAttrFeatureProto;
import org.apache.hadoop.hdfs.server.namenode.FsImageProto.INodeSection.QuotaByStorageTypeEntryProto;
import org.apache.hadoop.hdfs.server.namenode.FsImageProto.INodeSection.QuotaByStorageTypeFeatureProto;
import org.apache.hadoop.hdfs.server.namenode.INodeWithAdditionalFields.PermissionStatusFormat;
import org.apache.hadoop.hdfs.server.namenode.SerialNumberManager.StringTable;
import org.apache.hadoop.hdfs.server.namenode.snapshot.Snapshot;
import org.apache.hadoop.hdfs.server.namenode.startupprogress.Phase;
import org.apache.hadoop.hdfs.server.namenode.startupprogress.StartupProgress;
@ -74,22 +72,11 @@ import com.google.protobuf.ByteString;
@InterfaceAudience.Private
public final class FSImageFormatPBINode {
private final static long USER_GROUP_STRID_MASK = (1 << 24) - 1;
private final static int USER_STRID_OFFSET = 40;
private final static int GROUP_STRID_OFFSET = 16;
public static final int ACL_ENTRY_NAME_MASK = (1 << 24) - 1;
public static final int ACL_ENTRY_NAME_OFFSET = 6;
public static final int ACL_ENTRY_TYPE_OFFSET = 3;
public static final int ACL_ENTRY_SCOPE_OFFSET = 5;
public static final int ACL_ENTRY_PERM_MASK = 7;
private static final int ACL_ENTRY_TYPE_MASK = 3;
private static final int ACL_ENTRY_SCOPE_MASK = 1;
private static final FsAction[] FSACTION_VALUES = FsAction.values();
private static final AclEntryScope[] ACL_ENTRY_SCOPE_VALUES = AclEntryScope
.values();
private static final AclEntryType[] ACL_ENTRY_TYPE_VALUES = AclEntryType
.values();
public static final int XATTR_NAMESPACE_MASK = 3;
public static final int XATTR_NAMESPACE_OFFSET = 30;
@ -100,55 +87,35 @@ public final class FSImageFormatPBINode {
public static final int XATTR_NAMESPACE_EXT_OFFSET = 5;
public static final int XATTR_NAMESPACE_EXT_MASK = 1;
private static final XAttr.NameSpace[] XATTR_NAMESPACE_VALUES =
XAttr.NameSpace.values();
private static final Log LOG = LogFactory.getLog(FSImageFormatPBINode.class);
// the loader must decode every serial-number-based field via the
// to<Item> methods, passing in the string table.
public final static class Loader {
public static PermissionStatus loadPermission(long id,
final String[] stringTable) {
short perm = (short) (id & ((1 << GROUP_STRID_OFFSET) - 1));
int gsid = (int) ((id >> GROUP_STRID_OFFSET) & USER_GROUP_STRID_MASK);
int usid = (int) ((id >> USER_STRID_OFFSET) & USER_GROUP_STRID_MASK);
return new PermissionStatus(stringTable[usid], stringTable[gsid],
new FsPermission(perm));
final StringTable stringTable) {
return PermissionStatusFormat.toPermissionStatus(id, stringTable);
}
public static ImmutableList<AclEntry> loadAclEntries(
AclFeatureProto proto, final String[] stringTable) {
AclFeatureProto proto, final StringTable stringTable) {
ImmutableList.Builder<AclEntry> b = ImmutableList.builder();
for (int v : proto.getEntriesList()) {
int p = v & ACL_ENTRY_PERM_MASK;
int t = (v >> ACL_ENTRY_TYPE_OFFSET) & ACL_ENTRY_TYPE_MASK;
int s = (v >> ACL_ENTRY_SCOPE_OFFSET) & ACL_ENTRY_SCOPE_MASK;
int nid = (v >> ACL_ENTRY_NAME_OFFSET) & ACL_ENTRY_NAME_MASK;
String name = stringTable[nid];
b.add(new AclEntry.Builder().setName(name)
.setPermission(FSACTION_VALUES[p])
.setScope(ACL_ENTRY_SCOPE_VALUES[s])
.setType(ACL_ENTRY_TYPE_VALUES[t]).build());
b.add(AclEntryStatusFormat.toAclEntry(v, stringTable));
}
return b.build();
}
public static List<XAttr> loadXAttrs(
XAttrFeatureProto proto, final String[] stringTable) {
XAttrFeatureProto proto, final StringTable stringTable) {
List<XAttr> b = new ArrayList<>();
for (XAttrCompactProto xAttrCompactProto : proto.getXAttrsList()) {
int v = xAttrCompactProto.getName();
int nid = (v >> XATTR_NAME_OFFSET) & XATTR_NAME_MASK;
int ns = (v >> XATTR_NAMESPACE_OFFSET) & XATTR_NAMESPACE_MASK;
ns |=
((v >> XATTR_NAMESPACE_EXT_OFFSET) & XATTR_NAMESPACE_EXT_MASK) << 2;
String name = stringTable[nid];
byte[] value = null;
if (xAttrCompactProto.getValue() != null) {
value = xAttrCompactProto.getValue().toByteArray();
}
b.add(new XAttr.Builder().setNameSpace(XATTR_NAMESPACE_VALUES[ns])
.setName(name).setValue(value).build());
b.add(XAttrFormat.toXAttr(v, value, stringTable));
}
return b;
@ -438,46 +405,30 @@ public final class FSImageFormatPBINode {
}
}
// the saver can directly write out fields referencing serial numbers.
// the serial number maps will be compacted when loading.
public final static class Saver {
private long numImageErrors;
private static long buildPermissionStatus(INodeAttributes n,
final SaverContext.DeduplicationMap<String> stringMap) {
long userId = stringMap.getId(n.getUserName());
long groupId = stringMap.getId(n.getGroupName());
return ((userId & USER_GROUP_STRID_MASK) << USER_STRID_OFFSET)
| ((groupId & USER_GROUP_STRID_MASK) << GROUP_STRID_OFFSET)
| n.getFsPermissionShort();
private static long buildPermissionStatus(INodeAttributes n) {
return n.getPermissionLong();
}
private static AclFeatureProto.Builder buildAclEntries(AclFeature f,
final SaverContext.DeduplicationMap<String> map) {
private static AclFeatureProto.Builder buildAclEntries(AclFeature f) {
AclFeatureProto.Builder b = AclFeatureProto.newBuilder();
for (int pos = 0, e; pos < f.getEntriesSize(); pos++) {
e = f.getEntryAt(pos);
int nameId = map.getId(AclEntryStatusFormat.getName(e));
int v = ((nameId & ACL_ENTRY_NAME_MASK) << ACL_ENTRY_NAME_OFFSET)
| (AclEntryStatusFormat.getType(e).ordinal() << ACL_ENTRY_TYPE_OFFSET)
| (AclEntryStatusFormat.getScope(e).ordinal() << ACL_ENTRY_SCOPE_OFFSET)
| (AclEntryStatusFormat.getPermission(e).ordinal());
b.addEntries(v);
b.addEntries(e);
}
return b;
}
private static XAttrFeatureProto.Builder buildXAttrs(XAttrFeature f,
final SaverContext.DeduplicationMap<String> stringMap) {
private static XAttrFeatureProto.Builder buildXAttrs(XAttrFeature f) {
XAttrFeatureProto.Builder b = XAttrFeatureProto.newBuilder();
for (XAttr a : f.getXAttrs()) {
XAttrCompactProto.Builder xAttrCompactBuilder = XAttrCompactProto.
newBuilder();
int nsOrd = a.getNameSpace().ordinal();
Preconditions.checkArgument(nsOrd < 8, "Too many namespaces.");
int v = ((nsOrd & XATTR_NAMESPACE_MASK) << XATTR_NAMESPACE_OFFSET)
| ((stringMap.getId(a.getName()) & XATTR_NAME_MASK) <<
XATTR_NAME_OFFSET);
v |= (((nsOrd >> 2) & XATTR_NAMESPACE_EXT_MASK) <<
XATTR_NAMESPACE_EXT_OFFSET);
int v = XAttrFormat.toInt(a);
xAttrCompactBuilder.setName(v);
if (a.getValue() != null) {
xAttrCompactBuilder.setValue(PBHelperClient.getByteString(a.getValue()));
@ -509,7 +460,7 @@ public final class FSImageFormatPBINode {
INodeSection.INodeFile.Builder b = INodeSection.INodeFile.newBuilder()
.setAccessTime(file.getAccessTime())
.setModificationTime(file.getModificationTime())
.setPermission(buildPermissionStatus(file, state.getStringMap()))
.setPermission(buildPermissionStatus(file))
.setPreferredBlockSize(file.getPreferredBlockSize())
.setStoragePolicyID(file.getLocalStoragePolicyID())
.setBlockType(PBHelperClient.convert(file.getBlockType()));
@ -522,11 +473,11 @@ public final class FSImageFormatPBINode {
AclFeature f = file.getAclFeature();
if (f != null) {
b.setAcl(buildAclEntries(f, state.getStringMap()));
b.setAcl(buildAclEntries(f));
}
XAttrFeature xAttrFeature = file.getXAttrFeature();
if (xAttrFeature != null) {
b.setXAttrs(buildXAttrs(xAttrFeature, state.getStringMap()));
b.setXAttrs(buildXAttrs(xAttrFeature));
}
return b;
}
@ -538,7 +489,7 @@ public final class FSImageFormatPBINode {
.newBuilder().setModificationTime(dir.getModificationTime())
.setNsQuota(quota.getNameSpace())
.setDsQuota(quota.getStorageSpace())
.setPermission(buildPermissionStatus(dir, state.getStringMap()));
.setPermission(buildPermissionStatus(dir));
if (quota.getTypeSpaces().anyGreaterOrEqual(0)) {
b.setTypeQuotas(buildQuotaByStorageTypeEntries(quota));
@ -546,11 +497,11 @@ public final class FSImageFormatPBINode {
AclFeature f = dir.getAclFeature();
if (f != null) {
b.setAcl(buildAclEntries(f, state.getStringMap()));
b.setAcl(buildAclEntries(f));
}
XAttrFeature xAttrFeature = dir.getXAttrFeature();
if (xAttrFeature != null) {
b.setXAttrs(buildXAttrs(xAttrFeature, state.getStringMap()));
b.setXAttrs(buildXAttrs(xAttrFeature));
}
return b;
}
@ -711,7 +662,7 @@ public final class FSImageFormatPBINode {
SaverContext state = parent.getSaverContext();
INodeSection.INodeSymlink.Builder b = INodeSection.INodeSymlink
.newBuilder()
.setPermission(buildPermissionStatus(n, state.getStringMap()))
.setPermission(buildPermissionStatus(n))
.setTarget(ByteString.copyFrom(n.getSymlink()))
.setModificationTime(n.getModificationTime())
.setAccessTime(n.getAccessTime());

View File

@ -85,10 +85,10 @@ public final class FSImageFormatProtobuf {
.getLogger(FSImageFormatProtobuf.class);
public static final class LoaderContext {
private String[] stringTable;
private SerialNumberManager.StringTable stringTable;
private final ArrayList<INodeReference> refList = Lists.newArrayList();
public String[] getStringTable() {
public SerialNumberManager.StringTable getStringTable() {
return stringTable;
}
@ -129,13 +129,6 @@ public final class FSImageFormatProtobuf {
}
private final ArrayList<INodeReference> refList = Lists.newArrayList();
private final DeduplicationMap<String> stringMap = DeduplicationMap
.newMap();
public DeduplicationMap<String> getStringMap() {
return stringMap;
}
public ArrayList<INodeReference> getRefList() {
return refList;
}
@ -327,11 +320,12 @@ public final class FSImageFormatProtobuf {
private void loadStringTableSection(InputStream in) throws IOException {
StringTableSection s = StringTableSection.parseDelimitedFrom(in);
ctx.stringTable = new String[s.getNumEntry() + 1];
ctx.stringTable =
SerialNumberManager.newStringTable(s.getNumEntry(), s.getMaskBits());
for (int i = 0; i < s.getNumEntry(); ++i) {
StringTableSection.Entry e = StringTableSection.Entry
.parseDelimitedFrom(in);
ctx.stringTable[e.getId()] = e.getStr();
ctx.stringTable.put(e.getId(), e.getStr());
}
}
@ -651,12 +645,16 @@ public final class FSImageFormatProtobuf {
private void saveStringTableSection(FileSummary.Builder summary)
throws IOException {
OutputStream out = sectionOutputStream;
SerialNumberManager.StringTable stringTable =
SerialNumberManager.getStringTable();
StringTableSection.Builder b = StringTableSection.newBuilder()
.setNumEntry(saverContext.stringMap.size());
.setNumEntry(stringTable.size())
.setMaskBits(stringTable.getMaskBits());
b.build().writeDelimitedTo(out);
for (Entry<String, Integer> e : saverContext.stringMap.entrySet()) {
for (Entry<Integer, String> e : stringTable) {
StringTableSection.Entry.Builder eb = StringTableSection.Entry
.newBuilder().setId(e.getValue()).setStr(e.getKey());
.newBuilder().setId(e.getKey()).setStr(e.getValue());
eb.build().writeDelimitedTo(out);
}
commitSection(summary, SectionName.STRING_TABLE);

View File

@ -33,7 +33,9 @@ import com.google.common.base.Preconditions;
@InterfaceAudience.Private
public abstract class INodeWithAdditionalFields extends INode
implements LinkedElement {
enum PermissionStatusFormat {
// Note: this format is used both in-memory and on-disk. Changes will be
// incompatible.
enum PermissionStatusFormat implements LongBitFormat.Enum {
MODE(null, 16),
GROUP(MODE.BITS, 24),
USER(GROUP.BITS, 24);
@ -46,12 +48,14 @@ public abstract class INodeWithAdditionalFields extends INode
static String getUser(long permission) {
final int n = (int)USER.BITS.retrieve(permission);
return SerialNumberManager.INSTANCE.getUser(n);
String s = SerialNumberManager.USER.getString(n);
assert s != null;
return s;
}
static String getGroup(long permission) {
final int n = (int)GROUP.BITS.retrieve(permission);
return SerialNumberManager.INSTANCE.getGroup(n);
return SerialNumberManager.GROUP.getString(n);
}
static short getMode(long permission) {
@ -61,16 +65,34 @@ public abstract class INodeWithAdditionalFields extends INode
/** Encode the {@link PermissionStatus} to a long. */
static long toLong(PermissionStatus ps) {
long permission = 0L;
final int user = SerialNumberManager.INSTANCE.getUserSerialNumber(
final int user = SerialNumberManager.USER.getSerialNumber(
ps.getUserName());
assert user != 0;
permission = USER.BITS.combine(user, permission);
final int group = SerialNumberManager.INSTANCE.getGroupSerialNumber(
// ideally should assert on group but inodes are created with null
// group and then updated only when added to a directory.
final int group = SerialNumberManager.GROUP.getSerialNumber(
ps.getGroupName());
permission = GROUP.BITS.combine(group, permission);
final int mode = ps.getPermission().toShort();
permission = MODE.BITS.combine(mode, permission);
return permission;
}
/**
 * Decode a packed permission long into a {@link PermissionStatus}.
 *
 * @param id the packed permission value holding the user id, group id and
 *           mode fields
 * @param stringTable table used to resolve the user and group ids; when
 *           {@code null} the ids are resolved by the USER and GROUP serial
 *           number managers themselves
 * @return the decoded user name, group name and permission
 */
static PermissionStatus toPermissionStatus(long id,
    SerialNumberManager.StringTable stringTable) {
  final int userSerial = (int) USER.BITS.retrieve(id);
  final int groupSerial = (int) GROUP.BITS.retrieve(id);
  final String userName =
      SerialNumberManager.USER.getString(userSerial, stringTable);
  final String groupName =
      SerialNumberManager.GROUP.getString(groupSerial, stringTable);
  final FsPermission mode = new FsPermission(getMode(id));
  return new PermissionStatus(userName, groupName, mode);
}
/** @return the length reported by this field's {@link LongBitFormat}. */
@Override
public int getLength() {
return BITS.getLength();
}
}
/** The inode id. */
@ -175,7 +197,7 @@ public abstract class INodeWithAdditionalFields extends INode
@Override
final void setUser(String user) {
int n = SerialNumberManager.INSTANCE.getUserSerialNumber(user);
int n = SerialNumberManager.USER.getSerialNumber(user);
updatePermissionStatus(PermissionStatusFormat.USER, n);
}
@ -189,7 +211,7 @@ public abstract class INodeWithAdditionalFields extends INode
@Override
final void setGroup(String group) {
int n = SerialNumberManager.INSTANCE.getGroupSerialNumber(group);
int n = SerialNumberManager.GROUP.getSerialNumber(group);
updatePermissionStatus(PermissionStatusFormat.GROUP, n);
}

View File

@ -17,23 +17,195 @@
*/
package org.apache.hadoop.hdfs.server.namenode;
/** Manage name-to-serial-number maps for users and groups. */
class SerialNumberManager {
/** This is the only instance of {@link SerialNumberManager}.*/
static final SerialNumberManager INSTANCE = new SerialNumberManager();
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.server.namenode.INodeWithAdditionalFields.PermissionStatusFormat;
import org.apache.hadoop.hdfs.util.LongBitFormat;
private final SerialNumberMap<String> usermap = new SerialNumberMap<String>();
private final SerialNumberMap<String> groupmap = new SerialNumberMap<String>();
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
private SerialNumberManager() {}
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_IMAGE_EXPANDED_STRING_TABLES_DEFAULT;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_IMAGE_EXPANDED_STRING_TABLES_KEY;
int getUserSerialNumber(String u) {return usermap.get(u);}
int getGroupSerialNumber(String g) {return groupmap.get(g);}
String getUser(int n) {return usermap.get(n);}
String getGroup(int n) {return groupmap.get(n);}
/** Manage name-to-serial-number maps for various string tables. */
public enum SerialNumberManager {
GLOBAL(), // NEVER EVER directly access!
USER(PermissionStatusFormat.USER, AclEntryStatusFormat.NAME),
GROUP(PermissionStatusFormat.GROUP, AclEntryStatusFormat.NAME),
XATTR(XAttrFormat.NAME);
{
getUserSerialNumber(null);
getGroupSerialNumber(null);
private static final SerialNumberManager[] values = values();
private static final int maxEntryBits;
private static final int maxEntryNumber;
private static final int maskBits;
private static boolean initialized;
private SerialNumberMap<String> serialMap;
private int bitLength = Integer.SIZE;
private boolean enabled;
// Class initialization: derive the id/mask bit split from the number of
// managers, clamp every manager's bit length, then install the default
// serial maps.
static {
// leading-zero count of the number of managers; used below as the upper
// bound on any manager's bit length.
maxEntryBits = Integer.numberOfLeadingZeros(values.length);
// largest value representable in maxEntryBits bits.
maxEntryNumber = (1 << maxEntryBits) - 1;
// the remaining high-order bits of an int; getMask() places the manager's
// ordinal there.
maskBits = Integer.SIZE - maxEntryBits;
for (SerialNumberManager snm : values) {
// account for string table mask bits.
snm.updateLength(maxEntryBits);
// find max allowed length in case global is enabled.
// (updateLength keeps the minimum, so GLOBAL ends up with the smallest
// bit length of all managers.)
GLOBAL.updateLength(snm.getLength());
}
// can reinitialize once later.
initializeSerialMaps(DFS_IMAGE_EXPANDED_STRING_TABLES_DEFAULT);
}
/**
 * Configure the serial maps from the expanded string table setting.
 * The first call installs the serial maps and logs every enabled manager;
 * later calls only verify that the requested mode matches the installed one.
 *
 * @param conf configuration supplying the expanded string table flag
 * @throws IllegalStateException if already initialized with the other mode
 */
static synchronized void initialize(Configuration conf) {
  final boolean expanded = conf.getBoolean(
      DFS_IMAGE_EXPANDED_STRING_TABLES_KEY,
      DFS_IMAGE_EXPANDED_STRING_TABLES_DEFAULT);
  if (initialized) {
    // GLOBAL is enabled exactly when expanded tables are NOT in use, so the
    // two flags being equal means the caller asked for the other mode.
    if (expanded == GLOBAL.enabled) {
      throw new IllegalStateException("Cannot change serial maps");
    }
    return;
  }
  initializeSerialMaps(expanded);
  for (SerialNumberManager manager : values) {
    if (!manager.enabled) {
      continue;
    }
    FSDirectory.LOG.info(manager + " serial map: bits=" + manager.getLength() +
        " maxEntries=" + manager.serialMap.getMax());
  }
  initialized = true;
}
/**
 * Install the serial maps for every manager.
 *
 * @param useExpanded when {@code true} every manager except GLOBAL is
 *        enabled with its own map and GLOBAL gets none; when {@code false}
 *        only GLOBAL is enabled and all managers share one map sized for
 *        GLOBAL
 */
private static void initializeSerialMaps(boolean useExpanded) {
  if (!useExpanded) {
    // every manager resolves through the one map owned by GLOBAL.
    final SerialNumberMap<String> shared = new SerialNumberMap<>(GLOBAL);
    for (SerialNumberManager manager : values) {
      manager.enabled = (manager == GLOBAL);
      manager.serialMap = shared;
    }
    return;
  }
  // one private map per manager; GLOBAL is switched off.
  for (SerialNumberManager manager : values) {
    final boolean perManager = (manager != GLOBAL);
    manager.enabled = perManager;
    manager.serialMap =
        perManager ? new SerialNumberMap<String>(manager) : null;
  }
}
/**
 * @param elements the bit fields that store serial numbers issued by this
 *        manager; the manager's bit length becomes the smallest of their
 *        lengths (it stays at Integer.SIZE when none are given).
 */
SerialNumberManager(LongBitFormat.Enum... elements) {
// compute the smallest bit length registered with the serial manager.
for (LongBitFormat.Enum element : elements) {
updateLength(element.getLength());
}
}
/** @return the current bit length of this manager. */
int getLength() {
return bitLength;
}
/**
 * Lower this manager's bit length to maxLength if it is currently larger;
 * the bit length never grows.
 */
private void updateLength(int maxLength) {
bitLength = Math.min(bitLength, maxLength);
}
/**
 * @param u the string to look up
 * @return the value returned by this manager's serial map for u
 */
public int getSerialNumber(String u) {
return serialMap.get(u);
}
/**
 * @param id a serial number
 * @return the value returned by this manager's serial map for id
 */
public String getString(int id) {
return serialMap.get(id);
}
/**
 * Resolve a serial number, preferring an explicit string table.
 *
 * @param id the serial number to resolve
 * @param stringTable table to resolve against, or {@code null} to use this
 *        manager's own serial map
 * @return the string for the id
 */
public String getString(int id, StringTable stringTable) {
  if (stringTable == null) {
    return getString(id);
  }
  return stringTable.get(this, id);
}
/**
 * Compute the tag that marks an id as belonging to this manager: the
 * manager's ordinal placed in the top {@code bits} bits of an int.
 *
 * @param bits number of high-order bits reserved for the tag; 0 means ids
 *        are not tagged
 * @return the tag to OR into an id, or 0 when {@code bits} is 0
 */
private int getMask(int bits) {
  // Java only uses the low five bits of an int shift distance, so shifting
  // by Integer.SIZE is a no-op and would leak the raw ordinal into the id.
  if (bits == 0) {
    return 0;
  }
  return ordinal() << (Integer.SIZE - bits);
}
/**
 * @return 0 while GLOBAL is enabled, otherwise maskBits.
 */
private static int getMaskBits() {
return GLOBAL.enabled ? 0 : maskBits;
}
/**
 * @return the size of this manager's serial map, or 0 when this manager is
 *         not enabled.
 */
private int size() {
return enabled ? serialMap.size() : 0;
}
/**
 * @return the entries of this manager's serial map, or an empty set when
 *         this manager is not enabled.
 */
private Iterable<Entry<Integer, String>> entrySet() {
if (!enabled) {
return Collections.emptySet();
}
return serialMap.entrySet();
}
/**
 * Build a string table holding the current entries of every manager, for
 * use when saving. Each id is OR-ed with its manager's mask for the current
 * mask bit count before being stored.
 *
 * @return a new table populated from all managers
 */
public static StringTable getStringTable() {
  // the summed sizes are only a capacity hint for the backing map.
  int capacity = 0;
  for (final SerialNumberManager manager : values) {
    capacity += manager.size();
  }
  final int bits = getMaskBits();
  final StringTable snapshot = new StringTable(capacity, bits);
  for (final SerialNumberManager manager : values) {
    final int managerMask = manager.getMask(bits);
    for (Entry<Integer, String> e : manager.entrySet()) {
      snapshot.put(e.getKey() | managerMask, e.getValue());
    }
  }
  return snapshot;
}
/**
 * Create an empty string table to be filled while loading.
 *
 * @param size capacity hint for the table
 * @param bits number of mask bits used by the ids that will be stored
 * @return a new, empty table
 * @throws IllegalArgumentException if {@code bits} exceeds the mask bits
 *         supported by this class
 */
public static StringTable newStringTable(int size, int bits) {
  if (bits <= maskBits) {
    return new StringTable(size, bits);
  }
  throw new IllegalArgumentException(
      "String table bits " + bits + " > " + maskBits);
}
/**
 * A mapping from id to string, created either as a snapshot of the serial
 * maps (see getStringTable) or empty for loading (see newStringTable).
 * When the table was built with a non-zero mask bit count, ids are stored
 * tagged with the mask of the manager that owns them.
 */
public static class StringTable implements Iterable<Entry<Integer, String>> {
  // number of high-order id bits holding the owning manager's mask;
  // 0 means ids are stored untagged.
  private final int tableMaskBits;
  private final Map<Integer,String> map;

  private StringTable(int size, int loadingMaskBits) {
    this.tableMaskBits = loadingMaskBits;
    this.map = new HashMap<>(size);
  }

  /**
   * Look up the string for an untagged id issued by the given manager.
   *
   * @param snm the manager the id belongs to
   * @param id the untagged id
   * @return the mapped string, or {@code null} if the table has no entry
   * @throws IllegalStateException if the table is tagged and the id exceeds
   *         the largest allowed entry number
   */
  private String get(SerialNumberManager snm, int id) {
    if (tableMaskBits != 0) {
      if (id > maxEntryNumber) {
        throw new IllegalStateException(
            "serial id " + id + " > " + maxEntryNumber);
      }
      // apply the manager's tag so the key matches what was stored.
      id |= snm.getMask(tableMaskBits);
    }
    return map.get(id);
  }

  /**
   * Store an entry; the id is used as the key exactly as given.
   *
   * @param id the key, already tagged if this table uses mask bits
   * @param str the string to store
   */
  public void put(int id, String str) {
    map.put(id, str);
  }

  /** @return an iterator over the stored (id, string) entries. */
  @Override
  public Iterator<Entry<Integer, String>> iterator() {
    return map.entrySet().iterator();
  }

  /** @return the number of entries in the table. */
  public int size() {
    return map.size();
  }

  /** @return the mask bit count this table was created with. */
  public int getMaskBits() {
    return tableMaskBits;
  }
}
}

View File

@ -17,6 +17,10 @@
*/
package org.apache.hadoop.hdfs.server.namenode;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.atomic.AtomicInteger;
@ -35,21 +39,33 @@ import org.apache.hadoop.classification.InterfaceAudience;
*/
@InterfaceAudience.Private
public class SerialNumberMap<T> {
private final AtomicInteger max = new AtomicInteger(1);
private String name;
private final int max;
private final AtomicInteger current = new AtomicInteger(1);
private final ConcurrentMap<T, Integer> t2i =
new ConcurrentHashMap<T, Integer>();
private final ConcurrentMap<Integer, T> i2t =
new ConcurrentHashMap<Integer, T>();
/**
 * Create a map named after the given manager and sized by its bit length.
 */
SerialNumberMap(SerialNumberManager snm) {
this(snm.name(), snm.getLength());
}
/**
 * @param name label stored for this map
 * @param bitLength bit width of a serial number; max is set to
 *        (1 << bitLength) - 1
 */
SerialNumberMap(String name, int bitLength) {
this.name = name;
// NOTE: Java uses only the low five bits of an int shift distance, so a
// bitLength of 32 would yield max == 0.
this.max = (1 << bitLength) - 1;
}
public int get(T t) {
if (t == null) {
return 0;
}
Integer sn = t2i.get(t);
if (sn == null) {
sn = max.getAndIncrement();
if (sn < 0) {
throw new IllegalStateException("Too many elements!");
sn = current.getAndIncrement();
if (sn > max) {
current.getAndDecrement();
throw new IllegalStateException(name + ": serial number map is full");
}
Integer old = t2i.putIfAbsent(t, sn);
if (old != null) {
@ -66,14 +82,27 @@ public class SerialNumberMap<T> {
}
T t = i2t.get(i);
if (t == null) {
throw new IllegalStateException("!i2t.containsKey(" + i
+ "), this=" + this);
throw new IllegalStateException(
name + ": serial number " + i + " does not exist");
}
return t;
}
/** @return the max value computed from the bit length at construction. */
int getMax() {
return max;
}
/**
 * @return a new HashSet populated from the entries of the
 *         serial-number-to-value map (i2t).
 */
Set<Map.Entry<Integer, T>> entrySet() {
return new HashSet<>(i2t.entrySet());
}
/** @return the number of entries in the serial-number-to-value map. */
public int size() {
return i2t.size();
}
@Override
public String toString() {
return "max=" + max + ",\n t2i=" + t2i + ",\n i2t=" + i2t;
return "current=" + current + ",\n" +
"max=" + max + ",\n t2i=" + t2i + ",\n i2t=" + i2t;
}
}

View File

@ -31,52 +31,66 @@ import org.apache.hadoop.hdfs.util.LongBitFormat;
/**
* Class to pack XAttrs into byte[].<br>
* For each XAttr:<br>
* The first 4 bytes represents XAttr namespace and name<br>
* [0:3) - XAttr namespace<br>
* [3:8) - Reserved<br>
* [8:32) - The name of the entry, which is an ID that points to a
* string in map<br>
* The following two bytes represents the length of XAttr value<br>
* The remaining bytes is the XAttr value<br>
*
* Note: this format is used both in-memory and on-disk. Changes will be
* incompatible.
*
*/
class XAttrFormat {
private enum XAttrStatusFormat {
NAMESPACE(null, 3),
RESERVED(NAMESPACE.BITS, 5),
NAME(RESERVED.BITS, 24);
public enum XAttrFormat implements LongBitFormat.Enum {
RESERVED(null, 5),
NS_EXT(RESERVED.BITS, 1),
NAME(NS_EXT.BITS, 24),
NS(NAME.BITS, 2);
private static final int NS_EXT_SHIFT = NS.BITS.getLength();
private static final int NS_MASK = (1 << NS_EXT_SHIFT) - 1;
private static final int XATTR_VALUE_LEN_MAX = 1 << 16;
private static final XAttr.NameSpace[] XATTR_NAMESPACE_VALUES =
XAttr.NameSpace.values();
private final LongBitFormat BITS;
XAttrStatusFormat(LongBitFormat previous, int length) {
XAttrFormat(LongBitFormat previous, int length) {
BITS = new LongBitFormat(name(), previous, length, 0);
}
static XAttr.NameSpace getNamespace(int xattrStatus) {
int ordinal = (int) NAMESPACE.BITS.retrieve(xattrStatus);
return XAttr.NameSpace.values()[ordinal];
/** @return the length reported by this field's {@link LongBitFormat}. */
@Override
public int getLength() {
return BITS.getLength();
}
static String getName(int xattrStatus) {
int id = (int) NAME.BITS.retrieve(xattrStatus);
return XAttrStorage.getName(id);
/**
 * Decode the namespace from a packed XAttr record. The namespace ordinal is
 * split across two fields: NS holds the low bits and NS_EXT supplies the
 * bit directly above them.
 *
 * @param record the packed XAttr record
 * @return the namespace whose ordinal is stored in the record
 */
static XAttr.NameSpace getNamespace(int record) {
  final long lowBits = NS.BITS.retrieve(record);
  final long extBits = NS_EXT.BITS.retrieve(record);
  final int nsOrdinal = (int) (lowBits | (extBits << NS_EXT_SHIFT));
  return XATTR_NAMESPACE_VALUES[nsOrdinal];
}
static int toInt(XAttr.NameSpace namespace, String name) {
long xattrStatusInt = 0;
xattrStatusInt = NAMESPACE.BITS
.combine(namespace.ordinal(), xattrStatusInt);
int nid = XAttrStorage.getNameSerialNumber(name);
xattrStatusInt = NAME.BITS
.combine(nid, xattrStatusInt);
return (int) xattrStatusInt;
}
/**
 * Decode the name from a packed XAttr record by resolving the id held in
 * the NAME field through SerialNumberManager.XATTR.
 *
 * @param record the packed XAttr record
 * @return the name string for the stored id
 */
public static String getName(int record) {
int nid = (int)NAME.BITS.retrieve(record);
return SerialNumberManager.XATTR.getString(nid);
}
private static final int XATTR_VALUE_LEN_MAX = 1 << 16;
/**
 * Pack the namespace and name of an XAttr into one int record.
 * The name is stored as its serial number from SerialNumberManager.XATTR.
 * The namespace ordinal is split: its low NS_EXT_SHIFT bits go into NS and
 * the bits above them go into NS_EXT.
 *
 * @param a the XAttr to encode (its value is not part of the record)
 * @return the packed record
 */
static int toInt(XAttr a) {
int nid = SerialNumberManager.XATTR.getSerialNumber(a.getName());
int nsOrd = a.getNameSpace().ordinal();
// low bits of the ordinal
long value = NS.BITS.combine(nsOrd & NS_MASK, 0L);
// remaining high bits of the ordinal
value = NS_EXT.BITS.combine(nsOrd >>> NS_EXT_SHIFT, value);
value = NAME.BITS.combine(nid, value);
return (int)value;
}
/**
 * Rebuild an XAttr from a packed record and its value.
 *
 * @param record the packed record holding the namespace and name id
 * @param value the attribute value, passed to the builder as given
 * @param stringTable table used to resolve the name id; when {@code null}
 *        the id is resolved by SerialNumberManager.XATTR itself
 * @return the decoded XAttr
 */
static XAttr toXAttr(int record, byte[] value,
    SerialNumberManager.StringTable stringTable) {
  final int nameId = (int) NAME.BITS.retrieve(record);
  final String attrName =
      SerialNumberManager.XATTR.getString(nameId, stringTable);
  final XAttr.NameSpace ns = getNamespace(record);
  return new XAttr.Builder()
      .setNameSpace(ns)
      .setName(attrName)
      .setValue(value)
      .build();
}
/**
* Unpack byte[] to XAttrs.
@ -95,8 +109,8 @@ class XAttrFormat {
int v = Ints.fromBytes(attrs[i], attrs[i + 1],
attrs[i + 2], attrs[i + 3]);
i += 4;
builder.setNameSpace(XAttrStatusFormat.getNamespace(v));
builder.setName(XAttrStatusFormat.getName(v));
builder.setNameSpace(XAttrFormat.getNamespace(v));
builder.setName(XAttrFormat.getName(v));
int vlen = ((0xff & attrs[i]) << 8) | (0xff & attrs[i + 1]);
i += 2;
if (vlen > 0) {
@ -129,8 +143,8 @@ class XAttrFormat {
int v = Ints.fromBytes(attrs[i], attrs[i + 1],
attrs[i + 2], attrs[i + 3]);
i += 4;
XAttr.NameSpace namespace = XAttrStatusFormat.getNamespace(v);
String name = XAttrStatusFormat.getName(v);
XAttr.NameSpace namespace = XAttrFormat.getNamespace(v);
String name = XAttrFormat.getName(v);
int vlen = ((0xff & attrs[i]) << 8) | (0xff & attrs[i + 1]);
i += 2;
if (xAttr.getNameSpace() == namespace &&
@ -161,7 +175,8 @@ class XAttrFormat {
ByteArrayOutputStream out = new ByteArrayOutputStream();
try {
for (XAttr a : xAttrs) {
int v = XAttrStatusFormat.toInt(a.getNameSpace(), a.getName());
// big-endian
int v = XAttrFormat.toInt(a);
out.write(Ints.toByteArray(v));
int vlen = a.getValue() == null ? 0 : a.getValue().length;
Preconditions.checkArgument(vlen < XATTR_VALUE_LEN_MAX,

View File

@ -31,17 +31,6 @@ import org.apache.hadoop.hdfs.protocol.QuotaExceededException;
@InterfaceAudience.Private
public class XAttrStorage {
private static final SerialNumberMap<String> NAME_MAP =
new SerialNumberMap<>();
/**
 * Looks up the serial number that {@code NAME_MAP} associates with a name.
 *
 * @param name the xattr name
 * @return the value returned by {@code NAME_MAP.get(name)}
 */
public static int getNameSerialNumber(String name) {
  final int serialNumber = NAME_MAP.get(name);
  return serialNumber;
}
/**
 * Looks up the name that {@code NAME_MAP} associates with a serial number.
 *
 * @param n the serial number
 * @return the value returned by {@code NAME_MAP.get(n)}
 */
public static String getName(int n) {
  final String resolved = NAME_MAP.get(n);
  return resolved;
}
/**
* Reads the extended attribute of an inode by name with prefix.
* <p/>

View File

@ -51,6 +51,7 @@ import org.apache.hadoop.hdfs.server.namenode.FsImageProto;
import org.apache.hadoop.hdfs.server.namenode.FsImageProto.INodeSection.INode;
import org.apache.hadoop.hdfs.server.namenode.INodeFile;
import org.apache.hadoop.hdfs.server.namenode.INodeId;
import org.apache.hadoop.hdfs.server.namenode.SerialNumberManager;
import org.apache.hadoop.hdfs.web.JsonUtil;
import org.apache.hadoop.hdfs.web.resources.XAttrEncodingParam;
import org.apache.hadoop.io.IOUtils;
@ -67,7 +68,7 @@ import com.google.common.collect.Maps;
class FSImageLoader {
public static final Log LOG = LogFactory.getLog(FSImageHandler.class);
private final String[] stringTable;
private final SerialNumberManager.StringTable stringTable;
// byte representation of inodes, sorted by id
private final byte[][] inodes;
private final Map<Long, long[]> dirmap;
@ -93,8 +94,8 @@ class FSImageLoader {
}
};
private FSImageLoader(String[] stringTable, byte[][] inodes,
Map<Long, long[]> dirmap) {
private FSImageLoader(SerialNumberManager.StringTable stringTable,
byte[][] inodes, Map<Long, long[]> dirmap) {
this.stringTable = stringTable;
this.inodes = inodes;
this.dirmap = dirmap;
@ -119,7 +120,7 @@ class FSImageLoader {
try (FileInputStream fin = new FileInputStream(file.getFD())) {
// Map to record INodeReference to the referred id
ImmutableList<Long> refIdList = null;
String[] stringTable = null;
SerialNumberManager.StringTable stringTable = null;
byte[][] inodes = null;
Map<Long, long[]> dirmap = null;
@ -242,16 +243,17 @@ class FSImageLoader {
return inodes;
}
static String[] loadStringTable(InputStream in) throws
IOException {
static SerialNumberManager.StringTable loadStringTable(InputStream in)
throws IOException {
FsImageProto.StringTableSection s = FsImageProto.StringTableSection
.parseDelimitedFrom(in);
LOG.info("Loading " + s.getNumEntry() + " strings");
String[] stringTable = new String[s.getNumEntry() + 1];
SerialNumberManager.StringTable stringTable =
SerialNumberManager.newStringTable(s.getNumEntry(), s.getMaskBits());
for (int i = 0; i < s.getNumEntry(); ++i) {
FsImageProto.StringTableSection.Entry e = FsImageProto
.StringTableSection.Entry.parseDelimitedFrom(in);
stringTable[e.getId()] = e.getStr();
stringTable.put(e.getId(), e.getStr());
}
return stringTable;
}

View File

@ -33,6 +33,7 @@ import org.apache.hadoop.hdfs.server.namenode.FsImageProto.FileSummary;
import org.apache.hadoop.hdfs.server.namenode.FsImageProto.INodeSection;
import org.apache.hadoop.hdfs.server.namenode.FsImageProto.INodeSection.INode;
import org.apache.hadoop.hdfs.server.namenode.INodeId;
import org.apache.hadoop.hdfs.server.namenode.SerialNumberManager;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.util.LimitInputStream;
import org.apache.hadoop.util.Time;
@ -391,7 +392,7 @@ abstract class PBImageTextWriter implements Closeable {
}
}
private String[] stringTable;
private SerialNumberManager.StringTable stringTable;
private PrintStream out;
private MetadataMap metadataMap = null;

View File

@ -63,7 +63,7 @@ import org.apache.hadoop.hdfs.server.namenode.FsImageProto.NameSystemSection;
import org.apache.hadoop.hdfs.server.namenode.FsImageProto.SecretManagerSection;
import org.apache.hadoop.hdfs.server.namenode.FsImageProto.SnapshotDiffSection;
import org.apache.hadoop.hdfs.server.namenode.FsImageProto.SnapshotSection;
import org.apache.hadoop.hdfs.server.namenode.FsImageProto.StringTableSection;
import org.apache.hadoop.hdfs.server.namenode.SerialNumberManager;
import org.apache.hadoop.hdfs.server.namenode.INodeFile;
import org.apache.hadoop.hdfs.util.XMLUtils;
import org.apache.hadoop.io.erasurecode.ECSchema;
@ -269,7 +269,7 @@ public final class PBImageXmlWriter {
private final Configuration conf;
private final PrintStream out;
private final SimpleDateFormat isoDateFormat;
private String[] stringTable;
private SerialNumberManager.StringTable stringTable;
public static SimpleDateFormat createSimpleDateFormat() {
SimpleDateFormat format =
@ -430,8 +430,9 @@ public final class PBImageXmlWriter {
((XATTR_NAMESPACE_EXT_MASK & (encodedName >> XATTR_NAMESPACE_EXT_OFFSET)) << 2);
o(INODE_SECTION_NS, XAttrProtos.XAttrProto.
XAttrNamespaceProto.valueOf(ns).toString());
o(SECTION_NAME,
stringTable[XATTR_NAME_MASK & (encodedName >> XATTR_NAME_OFFSET)]);
o(SECTION_NAME, SerialNumberManager.XATTR.getString(
XATTR_NAME_MASK & (encodedName >> XATTR_NAME_OFFSET),
stringTable));
ByteString val = xattr.getValue();
if (val.isValidUtf8()) {
o(INODE_SECTION_VAL, val.toStringUtf8());
@ -784,10 +785,9 @@ public final class PBImageXmlWriter {
.o(SNAPSHOT_DIFF_SECTION_CHILDREN_SIZE, d.getChildrenSize())
.o(SNAPSHOT_DIFF_SECTION_IS_SNAPSHOT_ROOT, d.getIsSnapshotRoot())
.o(SECTION_NAME, d.getName().toStringUtf8());
INodeDirectory snapshotCopy = d.getSnapshotCopy();
if (snapshotCopy != null) {
if (d.hasSnapshotCopy()) {
out.print("<" + SNAPSHOT_DIFF_SECTION_SNAPSHOT_COPY + ">");
dumpINodeDirectory(snapshotCopy);
dumpINodeDirectory(d.getSnapshotCopy());
out.print("</" + SNAPSHOT_DIFF_SECTION_SNAPSHOT_COPY + ">\n");
}
o(SNAPSHOT_DIFF_SECTION_CREATED_LIST_SIZE, d.getCreatedListSize());
@ -851,13 +851,7 @@ public final class PBImageXmlWriter {
}
private void loadStringTable(InputStream in) throws IOException {
StringTableSection s = StringTableSection.parseDelimitedFrom(in);
stringTable = new String[s.getNumEntry() + 1];
for (int i = 0; i < s.getNumEntry(); ++i) {
StringTableSection.Entry e = StringTableSection.Entry
.parseDelimitedFrom(in);
stringTable[e.getId()] = e.getStr();
}
stringTable = FSImageLoader.loadStringTable(in);
}
private PBImageXmlWriter o(final String e, final Object v) {

View File

@ -311,6 +311,7 @@ message StringTableSection {
optional string str = 2;
}
optional uint32 numEntry = 1;
optional uint32 maskBits = 2 [default = 0];
// repeated Entry
}

View File

@ -57,12 +57,8 @@ public class TestCommitBlockSynchronization {
// set file's parent as root and put the file to inodeMap, so
// FSNamesystem's isFileDeleted() method will return false on this file
if (file.getParent() == null) {
INodeDirectory mparent = mock(INodeDirectory.class);
INodeDirectory parent = new INodeDirectory(mparent.getId(), new byte[0],
mparent.getPermissionStatus(), mparent.getAccessTime());
parent.setLocalName(new byte[0]);
INodeDirectory parent = namesystem.getFSDirectory().getRoot();
parent.addChild(file);
file.setParent(parent);
}
namesystem.dir.getINodeMap().put(file);