HDFS-15519. Check inaccessible INodes in FsImageValidation. (#2224)

This commit is contained in:
Tsz-Wo Nicholas Sze 2020-08-14 10:10:01 -07:00 committed by GitHub
parent 15a76e8d65
commit b93dd7c281
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 263 additions and 24 deletions

View File

@ -28,8 +28,12 @@ import org.apache.hadoop.hdfs.DFSUtil;
import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager; import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager;
import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeManager; import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeManager;
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants; import org.apache.hadoop.hdfs.server.common.HdfsServerConstants;
import org.apache.hadoop.hdfs.server.common.Storage;
import org.apache.hadoop.hdfs.server.namenode.startupprogress.Phase; import org.apache.hadoop.hdfs.server.namenode.startupprogress.Phase;
import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeFile;
import org.apache.hadoop.hdfs.server.namenode.top.metrics.TopMetrics; import org.apache.hadoop.hdfs.server.namenode.top.metrics.TopMetrics;
import org.apache.hadoop.hdfs.server.namenode.visitor.INodeCountVisitor;
import org.apache.hadoop.hdfs.server.namenode.visitor.INodeCountVisitor.Counts;
import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo; import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
import org.apache.hadoop.util.GSet; import org.apache.hadoop.util.GSet;
import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.util.StringUtils;
@ -40,15 +44,21 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import java.io.File; import java.io.File;
import java.io.FilenameFilter;
import java.io.IOException;
import java.util.Arrays; import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.Timer; import java.util.Timer;
import java.util.TimerTask; import java.util.TimerTask;
import java.util.concurrent.atomic.AtomicInteger;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_HA_NAMENODES_KEY_PREFIX; import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_HA_NAMENODES_KEY_PREFIX;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_ENABLE_RETRY_CACHE_KEY; import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_ENABLE_RETRY_CACHE_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_READ_LOCK_REPORTING_THRESHOLD_MS_KEY; import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_READ_LOCK_REPORTING_THRESHOLD_MS_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RPC_ADDRESS_KEY; import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_RPC_ADDRESS_KEY;
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_WRITE_LOCK_REPORTING_THRESHOLD_MS_KEY; import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_WRITE_LOCK_REPORTING_THRESHOLD_MS_KEY;
import static org.apache.hadoop.hdfs.server.namenode.FsImageValidation.Cli.println;
import static org.apache.hadoop.util.Time.now; import static org.apache.hadoop.util.Time.now;
/** /**
@ -134,6 +144,25 @@ public class FsImageValidation {
} }
return b.insert(0, n).toString(); return b.insert(0, n).toString();
} }
/** @return a filter for the given type. */
static FilenameFilter newFilenameFilter(NameNodeFile type) {
final String prefix = type.getName() + "_";
return new FilenameFilter() {
@Override
public boolean accept(File dir, String name) {
if (!name.startsWith(prefix)) {
return false;
}
for (int i = prefix.length(); i < name.length(); i++) {
if (!Character.isDigit(name.charAt(i))) {
return false;
}
}
return true;
}
};
}
} }
private final File fsImageFile; private final File fsImageFile;
@ -142,21 +171,44 @@ public class FsImageValidation {
this.fsImageFile = fsImageFile; this.fsImageFile = fsImageFile;
} }
int checkINodeReference(Configuration conf) throws Exception { int run() throws Exception {
return run(new Configuration(), new AtomicInteger());
}
int run(AtomicInteger errorCount) throws Exception {
return run(new Configuration(), errorCount);
}
int run(Configuration conf, AtomicInteger errorCount) throws Exception {
final int initCount = errorCount.get();
LOG.info(Util.memoryInfo()); LOG.info(Util.memoryInfo());
initConf(conf); initConf(conf);
// check INodeReference
final FSNamesystem namesystem = checkINodeReference(conf, errorCount);
// check INodeMap
INodeMapValidation.run(namesystem.getFSDirectory(), errorCount);
LOG.info(Util.memoryInfo());
final int d = errorCount.get() - initCount;
if (d > 0) {
Cli.println("Found %d error(s) in %s", d, fsImageFile.getAbsolutePath());
}
return d;
}
private FSNamesystem loadImage(Configuration conf) throws IOException {
final TimerTask checkProgress = new TimerTask() { final TimerTask checkProgress = new TimerTask() {
@Override @Override
public void run() { public void run() {
final double percent = NameNode.getStartupProgress().createView() final double percent = NameNode.getStartupProgress().createView()
.getPercentComplete(Phase.LOADING_FSIMAGE); .getPercentComplete(Phase.LOADING_FSIMAGE);
LOG.info(String.format("%s Progress: %.1f%%", LOG.info(String.format("%s Progress: %.1f%% (%s)",
Phase.LOADING_FSIMAGE, 100*percent)); Phase.LOADING_FSIMAGE, 100*percent, Util.memoryInfo()));
} }
}; };
INodeReferenceValidation.start();
final Timer t = new Timer(); final Timer t = new Timer();
t.scheduleAtFixedRate(checkProgress, 0, 60_000); t.scheduleAtFixedRate(checkProgress, 0, 60_000);
final long loadStart = now(); final long loadStart = now();
@ -197,10 +249,42 @@ public class FsImageValidation {
t.cancel(); t.cancel();
Cli.println("Loaded %s %s successfully in %s", Cli.println("Loaded %s %s successfully in %s",
FS_IMAGE, fsImageFile, StringUtils.formatTime(now() - loadStart)); FS_IMAGE, fsImageFile, StringUtils.formatTime(now() - loadStart));
return namesystem;
}
FSNamesystem checkINodeReference(Configuration conf,
AtomicInteger errorCount) throws Exception {
INodeReferenceValidation.start();
final FSNamesystem namesystem = loadImage(conf);
LOG.info(Util.memoryInfo()); LOG.info(Util.memoryInfo());
final int errorCount = INodeReferenceValidation.end(); INodeReferenceValidation.end(errorCount);
LOG.info(Util.memoryInfo()); LOG.info(Util.memoryInfo());
return errorCount; return namesystem;
}
static class INodeMapValidation {
static Iterable<INodeWithAdditionalFields> iterate(INodeMap map) {
return new Iterable<INodeWithAdditionalFields>() {
@Override
public Iterator<INodeWithAdditionalFields> iterator() {
return map.getMapIterator();
}
};
}
static void run(FSDirectory fsdir, AtomicInteger errorCount) {
final int initErrorCount = errorCount.get();
final Counts counts = INodeCountVisitor.countTree(fsdir.getRoot());
for (INodeWithAdditionalFields i : iterate(fsdir.getINodeMap())) {
if (counts.getCount(i) == 0) {
Cli.printError(errorCount, "%s (%d) is inaccessible (%s)",
i, i.getId(), i.getFullPathName());
}
}
println("%s ended successfully: %d error(s) found.",
INodeMapValidation.class.getSimpleName(),
errorCount.get() - initErrorCount);
}
} }
static class Cli extends Configured implements Tool { static class Cli extends Configured implements Tool {
@ -217,9 +301,10 @@ public class FsImageValidation {
initLogLevels(); initLogLevels();
final FsImageValidation validation = FsImageValidation.newInstance(args); final FsImageValidation validation = FsImageValidation.newInstance(args);
final int errorCount = validation.checkINodeReference(getConf()); final AtomicInteger errorCount = new AtomicInteger();
validation.run(getConf(), errorCount);
println("Error Count: %s", errorCount); println("Error Count: %s", errorCount);
return errorCount == 0? 0: 1; return errorCount.get() == 0? 0: 1;
} }
static String parse(String... args) { static String parse(String... args) {
@ -240,19 +325,68 @@ public class FsImageValidation {
return f; return f;
} }
static void println(String format, Object... args) { static synchronized void println(String format, Object... args) {
final String s = String.format(format, args); final String s = String.format(format, args);
System.out.println(s); System.out.println(s);
LOG.info(s); LOG.info(s);
} }
static void printError(String message, Throwable t) { static synchronized void warn(String format, Object... args) {
final String s = "WARN: " + String.format(format, args);
System.out.println(s);
LOG.warn(s);
}
static synchronized void printError(String message, Throwable t) {
System.out.println(message); System.out.println(message);
if (t != null) { if (t != null) {
t.printStackTrace(System.out); t.printStackTrace(System.out);
} }
LOG.error(message, t); LOG.error(message, t);
} }
static synchronized void printError(AtomicInteger errorCount,
String format, Object... args) {
final int count = errorCount.incrementAndGet();
final String s = "FSIMAGE_ERROR " + count + ": "
+ String.format(format, args);
System.out.println(s);
LOG.info(s);
}
}
public static int validate(FSNamesystem namesystem) throws Exception {
final AtomicInteger errorCount = new AtomicInteger();
final NNStorage nnStorage = namesystem.getFSImage().getStorage();
for(Storage.StorageDirectory sd : nnStorage.getStorageDirs()) {
validate(sd.getCurrentDir(), errorCount);
}
return errorCount.get();
}
public static void validate(File path, AtomicInteger errorCount)
throws Exception {
if (path.isFile()) {
new FsImageValidation(path).run(errorCount);
} else if (path.isDirectory()) {
final File[] images = path.listFiles(
Util.newFilenameFilter(NameNodeFile.IMAGE));
if (images == null || images.length == 0) {
Cli.warn("%s not found in %s", FSImage.class.getSimpleName(),
path.getAbsolutePath());
return;
}
Arrays.sort(images, Collections.reverseOrder());
for (int i = 0; i < images.length; i++) {
final File image = images[i];
Cli.println("%s %d) %s", FSImage.class.getSimpleName(),
i, image.getAbsolutePath());
FsImageValidation.validate(image, errorCount);
}
}
Cli.warn("%s is neither a file nor a directory", path.getAbsolutePath());
} }
public static void main(String[] args) { public static void main(String[] args) {

View File

@ -46,18 +46,20 @@ public class INodeReferenceValidation {
public static void start() { public static void start() {
INSTANCE.compareAndSet(null, new INodeReferenceValidation()); INSTANCE.compareAndSet(null, new INodeReferenceValidation());
println("Validation started"); println("%s started", INodeReferenceValidation.class.getSimpleName());
} }
public static int end() { public static void end(AtomicInteger errorCount) {
final INodeReferenceValidation instance = INSTANCE.getAndSet(null); final INodeReferenceValidation instance = INSTANCE.getAndSet(null);
if (instance == null) { if (instance == null) {
return 0; return;
} }
final int errorCount = instance.assertReferences(); final int initCount = errorCount.get();
println("Validation ended successfully: %d error(s) found.", errorCount); instance.assertReferences(errorCount);
return errorCount; println("%s ended successfully: %d error(s) found.",
INodeReferenceValidation.class.getSimpleName(),
errorCount.get() - initCount);
} }
static <REF extends INodeReference> void add(REF ref, Class<REF> clazz) { static <REF extends INodeReference> void add(REF ref, Class<REF> clazz) {
@ -153,7 +155,7 @@ public class INodeReferenceValidation {
throw new IllegalArgumentException("References not found for " + clazz); throw new IllegalArgumentException("References not found for " + clazz);
} }
private int assertReferences() { private void assertReferences(AtomicInteger errorCount) {
final int p = Runtime.getRuntime().availableProcessors(); final int p = Runtime.getRuntime().availableProcessors();
LOG.info("Available Processors: {}", p); LOG.info("Available Processors: {}", p);
final ExecutorService service = Executors.newFixedThreadPool(p); final ExecutorService service = Executors.newFixedThreadPool(p);
@ -168,7 +170,6 @@ public class INodeReferenceValidation {
final Timer t = new Timer(); final Timer t = new Timer();
t.scheduleAtFixedRate(checkProgress, 0, 1_000); t.scheduleAtFixedRate(checkProgress, 0, 1_000);
final AtomicInteger errorCount = new AtomicInteger();
try { try {
dstReferences.submit(errorCount, service); dstReferences.submit(errorCount, service);
withCounts.submit(errorCount, service); withCounts.submit(errorCount, service);
@ -183,7 +184,6 @@ public class INodeReferenceValidation {
service.shutdown(); service.shutdown();
t.cancel(); t.cancel();
} }
return errorCount.get();
} }
static <REF extends INodeReference> List<Task<REF>> createTasks( static <REF extends INodeReference> List<Task<REF>> createTasks(
@ -215,7 +215,7 @@ public class INodeReferenceValidation {
try { try {
ref.assertReferences(); ref.assertReferences();
} catch (Throwable t) { } catch (Throwable t) {
println("%d: %s", errorCount.incrementAndGet(), t); printError(errorCount, "%s", t);
} }
} }
return references.size(); return references.size();

View File

@ -0,0 +1,107 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hdfs.server.namenode.visitor;
import org.apache.hadoop.hdfs.server.namenode.INode;
import org.apache.hadoop.hdfs.server.namenode.snapshot.Snapshot;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.atomic.AtomicInteger;
/**
* For validating {@link org.apache.hadoop.hdfs.server.namenode.FSImage}s.
*/
public class INodeCountVisitor implements NamespaceVisitor {
public interface Counts {
int getCount(INode inode);
}
public static Counts countTree(INode root) {
return new INodeCountVisitor().count(root);
}
private static class SetElement {
private final INode inode;
private final AtomicInteger count = new AtomicInteger();
SetElement(INode inode) {
this.inode = inode;
}
int getCount() {
return count.get();
}
int incrementAndGet() {
return count.incrementAndGet();
}
@Override
public boolean equals(Object obj) {
if (this == obj) {
return true;
} else if (obj == null || getClass() != obj.getClass()) {
return false;
}
final SetElement that = (SetElement) obj;
return this.inode.getId() == that.inode.getId();
}
@Override
public int hashCode() {
return Long.hashCode(inode.getId());
}
}
static class INodeSet implements Counts {
private final ConcurrentMap<SetElement, SetElement> map
= new ConcurrentHashMap<>();
int put(INode inode, int snapshot) {
final SetElement key = new SetElement(inode);
final SetElement previous = map.putIfAbsent(key, key);
final SetElement current = previous != null? previous: key;
return current.incrementAndGet();
}
@Override
public int getCount(INode inode) {
final SetElement key = new SetElement(inode);
final SetElement value = map.get(key);
return value != null? value.getCount(): 0;
}
}
private final INodeSet inodes = new INodeSet();
@Override
public INodeVisitor getDefaultVisitor() {
return new INodeVisitor() {
@Override
public void visit(INode iNode, int snapshot) {
inodes.put(iNode, snapshot);
}
};
}
private Counts count(INode root) {
root.accept(this, Snapshot.CURRENT_STATE_ID);
return inodes;
}
}

View File

@ -43,13 +43,11 @@ public class TestFsImageValidation {
* by the environment variable FS_IMAGE_FILE. * by the environment variable FS_IMAGE_FILE.
*/ */
@Test @Test
public void testINodeReference() throws Exception { public void testValidation() throws Exception {
FsImageValidation.initLogLevels(); FsImageValidation.initLogLevels();
try { try {
final Configuration conf = new Configuration(); final int errorCount = FsImageValidation.newInstance().run();
final FsImageValidation validation = FsImageValidation.newInstance();
final int errorCount = validation.checkINodeReference(conf);
Assert.assertEquals("Error Count: " + errorCount, 0, errorCount); Assert.assertEquals("Error Count: " + errorCount, 0, errorCount);
} catch (HadoopIllegalArgumentException e) { } catch (HadoopIllegalArgumentException e) {
LOG.warn("The environment variable {} is not set: {}", LOG.warn("The environment variable {} is not set: {}",