SOLR-5657: When a SolrCore starts on HDFS, it should gracefully handle HDFS being in safe mode.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1560553 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Mark Robert Miller 2014-01-22 22:39:10 +00:00
parent 4fcf007f94
commit d9e7f46ef9
7 changed files with 183 additions and 42 deletions

View File

@ -219,6 +219,9 @@ Bug Fixes
* SOLR-5650: When a replica becomes a leader, only peer sync with other replicas
that last published an ACTIVE state. (Mark Miller)
* SOLR-5657: When a SolrCore starts on HDFS, it should gracefully handle HDFS
being in safe mode. (Mark Miller)
Optimizations
----------------------

View File

@ -28,9 +28,9 @@ import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.ipc.RemoteException;
import org.apache.lucene.store.BaseDirectory;
import org.apache.lucene.store.BufferedIndexOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
@ -58,14 +58,37 @@ public class HdfsDirectory extends BaseDirectory {
this.hdfsDirPath = hdfsDirPath;
this.configuration = configuration;
fileSystem = FileSystem.newInstance(hdfsDirPath.toUri(), configuration);
while (true) {
try {
if (!fileSystem.exists(hdfsDirPath)) {
fileSystem.mkdirs(hdfsDirPath);
boolean success = fileSystem.mkdirs(hdfsDirPath);
if (!success) {
throw new RuntimeException("Could not create directory: " + hdfsDirPath);
}
} else {
fileSystem.mkdirs(hdfsDirPath); // check for safe mode
}
break;
} catch (RemoteException e) {
if (e.getClassName().equals("org.apache.hadoop.hdfs.server.namenode.SafeModeException")) {
LOG.warn("The NameNode is in SafeMode - Solr will wait 5 seconds and try again.");
try {
Thread.sleep(5000);
} catch (InterruptedException e1) {
Thread.interrupted();
}
continue;
}
org.apache.solr.util.IOUtils.closeQuietly(fileSystem);
throw new RuntimeException(
"Problem creating directory: " + hdfsDirPath, e);
} catch (Exception e) {
org.apache.solr.util.IOUtils.closeQuietly(fileSystem);
throw new RuntimeException("Problem creating directory: " + hdfsDirPath,
e);
throw new RuntimeException(
"Problem creating directory: " + hdfsDirPath, e);
}
}
}

View File

@ -24,6 +24,7 @@ import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileAlreadyExistsException;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.ipc.RemoteException;
import org.apache.lucene.store.Lock;
import org.apache.lucene.store.LockFactory;
import org.apache.lucene.store.LockReleaseFailedException;
@ -59,17 +60,32 @@ public class HdfsLockFactory extends LockFactory {
FileSystem fs = null;
try {
fs = FileSystem.newInstance(lockPath.toUri(), configuration);
while (true) {
if (fs.exists(lockPath)) {
if (lockPrefix != null) {
lockName = lockPrefix + "-" + lockName;
}
Path lockFile = new Path(lockPath, lockName);
try {
if (fs.exists(lockFile) && !fs.delete(lockFile, false)) {
throw new IOException("Cannot delete " + lockFile);
}
} catch (RemoteException e) {
if (e.getClassName().equals(
"org.apache.hadoop.hdfs.server.namenode.SafeModeException")) {
log.warn("The NameNode is in SafeMode - Solr will wait 5 seconds and try again.");
try {
Thread.sleep(5000);
} catch (InterruptedException e1) {
Thread.interrupted();
}
continue;
}
throw e;
}
break;
}
}
} finally {
IOUtils.closeQuietly(fs);
@ -99,20 +115,46 @@ public class HdfsLockFactory extends LockFactory {
@Override
public boolean obtain() throws IOException {
FSDataOutputStream file = null;
FileSystem fs = null;
FileSystem fs = FileSystem.newInstance(lockPath.toUri(), conf);
try {
while (true) {
try {
fs = FileSystem.newInstance(lockPath.toUri(), conf);
if (!fs.exists(lockPath)) {
boolean success = fs.mkdirs(lockPath);
if (!success) {
throw new RuntimeException("Could not create directory: " + lockPath);
}
} else {
// just to check for safe mode
fs.mkdirs(lockPath);
}
file = fs.create(new Path(lockPath, lockName), false);
break;
} catch (FileAlreadyExistsException e) {
return false;
}catch (IOException e) {
} catch (RemoteException e) {
if (e.getClassName().equals(
"org.apache.hadoop.hdfs.server.namenode.SafeModeException")) {
log.warn("The NameNode is in SafeMode - Solr will wait 5 seconds and try again.");
try {
Thread.sleep(5000);
} catch (InterruptedException e1) {
Thread.interrupted();
}
continue;
}
log.error("Error creating lock file", e);
return false;
} catch (IOException e) {
log.error("Error creating lock file", e);
return false;
} finally {
IOUtils.closeQuietly(file);
}
}
} finally {
IOUtils.closeQuietly(fs);
}
return true;

View File

@ -31,6 +31,7 @@ import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.ipc.RemoteException;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
@ -142,16 +143,33 @@ public class HdfsUpdateLog extends UpdateLog {
}
lastDataDir = dataDir;
tlogDir = new Path(dataDir, TLOG_NAME);
while (true) {
try {
if (!fs.exists(tlogDir)) {
boolean success = fs.mkdirs(tlogDir);
if (!success) {
throw new RuntimeException("Could not create directory:" + tlogDir);
}
} else {
fs.mkdirs(tlogDir); // To check for safe mode
}
break;
} catch (RemoteException e) {
if (e.getClassName().equals(
"org.apache.hadoop.hdfs.server.namenode.SafeModeException")) {
log.warn("The NameNode is in SafeMode - Solr will wait 5 seconds and try again.");
try {
Thread.sleep(5000);
} catch (InterruptedException e1) {
Thread.interrupted();
}
continue;
}
throw new RuntimeException(
"Problem creating directory: " + tlogDir, e);
} catch (IOException e) {
throw new RuntimeException(e);
throw new RuntimeException("Problem creating directory: " + tlogDir, e);
}
}
tlogFiles = getLogList(fs, tlogDir);

View File

@ -4,12 +4,16 @@ import java.io.File;
import java.io.IOException;
import java.net.URI;
import java.util.Locale;
import java.util.Map;
import java.util.Timer;
import java.util.TimerTask;
import java.util.concurrent.ConcurrentHashMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.solr.SolrTestCaseJ4;
import org.junit.Assert;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
@ -32,6 +36,8 @@ public class HdfsTestUtil {
private static Locale savedLocale;
private static Map<MiniDFSCluster,Timer> timers = new ConcurrentHashMap<MiniDFSCluster,Timer>();
public static MiniDFSCluster setupClass(String dataDir) throws Exception {
LuceneTestCase.assumeFalse("HDFS tests were disabled by -Dtests.disableHdfs",
Boolean.parseBoolean(System.getProperty("tests.disableHdfs", "false")));
@ -58,7 +64,22 @@ public class HdfsTestUtil {
System.setProperty("solr.hdfs.home", "/solr_hdfs_home");
MiniDFSCluster dfsCluster = new MiniDFSCluster(conf, dataNodes, true, null);
final MiniDFSCluster dfsCluster = new MiniDFSCluster(conf, dataNodes, true, null);
dfsCluster.waitActive();
NameNodeAdapter.enterSafeMode(dfsCluster.getNameNode(), false);
int rnd = LuceneTestCase.random().nextInt(10000);
Timer timer = new Timer();
timer.schedule(new TimerTask() {
@Override
public void run() {
NameNodeAdapter.leaveSafeMode(dfsCluster.getNameNode());
}
}, rnd);
timers.put(dfsCluster, timer);
SolrTestCaseJ4.useFactory("org.apache.solr.core.HdfsDirectoryFactory");
@ -72,6 +93,7 @@ public class HdfsTestUtil {
System.clearProperty("test.cache.data");
System.clearProperty("solr.hdfs.home");
if (dfsCluster != null) {
timers.remove(dfsCluster);
dfsCluster.shutdown();
}

View File

@ -23,11 +23,15 @@ import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;
import java.util.Timer;
import java.util.TimerTask;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.LuceneTestCase.Slow;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServer;
@ -35,6 +39,7 @@ import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrServer;
import org.apache.solr.client.solrj.request.QueryRequest;
import org.apache.solr.cloud.BasicDistributedZkTest;
import org.apache.solr.cloud.ChaosMonkey;
import org.apache.solr.common.params.CollectionParams.CollectionAction;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.util.NamedList;
@ -52,6 +57,9 @@ public class StressHdfsTest extends BasicDistributedZkTest {
private static final String DELETE_DATA_DIR_COLLECTION = "delete_data_dir";
private static MiniDFSCluster dfsCluster;
private boolean testRestartIntoSafeMode;
@BeforeClass
public static void setupClass() throws Exception {
@ -68,7 +76,6 @@ public class StressHdfsTest extends BasicDistributedZkTest {
dfsCluster = null;
}
@Override
protected String getDataDir(String dataDir) throws IOException {
return HdfsTestUtil.getDataDir(dfsCluster, dataDir);
@ -78,6 +85,7 @@ public class StressHdfsTest extends BasicDistributedZkTest {
super();
sliceCount = 1;
shardCount = TEST_NIGHTLY ? 7 : random().nextInt(2) + 1;
testRestartIntoSafeMode = random().nextBoolean();
}
protected String getSolrXml() {
@ -90,6 +98,31 @@ public class StressHdfsTest extends BasicDistributedZkTest {
for (int i = 0; i < cnt; i++) {
createAndDeleteCollection();
}
if (testRestartIntoSafeMode) {
createCollection(DELETE_DATA_DIR_COLLECTION, 1, 1, 1);
waitForRecoveriesToFinish(DELETE_DATA_DIR_COLLECTION, false);
ChaosMonkey.stop(jettys.get(0));
// enter safe mode and restart a node
NameNodeAdapter.enterSafeMode(dfsCluster.getNameNode(), false);
int rnd = LuceneTestCase.random().nextInt(10000);
Timer timer = new Timer();
timer.schedule(new TimerTask() {
@Override
public void run() {
NameNodeAdapter.leaveSafeMode(dfsCluster.getNameNode());
}
}, rnd);
ChaosMonkey.start(jettys.get(0));
waitForRecoveriesToFinish(DELETE_DATA_DIR_COLLECTION, false);
}
}
private void createAndDeleteCollection() throws SolrServerException,

View File

@ -90,8 +90,8 @@ public class TestRecoveryHdfs extends SolrTestCaseJ4 {
throw new RuntimeException(e);
}
hdfsDataDir = hdfsUri + "/solr/shard1";
System.setProperty("solr.data.dir", hdfsUri + "/solr/shard1");
//hdfsDataDir = hdfsUri + "/solr/shard1";
// System.setProperty("solr.data.dir", hdfsUri + "/solr/shard1");
System.setProperty("solr.ulog.dir", hdfsUri + "/solr/shard1");
initCore("solrconfig-tlog.xml","schema15.xml");