SOLR-5657: When a SolrCore starts on HDFS, it should gracefully handle HDFS being in safe mode.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1560553 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Mark Robert Miller 2014-01-22 22:39:10 +00:00
parent 4fcf007f94
commit d9e7f46ef9
7 changed files with 183 additions and 42 deletions

View File

@ -219,6 +219,9 @@ Bug Fixes
* SOLR-5650: When a replica becomes a leader, only peer sync with other replicas
that last published an ACTIVE state. (Mark Miller)
* SOLR-5657: When a SolrCore starts on HDFS, it should gracefully handle HDFS
being in safe mode. (Mark Miller)
Optimizations
----------------------

View File

@ -28,9 +28,9 @@ import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
import org.apache.hadoop.ipc.RemoteException;
import org.apache.lucene.store.BaseDirectory; import org.apache.lucene.store.BaseDirectory;
import org.apache.lucene.store.BufferedIndexOutput; import org.apache.lucene.store.BufferedIndexOutput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput; import org.apache.lucene.store.IndexOutput;
@ -58,14 +58,37 @@ public class HdfsDirectory extends BaseDirectory {
this.hdfsDirPath = hdfsDirPath; this.hdfsDirPath = hdfsDirPath;
this.configuration = configuration; this.configuration = configuration;
fileSystem = FileSystem.newInstance(hdfsDirPath.toUri(), configuration); fileSystem = FileSystem.newInstance(hdfsDirPath.toUri(), configuration);
try {
if (!fileSystem.exists(hdfsDirPath)) { while (true) {
fileSystem.mkdirs(hdfsDirPath); try {
if (!fileSystem.exists(hdfsDirPath)) {
boolean success = fileSystem.mkdirs(hdfsDirPath);
if (!success) {
throw new RuntimeException("Could not create directory: " + hdfsDirPath);
}
} else {
fileSystem.mkdirs(hdfsDirPath); // check for safe mode
}
break;
} catch (RemoteException e) {
if (e.getClassName().equals("org.apache.hadoop.hdfs.server.namenode.SafeModeException")) {
LOG.warn("The NameNode is in SafeMode - Solr will wait 5 seconds and try again.");
try {
Thread.sleep(5000);
} catch (InterruptedException e1) {
Thread.interrupted();
}
continue;
}
org.apache.solr.util.IOUtils.closeQuietly(fileSystem);
throw new RuntimeException(
"Problem creating directory: " + hdfsDirPath, e);
} catch (Exception e) {
org.apache.solr.util.IOUtils.closeQuietly(fileSystem);
throw new RuntimeException(
"Problem creating directory: " + hdfsDirPath, e);
} }
} catch (Exception e) {
org.apache.solr.util.IOUtils.closeQuietly(fileSystem);
throw new RuntimeException("Problem creating directory: " + hdfsDirPath,
e);
} }
} }

View File

@ -24,6 +24,7 @@ import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileAlreadyExistsException; import org.apache.hadoop.fs.FileAlreadyExistsException;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
import org.apache.hadoop.ipc.RemoteException;
import org.apache.lucene.store.Lock; import org.apache.lucene.store.Lock;
import org.apache.lucene.store.LockFactory; import org.apache.lucene.store.LockFactory;
import org.apache.lucene.store.LockReleaseFailedException; import org.apache.lucene.store.LockReleaseFailedException;
@ -59,16 +60,31 @@ public class HdfsLockFactory extends LockFactory {
FileSystem fs = null; FileSystem fs = null;
try { try {
fs = FileSystem.newInstance(lockPath.toUri(), configuration); fs = FileSystem.newInstance(lockPath.toUri(), configuration);
while (true) {
if (fs.exists(lockPath)) {
if (lockPrefix != null) {
lockName = lockPrefix + "-" + lockName;
}
if (fs.exists(lockPath)) { Path lockFile = new Path(lockPath, lockName);
if (lockPrefix != null) { try {
lockName = lockPrefix + "-" + lockName; if (fs.exists(lockFile) && !fs.delete(lockFile, false)) {
} throw new IOException("Cannot delete " + lockFile);
}
Path lockFile = new Path(lockPath, lockName); } catch (RemoteException e) {
if (e.getClassName().equals(
if (fs.exists(lockFile) && !fs.delete(lockFile, false)) { "org.apache.hadoop.hdfs.server.namenode.SafeModeException")) {
throw new IOException("Cannot delete " + lockFile); log.warn("The NameNode is in SafeMode - Solr will wait 5 seconds and try again.");
try {
Thread.sleep(5000);
} catch (InterruptedException e1) {
Thread.interrupted();
}
continue;
}
throw e;
}
break;
} }
} }
} finally { } finally {
@ -99,20 +115,46 @@ public class HdfsLockFactory extends LockFactory {
@Override @Override
public boolean obtain() throws IOException { public boolean obtain() throws IOException {
FSDataOutputStream file = null; FSDataOutputStream file = null;
FileSystem fs = null; FileSystem fs = FileSystem.newInstance(lockPath.toUri(), conf);
try { try {
fs = FileSystem.newInstance(lockPath.toUri(), conf); while (true) {
if (!fs.exists(lockPath)) { try {
fs.mkdirs(lockPath); if (!fs.exists(lockPath)) {
boolean success = fs.mkdirs(lockPath);
if (!success) {
throw new RuntimeException("Could not create directory: " + lockPath);
}
} else {
// just to check for safe mode
fs.mkdirs(lockPath);
}
file = fs.create(new Path(lockPath, lockName), false);
break;
} catch (FileAlreadyExistsException e) {
return false;
} catch (RemoteException e) {
if (e.getClassName().equals(
"org.apache.hadoop.hdfs.server.namenode.SafeModeException")) {
log.warn("The NameNode is in SafeMode - Solr will wait 5 seconds and try again.");
try {
Thread.sleep(5000);
} catch (InterruptedException e1) {
Thread.interrupted();
}
continue;
}
log.error("Error creating lock file", e);
return false;
} catch (IOException e) {
log.error("Error creating lock file", e);
return false;
} finally {
IOUtils.closeQuietly(file);
}
} }
file = fs.create(new Path(lockPath, lockName), false);
} catch (FileAlreadyExistsException e) {
return false;
}catch (IOException e) {
log.error("Error creating lock file", e);
return false;
} finally { } finally {
IOUtils.closeQuietly(file);
IOUtils.closeQuietly(fs); IOUtils.closeQuietly(fs);
} }
return true; return true;

View File

@ -31,6 +31,7 @@ import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter; import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.ipc.RemoteException;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode; import org.apache.solr.common.SolrException.ErrorCode;
@ -142,16 +143,33 @@ public class HdfsUpdateLog extends UpdateLog {
} }
lastDataDir = dataDir; lastDataDir = dataDir;
tlogDir = new Path(dataDir, TLOG_NAME); tlogDir = new Path(dataDir, TLOG_NAME);
while (true) {
try { try {
if (!fs.exists(tlogDir)) { if (!fs.exists(tlogDir)) {
boolean success = fs.mkdirs(tlogDir); boolean success = fs.mkdirs(tlogDir);
if (!success) { if (!success) {
throw new RuntimeException("Could not create directory:" + tlogDir); throw new RuntimeException("Could not create directory:" + tlogDir);
}
} else {
fs.mkdirs(tlogDir); // To check for safe mode
} }
break;
} catch (RemoteException e) {
if (e.getClassName().equals(
"org.apache.hadoop.hdfs.server.namenode.SafeModeException")) {
log.warn("The NameNode is in SafeMode - Solr will wait 5 seconds and try again.");
try {
Thread.sleep(5000);
} catch (InterruptedException e1) {
Thread.interrupted();
}
continue;
}
throw new RuntimeException(
"Problem creating directory: " + tlogDir, e);
} catch (IOException e) {
throw new RuntimeException("Problem creating directory: " + tlogDir, e);
} }
} catch (IOException e) {
throw new RuntimeException(e);
} }
tlogFiles = getLogList(fs, tlogDir); tlogFiles = getLogList(fs, tlogDir);

View File

@ -4,12 +4,16 @@ import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.net.URI; import java.net.URI;
import java.util.Locale; import java.util.Locale;
import java.util.Map;
import java.util.Timer;
import java.util.TimerTask;
import java.util.concurrent.ConcurrentHashMap;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.MiniDFSCluster; import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.LuceneTestCase;
import org.apache.solr.SolrTestCaseJ4; import org.apache.solr.SolrTestCaseJ4;
import org.junit.Assert;
/* /*
* Licensed to the Apache Software Foundation (ASF) under one or more * Licensed to the Apache Software Foundation (ASF) under one or more
@ -32,6 +36,8 @@ public class HdfsTestUtil {
private static Locale savedLocale; private static Locale savedLocale;
private static Map<MiniDFSCluster,Timer> timers = new ConcurrentHashMap<MiniDFSCluster,Timer>();
public static MiniDFSCluster setupClass(String dataDir) throws Exception { public static MiniDFSCluster setupClass(String dataDir) throws Exception {
LuceneTestCase.assumeFalse("HDFS tests were disabled by -Dtests.disableHdfs", LuceneTestCase.assumeFalse("HDFS tests were disabled by -Dtests.disableHdfs",
Boolean.parseBoolean(System.getProperty("tests.disableHdfs", "false"))); Boolean.parseBoolean(System.getProperty("tests.disableHdfs", "false")));
@ -58,7 +64,22 @@ public class HdfsTestUtil {
System.setProperty("solr.hdfs.home", "/solr_hdfs_home"); System.setProperty("solr.hdfs.home", "/solr_hdfs_home");
MiniDFSCluster dfsCluster = new MiniDFSCluster(conf, dataNodes, true, null); final MiniDFSCluster dfsCluster = new MiniDFSCluster(conf, dataNodes, true, null);
dfsCluster.waitActive();
NameNodeAdapter.enterSafeMode(dfsCluster.getNameNode(), false);
int rnd = LuceneTestCase.random().nextInt(10000);
Timer timer = new Timer();
timer.schedule(new TimerTask() {
@Override
public void run() {
NameNodeAdapter.leaveSafeMode(dfsCluster.getNameNode());
}
}, rnd);
timers.put(dfsCluster, timer);
SolrTestCaseJ4.useFactory("org.apache.solr.core.HdfsDirectoryFactory"); SolrTestCaseJ4.useFactory("org.apache.solr.core.HdfsDirectoryFactory");
@ -72,6 +93,7 @@ public class HdfsTestUtil {
System.clearProperty("test.cache.data"); System.clearProperty("test.cache.data");
System.clearProperty("solr.hdfs.home"); System.clearProperty("solr.hdfs.home");
if (dfsCluster != null) { if (dfsCluster != null) {
timers.remove(dfsCluster);
dfsCluster.shutdown(); dfsCluster.shutdown();
} }

View File

@ -23,11 +23,15 @@ import java.net.URI;
import java.net.URISyntaxException; import java.net.URISyntaxException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.Timer;
import java.util.TimerTask;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.MiniDFSCluster; import org.apache.hadoop.hdfs.MiniDFSCluster;
import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.LuceneTestCase.Slow; import org.apache.lucene.util.LuceneTestCase.Slow;
import org.apache.solr.client.solrj.SolrQuery; import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServer; import org.apache.solr.client.solrj.SolrServer;
@ -35,6 +39,7 @@ import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrServer; import org.apache.solr.client.solrj.impl.HttpSolrServer;
import org.apache.solr.client.solrj.request.QueryRequest; import org.apache.solr.client.solrj.request.QueryRequest;
import org.apache.solr.cloud.BasicDistributedZkTest; import org.apache.solr.cloud.BasicDistributedZkTest;
import org.apache.solr.cloud.ChaosMonkey;
import org.apache.solr.common.params.CollectionParams.CollectionAction; import org.apache.solr.common.params.CollectionParams.CollectionAction;
import org.apache.solr.common.params.ModifiableSolrParams; import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.util.NamedList; import org.apache.solr.common.util.NamedList;
@ -52,6 +57,9 @@ public class StressHdfsTest extends BasicDistributedZkTest {
private static final String DELETE_DATA_DIR_COLLECTION = "delete_data_dir"; private static final String DELETE_DATA_DIR_COLLECTION = "delete_data_dir";
private static MiniDFSCluster dfsCluster; private static MiniDFSCluster dfsCluster;
private boolean testRestartIntoSafeMode;
@BeforeClass @BeforeClass
public static void setupClass() throws Exception { public static void setupClass() throws Exception {
@ -68,7 +76,6 @@ public class StressHdfsTest extends BasicDistributedZkTest {
dfsCluster = null; dfsCluster = null;
} }
@Override @Override
protected String getDataDir(String dataDir) throws IOException { protected String getDataDir(String dataDir) throws IOException {
return HdfsTestUtil.getDataDir(dfsCluster, dataDir); return HdfsTestUtil.getDataDir(dfsCluster, dataDir);
@ -78,6 +85,7 @@ public class StressHdfsTest extends BasicDistributedZkTest {
super(); super();
sliceCount = 1; sliceCount = 1;
shardCount = TEST_NIGHTLY ? 7 : random().nextInt(2) + 1; shardCount = TEST_NIGHTLY ? 7 : random().nextInt(2) + 1;
testRestartIntoSafeMode = random().nextBoolean();
} }
protected String getSolrXml() { protected String getSolrXml() {
@ -90,6 +98,31 @@ public class StressHdfsTest extends BasicDistributedZkTest {
for (int i = 0; i < cnt; i++) { for (int i = 0; i < cnt; i++) {
createAndDeleteCollection(); createAndDeleteCollection();
} }
if (testRestartIntoSafeMode) {
createCollection(DELETE_DATA_DIR_COLLECTION, 1, 1, 1);
waitForRecoveriesToFinish(DELETE_DATA_DIR_COLLECTION, false);
ChaosMonkey.stop(jettys.get(0));
// enter safe mode and restart a node
NameNodeAdapter.enterSafeMode(dfsCluster.getNameNode(), false);
int rnd = LuceneTestCase.random().nextInt(10000);
Timer timer = new Timer();
timer.schedule(new TimerTask() {
@Override
public void run() {
NameNodeAdapter.leaveSafeMode(dfsCluster.getNameNode());
}
}, rnd);
ChaosMonkey.start(jettys.get(0));
waitForRecoveriesToFinish(DELETE_DATA_DIR_COLLECTION, false);
}
} }
private void createAndDeleteCollection() throws SolrServerException, private void createAndDeleteCollection() throws SolrServerException,

View File

@ -90,8 +90,8 @@ public class TestRecoveryHdfs extends SolrTestCaseJ4 {
throw new RuntimeException(e); throw new RuntimeException(e);
} }
hdfsDataDir = hdfsUri + "/solr/shard1"; //hdfsDataDir = hdfsUri + "/solr/shard1";
System.setProperty("solr.data.dir", hdfsUri + "/solr/shard1"); // System.setProperty("solr.data.dir", hdfsUri + "/solr/shard1");
System.setProperty("solr.ulog.dir", hdfsUri + "/solr/shard1"); System.setProperty("solr.ulog.dir", hdfsUri + "/solr/shard1");
initCore("solrconfig-tlog.xml","schema15.xml"); initCore("solrconfig-tlog.xml","schema15.xml");