SOLR-5657: When a SolrCore starts on HDFS, it should gracefully handle HDFS being in safe mode.
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1560553 13f79535-47bb-0310-9956-ffa450edef68
parent 4fcf007f94, commit d9e7f46ef9
@@ -219,6 +219,9 @@ Bug Fixes
 * SOLR-5650: When a replica becomes a leader, only peer sync with other replicas
   that last published an ACTIVE state. (Mark Miller)
 
+* SOLR-5657: When a SolrCore starts on HDFS, it should gracefully handle HDFS
+  being in safe mode. (Mark Miller)
+
 Optimizations
 ----------------------
 
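The production hunks below (HdfsDirectory, HdfsLockFactory, HdfsUpdateLog) all apply the same pattern: catch the RemoteException the NameNode raises while it is in safe mode, log a warning, sleep five seconds, and retry. A minimal standalone sketch of that pattern, assuming Hadoop on the classpath; the class and method names here are illustrative, not part of the commit:

    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.ipc.RemoteException;

    public final class SafeModeRetry {

      // Illustrative helper mirroring the loops added below in HdfsDirectory,
      // HdfsLockFactory, and HdfsUpdateLog: retry mkdirs until the NameNode
      // leaves safe mode.
      public static void mkdirsWithRetry(FileSystem fs, Path dir) throws Exception {
        while (true) {
          try {
            // mkdirs returns true if the directory exists or was created, but
            // fails while HDFS is read-only, so it doubles as a safe-mode probe.
            if (!fs.mkdirs(dir)) {
              throw new RuntimeException("Could not create directory: " + dir);
            }
            break; // success
          } catch (RemoteException e) {
            // The server-side exception arrives by name; anything other than
            // SafeModeException is rethrown unchanged.
            if (!"org.apache.hadoop.hdfs.server.namenode.SafeModeException"
                .equals(e.getClassName())) {
              throw e;
            }
            Thread.sleep(5000); // wait and retry, as the commit does
          }
        }
      }
    }

Matching on e.getClassName() rather than catching SafeModeException directly follows RemoteException's design: the server-side exception crosses the RPC boundary as a class name plus message, so clients compare names instead of types.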
@@ -28,9 +28,9 @@ import org.apache.hadoop.fs.FSDataInputStream;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.ipc.RemoteException;
 import org.apache.lucene.store.BaseDirectory;
 import org.apache.lucene.store.BufferedIndexOutput;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IOContext;
 import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.store.IndexOutput;
@@ -58,14 +58,37 @@ public class HdfsDirectory extends BaseDirectory {
     this.hdfsDirPath = hdfsDirPath;
     this.configuration = configuration;
     fileSystem = FileSystem.newInstance(hdfsDirPath.toUri(), configuration);
-    try {
-      if (!fileSystem.exists(hdfsDirPath)) {
-        fileSystem.mkdirs(hdfsDirPath);
+    
+    while (true) {
+      try {
+        if (!fileSystem.exists(hdfsDirPath)) {
+          boolean success = fileSystem.mkdirs(hdfsDirPath);
+          if (!success) {
+            throw new RuntimeException("Could not create directory: " + hdfsDirPath);
+          }
+        } else {
+          fileSystem.mkdirs(hdfsDirPath); // check for safe mode
+        }
+        
+        break;
+      } catch (RemoteException e) {
+        if (e.getClassName().equals("org.apache.hadoop.hdfs.server.namenode.SafeModeException")) {
+          LOG.warn("The NameNode is in SafeMode - Solr will wait 5 seconds and try again.");
+          try {
+            Thread.sleep(5000);
+          } catch (InterruptedException e1) {
+            Thread.interrupted();
+          }
+          continue;
+        }
+        org.apache.solr.util.IOUtils.closeQuietly(fileSystem);
+        throw new RuntimeException(
+            "Problem creating directory: " + hdfsDirPath, e);
+      } catch (Exception e) {
+        org.apache.solr.util.IOUtils.closeQuietly(fileSystem);
+        throw new RuntimeException(
+            "Problem creating directory: " + hdfsDirPath, e);
       }
-    } catch (Exception e) {
-      org.apache.solr.util.IOUtils.closeQuietly(fileSystem);
-      throw new RuntimeException("Problem creating directory: " + hdfsDirPath,
-          e);
     }
 
@@ -24,6 +24,7 @@ import org.apache.hadoop.fs.FSDataOutputStream;
 import org.apache.hadoop.fs.FileAlreadyExistsException;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.ipc.RemoteException;
 import org.apache.lucene.store.Lock;
 import org.apache.lucene.store.LockFactory;
 import org.apache.lucene.store.LockReleaseFailedException;
@@ -59,16 +60,31 @@ public class HdfsLockFactory extends LockFactory {
     FileSystem fs = null;
     try {
       fs = FileSystem.newInstance(lockPath.toUri(), configuration);
-      if (fs.exists(lockPath)) {
-        if (lockPrefix != null) {
-          lockName = lockPrefix + "-" + lockName;
-        }
-        
-        Path lockFile = new Path(lockPath, lockName);
-        
-        if (fs.exists(lockFile) && !fs.delete(lockFile, false)) {
-          throw new IOException("Cannot delete " + lockFile);
+      while (true) {
+        if (fs.exists(lockPath)) {
+          if (lockPrefix != null) {
+            lockName = lockPrefix + "-" + lockName;
+          }
+          
+          Path lockFile = new Path(lockPath, lockName);
+          try {
+            if (fs.exists(lockFile) && !fs.delete(lockFile, false)) {
+              throw new IOException("Cannot delete " + lockFile);
+            }
+          } catch (RemoteException e) {
+            if (e.getClassName().equals(
+                "org.apache.hadoop.hdfs.server.namenode.SafeModeException")) {
+              log.warn("The NameNode is in SafeMode - Solr will wait 5 seconds and try again.");
+              try {
+                Thread.sleep(5000);
+              } catch (InterruptedException e1) {
+                Thread.interrupted();
+              }
+              continue;
+            }
+            throw e;
+          }
+          break;
         }
       }
     } finally {
@@ -99,20 +115,46 @@ public class HdfsLockFactory extends LockFactory {
     @Override
     public boolean obtain() throws IOException {
       FSDataOutputStream file = null;
-      FileSystem fs = null;
+      FileSystem fs = FileSystem.newInstance(lockPath.toUri(), conf);
       try {
-        fs = FileSystem.newInstance(lockPath.toUri(), conf);
-        if (!fs.exists(lockPath)) {
-          fs.mkdirs(lockPath);
-        }
-        file = fs.create(new Path(lockPath, lockName), false);
-      } catch (FileAlreadyExistsException e) {
-        return false;
-      }catch (IOException e) {
-        log.error("Error creating lock file", e);
-        return false;
+        while (true) {
+          try {
+            if (!fs.exists(lockPath)) {
+              boolean success = fs.mkdirs(lockPath);
+              if (!success) {
+                throw new RuntimeException("Could not create directory: " + lockPath);
+              }
+            } else {
+              // just to check for safe mode
+              fs.mkdirs(lockPath);
+            }
+            
+            
+            file = fs.create(new Path(lockPath, lockName), false);
+            break;
+          } catch (FileAlreadyExistsException e) {
+            return false;
+          } catch (RemoteException e) {
+            if (e.getClassName().equals(
+                "org.apache.hadoop.hdfs.server.namenode.SafeModeException")) {
+              log.warn("The NameNode is in SafeMode - Solr will wait 5 seconds and try again.");
+              try {
+                Thread.sleep(5000);
+              } catch (InterruptedException e1) {
+                Thread.interrupted();
+              }
+              continue;
+            }
+            log.error("Error creating lock file", e);
+            return false;
+          } catch (IOException e) {
+            log.error("Error creating lock file", e);
+            return false;
+          } finally {
+            IOUtils.closeQuietly(file);
+          }
+        }
       } finally {
-        IOUtils.closeQuietly(file);
         IOUtils.closeQuietly(fs);
       }
       return true;
@@ -31,6 +31,7 @@ import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.PathFilter;
+import org.apache.hadoop.ipc.RemoteException;
 import org.apache.lucene.util.BytesRef;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.SolrException.ErrorCode;
@@ -142,16 +143,33 @@ public class HdfsUpdateLog extends UpdateLog {
     }
     lastDataDir = dataDir;
     tlogDir = new Path(dataDir, TLOG_NAME);
     
-    try {
-      if (!fs.exists(tlogDir)) {
-        boolean success = fs.mkdirs(tlogDir);
-        if (!success) {
-          throw new RuntimeException("Could not create directory:" + tlogDir);
+    while (true) {
+      try {
+        if (!fs.exists(tlogDir)) {
+          boolean success = fs.mkdirs(tlogDir);
+          if (!success) {
+            throw new RuntimeException("Could not create directory:" + tlogDir);
+          }
+        } else {
+          fs.mkdirs(tlogDir); // To check for safe mode
+        }
+        break;
+      } catch (RemoteException e) {
+        if (e.getClassName().equals(
+            "org.apache.hadoop.hdfs.server.namenode.SafeModeException")) {
+          log.warn("The NameNode is in SafeMode - Solr will wait 5 seconds and try again.");
+          try {
+            Thread.sleep(5000);
+          } catch (InterruptedException e1) {
+            Thread.interrupted();
+          }
+          continue;
+        }
+        throw new RuntimeException(
+            "Problem creating directory: " + tlogDir, e);
+      } catch (IOException e) {
+        throw new RuntimeException("Problem creating directory: " + tlogDir, e);
       }
-    } catch (IOException e) {
-      throw new RuntimeException(e);
     }
     
     tlogFiles = getLogList(fs, tlogDir);
@@ -4,12 +4,16 @@ import java.io.File;
 import java.io.IOException;
 import java.net.URI;
 import java.util.Locale;
 import java.util.Map;
+import java.util.Timer;
+import java.util.TimerTask;
+import java.util.concurrent.ConcurrentHashMap;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.solr.SolrTestCaseJ4;
 import org.junit.Assert;
 
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -32,6 +36,8 @@ public class HdfsTestUtil {
 
   private static Locale savedLocale;
 
+  private static Map<MiniDFSCluster,Timer> timers = new ConcurrentHashMap<MiniDFSCluster,Timer>();
+
   public static MiniDFSCluster setupClass(String dataDir) throws Exception {
     LuceneTestCase.assumeFalse("HDFS tests were disabled by -Dtests.disableHdfs",
         Boolean.parseBoolean(System.getProperty("tests.disableHdfs", "false")));
@@ -58,7 +64,22 @@ public class HdfsTestUtil {
 
     System.setProperty("solr.hdfs.home", "/solr_hdfs_home");
 
-    MiniDFSCluster dfsCluster = new MiniDFSCluster(conf, dataNodes, true, null);
+    final MiniDFSCluster dfsCluster = new MiniDFSCluster(conf, dataNodes, true, null);
+    dfsCluster.waitActive();
+    
+    NameNodeAdapter.enterSafeMode(dfsCluster.getNameNode(), false);
+    
+    int rnd = LuceneTestCase.random().nextInt(10000);
+    Timer timer = new Timer();
+    timer.schedule(new TimerTask() {
+      
+      @Override
+      public void run() {
+        NameNodeAdapter.leaveSafeMode(dfsCluster.getNameNode());
+      }
+    }, rnd);
+    
+    timers.put(dfsCluster, timer);
 
     SolrTestCaseJ4.useFactory("org.apache.solr.core.HdfsDirectoryFactory");
 
@@ -72,6 +93,7 @@ public class HdfsTestUtil {
     System.clearProperty("test.cache.data");
     System.clearProperty("solr.hdfs.home");
     if (dfsCluster != null) {
+      timers.remove(dfsCluster);
      dfsCluster.shutdown();
     }
 
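With the two HdfsTestUtil hunks above, every suite that sets up through this utility now boots its MiniDFSCluster in safe mode and leaves it after a random delay of up to ten seconds, so ordinary test startup exercises the new retry paths. A hypothetical skeleton of such a suite, assuming the setupClass/teardownClass signatures shown above; the class name and scratch path are illustrative:

    import org.apache.hadoop.hdfs.MiniDFSCluster;
    import org.apache.solr.SolrTestCaseJ4;
    import org.junit.AfterClass;
    import org.junit.BeforeClass;

    public class HdfsSafeModeExampleTest extends SolrTestCaseJ4 {

      private static MiniDFSCluster dfsCluster;

      @BeforeClass
      public static void beforeClass() throws Exception {
        // The cluster comes up in safe mode; HdfsTestUtil's Timer leaves it
        // after a random delay of 0-10 seconds.
        dfsCluster = HdfsTestUtil.setupClass("/tmp/hdfs-safe-mode-example");
      }

      @AfterClass
      public static void afterClass() throws Exception {
        // Also drops the cluster's Timer from HdfsTestUtil's timers map.
        HdfsTestUtil.teardownClass(dfsCluster);
        dfsCluster = null;
      }
    }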
@@ -23,11 +23,15 @@ import java.net.URI;
 import java.net.URISyntaxException;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.Timer;
+import java.util.TimerTask;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
+import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.LuceneTestCase.Slow;
 import org.apache.solr.client.solrj.SolrQuery;
 import org.apache.solr.client.solrj.SolrServer;
@@ -35,6 +39,7 @@ import org.apache.solr.client.solrj.SolrServerException;
 import org.apache.solr.client.solrj.impl.HttpSolrServer;
 import org.apache.solr.client.solrj.request.QueryRequest;
 import org.apache.solr.cloud.BasicDistributedZkTest;
+import org.apache.solr.cloud.ChaosMonkey;
 import org.apache.solr.common.params.CollectionParams.CollectionAction;
 import org.apache.solr.common.params.ModifiableSolrParams;
 import org.apache.solr.common.util.NamedList;
@@ -52,6 +57,9 @@ public class StressHdfsTest extends BasicDistributedZkTest {
   private static final String DELETE_DATA_DIR_COLLECTION = "delete_data_dir";
   private static MiniDFSCluster dfsCluster;
 
+  
+  private boolean testRestartIntoSafeMode;
+  
   @BeforeClass
   public static void setupClass() throws Exception {
 
@@ -68,7 +76,6 @@ public class StressHdfsTest extends BasicDistributedZkTest {
     dfsCluster = null;
   }
 
-
   @Override
   protected String getDataDir(String dataDir) throws IOException {
     return HdfsTestUtil.getDataDir(dfsCluster, dataDir);
@@ -78,6 +85,7 @@ public class StressHdfsTest extends BasicDistributedZkTest {
     super();
     sliceCount = 1;
     shardCount = TEST_NIGHTLY ? 7 : random().nextInt(2) + 1;
+    testRestartIntoSafeMode = random().nextBoolean();
   }
 
   protected String getSolrXml() {
@@ -90,6 +98,31 @@ public class StressHdfsTest extends BasicDistributedZkTest {
     for (int i = 0; i < cnt; i++) {
       createAndDeleteCollection();
     }
+    
+    if (testRestartIntoSafeMode) {
+      createCollection(DELETE_DATA_DIR_COLLECTION, 1, 1, 1);
+      
+      waitForRecoveriesToFinish(DELETE_DATA_DIR_COLLECTION, false);
+      
+      ChaosMonkey.stop(jettys.get(0));
+      
+      // enter safe mode and restart a node
+      NameNodeAdapter.enterSafeMode(dfsCluster.getNameNode(), false);
+      
+      int rnd = LuceneTestCase.random().nextInt(10000);
+      Timer timer = new Timer();
+      timer.schedule(new TimerTask() {
+        
+        @Override
+        public void run() {
+          NameNodeAdapter.leaveSafeMode(dfsCluster.getNameNode());
+        }
+      }, rnd);
+      
+      ChaosMonkey.start(jettys.get(0));
+      
+      waitForRecoveriesToFinish(DELETE_DATA_DIR_COLLECTION, false);
+    }
   }
 
   private void createAndDeleteCollection() throws SolrServerException,
@@ -90,8 +90,8 @@ public class TestRecoveryHdfs extends SolrTestCaseJ4 {
       throw new RuntimeException(e);
     }
 
-    hdfsDataDir = hdfsUri + "/solr/shard1";
-    System.setProperty("solr.data.dir", hdfsUri + "/solr/shard1");
+    //hdfsDataDir = hdfsUri + "/solr/shard1";
+    // System.setProperty("solr.data.dir", hdfsUri + "/solr/shard1");
     System.setProperty("solr.ulog.dir", hdfsUri + "/solr/shard1");
 
     initCore("solrconfig-tlog.xml","schema15.xml");