From d9e7f46ef9fbda5a96936fcf53028725555a0d51 Mon Sep 17 00:00:00 2001
From: Mark Robert Miller
Date: Wed, 22 Jan 2014 22:39:10 +0000
Subject: [PATCH] SOLR-5657: When a SolrCore starts on HDFS, it should
 gracefully handle HDFS being in safe mode.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1560553 13f79535-47bb-0310-9956-ffa450edef68
---
 solr/CHANGES.txt                              |  3 +
 .../apache/solr/store/hdfs/HdfsDirectory.java | 39 +++++++--
 .../solr/store/hdfs/HdfsLockFactory.java      | 84 ++++++++++++++-----
 .../org/apache/solr/update/HdfsUpdateLog.java | 34 ++++++--
 .../apache/solr/cloud/hdfs/HdfsTestUtil.java  | 26 +++++-
 .../solr/cloud/hdfs/StressHdfsTest.java       | 35 +++++++-
 .../apache/solr/search/TestRecoveryHdfs.java  |  4 +-
 7 files changed, 183 insertions(+), 42 deletions(-)

diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index c5043c72667..7c51c9106b0 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -219,6 +219,9 @@ Bug Fixes
 * SOLR-5650: When a replica becomes a leader, only peer sync with other replicas
   that last published an ACTIVE state. (Mark Miller)
 
+* SOLR-5657: When a SolrCore starts on HDFS, it should gracefully handle HDFS
+  being in safe mode. (Mark Miller)
+
 Optimizations
 ----------------------
 
diff --git a/solr/core/src/java/org/apache/solr/store/hdfs/HdfsDirectory.java b/solr/core/src/java/org/apache/solr/store/hdfs/HdfsDirectory.java
index af8c0973a4b..3e5d7711d52 100644
--- a/solr/core/src/java/org/apache/solr/store/hdfs/HdfsDirectory.java
+++ b/solr/core/src/java/org/apache/solr/store/hdfs/HdfsDirectory.java
@@ -28,9 +28,9 @@ import org.apache.hadoop.fs.FSDataInputStream;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.ipc.RemoteException;
 import org.apache.lucene.store.BaseDirectory;
 import org.apache.lucene.store.BufferedIndexOutput;
-import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IOContext;
 import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.store.IndexOutput;
@@ -58,14 +58,37 @@ public class HdfsDirectory extends BaseDirectory {
     this.hdfsDirPath = hdfsDirPath;
     this.configuration = configuration;
     fileSystem = FileSystem.newInstance(hdfsDirPath.toUri(), configuration);
-    try {
-      if (!fileSystem.exists(hdfsDirPath)) {
-        fileSystem.mkdirs(hdfsDirPath);
+    
+    while (true) {
+      try {
+        if (!fileSystem.exists(hdfsDirPath)) {
+          boolean success = fileSystem.mkdirs(hdfsDirPath);
+          if (!success) {
+            throw new RuntimeException("Could not create directory: " + hdfsDirPath);
+          }
+        } else {
+          fileSystem.mkdirs(hdfsDirPath); // check for safe mode
+        }
+        
+        break;
+      } catch (RemoteException e) {
+        if (e.getClassName().equals("org.apache.hadoop.hdfs.server.namenode.SafeModeException")) {
+          LOG.warn("The NameNode is in SafeMode - Solr will wait 5 seconds and try again.");
+          try {
+            Thread.sleep(5000);
+          } catch (InterruptedException e1) {
+            Thread.interrupted();
+          }
+          continue;
+        }
+        org.apache.solr.util.IOUtils.closeQuietly(fileSystem);
+        throw new RuntimeException(
+            "Problem creating directory: " + hdfsDirPath, e);
+      } catch (Exception e) {
+        org.apache.solr.util.IOUtils.closeQuietly(fileSystem);
+        throw new RuntimeException(
+            "Problem creating directory: " + hdfsDirPath, e);
       }
-    } catch (Exception e) {
-      org.apache.solr.util.IOUtils.closeQuietly(fileSystem);
-      throw new RuntimeException("Problem creating directory: " + hdfsDirPath,
-          e);
     }
   }
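The HdfsDirectory constructor change above is the core of the fix: instead of failing the first exists/mkdirs call while the NameNode is still in safe mode, it loops, matching the remote SafeModeException by class name and sleeping five seconds between attempts. A minimal standalone sketch of the same pattern (the class and method names here are illustrative, not part of the patch):

    import java.io.IOException;

    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.ipc.RemoteException;

    // Illustrative sketch of the retry pattern introduced by this patch.
    public class SafeModeRetryExample {
      private static final String SAFE_MODE_EXCEPTION =
          "org.apache.hadoop.hdfs.server.namenode.SafeModeException";

      // Blocks until the NameNode leaves safe mode, then ensures dir exists.
      public static void ensureDirectory(FileSystem fs, Path dir) throws IOException {
        while (true) {
          try {
            // mkdirs succeeds on an existing directory in normal operation,
            // but throws SafeModeException while the NameNode is read-only,
            // so it doubles as a safe-mode probe.
            if (!fs.mkdirs(dir)) {
              throw new IOException("Could not create directory: " + dir);
            }
            return;
          } catch (RemoteException e) {
            if (!SAFE_MODE_EXCEPTION.equals(e.getClassName())) {
              throw e;
            }
            try {
              Thread.sleep(5000); // NameNode in safe mode; wait and retry
            } catch (InterruptedException ie) {
              Thread.currentThread().interrupt();
              throw new IOException("Interrupted waiting for safe mode exit", ie);
            }
          }
        }
      }
    }

The same loop recurs, with call-site-specific cleanup, in HdfsLockFactory and HdfsUpdateLog below.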
diff --git a/solr/core/src/java/org/apache/solr/store/hdfs/HdfsLockFactory.java b/solr/core/src/java/org/apache/solr/store/hdfs/HdfsLockFactory.java
index d46965511e5..23b4bc3c3c2 100644
--- a/solr/core/src/java/org/apache/solr/store/hdfs/HdfsLockFactory.java
+++ b/solr/core/src/java/org/apache/solr/store/hdfs/HdfsLockFactory.java
@@ -24,6 +24,7 @@ import org.apache.hadoop.fs.FSDataOutputStream;
 import org.apache.hadoop.fs.FileAlreadyExistsException;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.ipc.RemoteException;
 import org.apache.lucene.store.Lock;
 import org.apache.lucene.store.LockFactory;
 import org.apache.lucene.store.LockReleaseFailedException;
@@ -59,16 +60,31 @@ public class HdfsLockFactory extends LockFactory {
     FileSystem fs = null;
     try {
       fs = FileSystem.newInstance(lockPath.toUri(), configuration);
-      
-      if (fs.exists(lockPath)) {
-        if (lockPrefix != null) {
-          lockName = lockPrefix + "-" + lockName;
-        }
-        
-        Path lockFile = new Path(lockPath, lockName);
-        
-        if (fs.exists(lockFile) && !fs.delete(lockFile, false)) {
-          throw new IOException("Cannot delete " + lockFile);
+      while (true) {
+        if (fs.exists(lockPath)) {
+          if (lockPrefix != null) {
+            lockName = lockPrefix + "-" + lockName;
+          }
+          
+          Path lockFile = new Path(lockPath, lockName);
+          try {
+            if (fs.exists(lockFile) && !fs.delete(lockFile, false)) {
+              throw new IOException("Cannot delete " + lockFile);
+            }
+          } catch (RemoteException e) {
+            if (e.getClassName().equals(
+                "org.apache.hadoop.hdfs.server.namenode.SafeModeException")) {
+              log.warn("The NameNode is in SafeMode - Solr will wait 5 seconds and try again.");
+              try {
+                Thread.sleep(5000);
+              } catch (InterruptedException e1) {
+                Thread.interrupted();
+              }
+              continue;
+            }
+            throw e;
+          }
+          break;
         }
       }
     } finally {
@@ -99,20 +115,46 @@ public class HdfsLockFactory extends LockFactory {
     @Override
     public boolean obtain() throws IOException {
       FSDataOutputStream file = null;
-      FileSystem fs = null;
+      FileSystem fs = FileSystem.newInstance(lockPath.toUri(), conf);
       try {
-        fs = FileSystem.newInstance(lockPath.toUri(), conf);
-        if (!fs.exists(lockPath)) {
-          fs.mkdirs(lockPath);
+        while (true) {
+          try {
+            if (!fs.exists(lockPath)) {
+              boolean success = fs.mkdirs(lockPath);
+              if (!success) {
+                throw new RuntimeException("Could not create directory: " + lockPath);
+              }
+            } else {
+              // just to check for safe mode
+              fs.mkdirs(lockPath);
+            }
+            
+            
+            file = fs.create(new Path(lockPath, lockName), false);
+            break;
+          } catch (FileAlreadyExistsException e) {
+            return false;
+          } catch (RemoteException e) {
+            if (e.getClassName().equals(
+                "org.apache.hadoop.hdfs.server.namenode.SafeModeException")) {
+              log.warn("The NameNode is in SafeMode - Solr will wait 5 seconds and try again.");
+              try {
+                Thread.sleep(5000);
+              } catch (InterruptedException e1) {
+                Thread.interrupted();
+              }
+              continue;
+            }
+            log.error("Error creating lock file", e);
+            return false;
+          } catch (IOException e) {
+            log.error("Error creating lock file", e);
+            return false;
+          } finally {
+            IOUtils.closeQuietly(file);
+          }
         }
-        file = fs.create(new Path(lockPath, lockName), false);
-      } catch (FileAlreadyExistsException e) {
-        return false;
-      }catch (IOException e) {
-        log.error("Error creating lock file", e);
-        return false;
       } finally {
-        IOUtils.closeQuietly(file);
         IOUtils.closeQuietly(fs);
       }
       return true;
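In obtain() above, the catch ordering matters: FileAlreadyExistsException means another core already holds the lock file and must return false immediately, SafeModeException is retried, and only then do remaining IOExceptions fail the lock attempt. The retry logic could also be factored into a reusable wrapper; a sketch under the same assumptions (the SafeModeRetry name is illustrative, not part of the patch):

    import java.util.concurrent.Callable;

    import org.apache.hadoop.ipc.RemoteException;

    // Illustrative helper: run an HDFS operation, retrying while the
    // NameNode reports safe mode, and rethrowing anything else.
    public final class SafeModeRetry {
      private static final String SAFE_MODE_EXCEPTION =
          "org.apache.hadoop.hdfs.server.namenode.SafeModeException";

      public static <T> T withRetry(Callable<T> op) throws Exception {
        while (true) {
          try {
            return op.call();
          } catch (RemoteException e) {
            if (!SAFE_MODE_EXCEPTION.equals(e.getClassName())) {
              throw e;
            }
            Thread.sleep(5000); // NameNode in safe mode; wait and retry
          }
        }
      }
    }

The patch instead repeats the loop inline at each call site, which keeps the interrupt handling and cleanup (closeQuietly of the FileSystem and lock stream) local to each caller.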
diff --git a/solr/core/src/java/org/apache/solr/update/HdfsUpdateLog.java b/solr/core/src/java/org/apache/solr/update/HdfsUpdateLog.java
index 097a7c4d682..6db18c45357 100644
--- a/solr/core/src/java/org/apache/solr/update/HdfsUpdateLog.java
+++ b/solr/core/src/java/org/apache/solr/update/HdfsUpdateLog.java
@@ -31,6 +31,7 @@ import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.PathFilter;
+import org.apache.hadoop.ipc.RemoteException;
 import org.apache.lucene.util.BytesRef;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.SolrException.ErrorCode;
@@ -142,16 +143,33 @@ public class HdfsUpdateLog extends UpdateLog {
     }
     lastDataDir = dataDir;
     tlogDir = new Path(dataDir, TLOG_NAME);
-    
-    try {
-      if (!fs.exists(tlogDir)) {
-        boolean success = fs.mkdirs(tlogDir);
-        if (!success) {
-          throw new RuntimeException("Could not create directory:" + tlogDir);
+    while (true) {
+      try {
+        if (!fs.exists(tlogDir)) {
+          boolean success = fs.mkdirs(tlogDir);
+          if (!success) {
+            throw new RuntimeException("Could not create directory:" + tlogDir);
+          }
+        } else {
+          fs.mkdirs(tlogDir); // To check for safe mode
         }
+        break;
+      } catch (RemoteException e) {
+        if (e.getClassName().equals(
+            "org.apache.hadoop.hdfs.server.namenode.SafeModeException")) {
+          log.warn("The NameNode is in SafeMode - Solr will wait 5 seconds and try again.");
+          try {
+            Thread.sleep(5000);
+          } catch (InterruptedException e1) {
+            Thread.interrupted();
+          }
+          continue;
+        }
+        throw new RuntimeException(
+            "Problem creating directory: " + tlogDir, e);
+      } catch (IOException e) {
+        throw new RuntimeException("Problem creating directory: " + tlogDir, e);
      }
-    } catch (IOException e) {
-      throw new RuntimeException(e);
     }
     
     tlogFiles = getLogList(fs, tlogDir);
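As in HdfsDirectory, the else branch above calls fs.mkdirs(tlogDir) on a directory that already exists purely to force a SafeModeException while the NameNode is read-only. When the FileSystem is known to be HDFS, the NameNode can also be asked directly; a sketch of that alternative (not what the patch does, presumably because these call sites stay on the generic FileSystem API; assumes the Hadoop 2.x HdfsConstants location):

    import java.io.IOException;

    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.hdfs.DistributedFileSystem;
    import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction;

    // Illustrative alternative: query the NameNode instead of probing with mkdirs.
    public class SafeModeProbe {
      public static boolean inSafeMode(FileSystem fs) throws IOException {
        if (fs instanceof DistributedFileSystem) {
          // SAFEMODE_GET only queries the state; it does not change it.
          return ((DistributedFileSystem) fs).setSafeMode(SafeModeAction.SAFEMODE_GET);
        }
        return false; // local and other file systems have no safe mode
      }
    }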
diff --git a/solr/core/src/test/org/apache/solr/cloud/hdfs/HdfsTestUtil.java b/solr/core/src/test/org/apache/solr/cloud/hdfs/HdfsTestUtil.java
index fdee1efb5e8..1788aa715ad 100644
--- a/solr/core/src/test/org/apache/solr/cloud/hdfs/HdfsTestUtil.java
+++ b/solr/core/src/test/org/apache/solr/cloud/hdfs/HdfsTestUtil.java
@@ -4,12 +4,16 @@ import java.io.File;
 import java.io.IOException;
 import java.net.URI;
 import java.util.Locale;
+import java.util.Map;
+import java.util.Timer;
+import java.util.TimerTask;
+import java.util.concurrent.ConcurrentHashMap;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.solr.SolrTestCaseJ4;
-import org.junit.Assert;
 
 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -31,6 +35,8 @@ import org.junit.Assert;
 public class HdfsTestUtil {
   
   private static Locale savedLocale;
+  
+  private static Map<MiniDFSCluster,Timer> timers = new ConcurrentHashMap<MiniDFSCluster,Timer>();
 
   public static MiniDFSCluster setupClass(String dataDir) throws Exception {
     LuceneTestCase.assumeFalse("HDFS tests were disabled by -Dtests.disableHdfs",
@@ -58,7 +64,22 @@ public class HdfsTestUtil {
     
     System.setProperty("solr.hdfs.home", "/solr_hdfs_home");
     
-    MiniDFSCluster dfsCluster = new MiniDFSCluster(conf, dataNodes, true, null);
+    final MiniDFSCluster dfsCluster = new MiniDFSCluster(conf, dataNodes, true, null);
+    dfsCluster.waitActive();
+    
+    NameNodeAdapter.enterSafeMode(dfsCluster.getNameNode(), false);
+    
+    int rnd = LuceneTestCase.random().nextInt(10000);
+    Timer timer = new Timer();
+    timer.schedule(new TimerTask() {
+      
+      @Override
+      public void run() {
+        NameNodeAdapter.leaveSafeMode(dfsCluster.getNameNode());
+      }
+    }, rnd);
+    
+    timers.put(dfsCluster, timer);
     
     SolrTestCaseJ4.useFactory("org.apache.solr.core.HdfsDirectoryFactory");
     
@@ -72,6 +93,7 @@ public class HdfsTestUtil {
     System.clearProperty("test.cache.data");
     System.clearProperty("solr.hdfs.home");
     if (dfsCluster != null) {
+      timers.remove(dfsCluster);
       dfsCluster.shutdown();
     }
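The test utility above exercises the new code by bringing the MiniDFSCluster up inside safe mode and scheduling a one-shot Timer to release it after a random delay of up to ten seconds, so core startup races against safe-mode exit; the Timer is kept in the timers map so teardownClass can drop it with the cluster. The harness pattern in isolation (the SafeModeWindow wrapper is illustrative; the NameNodeAdapter signatures are those used above, from the Hadoop test jars):

    import java.util.Timer;
    import java.util.TimerTask;

    import org.apache.hadoop.hdfs.MiniDFSCluster;
    import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;

    // Illustrative wrapper around the patch's test pattern: enter safe mode
    // now, leave it from a background timer after delayMillis.
    public class SafeModeWindow {
      public static Timer open(final MiniDFSCluster cluster, long delayMillis) {
        NameNodeAdapter.enterSafeMode(cluster.getNameNode(), false);
        Timer timer = new Timer();
        timer.schedule(new TimerTask() {
          @Override
          public void run() {
            NameNodeAdapter.leaveSafeMode(cluster.getNameNode());
          }
        }, delayMillis);
        return timer; // caller should keep this so it can cancel at teardown
      }
    }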
diff --git a/solr/core/src/test/org/apache/solr/cloud/hdfs/StressHdfsTest.java b/solr/core/src/test/org/apache/solr/cloud/hdfs/StressHdfsTest.java
index 3c93d2e3065..6ac7e886c6b 100644
--- a/solr/core/src/test/org/apache/solr/cloud/hdfs/StressHdfsTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/hdfs/StressHdfsTest.java
@@ -23,11 +23,15 @@ import java.net.URI;
 import java.net.URISyntaxException;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.Timer;
+import java.util.TimerTask;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
+import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.LuceneTestCase.Slow;
 import org.apache.solr.client.solrj.SolrQuery;
 import org.apache.solr.client.solrj.SolrServer;
@@ -35,6 +39,7 @@ import org.apache.solr.client.solrj.SolrServerException;
 import org.apache.solr.client.solrj.impl.HttpSolrServer;
 import org.apache.solr.client.solrj.request.QueryRequest;
 import org.apache.solr.cloud.BasicDistributedZkTest;
+import org.apache.solr.cloud.ChaosMonkey;
 import org.apache.solr.common.params.CollectionParams.CollectionAction;
 import org.apache.solr.common.params.ModifiableSolrParams;
 import org.apache.solr.common.util.NamedList;
@@ -52,6 +57,9 @@ public class StressHdfsTest extends BasicDistributedZkTest {
   
   private static final String DELETE_DATA_DIR_COLLECTION = "delete_data_dir";
   private static MiniDFSCluster dfsCluster;
+  
+  private boolean testRestartIntoSafeMode;
+  
   @BeforeClass
   public static void setupClass() throws Exception {
@@ -67,7 +75,6 @@ public class StressHdfsTest extends BasicDistributedZkTest {
     System.clearProperty("solr.hdfs.home");
     dfsCluster = null;
   }
-  
   @Override
   protected String getDataDir(String dataDir) throws IOException {
@@ -78,6 +85,7 @@ public class StressHdfsTest extends BasicDistributedZkTest {
     super();
     sliceCount = 1;
     shardCount = TEST_NIGHTLY ? 7 : random().nextInt(2) + 1;
+    testRestartIntoSafeMode = random().nextBoolean();
   }
   
   protected String getSolrXml() {
@@ -90,6 +98,31 @@ public class StressHdfsTest extends BasicDistributedZkTest {
     for (int i = 0; i < cnt; i++) {
       createAndDeleteCollection();
     }
+    
+    if (testRestartIntoSafeMode) {
+      createCollection(DELETE_DATA_DIR_COLLECTION, 1, 1, 1);
+      
+      waitForRecoveriesToFinish(DELETE_DATA_DIR_COLLECTION, false);
+      
+      ChaosMonkey.stop(jettys.get(0));
+      
+      // enter safe mode and restart a node
+      NameNodeAdapter.enterSafeMode(dfsCluster.getNameNode(), false);
+      
+      int rnd = LuceneTestCase.random().nextInt(10000);
+      Timer timer = new Timer();
+      timer.schedule(new TimerTask() {
+        
+        @Override
+        public void run() {
+          NameNodeAdapter.leaveSafeMode(dfsCluster.getNameNode());
+        }
+      }, rnd);
+      
+      ChaosMonkey.start(jettys.get(0));
+      
+      waitForRecoveriesToFinish(DELETE_DATA_DIR_COLLECTION, false);
+    }
   }
   
   private void createAndDeleteCollection() throws SolrServerException,
diff --git a/solr/core/src/test/org/apache/solr/search/TestRecoveryHdfs.java b/solr/core/src/test/org/apache/solr/search/TestRecoveryHdfs.java
index 69a849ef0ab..2a86d45d702 100644
--- a/solr/core/src/test/org/apache/solr/search/TestRecoveryHdfs.java
+++ b/solr/core/src/test/org/apache/solr/search/TestRecoveryHdfs.java
@@ -90,8 +90,8 @@ public class TestRecoveryHdfs extends SolrTestCaseJ4 {
       throw new RuntimeException(e);
     }
     
-    hdfsDataDir = hdfsUri + "/solr/shard1";
-    System.setProperty("solr.data.dir", hdfsUri + "/solr/shard1");
+    //hdfsDataDir = hdfsUri + "/solr/shard1";
+    // System.setProperty("solr.data.dir", hdfsUri + "/solr/shard1");
     System.setProperty("solr.ulog.dir", hdfsUri + "/solr/shard1");
     
     initCore("solrconfig-tlog.xml","schema15.xml");
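StressHdfsTest applies the same safe-mode window while a node restarts under ChaosMonkey, and the production loops above retry indefinitely by design. A test that would rather fail than hang if the NameNode never leaves safe mode can bound the wait; an illustrative helper (the name, polling interval, and timeout are assumptions, not part of the patch):

    import java.io.IOException;

    import org.apache.hadoop.hdfs.DistributedFileSystem;
    import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction;

    // Illustrative test helper: poll safe-mode state with a deadline.
    public class SafeModeWait {
      public static void awaitExit(DistributedFileSystem dfs, long timeoutMillis)
          throws IOException, InterruptedException {
        long deadline = System.currentTimeMillis() + timeoutMillis;
        while (dfs.setSafeMode(SafeModeAction.SAFEMODE_GET)) {
          if (System.currentTimeMillis() > deadline) {
            throw new IOException("NameNode still in safe mode after " + timeoutMillis + " ms");
          }
          Thread.sleep(250);
        }
      }
    }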