SOLR-12801: Make massive improvements to the tests.

SOLR-12804: Remove static modifier from Overseer queue access.

SOLR-12896: Introduce more checks for shutdown and closed to improve clean close and shutdown. (Partial)

SOLR-12897: Introduce AlreadyClosedException to clean up silly close / shutdown logging. (Partial)

SOLR-12898: Replace cluster state polling with ZkStateReader#waitFor. (Partial)

SOLR-12923: The new AutoScaling tests are way too flaky and need special attention. (Partial)

SOLR-12932: ant test (without badapples=false) should pass easily for developers. (Partial)

SOLR-12933: Fix SolrCloud distributed commit.
This commit is contained in:
markrmiller 2018-11-29 11:58:18 -06:00
parent 81c092d826
commit 75b1831967
349 changed files with 6840 additions and 4129 deletions

View File

@ -91,4 +91,8 @@ grant {
permission javax.security.auth.kerberos.ServicePermission "HTTP/127.0.0.1@EXAMPLE.COM", "accept";
permission javax.security.auth.kerberos.DelegationPermission "\"HTTP/127.0.0.1@EXAMPLE.COM\" \"krbtgt/EXAMPLE.COM@EXAMPLE.COM\"";
// java 8 accessibility requires this perm - should not after 8 I believe (rrd4j is the root reason we hit an accessibility code path)
permission java.awt.AWTPermission "listenToAllAWTEvents";
permission java.awt.AWTPermission "accessEventQueue";
};

View File

@ -131,16 +131,15 @@ New Features
----------------------
(No Changes)
Other Changes
----------------------
* SOLR-12972: deprecate unused SolrIndexConfig.luceneVersion (Christine Poerschke)
Bug Fixes
----------------------
* SOLR-12546: CVSResponseWriter omits useDocValuesAsStored=true field when fl=*
(Munendra S N via Mikhail Khludnev)
* SOLR-12933: Fix SolrCloud distributed commit. (Mark Miller)
Improvements
----------------------
@ -149,6 +148,25 @@ Improvements
* SOLR-12992: When using binary format, ExportWriter to directly copy BytesRef instead of
creating new String (noble)
* SOLR-12898: Replace cluster state polling with ZkStateReader#waitFor. (Mark Miller)
* SOLR-12897: Introduce AlreadyClosedException to clean up silly close / shutdown logging. (Mark Miller)
* SOLR-12896: Introduce more checks for shutdown and closed to improve clean close and shutdown. (Mark Miller)
* SOLR-12804: Remove static modifier from Overseer queue access. (Mark Miller)
Other Changes
----------------------
* SOLR-12972: deprecate unused SolrIndexConfig.luceneVersion (Christine Poerschke)
* SOLR-12801: Make massive improvements to the tests. (Mark Miller)
* SOLR-12923: The new AutoScaling tests are way too flaky and need special attention. (Mark Miller)
* SOLR-12932: ant test (without badapples=false) should pass easily for developers. (Mark Miller)
================== 7.6.0 ==================
Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this release.

View File

@ -20,6 +20,7 @@ import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.concurrent.TimeoutException;
import org.apache.solr.analytics.util.AnalyticsResponseHeadings;
import org.apache.solr.analytics.util.MedianCalculator;
@ -29,11 +30,11 @@ import org.apache.solr.client.solrj.request.CollectionAdminRequest;
import org.apache.solr.client.solrj.request.QueryRequest;
import org.apache.solr.client.solrj.request.UpdateRequest;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.cloud.AbstractDistribZkTestBase;
import org.apache.solr.cloud.SolrCloudTestCase;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.util.NamedList;
import org.junit.BeforeClass;
import org.junit.After;
import org.junit.Before;
public class LegacyAbstractAnalyticsCloudTest extends SolrCloudTestCase {
@ -41,19 +42,23 @@ public class LegacyAbstractAnalyticsCloudTest extends SolrCloudTestCase {
protected static final int TIMEOUT = DEFAULT_TIMEOUT;
protected static final String id = "id";
@BeforeClass
public static void setupCollection() throws Exception {
@Before
public void setupCollection() throws Exception {
configureCluster(4)
.addConfig("conf", configset("cloud-analytics"))
.configure();
CollectionAdminRequest.createCollection(COLLECTIONORALIAS, "conf", 2, 1).process(cluster.getSolrClient());
AbstractDistribZkTestBase.waitForRecoveriesToFinish(COLLECTIONORALIAS, cluster.getSolrClient().getZkStateReader(),
false, true, TIMEOUT);
cleanIndex();
cluster.waitForActiveCollection(COLLECTIONORALIAS, 2, 2);
}
public static void cleanIndex() throws Exception {
@After
public void teardownCollection() throws Exception {
cluster.deleteAllCollections();
shutdownCluster();
}
public void cleanIndex() throws Exception {
new UpdateRequest()
.deleteByQuery("*:*")
.commit(cluster.getSolrClient(), COLLECTIONORALIAS);
@ -81,7 +86,7 @@ public class LegacyAbstractAnalyticsCloudTest extends SolrCloudTestCase {
}
}
protected NamedList<Object> queryLegacyCloudAnalytics(String[] testParams) throws SolrServerException, IOException, InterruptedException {
protected NamedList<Object> queryLegacyCloudAnalytics(String[] testParams) throws SolrServerException, IOException, InterruptedException, TimeoutException {
ModifiableSolrParams params = new ModifiableSolrParams();
params.set("q", "*:*");
params.set("indent", "true");

View File

@ -21,7 +21,7 @@ import java.util.List;
import org.apache.solr.client.solrj.request.UpdateRequest;
import org.apache.solr.common.util.NamedList;
import org.junit.BeforeClass;
import org.junit.Before;
import org.junit.Test;
public class LegacyNoFacetCloudTest extends LegacyAbstractAnalyticsCloudTest {
@ -57,16 +57,20 @@ public class LegacyNoFacetCloudTest extends LegacyAbstractAnalyticsCloudTest {
static ArrayList<String> stringTestStart;
static long stringMissing = 0;
@BeforeClass
public static void populate() throws Exception {
cleanIndex();
@Before
public void populate() throws Exception {
intTestStart = new ArrayList<>();
longTestStart = new ArrayList<>();
floatTestStart = new ArrayList<>();
doubleTestStart = new ArrayList<>();
dateTestStart = new ArrayList<>();
stringTestStart = new ArrayList<>();
intMissing = 0;
longMissing = 0;
doubleMissing = 0;
floatMissing = 0;
dateMissing = 0;
stringMissing = 0;
UpdateRequest req = new UpdateRequest();
for (int j = 0; j < NUM_LOOPS; ++j) {

View File

@ -24,7 +24,7 @@ import java.util.List;
import org.apache.solr.client.solrj.request.UpdateRequest;
import org.apache.solr.common.util.NamedList;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Before;
import org.junit.Test;
@ -85,9 +85,8 @@ public class LegacyFieldFacetCloudTest extends LegacyAbstractAnalyticsFacetCloud
private static ArrayList<ArrayList<Integer>> multiDateTestStart;
private static ArrayList<Long> multiDateTestMissing;
@BeforeClass
public static void beforeClass() throws Exception {
cleanIndex();
@Before
public void beforeTest() throws Exception {
//INT
intDateTestStart = new ArrayList<>();

View File

@ -24,7 +24,7 @@ import java.util.List;
import org.apache.solr.client.solrj.request.UpdateRequest;
import org.apache.solr.common.util.NamedList;
import org.junit.BeforeClass;
import org.junit.Before;
import org.junit.Test;
public class LegacyFieldFacetExtrasCloudTest extends LegacyAbstractAnalyticsFacetCloudTest {
@ -42,9 +42,8 @@ public class LegacyFieldFacetExtrasCloudTest extends LegacyAbstractAnalyticsFace
static ArrayList<ArrayList<Integer>> intDoubleTestStart;
static ArrayList<ArrayList<Integer>> intStringTestStart;
@BeforeClass
public static void beforeClass() throws Exception {
cleanIndex();
@Before
public void beforeTest() throws Exception {
//INT
intLongTestStart = new ArrayList<>();

View File

@ -22,7 +22,7 @@ import java.util.List;
import org.apache.solr.client.solrj.request.UpdateRequest;
import org.apache.solr.common.util.NamedList;
import org.junit.BeforeClass;
import org.junit.Before;
import org.junit.Test;
public class LegacyQueryFacetCloudTest extends LegacyAbstractAnalyticsFacetCloudTest {
@ -39,9 +39,8 @@ public class LegacyQueryFacetCloudTest extends LegacyAbstractAnalyticsFacetCloud
private static ArrayList<ArrayList<Long>> longTestStart = new ArrayList<>();
private static ArrayList<ArrayList<Float>> floatTestStart = new ArrayList<>();
@BeforeClass
public static void beforeClass() throws Exception {
cleanIndex();
@Before
public void beforeTest() throws Exception {
//INT
int1TestStart.add(new ArrayList<Integer>());

View File

@ -21,7 +21,7 @@ import java.util.List;
import org.apache.solr.client.solrj.request.UpdateRequest;
import org.apache.solr.common.util.NamedList;
import org.junit.BeforeClass;
import org.junit.Before;
import org.junit.Test;
@ -44,9 +44,8 @@ public class LegacyRangeFacetCloudTest extends LegacyAbstractAnalyticsFacetCloud
static ArrayList<ArrayList<Float>> floatDoubleTestStart;
static ArrayList<ArrayList<Float>> floatDateTestStart;
@BeforeClass
public static void beforeClass() throws Exception {
cleanIndex();
@Before
public void beforeTest() throws Exception {
//INT
intLongTestStart = new ArrayList<>();

View File

@ -52,7 +52,7 @@ public class TestContentStreamDataSource extends AbstractDataImportHandlerTestCa
super.setUp();
instance = new SolrInstance("inst", null);
instance.setUp();
jetty = createJetty(instance);
jetty = createAndStartJetty(instance);
}
@Override
@ -173,7 +173,7 @@ public class TestContentStreamDataSource extends AbstractDataImportHandlerTestCa
}
private JettySolrRunner createJetty(SolrInstance instance) throws Exception {
private JettySolrRunner createAndStartJetty(SolrInstance instance) throws Exception {
Properties nodeProperties = new Properties();
nodeProperties.setProperty("solr.data.dir", instance.getDataDir());
JettySolrRunner jetty = new JettySolrRunner(instance.getHomeDir(), nodeProperties, buildJettyConfig("/solr"));

View File

@ -127,7 +127,7 @@ public class TestSolrEntityProcessorEndToEnd extends AbstractDataImportHandlerTe
// data source solr instance
instance = new SolrInstance();
instance.setUp();
jetty = createJetty(instance);
jetty = createAndStartJetty(instance);
}
@Override
@ -362,7 +362,7 @@ public class TestSolrEntityProcessorEndToEnd extends AbstractDataImportHandlerTe
}
}
private JettySolrRunner createJetty(SolrInstance instance) throws Exception {
private JettySolrRunner createAndStartJetty(SolrInstance instance) throws Exception {
Properties nodeProperties = new Properties();
nodeProperties.setProperty("solr.data.dir", instance.getDataDir());
JettySolrRunner jetty = new JettySolrRunner(instance.getHomeDir(), nodeProperties, buildJettyConfig("/solr"));

View File

@ -26,7 +26,6 @@ import java.util.List;
import java.util.Locale;
import java.util.Map;
import org.apache.solr.cloud.AbstractZkTestCase;
import org.apache.solr.cloud.ZkTestServer;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.util.SuppressForbidden;
@ -62,7 +61,7 @@ public class TestZKPropertiesWriter extends AbstractDataImportHandlerTestCase {
System.setProperty("zkHost", zkServer.getZkAddress());
System.setProperty("jetty.port", "0000");
AbstractZkTestCase.buildZooKeeper(zkServer.getZkHost(), zkServer.getZkAddress(), getFile("dih/solr"),
zkServer.buildZooKeeper(getFile("dih/solr"),
"dataimport-solrconfig.xml", "dataimport-schema.xml");
//initCore("solrconfig.xml", "schema.xml", getFile("dih/solr").getAbsolutePath());

View File

@ -18,14 +18,13 @@ package org.apache.solr.ltr;
import java.util.Iterator;
import java.util.Map;
import java.util.concurrent.Executor;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Semaphore;
import java.util.concurrent.SynchronousQueue;
import java.util.concurrent.TimeUnit;
import org.apache.solr.common.util.ExecutorUtil;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.util.DefaultSolrThreadFactory;
import org.apache.solr.core.CloseHook;
import org.apache.solr.core.SolrCore;
import org.apache.solr.util.SolrPluginUtils;
import org.apache.solr.util.plugin.NamedListInitializedPlugin;
@ -58,7 +57,7 @@ import org.apache.solr.util.plugin.NamedListInitializedPlugin;
* <code>totalPoolThreads</code> imposes a contention between the queries if
* <code>(totalPoolThreads &lt; numThreadsPerRequest * total parallel queries)</code>.
*/
final public class LTRThreadModule implements NamedListInitializedPlugin {
final public class LTRThreadModule extends CloseHook implements NamedListInitializedPlugin {
public static LTRThreadModule getInstance(NamedList args) {
@ -103,13 +102,10 @@ final public class LTRThreadModule implements NamedListInitializedPlugin {
// settings
private int totalPoolThreads = 1;
private int numThreadsPerRequest = 1;
private int maxPoolSize = Integer.MAX_VALUE;
private long keepAliveTimeSeconds = 10;
private String threadNamePrefix = "ltrExecutor";
// implementation
private Semaphore ltrSemaphore;
private Executor createWeightScoreExecutor;
private volatile ExecutorService createWeightScoreExecutor;
public LTRThreadModule() {
}
@ -132,13 +128,6 @@ final public class LTRThreadModule implements NamedListInitializedPlugin {
} else {
ltrSemaphore = null;
}
createWeightScoreExecutor = new ExecutorUtil.MDCAwareThreadPoolExecutor(
0,
maxPoolSize,
keepAliveTimeSeconds, TimeUnit.SECONDS, // terminate idle threads after 10 sec
new SynchronousQueue<Runnable>(), // directly hand off tasks
new DefaultSolrThreadFactory(threadNamePrefix)
);
}
private void validate() {
@ -161,18 +150,6 @@ final public class LTRThreadModule implements NamedListInitializedPlugin {
this.numThreadsPerRequest = numThreadsPerRequest;
}
public void setMaxPoolSize(int maxPoolSize) {
this.maxPoolSize = maxPoolSize;
}
public void setKeepAliveTimeSeconds(long keepAliveTimeSeconds) {
this.keepAliveTimeSeconds = keepAliveTimeSeconds;
}
public void setThreadNamePrefix(String threadNamePrefix) {
this.threadNamePrefix = threadNamePrefix;
}
public Semaphore createQuerySemaphore() {
return (numThreadsPerRequest > 1 ? new Semaphore(numThreadsPerRequest) : null);
}
@ -189,4 +166,18 @@ final public class LTRThreadModule implements NamedListInitializedPlugin {
createWeightScoreExecutor.execute(command);
}
@Override
public void preClose(SolrCore core) {
ExecutorUtil.shutdownAndAwaitTermination(createWeightScoreExecutor);
}
@Override
public void postClose(SolrCore core) {
}
public void setExecutor(ExecutorService sharedExecutor) {
this.createWeightScoreExecutor = sharedExecutor;
}
}

View File

@ -204,6 +204,9 @@ public class LTRFeatureLoggerTransformerFactory extends TransformerFactory {
"searcher is null");
}
leafContexts = searcher.getTopReaderContext().leaves();
if (threadManager != null) {
threadManager.setExecutor(context.getRequest().getCore().getCoreContainer().getUpdateShardHandler().getUpdateExecutor());
}
// Setup LTRScoringQuery
scoringQuery = SolrQueryRequestContextUtils.getScoringQuery(req);

View File

@ -162,7 +162,9 @@ public class LTRQParserPlugin extends QParserPlugin implements ResourceLoaderAwa
final String fvStoreName = SolrQueryRequestContextUtils.getFvStoreName(req);
// Check if features are requested and if the model feature store and feature-transform feature store are the same
final boolean featuresRequestedFromSameStore = (modelFeatureStoreName.equals(fvStoreName) || fvStoreName == null) ? extractFeatures:false;
if (threadManager != null) {
threadManager.setExecutor(req.getCore().getCoreContainer().getUpdateShardHandler().getUpdateExecutor());
}
final LTRScoringQuery scoringQuery = new LTRScoringQuery(ltrScoringModel,
extractEFIParams(localParams),
featuresRequestedFromSameStore, threadManager);

View File

@ -25,7 +25,6 @@ import org.apache.solr.client.solrj.embedded.JettySolrRunner;
import org.apache.solr.client.solrj.request.CollectionAdminRequest;
import org.apache.solr.client.solrj.response.CollectionAdminResponse;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.cloud.AbstractDistribZkTestBase;
import org.apache.solr.cloud.MiniSolrCloudCluster;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.cloud.ZkStateReader;
@ -232,7 +231,7 @@ public class TestLTROnSolrCloud extends TestRerankBase {
fail("Could not create collection. Response" + response.toString());
}
ZkStateReader zkStateReader = solrCluster.getSolrClient().getZkStateReader();
AbstractDistribZkTestBase.waitForRecoveriesToFinish(name, zkStateReader, false, true, 100);
solrCluster.waitForActiveCollection(name, numShards, numShards * numReplicas);
}

View File

@ -39,7 +39,9 @@ public class JettyConfig {
public final SSLConfig sslConfig;
private JettyConfig(int port, String context, boolean stopAtShutdown, Long waitForLoadingCoresToFinishMs, Map<ServletHolder, String> extraServlets,
public final int portRetryTime;
private JettyConfig(int port, int portRetryTime, String context, boolean stopAtShutdown, Long waitForLoadingCoresToFinishMs, Map<ServletHolder, String> extraServlets,
Map<Class<? extends Filter>, String> extraFilters, SSLConfig sslConfig) {
this.port = port;
this.context = context;
@ -48,6 +50,7 @@ public class JettyConfig {
this.extraServlets = extraServlets;
this.extraFilters = extraFilters;
this.sslConfig = sslConfig;
this.portRetryTime = portRetryTime;
}
public static Builder builder() {
@ -74,6 +77,7 @@ public class JettyConfig {
Map<ServletHolder, String> extraServlets = new TreeMap<>();
Map<Class<? extends Filter>, String> extraFilters = new LinkedHashMap<>();
SSLConfig sslConfig = null;
int portRetryTime = 60;
public Builder setPort(int port) {
this.port = port;
@ -122,8 +126,14 @@ public class JettyConfig {
return this;
}
public Builder withPortRetryTime(int portRetryTime) {
this.portRetryTime = portRetryTime;
return this;
}
public JettyConfig build() {
return new JettyConfig(port, context, stopAtShutdown, waitForLoadingCoresToFinishMs, extraServlets, extraFilters, sslConfig);
return new JettyConfig(port, portRetryTime, context, stopAtShutdown, waitForLoadingCoresToFinishMs, extraServlets, extraFilters, sslConfig);
}
}

View File

@ -16,18 +16,9 @@
*/
package org.apache.solr.client.solrj.embedded;
import javax.servlet.DispatcherType;
import javax.servlet.Filter;
import javax.servlet.FilterChain;
import javax.servlet.FilterConfig;
import javax.servlet.ServletException;
import javax.servlet.ServletRequest;
import javax.servlet.ServletResponse;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.net.BindException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
@ -41,10 +32,24 @@ import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import javax.servlet.DispatcherType;
import javax.servlet.Filter;
import javax.servlet.FilterChain;
import javax.servlet.FilterConfig;
import javax.servlet.ServletException;
import javax.servlet.ServletRequest;
import javax.servlet.ServletResponse;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.cloud.SocketProxy;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.common.util.TimeSource;
import org.apache.solr.core.CoreContainer;
import org.apache.solr.servlet.SolrDispatchFilter;
import org.apache.solr.util.TimeOut;
import org.eclipse.jetty.server.Connector;
import org.eclipse.jetty.server.HttpConfiguration;
import org.eclipse.jetty.server.HttpConnectionFactory;
@ -61,6 +66,7 @@ import org.eclipse.jetty.servlet.Source;
import org.eclipse.jetty.util.component.LifeCycle;
import org.eclipse.jetty.util.ssl.SslContextFactory;
import org.eclipse.jetty.util.thread.QueuedThreadPool;
import org.eclipse.jetty.util.thread.ReservedThreadExecutor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.MDC;
@ -80,8 +86,8 @@ public class JettySolrRunner {
Server server;
FilterHolder dispatchFilter;
FilterHolder debugFilter;
volatile FilterHolder dispatchFilter;
volatile FilterHolder debugFilter;
private boolean waitOnSolr = false;
private int jettyPort = -1;
@ -98,6 +104,16 @@ public class JettySolrRunner {
private int proxyPort = -1;
private final boolean enableProxy;
private SocketProxy proxy;
private String protocol;
private String host;
private volatile boolean started = false;
public static class DebugFilter implements Filter {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
@ -200,11 +216,34 @@ public class JettySolrRunner {
* @param config the configuration
*/
public JettySolrRunner(String solrHome, Properties nodeProperties, JettyConfig config) {
this(solrHome, nodeProperties, config, false);
}
/**
* Construct a JettySolrRunner
*
* After construction, you must start the jetty with {@link #start()}
*
* @param solrHome the solrHome to use
* @param nodeProperties the container properties
* @param config the configuration
* @param enableProxy enables proxy feature to disable connections
*/
public JettySolrRunner(String solrHome, Properties nodeProperties, JettyConfig config, boolean enableProxy) {
this.enableProxy = enableProxy;
this.solrHome = solrHome;
this.config = config;
this.nodeProperties = nodeProperties;
if (enableProxy) {
try {
proxy = new SocketProxy(0, config.sslConfig != null && config.sslConfig.isSSLMode());
} catch (Exception e) {
throw new RuntimeException(e);
}
setProxyPort(proxy.getListenPort());
}
this.init(this.config.port);
}
@ -213,7 +252,7 @@ public class JettySolrRunner {
QueuedThreadPool qtp = new QueuedThreadPool();
qtp.setMaxThreads(THREAD_POOL_MAX_THREADS);
qtp.setIdleTimeout(THREAD_POOL_MAX_IDLE_TIME_MS);
qtp.setStopTimeout((int) TimeUnit.MINUTES.toMillis(1));
qtp.setReservedThreads(0);
server = new Server(qtp);
server.manage(qtp);
server.setStopAtShutdown(config.stopAtShutdown);
@ -246,7 +285,7 @@ public class JettySolrRunner {
connector.setPort(port);
connector.setHost("127.0.0.1");
connector.setIdleTimeout(THREAD_POOL_MAX_IDLE_TIME_MS);
connector.setStopTimeout(0);
server.setConnectors(new Connector[] {connector});
server.setSessionIdManager(new DefaultSessionIdManager(server, new Random()));
} else {
@ -271,10 +310,7 @@ public class JettySolrRunner {
@Override
public void lifeCycleStarting(LifeCycle arg0) {
synchronized (JettySolrRunner.this) {
waitOnSolr = true;
JettySolrRunner.this.notify();
}
}
@Override
@ -306,6 +342,11 @@ public class JettySolrRunner {
dispatchFilter.setHeldClass(SolrDispatchFilter.class);
dispatchFilter.setInitParameter("excludePatterns", excludePatterns);
root.addFilter(dispatchFilter, "*", EnumSet.of(DispatcherType.REQUEST));
synchronized (JettySolrRunner.this) {
waitOnSolr = true;
JettySolrRunner.this.notify();
}
}
@Override
@ -344,15 +385,19 @@ public class JettySolrRunner {
}
public String getNodeName() {
if (getCoreContainer() == null) {
return null;
}
return getCoreContainer().getZkController().getNodeName();
}
public boolean isRunning() {
return server.isRunning();
return server.isRunning() && dispatchFilter != null && dispatchFilter.isRunning();
}
public boolean isStopped() {
return server.isStopped();
return (server.isStopped() && dispatchFilter == null) || (server.isStopped() && dispatchFilter.isStopped()
&& ((QueuedThreadPool) server.getThreadPool()).isStopped());
}
// ------------------------------------------------------------------------------------------------
@ -382,31 +427,53 @@ public class JettySolrRunner {
// Do not let Jetty/Solr pollute the MDC for this thread
Map<String, String> prevContext = MDC.getCopyOfContextMap();
MDC.clear();
log.info("Start Jetty (original configured port={})", this.config.port);
try {
int port = reusePort && jettyPort != -1 ? jettyPort : this.config.port;
// if started before, make a new server
if (startedBefore) {
waitOnSolr = false;
int port = reusePort ? jettyPort : this.config.port;
init(port);
} else {
startedBefore = true;
}
if (!server.isRunning()) {
if (config.portRetryTime > 0) {
retryOnPortBindFailure(config.portRetryTime, port);
} else {
server.start();
}
}
synchronized (JettySolrRunner.this) {
int cnt = 0;
while (!waitOnSolr) {
while (!waitOnSolr || !dispatchFilter.isRunning() || getCoreContainer() == null) {
this.wait(100);
if (cnt++ == 5) {
if (cnt++ == 15) {
throw new RuntimeException("Jetty/Solr unresponsive");
}
}
}
if (config.waitForLoadingCoresToFinishMs != null && config.waitForLoadingCoresToFinishMs > 0L) waitForLoadingCoresToFinish(config.waitForLoadingCoresToFinishMs);
if (config.waitForLoadingCoresToFinishMs != null && config.waitForLoadingCoresToFinishMs > 0L) {
waitForLoadingCoresToFinish(config.waitForLoadingCoresToFinishMs);
}
setProtocolAndHost();
if (enableProxy) {
if (started) {
proxy.reopen();
} else {
proxy.open(getBaseUrl().toURI());
}
}
} finally {
started = true;
if (prevContext != null) {
MDC.setContextMap(prevContext);
} else {
@ -415,6 +482,43 @@ public class JettySolrRunner {
}
}
private void setProtocolAndHost() {
String protocol = null;
Connector[] conns = server.getConnectors();
if (0 == conns.length) {
throw new IllegalStateException("Jetty Server has no Connectors");
}
ServerConnector c = (ServerConnector) conns[0];
protocol = c.getDefaultProtocol().startsWith("SSL") ? "https" : "http";
this.protocol = protocol;
this.host = c.getHost();
}
private void retryOnPortBindFailure(int portRetryTime, int port) throws Exception, InterruptedException {
TimeOut timeout = new TimeOut(portRetryTime, TimeUnit.SECONDS, TimeSource.NANO_TIME);
int tryCnt = 1;
while (true) {
try {
log.info("Trying to start Jetty on port {} try number {} ...", port, tryCnt++);
server.start();
break;
} catch (BindException e) {
log.info("Port is in use, will try again until timeout of " + timeout);
server.stop();
Thread.sleep(3000);
if (!timeout.hasTimedOut()) {
continue;
}
throw e;
}
}
}
/**
* Stop the Jetty server
*
@ -427,6 +531,28 @@ public class JettySolrRunner {
try {
Filter filter = dispatchFilter.getFilter();
// we want to shutdown outside of jetty cutting us off
SolrDispatchFilter sdf = getSolrDispatchFilter();
Thread shutdownThead = null;
if (sdf != null) {
shutdownThead = new Thread() {
public void run() {
try {
sdf.close();
} catch (Throwable t) {
log.error("Error shutting down Solr", t);
}
}
};
sdf.closeOnDestroy(false);
shutdownThead.start();
}
QueuedThreadPool qtp = (QueuedThreadPool) server.getThreadPool();
ReservedThreadExecutor rte = qtp.getBean(ReservedThreadExecutor.class);
server.stop();
if (server.getState().equals(Server.FAILED)) {
@ -438,8 +564,47 @@ public class JettySolrRunner {
}
}
// stop timeout is 0, so we will interrupt right away
while(!qtp.isStopped()) {
qtp.stop();
if (qtp.isStopped()) {
Thread.sleep(50);
}
}
// we tried to kill everything, now we wait for executor to stop
qtp.setStopTimeout(Integer.MAX_VALUE);
qtp.stop();
qtp.join();
if (rte != null) {
// we try and wait for the reserved thread executor, but it doesn't always seem to work
// so we actually set 0 reserved threads at creation
rte.stop();
TimeOut timeout = new TimeOut(30, TimeUnit.SECONDS, TimeSource.NANO_TIME);
timeout.waitFor("Timeout waiting for reserved executor to stop.", ()
-> rte.isStopped());
}
if (shutdownThead != null) {
shutdownThead.join();
}
do {
try {
server.join();
} catch (InterruptedException e) {
// ignore
}
} while (!server.isStopped());
} finally {
if (enableProxy) {
proxy.close();
}
if (prevContext != null) {
MDC.setContextMap(prevContext);
} else {
@ -461,15 +626,30 @@ public class JettySolrRunner {
return ((ServerConnector) conns[0]).getLocalPort();
}
/**
* Returns the Local Port of the jetty Server.
*
* @exception RuntimeException if there is no Connector
*/
public int getLocalPort() {
return getLocalPort(false);
}
/**
* Returns the Local Port of the jetty Server.
*
* @param internalPort pass true to get the true jetty port rather than the proxy port if configured
*
* @exception RuntimeException if there is no Connector
*/
public int getLocalPort(boolean internalPort) {
if (jettyPort == -1) {
throw new IllegalStateException("You cannot get the port until this instance has started");
}
if (internalPort ) {
return jettyPort;
}
return (proxyPort != -1) ? proxyPort : jettyPort;
}
@ -487,23 +667,21 @@ public class JettySolrRunner {
* Connector in use by the Jetty Server contained in this runner.
*/
public URL getBaseUrl() {
String protocol = null;
try {
Connector[] conns = server.getConnectors();
if (0 == conns.length) {
throw new IllegalStateException("Jetty Server has no Connectors");
}
ServerConnector c = (ServerConnector) conns[0];
if (c.getLocalPort() < 0) {
throw new IllegalStateException("Jetty Connector is not open: " +
c.getLocalPort());
}
protocol = c.getDefaultProtocol().startsWith("SSL") ? "https" : "http";
return new URL(protocol, c.getHost(), c.getLocalPort(), config.context);
return new URL(protocol, host, jettyPort, config.context);
} catch (MalformedURLException e) {
throw new IllegalStateException
("Java could not make sense of protocol: " + protocol, e);
throw new RuntimeException(e);
}
}
/**
* Returns a base URL consisting of the protocol, host, and port for a
* Connector in use by the Jetty Server contained in this runner.
*/
public URL getProxyBaseUrl() {
try {
return new URL(protocol, host, getLocalPort(), config.context);
} catch (MalformedURLException e) {
throw new RuntimeException(e);
}
}
@ -568,7 +746,11 @@ public class JettySolrRunner {
CoreContainer cores = solrFilter.getCores();
if (cores != null) {
cores.waitForLoadingCoresToFinish(timeoutMs);
} else {
throw new IllegalStateException("The CoreContainer is not set!");
}
} else {
throw new IllegalStateException("The dispatchFilter is not set!");
}
}
@ -583,4 +765,8 @@ public class JettySolrRunner {
this.delayValue = delay;
}
}
public SocketProxy getProxy() {
return proxy;
}
}

View File

@ -73,6 +73,7 @@ public abstract class ElectionContext implements Closeable {
public ElectionContext(final String coreNodeName,
final String electionPath, final String leaderPath, final ZkNodeProps leaderProps, final SolrZkClient zkClient) {
assert zkClient != null;
this.id = coreNodeName;
this.electionPath = electionPath;
this.leaderPath = leaderPath;
@ -116,6 +117,7 @@ class ShardLeaderElectionContextBase extends ElectionContext {
protected String collection;
protected LeaderElector leaderElector;
protected ZkStateReader zkStateReader;
protected ZkController zkController;
private Integer leaderZkNodeParentVersion;
// Prevents a race between cancelling and becoming leader.
@ -123,15 +125,29 @@ class ShardLeaderElectionContextBase extends ElectionContext {
public ShardLeaderElectionContextBase(LeaderElector leaderElector,
final String shardId, final String collection, final String coreNodeName,
ZkNodeProps props, ZkStateReader zkStateReader) {
ZkNodeProps props, ZkController zkController) {
super(coreNodeName, ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection
+ "/leader_elect/" + shardId, ZkStateReader.getShardLeadersPath(
collection, shardId), props, zkStateReader.getZkClient());
collection, shardId), props, zkController.getZkClient());
this.leaderElector = leaderElector;
this.zkStateReader = zkController.getZkStateReader();
this.zkClient = zkStateReader.getZkClient();
this.zkStateReader = zkStateReader;
this.zkController = zkController;
this.shardId = shardId;
this.collection = collection;
String parent = new Path(leaderPath).getParent().toString();
ZkCmdExecutor zcmd = new ZkCmdExecutor(30000);
// only if /collections/{collection} exists already do we succeed in creating this path
log.info("make sure parent is created {}", parent);
try {
zcmd.ensureExists(parent, (byte[])null, CreateMode.PERSISTENT, zkClient, 2);
} catch (KeeperException e) {
throw new RuntimeException(e);
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
throw new RuntimeException(e);
}
}
@Override
@ -172,20 +188,11 @@ class ShardLeaderElectionContextBase extends ElectionContext {
throws KeeperException, InterruptedException, IOException {
// register as leader - if an ephemeral is already there, wait to see if it goes away
if (!zkClient.exists(ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection, true)) {
log.info("Will not register as leader because collection appears to be gone.");
return;
}
String parent = new Path(leaderPath).getParent().toString();
ZkCmdExecutor zcmd = new ZkCmdExecutor(30000);
// only if /collections/{collection} exists already do we succeed in creating this path
zcmd.ensureExists(parent, (byte[])null, CreateMode.PERSISTENT, zkClient, 2);
try {
RetryUtil.retryOnThrowable(NodeExistsException.class, 60000, 5000, () -> {
synchronized (lock) {
log.debug("Creating leader registration node {} after winning as {}", leaderPath, leaderSeqPath);
log.info("Creating leader registration node {} after winning as {}", leaderPath, leaderSeqPath);
List<Op> ops = new ArrayList<>(2);
// We use a multi operation to get the parent nodes version, which will
@ -210,6 +217,9 @@ class ShardLeaderElectionContextBase extends ElectionContext {
assert leaderZkNodeParentVersion != null;
}
});
} catch (NoNodeException e) {
log.info("Will not register as leader because it seems the election is no longer taking place.");
return;
} catch (Throwable t) {
if (t instanceof OutOfMemoryError) {
throw (OutOfMemoryError) t;
@ -235,7 +245,9 @@ class ShardLeaderElectionContextBase extends ElectionContext {
ZkStateReader.BASE_URL_PROP, leaderProps.get(ZkStateReader.BASE_URL_PROP),
ZkStateReader.CORE_NAME_PROP, leaderProps.get(ZkStateReader.CORE_NAME_PROP),
ZkStateReader.STATE_PROP, Replica.State.ACTIVE.toString());
Overseer.getStateUpdateQueue(zkClient).offer(Utils.toJSON(m));
assert zkController != null;
assert zkController.getOverseer() != null;
zkController.getOverseer().offerStateUpdate(Utils.toJSON(m));
}
}
@ -254,7 +266,6 @@ class ShardLeaderElectionContextBase extends ElectionContext {
final class ShardLeaderElectionContext extends ShardLeaderElectionContextBase {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
private final ZkController zkController;
private final CoreContainer cc;
private final SyncStrategy syncStrategy;
@ -264,8 +275,7 @@ final class ShardLeaderElectionContext extends ShardLeaderElectionContextBase {
final String shardId, final String collection,
final String coreNodeName, ZkNodeProps props, ZkController zkController, CoreContainer cc) {
super(leaderElector, shardId, collection, coreNodeName, props,
zkController.getZkStateReader());
this.zkController = zkController;
zkController);
this.cc = cc;
syncStrategy = new SyncStrategy(cc);
}
@ -304,11 +314,8 @@ final class ShardLeaderElectionContext extends ShardLeaderElectionContextBase {
ActionThrottle lt;
try (SolrCore core = cc.getCore(coreName)) {
if (core == null ) {
if (cc.isShutDown()) {
// shutdown or removed
return;
} else {
throw new SolrException(ErrorCode.SERVER_ERROR, "SolrCore not found:" + coreName + " in " + cc.getLoadedCoreNames());
}
}
MDCLoggingContext.setCore(core);
lt = core.getUpdateHandler().getSolrCoreState().getLeaderThrottle();
@ -326,7 +333,7 @@ final class ShardLeaderElectionContext extends ShardLeaderElectionContextBase {
// Clear the leader in clusterstate. We only need to worry about this if there is actually more than one replica.
ZkNodeProps m = new ZkNodeProps(Overseer.QUEUE_OPERATION, OverseerAction.LEADER.toLower(),
ZkStateReader.SHARD_ID_PROP, shardId, ZkStateReader.COLLECTION_PROP, collection);
Overseer.getStateUpdateQueue(zkClient).offer(Utils.toJSON(m));
zkController.getOverseer().getStateUpdateQueue().offer(Utils.toJSON(m));
}
boolean allReplicasInLine = false;
@ -349,14 +356,8 @@ final class ShardLeaderElectionContext extends ShardLeaderElectionContextBase {
try (SolrCore core = cc.getCore(coreName)) {
if (core == null) {
if (!zkController.getCoreContainer().isShutDown()) {
cancelElection();
throw new SolrException(ErrorCode.SERVER_ERROR,
"SolrCore not found:" + coreName + " in " + cc.getLoadedCoreNames());
} else {
return;
}
}
replicaType = core.getCoreDescriptor().getCloudDescriptor().getReplicaType();
coreNodeName = core.getCoreDescriptor().getCloudDescriptor().getCoreNodeName();
@ -698,7 +699,8 @@ final class ShardLeaderElectionContext extends ShardLeaderElectionContextBase {
final class OverseerElectionContext extends ElectionContext {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
private final SolrZkClient zkClient;
private Overseer overseer;
private final Overseer overseer;
private volatile boolean isClosed = false;
public OverseerElectionContext(SolrZkClient zkClient, Overseer overseer, final String zkNodeName) {
super(zkNodeName, Overseer.OVERSEER_ELECT, Overseer.OVERSEER_ELECT + "/leader", null, zkClient);
@ -732,10 +734,12 @@ final class OverseerElectionContext extends ElectionContext {
log.warn("Wait interrupted ", e);
}
}
if (!overseer.getZkController().isClosed() && !overseer.getZkController().getCoreContainer().isShutDown()) {
synchronized (this) {
if (!this.isClosed && !overseer.getZkController().getCoreContainer().isShutDown()) {
overseer.start(id);
}
}
}
@Override
public void cancelElection() throws InterruptedException, KeeperException {
@ -744,7 +748,8 @@ final class OverseerElectionContext extends ElectionContext {
}
@Override
public void close() {
public synchronized void close() {
this.isClosed = true;
overseer.close();
}

View File

@ -26,6 +26,7 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.solr.cloud.ZkController.ContextKey;
import org.apache.solr.common.AlreadyClosedException;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.cloud.SolrZkClient;
import org.apache.solr.common.cloud.ZkCmdExecutor;
@ -346,6 +347,8 @@ public class LeaderElector {
try {
// am I the next leader?
checkIfIamLeader(context, true);
} catch (AlreadyClosedException e) {
} catch (Exception e) {
if (!zkClient.isClosed()) {
log.warn("", e);

View File

@ -16,6 +16,8 @@
*/
package org.apache.solr.cloud;
import static org.apache.solr.common.params.CommonParams.ID;
import java.io.Closeable;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
@ -26,7 +28,6 @@ import java.util.List;
import java.util.Map;
import java.util.Set;
import com.codahale.metrics.Timer;
import org.apache.solr.client.solrj.cloud.SolrCloudManager;
import org.apache.solr.client.solrj.impl.ClusterStateProvider;
import org.apache.solr.cloud.api.collections.OverseerCollectionMessageHandler;
@ -39,9 +40,11 @@ import org.apache.solr.cloud.overseer.ReplicaMutator;
import org.apache.solr.cloud.overseer.SliceMutator;
import org.apache.solr.cloud.overseer.ZkStateWriter;
import org.apache.solr.cloud.overseer.ZkWriteCommand;
import org.apache.solr.common.AlreadyClosedException;
import org.apache.solr.common.SolrCloseable;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.cloud.ClusterState;
import org.apache.solr.common.cloud.ConnectionManager;
import org.apache.solr.common.cloud.SolrZkClient;
import org.apache.solr.common.cloud.ZkNodeProps;
import org.apache.solr.common.cloud.ZkStateReader;
@ -53,7 +56,7 @@ import org.apache.solr.common.util.Utils;
import org.apache.solr.core.CloudConfig;
import org.apache.solr.core.CoreContainer;
import org.apache.solr.handler.admin.CollectionsHandler;
import org.apache.solr.handler.component.ShardHandler;
import org.apache.solr.handler.component.HttpShardHandler;
import org.apache.solr.logging.MDCLoggingContext;
import org.apache.solr.update.UpdateShardHandler;
import org.apache.zookeeper.CreateMode;
@ -61,7 +64,7 @@ import org.apache.zookeeper.KeeperException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import static org.apache.solr.common.params.CommonParams.ID;
import com.codahale.metrics.Timer;
/**
* Cluster leader. Responsible for processing state updates, node assignments, creating/deleting
@ -107,7 +110,7 @@ public class Overseer implements SolrCloseable {
public ClusterStateUpdater(final ZkStateReader reader, final String myId, Stats zkStats) {
this.zkClient = reader.getZkClient();
this.zkStats = zkStats;
this.stateUpdateQueue = getStateUpdateQueue(zkClient, zkStats);
this.stateUpdateQueue = getStateUpdateQueue(zkStats);
this.workQueue = getInternalWorkQueue(zkClient, zkStats);
this.failureMap = getFailureMap(zkClient);
this.runningMap = getRunningMap(zkClient);
@ -188,6 +191,8 @@ public class Overseer implements SolrCloseable {
// the workQueue is empty now, use stateUpdateQueue as fallback queue
fallbackQueue = stateUpdateQueue;
fallbackQueueSize = 0;
} catch (AlreadyClosedException e) {
return;
} catch (KeeperException.SessionExpiredException e) {
log.warn("Solr cannot talk to ZK, exiting Overseer work queue loop", e);
return;
@ -211,6 +216,8 @@ public class Overseer implements SolrCloseable {
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
return;
} catch (AlreadyClosedException e) {
} catch (Exception e) {
log.error("Exception in Overseer main queue loop", e);
}
@ -247,6 +254,8 @@ public class Overseer implements SolrCloseable {
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
return;
} catch (AlreadyClosedException e) {
} catch (Exception e) {
log.error("Exception in Overseer main queue loop", e);
refreshClusterState = true; // it might have been a bad version error
@ -308,8 +317,10 @@ public class Overseer implements SolrCloseable {
byte[] data;
try {
data = zkClient.getData(path, null, stat, true);
} catch (AlreadyClosedException e) {
return;
} catch (Exception e) {
log.error("could not read the "+path+" data" ,e);
log.warn("Error communicating with ZooKeeper", e);
return;
}
try {
@ -437,6 +448,11 @@ public class Overseer implements SolrCloseable {
} catch (InterruptedException e) {
success = false;
Thread.currentThread().interrupt();
} catch (AlreadyClosedException e) {
success = false;
} catch (Exception e) {
success = false;
log.warn("Unexpected exception", e);
} finally {
timerContext.stop();
if (success) {
@ -495,7 +511,7 @@ public class Overseer implements SolrCloseable {
private final ZkStateReader reader;
private final ShardHandler shardHandler;
private final HttpShardHandler shardHandler;
private final UpdateShardHandler updateShardHandler;
@ -507,11 +523,11 @@ public class Overseer implements SolrCloseable {
private Stats stats;
private String id;
private boolean closed;
private volatile boolean closed;
private CloudConfig config;
// overseer not responsible for closing reader
public Overseer(ShardHandler shardHandler,
public Overseer(HttpShardHandler shardHandler,
UpdateShardHandler updateShardHandler, String adminPath,
final ZkStateReader reader, ZkController zkController, CloudConfig config)
throws KeeperException, InterruptedException {
@ -541,7 +557,7 @@ public class Overseer implements SolrCloseable {
ThreadGroup ccTg = new ThreadGroup("Overseer collection creation process.");
OverseerNodePrioritizer overseerPrioritizer = new OverseerNodePrioritizer(reader, adminPath, shardHandler.getShardHandlerFactory());
OverseerNodePrioritizer overseerPrioritizer = new OverseerNodePrioritizer(reader, getStateUpdateQueue(), adminPath, shardHandler.getShardHandlerFactory(), updateShardHandler.getDefaultHttpClient());
overseerCollectionConfigSetProcessor = new OverseerCollectionConfigSetProcessor(reader, id, shardHandler, adminPath, stats, Overseer.this, overseerPrioritizer);
ccThread = new OverseerThread(ccTg, overseerCollectionConfigSetProcessor, "OverseerCollectionConfigSetProcessor-" + id);
ccThread.setDaemon(true);
@ -554,10 +570,9 @@ public class Overseer implements SolrCloseable {
updaterThread.start();
ccThread.start();
triggerThread.start();
if (this.id != null) {
assert ObjectReleaseTracker.track(this);
}
}
public Stats getStats() {
return stats;
@ -595,17 +610,14 @@ public class Overseer implements SolrCloseable {
}
public synchronized void close() {
if (closed) return;
if (this.id != null) {
log.info("Overseer (id=" + id + ") closing");
}
doClose();
this.closed = true;
if (this.id != null) {
doClose();
assert ObjectReleaseTracker.release(this);
}
}
@Override
public boolean isClosed() {
@ -660,11 +672,10 @@ public class Overseer implements SolrCloseable {
* <p>
* This method will create the /overseer znode in ZooKeeper if it does not exist already.
*
* @param zkClient the {@link SolrZkClient} to be used for reading/writing to the queue
* @return a {@link ZkDistributedQueue} object
*/
public static ZkDistributedQueue getStateUpdateQueue(final SolrZkClient zkClient) {
return getStateUpdateQueue(zkClient, new Stats());
ZkDistributedQueue getStateUpdateQueue() {
return getStateUpdateQueue(new Stats());
}
/**
@ -672,13 +683,15 @@ public class Overseer implements SolrCloseable {
* This method should not be used directly by anyone other than the Overseer itself.
* This method will create the /overseer znode in ZooKeeper if it does not exist already.
*
* @param zkClient the {@link SolrZkClient} to be used for reading/writing to the queue
* @param zkStats a {@link Stats} object which tracks statistics for all zookeeper operations performed by this queue
* @return a {@link ZkDistributedQueue} object
*/
static ZkDistributedQueue getStateUpdateQueue(final SolrZkClient zkClient, Stats zkStats) {
createOverseerNode(zkClient);
return new ZkDistributedQueue(zkClient, "/overseer/queue", zkStats, STATE_UPDATE_MAX_QUEUE);
ZkDistributedQueue getStateUpdateQueue(Stats zkStats) {
return new ZkDistributedQueue(reader.getZkClient(), "/overseer/queue", zkStats, STATE_UPDATE_MAX_QUEUE, new ConnectionManager.IsClosed(){
public boolean isClosed() {
return Overseer.this.isClosed() || zkController.getCoreContainer().isShutDown();
}
});
}
/**
@ -697,31 +710,26 @@ public class Overseer implements SolrCloseable {
* @return a {@link ZkDistributedQueue} object
*/
static ZkDistributedQueue getInternalWorkQueue(final SolrZkClient zkClient, Stats zkStats) {
createOverseerNode(zkClient);
return new ZkDistributedQueue(zkClient, "/overseer/queue-work", zkStats);
}
/* Internal map for failed tasks, not to be used outside of the Overseer */
static DistributedMap getRunningMap(final SolrZkClient zkClient) {
createOverseerNode(zkClient);
return new DistributedMap(zkClient, "/overseer/collection-map-running");
}
/* Size-limited map for successfully completed tasks*/
static DistributedMap getCompletedMap(final SolrZkClient zkClient) {
createOverseerNode(zkClient);
return new SizeLimitedDistributedMap(zkClient, "/overseer/collection-map-completed", NUM_RESPONSES_TO_STORE, (child) -> getAsyncIdsMap(zkClient).remove(child));
}
/* Map for failed tasks, not to be used outside of the Overseer */
static DistributedMap getFailureMap(final SolrZkClient zkClient) {
createOverseerNode(zkClient);
return new SizeLimitedDistributedMap(zkClient, "/overseer/collection-map-failure", NUM_RESPONSES_TO_STORE, (child) -> getAsyncIdsMap(zkClient).remove(child));
}
/* Map of async IDs currently in use*/
static DistributedMap getAsyncIdsMap(final SolrZkClient zkClient) {
createOverseerNode(zkClient);
return new DistributedMap(zkClient, "/overseer/async_ids");
}
@ -740,7 +748,7 @@ public class Overseer implements SolrCloseable {
* @param zkClient the {@link SolrZkClient} to be used for reading/writing to the queue
* @return a {@link ZkDistributedQueue} object
*/
static OverseerTaskQueue getCollectionQueue(final SolrZkClient zkClient) {
OverseerTaskQueue getCollectionQueue(final SolrZkClient zkClient) {
return getCollectionQueue(zkClient, new Stats());
}
@ -758,8 +766,7 @@ public class Overseer implements SolrCloseable {
* @param zkClient the {@link SolrZkClient} to be used for reading/writing to the queue
* @return a {@link ZkDistributedQueue} object
*/
static OverseerTaskQueue getCollectionQueue(final SolrZkClient zkClient, Stats zkStats) {
createOverseerNode(zkClient);
OverseerTaskQueue getCollectionQueue(final SolrZkClient zkClient, Stats zkStats) {
return new OverseerTaskQueue(zkClient, "/overseer/collection-queue-work", zkStats);
}
@ -778,7 +785,7 @@ public class Overseer implements SolrCloseable {
* @param zkClient the {@link SolrZkClient} to be used for reading/writing to the queue
* @return a {@link ZkDistributedQueue} object
*/
static OverseerTaskQueue getConfigSetQueue(final SolrZkClient zkClient) {
OverseerTaskQueue getConfigSetQueue(final SolrZkClient zkClient) {
return getConfigSetQueue(zkClient, new Stats());
}
@ -801,15 +808,14 @@ public class Overseer implements SolrCloseable {
* @param zkClient the {@link SolrZkClient} to be used for reading/writing to the queue
* @return a {@link ZkDistributedQueue} object
*/
static OverseerTaskQueue getConfigSetQueue(final SolrZkClient zkClient, Stats zkStats) {
OverseerTaskQueue getConfigSetQueue(final SolrZkClient zkClient, Stats zkStats) {
// For now, we use the same queue as the collection queue, but ensure
// that the actions are prefixed with a unique string.
createOverseerNode(zkClient);
return getCollectionQueue(zkClient, zkStats);
}
private static void createOverseerNode(final SolrZkClient zkClient) {
private void createOverseerNode(final SolrZkClient zkClient) {
try {
zkClient.create("/overseer", new byte[0], CreateMode.PERSISTENT, true);
} catch (KeeperException.NodeExistsException e) {
@ -823,6 +829,7 @@ public class Overseer implements SolrCloseable {
throw new RuntimeException(e);
}
}
public static boolean isLegacy(ZkStateReader stateReader) {
String legacyProperty = stateReader.getClusterProperty(ZkStateReader.LEGACY_CLOUD, "false");
return "true".equals(legacyProperty);
@ -837,4 +844,11 @@ public class Overseer implements SolrCloseable {
return reader;
}
public void offerStateUpdate(byte[] data) throws KeeperException, InterruptedException {
if (zkController.getZkClient().isClosed()) {
throw new AlreadyClosedException();
}
getStateUpdateQueue().offer(data);
}
}

View File

@ -16,16 +16,16 @@
*/
package org.apache.solr.cloud;
import static org.apache.solr.cloud.OverseerConfigSetMessageHandler.CONFIGSETS_ACTION_PREFIX;
import java.io.IOException;
import org.apache.commons.io.IOUtils;
import org.apache.solr.cloud.api.collections.OverseerCollectionMessageHandler;
import org.apache.solr.common.cloud.ZkNodeProps;
import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.handler.component.ShardHandler;
import org.apache.solr.handler.component.ShardHandlerFactory;
import static org.apache.solr.cloud.OverseerConfigSetMessageHandler.CONFIGSETS_ACTION_PREFIX;
import org.apache.solr.handler.component.HttpShardHandler;
import org.apache.solr.handler.component.HttpShardHandlerFactory;
/**
* An {@link OverseerTaskProcessor} that handles:
@ -35,18 +35,18 @@ import static org.apache.solr.cloud.OverseerConfigSetMessageHandler.CONFIGSETS_A
public class OverseerCollectionConfigSetProcessor extends OverseerTaskProcessor {
public OverseerCollectionConfigSetProcessor(ZkStateReader zkStateReader, String myId,
final ShardHandler shardHandler,
final HttpShardHandler shardHandler,
String adminPath, Stats stats, Overseer overseer,
OverseerNodePrioritizer overseerNodePrioritizer) {
this(
zkStateReader,
myId,
shardHandler.getShardHandlerFactory(),
(HttpShardHandlerFactory) shardHandler.getShardHandlerFactory(),
adminPath,
stats,
overseer,
overseerNodePrioritizer,
Overseer.getCollectionQueue(zkStateReader.getZkClient(), stats),
overseer.getCollectionQueue(zkStateReader.getZkClient(), stats),
Overseer.getRunningMap(zkStateReader.getZkClient()),
Overseer.getCompletedMap(zkStateReader.getZkClient()),
Overseer.getFailureMap(zkStateReader.getZkClient())
@ -54,7 +54,7 @@ public class OverseerCollectionConfigSetProcessor extends OverseerTaskProcessor
}
protected OverseerCollectionConfigSetProcessor(ZkStateReader zkStateReader, String myId,
final ShardHandlerFactory shardHandlerFactory,
final HttpShardHandlerFactory shardHandlerFactory,
String adminPath,
Stats stats,
Overseer overseer,
@ -79,7 +79,7 @@ public class OverseerCollectionConfigSetProcessor extends OverseerTaskProcessor
private static OverseerMessageHandlerSelector getOverseerMessageHandlerSelector(
ZkStateReader zkStateReader,
String myId,
final ShardHandlerFactory shardHandlerFactory,
final HttpShardHandlerFactory shardHandlerFactory,
String adminPath,
Stats stats,
Overseer overseer,

View File

@ -20,6 +20,7 @@ import java.lang.invoke.MethodHandles;
import java.util.List;
import java.util.Map;
import org.apache.http.client.HttpClient;
import org.apache.solr.cloud.overseer.OverseerAction;
import org.apache.solr.common.cloud.SolrZkClient;
import org.apache.solr.common.cloud.ZkNodeProps;
@ -28,6 +29,7 @@ import org.apache.solr.common.params.CoreAdminParams;
import org.apache.solr.common.params.CoreAdminParams.CoreAdminAction;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.util.Utils;
import org.apache.solr.handler.component.HttpShardHandlerFactory;
import org.apache.solr.handler.component.ShardHandler;
import org.apache.solr.handler.component.ShardHandlerFactory;
import org.apache.solr.handler.component.ShardRequest;
@ -49,10 +51,16 @@ public class OverseerNodePrioritizer {
private final String adminPath;
private final ShardHandlerFactory shardHandlerFactory;
public OverseerNodePrioritizer(ZkStateReader zkStateReader, String adminPath, ShardHandlerFactory shardHandlerFactory) {
private ZkDistributedQueue stateUpdateQueue;
private HttpClient httpClient;
public OverseerNodePrioritizer(ZkStateReader zkStateReader, ZkDistributedQueue stateUpdateQueue, String adminPath, ShardHandlerFactory shardHandlerFactory, HttpClient httpClient) {
this.zkStateReader = zkStateReader;
this.adminPath = adminPath;
this.shardHandlerFactory = shardHandlerFactory;
this.stateUpdateQueue = stateUpdateQueue;
this.httpClient = httpClient;
}
public synchronized void prioritizeOverseerNodes(String overseerId) throws Exception {
@ -88,7 +96,7 @@ public class OverseerNodePrioritizer {
invokeOverseerOp(electionNodes.get(1), "rejoin");//ask second inline to go behind
}
//now ask the current leader to QUIT , so that the designate can takeover
Overseer.getStateUpdateQueue(zkStateReader.getZkClient()).offer(
stateUpdateQueue.offer(
Utils.toJSON(new ZkNodeProps(Overseer.QUEUE_OPERATION, OverseerAction.QUIT.toLower(),
ID, OverseerTaskProcessor.getLeaderId(zkStateReader.getZkClient()))));
@ -96,7 +104,7 @@ public class OverseerNodePrioritizer {
private void invokeOverseerOp(String electionNode, String op) {
ModifiableSolrParams params = new ModifiableSolrParams();
ShardHandler shardHandler = shardHandlerFactory.getShardHandler();
ShardHandler shardHandler = ((HttpShardHandlerFactory)shardHandlerFactory).getShardHandler(httpClient);
params.set(CoreAdminParams.ACTION, CoreAdminAction.OVERSEEROP.toString());
params.set("op", op);
params.set("qt", adminPath);

View File

@ -19,6 +19,7 @@ package org.apache.solr.cloud;
import java.io.Closeable;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashMap;
@ -36,6 +37,7 @@ import org.apache.commons.io.IOUtils;
import org.apache.solr.client.solrj.SolrResponse;
import org.apache.solr.cloud.Overseer.LeaderStatus;
import org.apache.solr.cloud.OverseerTaskQueue.QueueEvent;
import org.apache.solr.common.AlreadyClosedException;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.cloud.SolrZkClient;
import org.apache.solr.common.cloud.ZkNodeProps;
@ -86,13 +88,13 @@ public class OverseerTaskProcessor implements Runnable, Closeable {
// List of completed tasks. This is used to clean up workQueue in zk.
final private HashMap<String, QueueEvent> completedTasks;
private String myId;
private volatile String myId;
private ZkStateReader zkStateReader;
private volatile ZkStateReader zkStateReader;
private boolean isClosed;
private Stats stats;
private volatile Stats stats;
// Set of tasks that have been picked up for processing but not cleaned up from zk work-queue.
// It may contain tasks that have completed execution, have been entered into the completed/failed map in zk but not
@ -102,7 +104,7 @@ public class OverseerTaskProcessor implements Runnable, Closeable {
// be executed because they are blocked or the execution queue is full
// This is an optimization to ensure that we do not read the same tasks
// again and again from ZK.
final private Map<String, QueueEvent> blockedTasks = new LinkedHashMap<>();
final private Map<String, QueueEvent> blockedTasks = Collections.synchronizedMap(new LinkedHashMap<>());
final private Predicate<String> excludedTasks = new Predicate<String>() {
@Override
public boolean test(String s) {
@ -170,6 +172,8 @@ public class OverseerTaskProcessor implements Runnable, Closeable {
// We don't need to handle this. This is just a fail-safe which comes in handy in skipping already processed
// async calls.
SolrException.log(log, "", e);
} catch (AlreadyClosedException e) {
return;
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
@ -181,6 +185,8 @@ public class OverseerTaskProcessor implements Runnable, Closeable {
try {
prioritizer.prioritizeOverseerNodes(myId);
} catch (AlreadyClosedException e) {
return;
} catch (Exception e) {
if (!zkStateReader.getZkClient().isClosed()) {
log.error("Unable to prioritize overseer ", e);
@ -203,14 +209,14 @@ public class OverseerTaskProcessor implements Runnable, Closeable {
continue; // not a no, not a yes, try asking again
}
log.debug("Cleaning up work-queue. #Running tasks: {}", runningTasks.size());
log.debug("Cleaning up work-queue. #Running tasks: {} #Completed tasks: {}", runningTasksSize(), completedTasks.size());
cleanUpWorkQueue();
printTrackingMaps();
boolean waited = false;
while (runningTasks.size() > MAX_PARALLEL_TASKS) {
while (runningTasksSize() > MAX_PARALLEL_TASKS) {
synchronized (waitLock) {
waitLock.wait(100);//wait for 100 ms or till a task is complete
}
@ -229,7 +235,7 @@ public class OverseerTaskProcessor implements Runnable, Closeable {
// to clear out at least a few items in the queue before we read more items
if (heads.size() < MAX_BLOCKED_TASKS) {
//instead of reading MAX_PARALLEL_TASKS items always, we should only fetch as much as we can execute
int toFetch = Math.min(MAX_BLOCKED_TASKS - heads.size(), MAX_PARALLEL_TASKS - runningTasks.size());
int toFetch = Math.min(MAX_BLOCKED_TASKS - heads.size(), MAX_PARALLEL_TASKS - runningTasksSize());
List<QueueEvent> newTasks = workQueue.peekTopN(toFetch, excludedTasks, 2000L);
log.debug("Got {} tasks from work-queue : [{}]", newTasks.size(), newTasks);
heads.addAll(newTasks);
@ -251,7 +257,7 @@ public class OverseerTaskProcessor implements Runnable, Closeable {
for (QueueEvent head : heads) {
if (!tooManyTasks) {
synchronized (runningTasks) {
tooManyTasks = runningTasks.size() >= MAX_PARALLEL_TASKS;
tooManyTasks = runningTasksSize() >= MAX_PARALLEL_TASKS;
}
}
if (tooManyTasks) {
@ -260,7 +266,9 @@ public class OverseerTaskProcessor implements Runnable, Closeable {
blockedTasks.put(head.getId(), head);
continue;
}
synchronized (runningZKTasks) {
if (runningZKTasks.contains(head.getId())) continue;
}
final ZkNodeProps message = ZkNodeProps.load(head.getBytes());
final String asyncId = message.getStr(ASYNC);
if (hasLeftOverItems) {
@ -316,6 +324,8 @@ public class OverseerTaskProcessor implements Runnable, Closeable {
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
return;
} catch (AlreadyClosedException e) {
} catch (Exception e) {
SolrException.log(log, "", e);
}
@ -325,12 +335,20 @@ public class OverseerTaskProcessor implements Runnable, Closeable {
}
}
private int runningTasksSize() {
synchronized (runningTasks) {
return runningTasks.size();
}
}
private void cleanUpWorkQueue() throws KeeperException, InterruptedException {
synchronized (completedTasks) {
for (String id : completedTasks.keySet()) {
workQueue.remove(completedTasks.get(id));
synchronized (runningTasks) {
runningZKTasks.remove(id);
}
}
completedTasks.clear();
}
}
@ -502,6 +520,8 @@ public class OverseerTaskProcessor implements Runnable, Closeable {
log.debug(messageHandler.getName() + ": Message id:" + head.getId() +
" complete, response:" + response.getResponse().toString());
success = true;
} catch (AlreadyClosedException e) {
} catch (KeeperException e) {
SolrException.log(log, "", e);
} catch (InterruptedException e) {
@ -513,7 +533,11 @@ public class OverseerTaskProcessor implements Runnable, Closeable {
lock.unlock();
if (!success) {
// Reset task from tracking data structures so that it can be retried.
try {
resetTaskWithException(messageHandler, head.getId(), asyncId, taskKey, message);
} catch(AlreadyClosedException e) {
}
}
synchronized (waitLock){
waitLock.notifyAll();
@ -587,7 +611,7 @@ public class OverseerTaskProcessor implements Runnable, Closeable {
log.debug("CompletedTasks: {}", completedTasks.keySet().toString());
}
synchronized (runningZKTasks) {
log.debug("RunningZKTasks: {}", runningZKTasks.toString());
log.info("RunningZKTasks: {}", runningZKTasks.toString());
}
}
}

View File

@ -63,7 +63,6 @@ import org.apache.solr.update.CommitUpdateCommand;
import org.apache.solr.update.PeerSyncWithLeader;
import org.apache.solr.update.UpdateLog;
import org.apache.solr.update.UpdateLog.RecoveryInfo;
import org.apache.solr.update.processor.DistributedUpdateProcessor;
import org.apache.solr.util.RefCounted;
import org.apache.solr.util.SolrPluginUtils;
import org.apache.solr.util.plugin.NamedListInitializedPlugin;
@ -71,18 +70,21 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* This class may change in future and customisations are not supported
* between versions in terms of API or back compat behaviour.
* This class may change in future and customisations are not supported between versions in terms of API or back compat
* behaviour.
*
* @lucene.experimental
*/
public class RecoveryStrategy implements Runnable, Closeable {
public static class Builder implements NamedListInitializedPlugin {
private NamedList args;
@Override
public void init(NamedList args) {
this.args = args;
}
// this should only be used from SolrCoreState
public RecoveryStrategy create(CoreContainer cc, CoreDescriptor cd,
RecoveryStrategy.RecoveryListener recoveryListener) {
@ -90,6 +92,7 @@ public class RecoveryStrategy implements Runnable, Closeable {
SolrPluginUtils.invokeSetters(recoveryStrategy, args);
return recoveryStrategy;
}
protected RecoveryStrategy newRecoveryStrategy(CoreContainer cc, CoreDescriptor cd,
RecoveryStrategy.RecoveryListener recoveryListener) {
return new RecoveryStrategy(cc, cd, recoveryListener);
@ -98,12 +101,14 @@ public class RecoveryStrategy implements Runnable, Closeable {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
private int waitForUpdatesWithStaleStatePauseMilliSeconds = Integer.getInteger("solr.cloud.wait-for-updates-with-stale-state-pause", 2500);
private int waitForUpdatesWithStaleStatePauseMilliSeconds = Integer
.getInteger("solr.cloud.wait-for-updates-with-stale-state-pause", 2500);
private int maxRetries = 500;
private int startingRecoveryDelayMilliSeconds = 5000;
private int startingRecoveryDelayMilliSeconds = 2000;
public static interface RecoveryListener {
public void recovered();
public void failed();
}
@ -121,6 +126,8 @@ public class RecoveryStrategy implements Runnable, Closeable {
private volatile HttpUriRequest prevSendPreRecoveryHttpUriRequest;
private final Replica.Type replicaType;
private CoreDescriptor coreDescriptor;
protected RecoveryStrategy(CoreContainer cc, CoreDescriptor cd, RecoveryListener recoveryListener) {
this.cc = cc;
this.coreName = cd.getName();
@ -136,7 +143,8 @@ public class RecoveryStrategy implements Runnable, Closeable {
return waitForUpdatesWithStaleStatePauseMilliSeconds;
}
final public void setWaitForUpdatesWithStaleStatePauseMilliSeconds(int waitForUpdatesWithStaleStatePauseMilliSeconds) {
final public void setWaitForUpdatesWithStaleStatePauseMilliSeconds(
int waitForUpdatesWithStaleStatePauseMilliSeconds) {
this.waitForUpdatesWithStaleStatePauseMilliSeconds = waitForUpdatesWithStaleStatePauseMilliSeconds;
}
@ -187,8 +195,9 @@ public class RecoveryStrategy implements Runnable, Closeable {
}
/**
* This method may change in future and customisations are not supported
* between versions in terms of API or back compat behaviour.
* This method may change in future and customisations are not supported between versions in terms of API or back
* compat behaviour.
*
* @lucene.experimental
*/
protected String getReplicateLeaderUrl(ZkNodeProps leaderprops) {
@ -219,7 +228,8 @@ public class RecoveryStrategy implements Runnable, Closeable {
solrParams.set(ReplicationHandler.SKIP_COMMIT_ON_MASTER_VERSION_ZERO, replicaType == Replica.Type.TLOG);
// always download the tlogs from the leader when running with cdcr enabled. We need to have all the tlogs
// to ensure leader failover doesn't cause missing docs on the target
if (core.getUpdateHandler().getUpdateLog() != null && core.getUpdateHandler().getUpdateLog() instanceof CdcrUpdateLog) {
if (core.getUpdateHandler().getUpdateLog() != null
&& core.getUpdateHandler().getUpdateLog() instanceof CdcrUpdateLog) {
solrParams.set(ReplicationHandler.TLOG_FILES, true);
}
@ -245,7 +255,8 @@ public class RecoveryStrategy implements Runnable, Closeable {
+ " from "
+ leaderUrl
+ " gen:"
+ (core.getDeletionPolicy().getLatestCommit() != null ? "null" : core.getDeletionPolicy().getLatestCommit().getGeneration())
+ (core.getDeletionPolicy().getLatestCommit() != null ? "null"
: core.getDeletionPolicy().getLatestCommit().getGeneration())
+ " data:" + core.getDataDir()
+ " index:" + core.getIndexDir()
+ " newIndex:" + core.getNewIndexDir()
@ -265,11 +276,13 @@ public class RecoveryStrategy implements Runnable, Closeable {
IOException {
try (HttpSolrClient client = new HttpSolrClient.Builder(leaderUrl)
.withConnectionTimeout(30000)
.withHttpClient(cc.getUpdateShardHandler().getRecoveryOnlyHttpClient())
.build()) {
UpdateRequest ureq = new UpdateRequest();
ureq.setParams(new ModifiableSolrParams());
ureq.getParams().set(DistributedUpdateProcessor.COMMIT_END_POINT, true);
// ureq.getParams().set(UpdateParams.OPEN_SEARCHER, onlyLeaderIndexes);// Why do we need to open searcher if "onlyLeaderIndexes"?
// ureq.getParams().set(DistributedUpdateProcessor.COMMIT_END_POINT, true);
// ureq.getParams().set(UpdateParams.OPEN_SEARCHER, onlyLeaderIndexes);// Why do we need to open searcher if
// "onlyLeaderIndexes"?
ureq.getParams().set(UpdateParams.OPEN_SEARCHER, false);
ureq.setAction(AbstractUpdateRequest.ACTION.COMMIT, false, true).process(
client);
@ -306,7 +319,10 @@ public class RecoveryStrategy implements Runnable, Closeable {
}
final public void doRecovery(SolrCore core) throws Exception {
if (core.getCoreDescriptor().getCloudDescriptor().requiresTransactionLog()) {
// we can lose our core descriptor, so store it now
this.coreDescriptor = core.getCoreDescriptor();
if (this.coreDescriptor.getCloudDescriptor().requiresTransactionLog()) {
doSyncOrReplicateRecovery(core);
} else {
doReplicateOnlyRecovery(core);
@ -317,13 +333,16 @@ public class RecoveryStrategy implements Runnable, Closeable {
boolean successfulRecovery = false;
// if (core.getUpdateHandler().getUpdateLog() != null) {
// SolrException.log(log, "'replicate-only' recovery strategy should only be used if no update logs are present, but this core has one: "
// SolrException.log(log, "'replicate-only' recovery strategy should only be used if no update logs are present, but
// this core has one: "
// + core.getUpdateHandler().getUpdateLog());
// return;
// }
while (!successfulRecovery && !Thread.currentThread().isInterrupted() && !isClosed()) { // don't use interruption or it will close channels though
while (!successfulRecovery && !Thread.currentThread().isInterrupted() && !isClosed()) { // don't use interruption or
// it will close channels
// though
try {
CloudDescriptor cloudDesc = core.getCoreDescriptor().getCloudDescriptor();
CloudDescriptor cloudDesc = this.coreDescriptor.getCloudDescriptor();
ZkNodeProps leaderprops = zkStateReader.getLeaderRetry(
cloudDesc.getCollectionName(), cloudDesc.getShardId());
final String leaderBaseUrl = leaderprops.getStr(ZkStateReader.BASE_URL_PROP);
@ -333,7 +352,8 @@ public class RecoveryStrategy implements Runnable, Closeable {
String ourUrl = ZkCoreNodeProps.getCoreUrl(baseUrl, coreName);
boolean isLeader = leaderUrl.equals(ourUrl); //TODO: We can probably delete most of this code if we say this strategy can only be used for pull replicas
boolean isLeader = leaderUrl.equals(ourUrl); // TODO: We can probably delete most of this code if we say this
// strategy can only be used for pull replicas
if (isLeader && !cloudDesc.isLeader()) {
throw new SolrException(ErrorCode.SERVER_ERROR, "Cloud state still says we are leader.");
}
@ -342,14 +362,13 @@ public class RecoveryStrategy implements Runnable, Closeable {
// we are now the leader - no one else must have been suitable
log.warn("We have not yet recovered - but we are now the leader!");
log.info("Finished recovery process.");
zkController.publish(core.getCoreDescriptor(), Replica.State.ACTIVE);
zkController.publish(this.coreDescriptor, Replica.State.ACTIVE);
return;
}
log.info("Publishing state of core [{}] as recovering, leader is [{}] and I am [{}]", core.getName(), leaderUrl,
ourUrl);
zkController.publish(core.getCoreDescriptor(), Replica.State.RECOVERING);
zkController.publish(this.coreDescriptor, Replica.State.RECOVERING);
if (isClosed()) {
log.info("Recovery for core {} has been closed", core.getName());
@ -381,7 +400,7 @@ public class RecoveryStrategy implements Runnable, Closeable {
zkController.startReplicationFromLeader(coreName, false);
log.info("Registering as Active after recovery.");
try {
zkController.publish(core.getCoreDescriptor(), Replica.State.ACTIVE);
zkController.publish(this.coreDescriptor, Replica.State.ACTIVE);
} catch (Exception e) {
log.error("Could not publish as ACTIVE after succesful recovery", e);
successfulRecovery = false;
@ -411,7 +430,7 @@ public class RecoveryStrategy implements Runnable, Closeable {
if (retries >= maxRetries) {
SolrException.log(log, "Recovery failed - max retries exceeded (" + retries + ").");
try {
recoveryFailed(core, zkController, baseUrl, coreZkNodeName, core.getCoreDescriptor());
recoveryFailed(core, zkController, baseUrl, coreZkNodeName, this.coreDescriptor);
} catch (Exception e) {
SolrException.log(log, "Could not publish that recovery failed", e);
}
@ -457,7 +476,7 @@ public class RecoveryStrategy implements Runnable, Closeable {
if (ulog == null) {
SolrException.log(log, "No UpdateLog found - cannot recover.");
recoveryFailed(core, zkController, baseUrl, coreZkNodeName,
core.getCoreDescriptor());
this.coreDescriptor);
return;
}
@ -485,13 +504,15 @@ public class RecoveryStrategy implements Runnable, Closeable {
if (oldIdx > 0) {
log.info("Found new versions added after startup: num=[{}]", oldIdx);
log.info("currentVersions size={} range=[{} to {}]", recentVersions.size(), recentVersions.get(0), recentVersions.get(recentVersions.size()-1));
log.info("currentVersions size={} range=[{} to {}]", recentVersions.size(), recentVersions.get(0),
recentVersions.get(recentVersions.size() - 1));
}
if (startingVersions.isEmpty()) {
log.info("startupVersions is empty");
} else {
log.info("startupVersions size={} range=[{} to {}]", startingVersions.size(), startingVersions.get(0), startingVersions.get(startingVersions.size()-1));
log.info("startupVersions size={} range=[{} to {}]", startingVersions.size(), startingVersions.get(0),
startingVersions.get(startingVersions.size() - 1));
}
} catch (Exception e) {
SolrException.log(log, "Error getting recent versions.", e);
@ -523,10 +544,12 @@ public class RecoveryStrategy implements Runnable, Closeable {
final String ourUrl = ZkCoreNodeProps.getCoreUrl(baseUrl, coreName);
Future<RecoveryInfo> replayFuture = null;
while (!successfulRecovery && !Thread.currentThread().isInterrupted() && !isClosed()) { // don't use interruption or it will close channels though
while (!successfulRecovery && !Thread.currentThread().isInterrupted() && !isClosed()) { // don't use interruption or
// it will close channels
// though
try {
CloudDescriptor cloudDesc = core.getCoreDescriptor().getCloudDescriptor();
final Replica leader = pingLeader(ourUrl, core.getCoreDescriptor(), true);
CloudDescriptor cloudDesc = this.coreDescriptor.getCloudDescriptor();
final Replica leader = pingLeader(ourUrl, this.coreDescriptor, true);
if (isClosed()) {
log.info("RecoveryStrategy has been closed");
break;
@ -540,7 +563,7 @@ public class RecoveryStrategy implements Runnable, Closeable {
// we are now the leader - no one else must have been suitable
log.warn("We have not yet recovered - but we are now the leader!");
log.info("Finished recovery process.");
zkController.publish(core.getCoreDescriptor(), Replica.State.ACTIVE);
zkController.publish(this.coreDescriptor, Replica.State.ACTIVE);
return;
}
@ -548,10 +571,10 @@ public class RecoveryStrategy implements Runnable, Closeable {
// recalling buffer updates will drop the old buffer tlog
ulog.bufferUpdates();
log.info("Publishing state of core [{}] as recovering, leader is [{}] and I am [{}]", core.getName(), leader.getCoreUrl(),
log.info("Publishing state of core [{}] as recovering, leader is [{}] and I am [{}]", core.getName(),
leader.getCoreUrl(),
ourUrl);
zkController.publish(core.getCoreDescriptor(), Replica.State.RECOVERING);
zkController.publish(this.coreDescriptor, Replica.State.RECOVERING);
final Slice slice = zkStateReader.getClusterState().getCollection(cloudDesc.getCollectionName())
.getSlice(cloudDesc.getShardId());
@ -588,7 +611,8 @@ public class RecoveryStrategy implements Runnable, Closeable {
// first thing we just try to sync
if (firstTime) {
firstTime = false; // only try sync the first time through the loop
log.info("Attempting to PeerSync from [{}] - recoveringAfterStartup=[{}]", leader.getCoreUrl(), recoveringAfterStartup);
log.info("Attempting to PeerSync from [{}] - recoveringAfterStartup=[{}]", leader.getCoreUrl(),
recoveringAfterStartup);
// System.out.println("Attempting to PeerSync from " + leaderUrl
// + " i am:" + zkController.getNodeName());
PeerSyncWithLeader peerSyncWithLeader = new PeerSyncWithLeader(core,
@ -658,7 +682,7 @@ public class RecoveryStrategy implements Runnable, Closeable {
if (replicaType == Replica.Type.TLOG) {
zkController.startReplicationFromLeader(coreName, true);
}
zkController.publish(core.getCoreDescriptor(), Replica.State.ACTIVE);
zkController.publish(this.coreDescriptor, Replica.State.ACTIVE);
} catch (Exception e) {
log.error("Could not publish as ACTIVE after succesful recovery", e);
successfulRecovery = false;
@ -688,7 +712,7 @@ public class RecoveryStrategy implements Runnable, Closeable {
if (retries >= maxRetries) {
SolrException.log(log, "Recovery failed - max retries exceeded (" + retries + ").");
try {
recoveryFailed(core, zkController, baseUrl, coreZkNodeName, core.getCoreDescriptor());
recoveryFailed(core, zkController, baseUrl, coreZkNodeName, this.coreDescriptor);
} catch (Exception e) {
SolrException.log(log, "Could not publish that recovery failed", e);
}
@ -699,12 +723,12 @@ public class RecoveryStrategy implements Runnable, Closeable {
}
try {
// Wait an exponential interval between retries, start at 5 seconds and work up to a minute.
// If we're at attempt >= 4, there's no point computing pow(2, retries) because the result
// will always be the minimum of the two (12). Since we sleep at 5 seconds sub-intervals in
// order to check if we were closed, 12 is chosen as the maximum loopCount (5s * 12 = 1m).
double loopCount = retries < 4 ? Math.min(Math.pow(2, retries), 12) : 12;
log.info("Wait [{}] seconds before trying to recover again (attempt={})", loopCount, retries);
// Wait an exponential interval between retries, start at 2 seconds and work up to a minute.
// Since we sleep at 2 seconds sub-intervals in
// order to check if we were closed, 30 is chosen as the maximum loopCount (2s * 30 = 1m).
double loopCount = Math.min(Math.pow(2, retries - 1), 30);
log.info("Wait [{}] seconds before trying to recover again (attempt={})",
loopCount * startingRecoveryDelayMilliSeconds, retries);
for (int i = 0; i < loopCount; i++) {
if (isClosed()) {
log.info("RecoveryStrategy has been closed");
@ -731,13 +755,15 @@ public class RecoveryStrategy implements Runnable, Closeable {
log.info("Finished recovery process, successful=[{}]", Boolean.toString(successfulRecovery));
}
private final Replica pingLeader(String ourUrl, CoreDescriptor coreDesc, boolean mayPutReplicaAsDown) throws Exception {
private final Replica pingLeader(String ourUrl, CoreDescriptor coreDesc, boolean mayPutReplicaAsDown)
throws Exception {
int numTried = 0;
while (true) {
CloudDescriptor cloudDesc = coreDesc.getCloudDescriptor();
DocCollection docCollection = zkStateReader.getClusterState().getCollection(cloudDesc.getCollectionName());
if (!isClosed() && mayPutReplicaAsDown && numTried == 1 &&
docCollection.getReplica(coreDesc.getCloudDescriptor().getCoreNodeName()).getState() == Replica.State.ACTIVE) {
docCollection.getReplica(coreDesc.getCloudDescriptor().getCoreNodeName())
.getState() == Replica.State.ACTIVE) {
// this operation may take a long time, by putting replica into DOWN state, client won't query this replica
zkController.publish(coreDesc, Replica.State.DOWN);
}
@ -763,6 +789,7 @@ public class RecoveryStrategy implements Runnable, Closeable {
try (HttpSolrClient httpSolrClient = new HttpSolrClient.Builder(leaderReplica.getCoreUrl())
.withSocketTimeout(1000)
.withConnectionTimeout(1000)
.withHttpClient(cc.getUpdateShardHandler().getRecoveryOnlyHttpClient())
.build()) {
SolrPingResponse resp = httpSolrClient.ping();
return leaderReplica;
@ -838,7 +865,7 @@ public class RecoveryStrategy implements Runnable, Closeable {
}
final public boolean isClosed() {
return close;
return close || cc.isShutDown();
}
final private void sendPrepRecoveryCmd(String leaderBaseUrl, String leaderCoreName, Slice slice)
@ -858,8 +885,9 @@ public class RecoveryStrategy implements Runnable, Closeable {
int conflictWaitMs = zkController.getLeaderConflictResolveWait();
// timeout after 5 seconds more than the max timeout (conflictWait + 3 seconds) on the server side
int readTimeout = conflictWaitMs + 8000;
try (HttpSolrClient client = new HttpSolrClient.Builder(leaderBaseUrl).build()) {
int readTimeout = conflictWaitMs + Integer.parseInt(System.getProperty("prepRecoveryReadTimeoutExtraWait", "8000"));
try (HttpSolrClient client = new HttpSolrClient.Builder(leaderBaseUrl)
.withHttpClient(cc.getUpdateShardHandler().getRecoveryOnlyHttpClient()).build()) {
client.setConnectionTimeout(10000);
client.setSoTimeout(readTimeout);
HttpUriRequestResponse mrr = client.httpUriRequest(prepCmd);

View File

@ -39,11 +39,11 @@ import org.slf4j.LoggerFactory;
public class ReplicateFromLeader {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
private CoreContainer cc;
private String coreName;
private final CoreContainer cc;
private final String coreName;
private ReplicationHandler replicationProcess;
private long lastVersion = 0;
private volatile ReplicationHandler replicationProcess;
private volatile long lastVersion = 0;
public ReplicateFromLeader(CoreContainer cc, String coreName) {
this.cc = cc;

View File

@ -35,6 +35,7 @@ import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.CoreContainer;
import org.apache.solr.core.CoreDescriptor;
import org.apache.solr.core.SolrCore;
import org.apache.solr.handler.component.HttpShardHandlerFactory;
import org.apache.solr.handler.component.ShardHandler;
import org.apache.solr.handler.component.ShardRequest;
import org.apache.solr.handler.component.ShardResponse;
@ -70,7 +71,7 @@ public class SyncStrategy {
public SyncStrategy(CoreContainer cc) {
UpdateShardHandler updateShardHandler = cc.getUpdateShardHandler();
client = updateShardHandler.getDefaultHttpClient();
shardHandler = cc.getShardHandlerFactory().getShardHandler();
shardHandler = ((HttpShardHandlerFactory)cc.getShardHandlerFactory()).getShardHandler(cc.getUpdateShardHandler().getDefaultHttpClient());
updateExecutor = updateShardHandler.getUpdateExecutor();
}
@ -113,16 +114,17 @@ public class SyncStrategy {
private PeerSync.PeerSyncResult syncReplicas(ZkController zkController, SolrCore core,
ZkNodeProps leaderProps, boolean peerSyncOnlyWithActive) {
boolean success = false;
PeerSync.PeerSyncResult result = null;
CloudDescriptor cloudDesc = core.getCoreDescriptor().getCloudDescriptor();
String collection = cloudDesc.getCollectionName();
String shardId = cloudDesc.getShardId();
if (isClosed) {
log.info("We have been closed, won't sync with replicas");
return PeerSync.PeerSyncResult.failure();
}
boolean success = false;
PeerSync.PeerSyncResult result = null;
assert core != null;
assert core.getCoreDescriptor() != null;
CloudDescriptor cloudDesc = core.getCoreDescriptor().getCloudDescriptor();
String collection = cloudDesc.getCollectionName();
String shardId = cloudDesc.getShardId();
// first sync ourselves - we are the potential leader after all
try {
@ -160,6 +162,11 @@ public class SyncStrategy {
List<ZkCoreNodeProps> nodes = zkController.getZkStateReader()
.getReplicaProps(collection, shardId,core.getCoreDescriptor().getCloudDescriptor().getCoreNodeName());
if (isClosed) {
log.info("We have been closed, won't sync with replicas");
return PeerSync.PeerSyncResult.failure();
}
if (nodes == null) {
// I have no replicas
return PeerSync.PeerSyncResult.success();
@ -184,6 +191,11 @@ public class SyncStrategy {
String shardId, ZkNodeProps leaderProps, CoreDescriptor cd,
int nUpdates) {
if (isClosed) {
log.info("We have been closed, won't sync replicas to me.");
return;
}
// sync everyone else
// TODO: we should do this in parallel at least
List<ZkCoreNodeProps> nodes = zkController
@ -289,6 +301,11 @@ public class SyncStrategy {
}
@Override
public void run() {
if (isClosed) {
log.info("We have been closed, won't request recovery");
return;
}
RequestRecovery recoverRequestCmd = new RequestRecovery();
recoverRequestCmd.setAction(CoreAdminAction.REQUESTRECOVERY);
recoverRequestCmd.setCoreName(coreName);

View File

@ -16,6 +16,7 @@
*/
package org.apache.solr.cloud;
import java.io.Closeable;
import java.io.File;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
@ -46,6 +47,7 @@ import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
@ -62,11 +64,13 @@ import org.apache.solr.client.solrj.request.CoreAdminRequest.WaitForState;
import org.apache.solr.client.solrj.cloud.autoscaling.TriggerEventType;
import org.apache.solr.cloud.overseer.OverseerAction;
import org.apache.solr.cloud.overseer.SliceMutator;
import org.apache.solr.common.AlreadyClosedException;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.cloud.BeforeReconnect;
import org.apache.solr.common.cloud.ClusterState;
import org.apache.solr.common.cloud.CollectionStateWatcher;
import org.apache.solr.common.cloud.ConnectionManager;
import org.apache.solr.common.cloud.DefaultConnectionStrategy;
import org.apache.solr.common.cloud.DefaultZkACLProvider;
import org.apache.solr.common.cloud.DefaultZkCredentialsProvider;
@ -90,6 +94,7 @@ import org.apache.solr.common.params.CollectionParams;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.CoreAdminParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.ExecutorUtil;
import org.apache.solr.common.util.IOUtils;
import org.apache.solr.common.util.ObjectReleaseTracker;
import org.apache.solr.common.util.StrUtils;
@ -102,6 +107,7 @@ import org.apache.solr.core.CoreDescriptor;
import org.apache.solr.core.SolrCore;
import org.apache.solr.core.SolrCoreInitializationException;
import org.apache.solr.handler.admin.ConfigSetsHandlerApi;
import org.apache.solr.handler.component.HttpShardHandler;
import org.apache.solr.logging.MDCLoggingContext;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.servlet.SolrDispatchFilter;
@ -137,7 +143,7 @@ import static org.apache.solr.common.cloud.ZkStateReader.SHARD_ID_PROP;
* <p>
* TODO: exceptions during close on attempts to update cloud state
*/
public class ZkController {
public class ZkController implements Closeable {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
static final int WAIT_DOWN_STATES_TIMEOUT_SECONDS = 60;
@ -433,11 +439,14 @@ public class ZkController {
closeOutstandingElections(registerOnReconnect);
markAllAsNotLeader(registerOnReconnect);
}
}, zkACLProvider);
}, zkACLProvider, new ConnectionManager.IsClosed() {
@Override
public boolean isClosed() {
return cc.isShutDown();
}});
this.overseerJobQueue = Overseer.getStateUpdateQueue(zkClient);
this.overseerCollectionQueue = Overseer.getCollectionQueue(zkClient);
this.overseerConfigSetQueue = Overseer.getConfigSetQueue(zkClient);
this.overseerRunningMap = Overseer.getRunningMap(zkClient);
this.overseerCompletedMap = Overseer.getCompletedMap(zkClient);
this.overseerFailureMap = Overseer.getFailureMap(zkClient);
@ -449,6 +458,10 @@ public class ZkController {
init(registerOnReconnect);
this.overseerJobQueue = overseer.getStateUpdateQueue();
this.overseerCollectionQueue = overseer.getCollectionQueue(zkClient);
this.overseerConfigSetQueue = overseer.getConfigSetQueue(zkClient);
assert ObjectReleaseTracker.track(this);
}
@ -554,28 +567,41 @@ public class ZkController {
*/
public void close() {
this.isClosed = true;
ForkJoinPool customThreadPool = new ForkJoinPool(10);
customThreadPool.submit(() -> Collections.singleton(overseerElector.getContext()).parallelStream().forEach(c -> {
IOUtils.closeQuietly(c);
}));
customThreadPool.submit(() -> Collections.singleton(overseer).parallelStream().forEach(c -> {
IOUtils.closeQuietly(c);
}));
synchronized (collectionToTerms) {
collectionToTerms.values().forEach(ZkCollectionTerms::close);
customThreadPool.submit(() -> collectionToTerms.values().parallelStream().forEach(c -> {
c.close();
}));
}
try {
for (ElectionContext context : electionContexts.values()) {
try {
context.close();
} catch (Exception e) {
log.error("Error closing overseer", e);
}
}
customThreadPool.submit(() -> replicateFromLeaders.values().parallelStream().forEach(c -> {
c.stopReplication();
}));
customThreadPool.submit(() -> electionContexts.values().parallelStream().forEach(c -> {
IOUtils.closeQuietly(c);
}));
} finally {
try {
IOUtils.closeQuietly(overseerElector.getContext());
IOUtils.closeQuietly(overseer);
} finally {
if (cloudSolrClient != null) {
IOUtils.closeQuietly(cloudSolrClient);
}
if (cloudManager != null) {
IOUtils.closeQuietly(cloudManager);
}
customThreadPool.submit(() -> Collections.singleton(cloudSolrClient).parallelStream().forEach(c -> {
IOUtils.closeQuietly(c);
}));
customThreadPool.submit(() -> Collections.singleton(cloudManager).parallelStream().forEach(c -> {
IOUtils.closeQuietly(c);
}));
try {
try {
zkStateReader.close();
@ -587,9 +613,16 @@ public class ZkController {
zkClient.close();
} catch (Exception e) {
log.error("Error closing zkClient", e);
} finally {
// just in case the OverseerElectionContext managed to start another Overseer
IOUtils.closeQuietly(overseer);
ExecutorUtil.shutdownAndAwaitTermination(customThreadPool);
}
}
}
}
assert ObjectReleaseTracker.release(this);
}
@ -669,9 +702,11 @@ public class ZkController {
if (cloudManager != null) {
return cloudManager;
}
cloudSolrClient = new CloudSolrClient.Builder(Collections.singletonList(zkServerAddress), Optional.empty())
.withHttpClient(cc.getUpdateShardHandler().getDefaultHttpClient()).build();
cloudSolrClient = new CloudSolrClient.Builder(Collections.singletonList(zkServerAddress), Optional.empty()).withSocketTimeout(30000).withConnectionTimeout(15000)
.withHttpClient(cc.getUpdateShardHandler().getDefaultHttpClient())
.withConnectionTimeout(15000).withSocketTimeout(30000).build();
cloudManager = new SolrClientCloudManager(new ZkDistributedQueueFactory(zkClient), cloudSolrClient);
cloudManager.getClusterStateProvider().connect();
}
return cloudManager;
}
@ -764,7 +799,8 @@ public class ZkController {
* @throws KeeperException if there is a Zookeeper error
* @throws InterruptedException on interrupt
*/
public static void createClusterZkNodes(SolrZkClient zkClient) throws KeeperException, InterruptedException, IOException {
public static void createClusterZkNodes(SolrZkClient zkClient)
throws KeeperException, InterruptedException, IOException {
ZkCmdExecutor cmdExecutor = new ZkCmdExecutor(zkClient.getZkClientTimeout());
cmdExecutor.ensureExists(ZkStateReader.LIVE_NODES_ZKNODE, zkClient);
cmdExecutor.ensureExists(ZkStateReader.COLLECTIONS_ZKNODE, zkClient);
@ -839,7 +875,7 @@ public class ZkController {
// start the overseer first as following code may need it's processing
if (!zkRunOnly) {
overseerElector = new LeaderElector(zkClient);
this.overseer = new Overseer(cc.getShardHandlerFactory().getShardHandler(), cc.getUpdateShardHandler(),
this.overseer = new Overseer((HttpShardHandler) cc.getShardHandlerFactory().getShardHandler(), cc.getUpdateShardHandler(),
CommonParams.CORES_HANDLER_PATH, zkStateReader, this, cloudConfig);
ElectionContext context = new OverseerElectionContext(zkClient,
overseer, getNodeName());
@ -911,10 +947,10 @@ public class ZkController {
LiveNodesListener listener = (oldNodes, newNodes) -> {
oldNodes.removeAll(newNodes);
if (oldNodes.isEmpty()) { // only added nodes
return;
return false;
}
if (isClosed) {
return;
return true;
}
// if this node is in the top three then attempt to create nodeLost message
int i = 0;
@ -923,7 +959,7 @@ public class ZkController {
break;
}
if (i > 2) {
return; // this node is not in the top three
return false; // this node is not in the top three
}
i++;
}
@ -948,12 +984,18 @@ public class ZkController {
}
}
}
return false;
};
zkStateReader.registerLiveNodesListener(listener);
}
public void publishAndWaitForDownStates() throws KeeperException,
InterruptedException {
publishAndWaitForDownStates(WAIT_DOWN_STATES_TIMEOUT_SECONDS);
}
public void publishAndWaitForDownStates(int timeoutSeconds) throws KeeperException,
InterruptedException {
publishNodeAsDown(getNodeName());
@ -983,7 +1025,7 @@ public class ZkController {
});
}
boolean allPublishedDown = latch.await(WAIT_DOWN_STATES_TIMEOUT_SECONDS, TimeUnit.SECONDS);
boolean allPublishedDown = latch.await(timeoutSeconds, TimeUnit.SECONDS);
if (!allPublishedDown) {
log.warn("Timed out waiting to see all nodes published as DOWN in our cluster state.");
}
@ -1051,10 +1093,13 @@ public class ZkController {
log.info("Remove node as live in ZooKeeper:" + nodePath);
List<Op> ops = new ArrayList<>(2);
ops.add(Op.delete(nodePath, -1));
if (zkClient.exists(nodeAddedPath, true)) {
ops.add(Op.delete(nodeAddedPath, -1));
}
try {
zkClient.multi(ops, true);
} catch (NoNodeException e) {
}
}
public String getNodeName() {
@ -1158,6 +1203,10 @@ public class ZkController {
// TODO: should this actually be done earlier, before (or as part of)
// leader election perhaps?
if (core == null) {
throw new SolrException(ErrorCode.SERVICE_UNAVAILABLE, "SolrCore is no longer available to register");
}
UpdateLog ulog = core.getUpdateHandler().getUpdateLog();
boolean isTlogReplicaAndNotLeader = replica.getType() == Replica.Type.TLOG && !isLeader;
if (isTlogReplicaAndNotLeader) {
@ -1270,6 +1319,7 @@ public class ZkController {
final long msInSec = 1000L;
int maxTries = (int) Math.floor(leaderConflictResolveWait / msInSec);
while (!leaderUrl.equals(clusterStateLeaderUrl)) {
if (cc.isShutDown()) throw new AlreadyClosedException();
if (tries > maxTries) {
throw new SolrException(ErrorCode.SERVER_ERROR,
"There is conflicting information about the leader of shard: "
@ -1290,6 +1340,8 @@ public class ZkController {
.getCoreUrl();
}
} catch (AlreadyClosedException e) {
throw e;
} catch (Exception e) {
log.error("Error getting leader from zk", e);
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
@ -1336,7 +1388,7 @@ public class ZkController {
Thread.sleep(1000);
}
if (cc.isShutDown()) {
throw new SolrException(ErrorCode.SERVICE_UNAVAILABLE, "CoreContainer is closed");
throw new AlreadyClosedException();
}
}
throw new SolrException(ErrorCode.SERVICE_UNAVAILABLE, "Could not get leader props", exp);
@ -2392,6 +2444,9 @@ public class ZkController {
}
private boolean fireEventListeners(String zkDir) {
if (isClosed || cc.isShutDown()) {
return false;
}
synchronized (confDirectoryListeners) {
// if this is not among directories to be watched then don't set the watcher anymore
if (!confDirectoryListeners.containsKey(zkDir)) {
@ -2527,15 +2582,17 @@ public class ZkController {
* @param nodeName to operate on
*/
public void publishNodeAsDown(String nodeName) {
log.debug("Publish node={} as DOWN", nodeName);
log.info("Publish node={} as DOWN", nodeName);
ZkNodeProps m = new ZkNodeProps(Overseer.QUEUE_OPERATION, OverseerAction.DOWNNODE.toLower(),
ZkStateReader.NODE_NAME_PROP, nodeName);
try {
Overseer.getStateUpdateQueue(getZkClient()).offer(Utils.toJSON(m));
overseer.getStateUpdateQueue().offer(Utils.toJSON(m));
} catch (AlreadyClosedException e) {
log.info("Not publishing node as DOWN because a resource required to do so is already closed.");
} catch (InterruptedException e) {
Thread.interrupted();
Thread.currentThread().interrupt();
log.debug("Publish node as down was interrupted.");
} catch (Exception e) {
} catch (KeeperException e) {
log.warn("Could not publish node as down: " + e.getMessage());
}
}

View File

@ -39,6 +39,7 @@ import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.cloud.SolrZkClient;
import org.apache.solr.common.cloud.ZkCmdExecutor;
import org.apache.solr.common.cloud.ConnectionManager.IsClosed;
import org.apache.solr.common.util.Pair;
import org.apache.zookeeper.CreateMode;
import org.apache.zookeeper.KeeperException;
@ -115,9 +116,13 @@ public class ZkDistributedQueue implements DistributedQueue {
}
public ZkDistributedQueue(SolrZkClient zookeeper, String dir, Stats stats, int maxQueueSize) {
this(zookeeper, dir, stats, maxQueueSize, null);
}
public ZkDistributedQueue(SolrZkClient zookeeper, String dir, Stats stats, int maxQueueSize, IsClosed higherLevelIsClosed) {
this.dir = dir;
ZkCmdExecutor cmdExecutor = new ZkCmdExecutor(zookeeper.getZkClientTimeout());
ZkCmdExecutor cmdExecutor = new ZkCmdExecutor(zookeeper.getZkClientTimeout(), higherLevelIsClosed);
try {
cmdExecutor.ensureExists(dir, zookeeper);
} catch (KeeperException e) {

View File

@ -315,27 +315,22 @@ public class ZkShardTerms implements AutoCloseable{
private void ensureTermNodeExist() {
String path = "/collections/" + collection + "/terms";
try {
if (!zkClient.exists(path, true)) {
try {
zkClient.makePath(path, true);
} catch (KeeperException.NodeExistsException e) {
// it's okay if another beats us creating the node
}
}
path += "/" + shard;
if (!zkClient.exists(path, true)) {
try {
Map<String,Long> initialTerms = new HashMap<>();
zkClient.create(path, Utils.toJSON(initialTerms), CreateMode.PERSISTENT, true);
zkClient.makePath(path, Utils.toJSON(initialTerms), CreateMode.PERSISTENT, true);
} catch (KeeperException.NodeExistsException e) {
// it's okay if another beats us creating the node
}
}
} catch (InterruptedException e) {
Thread.interrupted();
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Error creating shard term node in Zookeeper for collection: " + collection, e);
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
"Error creating shard term node in Zookeeper for collection: " + collection, e);
} catch (KeeperException e) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Error creating shard term node in Zookeeper for collection: " + collection, e);
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
"Error creating shard term node in Zookeeper for collection: " + collection, e);
}
}

View File

@ -245,7 +245,7 @@ public class AddReplicaCmd implements OverseerCollectionMessageHandler.Cmd {
props = props.plus(ZkStateReader.CORE_NODE_NAME_PROP, createReplica.coreNodeName);
}
try {
Overseer.getStateUpdateQueue(zkStateReader.getZkClient()).offer(Utils.toJSON(props));
ocmh.overseer.offerStateUpdate(Utils.toJSON(props));
} catch (Exception e) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Exception updating Overseer state queue", e);
}
@ -328,6 +328,7 @@ public class AddReplicaCmd implements OverseerCollectionMessageHandler.Cmd {
}
}
}
log.info("Returning CreateReplica command.");
return new CreateReplica(collection, shard, node, replicaType, coreName, coreNodeName);
}

View File

@ -115,7 +115,7 @@ public class Assign {
} catch (IOException | KeeperException e) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Error inc and get counter from Zookeeper for collection:"+collection, e);
} catch (InterruptedException e) {
Thread.interrupted();
Thread.currentThread().interrupt();
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Error inc and get counter from Zookeeper for collection:" + collection, e);
}
}
@ -182,21 +182,34 @@ public class Assign {
return String.format(Locale.ROOT, "%s_%s_replica_%s%s", collectionName, shard, type.name().substring(0,1).toLowerCase(Locale.ROOT), replicaNum);
}
private static int defaultCounterValue(DocCollection collection, boolean newCollection) {
private static int defaultCounterValue(DocCollection collection, boolean newCollection, String shard) {
if (newCollection) return 0;
int defaultValue = collection.getReplicas().size();
int defaultValue;
if (collection.getSlice(shard) != null && collection.getSlice(shard).getReplicas().isEmpty()) {
return 0;
} else {
defaultValue = collection.getReplicas().size() * 2;
}
if (collection.getReplicationFactor() != null) {
// numReplicas and replicationFactor * numSlices can be not equals,
// in case of many addReplicas or deleteReplicas are executed
defaultValue = Math.max(defaultValue,
collection.getReplicationFactor() * collection.getSlices().size());
}
return defaultValue * 20;
return defaultValue;
}
private static int defaultCounterValue(DocCollection collection, boolean newCollection) {
if (newCollection) return 0;
int defaultValue = collection.getReplicas().size();
return defaultValue;
}
public static String buildSolrCoreName(DistribStateManager stateManager, DocCollection collection, String shard, Replica.Type type, boolean newCollection) {
Slice slice = collection.getSlice(shard);
int defaultValue = defaultCounterValue(collection, newCollection);
int defaultValue = defaultCounterValue(collection, newCollection, shard);
int replicaNum = incAndGetId(stateManager, collection.getName(), defaultValue);
String coreName = buildSolrCoreName(collection.getName(), shard, type, replicaNum);
while (existCoreName(coreName, slice)) {

View File

@ -160,7 +160,7 @@ public class BackupCmd implements OverseerCollectionMessageHandler.Cmd {
String backupName = request.getStr(NAME);
String asyncId = request.getStr(ASYNC);
String repoName = request.getStr(CoreAdminParams.BACKUP_REPOSITORY);
ShardHandler shardHandler = ocmh.shardHandlerFactory.getShardHandler();
ShardHandler shardHandler = ocmh.shardHandlerFactory.getShardHandler(ocmh.overseer.getCoreContainer().getUpdateShardHandler().getDefaultHttpClient());
Map<String, String> requestMap = new HashMap<>();
String commitName = request.getStr(CoreAdminParams.COMMIT_NAME);

View File

@ -156,7 +156,7 @@ public class CreateCollectionCmd implements OverseerCollectionMessageHandler.Cmd
createCollectionZkNode(stateManager, collectionName, collectionParams);
Overseer.getStateUpdateQueue(zkStateReader.getZkClient()).offer(Utils.toJSON(message));
ocmh.overseer.offerStateUpdate(Utils.toJSON(message));
// wait for a while until we see the collection
TimeOut waitUntil = new TimeOut(30, TimeUnit.SECONDS, timeSource);
@ -195,7 +195,7 @@ public class CreateCollectionCmd implements OverseerCollectionMessageHandler.Cmd
log.debug(formatString("Creating SolrCores for new collection {0}, shardNames {1} , message : {2}",
collectionName, shardNames, message));
Map<String,ShardRequest> coresToCreate = new LinkedHashMap<>();
ShardHandler shardHandler = ocmh.shardHandlerFactory.getShardHandler();
ShardHandler shardHandler = ocmh.shardHandlerFactory.getShardHandler(ocmh.overseer.getCoreContainer().getUpdateShardHandler().getDefaultHttpClient());
for (ReplicaPosition replicaPosition : replicaPositions) {
String nodeName = replicaPosition.node;
@ -235,7 +235,7 @@ public class CreateCollectionCmd implements OverseerCollectionMessageHandler.Cmd
ZkStateReader.BASE_URL_PROP, baseUrl,
ZkStateReader.REPLICA_TYPE, replicaPosition.type.name(),
CommonAdminParams.WAIT_FOR_FINAL_STATE, Boolean.toString(waitForFinalState));
Overseer.getStateUpdateQueue(zkStateReader.getZkClient()).offer(Utils.toJSON(props));
ocmh.overseer.offerStateUpdate(Utils.toJSON(props));
}
// Need to create new params for each request
@ -308,7 +308,7 @@ public class CreateCollectionCmd implements OverseerCollectionMessageHandler.Cmd
Overseer.QUEUE_OPERATION, MODIFYCOLLECTION.toString(),
ZkStateReader.COLLECTION_PROP, withCollection,
CollectionAdminParams.COLOCATED_WITH, collectionName);
Overseer.getStateUpdateQueue(zkStateReader.getZkClient()).offer(Utils.toJSON(props));
ocmh.overseer.offerStateUpdate(Utils.toJSON(props));
try {
zkStateReader.waitForState(withCollection, 5, TimeUnit.SECONDS, (liveNodes, collectionState) -> collectionName.equals(collectionState.getStr(COLOCATED_WITH)));
} catch (TimeoutException e) {

View File

@ -21,7 +21,6 @@ import java.lang.invoke.MethodHandles;
import java.util.HashMap;
import java.util.Map;
import org.apache.solr.cloud.Overseer;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.cloud.ClusterState;
import org.apache.solr.common.cloud.DocCollection;
@ -71,7 +70,7 @@ public class CreateShardCmd implements OverseerCollectionMessageHandler.Cmd {
}
ZkStateReader zkStateReader = ocmh.zkStateReader;
Overseer.getStateUpdateQueue(zkStateReader.getZkClient()).offer(Utils.toJSON(message));
ocmh.overseer.offerStateUpdate(Utils.toJSON(message));
// wait for a while until we see the shard
ocmh.waitForNewShard(collectionName, sliceName);
String async = message.getStr(ASYNC);

View File

@ -84,7 +84,7 @@ public class CreateSnapshotCmd implements OverseerCollectionMessageHandler.Cmd {
Map<String, String> requestMap = new HashMap<>();
NamedList shardRequestResults = new NamedList();
Map<String, Slice> shardByCoreName = new HashMap<>();
ShardHandler shardHandler = ocmh.shardHandlerFactory.getShardHandler();
ShardHandler shardHandler = ocmh.shardHandlerFactory.getShardHandler(ocmh.overseer.getCoreContainer().getUpdateShardHandler().getDefaultHttpClient());
for (Slice slice : ocmh.zkStateReader.getClusterState().getCollection(collectionName).getSlices()) {
for (Replica replica : slice.getReplicas()) {

View File

@ -46,7 +46,6 @@ import org.apache.solr.core.SolrInfoBean;
import org.apache.solr.core.snapshots.SolrSnapshotManager;
import org.apache.solr.handler.admin.MetricsHistoryHandler;
import org.apache.solr.metrics.SolrMetricManager;
import org.apache.solr.util.TimeOut;
import org.apache.zookeeper.KeeperException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -127,24 +126,26 @@ public class DeleteCollectionCmd implements OverseerCollectionMessageHandler.Cmd
}
ZkNodeProps m = new ZkNodeProps(Overseer.QUEUE_OPERATION, DELETE.toLower(), NAME, collection);
Overseer.getStateUpdateQueue(zkStateReader.getZkClient()).offer(Utils.toJSON(m));
ocmh.overseer.offerStateUpdate(Utils.toJSON(m));
// wait for a while until we don't see the collection
TimeOut timeout = new TimeOut(30, TimeUnit.SECONDS, timeSource);
boolean removed = false;
while (! timeout.hasTimedOut()) {
timeout.sleep(100);
removed = !zkStateReader.getClusterState().hasCollection(collection);
if (removed) {
timeout.sleep(500); // just a bit of time so it's more likely other
// readers see on return
break;
}
}
if (!removed) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
"Could not fully remove collection: " + collection);
}
zkStateReader.waitForState(collection, 60, TimeUnit.SECONDS, (liveNodes, collectionState) -> collectionState == null);
// TimeOut timeout = new TimeOut(60, TimeUnit.SECONDS, timeSource);
// boolean removed = false;
// while (! timeout.hasTimedOut()) {
// timeout.sleep(100);
// removed = !zkStateReader.getClusterState().hasCollection(collection);
// if (removed) {
// timeout.sleep(500); // just a bit of time so it's more likely other
// // readers see on return
// break;
// }
// }
// if (!removed) {
// throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
// "Could not fully remove collection: " + collection);
// }
} finally {
try {

View File

@ -218,7 +218,7 @@ public class DeleteReplicaCmd implements Cmd {
" with onlyIfDown='true', but state is '" + replica.getStr(ZkStateReader.STATE_PROP) + "'");
}
ShardHandler shardHandler = ocmh.shardHandlerFactory.getShardHandler();
ShardHandler shardHandler = ocmh.shardHandlerFactory.getShardHandler(ocmh.overseer.getCoreContainer().getUpdateShardHandler().getDefaultHttpClient());
String core = replica.getStr(ZkStateReader.CORE_NAME_PROP);
String asyncId = message.getStr(ASYNC);
AtomicReference<Map<String, String>> requestMap = new AtomicReference<>(null);
@ -246,7 +246,7 @@ public class DeleteReplicaCmd implements Cmd {
ocmh.processResponses(results, shardHandler, false, null, asyncId, requestMap.get());
//check if the core unload removed the corenode zk entry
if (ocmh.waitForCoreNodeGone(collectionName, shard, replicaName, 5000)) return Boolean.TRUE;
if (ocmh.waitForCoreNodeGone(collectionName, shard, replicaName, 30000)) return Boolean.TRUE;
}
// try and ensure core info is removed from cluster state

View File

@ -17,6 +17,13 @@
*/
package org.apache.solr.cloud.api.collections;
import static org.apache.solr.common.cloud.ZkStateReader.COLLECTION_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.NODE_NAME_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.SHARD_ID_PROP;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.DELETEREPLICA;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.DELETESHARD;
import static org.apache.solr.common.params.CommonAdminParams.ASYNC;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.HashMap;
@ -26,12 +33,10 @@ import java.util.Map;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import org.apache.solr.client.solrj.cloud.DistributedQueue;
import org.apache.solr.cloud.Overseer;
import org.apache.solr.cloud.overseer.OverseerAction;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.cloud.ClusterState;
import org.apache.solr.common.cloud.DocCollection;
import org.apache.solr.common.cloud.Replica;
import org.apache.solr.common.cloud.Slice;
import org.apache.solr.common.cloud.ZkNodeProps;
@ -41,18 +46,10 @@ import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.common.util.TimeSource;
import org.apache.solr.common.util.Utils;
import org.apache.solr.util.TimeOut;
import org.apache.zookeeper.KeeperException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import static org.apache.solr.common.cloud.ZkStateReader.COLLECTION_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.NODE_NAME_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.SHARD_ID_PROP;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.DELETEREPLICA;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.DELETESHARD;
import static org.apache.solr.common.params.CommonAdminParams.ASYNC;
public class DeleteShardCmd implements OverseerCollectionMessageHandler.Cmd {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
private final OverseerCollectionMessageHandler ocmh;
@ -85,13 +82,12 @@ public class DeleteShardCmd implements OverseerCollectionMessageHandler.Cmd {
if (state == Slice.State.RECOVERY) {
// mark the slice as 'construction' and only then try to delete the cores
// see SOLR-9455
DistributedQueue inQueue = Overseer.getStateUpdateQueue(ocmh.zkStateReader.getZkClient());
Map<String, Object> propMap = new HashMap<>();
propMap.put(Overseer.QUEUE_OPERATION, OverseerAction.UPDATESHARDSTATE.toLower());
propMap.put(sliceId, Slice.State.CONSTRUCTION.toString());
propMap.put(ZkStateReader.COLLECTION_PROP, collectionName);
ZkNodeProps m = new ZkNodeProps(propMap);
inQueue.offer(Utils.toJSON(m));
ocmh.overseer.offerStateUpdate(Utils.toJSON(m));
}
String asyncId = message.getStr(ASYNC);
@ -129,29 +125,14 @@ public class DeleteShardCmd implements OverseerCollectionMessageHandler.Cmd {
}
}
log.debug("Waiting for delete shard action to complete");
cleanupLatch.await(5, TimeUnit.MINUTES);
cleanupLatch.await(1, TimeUnit.MINUTES);
ZkNodeProps m = new ZkNodeProps(Overseer.QUEUE_OPERATION, DELETESHARD.toLower(), ZkStateReader.COLLECTION_PROP,
collectionName, ZkStateReader.SHARD_ID_PROP, sliceId);
ZkStateReader zkStateReader = ocmh.zkStateReader;
Overseer.getStateUpdateQueue(zkStateReader.getZkClient()).offer(Utils.toJSON(m));
ocmh.overseer.offerStateUpdate(Utils.toJSON(m));
// wait for a while until we don't see the shard
TimeOut timeout = new TimeOut(30, TimeUnit.SECONDS, timeSource);
boolean removed = false;
while (!timeout.hasTimedOut()) {
timeout.sleep(100);
DocCollection collection = zkStateReader.getClusterState().getCollection(collectionName);
removed = collection.getSlice(sliceId) == null;
if (removed) {
timeout.sleep(100); // just a bit of time so it's more likely other readers see on return
break;
}
}
if (!removed) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
"Could not fully remove collection: " + collectionName + " shard: " + sliceId);
}
zkStateReader.waitForState(collectionName, 45, TimeUnit.SECONDS, (l, c) -> c.getSlice(sliceId) == null);
log.info("Successfully deleted collection: " + collectionName + ", shard: " + sliceId);
} catch (SolrException e) {

View File

@ -69,7 +69,7 @@ public class DeleteSnapshotCmd implements OverseerCollectionMessageHandler.Cmd {
String asyncId = message.getStr(ASYNC);
Map<String, String> requestMap = new HashMap<>();
NamedList shardRequestResults = new NamedList();
ShardHandler shardHandler = ocmh.shardHandlerFactory.getShardHandler();
ShardHandler shardHandler = ocmh.shardHandlerFactory.getShardHandler(ocmh.overseer.getCoreContainer().getUpdateShardHandler().getDefaultHttpClient());
SolrZkClient zkClient = ocmh.zkStateReader.getZkClient();
Optional<CollectionSnapshotMetaData> meta = SolrSnapshotManager.getCollectionLevelSnapshot(zkClient, collectionName, commitName);

View File

@ -42,6 +42,7 @@ import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.TimeSource;
import org.apache.solr.common.util.Utils;
import org.apache.solr.handler.component.HttpShardHandlerFactory;
import org.apache.solr.handler.component.ShardHandler;
import org.apache.solr.handler.component.ShardHandlerFactory;
import org.apache.solr.update.SolrIndexSplitter;
@ -146,7 +147,7 @@ public class MigrateCmd implements OverseerCollectionMessageHandler.Cmd {
DocRouter.Range keyHashRange = sourceRouter.keyHashRange(splitKey);
ShardHandlerFactory shardHandlerFactory = ocmh.shardHandlerFactory;
ShardHandler shardHandler = shardHandlerFactory.getShardHandler();
ShardHandler shardHandler = ((HttpShardHandlerFactory)shardHandlerFactory).getShardHandler(ocmh.overseer.getCoreContainer().getUpdateShardHandler().getDefaultHttpClient());
log.info("Hash range for split.key: {} is: {}", splitKey, keyHashRange);
// intersect source range, keyHashRange and target range
@ -181,7 +182,7 @@ public class MigrateCmd implements OverseerCollectionMessageHandler.Cmd {
"targetCollection", targetCollection.getName(),
"expireAt", RoutingRule.makeExpiryAt(timeout));
log.info("Adding routing rule: " + m);
Overseer.getStateUpdateQueue(zkStateReader.getZkClient()).offer(Utils.toJSON(m));
ocmh.overseer.offerStateUpdate(Utils.toJSON(m));
// wait for a while until we see the new rule
log.info("Waiting to see routing rule updated in clusterstate");

View File

@ -16,6 +16,58 @@
*/
package org.apache.solr.cloud.api.collections;
import static org.apache.solr.client.solrj.cloud.autoscaling.Policy.POLICY;
import static org.apache.solr.common.cloud.DocCollection.SNITCH;
import static org.apache.solr.common.cloud.ZkStateReader.BASE_URL_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.COLLECTION_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.CORE_NAME_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.CORE_NODE_NAME_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.ELECTION_NODE_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.PROPERTY_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.PROPERTY_VALUE_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.REJOIN_AT_HEAD_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.REPLICA_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.SHARD_ID_PROP;
import static org.apache.solr.common.params.CollectionAdminParams.COLLECTION;
import static org.apache.solr.common.params.CollectionAdminParams.COLOCATED_WITH;
import static org.apache.solr.common.params.CollectionAdminParams.WITH_COLLECTION;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.ADDREPLICA;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.ADDREPLICAPROP;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.ADDROLE;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.ALIASPROP;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.BACKUP;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.BALANCESHARDUNIQUE;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.CREATE;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.CREATEALIAS;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.CREATESHARD;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.CREATESNAPSHOT;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.DELETE;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.DELETEALIAS;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.DELETENODE;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.DELETEREPLICA;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.DELETEREPLICAPROP;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.DELETESHARD;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.DELETESNAPSHOT;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.MAINTAINROUTEDALIAS;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.MIGRATE;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.MIGRATESTATEFORMAT;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.MOCK_COLL_TASK;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.MOCK_REPLICA_TASK;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.MOCK_SHARD_TASK;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.MODIFYCOLLECTION;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.MOVEREPLICA;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.OVERSEERSTATUS;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.REBALANCELEADERS;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.RELOAD;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.REMOVEROLE;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.REPLACENODE;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.RESTORE;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.SPLITSHARD;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.UTILIZENODE;
import static org.apache.solr.common.params.CommonAdminParams.ASYNC;
import static org.apache.solr.common.params.CommonParams.NAME;
import static org.apache.solr.common.util.Utils.makeMap;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
@ -30,13 +82,12 @@ import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.SynchronousQueue;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import com.google.common.collect.ImmutableMap;
import org.apache.commons.lang.StringUtils;
import org.apache.solr.client.solrj.SolrResponse;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.cloud.DistribStateManager;
import org.apache.solr.client.solrj.cloud.DistributedQueue;
import org.apache.solr.client.solrj.cloud.SolrCloudManager;
import org.apache.solr.client.solrj.cloud.autoscaling.AlreadyExistsException;
import org.apache.solr.client.solrj.cloud.autoscaling.BadVersionException;
@ -79,8 +130,8 @@ import org.apache.solr.common.util.StrUtils;
import org.apache.solr.common.util.SuppressForbidden;
import org.apache.solr.common.util.TimeSource;
import org.apache.solr.common.util.Utils;
import org.apache.solr.handler.component.HttpShardHandlerFactory;
import org.apache.solr.handler.component.ShardHandler;
import org.apache.solr.handler.component.ShardHandlerFactory;
import org.apache.solr.handler.component.ShardRequest;
import org.apache.solr.handler.component.ShardResponse;
import org.apache.solr.logging.MDCLoggingContext;
@ -92,25 +143,7 @@ import org.apache.zookeeper.KeeperException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import static org.apache.solr.client.solrj.cloud.autoscaling.Policy.POLICY;
import static org.apache.solr.common.cloud.DocCollection.SNITCH;
import static org.apache.solr.common.cloud.ZkStateReader.BASE_URL_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.COLLECTION_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.CORE_NAME_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.CORE_NODE_NAME_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.ELECTION_NODE_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.PROPERTY_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.PROPERTY_VALUE_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.REJOIN_AT_HEAD_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.REPLICA_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.SHARD_ID_PROP;
import static org.apache.solr.common.params.CollectionAdminParams.COLLECTION;
import static org.apache.solr.common.params.CollectionAdminParams.COLOCATED_WITH;
import static org.apache.solr.common.params.CollectionAdminParams.WITH_COLLECTION;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.*;
import static org.apache.solr.common.params.CommonAdminParams.ASYNC;
import static org.apache.solr.common.params.CommonParams.NAME;
import static org.apache.solr.common.util.Utils.makeMap;
import com.google.common.collect.ImmutableMap;
/**
* A {@link OverseerMessageHandler} that handles Collections API related
@ -158,7 +191,7 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler,
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
Overseer overseer;
ShardHandlerFactory shardHandlerFactory;
HttpShardHandlerFactory shardHandlerFactory;
String adminPath;
ZkStateReader zkStateReader;
SolrCloudManager cloudManager;
@ -191,7 +224,7 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler,
private volatile boolean isClosed;
public OverseerCollectionMessageHandler(ZkStateReader zkStateReader, String myId,
final ShardHandlerFactory shardHandlerFactory,
final HttpShardHandlerFactory shardHandlerFactory,
String adminPath,
Stats stats,
Overseer overseer,
@ -334,7 +367,7 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler,
sreq.shards = new String[] {baseUrl};
sreq.actualShards = sreq.shards;
sreq.params = params;
ShardHandler shardHandler = shardHandlerFactory.getShardHandler();
ShardHandler shardHandler = shardHandlerFactory.getShardHandler(overseer.getCoreContainer().getUpdateShardHandler().getDefaultHttpClient());
shardHandler.submit(sreq, baseUrl, sreq.params);
}
@ -343,24 +376,22 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler,
throws Exception {
checkRequired(message, COLLECTION_PROP, SHARD_ID_PROP, REPLICA_PROP, PROPERTY_PROP, PROPERTY_VALUE_PROP);
SolrZkClient zkClient = zkStateReader.getZkClient();
DistributedQueue inQueue = Overseer.getStateUpdateQueue(zkClient);
Map<String, Object> propMap = new HashMap<>();
propMap.put(Overseer.QUEUE_OPERATION, ADDREPLICAPROP.toLower());
propMap.putAll(message.getProperties());
ZkNodeProps m = new ZkNodeProps(propMap);
inQueue.offer(Utils.toJSON(m));
overseer.offerStateUpdate(Utils.toJSON(m));
}
private void processReplicaDeletePropertyCommand(ClusterState clusterState, ZkNodeProps message, NamedList results)
throws Exception {
checkRequired(message, COLLECTION_PROP, SHARD_ID_PROP, REPLICA_PROP, PROPERTY_PROP);
SolrZkClient zkClient = zkStateReader.getZkClient();
DistributedQueue inQueue = Overseer.getStateUpdateQueue(zkClient);
Map<String, Object> propMap = new HashMap<>();
propMap.put(Overseer.QUEUE_OPERATION, DELETEREPLICAPROP.toLower());
propMap.putAll(message.getProperties());
ZkNodeProps m = new ZkNodeProps(propMap);
inQueue.offer(Utils.toJSON(m));
overseer.offerStateUpdate(Utils.toJSON(m));
}
private void balanceProperty(ClusterState clusterState, ZkNodeProps message, NamedList results) throws Exception {
@ -370,11 +401,10 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler,
"' parameters are required for the BALANCESHARDUNIQUE operation, no action taken");
}
SolrZkClient zkClient = zkStateReader.getZkClient();
DistributedQueue inQueue = Overseer.getStateUpdateQueue(zkClient);
Map<String, Object> propMap = new HashMap<>();
propMap.put(Overseer.QUEUE_OPERATION, BALANCESHARDUNIQUE.toLower());
propMap.putAll(message.getProperties());
inQueue.offer(Utils.toJSON(new ZkNodeProps(propMap)));
Map<String, Object> m = new HashMap<>();
m.put(Overseer.QUEUE_OPERATION, BALANCESHARDUNIQUE.toLower());
m.putAll(message.getProperties());
overseer.offerStateUpdate(Utils.toJSON(m));
}
/**
@ -417,20 +447,21 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler,
}
boolean waitForCoreNodeGone(String collectionName, String shard, String replicaName, int timeoutms) throws InterruptedException {
TimeOut timeout = new TimeOut(timeoutms, TimeUnit.MILLISECONDS, timeSource);
while (! timeout.hasTimedOut()) {
timeout.sleep(100);
DocCollection docCollection = zkStateReader.getClusterState().getCollection(collectionName);
if (docCollection == null) { // someone already deleted the collection
try {
zkStateReader.waitForState(collectionName, timeoutms, TimeUnit.MILLISECONDS, (n, c) -> {
if (c == null)
return true;
}
Slice slice = docCollection.getSlice(shard);
Slice slice = c.getSlice(shard);
if(slice == null || slice.getReplica(replicaName) == null) {
return true;
}
}
// replica still exists after the timeout
return false;
});
} catch (TimeoutException e) {
return false;
}
return true;
}
void deleteCoreNode(String collectionName, String replicaName, Replica replica, String core) throws Exception {
@ -441,7 +472,7 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler,
ZkStateReader.COLLECTION_PROP, collectionName,
ZkStateReader.CORE_NODE_NAME_PROP, replicaName,
ZkStateReader.BASE_URL_PROP, replica.getStr(ZkStateReader.BASE_URL_PROP));
Overseer.getStateUpdateQueue(zkStateReader.getZkClient()).offer(Utils.toJSON(m));
overseer.offerStateUpdate(Utils.toJSON(m));
}
void checkRequired(ZkNodeProps message, String... props) {
@ -475,7 +506,7 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler,
// Actually queue the migration command.
firstLoop = false;
ZkNodeProps m = new ZkNodeProps(Overseer.QUEUE_OPERATION, MIGRATESTATEFORMAT.toLower(), COLLECTION_PROP, collectionName);
Overseer.getStateUpdateQueue(zkStateReader.getZkClient()).offer(Utils.toJSON(m));
overseer.offerStateUpdate(Utils.toJSON(m));
}
timeout.sleep(100);
}
@ -584,7 +615,7 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler,
}
public static void sendShardRequest(String nodeName, ModifiableSolrParams params, ShardHandler shardHandler,
public void sendShardRequest(String nodeName, ModifiableSolrParams params, ShardHandler shardHandler,
String asyncId, Map<String, String> requestMap, String adminPath,
ZkStateReader zkStateReader) {
if (asyncId != null) {
@ -640,7 +671,7 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler,
reloadCollection(null, new ZkNodeProps(NAME, collectionName), results);
}
Overseer.getStateUpdateQueue(zkStateReader.getZkClient()).offer(Utils.toJSON(message));
overseer.offerStateUpdate(Utils.toJSON(message));
TimeOut timeout = new TimeOut(30, TimeUnit.SECONDS, timeSource);
boolean areChangesVisible = true;
@ -680,8 +711,9 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler,
}
Map<String, Replica> waitToSeeReplicasInState(String collectionName, Collection<String> coreNames) throws InterruptedException {
assert coreNames.size() > 0;
Map<String, Replica> result = new HashMap<>();
TimeOut timeout = new TimeOut(30, TimeUnit.SECONDS, timeSource);
TimeOut timeout = new TimeOut(Integer.getInteger("solr.waitToSeeReplicasInStateTimeoutSeconds", 120), TimeUnit.SECONDS, timeSource); // could be a big cluster
while (true) {
DocCollection coll = zkStateReader.getClusterState().getCollection(collectionName);
for (String coreName : coreNames) {
@ -791,7 +823,7 @@ public class OverseerCollectionMessageHandler implements OverseerMessageHandler,
NamedList results, Replica.State stateMatcher, String asyncId, Map<String, String> requestMap, Set<String> okayExceptions) {
log.info("Executing Collection Cmd={}, asyncId={}", params, asyncId);
String collectionName = message.getStr(NAME);
ShardHandler shardHandler = shardHandlerFactory.getShardHandler();
ShardHandler shardHandler = shardHandlerFactory.getShardHandler(overseer.getCoreContainer().getUpdateShardHandler().getDefaultHttpClient());
ClusterState clusterState = zkStateReader.getClusterState();
DocCollection coll = clusterState.getCollection(collectionName);

View File

@ -18,6 +18,20 @@
package org.apache.solr.cloud.api.collections;
import static org.apache.solr.common.cloud.DocCollection.STATE_FORMAT;
import static org.apache.solr.common.cloud.ZkStateReader.COLLECTION_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.MAX_SHARDS_PER_NODE;
import static org.apache.solr.common.cloud.ZkStateReader.NRT_REPLICAS;
import static org.apache.solr.common.cloud.ZkStateReader.PULL_REPLICAS;
import static org.apache.solr.common.cloud.ZkStateReader.REPLICATION_FACTOR;
import static org.apache.solr.common.cloud.ZkStateReader.REPLICA_TYPE;
import static org.apache.solr.common.cloud.ZkStateReader.SHARD_ID_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.TLOG_REPLICAS;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.CREATE;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.CREATESHARD;
import static org.apache.solr.common.params.CommonAdminParams.ASYNC;
import static org.apache.solr.common.params.CommonParams.NAME;
import java.lang.invoke.MethodHandles;
import java.net.URI;
import java.util.ArrayList;
@ -33,7 +47,6 @@ import java.util.Optional;
import java.util.Properties;
import java.util.Set;
import org.apache.solr.client.solrj.cloud.DistributedQueue;
import org.apache.solr.client.solrj.cloud.autoscaling.PolicyHelper;
import org.apache.solr.cloud.Overseer;
import org.apache.solr.cloud.overseer.OverseerAction;
@ -60,20 +73,6 @@ import org.apache.solr.handler.component.ShardHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import static org.apache.solr.common.cloud.DocCollection.STATE_FORMAT;
import static org.apache.solr.common.cloud.ZkStateReader.COLLECTION_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.MAX_SHARDS_PER_NODE;
import static org.apache.solr.common.cloud.ZkStateReader.NRT_REPLICAS;
import static org.apache.solr.common.cloud.ZkStateReader.PULL_REPLICAS;
import static org.apache.solr.common.cloud.ZkStateReader.REPLICATION_FACTOR;
import static org.apache.solr.common.cloud.ZkStateReader.REPLICA_TYPE;
import static org.apache.solr.common.cloud.ZkStateReader.SHARD_ID_PROP;
import static org.apache.solr.common.cloud.ZkStateReader.TLOG_REPLICAS;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.CREATE;
import static org.apache.solr.common.params.CollectionParams.CollectionAction.CREATESHARD;
import static org.apache.solr.common.params.CommonAdminParams.ASYNC;
import static org.apache.solr.common.params.CommonParams.NAME;
public class RestoreCmd implements OverseerCollectionMessageHandler.Cmd {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
@ -89,7 +88,7 @@ public class RestoreCmd implements OverseerCollectionMessageHandler.Cmd {
String restoreCollectionName = message.getStr(COLLECTION_PROP);
String backupName = message.getStr(NAME); // of backup
ShardHandler shardHandler = ocmh.shardHandlerFactory.getShardHandler();
ShardHandler shardHandler = ocmh.shardHandlerFactory.getShardHandler(ocmh.overseer.getCoreContainer().getUpdateShardHandler().getDefaultHttpClient());
String asyncId = message.getStr(ASYNC);
String repo = message.getStr(CoreAdminParams.BACKUP_REPOSITORY);
Map<String, String> requestMap = new HashMap<>();
@ -209,8 +208,6 @@ public class RestoreCmd implements OverseerCollectionMessageHandler.Cmd {
DocCollection restoreCollection = zkStateReader.getClusterState().getCollection(restoreCollectionName);
DistributedQueue inQueue = Overseer.getStateUpdateQueue(zkStateReader.getZkClient());
//Mark all shards in CONSTRUCTION STATE while we restore the data
{
//TODO might instead createCollection accept an initial state? Is there a race?
@ -220,7 +217,7 @@ public class RestoreCmd implements OverseerCollectionMessageHandler.Cmd {
propMap.put(shard.getName(), Slice.State.CONSTRUCTION.toString());
}
propMap.put(ZkStateReader.COLLECTION_PROP, restoreCollectionName);
inQueue.offer(Utils.toJSON(new ZkNodeProps(propMap)));
ocmh.overseer.offerStateUpdate(Utils.toJSON(new ZkNodeProps(propMap)));
}
// TODO how do we leverage the RULE / SNITCH logic in createCollection?
@ -323,7 +320,7 @@ public class RestoreCmd implements OverseerCollectionMessageHandler.Cmd {
for (Slice shard : restoreCollection.getSlices()) {
propMap.put(shard.getName(), Slice.State.ACTIVE.toString());
}
inQueue.offer(Utils.toJSON(new ZkNodeProps(propMap)));
ocmh.overseer.offerStateUpdate((Utils.toJSON(new ZkNodeProps(propMap))));
}
if (totalReplicasPerShard > 1) {

View File

@ -30,7 +30,6 @@ import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;
import org.apache.solr.client.solrj.cloud.DistributedQueue;
import org.apache.solr.client.solrj.cloud.NodeStateProvider;
import org.apache.solr.client.solrj.cloud.SolrCloudManager;
import org.apache.solr.client.solrj.cloud.autoscaling.PolicyHelper;
@ -249,8 +248,8 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
propMap.put(ZkStateReader.SHARD_PARENT_PROP, parentSlice.getName());
propMap.put("shard_parent_node", nodeName);
propMap.put("shard_parent_zk_session", leaderZnodeStat.getEphemeralOwner());
DistributedQueue inQueue = Overseer.getStateUpdateQueue(zkStateReader.getZkClient());
inQueue.offer(Utils.toJSON(new ZkNodeProps(propMap)));
ocmh.overseer.offerStateUpdate(Utils.toJSON(new ZkNodeProps(propMap)));
// wait until we are able to see the new shard in cluster state
ocmh.waitForNewShard(collectionName, subSlice);
@ -281,7 +280,7 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
ocmh.addReplica(clusterState, new ZkNodeProps(propMap), results, null);
}
ShardHandler shardHandler = ocmh.shardHandlerFactory.getShardHandler();
ShardHandler shardHandler = ocmh.shardHandlerFactory.getShardHandler(ocmh.overseer.getCoreContainer().getUpdateShardHandler().getDefaultHttpClient());
ocmh.processResponses(results, shardHandler, true, "SPLITSHARD failed to create subshard leaders", asyncId, requestMap);
@ -412,7 +411,7 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
ZkStateReader.BASE_URL_PROP, zkStateReader.getBaseUrlForNodeName(subShardNodeName),
ZkStateReader.NODE_NAME_PROP, subShardNodeName,
CommonAdminParams.WAIT_FOR_FINAL_STATE, Boolean.toString(waitForFinalState));
Overseer.getStateUpdateQueue(zkStateReader.getZkClient()).offer(Utils.toJSON(props));
ocmh.overseer.offerStateUpdate(Utils.toJSON(props));
HashMap<String, Object> propMap = new HashMap<>();
propMap.put(Overseer.QUEUE_OPERATION, ADDREPLICA.toLower());
@ -446,7 +445,7 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
leaderZnodeStat = zkStateReader.getZkClient().exists(ZkStateReader.LIVE_NODES_ZKNODE + "/" + parentShardLeader.getNodeName(), null, true);
if (leaderZnodeStat == null || ephemeralOwner != leaderZnodeStat.getEphemeralOwner()) {
// put sub-shards in recovery_failed state
DistributedQueue inQueue = Overseer.getStateUpdateQueue(zkStateReader.getZkClient());
Map<String, Object> propMap = new HashMap<>();
propMap.put(Overseer.QUEUE_OPERATION, OverseerAction.UPDATESHARDSTATE.toLower());
for (String subSlice : subSlices) {
@ -454,7 +453,7 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
}
propMap.put(ZkStateReader.COLLECTION_PROP, collectionName);
ZkNodeProps m = new ZkNodeProps(propMap);
inQueue.offer(Utils.toJSON(m));
ocmh.overseer.offerStateUpdate(Utils.toJSON(m));
if (leaderZnodeStat == null) {
// the leader is not live anymore, fail the split!
@ -473,8 +472,7 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
if (repFactor == 1) {
// switch sub shard states to 'active'
log.debug("Replication factor is 1 so switching shard states");
DistributedQueue inQueue = Overseer.getStateUpdateQueue(zkStateReader.getZkClient());
log.info("Replication factor is 1 so switching shard states");
Map<String, Object> propMap = new HashMap<>();
propMap.put(Overseer.QUEUE_OPERATION, OverseerAction.UPDATESHARDSTATE.toLower());
propMap.put(slice.get(), Slice.State.INACTIVE.toString());
@ -483,10 +481,9 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
}
propMap.put(ZkStateReader.COLLECTION_PROP, collectionName);
ZkNodeProps m = new ZkNodeProps(propMap);
inQueue.offer(Utils.toJSON(m));
ocmh.overseer.offerStateUpdate(Utils.toJSON(m));
} else {
log.debug("Requesting shard state be set to 'recovery'");
DistributedQueue inQueue = Overseer.getStateUpdateQueue(zkStateReader.getZkClient());
log.info("Requesting shard state be set to 'recovery'");
Map<String, Object> propMap = new HashMap<>();
propMap.put(Overseer.QUEUE_OPERATION, OverseerAction.UPDATESHARDSTATE.toLower());
for (String subSlice : subSlices) {
@ -494,7 +491,7 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
}
propMap.put(ZkStateReader.COLLECTION_PROP, collectionName);
ZkNodeProps m = new ZkNodeProps(propMap);
inQueue.offer(Utils.toJSON(m));
ocmh.overseer.offerStateUpdate(Utils.toJSON(m));
}
t = timings.sub("createCoresForReplicas");
@ -590,7 +587,6 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
// set already created sub shards states to CONSTRUCTION - this prevents them
// from entering into RECOVERY or ACTIVE (SOLR-9455)
DistributedQueue inQueue = Overseer.getStateUpdateQueue(zkStateReader.getZkClient());
final Map<String, Object> propMap = new HashMap<>();
boolean sendUpdateState = false;
propMap.put(Overseer.QUEUE_OPERATION, OverseerAction.UPDATESHARDSTATE.toLower());
@ -618,7 +614,7 @@ public class SplitShardCmd implements OverseerCollectionMessageHandler.Cmd {
if (sendUpdateState) {
try {
ZkNodeProps m = new ZkNodeProps(propMap);
inQueue.offer(Utils.toJSON(m));
ocmh.overseer.offerStateUpdate(Utils.toJSON(m));
} catch (Exception e) {
// don't give up yet - just log the error, we may still be able to clean up
log.warn("Cleanup failed after failed split of " + collectionName + "/" + parentShard + ": (slice state changes)", e);

View File

@ -32,6 +32,7 @@ import java.util.concurrent.TimeUnit;
import org.apache.solr.client.solrj.cloud.SolrCloudManager;
import org.apache.solr.client.solrj.cloud.autoscaling.TriggerEventType;
import org.apache.solr.common.AlreadyClosedException;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.common.params.CollectionParams;
@ -62,7 +63,7 @@ public class NodeLostTrigger extends TriggerBase {
public void init() throws Exception {
super.init();
lastLiveNodes = new HashSet<>(cloudManager.getClusterStateProvider().getLiveNodes());
log.debug("NodeLostTrigger {} - Initial livenodes: {}", name, lastLiveNodes);
log.info("NodeLostTrigger {} - Initial livenodes: {}", name, lastLiveNodes);
// pick up lost nodes for which marker paths were created
try {
List<String> lost = stateManager.listData(ZkStateReader.SOLR_AUTOSCALING_NODE_LOST_PATH);
@ -147,7 +148,7 @@ public class NodeLostTrigger extends TriggerBase {
}
Set<String> newLiveNodes = new HashSet<>(cloudManager.getClusterStateProvider().getLiveNodes());
log.debug("Running NodeLostTrigger: {} with currently live nodes: {}", name, newLiveNodes.size());
log.info("Running NodeLostTrigger: {} with currently live nodes: {} and last live nodes: {}", name, newLiveNodes.size(), lastLiveNodes.size());
// have any nodes that we were tracking been added to the cluster?
// if so, remove them from the tracking map
@ -158,7 +159,7 @@ public class NodeLostTrigger extends TriggerBase {
Set<String> copyOfLastLiveNodes = new HashSet<>(lastLiveNodes);
copyOfLastLiveNodes.removeAll(newLiveNodes);
copyOfLastLiveNodes.forEach(n -> {
log.debug("Tracking lost node: {}", n);
log.info("Tracking lost node: {}", n);
nodeNameVsTimeRemoved.put(n, cloudManager.getTimeSource().getTimeNs());
});
@ -170,7 +171,8 @@ public class NodeLostTrigger extends TriggerBase {
String nodeName = entry.getKey();
Long timeRemoved = entry.getValue();
long now = cloudManager.getTimeSource().getTimeNs();
if (TimeUnit.SECONDS.convert(now - timeRemoved, TimeUnit.NANOSECONDS) >= getWaitForSecond()) {
long te = TimeUnit.SECONDS.convert(now - timeRemoved, TimeUnit.NANOSECONDS);
if (te >= getWaitForSecond()) {
nodeNames.add(nodeName);
times.add(timeRemoved);
}
@ -197,6 +199,8 @@ public class NodeLostTrigger extends TriggerBase {
}
}
lastLiveNodes = new HashSet<>(newLiveNodes);
} catch (AlreadyClosedException e) {
} catch (RuntimeException e) {
log.error("Unexpected exception in NodeLostTrigger", e);
}

View File

@ -29,12 +29,12 @@ import java.util.Set;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.solr.client.solrj.cloud.autoscaling.AutoScalingConfig;
import org.apache.solr.client.solrj.cloud.autoscaling.BadVersionException;
import org.apache.solr.client.solrj.cloud.DistribStateManager;
import org.apache.solr.client.solrj.cloud.SolrCloudManager;
import org.apache.solr.client.solrj.cloud.autoscaling.TriggerEventType;
import org.apache.solr.common.AlreadyClosedException;
import org.apache.solr.common.SolrCloseable;
import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.common.util.IOUtils;
@ -135,6 +135,8 @@ public class OverseerTriggerThread implements Runnable, SolrCloseable {
log.debug("Adding .auto_add_replicas and .scheduled_maintenance triggers");
cloudManager.getDistribStateManager().setData(SOLR_AUTOSCALING_CONF_PATH, Utils.toJSON(updatedConfig), updatedConfig.getZkVersion());
break;
} catch (AlreadyClosedException e) {
break;
} catch (BadVersionException bve) {
// somebody else has changed the configuration so we must retry
} catch (InterruptedException e) {
@ -178,7 +180,7 @@ public class OverseerTriggerThread implements Runnable, SolrCloseable {
// must check for close here before we await on the condition otherwise we can only be woken up on interruption
if (isClosed) {
log.warn("OverseerTriggerThread has been closed, exiting.");
log.info("OverseerTriggerThread has been closed, exiting.");
break;
}
@ -190,7 +192,7 @@ public class OverseerTriggerThread implements Runnable, SolrCloseable {
// are we closed?
if (isClosed) {
log.warn("OverseerTriggerThread woken up but we are closed, exiting.");
log.info("OverseerTriggerThread woken up but we are closed, exiting.");
break;
}
@ -211,7 +213,6 @@ public class OverseerTriggerThread implements Runnable, SolrCloseable {
} catch (InterruptedException e) {
// Restore the interrupted status
Thread.currentThread().interrupt();
log.warn("Interrupted", e);
break;
}
@ -240,6 +241,8 @@ public class OverseerTriggerThread implements Runnable, SolrCloseable {
}
try {
scheduledTriggers.add(entry.getValue());
} catch (AlreadyClosedException e) {
} catch (Exception e) {
log.warn("Exception initializing trigger " + entry.getKey() + ", configuration ignored", e);
}
@ -275,6 +278,8 @@ public class OverseerTriggerThread implements Runnable, SolrCloseable {
});
} catch (NoSuchElementException e) {
// ignore
} catch (AlreadyClosedException e) {
} catch (Exception e) {
log.warn("Error removing old nodeAdded markers", e);
}

View File

@ -151,8 +151,8 @@ public class ScheduledTrigger extends TriggerBase {
public void run() {
synchronized (this) {
if (isClosed) {
log.warn("ScheduledTrigger ran but was already closed");
throw new RuntimeException("Trigger has been closed");
log.debug("ScheduledTrigger ran but was already closed");
return;
}
}

View File

@ -42,7 +42,6 @@ import java.util.concurrent.locks.ReentrantLock;
import java.util.stream.Collectors;
import org.apache.commons.lang3.exception.ExceptionUtils;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.solr.client.solrj.cloud.autoscaling.AutoScalingConfig;
import org.apache.solr.client.solrj.cloud.DistribStateManager;
import org.apache.solr.client.solrj.cloud.SolrCloudManager;
@ -51,6 +50,7 @@ import org.apache.solr.client.solrj.cloud.autoscaling.VersionedData;
import org.apache.solr.client.solrj.request.CollectionAdminRequest.RequestStatusResponse;
import org.apache.solr.client.solrj.response.RequestStatusState;
import org.apache.solr.cloud.Stats;
import org.apache.solr.common.AlreadyClosedException;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.common.util.ExecutorUtil;
@ -205,7 +205,7 @@ public class ScheduledTriggers implements Closeable {
try {
st = new TriggerWrapper(newTrigger, cloudManager, queueStats);
} catch (Exception e) {
if (isClosed) {
if (isClosed || e instanceof AlreadyClosedException) {
throw new AlreadyClosedException("ScheduledTriggers has been closed and cannot be used anymore");
}
if (cloudManager.isClosed()) {
@ -567,6 +567,8 @@ public class ScheduledTriggers implements Closeable {
// execution of the same trigger instance
synchronized (TriggerWrapper.this) {
// replay accumulated events on first run, if any
try {
if (replay) {
TriggerEvent event;
// peek first without removing - we may crash before calling the listener
@ -587,8 +589,15 @@ public class ScheduledTriggers implements Closeable {
}
replay = false;
}
} catch (AlreadyClosedException e) {
} catch (Exception e) {
log.error("Unexpected exception from trigger: " + trigger.getName(), e);
}
try {
trigger.run();
} catch (AlreadyClosedException e) {
} catch (Exception e) {
// log but do not propagate exception because an exception thrown from a scheduled operation
// will suppress future executions

View File

@ -36,6 +36,7 @@ import org.apache.solr.client.solrj.cloud.SolrCloudManager;
import org.apache.solr.client.solrj.cloud.autoscaling.TriggerEventType;
import org.apache.solr.client.solrj.cloud.autoscaling.VersionedData;
import org.apache.solr.common.AlreadyClosedException;
import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.common.util.Utils;
import org.apache.solr.core.SolrResourceLoader;
@ -239,7 +240,9 @@ public abstract class TriggerBase implements AutoScaling.Trigger {
stateManager.createData(path, data, CreateMode.PERSISTENT);
}
lastState = state;
} catch (InterruptedException | BadVersionException | AlreadyExistsException | IOException | KeeperException e) {
} catch (AlreadyExistsException e) {
} catch (InterruptedException | BadVersionException | IOException | KeeperException e) {
log.warn("Exception updating trigger state '" + path + "'", e);
}
}
@ -253,6 +256,8 @@ public abstract class TriggerBase implements AutoScaling.Trigger {
VersionedData versionedData = stateManager.getData(path);
data = versionedData.getData();
}
} catch (AlreadyClosedException e) {
} catch (Exception e) {
log.warn("Exception getting trigger state '" + path + "'", e);
}

View File

@ -24,6 +24,7 @@ import java.util.Map;
import org.apache.solr.client.solrj.cloud.DistributedQueue;
import org.apache.solr.client.solrj.cloud.SolrCloudManager;
import org.apache.solr.cloud.Stats;
import org.apache.solr.common.AlreadyClosedException;
import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.common.util.Utils;
import org.apache.solr.common.util.TimeSource;
@ -78,7 +79,11 @@ public class TriggerEventQueue {
continue;
}
}
} catch (Exception e) {
}
catch (AlreadyClosedException e) {
}
catch (Exception e) {
log.warn("Exception peeking queue of trigger " + triggerName, e);
}
return null;

View File

@ -124,10 +124,10 @@ public class CloudConfig {
public static class CloudConfigBuilder {
private static final int DEFAULT_ZK_CLIENT_TIMEOUT = 15000;
private static final int DEFAULT_ZK_CLIENT_TIMEOUT = 45000;
private static final int DEFAULT_LEADER_VOTE_WAIT = 180000; // 3 minutes
private static final int DEFAULT_LEADER_CONFLICT_RESOLVE_WAIT = 180000;
private static final int DEFAULT_CREATE_COLLECTION_ACTIVE_WAIT = 30; // 30 seconds
private static final int DEFAULT_CREATE_COLLECTION_ACTIVE_WAIT = 45; // 45 seconds
private static final boolean DEFAULT_CREATE_COLLECTION_CHECK_LEADER_ACTIVE = false;
private static final int DEFAULT_AUTO_REPLICA_FAILOVER_WAIT_AFTER_EXPIRATION = 120000;

View File

@ -16,6 +16,22 @@
*/
package org.apache.solr.core;
import static java.util.Objects.requireNonNull;
import static org.apache.solr.common.params.CommonParams.AUTHC_PATH;
import static org.apache.solr.common.params.CommonParams.AUTHZ_PATH;
import static org.apache.solr.common.params.CommonParams.AUTOSCALING_HISTORY_PATH;
import static org.apache.solr.common.params.CommonParams.COLLECTIONS_HANDLER_PATH;
import static org.apache.solr.common.params.CommonParams.HEALTH_CHECK_HANDLER_PATH;
import static org.apache.solr.common.params.CommonParams.CONFIGSETS_HANDLER_PATH;
import static org.apache.solr.common.params.CommonParams.CORES_HANDLER_PATH;
import static org.apache.solr.common.params.CommonParams.INFO_HANDLER_PATH;
import static org.apache.solr.common.params.CommonParams.METRICS_HISTORY_PATH;
import static org.apache.solr.common.params.CommonParams.METRICS_PATH;
import static org.apache.solr.common.params.CommonParams.ZK_PATH;
import static org.apache.solr.common.params.CommonParams.ZK_STATUS_PATH;
import static org.apache.solr.core.CorePropertiesLocator.PROPERTIES_FILENAME;
import static org.apache.solr.security.AuthenticationPlugin.AUTHENTICATION_PLUGIN_PROP;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.nio.file.Path;
@ -35,10 +51,9 @@ import java.util.Properties;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.Future;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Maps;
import org.apache.http.auth.AuthSchemeProvider;
import org.apache.http.client.CredentialsProvider;
import org.apache.http.config.Lookup;
@ -58,6 +73,7 @@ import org.apache.solr.cloud.CloudDescriptor;
import org.apache.solr.cloud.Overseer;
import org.apache.solr.cloud.ZkController;
import org.apache.solr.cloud.autoscaling.AutoScalingHandler;
import org.apache.solr.common.AlreadyClosedException;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.cloud.DocCollection;
@ -106,24 +122,13 @@ import org.apache.solr.util.DefaultSolrThreadFactory;
import org.apache.solr.util.OrderedExecutor;
import org.apache.solr.util.stats.MetricUtils;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.KeeperException.ConnectionLossException;
import org.apache.zookeeper.KeeperException.SessionExpiredException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import static java.util.Objects.requireNonNull;
import static org.apache.solr.common.params.CommonParams.AUTHC_PATH;
import static org.apache.solr.common.params.CommonParams.AUTHZ_PATH;
import static org.apache.solr.common.params.CommonParams.AUTOSCALING_HISTORY_PATH;
import static org.apache.solr.common.params.CommonParams.COLLECTIONS_HANDLER_PATH;
import static org.apache.solr.common.params.CommonParams.CONFIGSETS_HANDLER_PATH;
import static org.apache.solr.common.params.CommonParams.CORES_HANDLER_PATH;
import static org.apache.solr.common.params.CommonParams.HEALTH_CHECK_HANDLER_PATH;
import static org.apache.solr.common.params.CommonParams.INFO_HANDLER_PATH;
import static org.apache.solr.common.params.CommonParams.METRICS_HISTORY_PATH;
import static org.apache.solr.common.params.CommonParams.METRICS_PATH;
import static org.apache.solr.common.params.CommonParams.ZK_PATH;
import static org.apache.solr.common.params.CommonParams.ZK_STATUS_PATH;
import static org.apache.solr.core.CorePropertiesLocator.PROPERTIES_FILENAME;
import static org.apache.solr.security.AuthenticationPlugin.AUTHENTICATION_PLUGIN_PROP;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Maps;
/**
*
@ -148,32 +153,32 @@ public class CoreContainer {
protected final Map<String, CoreLoadFailure> coreInitFailures = new ConcurrentHashMap<>();
protected CoreAdminHandler coreAdminHandler = null;
protected CollectionsHandler collectionsHandler = null;
protected HealthCheckHandler healthCheckHandler = null;
protected volatile CoreAdminHandler coreAdminHandler = null;
protected volatile CollectionsHandler collectionsHandler = null;
protected volatile HealthCheckHandler healthCheckHandler = null;
private InfoHandler infoHandler;
protected ConfigSetsHandler configSetsHandler = null;
private volatile InfoHandler infoHandler;
protected volatile ConfigSetsHandler configSetsHandler = null;
private PKIAuthenticationPlugin pkiAuthenticationPlugin;
private volatile PKIAuthenticationPlugin pkiAuthenticationPlugin;
protected Properties containerProperties;
protected volatile Properties containerProperties;
private ConfigSetService coreConfigService;
private volatile ConfigSetService coreConfigService;
protected ZkContainer zkSys = new ZkContainer();
protected ShardHandlerFactory shardHandlerFactory;
protected final ZkContainer zkSys = new ZkContainer();
protected volatile ShardHandlerFactory shardHandlerFactory;
private UpdateShardHandler updateShardHandler;
private volatile UpdateShardHandler updateShardHandler;
private ExecutorService coreContainerWorkExecutor = ExecutorUtil.newMDCAwareCachedThreadPool(
private volatile ExecutorService coreContainerWorkExecutor = ExecutorUtil.newMDCAwareCachedThreadPool(
new DefaultSolrThreadFactory("coreContainerWorkExecutor") );
private final OrderedExecutor replayUpdatesExecutor;
protected LogWatcher logging = null;
protected volatile LogWatcher logging = null;
private CloserThread backgroundCloser = null;
private volatile CloserThread backgroundCloser = null;
protected final NodeConfig cfg;
protected final SolrResourceLoader loader;
@ -181,33 +186,33 @@ public class CoreContainer {
protected final CoresLocator coresLocator;
private String hostName;
private volatile String hostName;
private final BlobRepository blobRepository = new BlobRepository(this);
private PluginBag<SolrRequestHandler> containerHandlers = new PluginBag<>(SolrRequestHandler.class, null);
private volatile PluginBag<SolrRequestHandler> containerHandlers = new PluginBag<>(SolrRequestHandler.class, null);
private boolean asyncSolrCoreLoad;
private volatile boolean asyncSolrCoreLoad;
protected SecurityConfHandler securityConfHandler;
protected volatile SecurityConfHandler securityConfHandler;
private SecurityPluginHolder<AuthorizationPlugin> authorizationPlugin;
private volatile SecurityPluginHolder<AuthorizationPlugin> authorizationPlugin;
private SecurityPluginHolder<AuthenticationPlugin> authenticationPlugin;
private volatile SecurityPluginHolder<AuthenticationPlugin> authenticationPlugin;
private BackupRepositoryFactory backupRepoFactory;
private volatile BackupRepositoryFactory backupRepoFactory;
protected SolrMetricManager metricManager;
protected volatile SolrMetricManager metricManager;
protected String metricTag = Integer.toHexString(hashCode());
protected volatile String metricTag = Integer.toHexString(hashCode());
protected MetricsHandler metricsHandler;
protected MetricsHistoryHandler metricsHistoryHandler;
protected volatile MetricsHistoryHandler metricsHistoryHandler;
protected MetricsCollectorHandler metricsCollectorHandler;
protected volatile MetricsCollectorHandler metricsCollectorHandler;
protected AutoscalingHistoryHandler autoscalingHistoryHandler;
protected volatile AutoscalingHistoryHandler autoscalingHistoryHandler;
// Bits for the state variable.
@ -216,7 +221,7 @@ public class CoreContainer {
public final static long INITIAL_CORE_LOAD_COMPLETE = 0x4L;
private volatile long status = 0L;
protected AutoScalingHandler autoScalingHandler;
protected volatile AutoScalingHandler autoScalingHandler;
private enum CoreInitFailedAction { fromleader, none }
@ -759,6 +764,7 @@ public class CoreContainer {
name = getZkController().getNodeName();
cloudManager = getZkController().getSolrCloudManager();
client = new CloudSolrClient.Builder(Collections.singletonList(getZkController().getZkServerAddress()), Optional.empty())
.withSocketTimeout(30000).withConnectionTimeout(15000)
.withHttpClient(updateShardHandler.getDefaultHttpClient()).build();
} else {
name = getNodeConfig().getNodeName();
@ -818,53 +824,40 @@ public class CoreContainer {
return isShutDown;
}
/**
* Stops all cores.
*/
public void shutdown() {
log.info("Shutting down CoreContainer instance="
+ System.identityHashCode(this));
ForkJoinPool customThreadPool = new ForkJoinPool(6);
isShutDown = true;
ExecutorUtil.shutdownAndAwaitTermination(coreContainerWorkExecutor);
replayUpdatesExecutor.shutdownAndAwaitTermination();
if (metricsHistoryHandler != null) {
IOUtils.closeQuietly(metricsHistoryHandler.getSolrClient());
metricsHistoryHandler.close();
}
if (metricManager != null) {
metricManager.closeReporters(SolrMetricManager.getRegistryName(SolrInfoBean.Group.node));
metricManager.closeReporters(SolrMetricManager.getRegistryName(SolrInfoBean.Group.jvm));
metricManager.closeReporters(SolrMetricManager.getRegistryName(SolrInfoBean.Group.jetty));
metricManager.unregisterGauges(SolrMetricManager.getRegistryName(SolrInfoBean.Group.node), metricTag);
metricManager.unregisterGauges(SolrMetricManager.getRegistryName(SolrInfoBean.Group.jvm), metricTag);
metricManager.unregisterGauges(SolrMetricManager.getRegistryName(SolrInfoBean.Group.jetty), metricTag);
}
try {
if (isZooKeeperAware()) {
cancelCoreRecoveries();
if (isZooKeeperAware()) {
cancelCoreRecoveries();
zkSys.zkController.publishNodeAsDown(zkSys.zkController.getNodeName());
try {
zkSys.zkController.removeEphemeralLiveNode();
} catch (AlreadyClosedException | SessionExpiredException | ConnectionLossException e) {
} catch (Exception e) {
log.warn("Error removing live node. Continuing to close CoreContainer", e);
}
if (metricManager != null) {
metricManager.closeReporters(SolrMetricManager.getRegistryName(SolrInfoBean.Group.cluster));
}
}
try {
if (coreAdminHandler != null) coreAdminHandler.shutdown();
if (zkSys.zkController.getZkClient().getConnectionManager().isConnected()) {
log.info("Publish this node as DOWN...");
zkSys.zkController.publishNodeAsDown(zkSys.zkController.getNodeName());
}
} catch (Exception e) {
log.warn("Error shutting down CoreAdminHandler. Continuing to close CoreContainer.", e);
log.warn("Error publishing nodes as down. Continuing to close CoreContainer", e);
}
}
try {
ExecutorUtil.shutdownAndAwaitTermination(coreContainerWorkExecutor);
// First wake up the closer thread, it'll terminate almost immediately since it checks isShutDown.
synchronized (solrCores.getModifyLock()) {
solrCores.getModifyLock().notifyAll(); // wake up anyone waiting
@ -897,21 +890,71 @@ public class CoreContainer {
solrCores.getModifyLock().notifyAll(); // wake up the thread
}
customThreadPool.submit(() -> Collections.singleton(replayUpdatesExecutor).parallelStream().forEach(c -> {
c.shutdownAndAwaitTermination();
}));
if (metricsHistoryHandler != null) {
customThreadPool.submit(() -> Collections.singleton(metricsHistoryHandler).parallelStream().forEach(c -> {
IOUtils.closeQuietly(c);
}));
customThreadPool.submit(() -> Collections.singleton(metricsHistoryHandler.getSolrClient()).parallelStream().forEach(c -> {
IOUtils.closeQuietly(c);
}));
}
if (metricManager != null) {
metricManager.closeReporters(SolrMetricManager.getRegistryName(SolrInfoBean.Group.node));
metricManager.closeReporters(SolrMetricManager.getRegistryName(SolrInfoBean.Group.jvm));
metricManager.closeReporters(SolrMetricManager.getRegistryName(SolrInfoBean.Group.jetty));
metricManager.unregisterGauges(SolrMetricManager.getRegistryName(SolrInfoBean.Group.node), metricTag);
metricManager.unregisterGauges(SolrMetricManager.getRegistryName(SolrInfoBean.Group.jvm), metricTag);
metricManager.unregisterGauges(SolrMetricManager.getRegistryName(SolrInfoBean.Group.jetty), metricTag);
}
if (isZooKeeperAware()) {
cancelCoreRecoveries();
if (metricManager != null) {
metricManager.closeReporters(SolrMetricManager.getRegistryName(SolrInfoBean.Group.cluster));
}
}
try {
if (coreAdminHandler != null) {
customThreadPool.submit(() -> Collections.singleton(coreAdminHandler).parallelStream().forEach(c -> {
c.shutdown();
}));
}
} catch (Exception e) {
log.warn("Error shutting down CoreAdminHandler. Continuing to close CoreContainer.", e);
}
} finally {
try {
if (shardHandlerFactory != null) {
shardHandlerFactory.close();
customThreadPool.submit(() -> Collections.singleton(shardHandlerFactory).parallelStream().forEach(c -> {
c.close();
}));
}
} finally {
try {
if (updateShardHandler != null) {
customThreadPool.submit(() -> Collections.singleton(shardHandlerFactory).parallelStream().forEach(c -> {
updateShardHandler.close();
}));
}
} finally {
try {
// we want to close zk stuff last
zkSys.close();
} finally {
ExecutorUtil.shutdownAndAwaitTermination(customThreadPool);
}
}
}
}
// It should be safe to close the authorization plugin at this point.
@ -1384,6 +1427,9 @@ public class CoreContainer {
* @param name the name of the SolrCore to reload
*/
public void reload(String name) {
if (isShutDown) {
throw new AlreadyClosedException();
}
SolrCore core = solrCores.getCoreFromAnyList(name, false);
if (core != null) {

View File

@ -162,6 +162,7 @@ import org.apache.solr.util.NumberUtils;
import org.apache.solr.util.PropertiesInputStream;
import org.apache.solr.util.PropertiesOutputStream;
import org.apache.solr.util.RefCounted;
import org.apache.solr.util.TestInjection;
import org.apache.solr.util.plugin.NamedListInitializedPlugin;
import org.apache.solr.util.plugin.PluginInfoInitialized;
import org.apache.solr.util.plugin.SolrCoreAware;
@ -764,10 +765,14 @@ public final class SolrCore implements SolrInfoBean, SolrMetricProducer, Closeab
// Create the index if it doesn't exist.
if (!indexExists) {
log.debug("{}Solr index directory '{}' doesn't exist. Creating new index...", logid, indexDir);
SolrIndexWriter writer = SolrIndexWriter.create(this, "SolrCore.initIndex", indexDir, getDirectoryFactory(), true,
SolrIndexWriter writer = null;
try {
writer = SolrIndexWriter.create(this, "SolrCore.initIndex", indexDir, getDirectoryFactory(), true,
getLatestSchema(), solrConfig.indexConfig, solrDelPolicy, codec);
writer.close();
} finally {
IOUtils.closeQuietly(writer);
}
}
cleanupOldIndexDirectories(reload);
@ -992,6 +997,33 @@ public final class SolrCore implements SolrInfoBean, SolrMetricProducer, Closeab
resourceLoader.inform(resourceLoader);
resourceLoader.inform(this); // last call before the latch is released.
this.updateHandler.informEventListeners(this);
infoRegistry.put("core", this);
// register any SolrInfoMBeans SolrResourceLoader initialized
//
// this must happen after the latch is released, because a JMX server impl may
// choose to block on registering until properties can be fetched from an MBean,
// and a SolrCoreAware MBean may have properties that depend on getting a Searcher
// from the core.
resourceLoader.inform(infoRegistry);
// Allow the directory factory to report metrics
if (directoryFactory instanceof SolrMetricProducer) {
((SolrMetricProducer) directoryFactory).initializeMetrics(metricManager, coreMetricManager.getRegistryName(),
metricTag, "directoryFactory");
}
// seed version buckets with max from index during core initialization ... requires a searcher!
seedVersionBuckets();
bufferUpdatesIfConstructing(coreDescriptor);
this.ruleExpiryLock = new ReentrantLock();
this.snapshotDelLock = new ReentrantLock();
registerConfListener();
} catch (Throwable e) {
// release the latch, otherwise we block trying to do the close. This
// should be fine, since counting down on a latch of 0 is still fine
@ -1017,31 +1049,6 @@ public final class SolrCore implements SolrInfoBean, SolrMetricProducer, Closeab
latch.countDown();
}
infoRegistry.put("core", this);
// register any SolrInfoMBeans SolrResourceLoader initialized
//
// this must happen after the latch is released, because a JMX server impl may
// choose to block on registering until properties can be fetched from an MBean,
// and a SolrCoreAware MBean may have properties that depend on getting a Searcher
// from the core.
resourceLoader.inform(infoRegistry);
// Allow the directory factory to report metrics
if (directoryFactory instanceof SolrMetricProducer) {
((SolrMetricProducer)directoryFactory).initializeMetrics(metricManager, coreMetricManager.getRegistryName(), metricTag, "directoryFactory");
}
// seed version buckets with max from index during core initialization ... requires a searcher!
seedVersionBuckets();
bufferUpdatesIfConstructing(coreDescriptor);
this.ruleExpiryLock = new ReentrantLock();
this.snapshotDelLock = new ReentrantLock();
registerConfListener();
assert ObjectReleaseTracker.track(this);
}
@ -1999,7 +2006,7 @@ public final class SolrCore implements SolrInfoBean, SolrMetricProducer, Closeab
*/
public RefCounted<SolrIndexSearcher> openNewSearcher(boolean updateHandlerReopens, boolean realtime) {
if (isClosed()) { // catch some errors quicker
throw new SolrException(ErrorCode.SERVER_ERROR, "openNewSearcher called on closed core");
throw new SolrCoreState.CoreIsClosedException();
}
SolrIndexSearcher tmp;
@ -2372,7 +2379,7 @@ public final class SolrCore implements SolrInfoBean, SolrMetricProducer, Closeab
return returnSearcher ? newSearchHolder : null;
} catch (Exception e) {
if (e instanceof SolrException) throw (SolrException)e;
if (e instanceof RuntimeException) throw (RuntimeException)e;
throw new SolrException(ErrorCode.SERVER_ERROR, e);
} finally {
@ -2491,6 +2498,7 @@ public final class SolrCore implements SolrInfoBean, SolrMetricProducer, Closeab
// even in the face of errors.
onDeckSearchers--;
searcherLock.notifyAll();
assert TestInjection.injectSearcherHooks(getCoreDescriptor() != null && getCoreDescriptor().getCloudDescriptor() != null ? getCoreDescriptor().getCloudDescriptor().getCollectionName() : null);
}
}
}
@ -3008,7 +3016,7 @@ public final class SolrCore implements SolrInfoBean, SolrMetricProducer, Closeab
int solrConfigversion, overlayVersion, managedSchemaVersion = 0;
SolrConfig cfg = null;
try (SolrCore solrCore = cc.solrCores.getCoreFromAnyList(coreName, true)) {
if (solrCore == null || solrCore.isClosed()) return;
if (solrCore == null || solrCore.isClosed() || solrCore.getCoreContainer().isShutDown()) return;
cfg = solrCore.getSolrConfig();
solrConfigversion = solrCore.getSolrConfig().getOverlay().getZnodeVersion();
overlayVersion = solrCore.getSolrConfig().getZnodeVersion();
@ -3042,7 +3050,7 @@ public final class SolrCore implements SolrInfoBean, SolrMetricProducer, Closeab
}
//some files in conf directory may have other than managedschema, overlay, params
try (SolrCore solrCore = cc.solrCores.getCoreFromAnyList(coreName, true)) {
if (solrCore == null || solrCore.isClosed()) return;
if (solrCore == null || solrCore.isClosed() || cc.isShutDown()) return;
for (Runnable listener : solrCore.confListeners) {
try {
listener.run();

View File

@ -31,7 +31,7 @@ import org.slf4j.LoggerFactory;
public abstract class TransientSolrCoreCacheFactory {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
private CoreContainer coreContainer = null;
private volatile CoreContainer coreContainer = null;
public abstract TransientSolrCoreCache getTransientSolrCoreCache();
/**

View File

@ -18,7 +18,7 @@ package org.apache.solr.core;
public class TransientSolrCoreCacheFactoryDefault extends TransientSolrCoreCacheFactory {
TransientSolrCoreCache transientSolrCoreCache = null;
volatile TransientSolrCoreCache transientSolrCoreCache = null;
@Override
public TransientSolrCoreCache getTransientSolrCoreCache() {

View File

@ -31,6 +31,7 @@ import java.util.function.Predicate;
import org.apache.solr.cloud.CurrentCoreDescriptorProvider;
import org.apache.solr.cloud.SolrZkServer;
import org.apache.solr.cloud.ZkController;
import org.apache.solr.common.AlreadyClosedException;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.cloud.Replica;
import org.apache.solr.common.cloud.ZkConfigManager;
@ -174,24 +175,31 @@ public class ZkContainer {
return zkRun.substring(0, zkRun.lastIndexOf('/'));
}
public static Predicate<CoreDescriptor> testing_beforeRegisterInZk;
public static volatile Predicate<CoreDescriptor> testing_beforeRegisterInZk;
public void registerInZk(final SolrCore core, boolean background, boolean skipRecovery) {
CoreDescriptor cd = core.getCoreDescriptor(); // save this here - the core may not have it later
Runnable r = () -> {
MDCLoggingContext.setCore(core);
try {
try {
if (testing_beforeRegisterInZk != null) {
testing_beforeRegisterInZk.test(core.getCoreDescriptor());
testing_beforeRegisterInZk.test(cd);
}
if (!core.getCoreContainer().isShutDown()) {
zkController.register(core.getName(), cd, skipRecovery);
}
zkController.register(core.getName(), core.getCoreDescriptor(), skipRecovery);
} catch (InterruptedException e) {
// Restore the interrupted status
Thread.currentThread().interrupt();
SolrException.log(log, "", e);
} catch (KeeperException e) {
SolrException.log(log, "", e);
} catch (AlreadyClosedException e) {
} catch (Exception e) {
try {
zkController.publish(core.getCoreDescriptor(), Replica.State.DOWN);
zkController.publish(cd, Replica.State.DOWN);
} catch (InterruptedException e1) {
Thread.currentThread().interrupt();
log.error("", e1);

View File

@ -97,6 +97,7 @@ class CdcrReplicatorManager implements CdcrStateManager.CdcrStateObserver {
String targetCollection = params.get(CdcrParams.TARGET_COLLECTION_PARAM);
CloudSolrClient client = new Builder(Collections.singletonList(zkHost), Optional.empty())
.withSocketTimeout(30000).withConnectionTimeout(15000)
.sendUpdatesOnlyToShardLeaders()
.build();
client.setDefaultCollection(targetCollection);

View File

@ -222,7 +222,7 @@ public class IndexFetcher {
httpClientParams.set(HttpClientUtil.PROP_BASIC_AUTH_PASS, httpBasicAuthPassword);
httpClientParams.set(HttpClientUtil.PROP_ALLOW_COMPRESSION, useCompression);
return HttpClientUtil.createClient(httpClientParams, core.getCoreContainer().getUpdateShardHandler().getDefaultConnectionManager(), true);
return HttpClientUtil.createClient(httpClientParams, core.getCoreContainer().getUpdateShardHandler().getRecoveryOnlyConnectionManager(), true);
}
public IndexFetcher(final NamedList initArgs, final ReplicationHandler handler, final SolrCore sc) {

View File

@ -197,7 +197,7 @@ public class ReplicationHandler extends RequestHandlerBase implements SolrCoreAw
private boolean replicateOnStart = false;
private ScheduledExecutorService executorService;
private volatile ScheduledExecutorService executorService;
private volatile long executorStartTime;
@ -1369,6 +1369,8 @@ public class ReplicationHandler extends RequestHandlerBase implements SolrCoreAw
if (restoreFuture != null) {
restoreFuture.cancel(false);
}
ExecutorUtil.shutdownAndAwaitTermination(executorService);
}
/**

View File

@ -125,7 +125,7 @@ public class AutoscalingHistoryHandler extends RequestHandlerBase implements Per
}
}
}
try (CloudSolrClient cloudSolrClient = new CloudSolrClient.Builder(Collections.singletonList(coreContainer.getZkController().getZkServerAddress()), Optional.empty())
try (CloudSolrClient cloudSolrClient = new CloudSolrClient.Builder(Collections.singletonList(coreContainer.getZkController().getZkServerAddress()), Optional.empty()).withSocketTimeout(30000).withConnectionTimeout(15000)
.withHttpClient(coreContainer.getUpdateShardHandler().getDefaultHttpClient())
.build()) {
QueryResponse qr = cloudSolrClient.query(collection, params);

View File

@ -31,6 +31,7 @@ import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.stream.Collectors;
import com.google.common.collect.ImmutableList;
@ -45,10 +46,10 @@ import org.apache.solr.client.solrj.request.CollectionAdminRequest;
import org.apache.solr.client.solrj.request.CoreAdminRequest.RequestSyncShard;
import org.apache.solr.client.solrj.response.RequestStatusState;
import org.apache.solr.client.solrj.util.SolrIdentifierValidator;
import org.apache.solr.cloud.Overseer;
import org.apache.solr.cloud.OverseerSolrResponse;
import org.apache.solr.cloud.OverseerTaskQueue;
import org.apache.solr.cloud.OverseerTaskQueue.QueueEvent;
import org.apache.solr.cloud.ZkController.NotInClusterStateException;
import org.apache.solr.cloud.ZkController;
import org.apache.solr.cloud.ZkShardTerms;
import org.apache.solr.cloud.overseer.SliceMutator;
@ -285,7 +286,7 @@ public class CollectionsHandler extends RequestHandlerBase implements Permission
} else {
// submits and doesn't wait for anything (no response)
Overseer.getStateUpdateQueue(coreContainer.getZkController().getZkClient()).offer(Utils.toJSON(props));
coreContainer.getZkController().getOverseer().offerStateUpdate(Utils.toJSON(props));
}
}
@ -1249,28 +1250,30 @@ public class CollectionsHandler extends RequestHandlerBase implements Permission
return;
}
int replicaFailCount;
if (createCollResponse.getResponse().get("failure") != null) {
// TODO: we should not wait for Replicas we know failed
replicaFailCount = ((NamedList) createCollResponse.getResponse().get("failure")).size();
} else {
replicaFailCount = 0;
}
String replicaNotAlive = null;
String replicaState = null;
String nodeNotLive = null;
CloudConfig ccfg = cc.getConfig().getCloudConfig();
Integer numRetries = ccfg.getCreateCollectionWaitTimeTillActive(); // this config is actually # seconds, not # tries
Integer seconds = ccfg.getCreateCollectionWaitTimeTillActive();
Boolean checkLeaderOnly = ccfg.isCreateCollectionCheckLeaderActive();
log.info("Wait for new collection to be active for at most " + numRetries + " seconds. Check all shard "
log.info("Wait for new collection to be active for at most " + seconds + " seconds. Check all shard "
+ (checkLeaderOnly ? "leaders" : "replicas"));
ZkStateReader zkStateReader = cc.getZkController().getZkStateReader();
for (int i = 0; i < numRetries; i++) {
ClusterState clusterState = zkStateReader.getClusterState();
final DocCollection docCollection = clusterState.getCollectionOrNull(collectionName);
try {
cc.getZkController().getZkStateReader().waitForState(collectionName, seconds, TimeUnit.SECONDS, (n, c) -> {
if (docCollection != null && docCollection.getSlices() != null) {
Collection<Slice> shards = docCollection.getSlices();
replicaNotAlive = null;
if (c == null) {
// the collection was not created, don't wait
return true;
}
if (c.getSlices() != null) {
Collection<Slice> shards = c.getSlices();
int replicaNotAliveCnt = 0;
for (Slice shard : shards) {
Collection<Replica> replicas;
if (!checkLeaderOnly) replicas = shard.getReplicas();
@ -1282,28 +1285,24 @@ public class CollectionsHandler extends RequestHandlerBase implements Permission
String state = replica.getStr(ZkStateReader.STATE_PROP);
log.debug("Checking replica status, collection={} replica={} state={}", collectionName,
replica.getCoreUrl(), state);
if (!clusterState.liveNodesContain(replica.getNodeName())
if (!n.contains(replica.getNodeName())
|| !state.equals(Replica.State.ACTIVE.toString())) {
replicaNotAlive = replica.getCoreUrl();
nodeNotLive = replica.getNodeName();
replicaState = state;
break;
replicaNotAliveCnt++;
return false;
}
}
if (replicaNotAlive != null) break;
}
if (replicaNotAlive == null) return;
if ((replicaNotAliveCnt == 0) || (replicaNotAliveCnt <= replicaFailCount)) return true;
}
Thread.sleep(1000); // thus numRetries is roughly number of seconds
}
if (nodeNotLive != null && replicaState != null) {
log.error("Timed out waiting for new collection's replicas to become ACTIVE "
+ (replicaState.equals(Replica.State.ACTIVE.toString()) ? "node " + nodeNotLive + " is not live"
: "replica " + replicaNotAlive + " is in state of " + replicaState.toString()) + " with timeout=" + numRetries);
} else {
log.error("Timed out waiting for new collection's replicas to become ACTIVE with timeout=" + numRetries);
return false;
});
} catch (TimeoutException | InterruptedException e) {
String error = "Timeout waiting for active collection " + collectionName + " with timeout=" + seconds;
throw new NotInClusterStateException(ErrorCode.SERVER_ERROR, error);
}
}
public static void verifyRuleParams(CoreContainer cc, Map<String, Object> m) {

View File

@ -371,7 +371,7 @@ public class CoreAdminHandler extends RequestHandlerBase implements PermissionNa
* Method to ensure shutting down of the ThreadPool Executor.
*/
public void shutdown() {
if (parallelExecutor != null && !parallelExecutor.isShutdown())
if (parallelExecutor != null)
ExecutorUtil.shutdownAndAwaitTermination(parallelExecutor);
}

View File

@ -642,7 +642,17 @@ public class MetricsHistoryHandler extends RequestHandlerBase implements Permiss
public void close() {
log.debug("Closing " + hashCode());
if (collectService != null) {
boolean shutdown = false;
while (!shutdown) {
try {
// Wait a while for existing tasks to terminate
collectService.shutdownNow();
shutdown = collectService.awaitTermination(5, TimeUnit.SECONDS);
} catch (InterruptedException ie) {
// Preserve interrupt status
Thread.currentThread().interrupt();
}
}
}
if (factory != null) {
factory.close();

View File

@ -18,13 +18,15 @@
package org.apache.solr.handler.admin;
import java.lang.invoke.MethodHandles;
import java.util.Objects;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicReference;
import org.apache.solr.cloud.CloudDescriptor;
import org.apache.solr.cloud.ZkController.NotInClusterStateException;
import org.apache.solr.cloud.ZkShardTerms;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.cloud.ClusterState;
import org.apache.solr.common.cloud.DocCollection;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.cloud.Replica;
import org.apache.solr.common.cloud.Slice;
import org.apache.solr.common.cloud.ZkStateReader;
@ -47,10 +49,7 @@ class PrepRecoveryOp implements CoreAdminHandler.CoreAdminOp {
final SolrParams params = it.req.getParams();
String cname = params.get(CoreAdminParams.CORE);
if (cname == null) {
cname = "";
}
String cname = params.get(CoreAdminParams.CORE, "");
String nodeName = params.get("nodeName");
String coreNodeName = params.get("coreNodeName");
@ -59,55 +58,46 @@ class PrepRecoveryOp implements CoreAdminHandler.CoreAdminOp {
Boolean onlyIfLeader = params.getBool("onlyIfLeader");
Boolean onlyIfLeaderActive = params.getBool("onlyIfLeaderActive");
CoreContainer coreContainer = it.handler.coreContainer;
// wait long enough for the leader conflict to work itself out plus a little extra
int conflictWaitMs = coreContainer.getZkController().getLeaderConflictResolveWait();
int maxTries = (int) Math.round(conflictWaitMs / 1000) + 3;
log.info("Going to wait for coreNodeName: {}, state: {}, checkLive: {}, onlyIfLeader: {}, onlyIfLeaderActive: {}, maxTime: {} s",
coreNodeName, waitForState, checkLive, onlyIfLeader, onlyIfLeaderActive, maxTries);
log.info(
"Going to wait for coreNodeName: {}, state: {}, checkLive: {}, onlyIfLeader: {}, onlyIfLeaderActive: {}",
coreNodeName, waitForState, checkLive, onlyIfLeader, onlyIfLeaderActive);
Replica.State state = null;
boolean live = false;
int retry = 0;
while (true) {
String collectionName;
CloudDescriptor cloudDescriptor;
try (SolrCore core = coreContainer.getCore(cname)) {
if (core == null && retry == Math.min(30, maxTries)) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "core not found:"
+ cname);
if (core == null) throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "core not found:" + cname);
collectionName = core.getCoreDescriptor().getCloudDescriptor().getCollectionName();
cloudDescriptor = core.getCoreDescriptor()
.getCloudDescriptor();
}
if (core != null) {
AtomicReference<String> errorMessage = new AtomicReference<>();
try {
coreContainer.getZkController().getZkStateReader().waitForState(collectionName, conflictWaitMs, TimeUnit.MILLISECONDS, (n, c) -> {
if (c == null)
return false;
try (SolrCore core = coreContainer.getCore(cname)) {
if (core == null) throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "core not found:" + cname);
if (onlyIfLeader != null && onlyIfLeader) {
if (!core.getCoreDescriptor().getCloudDescriptor().isLeader()) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "We are not the leader");
}
}
}
// wait until we are sure the recovering node is ready
// to accept updates
CloudDescriptor cloudDescriptor = core.getCoreDescriptor()
.getCloudDescriptor();
String collectionName = cloudDescriptor.getCollectionName();
if (retry % 15 == 0) {
if (retry > 0 && log.isInfoEnabled())
log.info("After " + retry + " seconds, core " + cname + " (" +
cloudDescriptor.getShardId() + " of " +
cloudDescriptor.getCollectionName() + ") still does not have state: " +
waitForState + "; forcing ClusterState update from ZooKeeper");
// force a cluster state update
coreContainer.getZkController().getZkStateReader().forceUpdateCollection(collectionName);
}
ClusterState clusterState = coreContainer.getZkController().getClusterState();
DocCollection collection = clusterState.getCollection(collectionName);
Slice slice = collection.getSlice(cloudDescriptor.getShardId());
Replica.State state = null;
boolean live = false;
Slice slice = c.getSlice(cloudDescriptor.getShardId());
if (slice != null) {
final Replica replica = slice.getReplicasMap().get(coreNodeName);
if (replica != null) {
state = replica.getState();
live = clusterState.liveNodesContain(nodeName);
live = n.contains(nodeName);
final Replica.State localState = cloudDescriptor.getLastPublished();
@ -116,76 +106,62 @@ class PrepRecoveryOp implements CoreAdminHandler.CoreAdminOp {
// this is a safeguard
boolean leaderDoesNotNeedRecovery = (onlyIfLeader != null &&
onlyIfLeader &&
core.getName().equals(replica.getStr("core")) &&
cname.equals(replica.getStr("core")) &&
waitForState == Replica.State.RECOVERING &&
localState == Replica.State.ACTIVE &&
state == Replica.State.ACTIVE);
if (leaderDoesNotNeedRecovery) {
log.warn("Leader " + core.getName() + " ignoring request to be in the recovering state because it is live and active.");
log.warn(
"Leader " + cname + " ignoring request to be in the recovering state because it is live and active.");
}
ZkShardTerms shardTerms = coreContainer.getZkController().getShardTerms(collectionName, slice.getName());
// if the replica is waiting for leader to see recovery state, the leader should refresh its terms
if (waitForState == Replica.State.RECOVERING && shardTerms.registered(coreNodeName) && shardTerms.skipSendingUpdatesTo(coreNodeName)) {
if (waitForState == Replica.State.RECOVERING && shardTerms.registered(coreNodeName)
&& shardTerms.skipSendingUpdatesTo(coreNodeName)) {
// The replica changed it term, then published itself as RECOVERING.
// This core already see replica as RECOVERING
// so it is guarantees that a live-fetch will be enough for this core to see max term published
shardTerms.refreshTerms();
}
boolean onlyIfActiveCheckResult = onlyIfLeaderActive != null && onlyIfLeaderActive && localState != Replica.State.ACTIVE;
log.info("In WaitForState(" + waitForState + "): collection=" + collectionName + ", shard=" + slice.getName() +
", thisCore=" + core.getName() + ", leaderDoesNotNeedRecovery=" + leaderDoesNotNeedRecovery +
", isLeader? " + core.getCoreDescriptor().getCloudDescriptor().isLeader() +
", live=" + live + ", checkLive=" + checkLive + ", currentState=" + state.toString() + ", localState=" + localState + ", nodeName=" + nodeName +
", coreNodeName=" + coreNodeName + ", onlyIfActiveCheckResult=" + onlyIfActiveCheckResult + ", nodeProps: " + replica);
boolean onlyIfActiveCheckResult = onlyIfLeaderActive != null && onlyIfLeaderActive
&& localState != Replica.State.ACTIVE;
log.info(
"In WaitForState(" + waitForState + "): collection=" + collectionName + ", shard=" + slice.getName() +
", thisCore=" + cname + ", leaderDoesNotNeedRecovery=" + leaderDoesNotNeedRecovery +
", isLeader? " + cloudDescriptor.isLeader() +
", live=" + live + ", checkLive=" + checkLive + ", currentState=" + state.toString()
+ ", localState=" + localState + ", nodeName=" + nodeName +
", coreNodeName=" + coreNodeName + ", onlyIfActiveCheckResult=" + onlyIfActiveCheckResult
+ ", nodeProps: " + replica);
if (!onlyIfActiveCheckResult && replica != null && (state == waitForState || leaderDoesNotNeedRecovery)) {
if (checkLive == null) {
break;
return true;
} else if (checkLive && live) {
break;
return true;
} else if (!checkLive && !live) {
break;
return true;
}
}
}
}
}
if (retry++ == maxTries) {
String collection = null;
String leaderInfo = null;
String shardId = null;
try {
CloudDescriptor cloudDescriptor =
core.getCoreDescriptor().getCloudDescriptor();
collection = cloudDescriptor.getCollectionName();
shardId = cloudDescriptor.getShardId();
leaderInfo = coreContainer.getZkController().
getZkStateReader().getLeaderUrl(collection, shardId, 5000);
} catch (Exception exc) {
leaderInfo = "Not available due to: " + exc;
}
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
"I was asked to wait on state " + waitForState + " for "
+ shardId + " in " + collection + " on " + nodeName
+ " but I still do not see the requested state. I see state: "
+ Objects.toString(state) + " live:" + live + " leader from ZK: " + leaderInfo);
}
if (coreContainer.isShutDown()) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
"Solr is shutting down");
}
}
Thread.sleep(1000);
return false;
});
} catch (TimeoutException | InterruptedException e) {
String error = errorMessage.get();
if (error == null)
error = "Timeout waiting for collection state.";
throw new NotInClusterStateException(ErrorCode.SERVER_ERROR, error);
}
log.info("Waited coreNodeName: " + coreNodeName + ", state: " + waitForState
+ ", checkLive: " + checkLive + ", onlyIfLeader: " + onlyIfLeader + " for: " + retry + " seconds.");
}
}

View File

@ -16,13 +16,16 @@
*/
package org.apache.solr.handler.component;
import java.lang.invoke.MethodHandles;
import java.util.concurrent.Callable;
import java.util.concurrent.Future;
import java.util.concurrent.ExecutorService;
import java.util.List;
import java.util.ArrayList;
import static org.apache.solr.common.params.CommonParams.DISTRIB;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.solr.client.solrj.SolrRequest;
import org.apache.solr.client.solrj.impl.HttpClientUtil;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
@ -34,16 +37,14 @@ import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.util.ExecutorUtil;
import org.apache.solr.common.util.SolrjNamedThreadFactory;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.http.client.HttpClient;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import static org.apache.solr.common.params.CommonParams.DISTRIB;
public abstract class IterativeMergeStrategy implements MergeStrategy {
protected ExecutorService executorService;
protected static HttpClient httpClient;
protected volatile ExecutorService executorService;
protected volatile CloseableHttpClient httpClient;
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
@ -51,11 +52,13 @@ public abstract class IterativeMergeStrategy implements MergeStrategy {
rb._responseDocs = new SolrDocumentList(); // Null pointers will occur otherwise.
rb.onePassDistributedQuery = true; // Turn off the second pass distributed.
executorService = ExecutorUtil.newMDCAwareCachedThreadPool(new SolrjNamedThreadFactory("IterativeMergeStrategy"));
httpClient = getHttpClient();
try {
process(rb, sreq);
} catch (Exception e) {
throw new RuntimeException(e);
} finally {
HttpClientUtil.close(httpClient);
executorService.shutdownNow();
}
}
@ -76,7 +79,7 @@ public abstract class IterativeMergeStrategy implements MergeStrategy {
}
public static class CallBack implements Callable<CallBack> {
public class CallBack implements Callable<CallBack> {
private HttpSolrClient solrClient;
private QueryRequest req;
private QueryResponse response;
@ -85,7 +88,7 @@ public abstract class IterativeMergeStrategy implements MergeStrategy {
public CallBack(ShardResponse originalShardResponse, QueryRequest req) {
this.solrClient = new Builder(originalShardResponse.getShardAddress())
.withHttpClient(getHttpClient())
.withHttpClient(httpClient)
.build();
this.req = req;
this.originalShardResponse = originalShardResponse;
@ -122,16 +125,16 @@ public abstract class IterativeMergeStrategy implements MergeStrategy {
protected abstract void process(ResponseBuilder rb, ShardRequest sreq) throws Exception;
static synchronized HttpClient getHttpClient() {
if(httpClient == null) {
private CloseableHttpClient getHttpClient() {
ModifiableSolrParams params = new ModifiableSolrParams();
params.set(HttpClientUtil.PROP_MAX_CONNECTIONS, 128);
params.set(HttpClientUtil.PROP_MAX_CONNECTIONS_PER_HOST, 32);
httpClient = HttpClientUtil.createClient(params);
return httpClient;
} else {
CloseableHttpClient httpClient = HttpClientUtil.createClient(params);
return httpClient;
}
}
}

View File

@ -38,7 +38,6 @@ import org.apache.solr.common.util.DataInputInputStream;
import org.apache.solr.common.util.FastInputStream;
import org.apache.solr.common.util.JavaBinCodec;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.handler.RequestHandlerUtils;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.update.AddUpdateCommand;
@ -89,13 +88,6 @@ public class JavabinLoader extends ContentStreamLoader {
@Override
public void update(SolrInputDocument document, UpdateRequest updateRequest, Integer commitWithin, Boolean overwrite) {
if (document == null) {
// Perhaps commit from the parameters
try {
RequestHandlerUtils.handleCommit(req, processor, updateRequest.getParams(), false);
RequestHandlerUtils.handleRollback(req, processor, updateRequest.getParams(), false);
} catch (IOException e) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "ERROR handling commit/rollback");
}
return;
}
if (addCmd == null) {

View File

@ -53,7 +53,7 @@ class SolrSchema extends AbstractSchema {
@Override
protected Map<String, Table> getTableMap() {
String zk = this.properties.getProperty("zk");
try(CloudSolrClient cloudSolrClient = new CloudSolrClient.Builder(Collections.singletonList(zk), Optional.empty()).build()) {
try(CloudSolrClient cloudSolrClient = new CloudSolrClient.Builder(Collections.singletonList(zk), Optional.empty()).withSocketTimeout(30000).withConnectionTimeout(15000).build()) {
cloudSolrClient.connect();
ZkStateReader zkStateReader = cloudSolrClient.getZkStateReader();
ClusterState clusterState = zkStateReader.getClusterState();
@ -77,7 +77,7 @@ class SolrSchema extends AbstractSchema {
private Map<String, LukeResponse.FieldInfo> getFieldInfo(String collection) {
String zk = this.properties.getProperty("zk");
try(CloudSolrClient cloudSolrClient = new CloudSolrClient.Builder(Collections.singletonList(zk), Optional.empty()).build()) {
try(CloudSolrClient cloudSolrClient = new CloudSolrClient.Builder(Collections.singletonList(zk), Optional.empty()).withSocketTimeout(30000).withConnectionTimeout(15000).build()) {
cloudSolrClient.connect();
LukeRequest lukeRequest = new LukeRequest();
lukeRequest.setNumTerms(0);

View File

@ -34,8 +34,6 @@ import java.util.concurrent.Future;
import java.util.concurrent.FutureTask;
import java.util.concurrent.RunnableFuture;
import java.util.concurrent.Semaphore;
import java.util.concurrent.SynchronousQueue;
import java.util.concurrent.TimeUnit;
import java.util.function.Predicate;
import org.apache.lucene.index.LeafReader;
@ -66,7 +64,6 @@ import org.apache.solr.common.params.FacetParams;
import org.apache.solr.common.params.GroupParams;
import org.apache.solr.common.params.RequiredSolrParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.ExecutorUtil;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.common.util.StrUtils;
@ -93,7 +90,6 @@ import org.apache.solr.search.facet.FacetDebugInfo;
import org.apache.solr.search.facet.FacetRequest;
import org.apache.solr.search.grouping.GroupingSpecification;
import org.apache.solr.util.BoundedTreeSet;
import org.apache.solr.util.DefaultSolrThreadFactory;
import org.apache.solr.util.RTimer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -170,6 +166,7 @@ public class SimpleFacets {
this.docsOrig = docs;
this.global = params;
this.rb = rb;
this.facetExecutor = req.getCore().getCoreContainer().getUpdateShardHandler().getUpdateExecutor();
}
public void setFacetDebugInfo(FacetDebugInfo fdebugParent) {
@ -773,13 +770,7 @@ public class SimpleFacets {
}
};
static final Executor facetExecutor = new ExecutorUtil.MDCAwareThreadPoolExecutor(
0,
Integer.MAX_VALUE,
10, TimeUnit.SECONDS, // terminate idle threads after 10 sec
new SynchronousQueue<Runnable>() // directly hand off tasks
, new DefaultSolrThreadFactory("facetExecutor")
);
private final Executor facetExecutor;
/**
* Returns a list of value constraints and the associated facet counts

View File

@ -55,7 +55,7 @@ public class SolrRequestInfo {
SolrRequestInfo prev = threadLocal.get();
if (prev != null) {
log.error("Previous SolrRequestInfo was not closed! req=" + prev.req.getOriginalParams().toString());
log.error("prev == info : {}", prev.req == info.req);
log.error("prev == info : {}", prev.req == info.req, new RuntimeException());
}
assert prev == null;

View File

@ -60,7 +60,7 @@ public class PKIAuthenticationPlugin extends AuthenticationPlugin implements Htt
private final Map<String, PublicKey> keyCache = new ConcurrentHashMap<>();
private final PublicKeyHandler publicKeyHandler;
private final CoreContainer cores;
private final int MAX_VALIDITY = Integer.parseInt(System.getProperty("pkiauth.ttl", "10000"));
private final int MAX_VALIDITY = Integer.parseInt(System.getProperty("pkiauth.ttl", "15000"));
private final String myNodeName;
private final HttpHeaderClientInterceptor interceptor = new HttpHeaderClientInterceptor();
private boolean interceptorRegistered = false;

View File

@ -885,9 +885,8 @@ public class HttpSolrCall {
boolean byCoreName = false;
if (slices == null) {
activeSlices = new ArrayList<>();
// look by core name
byCoreName = true;
activeSlices = new ArrayList<>();
getSlicesForCollections(clusterState, activeSlices, true);
if (activeSlices.isEmpty()) {
getSlicesForCollections(clusterState, activeSlices, false);
@ -930,7 +929,7 @@ public class HttpSolrCall {
if (!activeReplicas || (liveNodes.contains(replica.getNodeName())
&& replica.getState() == Replica.State.ACTIVE)) {
if (byCoreName && !collectionName.equals(replica.getStr(CORE_NAME_PROP))) {
if (byCoreName && !origCorename.equals(replica.getStr(CORE_NAME_PROP))) {
// if it's by core name, make sure they match
continue;
}

View File

@ -102,6 +102,7 @@ public class SolrDispatchFilter extends BaseSolrFilter {
private final String metricTag = Integer.toHexString(hashCode());
private SolrMetricManager metricManager;
private String registryName;
private volatile boolean closeOnDestroy = true;
/**
* Enum to define action that needs to be processed.
@ -294,11 +295,22 @@ public class SolrDispatchFilter extends BaseSolrFilter {
@Override
public void destroy() {
if (closeOnDestroy) {
close();
}
}
public void close() {
CoreContainer cc = cores;
cores = null;
try {
try {
FileCleaningTracker fileCleaningTracker = SolrRequestParsers.fileCleaningTracker;
if (fileCleaningTracker != null) {
fileCleaningTracker.exitWhenFinished();
}
} catch (NullPointerException e) {
// okay
} catch (Exception e) {
log.warn("Exception closing FileCleaningTracker", e);
} finally {
@ -306,14 +318,20 @@ public class SolrDispatchFilter extends BaseSolrFilter {
}
if (metricManager != null) {
metricManager.unregisterGauges(registryName, metricTag);
}
if (cores != null) {
try {
cores.shutdown();
metricManager.unregisterGauges(registryName, metricTag);
} catch (NullPointerException e) {
// okay
} catch (Exception e) {
log.warn("Exception closing FileCleaningTracker", e);
} finally {
cores = null;
metricManager = null;
}
}
} finally {
if (cc != null) {
httpClient = null;
cc.shutdown();
}
}
}
@ -594,4 +612,8 @@ public class SolrDispatchFilter extends BaseSolrFilter {
return response;
}
}
public void closeOnDestroy(boolean closeOnDestroy) {
this.closeOnDestroy = closeOnDestroy;
}
}

View File

@ -59,7 +59,7 @@ public final class CommitTracker implements Runnable {
private long tLogFileSizeUpperBound;
private final ScheduledExecutorService scheduler =
Executors.newScheduledThreadPool(1, new DefaultSolrThreadFactory("commitScheduler"));
Executors.newScheduledThreadPool(0, new DefaultSolrThreadFactory("commitScheduler"));
private ScheduledFuture pending;
// state

View File

@ -814,7 +814,7 @@ public class DirectUpdateHandler2 extends UpdateHandler implements SolrCoreState
}
public static boolean commitOnClose = true; // TODO: make this a real config option or move it to TestInjection
public static volatile boolean commitOnClose = true; // TODO: make this a real config option or move it to TestInjection
// IndexWriterCloser interface method - called from solrCoreState.decref(this)
@Override
@ -823,16 +823,14 @@ public class DirectUpdateHandler2 extends UpdateHandler implements SolrCoreState
assert TestInjection.injectNonGracefullClose(core.getCoreContainer());
boolean clearRequestInfo = false;
solrCoreState.getCommitLock().lock();
try {
SolrQueryRequest req = new LocalSolrQueryRequest(core, new ModifiableSolrParams());
SolrQueryResponse rsp = new SolrQueryResponse();
if (SolrRequestInfo.getRequestInfo() == null) {
clearRequestInfo = true;
SolrRequestInfo.setRequestInfo(new SolrRequestInfo(req, rsp)); // important for debugging
}
try {
if (!commitOnClose) {
if (writer != null) {
writer.rollback();
@ -846,8 +844,12 @@ public class DirectUpdateHandler2 extends UpdateHandler implements SolrCoreState
}
// do a commit before we quit?
boolean tryToCommit = writer != null && ulog != null && ulog.hasUncommittedChanges() && ulog.getState() == UpdateLog.State.ACTIVE;
boolean tryToCommit = writer != null && ulog != null && ulog.hasUncommittedChanges()
&& ulog.getState() == UpdateLog.State.ACTIVE;
// be tactical with this lock! closing the updatelog can deadlock when it tries to commit
solrCoreState.getCommitLock().lock();
try {
try {
if (tryToCommit) {
log.info("Committing on IndexWriter close.");
@ -878,6 +880,13 @@ public class DirectUpdateHandler2 extends UpdateHandler implements SolrCoreState
}
}
} finally {
solrCoreState.getCommitLock().unlock();
}
} finally {
if (clearRequestInfo) SolrRequestInfo.clearRequestInfo();
}
// we went through the normal process to commit, so we don't have to artificially
// cap any ulog files.
try {
@ -893,10 +902,6 @@ public class DirectUpdateHandler2 extends UpdateHandler implements SolrCoreState
writer.close();
}
} finally {
solrCoreState.getCommitLock().unlock();
if (clearRequestInfo) SolrRequestInfo.clearRequestInfo();
}
}
@Override

View File

@ -26,6 +26,7 @@ import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.search.Sort;
import org.apache.solr.cloud.ActionThrottle;
import org.apache.solr.cloud.RecoveryStrategy;
import org.apache.solr.common.AlreadyClosedException;
import org.apache.solr.core.CoreContainer;
import org.apache.solr.core.CoreDescriptor;
import org.apache.solr.core.DirectoryFactory;
@ -172,7 +173,12 @@ public abstract class SolrCoreState {
public abstract void setLastReplicateIndexSuccess(boolean success);
public static class CoreIsClosedException extends IllegalStateException {
public static class CoreIsClosedException extends AlreadyClosedException {
public CoreIsClosedException() {
super();
}
public CoreIsClosedException(String s) {
super(s);
}

View File

@ -183,7 +183,7 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
}
long id = -1;
protected State state = State.ACTIVE;
protected volatile State state = State.ACTIVE;
protected TransactionLog bufferTlog;
protected TransactionLog tlog;
@ -1351,9 +1351,10 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
}
public void close(boolean committed, boolean deleteOnClose) {
synchronized (this) {
recoveryExecutor.shutdown(); // no new tasks
synchronized (this) {
// Don't delete the old tlogs, we want to be able to replay from them and retrieve old versions
doClose(prevTlog, committed);
@ -1373,13 +1374,14 @@ public class UpdateLog implements PluginInfoInitialized, SolrMetricProducer {
bufferTlog.forceClose();
}
}
try {
ExecutorUtil.shutdownAndAwaitTermination(recoveryExecutor);
} catch (Exception e) {
SolrException.log(log, e);
}
}
}
static class Update {

View File

@ -66,10 +66,14 @@ public class UpdateShardHandler implements SolrMetricProducer, SolrInfoBean {
private final CloseableHttpClient updateOnlyClient;
private final CloseableHttpClient recoveryOnlyClient;
private final CloseableHttpClient defaultClient;
private final InstrumentedPoolingHttpClientConnectionManager updateOnlyConnectionManager;
private final InstrumentedPoolingHttpClientConnectionManager recoveryOnlyConnectionManager;
private final InstrumentedPoolingHttpClientConnectionManager defaultConnectionManager;
private final InstrumentedHttpRequestExecutor httpRequestExecutor;
@ -83,10 +87,13 @@ public class UpdateShardHandler implements SolrMetricProducer, SolrInfoBean {
public UpdateShardHandler(UpdateShardHandlerConfig cfg) {
updateOnlyConnectionManager = new InstrumentedPoolingHttpClientConnectionManager(HttpClientUtil.getSchemaRegisteryProvider().getSchemaRegistry());
recoveryOnlyConnectionManager = new InstrumentedPoolingHttpClientConnectionManager(HttpClientUtil.getSchemaRegisteryProvider().getSchemaRegistry());
defaultConnectionManager = new InstrumentedPoolingHttpClientConnectionManager(HttpClientUtil.getSchemaRegisteryProvider().getSchemaRegistry());
if (cfg != null ) {
updateOnlyConnectionManager.setMaxTotal(cfg.getMaxUpdateConnections());
updateOnlyConnectionManager.setDefaultMaxPerRoute(cfg.getMaxUpdateConnectionsPerHost());
recoveryOnlyConnectionManager.setMaxTotal(cfg.getMaxUpdateConnections());
recoveryOnlyConnectionManager.setDefaultMaxPerRoute(cfg.getMaxUpdateConnectionsPerHost());
defaultConnectionManager.setMaxTotal(cfg.getMaxUpdateConnections());
defaultConnectionManager.setDefaultMaxPerRoute(cfg.getMaxUpdateConnectionsPerHost());
}
@ -110,6 +117,7 @@ public class UpdateShardHandler implements SolrMetricProducer, SolrInfoBean {
httpRequestExecutor = new InstrumentedHttpRequestExecutor(metricNameStrategy);
updateOnlyClient = HttpClientUtil.createClient(clientParams, updateOnlyConnectionManager, false, httpRequestExecutor);
recoveryOnlyClient = HttpClientUtil.createClient(clientParams, recoveryOnlyConnectionManager, false, httpRequestExecutor);
defaultClient = HttpClientUtil.createClient(clientParams, defaultConnectionManager, false, httpRequestExecutor);
// following is done only for logging complete configuration.
@ -178,6 +186,11 @@ public class UpdateShardHandler implements SolrMetricProducer, SolrInfoBean {
return updateOnlyClient;
}
// don't introduce a bug, this client is for recovery ops only!
public HttpClient getRecoveryOnlyHttpClient() {
return recoveryOnlyClient;
}
/**
* This method returns an executor that is meant for non search related tasks.
@ -192,6 +205,10 @@ public class UpdateShardHandler implements SolrMetricProducer, SolrInfoBean {
return defaultConnectionManager;
}
public PoolingHttpClientConnectionManager getRecoveryOnlyConnectionManager() {
return recoveryOnlyConnectionManager;
}
/**
*
* @return executor for recovery operations
@ -206,12 +223,14 @@ public class UpdateShardHandler implements SolrMetricProducer, SolrInfoBean {
ExecutorUtil.shutdownAndAwaitTermination(updateExecutor);
ExecutorUtil.shutdownAndAwaitTermination(recoveryExecutor);
} catch (Exception e) {
SolrException.log(log, e);
throw new RuntimeException(e);
} finally {
HttpClientUtil.close(updateOnlyClient);
HttpClientUtil.close(recoveryOnlyClient);
HttpClientUtil.close(defaultClient);
updateOnlyConnectionManager.close();
defaultConnectionManager.close();
recoveryOnlyConnectionManager.close();
}
}

View File

@ -16,6 +16,9 @@
*/
package org.apache.solr.update.processor;
import static org.apache.solr.common.params.CommonParams.DISTRIB;
import static org.apache.solr.update.processor.DistributingUpdateProcessorFactory.DISTRIB_UPDATE_PARAM;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
@ -28,6 +31,9 @@ import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.concurrent.CompletionService;
import java.util.concurrent.ExecutorCompletionService;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.ReentrantLock;
@ -37,7 +43,6 @@ import org.apache.lucene.util.CharsRefBuilder;
import org.apache.solr.client.solrj.SolrRequest;
import org.apache.solr.client.solrj.SolrRequest.METHOD;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.cloud.DistributedQueue;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.request.GenericSolrRequest;
import org.apache.solr.client.solrj.request.UpdateRequest;
@ -97,9 +102,6 @@ import org.apache.zookeeper.KeeperException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import static org.apache.solr.common.params.CommonParams.DISTRIB;
import static org.apache.solr.update.processor.DistributingUpdateProcessorFactory.DISTRIB_UPDATE_PARAM;
// NOT mt-safe... create a new processor for each add thread
// TODO: we really should not wait for distrib after local? unless a certain replication factor is asked for
public class DistributedUpdateProcessor extends UpdateRequestProcessor {
@ -116,12 +118,12 @@ public class DistributedUpdateProcessor extends UpdateRequestProcessor {
/**
* Request forwarded to a leader of a different shard will be retried up to this amount of times by default
*/
static final int MAX_RETRIES_ON_FORWARD_DEAULT = 25;
static final int MAX_RETRIES_ON_FORWARD_DEAULT = Integer.getInteger("solr.retries.on.forward", 25);
/**
* Requests from leader to it's followers will be retried this amount of times by default
*/
static final int MAX_RETRIES_TO_FOLLOWERS_DEFAULT = 3;
static final int MAX_RETRIES_TO_FOLLOWERS_DEFAULT = Integer.getInteger("solr.retries.to.followers", 3);
/**
* Values this processor supports for the <code>DISTRIB_UPDATE_PARAM</code>.
@ -434,6 +436,46 @@ public class DistributedUpdateProcessor extends UpdateRequestProcessor {
return false;
}
private List<Node> getReplicaNodesForLeader(String shardId, Replica leaderReplica) {
ClusterState clusterState = zkController.getZkStateReader().getClusterState();
String leaderCoreNodeName = leaderReplica.getName();
List<Replica> replicas = clusterState.getCollection(collection)
.getSlice(shardId)
.getReplicas(EnumSet.of(Replica.Type.NRT, Replica.Type.TLOG));
replicas.removeIf((replica) -> replica.getName().equals(leaderCoreNodeName));
if (replicas.isEmpty()) {
return null;
}
// check for test param that lets us miss replicas
String[] skipList = req.getParams().getParams(TEST_DISTRIB_SKIP_SERVERS);
Set<String> skipListSet = null;
if (skipList != null) {
skipListSet = new HashSet<>(skipList.length);
skipListSet.addAll(Arrays.asList(skipList));
log.info("test.distrib.skip.servers was found and contains:" + skipListSet);
}
List<Node> nodes = new ArrayList<>(replicas.size());
skippedCoreNodeNames = new HashSet<>();
ZkShardTerms zkShardTerms = zkController.getShardTerms(collection, shardId);
for (Replica replica : replicas) {
String coreNodeName = replica.getName();
if (skipList != null && skipListSet.contains(replica.getCoreUrl())) {
log.info("check url:" + replica.getCoreUrl() + " against:" + skipListSet + " result:true");
} else if (zkShardTerms.registered(coreNodeName) && zkShardTerms.skipSendingUpdatesTo(coreNodeName)) {
log.debug("skip url:{} cause its term is less than leader", replica.getCoreUrl());
skippedCoreNodeNames.add(replica.getName());
} else if (!clusterState.getLiveNodes().contains(replica.getNodeName())
|| replica.getState() == Replica.State.DOWN) {
skippedCoreNodeNames.add(replica.getName());
} else {
nodes.add(new StdNode(new ZkCoreNodeProps(replica), collection, shardId));
}
}
return nodes;
}
/** For {@link org.apache.solr.common.params.CollectionParams.CollectionAction#SPLITSHARD} */
private List<Node> getSubShardLeaders(DocCollection coll, String shardId, String docId, SolrInputDocument doc) {
Collection<Slice> allSlices = coll.getSlices();
@ -521,8 +563,7 @@ public class DistributedUpdateProcessor extends UpdateRequestProcessor {
ZkStateReader.SHARD_ID_PROP, myShardId,
"routeKey", routeKey + "!");
SolrZkClient zkClient = zkController.getZkClient();
DistributedQueue queue = Overseer.getStateUpdateQueue(zkClient);
queue.offer(Utils.toJSON(map));
zkController.getOverseer().offerStateUpdate(Utils.toJSON(map));
} catch (KeeperException e) {
log.warn("Exception while removing routing rule for route key: " + routeKey, e);
} catch (Exception e) {
@ -1865,27 +1906,33 @@ public class DistributedUpdateProcessor extends UpdateRequestProcessor {
updateCommand = cmd;
List<Node> nodes = null;
boolean singleLeader = false;
Replica leaderReplica = null;
if (zkEnabled) {
zkCheck();
try {
leaderReplica = zkController.getZkStateReader().getLeaderRetry(collection, cloudDesc.getShardId());
} catch (InterruptedException e) {
Thread.interrupted();
throw new SolrException(ErrorCode.SERVICE_UNAVAILABLE, "Exception finding leader for shard " + cloudDesc.getShardId(), e);
}
isLeader = leaderReplica.getName().equals(cloudDesc.getCoreNodeName());
nodes = getCollectionUrls(collection, EnumSet.of(Replica.Type.TLOG,Replica.Type.NRT));
nodes = getCollectionUrls(collection, EnumSet.of(Replica.Type.TLOG,Replica.Type.NRT), true);
if (nodes == null) {
// This could happen if there are only pull replicas
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
"Unable to distribute commit operation. No replicas available of types " + Replica.Type.TLOG + " or " + Replica.Type.NRT);
}
if (isLeader && nodes.size() == 1 && replicaType != Replica.Type.PULL) {
singleLeader = true;
}
nodes.removeIf((node) -> node.getNodeProps().getNodeName().equals(zkController.getNodeName())
&& node.getNodeProps().getCoreName().equals(req.getCore().getName()));
}
if (!zkEnabled || req.getParams().getBool(COMMIT_END_POINT, false) || singleLeader) {
CompletionService<Exception> completionService = new ExecutorCompletionService<>(req.getCore().getCoreContainer().getUpdateShardHandler().getUpdateExecutor());
Set<Future<Exception>> pending = new HashSet<>();
if (!zkEnabled || (!isLeader && req.getParams().get(COMMIT_END_POINT, "").equals("replicas"))) {
if (replicaType == Replica.Type.TLOG) {
try {
Replica leaderReplica = zkController.getZkStateReader().getLeaderRetry(
collection, cloudDesc.getShardId());
isLeader = leaderReplica.getName().equals(cloudDesc.getCoreNodeName());
if (isLeader) {
long commitVersion = vinfo.getNewClock();
cmd.setVersion(commitVersion);
@ -1894,9 +1941,7 @@ public class DistributedUpdateProcessor extends UpdateRequestProcessor {
assert TestInjection.waitForInSyncWithLeader(req.getCore(),
zkController, collection, cloudDesc.getShardId()) : "Core " + req.getCore() + " not in sync with leader";
}
} catch (InterruptedException e) {
throw new SolrException(ErrorCode.SERVICE_UNAVAILABLE, "Exception finding leader for shard " + cloudDesc.getShardId(), e);
}
} else if (replicaType == Replica.Type.PULL) {
log.warn("Commit not supported on replicas of type " + Replica.Type.PULL);
} else {
@ -1905,21 +1950,51 @@ public class DistributedUpdateProcessor extends UpdateRequestProcessor {
long commitVersion = vinfo.getNewClock();
cmd.setVersion(commitVersion);
}
doLocalCommit(cmd);
}
} else {
ModifiableSolrParams params = new ModifiableSolrParams(filterParams(req.getParams()));
if (!req.getParams().getBool(COMMIT_END_POINT, false)) {
params.set(COMMIT_END_POINT, true);
params.set(DISTRIB_UPDATE_PARAM, DistribPhase.FROMLEADER.toString());
List<Node> useNodes = null;
if (req.getParams().get(COMMIT_END_POINT) == null) {
useNodes = nodes;
params.set(DISTRIB_UPDATE_PARAM, DistribPhase.TOLEADER.toString());
params.set(COMMIT_END_POINT, "leaders");
if (useNodes != null) {
params.set(DISTRIB_FROM, ZkCoreNodeProps.getCoreUrl(
zkController.getBaseUrl(), req.getCore().getName()));
if (nodes != null) {
cmdDistrib.distribCommit(cmd, nodes, params);
cmdDistrib.distribCommit(cmd, useNodes, params);
cmdDistrib.blockAndDoRetries();
}
}
if (isLeader) {
params.set(DISTRIB_UPDATE_PARAM, DistribPhase.FROMLEADER.toString());
params.set(COMMIT_END_POINT, "replicas");
useNodes = getReplicaNodesForLeader(cloudDesc.getShardId(), leaderReplica);
if (useNodes != null) {
params.set(DISTRIB_FROM, ZkCoreNodeProps.getCoreUrl(
zkController.getBaseUrl(), req.getCore().getName()));
cmdDistrib.distribCommit(cmd, useNodes, params);
}
// NRT replicas will always commit
if (vinfo != null) {
long commitVersion = vinfo.getNewClock();
cmd.setVersion(commitVersion);
}
doLocalCommit(cmd);
if (useNodes != null) {
cmdDistrib.blockAndDoRetries();
}
}
}
}
private void doLocalCommit(CommitUpdateCommand cmd) throws IOException {
@ -1951,7 +2026,7 @@ public class DistributedUpdateProcessor extends UpdateRequestProcessor {
if (next != null && nodes == null) next.finish();
}
private List<Node> getCollectionUrls(String collection, EnumSet<Replica.Type> types) {
private List<Node> getCollectionUrls(String collection, EnumSet<Replica.Type> types, boolean onlyLeaders) {
ClusterState clusterState = zkController.getClusterState();
final DocCollection docCollection = clusterState.getCollectionOrNull(collection);
if (collection == null || docCollection.getSlicesMap() == null) {
@ -1962,7 +2037,14 @@ public class DistributedUpdateProcessor extends UpdateRequestProcessor {
final List<Node> urls = new ArrayList<>(slices.size());
for (Map.Entry<String,Slice> sliceEntry : slices.entrySet()) {
Slice replicas = slices.get(sliceEntry.getKey());
if (onlyLeaders) {
Replica replica = docCollection.getLeader(replicas.getName());
if (replica != null) {
ZkCoreNodeProps nodeProps = new ZkCoreNodeProps(replica);
urls.add(new StdNode(nodeProps, collection, replicas.getName()));
}
continue;
}
Map<String,Replica> shardMap = replicas.getReplicasMap();
for (Entry<String,Replica> entry : shardMap.entrySet()) {

View File

@ -2381,7 +2381,7 @@ public class SolrCLI {
protected void deleteCollection(CommandLine cli) throws Exception {
String zkHost = getZkHost(cli);
try (CloudSolrClient cloudSolrClient = new CloudSolrClient.Builder(Collections.singletonList(zkHost), Optional.empty()).build()) {
try (CloudSolrClient cloudSolrClient = new CloudSolrClient.Builder(Collections.singletonList(zkHost), Optional.empty()).withSocketTimeout(30000).withConnectionTimeout(15000).build()) {
echoIfVerbose("Connecting to ZooKeeper at " + zkHost, cli);
cloudSolrClient.connect();
deleteCollection(cloudSolrClient, cli);

View File

@ -16,6 +16,9 @@
*/
package org.apache.solr.util;
import static org.apache.solr.handler.ReplicationHandler.CMD_DETAILS;
import static org.apache.solr.handler.ReplicationHandler.COMMAND;
import java.lang.invoke.MethodHandles;
import java.lang.reflect.Method;
import java.util.Collections;
@ -24,6 +27,7 @@ import java.util.Random;
import java.util.Set;
import java.util.Timer;
import java.util.TimerTask;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
@ -50,9 +54,6 @@ import org.apache.solr.update.SolrIndexWriter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import static org.apache.solr.handler.ReplicationHandler.CMD_DETAILS;
import static org.apache.solr.handler.ReplicationHandler.COMMAND;
/**
* Allows random faults to be injected in running code during test runs.
@ -116,43 +117,50 @@ public class TestInjection {
}
}
public static String nonGracefullClose = null;
public volatile static String nonGracefullClose = null;
public static String failReplicaRequests = null;
public volatile static String failReplicaRequests = null;
public static String failUpdateRequests = null;
public volatile static String failUpdateRequests = null;
public static String nonExistentCoreExceptionAfterUnload = null;
public volatile static String nonExistentCoreExceptionAfterUnload = null;
public static String updateLogReplayRandomPause = null;
public volatile static String updateLogReplayRandomPause = null;
public static String updateRandomPause = null;
public volatile static String updateRandomPause = null;
public static String prepRecoveryOpPauseForever = null;
public volatile static String prepRecoveryOpPauseForever = null;
public static String randomDelayInCoreCreation = null;
public volatile static String randomDelayInCoreCreation = null;
public static int randomDelayMaxInCoreCreationInSec = 10;
public volatile static int randomDelayMaxInCoreCreationInSec = 10;
public static String splitFailureBeforeReplicaCreation = null;
public volatile static String splitFailureBeforeReplicaCreation = null;
public static String splitFailureAfterReplicaCreation = null;
public volatile static String splitFailureAfterReplicaCreation = null;
public static CountDownLatch splitLatch = null;
public volatile static CountDownLatch splitLatch = null;
public static String waitForReplicasInSync = "true:60";
public volatile static String waitForReplicasInSync = "true:60";
public static String failIndexFingerprintRequests = null;
public volatile static String failIndexFingerprintRequests = null;
public static String wrongIndexFingerprint = null;
public volatile static String wrongIndexFingerprint = null;
private static Set<Timer> timers = Collections.synchronizedSet(new HashSet<Timer>());
private volatile static Set<Timer> timers = Collections.synchronizedSet(new HashSet<Timer>());
private static AtomicInteger countPrepRecoveryOpPauseForever = new AtomicInteger(0);
private volatile static AtomicInteger countPrepRecoveryOpPauseForever = new AtomicInteger(0);
public static Integer delayBeforeSlaveCommitRefresh=null;
public volatile static Integer delayBeforeSlaveCommitRefresh=null;
public static boolean uifOutOfMemoryError = false;
public volatile static boolean uifOutOfMemoryError = false;
private volatile static CountDownLatch notifyPauseForeverDone = new CountDownLatch(1);
public static void notifyPauseForeverDone() {
notifyPauseForeverDone.countDown();
notifyPauseForeverDone = new CountDownLatch(1);
}
public static void reset() {
nonGracefullClose = null;
@ -172,7 +180,8 @@ public class TestInjection {
wrongIndexFingerprint = null;
delayBeforeSlaveCommitRefresh = null;
uifOutOfMemoryError = false;
notifyPauseForeverDone();
newSearcherHooks.clear();
for (Timer timer : timers) {
timer.cancel();
}
@ -371,19 +380,20 @@ public class TestInjection {
}
public static boolean injectPrepRecoveryOpPauseForever() {
if (prepRecoveryOpPauseForever != null) {
String val = prepRecoveryOpPauseForever;
if (val != null) {
Random rand = random();
if (null == rand) return true;
Pair<Boolean,Integer> pair = parseValue(prepRecoveryOpPauseForever);
Pair<Boolean,Integer> pair = parseValue(val);
boolean enabled = pair.first();
int chanceIn100 = pair.second();
// Prevent for continuous pause forever
if (enabled && rand.nextInt(100) >= (100 - chanceIn100) && countPrepRecoveryOpPauseForever.get() < 1) {
countPrepRecoveryOpPauseForever.incrementAndGet();
log.info("inject pause forever for prep recovery op");
try {
Thread.sleep(Integer.MAX_VALUE);
notifyPauseForeverDone.await();
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
@ -481,9 +491,12 @@ public class TestInjection {
return false;
}
private static Pair<Boolean,Integer> parseValue(String raw) {
private static Pair<Boolean,Integer> parseValue(final String raw) {
if (raw == null) return new Pair<>(false, 0);
Matcher m = ENABLED_PERCENT.matcher(raw);
if (!m.matches()) throw new RuntimeException("No match, probably bad syntax: " + raw);
if (!m.matches()) {
throw new RuntimeException("No match, probably bad syntax: " + raw);
}
String val = m.group(1);
String percent = "100";
if (m.groupCount() == 2) {
@ -511,4 +524,24 @@ public class TestInjection {
return true;
}
static Set<Hook> newSearcherHooks = ConcurrentHashMap.newKeySet();
public interface Hook {
public void newSearcher(String collectionName);
public void waitForSearcher(String collection, int cnt, int timeoutms, boolean failOnTimeout) throws InterruptedException;
}
public static boolean newSearcherHook(Hook hook) {
newSearcherHooks.add(hook);
return true;
}
public static boolean injectSearcherHooks(String collectionName) {
for (Hook hook : newSearcherHooks) {
hook.newSearcher(collectionName);
}
return true;
}
}

View File

@ -61,8 +61,13 @@ public class TimeOut {
public void waitFor(String messageOnTimeOut, Supplier<Boolean> supplier)
throws InterruptedException, TimeoutException {
while (!supplier.get() && !hasTimedOut()) {
Thread.sleep(500);
Thread.sleep(250);
}
if (hasTimedOut()) throw new TimeoutException(messageOnTimeOut);
}
@Override
public String toString() {
return "TimeOut [timeoutAt=" + timeoutAt + ", startTime=" + startTime + ", timeSource=" + timeSource + "]";
}
}

View File

@ -35,6 +35,7 @@
<int name="autoReplicaFailoverWaitAfterExpiration">${autoReplicaFailoverWaitAfterExpiration:10000}</int>
<int name="autoReplicaFailoverWorkLoopDelay">${autoReplicaFailoverWorkLoopDelay:10000}</int>
<int name="autoReplicaFailoverBadNodeExpiration">${autoReplicaFailoverBadNodeExpiration:60000}</int>
<int name="createCollectionWaitTimeTillActive">${createCollectionWaitTimeTillActive:30}</int>
</solrcloud>
<metrics>

View File

@ -27,7 +27,7 @@
<shardHandlerFactory name="shardHandlerFactory" class="HttpShardHandlerFactory">
<str name="urlScheme">${urlScheme:}</str>
<int name="socketTimeout">${socketTimeout:90000}</int>
<int name="socketTimeout">${socketTimeout:15000}</int>
<int name="connTimeout">${connTimeout:15000}</int>
</shardHandlerFactory>
@ -40,12 +40,12 @@
<str name="host">127.0.0.1</str>
<int name="hostPort">${hostPort:8983}</int>
<str name="hostContext">${hostContext:solr}</str>
<int name="zkClientTimeout">${solr.zkclienttimeout:30000}</int>
<int name="zkClientTimeout">${solr.zkclienttimeout:60000}</int> <!-- This should be high by default - dc's are expensive -->
<bool name="genericCoreNodeNames">${genericCoreNodeNames:true}</bool>
<int name="leaderVoteWait">${leaderVoteWait:10000}</int>
<int name="leaderConflictResolveWait">${leaderConflictResolveWait:180000}</int>
<int name="distribUpdateConnTimeout">${distribUpdateConnTimeout:45000}</int>
<int name="distribUpdateSoTimeout">${distribUpdateSoTimeout:340000}</int>
<int name="leaderVoteWait">${leaderVoteWait:15000}</int> <!-- We are running tests - the default should be low, not like production -->
<int name="leaderConflictResolveWait">${leaderConflictResolveWait:45000}</int>
<int name="distribUpdateConnTimeout">${distribUpdateConnTimeout:5000}</int>
<int name="distribUpdateSoTimeout">${distribUpdateSoTimeout:15000}</int> <!-- We are running tests - the default should be low, not like production -->
<int name="autoReplicaFailoverWaitAfterExpiration">${autoReplicaFailoverWaitAfterExpiration:10000}</int>
<int name="autoReplicaFailoverWorkLoopDelay">${autoReplicaFailoverWorkLoopDelay:10000}</int>
<int name="autoReplicaFailoverBadNodeExpiration">${autoReplicaFailoverBadNodeExpiration:60000}</int>

View File

@ -22,9 +22,14 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.EnumSet;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorCompletionService;
import java.util.concurrent.Future;
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.util.LuceneTestCase.Slow;
@ -38,16 +43,15 @@ import org.apache.solr.client.solrj.response.FacetField;
import org.apache.solr.client.solrj.response.FieldStatsInfo;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.client.solrj.response.RangeFacet;
import org.apache.solr.cloud.ChaosMonkey;
import org.apache.solr.common.EnumFieldValue;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.FacetParams.FacetRangeMethod;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.ShardParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.params.StatsParams;
import org.apache.solr.common.params.FacetParams.FacetRangeMethod;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.handler.component.ShardResponse;
import org.apache.solr.handler.component.StatsComponentTest.StatSetCombinations;
@ -100,6 +104,11 @@ public class TestDistributedSearch extends BaseDistributedSearchTestCase {
// we validate the connection before use on the restarted
// server so that we don't use a bad one
System.setProperty("validateAfterInactivity", "200");
System.setProperty("solr.httpclient.retries", "0");
System.setProperty("distribUpdateSoTimeout", "5000");
}
public TestDistributedSearch() {
@ -109,6 +118,9 @@ public class TestDistributedSearch extends BaseDistributedSearchTestCase {
@Test
public void test() throws Exception {
assertEquals(clients.size(), jettys.size());
QueryResponse rsp = null;
int backupStress = stress; // make a copy so we can restore
@ -952,23 +964,30 @@ public class TestDistributedSearch extends BaseDistributedSearchTestCase {
assertEquals("should have an entry for each shard ["+sinfo+"] "+shards, cnt, sinfo.size());
// test shards.tolerant=true
for(int numDownServers = 0; numDownServers < jettys.size()-1; numDownServers++)
{
List<JettySolrRunner> upJettys = new ArrayList<>(jettys);
List<SolrClient> upClients = new ArrayList<>(clients);
List<JettySolrRunner> downJettys = new ArrayList<>();
List<String> upShards = new ArrayList<>(Arrays.asList(shardsArr));
for(int i=0; i<numDownServers; i++)
{
List<JettySolrRunner> upJettys = Collections.synchronizedList(new ArrayList<>(jettys));
List<SolrClient> upClients = Collections.synchronizedList(new ArrayList<>(clients));
List<JettySolrRunner> downJettys = Collections.synchronizedList(new ArrayList<>());
List<String> upShards = Collections.synchronizedList(new ArrayList<>(Arrays.asList(shardsArr)));
int cap = Math.max(upJettys.size() - 1, 1);
int numDownServers = random().nextInt(cap);
for (int i = 0; i < numDownServers; i++) {
if (upJettys.size() == 1) {
continue;
}
// shut down some of the jettys
int indexToRemove = r.nextInt(upJettys.size());
int indexToRemove = r.nextInt(upJettys.size() - 1);
JettySolrRunner downJetty = upJettys.remove(indexToRemove);
upClients.remove(indexToRemove);
upShards.remove(indexToRemove);
ChaosMonkey.stop(downJetty);
downJetty.stop();
downJettys.add(downJetty);
}
Thread.sleep(100);
queryPartialResults(upShards, upClients,
"q", "*:*",
"facet", "true",
@ -995,7 +1014,7 @@ public class TestDistributedSearch extends BaseDistributedSearchTestCase {
"group.query", t1 + ":kings OR " + t1 + ":eggs",
"group.limit", 10,
"sort", i1 + " asc, id asc",
CommonParams.TIME_ALLOWED, 1,
CommonParams.TIME_ALLOWED, 10000,
ShardParams.SHARDS_INFO, "true",
ShardParams.SHARDS_TOLERANT, "true");
@ -1017,10 +1036,10 @@ public class TestDistributedSearch extends BaseDistributedSearchTestCase {
// restart the jettys
for (JettySolrRunner downJetty : downJettys) {
ChaosMonkey.start(downJetty);
}
downJetty.start();
}
// This index has the same number for every field
// TODO: This test currently fails because debug info is obtained only
@ -1125,17 +1144,22 @@ public class TestDistributedSearch extends BaseDistributedSearchTestCase {
params.remove("distrib");
setDistributedParams(params);
if (upClients.size() == 0) {
return;
}
QueryResponse rsp = queryRandomUpServer(params, upClients);
comparePartialResponses(rsp, controlRsp, upShards);
if (stress > 0) {
log.info("starting stress...");
Thread[] threads = new Thread[nThreads];
Set<Future<Object>> pending = new HashSet<>();;
ExecutorCompletionService<Object> cs = new ExecutorCompletionService<>(executor);
Callable[] threads = new Callable[nThreads];
for (int i = 0; i < threads.length; i++) {
threads[i] = new Thread() {
threads[i] = new Callable() {
@Override
public void run() {
public Object call() {
for (int j = 0; j < stress; j++) {
int which = r.nextInt(upClients.size());
SolrClient client = upClients.get(which);
@ -1148,21 +1172,32 @@ public class TestDistributedSearch extends BaseDistributedSearchTestCase {
throw new RuntimeException(e);
}
}
return null;
}
};
threads[i].start();
pending.add(cs.submit(threads[i]));
}
for (Thread thread : threads) {
thread.join();
while (pending.size() > 0) {
Future<Object> future = cs.take();
pending.remove(future);
future.get();
}
}
}
protected QueryResponse queryRandomUpServer(ModifiableSolrParams params, List<SolrClient> upClients) throws SolrServerException, IOException {
protected QueryResponse queryRandomUpServer(ModifiableSolrParams params, List<SolrClient> upClients)
throws SolrServerException, IOException {
// query a random "up" server
int which = r.nextInt(upClients.size());
SolrClient client = upClients.get(which);
SolrClient client;
if (upClients.size() == 1) {
client = upClients.get(0);
} else {
int which = r.nextInt(upClients.size() - 1);
client = upClients.get(which);
}
QueryResponse rsp = client.query(params);
return rsp;
}
@ -1195,7 +1230,7 @@ public class TestDistributedSearch extends BaseDistributedSearchTestCase {
assertTrue("Expected timeAllowedError or to find shardAddress in the up shard info: " + info.toString(), info.get("shardAddress") != null);
}
} else {
assertEquals("Expected to find the " + SolrQueryResponse.RESPONSE_HEADER_PARTIAL_RESULTS_KEY + " header set if a shard is down",
assertEquals("Expected to find the " + SolrQueryResponse.RESPONSE_HEADER_PARTIAL_RESULTS_KEY + " header set if a shard is down. Response: " + rsp,
Boolean.TRUE, rsp.getHeader().get(SolrQueryResponse.RESPONSE_HEADER_PARTIAL_RESULTS_KEY));
assertTrue("Expected to find error in the down shard info: " + info.toString(), info.get("error") != null);
}

View File

@ -16,14 +16,16 @@
*/
package org.apache.solr;
import java.io.IOException;
import org.apache.lucene.search.TimeLimitingCollector;
import org.apache.lucene.util.TestUtil;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrInputDocument;
import org.junit.AfterClass;
import org.junit.Test;
import java.io.IOException;
/**
* Tests that highlighting doesn't break on grouped documents
* with duplicate unique key fields stored on multiple shards.
@ -34,6 +36,12 @@ public class TestHighlightDedupGrouping extends BaseDistributedSearchTestCase {
private static final String group_ti1 = "group_ti1";
private static final String shard_i1 = "shard_i1";
@AfterClass
public static void afterClass() throws Exception {
TimeLimitingCollector.getGlobalTimerThread().stopTimer();
TimeLimitingCollector.getGlobalTimerThread().join();
}
@Test
@ShardsFixed(num = 2)
public void test() throws Exception {

View File

@ -57,7 +57,7 @@ public class TestTolerantSearch extends SolrJettyTestBase {
@BeforeClass
public static void createThings() throws Exception {
solrHome = createSolrHome();
createJetty(solrHome.getAbsolutePath());
createAndStartJetty(solrHome.getAbsolutePath());
String url = jetty.getBaseUrl().toString();
collection1 = getHttpSolrClient(url + "/collection1");
collection2 = getHttpSolrClient(url + "/collection2");

View File

@ -16,6 +16,9 @@
*/
package org.apache.solr.cloud;
import static org.apache.solr.client.solrj.response.RequestStatusState.COMPLETED;
import static org.apache.solr.client.solrj.response.RequestStatusState.FAILED;
import java.lang.invoke.MethodHandles;
import java.util.Collection;
import java.util.EnumSet;
@ -27,26 +30,21 @@ import org.apache.solr.client.solrj.response.RequestStatusState;
import org.apache.solr.common.cloud.ClusterState;
import org.apache.solr.common.cloud.DocCollection;
import org.apache.solr.common.cloud.Replica;
import org.apache.solr.util.LogLevel;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import static org.apache.solr.client.solrj.response.RequestStatusState.COMPLETED;
import static org.apache.solr.client.solrj.response.RequestStatusState.FAILED;
/**
*
*/
@LogLevel("org.apache.solr.cloud=DEBUG;org.apache.solr.cloud.Overseer=DEBUG;org.apache.solr.cloud.overseer=DEBUG;")
public class AddReplicaTest extends SolrCloudTestCase {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
@BeforeClass
public static void setupCluster() throws Exception {
configureCluster(4)
configureCluster(3)
.addConfig("conf1", TEST_PATH().resolve("configsets").resolve("cloud-minimal").resolve("conf"))
.configure();
}
@ -59,13 +57,14 @@ public class AddReplicaTest extends SolrCloudTestCase {
@Test
public void testAddMultipleReplicas() throws Exception {
cluster.waitForAllNodes(5);
String collection = "testAddMultipleReplicas";
CloudSolrClient cloudClient = cluster.getSolrClient();
CollectionAdminRequest.Create create = CollectionAdminRequest.createCollection(collection, "conf1", 1, 1);
create.setMaxShardsPerNode(2);
cloudClient.request(create);
cluster.waitForActiveCollection(collection, 1, 1);
CollectionAdminRequest.AddReplica addReplica = CollectionAdminRequest.addReplicaToShard(collection, "shard1")
.setNrtReplicas(1)
@ -73,6 +72,9 @@ public class AddReplicaTest extends SolrCloudTestCase {
.setPullReplicas(1);
RequestStatusState status = addReplica.processAndWait(collection + "_xyz1", cloudClient, 120);
assertEquals(COMPLETED, status);
cluster.waitForActiveCollection(collection, 1, 4);
DocCollection docCollection = cloudClient.getZkStateReader().getClusterState().getCollectionOrNull(collection);
assertNotNull(docCollection);
assertEquals(4, docCollection.getReplicas().size());
@ -110,6 +112,7 @@ public class AddReplicaTest extends SolrCloudTestCase {
.setCreateNodeSet(String.join(",", createNodeSet));
status = addReplica.processAndWait(collection + "_xyz1", cloudClient, 120);
assertEquals(COMPLETED, status);
waitForState("Timedout wait for collection to be created", collection, clusterShape(1, 9));
docCollection = cloudClient.getZkStateReader().getClusterState().getCollectionOrNull(collection);
assertNotNull(docCollection);
// sanity check that everything is as before
@ -120,9 +123,8 @@ public class AddReplicaTest extends SolrCloudTestCase {
}
@Test
//commented 2-Aug-2018 @BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 09-Apr-2018
public void test() throws Exception {
cluster.waitForAllNodes(5);
String collection = "addreplicatest_coll";
CloudSolrClient cloudClient = cluster.getSolrClient();
@ -131,6 +133,8 @@ public class AddReplicaTest extends SolrCloudTestCase {
create.setMaxShardsPerNode(2);
cloudClient.request(create);
cluster.waitForActiveCollection(collection, 2, 2);
ClusterState clusterState = cloudClient.getZkStateReader().getClusterState();
DocCollection coll = clusterState.getCollection(collection);
String sliceName = coll.getSlices().iterator().next().getName();
@ -140,6 +144,7 @@ public class AddReplicaTest extends SolrCloudTestCase {
CollectionAdminRequest.RequestStatus requestStatus = CollectionAdminRequest.requestStatus("000");
CollectionAdminRequest.RequestStatusResponse rsp = requestStatus.process(cloudClient);
assertNotSame(rsp.getRequestStatus(), COMPLETED);
// wait for async request success
boolean success = false;
for (int i = 0; i < 200; i++) {
@ -152,11 +157,10 @@ public class AddReplicaTest extends SolrCloudTestCase {
Thread.sleep(500);
}
assertTrue(success);
Collection<Replica> replicas2 = cloudClient.getZkStateReader().getClusterState().getCollection(collection).getSlice(sliceName).getReplicas();
replicas2.removeAll(replicas);
assertEquals(1, replicas2.size());
Replica r = replicas2.iterator().next();
assertNotSame(r.toString(), r.getState(), Replica.State.ACTIVE);
// use waitForFinalState
addReplica.setWaitForFinalState(true);

View File

@ -90,7 +90,11 @@ public class AliasIntegrationTest extends SolrCloudTestCase {
public void testProperties() throws Exception {
CollectionAdminRequest.createCollection("collection1meta", "conf", 2, 1).process(cluster.getSolrClient());
CollectionAdminRequest.createCollection("collection2meta", "conf", 1, 1).process(cluster.getSolrClient());
waitForState("Expected collection1 to be created with 2 shards and 1 replica", "collection1meta", clusterShape(2, 1));
cluster.waitForActiveCollection("collection1meta", 2, 2);
cluster.waitForActiveCollection("collection2meta", 1, 1);
waitForState("Expected collection1 to be created with 2 shards and 1 replica", "collection1meta", clusterShape(2, 2));
waitForState("Expected collection2 to be created with 1 shard and 1 replica", "collection2meta", clusterShape(1, 1));
ZkStateReader zkStateReader = cluster.getSolrClient().getZkStateReader();
zkStateReader.createClusterStateWatchersAndUpdate();
@ -204,7 +208,7 @@ public class AliasIntegrationTest extends SolrCloudTestCase {
@Test
public void testModifyPropertiesV2() throws Exception {
final String aliasName = getTestName();
final String aliasName = getSaferTestName();
ZkStateReader zkStateReader = createColectionsAndAlias(aliasName);
final String baseUrl = cluster.getRandomJetty(random()).getBaseUrl().toString();
//TODO fix Solr test infra so that this /____v2/ becomes /api/
@ -226,7 +230,7 @@ public class AliasIntegrationTest extends SolrCloudTestCase {
@Test
public void testModifyPropertiesV1() throws Exception {
// note we don't use TZ in this test, thus it's UTC
final String aliasName = getTestName();
final String aliasName = getSaferTestName();
ZkStateReader zkStateReader = createColectionsAndAlias(aliasName);
final String baseUrl = cluster.getRandomJetty(random()).getBaseUrl().toString();
HttpGet get = new HttpGet(baseUrl + "/admin/collections?action=ALIASPROP" +
@ -241,7 +245,7 @@ public class AliasIntegrationTest extends SolrCloudTestCase {
@Test
public void testModifyPropertiesCAR() throws Exception {
// note we don't use TZ in this test, thus it's UTC
final String aliasName = getTestName();
final String aliasName = getSaferTestName();
ZkStateReader zkStateReader = createColectionsAndAlias(aliasName);
CollectionAdminRequest.SetAliasProperty setAliasProperty = CollectionAdminRequest.setAliasProperty(aliasName);
setAliasProperty.addProperty("foo","baz");
@ -278,7 +282,11 @@ public class AliasIntegrationTest extends SolrCloudTestCase {
private ZkStateReader createColectionsAndAlias(String aliasName) throws SolrServerException, IOException, KeeperException, InterruptedException {
CollectionAdminRequest.createCollection("collection1meta", "conf", 2, 1).process(cluster.getSolrClient());
CollectionAdminRequest.createCollection("collection2meta", "conf", 1, 1).process(cluster.getSolrClient());
waitForState("Expected collection1 to be created with 2 shards and 1 replica", "collection1meta", clusterShape(2, 1));
cluster.waitForActiveCollection("collection1meta", 2, 2);
cluster.waitForActiveCollection("collection2meta", 1, 1);
waitForState("Expected collection1 to be created with 2 shards and 1 replica", "collection1meta", clusterShape(2, 2));
waitForState("Expected collection2 to be created with 1 shard and 1 replica", "collection2meta", clusterShape(1, 1));
ZkStateReader zkStateReader = cluster.getSolrClient().getZkStateReader();
zkStateReader.createClusterStateWatchersAndUpdate();
@ -326,7 +334,11 @@ public class AliasIntegrationTest extends SolrCloudTestCase {
public void testDeleteAliasWithExistingCollectionName() throws Exception {
CollectionAdminRequest.createCollection("collection_old", "conf", 2, 1).process(cluster.getSolrClient());
CollectionAdminRequest.createCollection("collection_new", "conf", 1, 1).process(cluster.getSolrClient());
waitForState("Expected collection_old to be created with 2 shards and 1 replica", "collection_old", clusterShape(2, 1));
cluster.waitForActiveCollection("collection_old", 2, 2);
cluster.waitForActiveCollection("collection_new", 1, 1);
waitForState("Expected collection_old to be created with 2 shards and 1 replica", "collection_old", clusterShape(2, 2));
waitForState("Expected collection_new to be created with 1 shard and 1 replica", "collection_new", clusterShape(1, 1));
new UpdateRequest()
@ -399,7 +411,11 @@ public class AliasIntegrationTest extends SolrCloudTestCase {
public void testDeleteOneOfTwoCollectionsAliased() throws Exception {
CollectionAdminRequest.createCollection("collection_one", "conf", 2, 1).process(cluster.getSolrClient());
CollectionAdminRequest.createCollection("collection_two", "conf", 1, 1).process(cluster.getSolrClient());
waitForState("Expected collection_one to be created with 2 shards and 1 replica", "collection_one", clusterShape(2, 1));
cluster.waitForActiveCollection("collection_one", 2, 2);
cluster.waitForActiveCollection("collection_two", 1, 1);
waitForState("Expected collection_one to be created with 2 shards and 1 replica", "collection_one", clusterShape(2, 2));
waitForState("Expected collection_two to be created with 1 shard and 1 replica", "collection_two", clusterShape(1, 1));
new UpdateRequest()
@ -439,8 +455,9 @@ public class AliasIntegrationTest extends SolrCloudTestCase {
// was deleted (and, assuming that it only points to collection_old).
try {
cluster.getSolrClient().query("collection_one", new SolrQuery("*:*"));
} catch (SolrServerException se) {
assertTrue(se.getMessage().contains("No live SolrServers"));
fail("should have failed");
} catch (SolrServerException | SolrException se) {
}
// Clean up
@ -464,7 +481,11 @@ public class AliasIntegrationTest extends SolrCloudTestCase {
public void test() throws Exception {
CollectionAdminRequest.createCollection("collection1", "conf", 2, 1).process(cluster.getSolrClient());
CollectionAdminRequest.createCollection("collection2", "conf", 1, 1).process(cluster.getSolrClient());
waitForState("Expected collection1 to be created with 2 shards and 1 replica", "collection1", clusterShape(2, 1));
cluster.waitForActiveCollection("collection1", 2, 2);
cluster.waitForActiveCollection("collection2", 1, 1);
waitForState("Expected collection1 to be created with 2 shards and 1 replica", "collection1", clusterShape(2, 2));
waitForState("Expected collection2 to be created with 1 shard and 1 replica", "collection2", clusterShape(1, 1));
new UpdateRequest()
@ -495,6 +516,8 @@ public class AliasIntegrationTest extends SolrCloudTestCase {
// test alias pointing to two collections. collection2 first because it's not on every node
CollectionAdminRequest.createAlias("testalias2", "collection2,collection1").process(cluster.getSolrClient());
Thread.sleep(100);
searchSeveralWays("testalias2", new SolrQuery("*:*"), 5);
///////////////
@ -618,7 +641,9 @@ public class AliasIntegrationTest extends SolrCloudTestCase {
@Test
public void testErrorChecks() throws Exception {
CollectionAdminRequest.createCollection("testErrorChecks-collection", "conf", 2, 1).process(cluster.getSolrClient());
waitForState("Expected testErrorChecks-collection to be created with 2 shards and 1 replica", "testErrorChecks-collection", clusterShape(2, 1));
cluster.waitForActiveCollection("testErrorChecks-collection", 2, 2);
waitForState("Expected testErrorChecks-collection to be created with 2 shards and 1 replica", "testErrorChecks-collection", clusterShape(2, 2));
ignoreException(".");

View File

@ -56,8 +56,6 @@ public class AssignBackwardCompatibilityTest extends SolrCloudTestCase {
}
@Test
//05-Jul-2018 @BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 21-May-2018
@BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 2-Aug-2018
public void test() throws IOException, SolrServerException, KeeperException, InterruptedException {
Set<String> coreNames = new HashSet<>();
Set<String> coreNodeNames = new HashSet<>();
@ -81,6 +79,7 @@ public class AssignBackwardCompatibilityTest extends SolrCloudTestCase {
DocCollection dc = getCollectionState(COLLECTION);
Replica replica = getRandomReplica(dc.getSlice("shard1"), (r) -> r.getState() == Replica.State.ACTIVE);
CollectionAdminRequest.deleteReplica(COLLECTION, "shard1", replica.getName()).process(cluster.getSolrClient());
coreNames.remove(replica.getCoreName());
numLiveReplicas--;
} else {
CollectionAdminResponse response = CollectionAdminRequest.addReplicaToShard(COLLECTION, "shard1")

View File

@ -40,7 +40,7 @@ public class AsyncCallRequestStatusResponseTest extends SolrCloudTestCase {
String asyncId =
CollectionAdminRequest.createCollection("asynccall", "conf", 2, 1).processAsync(cluster.getSolrClient());
waitForState("Expected collection 'asynccall' to have 2 shards and 1 replica", "asynccall", clusterShape(2, 1));
waitForState("Expected collection 'asynccall' to have 2 shards and 1 replica", "asynccall", clusterShape(2, 2));
int tries = 0;
while (true) {

View File

@ -67,7 +67,7 @@ public class BasicDistributedZk2Test extends AbstractFullDistribZkTestBase {
@Override
protected boolean useTlogReplicas() {
return onlyLeaderIndexes;
return false; // TODO: tlog replicas makes commits take way to long due to what is likely a bug and it's TestInjection use
}
@Test
@ -351,7 +351,7 @@ public class BasicDistributedZk2Test extends AbstractFullDistribZkTestBase {
// query("q","matchesnothing","fl","*,score", "debugQuery", "true");
// this should trigger a recovery phase on deadShard
ChaosMonkey.start(deadShard.jetty);
deadShard.jetty.start();
// make sure we have published we are recovering
Thread.sleep(1500);
@ -381,7 +381,7 @@ public class BasicDistributedZk2Test extends AbstractFullDistribZkTestBase {
Thread.sleep(1500);
ChaosMonkey.start(deadShard.jetty);
deadShard.jetty.start();
// make sure we have published we are recovering
Thread.sleep(1500);

View File

@ -28,12 +28,16 @@ import java.util.Map;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.CompletionService;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorCompletionService;
import java.util.concurrent.Future;
import java.util.concurrent.SynchronousQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.util.IOUtils;
@ -74,7 +78,9 @@ import org.apache.solr.common.params.UpdateParams;
import org.apache.solr.common.util.ExecutorUtil;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.util.DefaultSolrThreadFactory;
import org.apache.solr.util.RTimer;
import org.apache.solr.util.TestInjection;
import org.apache.solr.util.TestInjection.Hook;
import org.junit.BeforeClass;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -86,7 +92,6 @@ import org.slf4j.LoggerFactory;
*/
@Slow
@SuppressSSL(bugUrl = "https://issues.apache.org/jira/browse/SOLR-5776")
// DO NOT ENABLE @LuceneTestCase.BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 2018-06-18
public class BasicDistributedZkTest extends AbstractFullDistribZkTestBase {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
@ -94,6 +99,7 @@ public class BasicDistributedZkTest extends AbstractFullDistribZkTestBase {
private static final String DEFAULT_COLLECTION = "collection1";
private final boolean onlyLeaderIndexes = random().nextBoolean();
String t1="a_t";
String i1="a_i1";
String tlong = "other_tl1";
@ -108,13 +114,37 @@ public class BasicDistributedZkTest extends AbstractFullDistribZkTestBase {
private AtomicInteger nodeCounter = new AtomicInteger();
ThreadPoolExecutor executor = new ExecutorUtil.MDCAwareThreadPoolExecutor(0,
Integer.MAX_VALUE, 5, TimeUnit.SECONDS, new SynchronousQueue<Runnable>(),
new DefaultSolrThreadFactory("testExecutor"));
CompletionService<Object> completionService;
Set<Future<Object>> pending;
private static Hook newSearcherHook = new Hook() {
volatile CountDownLatch latch;
AtomicReference<String> collection = new AtomicReference<>();
@Override
public void newSearcher(String collectionName) {
String c = collection.get();
if (c != null && c.equals(collectionName)) {
log.info("Hook detected newSearcher");
try {
latch.countDown();
} catch (NullPointerException e) {
}
}
}
public void waitForSearcher(String collection, int cnt, int timeoutms, boolean failOnTimeout) throws InterruptedException {
latch = new CountDownLatch(cnt);
this.collection.set(collection);
boolean timeout = !latch.await(timeoutms, TimeUnit.MILLISECONDS);
if (timeout && failOnTimeout) {
fail("timed out waiting for new searcher event " + latch.getCount());
}
}
};
public BasicDistributedZkTest() {
// we need DVs on point fields to compute stats & facets
if (Boolean.getBoolean(NUMERIC_POINTS_SYSPROP)) System.setProperty(NUMERIC_DOCVALUES_SYSPROP,"true");
@ -125,9 +155,14 @@ public class BasicDistributedZkTest extends AbstractFullDistribZkTestBase {
}
@BeforeClass
public static void beforeBDZKTClass() {
TestInjection.newSearcherHook(newSearcherHook);
}
@Override
protected boolean useTlogReplicas() {
return onlyLeaderIndexes;
return false; // TODO: tlog replicas makes commits take way to long due to what is likely a bug and it's TestInjection use
}
@Override
@ -149,8 +184,6 @@ public class BasicDistributedZkTest extends AbstractFullDistribZkTestBase {
@Test
@ShardsFixed(num = 4)
//DO NOT ENABLE @BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 12-Jun-2018
@BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028") // 14-Oct-2018
public void test() throws Exception {
// setLoggingLevel(null);
@ -345,7 +378,12 @@ public class BasicDistributedZkTest extends AbstractFullDistribZkTestBase {
params.set("commitWithin", 10);
add(cloudClient, params , getDoc("id", 300), getDoc("id", 301));
waitForDocCount(before + 2, 30000, "add commitWithin did not work");
newSearcherHook.waitForSearcher(DEFAULT_COLLECTION, 2, 20000, false);
ClusterState clusterState = getCommonCloudSolrClient().getZkStateReader().getClusterState();
DocCollection dColl = clusterState.getCollection(DEFAULT_COLLECTION);
assertSliceCounts("should have found 2 docs, 300 and 301", before + 2, dColl);
// try deleteById commitWithin
UpdateRequest deleteByIdReq = new UpdateRequest();
@ -353,7 +391,9 @@ public class BasicDistributedZkTest extends AbstractFullDistribZkTestBase {
deleteByIdReq.setCommitWithin(10);
deleteByIdReq.process(cloudClient);
waitForDocCount(before + 1, 30000, "deleteById commitWithin did not work");
newSearcherHook.waitForSearcher(DEFAULT_COLLECTION, 2, 20000, false);
assertSliceCounts("deleteById commitWithin did not work", before + 1, dColl);
// try deleteByQuery commitWithin
UpdateRequest deleteByQueryReq = new UpdateRequest();
@ -361,7 +401,10 @@ public class BasicDistributedZkTest extends AbstractFullDistribZkTestBase {
deleteByQueryReq.setCommitWithin(10);
deleteByQueryReq.process(cloudClient);
waitForDocCount(before, 30000, "deleteByQuery commitWithin did not work");
newSearcherHook.waitForSearcher(DEFAULT_COLLECTION, 2, 20000, false);
assertSliceCounts("deleteByQuery commitWithin did not work", before, dColl);
// TODO: This test currently fails because debug info is obtained only
// on shards with matches.
@ -384,24 +427,41 @@ public class BasicDistributedZkTest extends AbstractFullDistribZkTestBase {
testStopAndStartCoresInOneInstance();
}
// Insure that total docs found is the expected number.
private void assertSliceCounts(String msg, long expected, DocCollection dColl) throws Exception {
long found = checkSlicesSameCounts(dColl);
if (found != expected) {
// we get one do over in a bad race
Thread.sleep(1000);
found = checkSlicesSameCounts(dColl);
}
assertEquals(msg, expected, checkSlicesSameCounts(dColl));
}
// Ensure that total docs found is the expected number.
private void waitForDocCount(long expectedNumFound, long waitMillis, String failureMessage)
throws Exception {
RTimer timer = new RTimer();
long timeout = (long)timer.getTime() + waitMillis;
ClusterState clusterState = getCommonCloudSolrClient().getZkStateReader().getClusterState();
DocCollection dColl = clusterState.getCollection(DEFAULT_COLLECTION);
long docTotal = -1; // Could use this for 0 hits too!
while (docTotal != expectedNumFound && timeout > (long) timer.getTime()) {
docTotal = checkSlicesSameCounts(dColl);
if (docTotal != expectedNumFound) {
Thread.sleep(100);
AtomicLong total = new AtomicLong(-1);
try {
getCommonCloudSolrClient().getZkStateReader().waitForState(DEFAULT_COLLECTION, waitMillis, TimeUnit.MILLISECONDS, (n, c) -> {
long docTotal;
try {
docTotal = checkSlicesSameCounts(c);
} catch (SolrServerException | IOException e) {
throw new RuntimeException(e);
}
total.set(docTotal);
if (docTotal == expectedNumFound) {
return true;
}
return false;
});
} catch (TimeoutException | InterruptedException e) {
}
// We could fail here if we broke out of the above because we exceeded the time allowed.
assertEquals(failureMessage, expectedNumFound, docTotal);
assertEquals(failureMessage, expectedNumFound, total.get());
// This should be redundant, but it caught a test error after all.
for (SolrClient client : clients) {
@ -557,11 +617,10 @@ public class BasicDistributedZkTest extends AbstractFullDistribZkTestBase {
}
}
ChaosMonkey.stop(cloudJettys.get(0).jetty);
cloudJettys.get(0).jetty.stop();
printLayout();
Thread.sleep(5000);
ChaosMonkey.start(cloudJettys.get(0).jetty);
cloudJettys.get(0).jetty.start();
cloudClient.getZkStateReader().forceUpdateCollection("multiunload2");
try {
cloudClient.getZkStateReader().getLeaderRetry("multiunload2", "shard1", 30000);
@ -803,6 +862,8 @@ public class BasicDistributedZkTest extends AbstractFullDistribZkTestBase {
for (String coreName : resp.getCollectionCoresStatus().keySet()) {
collectionClients.add(createNewSolrClient(coreName, jettys.get(0).getBaseUrl().toString()));
}
}
SolrClient client1 = collectionClients.get(0);
@ -864,15 +925,36 @@ public class BasicDistributedZkTest extends AbstractFullDistribZkTestBase {
String leader = props.getCoreUrl();
unloadClient.request(unloadCmd);
testExecutor.execute(new Runnable() {
int tries = 50;
while (leader.equals(zkStateReader.getLeaderUrl(oneInstanceCollection2, "shard1", 10000))) {
Thread.sleep(100);
if (tries-- == 0) {
fail("Leader never changed");
@Override
public void run() {
try {
unloadClient.request(unloadCmd);
} catch (SolrServerException e) {
throw new RuntimeException(e);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
});
try {
getCommonCloudSolrClient().getZkStateReader().waitForState(oneInstanceCollection2, 20000, TimeUnit.MILLISECONDS, (n, c) -> {
try {
if (leader.equals(zkStateReader.getLeaderUrl(oneInstanceCollection2, "shard1", 10000))) {
return false;
}
} catch (InterruptedException e) {
throw new RuntimeException(e);
}
return true;
});
} catch (TimeoutException | InterruptedException e) {
fail("Leader never changed");
}
}
IOUtils.close(collectionClients);
@ -1036,10 +1118,10 @@ public class BasicDistributedZkTest extends AbstractFullDistribZkTestBase {
long collection2Docs = otherCollectionClients.get("collection2").get(0)
.query(new SolrQuery("*:*")).getResults().getNumFound();
System.out.println("found2: "+ collection2Docs);
long collection3Docs = otherCollectionClients.get("collection3").get(0)
.query(new SolrQuery("*:*")).getResults().getNumFound();
System.out.println("found3: "+ collection3Docs);
SolrQuery query = new SolrQuery("*:*");
query.set("collection", "collection2,collection3");

View File

@ -115,7 +115,7 @@ public class BasicZkTest extends AbstractZkTestCase {
// try a reconnect from disconnect
zkServer = new ZkTestServer(zkDir, zkPort);
zkServer.run();
zkServer.run(false);
Thread.sleep(300);

View File

@ -23,7 +23,6 @@ import java.util.Set;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.LuceneTestCase.Slow;
import org.apache.solr.SolrTestCaseJ4.SuppressObjectReleaseTracker;
import org.apache.solr.SolrTestCaseJ4.SuppressSSL;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.impl.CloudSolrClient;
@ -35,8 +34,6 @@ import org.junit.Test;
@Slow
@SuppressSSL(bugUrl = "https://issues.apache.org/jira/browse/SOLR-5776")
//@ThreadLeakLingering(linger = 60000)
@SuppressObjectReleaseTracker(bugUrl="Testing purposes")
public class ChaosMonkeyNothingIsSafeTest extends AbstractFullDistribZkTestBase {
private static final int FAIL_TOLERANCE = 100;
@ -48,6 +45,9 @@ public class ChaosMonkeyNothingIsSafeTest extends AbstractFullDistribZkTestBase
public static void beforeSuperClass() {
schemaString = "schema15.xml"; // we need a string id
System.setProperty("solr.autoCommit.maxTime", "15000");
System.clearProperty("solr.httpclient.retries");
System.clearProperty("solr.retries.on.forward");
System.clearProperty("solr.retries.to.followers");
setErrorHook();
}
@ -57,11 +57,23 @@ public class ChaosMonkeyNothingIsSafeTest extends AbstractFullDistribZkTestBase
clearErrorHook();
}
@Override
protected void destroyServers() throws Exception {
super.destroyServers();
}
protected static final String[] fieldNames = new String[]{"f_i", "f_f", "f_d", "f_l", "f_dt"};
protected static final RandVal[] randVals = new RandVal[]{rint, rfloat, rdouble, rlong, rdate};
private int clientSoTimeout = 60000;
private volatile FullThrottleStoppableIndexingThread ftIndexThread;
private final boolean runFullThrottle;
public String[] getFieldNames() {
return fieldNames;
}
@ -78,6 +90,16 @@ public class ChaosMonkeyNothingIsSafeTest extends AbstractFullDistribZkTestBase
useFactory("solr.StandardDirectoryFactory");
}
@Override
public void distribTearDown() throws Exception {
try {
ftIndexThread.safeStop();
} catch (NullPointerException e) {
// okay
}
super.distribTearDown();
}
public ChaosMonkeyNothingIsSafeTest() {
super();
sliceCount = Integer.parseInt(System.getProperty("solr.tests.cloud.cm.slicecount", "-1"));
@ -94,11 +116,15 @@ public class ChaosMonkeyNothingIsSafeTest extends AbstractFullDistribZkTestBase
fixShardCount(numShards);
// TODO: we only do this sometimes so that we can sometimes compare against control,
// it's currently hard to know what requests failed when using ConcurrentSolrUpdateServer
runFullThrottle = random().nextBoolean();
}
@Override
protected boolean useTlogReplicas() {
return onlyLeaderIndexes;
return false; // TODO: tlog replicas makes commits take way to long due to what is likely a bug and it's TestInjection use
}
@Override
@ -119,9 +145,9 @@ public class ChaosMonkeyNothingIsSafeTest extends AbstractFullDistribZkTestBase
// None of the operations used here are particularly costly, so this should work.
// Using this low timeout will also help us catch index stalling.
clientSoTimeout = 5000;
cloudClient = createCloudClient(DEFAULT_COLLECTION);
boolean testSuccessful = false;
try {
try (CloudSolrClient ourCloudClient = createCloudClient(DEFAULT_COLLECTION)) {
handle.clear();
handle.put("timestamp", SKIPVAL);
ZkStateReader zkStateReader = cloudClient.getZkStateReader();
@ -155,13 +181,9 @@ public class ChaosMonkeyNothingIsSafeTest extends AbstractFullDistribZkTestBase
searchThread.start();
}
// TODO: we only do this sometimes so that we can sometimes compare against control,
// it's currently hard to know what requests failed when using ConcurrentSolrUpdateServer
boolean runFullThrottle = random().nextBoolean();
if (runFullThrottle) {
FullThrottleStoppableIndexingThread ftIndexThread =
new FullThrottleStoppableIndexingThread(controlClient, cloudClient, clients, "ft1", true, this.clientSoTimeout);
threads.add(ftIndexThread);
ftIndexThread =
new FullThrottleStoppableIndexingThread(cloudClient.getHttpClient(),controlClient, cloudClient, clients, "ft1", true, this.clientSoTimeout);
ftIndexThread.start();
}
@ -189,6 +211,11 @@ public class ChaosMonkeyNothingIsSafeTest extends AbstractFullDistribZkTestBase
// ideally this should go into chaosMonkey
restartZk(1000 * (5 + random().nextInt(4)));
if (runFullThrottle) {
ftIndexThread.safeStop();
}
for (StoppableThread indexThread : threads) {
indexThread.safeStop();
}
@ -219,7 +246,6 @@ public class ChaosMonkeyNothingIsSafeTest extends AbstractFullDistribZkTestBase
zkStateReader.updateLiveNodes();
assertTrue(zkStateReader.getClusterState().getLiveNodes().size() > 0);
// we expect full throttle fails, but cloud client should not easily fail
for (StoppableThread indexThread : threads) {
if (indexThread instanceof StoppableIndexingThread && !(indexThread instanceof FullThrottleStoppableIndexingThread)) {
@ -230,6 +256,10 @@ public class ChaosMonkeyNothingIsSafeTest extends AbstractFullDistribZkTestBase
}
waitForThingsToLevelOut(20);
commit();
Set<String> addFails = getAddFails(indexTreads);
Set<String> deleteFails = getDeleteFails(indexTreads);
// full throttle thread can
@ -253,7 +283,7 @@ public class ChaosMonkeyNothingIsSafeTest extends AbstractFullDistribZkTestBase
// sometimes we restart zookeeper as well
if (random().nextBoolean()) {
restartZk(1000 * (5 + random().nextInt(4)));
// restartZk(1000 * (5 + random().nextInt(4)));
}
try (CloudSolrClient client = createCloudClient("collection1", 30000)) {

View File

@ -25,7 +25,6 @@ import java.util.Set;
import java.util.concurrent.TimeUnit;
import org.apache.lucene.util.LuceneTestCase.Slow;
import org.apache.solr.SolrTestCaseJ4.SuppressObjectReleaseTracker;
import org.apache.solr.SolrTestCaseJ4.SuppressSSL;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.impl.CloudSolrClient;
@ -43,12 +42,8 @@ import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.carrotsearch.randomizedtesting.annotations.ThreadLeakLingering;
@Slow
@SuppressSSL(bugUrl = "https://issues.apache.org/jira/browse/SOLR-5776")
@ThreadLeakLingering(linger = 60000)
@SuppressObjectReleaseTracker(bugUrl="Testing purposes")
public class ChaosMonkeyNothingIsSafeWithPullReplicasTest extends AbstractFullDistribZkTestBase {
private static final int FAIL_TOLERANCE = 100;
@ -71,6 +66,9 @@ public class ChaosMonkeyNothingIsSafeWithPullReplicasTest extends AbstractFullDi
if (usually()) {
System.setProperty("solr.autoCommit.maxTime", "15000");
}
System.clearProperty("solr.httpclient.retries");
System.clearProperty("solr.retries.on.forward");
System.clearProperty("solr.retries.to.followers");
TestInjection.waitForReplicasInSync = null;
setErrorHook();
}
@ -85,7 +83,11 @@ public class ChaosMonkeyNothingIsSafeWithPullReplicasTest extends AbstractFullDi
protected static final String[] fieldNames = new String[]{"f_i", "f_f", "f_d", "f_l", "f_dt"};
protected static final RandVal[] randVals = new RandVal[]{rint, rfloat, rdouble, rlong, rdate};
private int clientSoTimeout = 60000;
private int clientSoTimeout;
private volatile FullThrottleStoppableIndexingThread ftIndexThread;
private final boolean runFullThrottle;
public String[] getFieldNames() {
return fieldNames;
@ -103,6 +105,16 @@ public class ChaosMonkeyNothingIsSafeWithPullReplicasTest extends AbstractFullDi
useFactory("solr.StandardDirectoryFactory");
}
@Override
public void distribTearDown() throws Exception {
try {
ftIndexThread.safeStop();
} catch (NullPointerException e) {
// okay
}
super.distribTearDown();
}
public ChaosMonkeyNothingIsSafeWithPullReplicasTest() {
super();
numPullReplicas = random().nextInt(TEST_NIGHTLY ? 2 : 1) + 1;
@ -116,12 +128,12 @@ public class ChaosMonkeyNothingIsSafeWithPullReplicasTest extends AbstractFullDi
fixShardCount(numNodes);
log.info("Starting ChaosMonkey test with {} shards and {} nodes", sliceCount, numNodes);
runFullThrottle = random().nextBoolean();
}
@Override
protected boolean useTlogReplicas() {
return useTlogReplicas;
return false; // TODO: tlog replicas makes commits take way to long due to what is likely a bug and it's TestInjection use
}
@Override
@ -140,8 +152,8 @@ public class ChaosMonkeyNothingIsSafeWithPullReplicasTest extends AbstractFullDi
public void test() throws Exception {
// None of the operations used here are particularly costly, so this should work.
// Using this low timeout will also help us catch index stalling.
clientSoTimeout = 5000;
cloudClient = createCloudClient(DEFAULT_COLLECTION);
clientSoTimeout = 8000;
DocCollection docCollection = cloudClient.getZkStateReader().getClusterState().getCollection(DEFAULT_COLLECTION);
assertEquals(this.sliceCount, docCollection.getSlices().size());
Slice s = docCollection.getSlice("shard1");
@ -163,8 +175,6 @@ public class ChaosMonkeyNothingIsSafeWithPullReplicasTest extends AbstractFullDi
waitForRecoveriesToFinish(false);
// we cannot do delete by query
// as it's not supported for recovery
del("*:*");
List<StoppableThread> threads = new ArrayList<>();
@ -172,7 +182,7 @@ public class ChaosMonkeyNothingIsSafeWithPullReplicasTest extends AbstractFullDi
int threadCount = TEST_NIGHTLY ? 3 : 1;
int i = 0;
for (i = 0; i < threadCount; i++) {
StoppableIndexingThread indexThread = new StoppableIndexingThread(controlClient, cloudClient, Integer.toString(i), true);
StoppableIndexingThread indexThread = new StoppableIndexingThread(controlClient, cloudClient, Integer.toString(i), true, 35, 1, true);
threads.add(indexThread);
indexTreads.add(indexThread);
indexThread.start();
@ -192,13 +202,9 @@ public class ChaosMonkeyNothingIsSafeWithPullReplicasTest extends AbstractFullDi
commitThread.start();
}
// TODO: we only do this sometimes so that we can sometimes compare against control,
// it's currently hard to know what requests failed when using ConcurrentSolrUpdateServer
boolean runFullThrottle = random().nextBoolean();
if (runFullThrottle) {
FullThrottleStoppableIndexingThread ftIndexThread =
new FullThrottleStoppableIndexingThread(controlClient, cloudClient, clients, "ft1", true, this.clientSoTimeout);
threads.add(ftIndexThread);
ftIndexThread =
new FullThrottleStoppableIndexingThread(cloudClient.getHttpClient(), controlClient, cloudClient, clients, "ft1", true, this.clientSoTimeout);
ftIndexThread.start();
}
@ -213,7 +219,7 @@ public class ChaosMonkeyNothingIsSafeWithPullReplicasTest extends AbstractFullDi
runTimes = new int[] {5000, 6000, 10000, 15000, 25000, 30000,
30000, 45000, 90000, 120000};
} else {
runTimes = new int[] {5000, 7000, 15000};
runTimes = new int[] {5000, 7000, 10000};
}
runLength = runTimes[random().nextInt(runTimes.length - 1)];
}
@ -225,6 +231,10 @@ public class ChaosMonkeyNothingIsSafeWithPullReplicasTest extends AbstractFullDi
// ideally this should go into chaosMonkey
restartZk(1000 * (5 + random().nextInt(4)));
if (runFullThrottle) {
ftIndexThread.safeStop();
}
for (StoppableThread indexThread : threads) {
indexThread.safeStop();
}

View File

@ -38,6 +38,9 @@ public class ChaosMonkeySafeLeaderTest extends AbstractFullDistribZkTestBase {
public static void beforeSuperClass() {
schemaString = "schema15.xml"; // we need a string id
System.setProperty("solr.autoCommit.maxTime", "15000");
System.clearProperty("solr.httpclient.retries");
System.clearProperty("solr.retries.on.forward");
System.clearProperty("solr.retries.to.followers");
setErrorHook();
}
@ -81,7 +84,6 @@ public class ChaosMonkeySafeLeaderTest extends AbstractFullDistribZkTestBase {
}
@Test
// 29-June-2018 @BadApple(bugUrl="https://issues.apache.org/jira/browse/SOLR-12028")
public void test() throws Exception {
handle.clear();
@ -170,7 +172,7 @@ public class ChaosMonkeySafeLeaderTest extends AbstractFullDistribZkTestBase {
if (random().nextBoolean()) {
zkServer.shutdown();
zkServer = new ZkTestServer(zkServer.getZkDir(), zkServer.getPort());
zkServer.run();
zkServer.run(false);
}
try (CloudSolrClient client = createCloudClient("collection1")) {

View File

@ -23,7 +23,6 @@ import java.util.List;
import java.util.concurrent.TimeUnit;
import org.apache.lucene.util.LuceneTestCase.Slow;
import org.apache.solr.SolrTestCaseJ4.SuppressObjectReleaseTracker;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.CloudSolrClient;
@ -42,7 +41,6 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@Slow
@SuppressObjectReleaseTracker(bugUrl="Testing purposes")
public class ChaosMonkeySafeLeaderWithPullReplicasTest extends AbstractFullDistribZkTestBase {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
@ -60,7 +58,7 @@ public class ChaosMonkeySafeLeaderWithPullReplicasTest extends AbstractFullDistr
@Override
protected boolean useTlogReplicas() {
return useTlogReplicas;
return false; // TODO: tlog replicas makes commits take way to long due to what is likely a bug and it's TestInjection use
}
@BeforeClass
@ -69,6 +67,9 @@ public class ChaosMonkeySafeLeaderWithPullReplicasTest extends AbstractFullDistr
if (usually()) {
System.setProperty("solr.autoCommit.maxTime", "15000");
}
System.clearProperty("solr.httpclient.retries");
System.clearProperty("solr.retries.on.forward");
System.clearProperty("solr.retries.to.followers");
TestInjection.waitForReplicasInSync = null;
setErrorHook();
}
@ -99,8 +100,8 @@ public class ChaosMonkeySafeLeaderWithPullReplicasTest extends AbstractFullDistr
public ChaosMonkeySafeLeaderWithPullReplicasTest() {
super();
numPullReplicas = random().nextInt(TEST_NIGHTLY ? 3 : 2) + 1;;
numRealtimeOrTlogReplicas = random().nextInt(TEST_NIGHTLY ? 3 : 2) + 1;;
numPullReplicas = random().nextInt(TEST_NIGHTLY ? 3 : 2) + 1;
numRealtimeOrTlogReplicas = random().nextInt(TEST_NIGHTLY ? 3 : 2) + 1;
sliceCount = Integer.parseInt(System.getProperty("solr.tests.cloud.cm.slicecount", "-1"));
if (sliceCount == -1) {
sliceCount = random().nextInt(TEST_NIGHTLY ? 3 : 2) + 1;
@ -219,7 +220,7 @@ public class ChaosMonkeySafeLeaderWithPullReplicasTest extends AbstractFullDistr
if (random().nextBoolean()) {
zkServer.shutdown();
zkServer = new ZkTestServer(zkServer.getZkDir(), zkServer.getPort());
zkServer.run();
zkServer.run(false);
}
try (CloudSolrClient client = createCloudClient("collection1")) {

View File

@ -36,10 +36,12 @@ import org.apache.solr.common.cloud.Slice;
import org.apache.solr.common.cloud.SolrZkClient;
import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.core.CloudConfig;
import org.apache.solr.handler.component.HttpShardHandler;
import org.apache.solr.handler.component.HttpShardHandlerFactory;
import org.apache.solr.update.UpdateShardHandler;
import org.apache.solr.update.UpdateShardHandlerConfig;
import org.apache.zookeeper.KeeperException;
import org.junit.BeforeClass;
import org.junit.Ignore;
import org.junit.Test;
import org.slf4j.Logger;
@ -57,6 +59,13 @@ public class ChaosMonkeyShardSplitTest extends ShardSplitTest {
static final int TIMEOUT = 10000;
private AtomicInteger killCounter = new AtomicInteger();
@BeforeClass
public static void beforeSuperClass() {
System.clearProperty("solr.httpclient.retries");
System.clearProperty("solr.retries.on.forward");
System.clearProperty("solr.retries.to.followers");
}
@Test
public void test() throws Exception {
waitForThingsToLevelOut(15);
@ -100,7 +109,7 @@ public class ChaosMonkeyShardSplitTest extends ShardSplitTest {
// kill the leader
CloudJettyRunner leaderJetty = shardToLeaderJetty.get("shard1");
chaosMonkey.killJetty(leaderJetty);
leaderJetty.jetty.stop();
Thread.sleep(2000);
@ -122,7 +131,7 @@ public class ChaosMonkeyShardSplitTest extends ShardSplitTest {
}
// bring back dead node
ChaosMonkey.start(deadJetty.jetty); // he is not the leader anymore
deadJetty.jetty.start(); // he is not the leader anymore
waitTillRecovered();
@ -251,7 +260,7 @@ public class ChaosMonkeyShardSplitTest extends ShardSplitTest {
LeaderElector overseerElector = new LeaderElector(zkClient);
UpdateShardHandler updateShardHandler = new UpdateShardHandler(UpdateShardHandlerConfig.DEFAULT);
// TODO: close Overseer
Overseer overseer = new Overseer(new HttpShardHandlerFactory().getShardHandler(), updateShardHandler, "/admin/cores",
Overseer overseer = new Overseer((HttpShardHandler) new HttpShardHandlerFactory().getShardHandler(), updateShardHandler, "/admin/cores",
reader, null, new CloudConfig.CloudConfigBuilder("127.0.0.1", 8983, "solr").build());
overseer.close();
ElectionContext ec = new OverseerElectionContext(zkClient, overseer,

View File

@ -96,13 +96,13 @@ public class CleanupOldIndexTest extends SolrCloudTestCase {
assertTrue(oldIndexDir2.isDirectory());
// bring shard replica down
ChaosMonkey.stop(jetty);
jetty.stop();
// wait a moment - lets allow some docs to be indexed so replication time is non 0
Thread.sleep(waitTimes[random().nextInt(waitTimes.length - 1)]);
// bring shard replica up
ChaosMonkey.start(jetty);
jetty.start();
// make sure replication can start
Thread.sleep(3000);

View File

@ -136,12 +136,12 @@ public class CloudTestUtils {
boolean requireLeaders) {
return (liveNodes, collectionState) -> {
if (collectionState == null) {
log.trace("-- null collection");
log.info("-- null collection");
return false;
}
Collection<Slice> slices = withInactive ? collectionState.getSlices() : collectionState.getActiveSlices();
if (slices.size() != expectedShards) {
log.trace("-- wrong number of slices, expected={}, found={}: {}", expectedShards, collectionState.getSlices().size(), collectionState.getSlices());
log.info("-- wrong number of slices, expected={}, found={}: {}", expectedShards, collectionState.getSlices().size(), collectionState.getSlices());
return false;
}
Set<String> leaderless = new HashSet<>();
@ -160,14 +160,14 @@ public class CloudTestUtils {
activeReplicas++;
}
if (activeReplicas != expectedReplicas) {
log.trace("-- wrong number of active replicas in slice {}, expected={}, found={}", slice.getName(), expectedReplicas, activeReplicas);
log.info("-- wrong number of active replicas in slice {}, expected={}, found={}", slice.getName(), expectedReplicas, activeReplicas);
return false;
}
}
if (leaderless.isEmpty()) {
return true;
} else {
log.trace("-- shards without leaders: {}", leaderless);
log.info("-- shards without leaders: {}", leaderless);
return false;
}
};

View File

@ -22,6 +22,7 @@ import java.util.Map;
import java.util.Set;
import org.apache.lucene.util.LuceneTestCase.Slow;
import org.apache.solr.client.solrj.embedded.JettySolrRunner;
import org.apache.solr.client.solrj.request.CollectionAdminRequest;
import org.apache.solr.common.cloud.ClusterState;
import org.apache.solr.common.cloud.DocCollection;
@ -44,7 +45,6 @@ public class ClusterStateUpdateTest extends SolrCloudTestCase {
configureCluster(3)
.addConfig("conf", configset("cloud-minimal"))
.configure();
}
@BeforeClass
@ -112,7 +112,7 @@ public class ClusterStateUpdateTest extends SolrCloudTestCase {
assertEquals(3, liveNodes.size());
// shut down node 2
cluster.stopJettySolrRunner(2);
JettySolrRunner j = cluster.stopJettySolrRunner(2);
// slight pause (15s timeout) for watch to trigger
for(int i = 0; i < (5 * 15); i++) {
@ -122,6 +122,8 @@ public class ClusterStateUpdateTest extends SolrCloudTestCase {
Thread.sleep(200);
}
cluster.waitForJettyToStop(j);
assertEquals(2, zkController2.getClusterState().getLiveNodes().size());
cluster.getJettySolrRunner(1).stop();

Some files were not shown because too many files have changed in this diff Show More