From 2c49c4a27d99316c9d962a50171f728de58bfa2a Mon Sep 17 00:00:00 2001 From: Erick Erickson Date: Sat, 31 Oct 2020 19:34:09 -0400 Subject: [PATCH] SOLR-14969: Prevent creating multiple cores with the same name which leads to instabilities (race condition) --- solr/CHANGES.txt | 3 + .../org/apache/solr/core/CoreContainer.java | 141 ++++++++++-------- .../apache/solr/core/TestCoreContainer.java | 103 ++++++++++++- 3 files changed, 179 insertions(+), 68 deletions(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index e41477ed40a..18c309708a0 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -167,6 +167,9 @@ Bug Fixes * SOLR-14940: ReplicationHandler memory leak through SolrCore.closeHooks with unstable ZK connection. (Anver Sotnikov, Mike Drob) +* SOLR-14969: Prevent creating multiple cores with the same name which leads to instabilities (race condition) + (Erick Erickson, Andreas Hubold) + Other Changes --------------------- diff --git a/solr/core/src/java/org/apache/solr/core/CoreContainer.java b/solr/core/src/java/org/apache/solr/core/CoreContainer.java index 79ccd40f94e..2e86b58bd6b 100644 --- a/solr/core/src/java/org/apache/solr/core/CoreContainer.java +++ b/solr/core/src/java/org/apache/solr/core/CoreContainer.java @@ -30,6 +30,7 @@ import java.util.Collection; import java.util.Collections; import java.util.Date; import java.util.HashMap; +import java.util.HashSet; import java.util.LinkedHashSet; import java.util.List; import java.util.Locale; @@ -1244,6 +1245,7 @@ public class CoreContainer { return create(coreName, cfg.getCoreRootDirectory().resolve(coreName), parameters, false); } + Set inFlightCreations = new HashSet<>(); // See SOLR-14969 /** * Creates a new core in a specified instance directory, publishing the core state to the cluster * @@ -1253,77 +1255,93 @@ public class CoreContainer { * @return the newly created core */ public SolrCore create(String coreName, Path instancePath, Map parameters, boolean newCollection) { - - CoreDescriptor cd = new CoreDescriptor(coreName, instancePath, parameters, getContainerProperties(), getZkController()); - - // TODO: There's a race here, isn't there? - // Since the core descriptor is removed when a core is unloaded, it should never be anywhere when a core is created. - if (getAllCoreNames().contains(coreName)) { - log.warn("Creating a core with existing name is not allowed"); - // TODO: Shouldn't this be a BAD_REQUEST? - throw new SolrException(ErrorCode.SERVER_ERROR, "Core with name '" + coreName + "' already exists."); - } - - // Validate paths are relative to known locations to avoid path traversal - assertPathAllowed(cd.getInstanceDir()); - assertPathAllowed(Paths.get(cd.getDataDir())); - - boolean preExisitingZkEntry = false; + SolrCore core = null; + boolean iAdded = false; try { - if (getZkController() != null) { - if (cd.getCloudDescriptor().getCoreNodeName() == null) { - throw new SolrException(ErrorCode.SERVER_ERROR, "coreNodeName missing " + parameters.toString()); + synchronized (inFlightCreations) { + if (inFlightCreations.add(coreName)) { + iAdded = true; + } else { + String msg = "Already creating a core with name '" + coreName + "', call aborted '"; + log.warn(msg); + throw new SolrException(ErrorCode.SERVER_ERROR, msg); } - preExisitingZkEntry = getZkController().checkIfCoreNodeNameAlreadyExists(cd); + } + CoreDescriptor cd = new CoreDescriptor(coreName, instancePath, parameters, getContainerProperties(), getZkController()); + + // Since the core descriptor is removed when a core is unloaded, it should never be anywhere when a core is created. + if (getAllCoreNames().contains(coreName)) { + log.warn("Creating a core with existing name is not allowed: '{}'", coreName); + // TODO: Shouldn't this be a BAD_REQUEST? + throw new SolrException(ErrorCode.SERVER_ERROR, "Core with name '" + coreName + "' already exists."); } - // Much of the logic in core handling pre-supposes that the core.properties file already exists, so create it - // first and clean it up if there's an error. - coresLocator.create(this, cd); + // Validate paths are relative to known locations to avoid path traversal + assertPathAllowed(cd.getInstanceDir()); + assertPathAllowed(Paths.get(cd.getDataDir())); - SolrCore core = null; + boolean preExisitingZkEntry = false; try { - solrCores.waitAddPendingCoreOps(cd.getName()); - core = createFromDescriptor(cd, true, newCollection); - coresLocator.persist(this, cd); // Write out the current core properties in case anything changed when the core was created - } finally { - solrCores.removeFromPendingOps(cd.getName()); - } + if (getZkController() != null) { + if (cd.getCloudDescriptor().getCoreNodeName() == null) { + throw new SolrException(ErrorCode.SERVER_ERROR, "coreNodeName missing " + parameters.toString()); + } + preExisitingZkEntry = getZkController().checkIfCoreNodeNameAlreadyExists(cd); + } + + // Much of the logic in core handling pre-supposes that the core.properties file already exists, so create it + // first and clean it up if there's an error. + coresLocator.create(this, cd); - return core; - } catch (Exception ex) { - // First clean up any core descriptor, there should never be an existing core.properties file for any core that - // failed to be created on-the-fly. - coresLocator.delete(this, cd); - if (isZooKeeperAware() && !preExisitingZkEntry) { try { - getZkController().unregister(coreName, cd); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - SolrException.log(log, null, e); - } catch (KeeperException e) { - SolrException.log(log, null, e); - } catch (Exception e) { - SolrException.log(log, null, e); + solrCores.waitAddPendingCoreOps(cd.getName()); + core = createFromDescriptor(cd, true, newCollection); + coresLocator.persist(this, cd); // Write out the current core properties in case anything changed when the core was created + } finally { + solrCores.removeFromPendingOps(cd.getName()); + } + + return core; + } catch (Exception ex) { + // First clean up any core descriptor, there should never be an existing core.properties file for any core that + // failed to be created on-the-fly. + coresLocator.delete(this, cd); + if (isZooKeeperAware() && !preExisitingZkEntry) { + try { + getZkController().unregister(coreName, cd); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + SolrException.log(log, null, e); + } catch (KeeperException e) { + SolrException.log(log, null, e); + } catch (Exception e) { + SolrException.log(log, null, e); + } + } + + Throwable tc = ex; + Throwable c = null; + do { + tc = tc.getCause(); + if (tc != null) { + c = tc; + } + } while (tc != null); + + String rootMsg = ""; + if (c != null) { + rootMsg = " Caused by: " + c.getMessage(); + } + + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, + "Error CREATEing SolrCore '" + coreName + "': " + ex.getMessage() + rootMsg, ex); + } + } finally { + synchronized (inFlightCreations) { + if (iAdded) { + inFlightCreations.remove(coreName); } } - - Throwable tc = ex; - Throwable c = null; - do { - tc = tc.getCause(); - if (tc != null) { - c = tc; - } - } while (tc != null); - - String rootMsg = ""; - if (c != null) { - rootMsg = " Caused by: " + c.getMessage(); - } - - throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, - "Error CREATEing SolrCore '" + coreName + "': " + ex.getMessage() + rootMsg, ex); } } @@ -1777,6 +1795,7 @@ public class CoreContainer { } if (cd == null) { + log.warn("Cannot unload non-existent core '{}'", name); throw new SolrException(ErrorCode.BAD_REQUEST, "Cannot unload non-existent core [" + name + "]"); } diff --git a/solr/core/src/test/org/apache/solr/core/TestCoreContainer.java b/solr/core/src/test/org/apache/solr/core/TestCoreContainer.java index 2ebeb797caa..6c5093222d3 100644 --- a/solr/core/src/test/org/apache/solr/core/TestCoreContainer.java +++ b/solr/core/src/test/org/apache/solr/core/TestCoreContainer.java @@ -20,6 +20,7 @@ import java.io.File; import java.io.FileOutputStream; import java.nio.file.Path; import java.nio.file.Paths; + import java.util.ArrayList; import java.util.Collection; import java.util.List; @@ -117,22 +118,27 @@ public class TestCoreContainer extends SolrTestCaseJ4 { } } + private static class TestReloadThread extends Thread { + private final CoreContainer cc; + TestReloadThread(CoreContainer cc) { + this.cc = cc; + } + @Override + public void run() { + cc.reload("core1"); + } + } + @Test public void testReloadThreaded() throws Exception { final CoreContainer cc = init(CONFIGSETS_SOLR_XML); cc.create("core1", ImmutableMap.of("configSet", "minimal")); - class TestThread extends Thread { - @Override - public void run() { - cc.reload("core1"); - } - } List threads = new ArrayList<>(); int numThreads = 4; for (int i = 0; i < numThreads; i++) { - threads.add(new TestThread()); + threads.add(new TestReloadThread(cc)); } for (Thread thread : threads) { @@ -147,6 +153,89 @@ public class TestCoreContainer extends SolrTestCaseJ4 { } + private static class TestCreateThread extends Thread { + final CoreContainer cc; + final String coreName; + boolean foundExpectedError = false; + SolrCore core = null; + + TestCreateThread(CoreContainer cc, String coreName) { + this.cc = cc; + this.coreName = coreName; + } + @Override + public void run() { + try { + core = cc.create(coreName, ImmutableMap.of("configSet", "minimal")); + } catch (SolrException e) { + String msg = e.getMessage(); + foundExpectedError = msg.contains("Already creating a core with name") || msg.contains("already exists"); + } + } + + int verifyMe() { + if (foundExpectedError) { + assertNull("failed create should have returned null for core", core); + return 0; + } else { + assertNotNull("Core should not be null if there was no error", core); + return 1; + } + } + } + + @Test + public void testCreateThreaded() throws Exception { + final CoreContainer cc = init(CONFIGSETS_SOLR_XML); + final int NUM_THREADS = 3; + + + // Try this a few times to increase the chances of failure. + for (int idx = 0; idx < 3; ++idx) { + TestCreateThread[] threads = new TestCreateThread[NUM_THREADS]; + // Let's add a little stress in here by using the same core name each + // time around. The final unload in the loop should delete the core and + // allow the next time around to succeed. + // This also checks the bookkeeping in CoreContainer.create + // that prevents muliple simulatneous creations, + // currently "inFlightCreations" + String testName = "coreToTest"; + for (int thread = 0; thread < NUM_THREADS; ++thread) { + threads[thread] = new TestCreateThread(cc, testName); + } + // only one of these should succeed. + for (int thread = 0; thread < NUM_THREADS; ++thread) { + threads[thread].start(); + } + + for (int thread = 0; thread < NUM_THREADS; ++thread) { + threads[thread].join(); + } + + // We can't guarantee that which thread failed, so go find it + int goodCount = 0; + for (int thread = 0; thread < NUM_THREADS; ++thread) { + goodCount += threads[thread].verifyMe(); + } + assertEquals("Only one create should have succeeded", 1, goodCount); + + + // Check bookkeeping by removing and creating the core again, making sure + // we didn't leave the record of trying to create this core around. + // NOTE: unloading the core closes it too. + cc.unload(testName, true, true, true); + cc.create(testName, ImmutableMap.of("configSet", "minimal")); + // This call should fail with a different error because the core was + // created successfully. + SolrException thrown = expectThrows(SolrException.class, () -> + cc.create(testName, ImmutableMap.of("configSet", "minimal"))); + assertTrue("Should have 'already exists' error", thrown.getMessage().contains("already exists")); + + cc.unload(testName, true, true, true); + } + cc.shutdown(); + } + @Test public void testNoCores() throws Exception {