SOLR-9847: Stop blocking further schema updates while waiting for a pending update to propagate to other replicas. This reduces the likelihood of a (time-limited) distributed deadlock during concurrent schema updates.

This commit is contained in:
Steve Rowe 2016-12-20 12:05:33 -05:00
parent b37a72d941
commit 04108d9935
2 changed files with 59 additions and 44 deletions

View File

@ -271,6 +271,10 @@ Bug Fixes
* SOLR-1953: It may be possible for temporary files to accumulate until the Solr process is shut down.
(Karl Wright, Mark Miller)
* SOLR-9847: Stop blocking further schema updates while waiting for a pending update to propagate to other replicas.
This reduces the likelihood of a (time-limited) distributed deadlock during concurrent schema updates.
(Mark Miller, Steve Rowe)
Other Changes
----------------------

View File

@ -88,9 +88,7 @@ public class SchemaManager {
IndexSchema schema = req.getCore().getLatestSchema();
if (schema instanceof ManagedIndexSchema && schema.isMutable()) {
synchronized (schema.getSchemaUpdateLock()) {
return doOperations(ops);
}
} else {
return singletonList(singletonMap(CommandOperation.ERR_MSGS, "schema is not editable"));
}
@ -107,6 +105,10 @@ public class SchemaManager {
TimeOut timeOut = new TimeOut(timeout, TimeUnit.SECONDS);
SolrCore core = req.getCore();
String errorMsg = "Unable to persist managed schema. ";
List errors = Collections.emptyList();
int latestVersion = -1;
synchronized (req.getSchema().getSchemaUpdateLock()) {
while (!timeOut.hasTimedOut()) {
managedIndexSchema = getFreshManagedSchema(req.getCore());
for (CommandOperation op : operations) {
@ -117,8 +119,8 @@ public class SchemaManager {
op.addError("No such operation : " + op.name);
}
}
List errs = CommandOperation.captureErrors(operations);
if (!errs.isEmpty()) return errs;
errors = CommandOperation.captureErrors(operations);
if (!errors.isEmpty()) break;
SolrResourceLoader loader = req.getCore().getResourceLoader();
if (loader instanceof ZkSolrResourceLoader) {
ZkSolrResourceLoader zkLoader = (ZkSolrResourceLoader) loader;
@ -131,11 +133,11 @@ public class SchemaManager {
}
try {
int latestVersion = ZkController.persistConfigResourceToZooKeeper(zkLoader, managedIndexSchema.getSchemaZkVersion(),
managedIndexSchema.getResourceName(), sw.toString().getBytes(StandardCharsets.UTF_8), true);
latestVersion = ZkController.persistConfigResourceToZooKeeper
(zkLoader, managedIndexSchema.getSchemaZkVersion(), managedIndexSchema.getResourceName(),
sw.toString().getBytes(StandardCharsets.UTF_8), true);
req.getCore().getCoreDescriptor().getCoreContainer().reload(req.getCore().getName());
waitForOtherReplicasToUpdate(timeOut, latestVersion);
return Collections.emptyList();
break;
} catch (ZkController.ResourceModifiedInZkException e) {
log.info("Schema was modified by another node. Retrying..");
}
@ -144,15 +146,24 @@ public class SchemaManager {
// only for non-cloud mode
managedIndexSchema.persistManagedSchema(false);
core.setLatestSchema(managedIndexSchema);
return Collections.emptyList();
} catch (SolrException e) {
log.warn(errorMsg);
return singletonList(errorMsg + e.getMessage());
errors = singletonList(errorMsg + e.getMessage());
}
break;
}
}
}
if (req.getCore().getResourceLoader() instanceof ZkSolrResourceLoader) {
// Don't block further schema updates while waiting for a pending update to propagate to other replicas.
// This reduces the likelihood of a (time-limited) distributed deadlock during concurrent schema updates.
waitForOtherReplicasToUpdate(timeOut, latestVersion);
}
if (errors.isEmpty() && timeOut.hasTimedOut()) {
log.warn(errorMsg + "Timed out.");
return singletonList(errorMsg + "Timed out.");
errors = singletonList(errorMsg + "Timed out.");
}
return errors;
}
private void waitForOtherReplicasToUpdate(TimeOut timeOut, int latestVersion) {