mirror of https://github.com/apache/lucene.git
SOLR-11445: Overseer should not hang when process bad message
This commit is contained in:
parent
b21721f152
commit
df5fefb0db
|
@ -159,7 +159,15 @@ public class Overseer implements Closeable {
|
||||||
log.debug("processMessage: workQueueSize: {}, message = {}", workQueue.getStats().getQueueLength(), message);
|
log.debug("processMessage: workQueueSize: {}, message = {}", workQueue.getStats().getQueueLength(), message);
|
||||||
// force flush to ZK after each message because there is no fallback if workQueue items
|
// force flush to ZK after each message because there is no fallback if workQueue items
|
||||||
// are removed from workQueue but fail to be written to ZK
|
// are removed from workQueue but fail to be written to ZK
|
||||||
clusterState = processQueueItem(message, clusterState, zkStateWriter, false, null);
|
try {
|
||||||
|
clusterState = processQueueItem(message, clusterState, zkStateWriter, false, null);
|
||||||
|
} catch (Exception e) {
|
||||||
|
if (isBadMessage(e)) {
|
||||||
|
log.warn("Exception when process message = {}, consider as bad message and poll out from the queue", message);
|
||||||
|
workQueue.poll();
|
||||||
|
}
|
||||||
|
throw e;
|
||||||
|
}
|
||||||
workQueue.poll(); // poll-ing removes the element we got by peek-ing
|
workQueue.poll(); // poll-ing removes the element we got by peek-ing
|
||||||
data = workQueue.peek();
|
data = workQueue.peek();
|
||||||
}
|
}
|
||||||
|
@ -167,33 +175,28 @@ public class Overseer implements Closeable {
|
||||||
if (hadWorkItems) {
|
if (hadWorkItems) {
|
||||||
clusterState = zkStateWriter.writePendingUpdates();
|
clusterState = zkStateWriter.writePendingUpdates();
|
||||||
}
|
}
|
||||||
} catch (KeeperException e) {
|
} catch (KeeperException.SessionExpiredException e) {
|
||||||
if (e.code() == KeeperException.Code.SESSIONEXPIRED) {
|
log.warn("Solr cannot talk to ZK, exiting Overseer work queue loop", e);
|
||||||
log.warn("Solr cannot talk to ZK, exiting Overseer work queue loop", e);
|
return;
|
||||||
return;
|
|
||||||
}
|
|
||||||
log.error("Exception in Overseer work queue loop", e);
|
|
||||||
} catch (InterruptedException e) {
|
} catch (InterruptedException e) {
|
||||||
Thread.currentThread().interrupt();
|
Thread.currentThread().interrupt();
|
||||||
return;
|
return;
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
log.error("Exception in Overseer work queue loop", e);
|
log.error("Exception in Overseer when process message from work queue, retrying", e);
|
||||||
|
refreshClusterState = true;
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
byte[] head = null;
|
byte[] head = null;
|
||||||
try {
|
try {
|
||||||
head = stateUpdateQueue.peek(true);
|
head = stateUpdateQueue.peek(true);
|
||||||
} catch (KeeperException e) {
|
} catch (KeeperException.SessionExpiredException e) {
|
||||||
if (e.code() == KeeperException.Code.SESSIONEXPIRED) {
|
log.warn("Solr cannot talk to ZK, exiting Overseer main queue loop", e);
|
||||||
log.warn("Solr cannot talk to ZK, exiting Overseer main queue loop", e);
|
return;
|
||||||
return;
|
|
||||||
}
|
|
||||||
log.error("Exception in Overseer main queue loop", e);
|
|
||||||
} catch (InterruptedException e) {
|
} catch (InterruptedException e) {
|
||||||
Thread.currentThread().interrupt();
|
Thread.currentThread().interrupt();
|
||||||
return;
|
return;
|
||||||
|
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
log.error("Exception in Overseer main queue loop", e);
|
log.error("Exception in Overseer main queue loop", e);
|
||||||
}
|
}
|
||||||
|
@ -237,16 +240,9 @@ public class Overseer implements Closeable {
|
||||||
// clean work queue
|
// clean work queue
|
||||||
while (workQueue.poll() != null);
|
while (workQueue.poll() != null);
|
||||||
|
|
||||||
} catch (KeeperException.BadVersionException bve) {
|
} catch (KeeperException.SessionExpiredException e) {
|
||||||
log.warn("Bad version writing to ZK using compare-and-set, will force refresh cluster state: {}", bve.getMessage());
|
log.warn("Solr cannot talk to ZK, exiting Overseer main queue loop", e);
|
||||||
refreshClusterState = true;
|
return;
|
||||||
} catch (KeeperException e) {
|
|
||||||
if (e.code() == KeeperException.Code.SESSIONEXPIRED) {
|
|
||||||
log.warn("Solr cannot talk to ZK, exiting Overseer main queue loop", e);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
log.error("Exception in Overseer main queue loop", e);
|
|
||||||
refreshClusterState = true; // force refresh state in case of all errors
|
|
||||||
} catch (InterruptedException e) {
|
} catch (InterruptedException e) {
|
||||||
Thread.currentThread().interrupt();
|
Thread.currentThread().interrupt();
|
||||||
return;
|
return;
|
||||||
|
@ -262,6 +258,16 @@ public class Overseer implements Closeable {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Return true whenever the exception thrown by ZkStateWriter is correspond
|
||||||
|
// to a invalid state or 'bad' message (in this case, we should remove that message from queue)
|
||||||
|
private boolean isBadMessage(Exception e) {
|
||||||
|
if (e instanceof KeeperException) {
|
||||||
|
KeeperException ke = (KeeperException) e;
|
||||||
|
return ke.code() == KeeperException.Code.NONODE || ke.code() == KeeperException.Code.NODEEXISTS;
|
||||||
|
}
|
||||||
|
return !(e instanceof InterruptedException);
|
||||||
|
}
|
||||||
|
|
||||||
private ClusterState processQueueItem(ZkNodeProps message, ClusterState clusterState, ZkStateWriter zkStateWriter, boolean enableBatching, ZkStateWriter.ZkWriteCallback callback) throws Exception {
|
private ClusterState processQueueItem(ZkNodeProps message, ClusterState clusterState, ZkStateWriter zkStateWriter, boolean enableBatching, ZkStateWriter.ZkWriteCallback callback) throws Exception {
|
||||||
final String operation = message.getStr(QUEUE_OPERATION);
|
final String operation = message.getStr(QUEUE_OPERATION);
|
||||||
List<ZkWriteCommand> zkWriteCommands = null;
|
List<ZkWriteCommand> zkWriteCommands = null;
|
||||||
|
|
|
@ -689,6 +689,63 @@ public class OverseerTest extends SolrTestCaseJ4 {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testExceptionWhenFlushClusterState() throws Exception {
|
||||||
|
String zkDir = createTempDir("zkData").toFile().getAbsolutePath();
|
||||||
|
|
||||||
|
ZkTestServer server = new ZkTestServer(zkDir);
|
||||||
|
|
||||||
|
SolrZkClient controllerClient = null;
|
||||||
|
SolrZkClient overseerClient = null;
|
||||||
|
ZkStateReader reader = null;
|
||||||
|
|
||||||
|
try {
|
||||||
|
server.run();
|
||||||
|
controllerClient = new SolrZkClient(server.getZkAddress(), TIMEOUT);
|
||||||
|
|
||||||
|
AbstractZkTestCase.tryCleanSolrZkNode(server.getZkHost());
|
||||||
|
AbstractZkTestCase.makeSolrZkNode(server.getZkHost());
|
||||||
|
ZkController.createClusterZkNodes(controllerClient);
|
||||||
|
|
||||||
|
reader = new ZkStateReader(controllerClient);
|
||||||
|
reader.createClusterStateWatchersAndUpdate();
|
||||||
|
|
||||||
|
// We did not create /collections -> this message will cause exception when Overseer try to flush the clusterstate
|
||||||
|
ZkNodeProps badMessage = new ZkNodeProps(Overseer.QUEUE_OPERATION, CollectionParams.CollectionAction.CREATE.toLower(),
|
||||||
|
"name", "collection1",
|
||||||
|
ZkStateReader.REPLICATION_FACTOR, "1",
|
||||||
|
ZkStateReader.NUM_SHARDS_PROP, "1",
|
||||||
|
DocCollection.STATE_FORMAT, "2",
|
||||||
|
"createNodeSet", "");
|
||||||
|
ZkNodeProps goodMessage = new ZkNodeProps(Overseer.QUEUE_OPERATION, CollectionParams.CollectionAction.CREATE.toLower(),
|
||||||
|
"name", "collection2",
|
||||||
|
ZkStateReader.REPLICATION_FACTOR, "1",
|
||||||
|
ZkStateReader.NUM_SHARDS_PROP, "1",
|
||||||
|
DocCollection.STATE_FORMAT, "1",
|
||||||
|
"createNodeSet", "");
|
||||||
|
ZkDistributedQueue workQueue = Overseer.getInternalWorkQueue(controllerClient, new Overseer.Stats());
|
||||||
|
workQueue.offer(Utils.toJSON(badMessage));
|
||||||
|
workQueue.offer(Utils.toJSON(goodMessage));
|
||||||
|
overseerClient = electNewOverseer(server.getZkAddress());
|
||||||
|
waitForCollections(reader, "collection2");
|
||||||
|
|
||||||
|
ZkDistributedQueue q = Overseer.getStateUpdateQueue(controllerClient);
|
||||||
|
q.offer(Utils.toJSON(badMessage));
|
||||||
|
q.offer(Utils.toJSON(goodMessage.plus("name", "collection3")));
|
||||||
|
waitForCollections(reader, "collection2", "collection3");
|
||||||
|
assertNotNull(reader.getClusterState().getCollectionOrNull("collection2"));
|
||||||
|
assertNotNull(reader.getClusterState().getCollectionOrNull("collection3"));
|
||||||
|
|
||||||
|
assertTrue(workQueue.peek() == null);
|
||||||
|
assertTrue(q.peek() == null);
|
||||||
|
} finally {
|
||||||
|
close(overseerClient);
|
||||||
|
close(controllerClient);
|
||||||
|
close(reader);
|
||||||
|
server.shutdown();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testShardLeaderChange() throws Exception {
|
public void testShardLeaderChange() throws Exception {
|
||||||
|
|
Loading…
Reference in New Issue