mirror of https://github.com/apache/lucene.git
SOLR-11782: Refactor LatchWatcher.await to protect against spurious wakeup
This commit is contained in:
parent
3856ae2d85
commit
56f3f6d948
|
@ -108,6 +108,8 @@ New Features
|
||||||
* SOLR-11722: New CREATEROUTEDALIAS SolrCloud command to create a "time routed alias" over a series of collections
|
* SOLR-11722: New CREATEROUTEDALIAS SolrCloud command to create a "time routed alias" over a series of collections
|
||||||
partitioned by time. (Gus Heck, David Smiley)
|
partitioned by time. (Gus Heck, David Smiley)
|
||||||
|
|
||||||
|
* SOLR-11782: Refactor LatchWatcher.await to protect against spurious wakeup (Tomás Fernández Löbbe, David Smiley, Dawid Weiss)
|
||||||
|
|
||||||
Bug Fixes
|
Bug Fixes
|
||||||
----------------------
|
----------------------
|
||||||
|
|
||||||
|
|
|
@ -16,13 +16,16 @@
|
||||||
*/
|
*/
|
||||||
package org.apache.solr.cloud;
|
package org.apache.solr.cloud;
|
||||||
|
|
||||||
|
import com.codahale.metrics.Timer;
|
||||||
import java.lang.invoke.MethodHandles;
|
import java.lang.invoke.MethodHandles;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.TreeSet;
|
import java.util.TreeSet;
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
import java.util.concurrent.locks.Condition;
|
||||||
|
import java.util.concurrent.locks.Lock;
|
||||||
|
import java.util.concurrent.locks.ReentrantLock;
|
||||||
import java.util.function.Predicate;
|
import java.util.function.Predicate;
|
||||||
|
|
||||||
import com.codahale.metrics.Timer;
|
|
||||||
import org.apache.solr.common.cloud.SolrZkClient;
|
import org.apache.solr.common.cloud.SolrZkClient;
|
||||||
import org.apache.solr.common.cloud.ZkNodeProps;
|
import org.apache.solr.common.cloud.ZkNodeProps;
|
||||||
import org.apache.solr.common.util.Pair;
|
import org.apache.solr.common.util.Pair;
|
||||||
|
@ -108,25 +111,24 @@ public class OverseerTaskQueue extends ZkDistributedQueue {
|
||||||
/**
|
/**
|
||||||
* Watcher that blocks until a WatchedEvent occurs for a znode.
|
* Watcher that blocks until a WatchedEvent occurs for a znode.
|
||||||
*/
|
*/
|
||||||
private static final class LatchWatcher implements Watcher {
|
static final class LatchWatcher implements Watcher {
|
||||||
|
|
||||||
private final Object lock;
|
private final Lock lock;
|
||||||
|
private final Condition eventReceived;
|
||||||
private WatchedEvent event;
|
private WatchedEvent event;
|
||||||
private Event.EventType latchEventType;
|
private Event.EventType latchEventType;
|
||||||
|
|
||||||
LatchWatcher(Object lock) {
|
LatchWatcher() {
|
||||||
this(lock, null);
|
this(null);
|
||||||
}
|
}
|
||||||
|
|
||||||
LatchWatcher(Event.EventType eventType) {
|
LatchWatcher(Event.EventType eventType) {
|
||||||
this(new Object(), eventType);
|
this.lock = new ReentrantLock();
|
||||||
}
|
this.eventReceived = lock.newCondition();
|
||||||
|
|
||||||
LatchWatcher(Object lock, Event.EventType eventType) {
|
|
||||||
this.lock = lock;
|
|
||||||
this.latchEventType = eventType;
|
this.latchEventType = eventType;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void process(WatchedEvent event) {
|
public void process(WatchedEvent event) {
|
||||||
// session events are not change events, and do not remove the watcher
|
// session events are not change events, and do not remove the watcher
|
||||||
|
@ -136,17 +138,29 @@ public class OverseerTaskQueue extends ZkDistributedQueue {
|
||||||
// If latchEventType is not null, only fire if the type matches
|
// If latchEventType is not null, only fire if the type matches
|
||||||
LOG.debug("{} fired on path {} state {} latchEventType {}", event.getType(), event.getPath(), event.getState(), latchEventType);
|
LOG.debug("{} fired on path {} state {} latchEventType {}", event.getType(), event.getPath(), event.getState(), latchEventType);
|
||||||
if (latchEventType == null || event.getType() == latchEventType) {
|
if (latchEventType == null || event.getType() == latchEventType) {
|
||||||
synchronized (lock) {
|
lock.lock();
|
||||||
|
try {
|
||||||
this.event = event;
|
this.event = event;
|
||||||
lock.notifyAll();
|
eventReceived.signalAll();
|
||||||
|
} finally {
|
||||||
|
lock.unlock();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void await(long timeout) throws InterruptedException {
|
public void await(long timeoutMs) throws InterruptedException {
|
||||||
synchronized (lock) {
|
assert timeoutMs > 0;
|
||||||
if (this.event != null) return;
|
long timeoutNanos = TimeUnit.MILLISECONDS.toNanos(timeoutMs);
|
||||||
lock.wait(timeout);
|
lock.lock();
|
||||||
|
try {
|
||||||
|
while (this.event == null) {
|
||||||
|
if (timeoutNanos <= 0) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
timeoutNanos = eventReceived.awaitNanos(timeoutNanos);
|
||||||
|
}
|
||||||
|
} finally {
|
||||||
|
lock.unlock();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -187,17 +201,14 @@ public class OverseerTaskQueue extends ZkDistributedQueue {
|
||||||
// otherwise we may miss the response.
|
// otherwise we may miss the response.
|
||||||
String watchID = createResponseNode();
|
String watchID = createResponseNode();
|
||||||
|
|
||||||
Object lock = new Object();
|
LatchWatcher watcher = new LatchWatcher();
|
||||||
LatchWatcher watcher = new LatchWatcher(lock);
|
|
||||||
Stat stat = zookeeper.exists(watchID, watcher, true);
|
Stat stat = zookeeper.exists(watchID, watcher, true);
|
||||||
|
|
||||||
// create the request node
|
// create the request node
|
||||||
createRequestNode(data, watchID);
|
createRequestNode(data, watchID);
|
||||||
|
|
||||||
synchronized (lock) {
|
if (stat != null) {
|
||||||
if (stat != null && watcher.getWatchedEvent() == null) {
|
watcher.await(timeout);
|
||||||
watcher.await(timeout);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
byte[] bytes = zookeeper.getData(watchID, null, null, true);
|
byte[] bytes = zookeeper.getData(watchID, null, null, true);
|
||||||
// create the event before deleting the node, otherwise we can get the deleted
|
// create the event before deleting the node, otherwise we can get the deleted
|
||||||
|
|
|
@ -16,7 +16,13 @@
|
||||||
*/
|
*/
|
||||||
package org.apache.solr.cloud;
|
package org.apache.solr.cloud;
|
||||||
|
|
||||||
import javax.xml.parsers.ParserConfigurationException;
|
import static org.apache.solr.cloud.AbstractDistribZkTestBase.verifyReplicaStatus;
|
||||||
|
import static org.mockito.Mockito.doReturn;
|
||||||
|
import static org.mockito.Mockito.mock;
|
||||||
|
import static org.mockito.Mockito.when;
|
||||||
|
|
||||||
|
import com.codahale.metrics.Snapshot;
|
||||||
|
import com.codahale.metrics.Timer;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.lang.invoke.MethodHandles;
|
import java.lang.invoke.MethodHandles;
|
||||||
|
@ -31,10 +37,9 @@ import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.concurrent.TimeUnit;
|
import java.util.concurrent.TimeUnit;
|
||||||
import java.util.concurrent.TimeoutException;
|
import java.util.concurrent.TimeoutException;
|
||||||
|
import java.util.concurrent.atomic.AtomicBoolean;
|
||||||
import java.util.concurrent.atomic.AtomicInteger;
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
|
import javax.xml.parsers.ParserConfigurationException;
|
||||||
import com.codahale.metrics.Snapshot;
|
|
||||||
import com.codahale.metrics.Timer;
|
|
||||||
import org.apache.lucene.util.LuceneTestCase.Slow;
|
import org.apache.lucene.util.LuceneTestCase.Slow;
|
||||||
import org.apache.solr.SolrTestCaseJ4;
|
import org.apache.solr.SolrTestCaseJ4;
|
||||||
import org.apache.solr.client.solrj.cloud.DistributedQueue;
|
import org.apache.solr.client.solrj.cloud.DistributedQueue;
|
||||||
|
@ -64,7 +69,10 @@ import org.apache.zookeeper.CreateMode;
|
||||||
import org.apache.zookeeper.KeeperException;
|
import org.apache.zookeeper.KeeperException;
|
||||||
import org.apache.zookeeper.KeeperException.NoNodeException;
|
import org.apache.zookeeper.KeeperException.NoNodeException;
|
||||||
import org.apache.zookeeper.KeeperException.NodeExistsException;
|
import org.apache.zookeeper.KeeperException.NodeExistsException;
|
||||||
|
import org.apache.zookeeper.WatchedEvent;
|
||||||
|
import org.apache.zookeeper.Watcher.Event;
|
||||||
import org.apache.zookeeper.data.Stat;
|
import org.apache.zookeeper.data.Stat;
|
||||||
|
import org.apache.zookeeper.proto.WatcherEvent;
|
||||||
import org.junit.After;
|
import org.junit.After;
|
||||||
import org.junit.AfterClass;
|
import org.junit.AfterClass;
|
||||||
import org.junit.BeforeClass;
|
import org.junit.BeforeClass;
|
||||||
|
@ -75,11 +83,6 @@ import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
import org.xml.sax.SAXException;
|
import org.xml.sax.SAXException;
|
||||||
|
|
||||||
import static org.apache.solr.cloud.AbstractDistribZkTestBase.verifyReplicaStatus;
|
|
||||||
import static org.mockito.Mockito.doReturn;
|
|
||||||
import static org.mockito.Mockito.mock;
|
|
||||||
import static org.mockito.Mockito.when;
|
|
||||||
|
|
||||||
@Slow
|
@Slow
|
||||||
public class OverseerTest extends SolrTestCaseJ4 {
|
public class OverseerTest extends SolrTestCaseJ4 {
|
||||||
|
|
||||||
|
@ -1475,5 +1478,44 @@ public class OverseerTest extends SolrTestCaseJ4 {
|
||||||
server.shutdown();
|
server.shutdown();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testLatchWatcher() throws InterruptedException {
|
||||||
|
OverseerTaskQueue.LatchWatcher latch1 = new OverseerTaskQueue.LatchWatcher();
|
||||||
|
long before = System.nanoTime();
|
||||||
|
latch1.await(100);
|
||||||
|
long after = System.nanoTime();
|
||||||
|
assertTrue(TimeUnit.NANOSECONDS.toMillis(after-before) > 50);
|
||||||
|
assertTrue(TimeUnit.NANOSECONDS.toMillis(after-before) < 500);// Mostly to make sure the millis->nanos->millis is not broken
|
||||||
|
latch1.process(new WatchedEvent(new WatcherEvent(1, 1, "/foo/bar")));
|
||||||
|
before = System.nanoTime();
|
||||||
|
latch1.await(10000);// Expecting no wait
|
||||||
|
after = System.nanoTime();
|
||||||
|
assertTrue(TimeUnit.NANOSECONDS.toMillis(after-before) < 1000);
|
||||||
|
|
||||||
|
final AtomicBoolean expectedEventProcessed = new AtomicBoolean(false);
|
||||||
|
final AtomicBoolean doneWaiting = new AtomicBoolean(false);
|
||||||
|
final OverseerTaskQueue.LatchWatcher latch2 = new OverseerTaskQueue.LatchWatcher(Event.EventType.NodeCreated);
|
||||||
|
Thread t = new Thread(()->{
|
||||||
|
//Process an event of a different type first, this shouldn't release the latch
|
||||||
|
latch2.process(new WatchedEvent(new WatcherEvent(Event.EventType.NodeDeleted.getIntValue(), 1, "/foo/bar")));
|
||||||
|
try {
|
||||||
|
Thread.sleep(10);
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
assertFalse("Latch shouldn't have been released", doneWaiting.get());
|
||||||
|
// Now process the correct type of event
|
||||||
|
expectedEventProcessed.set(true);
|
||||||
|
latch2.process(new WatchedEvent(new WatcherEvent(Event.EventType.NodeCreated.getIntValue(), 1, "/foo/bar")));
|
||||||
|
});
|
||||||
|
t.start();
|
||||||
|
before = System.nanoTime();
|
||||||
|
latch2.await(10000); // It shouldn't wait this long, t should notify the lock
|
||||||
|
after = System.nanoTime();
|
||||||
|
doneWaiting.set(true);
|
||||||
|
assertTrue(expectedEventProcessed.get());
|
||||||
|
assertTrue(TimeUnit.NANOSECONDS.toMillis(after-before) < 1000);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue