SOLR-11782: Refactor LatchWatcher.await to protect against spurious wakeup

This commit is contained in:
Tomas Fernandez Lobbe 2018-01-26 10:33:10 -08:00
parent 3856ae2d85
commit 56f3f6d948
3 changed files with 89 additions and 34 deletions

View File

@ -108,6 +108,8 @@ New Features
* SOLR-11722: New CREATEROUTEDALIAS SolrCloud command to create a "time routed alias" over a series of collections * SOLR-11722: New CREATEROUTEDALIAS SolrCloud command to create a "time routed alias" over a series of collections
partitioned by time. (Gus Heck, David Smiley) partitioned by time. (Gus Heck, David Smiley)
* SOLR-11782: Refactor LatchWatcher.await to protect against spurious wakeup (Tomás Fernández Löbbe, David Smiley, Dawid Weiss)
Bug Fixes Bug Fixes
---------------------- ----------------------

View File

@ -16,13 +16,16 @@
*/ */
package org.apache.solr.cloud; package org.apache.solr.cloud;
import com.codahale.metrics.Timer;
import java.lang.invoke.MethodHandles; import java.lang.invoke.MethodHandles;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.TreeSet; import java.util.TreeSet;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import java.util.function.Predicate; import java.util.function.Predicate;
import com.codahale.metrics.Timer;
import org.apache.solr.common.cloud.SolrZkClient; import org.apache.solr.common.cloud.SolrZkClient;
import org.apache.solr.common.cloud.ZkNodeProps; import org.apache.solr.common.cloud.ZkNodeProps;
import org.apache.solr.common.util.Pair; import org.apache.solr.common.util.Pair;
@ -108,25 +111,24 @@ public class OverseerTaskQueue extends ZkDistributedQueue {
/** /**
* Watcher that blocks until a WatchedEvent occurs for a znode. * Watcher that blocks until a WatchedEvent occurs for a znode.
*/ */
private static final class LatchWatcher implements Watcher { static final class LatchWatcher implements Watcher {
private final Object lock; private final Lock lock;
private final Condition eventReceived;
private WatchedEvent event; private WatchedEvent event;
private Event.EventType latchEventType; private Event.EventType latchEventType;
LatchWatcher(Object lock) { LatchWatcher() {
this(lock, null); this(null);
} }
LatchWatcher(Event.EventType eventType) { LatchWatcher(Event.EventType eventType) {
this(new Object(), eventType); this.lock = new ReentrantLock();
} this.eventReceived = lock.newCondition();
LatchWatcher(Object lock, Event.EventType eventType) {
this.lock = lock;
this.latchEventType = eventType; this.latchEventType = eventType;
} }
@Override @Override
public void process(WatchedEvent event) { public void process(WatchedEvent event) {
// session events are not change events, and do not remove the watcher // session events are not change events, and do not remove the watcher
@ -136,17 +138,29 @@ public class OverseerTaskQueue extends ZkDistributedQueue {
// If latchEventType is not null, only fire if the type matches // If latchEventType is not null, only fire if the type matches
LOG.debug("{} fired on path {} state {} latchEventType {}", event.getType(), event.getPath(), event.getState(), latchEventType); LOG.debug("{} fired on path {} state {} latchEventType {}", event.getType(), event.getPath(), event.getState(), latchEventType);
if (latchEventType == null || event.getType() == latchEventType) { if (latchEventType == null || event.getType() == latchEventType) {
synchronized (lock) { lock.lock();
try {
this.event = event; this.event = event;
lock.notifyAll(); eventReceived.signalAll();
} finally {
lock.unlock();
} }
} }
} }
public void await(long timeout) throws InterruptedException { public void await(long timeoutMs) throws InterruptedException {
synchronized (lock) { assert timeoutMs > 0;
if (this.event != null) return; long timeoutNanos = TimeUnit.MILLISECONDS.toNanos(timeoutMs);
lock.wait(timeout); lock.lock();
try {
while (this.event == null) {
if (timeoutNanos <= 0) {
return;
}
timeoutNanos = eventReceived.awaitNanos(timeoutNanos);
}
} finally {
lock.unlock();
} }
} }
@ -187,17 +201,14 @@ public class OverseerTaskQueue extends ZkDistributedQueue {
// otherwise we may miss the response. // otherwise we may miss the response.
String watchID = createResponseNode(); String watchID = createResponseNode();
Object lock = new Object(); LatchWatcher watcher = new LatchWatcher();
LatchWatcher watcher = new LatchWatcher(lock);
Stat stat = zookeeper.exists(watchID, watcher, true); Stat stat = zookeeper.exists(watchID, watcher, true);
// create the request node // create the request node
createRequestNode(data, watchID); createRequestNode(data, watchID);
synchronized (lock) { if (stat != null) {
if (stat != null && watcher.getWatchedEvent() == null) { watcher.await(timeout);
watcher.await(timeout);
}
} }
byte[] bytes = zookeeper.getData(watchID, null, null, true); byte[] bytes = zookeeper.getData(watchID, null, null, true);
// create the event before deleting the node, otherwise we can get the deleted // create the event before deleting the node, otherwise we can get the deleted

View File

@ -16,7 +16,13 @@
*/ */
package org.apache.solr.cloud; package org.apache.solr.cloud;
import javax.xml.parsers.ParserConfigurationException; import static org.apache.solr.cloud.AbstractDistribZkTestBase.verifyReplicaStatus;
import static org.mockito.Mockito.doReturn;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
import com.codahale.metrics.Snapshot;
import com.codahale.metrics.Timer;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.lang.invoke.MethodHandles; import java.lang.invoke.MethodHandles;
@ -31,10 +37,9 @@ import java.util.Map;
import java.util.Set; import java.util.Set;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException; import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicInteger;
import javax.xml.parsers.ParserConfigurationException;
import com.codahale.metrics.Snapshot;
import com.codahale.metrics.Timer;
import org.apache.lucene.util.LuceneTestCase.Slow; import org.apache.lucene.util.LuceneTestCase.Slow;
import org.apache.solr.SolrTestCaseJ4; import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.client.solrj.cloud.DistributedQueue; import org.apache.solr.client.solrj.cloud.DistributedQueue;
@ -64,7 +69,10 @@ import org.apache.zookeeper.CreateMode;
import org.apache.zookeeper.KeeperException; import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.KeeperException.NoNodeException; import org.apache.zookeeper.KeeperException.NoNodeException;
import org.apache.zookeeper.KeeperException.NodeExistsException; import org.apache.zookeeper.KeeperException.NodeExistsException;
import org.apache.zookeeper.WatchedEvent;
import org.apache.zookeeper.Watcher.Event;
import org.apache.zookeeper.data.Stat; import org.apache.zookeeper.data.Stat;
import org.apache.zookeeper.proto.WatcherEvent;
import org.junit.After; import org.junit.After;
import org.junit.AfterClass; import org.junit.AfterClass;
import org.junit.BeforeClass; import org.junit.BeforeClass;
@ -75,11 +83,6 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import org.xml.sax.SAXException; import org.xml.sax.SAXException;
import static org.apache.solr.cloud.AbstractDistribZkTestBase.verifyReplicaStatus;
import static org.mockito.Mockito.doReturn;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
@Slow @Slow
public class OverseerTest extends SolrTestCaseJ4 { public class OverseerTest extends SolrTestCaseJ4 {
@ -1475,5 +1478,44 @@ public class OverseerTest extends SolrTestCaseJ4 {
server.shutdown(); server.shutdown();
} }
} }
@Test
public void testLatchWatcher() throws InterruptedException {
OverseerTaskQueue.LatchWatcher latch1 = new OverseerTaskQueue.LatchWatcher();
long before = System.nanoTime();
latch1.await(100);
long after = System.nanoTime();
assertTrue(TimeUnit.NANOSECONDS.toMillis(after-before) > 50);
assertTrue(TimeUnit.NANOSECONDS.toMillis(after-before) < 500);// Mostly to make sure the millis->nanos->millis is not broken
latch1.process(new WatchedEvent(new WatcherEvent(1, 1, "/foo/bar")));
before = System.nanoTime();
latch1.await(10000);// Expecting no wait
after = System.nanoTime();
assertTrue(TimeUnit.NANOSECONDS.toMillis(after-before) < 1000);
final AtomicBoolean expectedEventProcessed = new AtomicBoolean(false);
final AtomicBoolean doneWaiting = new AtomicBoolean(false);
final OverseerTaskQueue.LatchWatcher latch2 = new OverseerTaskQueue.LatchWatcher(Event.EventType.NodeCreated);
Thread t = new Thread(()->{
//Process an event of a different type first, this shouldn't release the latch
latch2.process(new WatchedEvent(new WatcherEvent(Event.EventType.NodeDeleted.getIntValue(), 1, "/foo/bar")));
try {
Thread.sleep(10);
} catch (InterruptedException e) {
throw new RuntimeException(e);
}
assertFalse("Latch shouldn't have been released", doneWaiting.get());
// Now process the correct type of event
expectedEventProcessed.set(true);
latch2.process(new WatchedEvent(new WatcherEvent(Event.EventType.NodeCreated.getIntValue(), 1, "/foo/bar")));
});
t.start();
before = System.nanoTime();
latch2.await(10000); // It shouldn't wait this long, t should notify the lock
after = System.nanoTime();
doneWaiting.set(true);
assertTrue(expectedEventProcessed.get());
assertTrue(TimeUnit.NANOSECONDS.toMillis(after-before) < 1000);
}
} }