mirror of https://github.com/apache/nifi.git
NIFI-10037: When system test fails to clean up flow, destroy the entire environment so that the next test starts in a healthy state. Name troubleshooting directories with the name of the test class to avoid ambiguity. Also added a log statement so that we know which test is running when looking at the log output from the tests themselves. Finally, found an issue in AbstractComponentNode in which we iterate over the elements in a Map and call setProperty, which can update the underlying Map - updated to first create a copy of the HashMap. Updated that in this Jira because I suspect it is causing one of the tests failures that I've been investigating.
This closes #6059 Signed-off-by: David Handermann <exceptionfactory@apache.org>
This commit is contained in:
parent
1cfca0d978
commit
38b51b0dde
|
@ -66,6 +66,7 @@ import java.net.URL;
|
|||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.LinkedHashSet;
|
||||
|
@ -591,7 +592,10 @@ public abstract class AbstractComponentNode implements ComponentNode {
|
|||
// use setProperty instead of setProperties so we can bypass the class loading logic.
|
||||
// Consider value changed if it is different than the PropertyDescriptor's default value because we need to call the #onPropertiesModified
|
||||
// method on the component if the current value is not the default value, since the component itself is being reloaded.
|
||||
for (final Map.Entry<PropertyDescriptor, PropertyConfiguration> entry : this.properties.entrySet()) {
|
||||
// Also, create a copy of this.properties instead of iterating directly over this.properties since the call to setProperty can change the
|
||||
// underlying map, and the behavior of modifying the map while iterating over its elements is undefined.
|
||||
final Map<PropertyDescriptor, PropertyConfiguration> copyOfPropertiesMap = new HashMap<>(this.properties);
|
||||
for (final Map.Entry<PropertyDescriptor, PropertyConfiguration> entry : copyOfPropertiesMap.entrySet()) {
|
||||
final PropertyDescriptor propertyDescriptor = entry.getKey();
|
||||
final PropertyConfiguration configuration = entry.getValue();
|
||||
|
||||
|
|
|
@ -73,6 +73,8 @@ public abstract class NiFiSystemIT implements NiFiInstanceProvider {
|
|||
@BeforeEach
|
||||
public void setup(final TestInfo testInfo) throws IOException {
|
||||
this.testInfo = testInfo;
|
||||
final String testClassName = testInfo.getTestClass().map(Class::getSimpleName).orElse("<Unknown Test Class>");
|
||||
logger.info("Beginning Test {}:{}", testClassName, testInfo.getDisplayName());
|
||||
|
||||
Thread.currentThread().setContextClassLoader(ClassLoader.getSystemClassLoader());
|
||||
setupClient();
|
||||
|
@ -116,6 +118,12 @@ public abstract class NiFiSystemIT implements NiFiInstanceProvider {
|
|||
|
||||
if (isDestroyEnvironmentAfterEachTest()) {
|
||||
cleanup();
|
||||
} else if (destroyFlowFailure != null) {
|
||||
// If unable to destroy the flow, we need to shutdown the instance and delete the flow and completely recreate the environment.
|
||||
// Otherwise, we will be left in an unknown state for the next test, and that can cause cascading failures that are very difficult
|
||||
// to understand and troubleshoot.
|
||||
logger.info("Because there was a failure when destroying the flow, will completely tear down the environments and start with a clean environment for the next test.");
|
||||
cleanup();
|
||||
}
|
||||
|
||||
if (destroyFlowFailure != null) {
|
||||
|
|
|
@ -40,7 +40,8 @@ public class TroubleshootingTestWatcher implements TestWatcher {
|
|||
final NiFiInstanceProvider provider = (NiFiInstanceProvider) testInstance;
|
||||
final String displayName = context.getDisplayName();
|
||||
try {
|
||||
final File dir = quarantineTroubleshootingInfo(provider, displayName, cause);
|
||||
final String testClassName = context.getTestClass().map(Class::getSimpleName).orElse("TestClassUnknown");
|
||||
final File dir = quarantineTroubleshootingInfo(provider, testClassName, displayName, cause);
|
||||
logger.info("Test Failed [{}]: Troubleshooting information stored [{}]", displayName, dir.getAbsolutePath());
|
||||
} catch (final Exception e) {
|
||||
logger.error("Test Failed [{}]: Troubleshooting information not stored", displayName, e);
|
||||
|
@ -49,17 +50,21 @@ public class TroubleshootingTestWatcher implements TestWatcher {
|
|||
}
|
||||
}
|
||||
|
||||
private File quarantineTroubleshootingInfo(final NiFiInstanceProvider provider, final String methodName, final Throwable failureCause) throws IOException {
|
||||
private File quarantineTroubleshootingInfo(final NiFiInstanceProvider provider, final String testClassName, final String methodName, final Throwable failureCause) throws IOException {
|
||||
NiFiInstance instance = provider.getNiFiInstance();
|
||||
|
||||
// The teardown method may or may not have already run at this point. If it has, the instance will be null.
|
||||
// In that case, just create a new instance and use it - it will map to the same directories.
|
||||
if (instance == null) {
|
||||
logger.warn("While capturing troubleshooting info for {}, the NiFi Instance is not available. Will create a new one for Diagnostics purposes, but some of the diagnostics may be less " +
|
||||
"accurate, since it's not the same instance that ran the test", methodName);
|
||||
|
||||
instance = provider.getInstanceFactory().createInstance();
|
||||
}
|
||||
|
||||
final File troubleshooting = new File("target/troubleshooting");
|
||||
final File quarantineDir = new File(troubleshooting, methodName);
|
||||
final String quarantineDirName = testClassName + "-" + methodName.replace("()", "");
|
||||
final File quarantineDir = new File(troubleshooting, quarantineDirName);
|
||||
quarantineDir.mkdirs();
|
||||
|
||||
instance.quarantineTroubleshootingInfo(quarantineDir, failureCause);
|
||||
|
|
|
@ -42,12 +42,13 @@ import java.io.IOException;
|
|||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.LongSummaryStatistics;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
public class LoadBalanceIT extends NiFiSystemIT {
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
@ -277,13 +278,13 @@ public class LoadBalanceIT extends NiFiSystemIT {
|
|||
private int getQueueSize(final String connectionId) {
|
||||
final ConnectionStatusEntity statusEntity = getConnectionStatus(connectionId);
|
||||
final ConnectionStatusDTO connectionStatusDto = statusEntity.getConnectionStatus();
|
||||
return connectionStatusDto.getAggregateSnapshot().getFlowFilesQueued().intValue();
|
||||
return connectionStatusDto.getAggregateSnapshot().getFlowFilesQueued();
|
||||
}
|
||||
|
||||
private long getQueueBytes(final String connectionId) {
|
||||
final ConnectionStatusEntity statusEntity = getConnectionStatus(connectionId);
|
||||
final ConnectionStatusDTO connectionStatusDto = statusEntity.getConnectionStatus();
|
||||
return connectionStatusDto.getAggregateSnapshot().getBytesQueued().longValue();
|
||||
return connectionStatusDto.getAggregateSnapshot().getBytesQueued();
|
||||
}
|
||||
|
||||
private boolean isConnectionDoneLoadBalancing(final String connectionId) {
|
||||
|
@ -372,8 +373,13 @@ public class LoadBalanceIT extends NiFiSystemIT {
|
|||
instance2.start(true);
|
||||
waitForAllNodesConnected();
|
||||
|
||||
// Generate the data again
|
||||
generate = getNifiClient().getProcessorClient().getProcessor(generate.getId());
|
||||
|
||||
// Generate data and wait for it to be spread across the cluster. We do this in an infinite while() loop because
|
||||
// there can be a failure, in which case we'll retry. If that happens, we just want to keep retrying until the test
|
||||
// times out.
|
||||
while (true) {
|
||||
// Generate the data.
|
||||
getNifiClient().getProcessorClient().startProcessor(generate);
|
||||
|
||||
// Wait until all 20 FlowFiles are queued up
|
||||
|
@ -385,9 +391,27 @@ public class LoadBalanceIT extends NiFiSystemIT {
|
|||
// Wait until load balancing is complete
|
||||
waitFor(() -> isConnectionDoneLoadBalancing(connection.getId()));
|
||||
|
||||
// Ensure that the FlowFiles are evenly distributed between the nodes.
|
||||
// Log the distribution of data between nodes for easier troubleshooting in case there's a failure.
|
||||
final ConnectionStatusEntity afterSecondDataGenerationStatusEntity = getConnectionStatus(connection.getId());
|
||||
assertTrue(isEvenlyDistributed(afterSecondDataGenerationStatusEntity));
|
||||
final List<NodeConnectionStatusSnapshotDTO> nodeSnapshots = afterSecondDataGenerationStatusEntity.getConnectionStatus().getNodeSnapshots();
|
||||
logger.info("FlowFiles Queued Per Node:");
|
||||
nodeSnapshots.forEach(snapshot ->
|
||||
logger.info("{}:{} - {}", snapshot.getAddress(), snapshot.getApiPort(), snapshot.getStatusSnapshot().getFlowFilesQueued())
|
||||
);
|
||||
|
||||
// Check if the FlowFiles are evenly distributed between the nodes. If so, we're done.
|
||||
final boolean evenlyDistributed = isEvenlyDistributed(afterSecondDataGenerationStatusEntity);
|
||||
if (evenlyDistributed) {
|
||||
break;
|
||||
}
|
||||
|
||||
// If there's an IOException thrown while communicating between the nodes, the data will be rebalanced and will go to
|
||||
// the local partition. There's nothing we can do about that in this test. However, we can verify that NiFi recovers
|
||||
// from this and continues to distribute data. To do that, we will stop the processor so that it can be started again
|
||||
// (and produce more data) and we can empty the queue so that we know how much data to expect.
|
||||
getNifiClient().getProcessorClient().stopProcessor(generate);
|
||||
getClientUtil().emptyQueue(connection.getId());
|
||||
}
|
||||
|
||||
assertEquals(20, getQueueSize(connection.getId()));
|
||||
assertEquals(20 * 1024 * 1024, getQueueBytes(connection.getId()));
|
||||
|
|
|
@ -77,7 +77,7 @@ nifi.content.repository.implementation=org.apache.nifi.controller.repository.Fil
|
|||
nifi.content.claim.max.appendable.size=50 KB
|
||||
nifi.content.repository.directory.default=./content_repository
|
||||
nifi.content.repository.archive.max.retention.period=12 hours
|
||||
nifi.content.repository.archive.max.usage.percentage=50%
|
||||
nifi.content.repository.archive.max.usage.percentage=90%
|
||||
nifi.content.repository.archive.enabled=true
|
||||
nifi.content.repository.always.sync=false
|
||||
nifi.content.viewer.url=../nifi-content-viewer/
|
||||
|
|
|
@ -27,7 +27,7 @@ java.arg.3=-Xmx512m
|
|||
|
||||
java.arg.14=-Djava.awt.headless=true
|
||||
|
||||
java.arg.debug=-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=8003
|
||||
#java.arg.debug=-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=8003
|
||||
|
||||
java.arg.nodeNum=-DnodeNumber=2
|
||||
|
||||
|
|
|
@ -77,7 +77,7 @@ nifi.content.repository.implementation=org.apache.nifi.controller.repository.Fil
|
|||
nifi.content.claim.max.appendable.size=50 KB
|
||||
nifi.content.repository.directory.default=./content_repository
|
||||
nifi.content.repository.archive.max.retention.period=12 hours
|
||||
nifi.content.repository.archive.max.usage.percentage=50%
|
||||
nifi.content.repository.archive.max.usage.percentage=90%
|
||||
nifi.content.repository.archive.enabled=true
|
||||
nifi.content.repository.always.sync=false
|
||||
nifi.content.viewer.url=../nifi-content-viewer/
|
||||
|
|
|
@ -77,7 +77,7 @@ nifi.content.repository.implementation=org.apache.nifi.controller.repository.Fil
|
|||
nifi.content.claim.max.appendable.size=50 KB
|
||||
nifi.content.repository.directory.default=./content_repository
|
||||
nifi.content.repository.archive.max.retention.period=12 hours
|
||||
nifi.content.repository.archive.max.usage.percentage=50%
|
||||
nifi.content.repository.archive.max.usage.percentage=90%
|
||||
nifi.content.repository.archive.enabled=true
|
||||
nifi.content.repository.always.sync=false
|
||||
nifi.content.viewer.url=../nifi-content-viewer/
|
||||
|
|
Loading…
Reference in New Issue