HDDS-1708. Add container scrubber metrics.

Contributed by Hrishikesh Gadre.
This commit is contained in:
Anu Engineer 2019-09-05 14:33:06 -07:00
parent 0ccf4b0fe1
commit acbea8d976
8 changed files with 447 additions and 44 deletions

View File

@ -20,7 +20,6 @@ package org.apache.hadoop.ozone.container.keyvalue.impl;
import com.google.common.base.Preconditions;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdds.HddsConfigKeys;
import org.apache.hadoop.ozone.container.keyvalue.interfaces.ChunkManager;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

View File

@ -19,7 +19,10 @@ package org.apache.hadoop.ozone.container.ozoneimpl;
import java.io.IOException;
import java.util.Iterator;
import java.util.concurrent.TimeUnit;
import com.google.common.annotations.VisibleForTesting;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.util.Canceler;
import org.apache.hadoop.hdfs.util.DataTransferThrottler;
import org.apache.hadoop.ozone.container.common.interfaces.Container;
@ -42,6 +45,7 @@ public class ContainerDataScanner extends Thread {
private final ContainerController controller;
private final DataTransferThrottler throttler;
private final Canceler canceler;
private final ContainerDataScrubberMetrics metrics;
/**
* True if the thread is stopping.<p/>
@ -50,12 +54,15 @@ public class ContainerDataScanner extends Thread {
private volatile boolean stopping = false;
public ContainerDataScanner(ContainerController controller,
public ContainerDataScanner(Configuration conf,
ContainerController controller,
HddsVolume volume, long bytesPerSec) {
this.controller = controller;
this.volume = volume;
this.throttler = new DataTransferThrottler(bytesPerSec);
this.throttler = new HddsDataTransferThrottler(bytesPerSec);
this.canceler = new Canceler();
this.metrics = ContainerDataScrubberMetrics.create(conf,
volume.toString());
setName("ContainerDataScanner(" + volume + ")");
setDaemon(true);
}
@ -65,26 +72,54 @@ public class ContainerDataScanner extends Thread {
LOG.trace("{}: thread starting.", this);
try {
while (!stopping) {
runIteration();
metrics.resetNumContainersScanned();
metrics.resetNumUnhealthyContainers();
}
LOG.info("{} exiting.", this);
} catch (Throwable e) {
LOG.error("{} exiting because of exception ", this, e);
} finally {
if (metrics != null) {
metrics.unregister();
}
}
}
@VisibleForTesting
public void runIteration() {
long startTime = System.nanoTime();
Iterator<Container> itr = controller.getContainers(volume);
while (!stopping && itr.hasNext()) {
Container c = itr.next();
try {
if (c.shouldScanData()) {
if(!c.scanData(throttler, canceler)) {
try {
if (!c.scanData(throttler, canceler)) {
metrics.incNumUnHealthyContainers();
controller.markContainerUnhealthy(
c.getContainerData().getContainerID());
}
}
} catch (IOException ex) {
long containerId = c.getContainerData().getContainerID();
LOG.warn("Unexpected exception while scanning container "
+ containerId, ex);
} finally {
metrics.incNumContainersScanned();
}
}
}
LOG.info("{} exiting.", this);
} catch (Throwable e) {
LOG.error("{} exiting because of exception ", this, e);
long totalDuration = System.nanoTime() - startTime;
if (!stopping) {
metrics.incNumScanIterations();
LOG.info("Completed an iteration of container data scrubber in" +
" {} minutes." +
" Number of iterations (since the data-node restart) : {}" +
", Number of containers scanned in this iteration : {}" +
", Number of unhealthy containers found in this iteration : {}",
TimeUnit.NANOSECONDS.toMinutes(totalDuration),
metrics.getNumScanIterations(),
metrics.getNumContainersScanned(),
metrics.getNumUnHealthyContainers());
}
}
@ -100,9 +135,32 @@ public class ContainerDataScanner extends Thread {
}
}
/**
 * Returns the per-volume data scrubber metrics; exposed for tests.
 */
@VisibleForTesting
public ContainerDataScrubberMetrics getMetrics() {
return metrics;
}
/**
 * Identifies this scanner thread by its volume and storage id.
 */
@Override
public String toString() {
  return String.format("ContainerDataScanner(%s, %s)",
      volume, volume.getStorageID());
}
/**
 * A {@link DataTransferThrottler} that also folds every throttled byte count
 * into the scanner's {@code numBytesScanned} metric, so the recorded scan
 * bandwidth tracks the bytes actually pushed through the throttler.
 */
private class HddsDataTransferThrottler extends DataTransferThrottler {
HddsDataTransferThrottler(long bandwidthPerSec) {
super(bandwidthPerSec);
}
@Override
public synchronized void throttle(long numOfBytes) {
// Record first, then delegate: the metric counts bytes submitted for
// throttling even if the superclass blocks afterwards.
ContainerDataScanner.this.metrics.incNumBytesScanned(numOfBytes);
super.throttle(numOfBytes);
}
@Override
public synchronized void throttle(long numOfBytes, Canceler c) {
// Same accounting as above for the cancelable variant.
ContainerDataScanner.this.metrics.incNumBytesScanned(numOfBytes);
super.throttle(numOfBytes, c);
}
}
}

View File

@ -0,0 +1,115 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.ozone.container.ozoneimpl;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.metrics2.MetricsSystem;
import org.apache.hadoop.metrics2.annotation.Metric;
import org.apache.hadoop.metrics2.annotation.Metrics;
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
import org.apache.hadoop.metrics2.lib.MutableCounterInt;
import org.apache.hadoop.metrics2.lib.MutableGaugeInt;
import org.apache.hadoop.metrics2.lib.MutableRate;
import java.util.concurrent.ThreadLocalRandom;
/**
 * Captures the per-volume container data scrubber metrics on the data-node.
 * One instance is registered per {@code HddsVolume}; the per-iteration gauges
 * are reset by the scanner at the end of each scan iteration.
 **/
@InterfaceAudience.Private
@Metrics(about="DataNode container data scrubber metrics", context="dfs")
public final class ContainerDataScrubberMetrics {

  private final String name;
  private final MetricsSystem ms;

  @Metric("number of containers scanned in the current iteration")
  private MutableGaugeInt numContainersScanned;
  @Metric("number of unhealthy containers found in the current iteration")
  private MutableGaugeInt numUnHealthyContainers;
  @Metric("number of iterations of scanner completed since the restart")
  private MutableCounterInt numScanIterations;
  @Metric("disk bandwidth used by the container data scrubber per volume")
  private MutableRate numBytesScanned;

  private ContainerDataScrubberMetrics(String name, MetricsSystem ms) {
    this.name = name;
    this.ms = ms;
  }

  /** @return containers scanned so far in the current iteration. */
  public int getNumContainersScanned() {
    return numContainersScanned.value();
  }

  public void incNumContainersScanned() {
    numContainersScanned.incr();
  }

  /** Resets the per-iteration scanned-container gauge to zero. */
  public void resetNumContainersScanned() {
    // set(0) is a single atomic write; decr(value()) would be a racy
    // read-modify-write if sampled concurrently.
    numContainersScanned.set(0);
  }

  /** @return unhealthy containers found so far in the current iteration. */
  public int getNumUnHealthyContainers() {
    return numUnHealthyContainers.value();
  }

  public void incNumUnHealthyContainers() {
    numUnHealthyContainers.incr();
  }

  /** Resets the per-iteration unhealthy-container gauge to zero. */
  public void resetNumUnhealthyContainers() {
    numUnHealthyContainers.set(0);
  }

  /** @return completed scan iterations since the data-node restarted. */
  public int getNumScanIterations() {
    return numScanIterations.value();
  }

  public void incNumScanIterations() {
    numScanIterations.incr();
  }

  /** @return mean of the per-call byte counts in the last stat interval. */
  public double getNumBytesScannedMean() {
    return numBytesScanned.lastStat().mean();
  }

  /** @return number of samples in the last stat interval. */
  public long getNumBytesScannedSampleCount() {
    return numBytesScanned.lastStat().numSamples();
  }

  /** @return standard deviation of samples in the last stat interval. */
  public double getNumBytesScannedStdDev() {
    return numBytesScanned.lastStat().stddev();
  }

  /** Records {@code bytes} read by the scrubber through the throttler. */
  public void incNumBytesScanned(long bytes) {
    numBytesScanned.add(bytes);
  }

  /** Removes this source from the metrics system; call on scanner exit. */
  public void unregister() {
    ms.unregisterSource(name);
  }

  /**
   * Registers and returns a new metrics source for the given volume. The
   * source name embeds the volume so multiple volumes can coexist; a random
   * suffix keeps empty volume names from colliding.
   */
  public static ContainerDataScrubberMetrics create(final Configuration conf,
      final String volumeName) {
    MetricsSystem ms = DefaultMetricsSystem.instance();
    String name = "ContainerDataScrubberMetrics-"+ (volumeName.isEmpty()
        ? "UndefinedDataNodeVolume"+ ThreadLocalRandom.current().nextInt()
        : volumeName.replace(':', '-'));

    return ms.register(name, null, new ContainerDataScrubberMetrics(name, ms));
  }
}

View File

@ -18,6 +18,7 @@
package org.apache.hadoop.ozone.container.ozoneimpl;
import com.google.common.annotations.VisibleForTesting;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.ozone.container.common.interfaces.Container;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -36,16 +37,19 @@ public class ContainerMetadataScanner extends Thread {
private final ContainerController controller;
private final long metadataScanInterval;
private final ContainerMetadataScrubberMetrics metrics;
/**
* True if the thread is stopping.<p/>
* Protected by this object's lock.
*/
private boolean stopping = false;
public ContainerMetadataScanner(ContainerController controller,
public ContainerMetadataScanner(Configuration conf,
ContainerController controller,
long metadataScanInterval) {
this.controller = controller;
this.metadataScanInterval = metadataScanInterval;
this.metrics = ContainerMetadataScrubberMetrics.create(conf);
setName("ContainerMetadataScanner");
setDaemon(true);
}
@ -58,10 +62,43 @@ public class ContainerMetadataScanner extends Thread {
LOG.info("Background ContainerMetadataScanner starting up");
while (!stopping) {
long start = System.nanoTime();
scrub();
long interval = TimeUnit.NANOSECONDS.toMillis(System.nanoTime()-start);
runIteration();
if(!stopping) {
metrics.resetNumUnhealthyContainers();
metrics.resetNumContainersScanned();
}
}
}
@VisibleForTesting
public void runIteration() {
long start = System.nanoTime();
Iterator<Container> containerIt = controller.getContainers();
while (!stopping && containerIt.hasNext()) {
Container container = containerIt.next();
try {
scrub(container);
} catch (IOException e) {
LOG.info("Unexpected error while scrubbing container {}",
container.getContainerData().getContainerID());
} finally {
metrics.incNumContainersScanned();
}
}
long interval = System.nanoTime()-start;
if (!stopping) {
metrics.incNumScanIterations();
LOG.info("Completed an iteration of container metadata scrubber in" +
" {} minutes." +
" Number of iterations (since the data-node restart) : {}" +
", Number of containers scanned in this iteration : {}" +
", Number of unhealthy containers found in this iteration : {}",
TimeUnit.NANOSECONDS.toMinutes(interval),
metrics.getNumScanIterations(),
metrics.getNumContainersScanned(),
metrics.getNumUnHealthyContainers());
// ensure to delay next metadata scan with respect to user config.
if (!stopping && interval < metadataScanInterval) {
if (interval < metadataScanInterval) {
try {
Thread.sleep(metadataScanInterval - interval);
} catch (InterruptedException e) {
@ -72,32 +109,20 @@ public class ContainerMetadataScanner extends Thread {
}
}
private void scrub() {
Iterator<Container> containerIt = controller.getContainers();
long count = 0;
while (!stopping && containerIt.hasNext()) {
Container container = containerIt.next();
try {
scrub(container);
} catch (IOException e) {
LOG.info("Unexpected error while scrubbing container {}",
container.getContainerData().getContainerID());
}
count++;
}
LOG.debug("iterator ran integrity checks on {} containers", count);
}
/**
 * Scans a single container's metadata; if the scan fails, the container is
 * counted and marked unhealthy via the controller.
 *
 * @throws IOException if marking the container unhealthy fails
 */
@VisibleForTesting
public void scrub(Container container) throws IOException {
  if (container.scanMetaData()) {
    return;
  }
  metrics.incNumUnHealthyContainers();
  controller.markContainerUnhealthy(
      container.getContainerData().getContainerID());
}
/**
 * Returns the metadata scrubber metrics; exposed for tests.
 */
@VisibleForTesting
public ContainerMetadataScrubberMetrics getMetrics() {
return metrics;
}
public synchronized void shutdown() {
this.stopping = true;
this.interrupt();

View File

@ -0,0 +1,93 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.ozone.container.ozoneimpl;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.metrics2.MetricsSystem;
import org.apache.hadoop.metrics2.annotation.Metric;
import org.apache.hadoop.metrics2.annotation.Metrics;
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
import org.apache.hadoop.metrics2.lib.MutableCounterInt;
import org.apache.hadoop.metrics2.lib.MutableGaugeInt;
/**
 * Captures the container metadata scrubber metrics on the data-node. A single
 * instance is registered for the node-wide metadata scanner; the
 * per-iteration gauges are reset at the end of each scan iteration.
 **/
@InterfaceAudience.Private
@Metrics(about="DataNode container metadata scrubber metrics", context="dfs")
public final class ContainerMetadataScrubberMetrics {

  private final String name;
  private final MetricsSystem ms;

  @Metric("number of containers scanned in the current iteration")
  private MutableGaugeInt numContainersScanned;
  @Metric("number of unhealthy containers found in the current iteration")
  private MutableGaugeInt numUnHealthyContainers;
  @Metric("number of iterations of scanner completed since the restart")
  private MutableCounterInt numScanIterations;

  private ContainerMetadataScrubberMetrics(String name, MetricsSystem ms) {
    this.name = name;
    this.ms = ms;
  }

  /** @return containers scanned so far in the current iteration. */
  public int getNumContainersScanned() {
    return numContainersScanned.value();
  }

  public void incNumContainersScanned() {
    numContainersScanned.incr();
  }

  /** Resets the per-iteration scanned-container gauge to zero. */
  public void resetNumContainersScanned() {
    // set(0) is a single atomic write; decr(value()) would be a racy
    // read-modify-write if sampled concurrently.
    numContainersScanned.set(0);
  }

  /** @return unhealthy containers found so far in the current iteration. */
  public int getNumUnHealthyContainers() {
    return numUnHealthyContainers.value();
  }

  public void incNumUnHealthyContainers() {
    numUnHealthyContainers.incr();
  }

  /** Resets the per-iteration unhealthy-container gauge to zero. */
  public void resetNumUnhealthyContainers() {
    numUnHealthyContainers.set(0);
  }

  /** @return completed scan iterations since the data-node restarted. */
  public int getNumScanIterations() {
    return numScanIterations.value();
  }

  public void incNumScanIterations() {
    numScanIterations.incr();
  }

  /** Removes this source from the metrics system; call on scanner exit. */
  public void unregister() {
    ms.unregisterSource(name);
  }

  /** Registers and returns the metadata scrubber metrics source. */
  public static ContainerMetadataScrubberMetrics create(Configuration conf) {
    MetricsSystem ms = DefaultMetricsSystem.instance();
    // Fixed: was registered under "ContainerDataScrubberMetrics", which
    // mislabeled this source as the data scrubber's.
    String name = "ContainerMetadataScrubberMetrics";
    return ms.register(name, null,
        new ContainerMetadataScrubberMetrics(name, ms));
  }
}

View File

@ -178,14 +178,14 @@ public class OzoneContainer {
LOG.info("Background container scanner has been disabled.");
} else {
if (this.metadataScanner == null) {
this.metadataScanner = new ContainerMetadataScanner(controller,
this.metadataScanner = new ContainerMetadataScanner(config, controller,
metadataScanInterval);
}
this.metadataScanner.start();
dataScanners = new ArrayList<>();
for (HddsVolume v : volumeSet.getVolumesList()) {
ContainerDataScanner s = new ContainerDataScanner(controller,
ContainerDataScanner s = new ContainerDataScanner(config, controller,
v, bytesPerSec);
s.start();
dataScanners.add(s);

View File

@ -0,0 +1,113 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
* <p>
* http://www.apache.org/licenses/LICENSE-2.0
* <p>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.ozone.container.ozoneimpl;
import org.apache.hadoop.hdds.conf.OzoneConfiguration;
import org.apache.hadoop.hdfs.util.Canceler;
import org.apache.hadoop.hdfs.util.DataTransferThrottler;
import org.apache.hadoop.ozone.container.common.impl.ContainerData;
import org.apache.hadoop.ozone.container.common.interfaces.Container;
import org.apache.hadoop.ozone.container.common.volume.HddsVolume;
import org.junit.Assert;
import org.junit.Test;
import org.mockito.Mockito;
import java.util.Iterator;
/**
 * Verifies that the container data and metadata scrubbers maintain their
 * per-iteration metrics correctly against a mocked set of three containers:
 * one healthy, one with corrupt data, one with corrupt metadata.
 */
public class TestContainerScrubberMetrics {

  @Test
  public void testContainerMetaDataScrubberMetrics() {
    OzoneConfiguration conf = new OzoneConfiguration();
    ContainerScrubberConfiguration scrubConf =
        conf.getObject(ContainerScrubberConfiguration.class);
    scrubConf.setMetadataScanInterval(0);

    HddsVolume volume = Mockito.mock(HddsVolume.class);
    ContainerController controller = mockContainerController(volume);
    ContainerMetadataScanner scanner = new ContainerMetadataScanner(conf,
        controller, scrubConf.getMetadataScanInterval());
    scanner.runIteration();

    // All three containers get a metadata scan; only c3's metadata is bad.
    ContainerMetadataScrubberMetrics metrics = scanner.getMetrics();
    Assert.assertEquals(1, metrics.getNumScanIterations());
    Assert.assertEquals(3, metrics.getNumContainersScanned());
    Assert.assertEquals(1, metrics.getNumUnHealthyContainers());
  }

  @Test
  public void testContainerDataScrubberMetrics() {
    OzoneConfiguration conf = new OzoneConfiguration();
    ContainerScrubberConfiguration scrubConf =
        conf.getObject(ContainerScrubberConfiguration.class);

    HddsVolume volume = Mockito.mock(HddsVolume.class);
    ContainerController controller = mockContainerController(volume);
    ContainerDataScanner scanner = new ContainerDataScanner(conf, controller,
        volume, scrubConf.getBandwidthPerVolume());
    scanner.runIteration();

    // c3 opts out of data scanning, so only two containers are scanned and
    // only c2's data is bad.
    ContainerDataScrubberMetrics metrics = scanner.getMetrics();
    Assert.assertEquals(1, metrics.getNumScanIterations());
    Assert.assertEquals(2, metrics.getNumContainersScanned());
    Assert.assertEquals(1, metrics.getNumUnHealthyContainers());
  }

  /**
   * Builds a controller whose iterator yields three containers:
   * healthy, corrupt-data (id 101), and corrupt-metadata (id 102).
   */
  private ContainerController mockContainerController(HddsVolume volume) {
    // healthy container
    Container healthy = Mockito.mock(Container.class);
    Mockito.when(healthy.shouldScanData()).thenReturn(true);
    Mockito.when(healthy.scanMetaData()).thenReturn(true);
    Mockito.when(healthy.scanData(
        Mockito.any(DataTransferThrottler.class),
        Mockito.any(Canceler.class))).thenReturn(true);

    // unhealthy container (corrupt data)
    ContainerData corruptData = Mockito.mock(ContainerData.class);
    Mockito.when(corruptData.getContainerID()).thenReturn(101L);
    Container badData = Mockito.mock(Container.class);
    Mockito.when(badData.getContainerData()).thenReturn(corruptData);
    Mockito.when(badData.shouldScanData()).thenReturn(true);
    Mockito.when(badData.scanMetaData()).thenReturn(true);
    Mockito.when(badData.scanData(
        Mockito.any(DataTransferThrottler.class),
        Mockito.any(Canceler.class))).thenReturn(false);

    // unhealthy container (corrupt metadata)
    ContainerData corruptMeta = Mockito.mock(ContainerData.class);
    Mockito.when(corruptMeta.getContainerID()).thenReturn(102L);
    Container badMeta = Mockito.mock(Container.class);
    Mockito.when(badMeta.getContainerData()).thenReturn(corruptMeta);
    Mockito.when(badMeta.shouldScanData()).thenReturn(false);
    Mockito.when(badMeta.scanMetaData()).thenReturn(false);

    Iterator<Container> iter = Mockito.mock(Iterator.class);
    Mockito.when(iter.hasNext()).thenReturn(true, true, true, false);
    Mockito.when(iter.next()).thenReturn(healthy, badData, badMeta);

    ContainerController controller =
        Mockito.mock(ContainerController.class);
    Mockito.when(controller.getContainers(volume)).thenReturn(iter);
    Mockito.when(controller.getContainers()).thenReturn(iter);
    return controller;
  }
}

View File

@ -159,7 +159,7 @@ public class TestDataScrubber {
deleteDirectory(chunksDir);
Assert.assertFalse(chunksDir.exists());
ContainerMetadataScanner sb = new ContainerMetadataScanner(
ContainerMetadataScanner sb = new ContainerMetadataScanner(ozoneConfig,
oc.getController(), 0);
sb.scrub(c);