HBASE-17276 Only log stacktraces for exceptions once for updates in a batch

For large batches of updates, repeatedly logging WrongRegionExceptions,
FailedSanityCheckExceptions, and/or NoSuchColumnFamilyExceptions can
easily dominate the contents of a RegionServer log. After the first
occurrence of logging the full exception, switch to logging only the
message on the exception.
This commit is contained in:
Josh Elser 2016-12-07 13:11:16 -05:00 committed by Michael Stack
parent b3ae87bd7d
commit b554e05410
2 changed files with 134 additions and 5 deletions

View File

@ -550,6 +550,57 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi
} }
} }
/**
 * Remembers which kinds of exception have already been encountered while working through a
 * single batch. Each kind has a one-way flag: it starts unset, becomes set when the matching
 * {@code saw...} method is called, and is never cleared again. Not thread safe.
 */
static class ObservedExceptionsInBatch {
  // Boolean fields default to false, i.e. "nothing observed yet".
  private boolean wrongRegionObserved;
  private boolean sanityCheckFailureObserved;
  private boolean noSuchFamilyObserved;

  /**
   * @return {@code true} once {@link #sawWrongRegion()} has been called, i.e. a
   *         {@link WrongRegionException} has been observed.
   */
  boolean hasSeenWrongRegion() {
    return this.wrongRegionObserved;
  }

  /**
   * Marks that a {@link WrongRegionException} has been observed.
   */
  void sawWrongRegion() {
    this.wrongRegionObserved = true;
  }

  /**
   * @return {@code true} once {@link #sawFailedSanityCheck()} has been called, i.e. a
   *         {@link FailedSanityCheckException} has been observed.
   */
  boolean hasSeenFailedSanityCheck() {
    return this.sanityCheckFailureObserved;
  }

  /**
   * Marks that a {@link FailedSanityCheckException} has been observed.
   */
  void sawFailedSanityCheck() {
    this.sanityCheckFailureObserved = true;
  }

  /**
   * @return {@code true} once {@link #sawNoSuchFamily()} has been called, i.e. a
   *         {@link NoSuchColumnFamilyException} has been observed.
   */
  boolean hasSeenNoSuchFamily() {
    return this.noSuchFamilyObserved;
  }

  /**
   * Marks that a {@link NoSuchColumnFamilyException} has been observed.
   */
  void sawNoSuchFamily() {
    this.noSuchFamilyObserved = true;
  }
}
final WriteState writestate = new WriteState(); final WriteState writestate = new WriteState();
long memstoreFlushSize; long memstoreFlushSize;
@ -3107,12 +3158,13 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi
/** Keep track of the locks we hold so we can release them in finally clause */ /** Keep track of the locks we hold so we can release them in finally clause */
List<RowLock> acquiredRowLocks = Lists.newArrayListWithCapacity(batchOp.operations.length); List<RowLock> acquiredRowLocks = Lists.newArrayListWithCapacity(batchOp.operations.length);
MemstoreSize memstoreSize = new MemstoreSize(); MemstoreSize memstoreSize = new MemstoreSize();
final ObservedExceptionsInBatch observedExceptions = new ObservedExceptionsInBatch();
try { try {
// STEP 1. Try to acquire as many locks as we can, and ensure we acquire at least one. // STEP 1. Try to acquire as many locks as we can, and ensure we acquire at least one.
int numReadyToWrite = 0; int numReadyToWrite = 0;
long now = EnvironmentEdgeManager.currentTime(); long now = EnvironmentEdgeManager.currentTime();
while (lastIndexExclusive < batchOp.operations.length) { while (lastIndexExclusive < batchOp.operations.length) {
if (checkBatchOp(batchOp, lastIndexExclusive, familyMaps, now)) { if (checkBatchOp(batchOp, lastIndexExclusive, familyMaps, now, observedExceptions)) {
lastIndexExclusive++; lastIndexExclusive++;
continue; continue;
} }
@ -3477,7 +3529,8 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi
} }
private boolean checkBatchOp(BatchOperation<?> batchOp, final int lastIndexExclusive, private boolean checkBatchOp(BatchOperation<?> batchOp, final int lastIndexExclusive,
final Map<byte[], List<Cell>>[] familyMaps, final long now) final Map<byte[], List<Cell>>[] familyMaps, final long now,
final ObservedExceptionsInBatch observedExceptions)
throws IOException { throws IOException {
boolean skip = false; boolean skip = false;
// Skip anything that "ran" already // Skip anything that "ran" already
@ -3493,17 +3546,35 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi
try { try {
checkAndPrepareMutation(mutation, batchOp.isInReplay(), familyMap, now); checkAndPrepareMutation(mutation, batchOp.isInReplay(), familyMap, now);
} catch (NoSuchColumnFamilyException nscf) { } catch (NoSuchColumnFamilyException nscf) {
LOG.warn("No such column family in batch mutation", nscf); final String msg = "No such column family in batch mutation. ";
if (observedExceptions.hasSeenNoSuchFamily()) {
LOG.warn(msg + nscf.getMessage());
} else {
LOG.warn(msg, nscf);
observedExceptions.sawNoSuchFamily();
}
batchOp.retCodeDetails[lastIndexExclusive] = new OperationStatus( batchOp.retCodeDetails[lastIndexExclusive] = new OperationStatus(
OperationStatusCode.BAD_FAMILY, nscf.getMessage()); OperationStatusCode.BAD_FAMILY, nscf.getMessage());
skip = true; skip = true;
} catch (FailedSanityCheckException fsce) { } catch (FailedSanityCheckException fsce) {
LOG.warn("Batch Mutation did not pass sanity check", fsce); final String msg = "Batch Mutation did not pass sanity check. ";
if (observedExceptions.hasSeenFailedSanityCheck()) {
LOG.warn(msg + fsce.getMessage());
} else {
LOG.warn(msg, fsce);
observedExceptions.sawFailedSanityCheck();
}
batchOp.retCodeDetails[lastIndexExclusive] = new OperationStatus( batchOp.retCodeDetails[lastIndexExclusive] = new OperationStatus(
OperationStatusCode.SANITY_CHECK_FAILURE, fsce.getMessage()); OperationStatusCode.SANITY_CHECK_FAILURE, fsce.getMessage());
skip = true; skip = true;
} catch (WrongRegionException we) { } catch (WrongRegionException we) {
LOG.warn("Batch mutation had a row that does not belong to this region", we); final String msg = "Batch mutation had a row that does not belong to this region. ";
if (observedExceptions.hasSeenWrongRegion()) {
LOG.warn(msg + we.getMessage());
} else {
LOG.warn(msg, we);
observedExceptions.sawWrongRegion();
}
batchOp.retCodeDetails[lastIndexExclusive] = new OperationStatus( batchOp.retCodeDetails[lastIndexExclusive] = new OperationStatus(
OperationStatusCode.SANITY_CHECK_FAILURE, we.getMessage()); OperationStatusCode.SANITY_CHECK_FAILURE, we.getMessage());
skip = true; skip = true;

View File

@ -0,0 +1,58 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.regionserver;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import org.apache.hadoop.hbase.regionserver.HRegion.ObservedExceptionsInBatch;
import org.apache.hadoop.hbase.testclassification.SmallTests;
import org.junit.Before;
import org.junit.Test;
import org.junit.experimental.categories.Category;
/**
 * Unit tests for {@link ObservedExceptionsInBatch}: every flag must start out unset and must
 * read as set after the corresponding observation has been recorded.
 */
@Category(SmallTests.class)
public class TestObservedExceptionsInBatch {
  /** Tracker under test; a new instance is assigned in {@link #setup()}. */
  private ObservedExceptionsInBatch observedExceptions;

  /**
   * Builds a fresh tracker so that observations recorded by one test cannot leak into another.
   */
  @Before
  public void setup() {
    this.observedExceptions = new ObservedExceptionsInBatch();
  }

  /**
   * A newly constructed tracker reports that none of the exception kinds has been seen.
   */
  @Test
  public void testNoObservationsOnCreation() {
    final ObservedExceptionsInBatch fresh = this.observedExceptions;

    assertFalse(fresh.hasSeenFailedSanityCheck());
    assertFalse(fresh.hasSeenNoSuchFamily());
    assertFalse(fresh.hasSeenWrongRegion());
  }

  /**
   * Recording an observation of each kind makes the matching query return {@code true}.
   */
  @Test
  public void testObservedAfterRecording() {
    final ObservedExceptionsInBatch tracker = this.observedExceptions;

    // Each kind is recorded and checked in turn, one pair at a time.
    tracker.sawFailedSanityCheck();
    assertTrue(tracker.hasSeenFailedSanityCheck());

    tracker.sawNoSuchFamily();
    assertTrue(tracker.hasSeenNoSuchFamily());

    tracker.sawWrongRegion();
    assertTrue(tracker.hasSeenWrongRegion());
  }
}