From 4405698ee99fe26d0ac9317a2df96096f2731a7b Mon Sep 17 00:00:00 2001 From: Jonathan Hsieh Date: Wed, 13 Feb 2013 19:13:38 +0000 Subject: [PATCH] HBASE-7703 Eventually all online snapshots fail due to Timeout at same regionserver. Online snapshot attempts would fail due to a timeout because a row lock could not be obtained. Prior to this, a cancellation had occurred, and the cancelled task had likely grabbed the lock without releasing it properly. The fix here is to use a non-interrupting cancel (f.cancel(false)) instead of an interrupting cancel (f.cancel(true)) on failures. git-svn-id: https://svn.apache.org/repos/asf/hbase/branches/hbase-7290@1445866 13f79535-47bb-0310-9956-ffa450edef68 --- .../regionserver/snapshot/RegionServerSnapshotManager.java | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/snapshot/RegionServerSnapshotManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/snapshot/RegionServerSnapshotManager.java index 3e5238e7b28..1282585d52e 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/snapshot/RegionServerSnapshotManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/snapshot/RegionServerSnapshotManager.java @@ -347,7 +347,11 @@ public class RegionServerSnapshotManager { Collection> tasks = futures; LOG.debug("cancelling " + tasks.size() + " tasks for snapshot " + name); for (Future f: tasks) { - f.cancel(true); + // TODO Ideally we'd interrupt hbase threads when we cancel. However it seems that there + // are places in the HBase code where row/region locks are taken and not released in a + // finally block. Thus we cancel without interrupting. Cancellations will be slower to + // complete but we won't suffer from unreleased locks due to poor code discipline. + f.cancel(false); } // evict remaining tasks and futures from taskPool.