From 6c1e484d36ffdcb654b0cf3ef0a74924ad120cb4 Mon Sep 17 00:00:00 2001 From: Michael Stack Date: Sat, 13 Apr 2013 17:07:14 +0000 Subject: [PATCH] HBASE-8096 [replication] NPE while replicating a log that is acquiring a new block from HDFS git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1467662 13f79535-47bb-0310-9956-ffa450edef68 --- .../regionserver/ReplicationHLogReaderManager.java | 6 +++++- .../replication/regionserver/ReplicationSource.java | 11 ++++++++--- .../hbase/replication/TestReplicationSmallTests.java | 2 +- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationHLogReaderManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationHLogReaderManager.java index 82b7023bec2..fe6924c91e0 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationHLogReaderManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationHLogReaderManager.java @@ -68,7 +68,11 @@ public class ReplicationHLogReaderManager { this.reader = HLogFactory.createReader(this.fs, path, this.conf); this.lastPath = path; } else { - this.reader.reset(); + try { + this.reader.reset(); + } catch (NullPointerException npe) { + throw new IOException("NPE resetting reader, likely HDFS-4380", npe); + } } return this.reader; } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java index cac32ae01fe..bde35466ac7 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java @@ -622,9 +622,14 @@ public class ReplicationSource extends Thread } catch (IOException ioe) { LOG.warn(peerClusterZnode + " Got: ", ioe); this.reader = null; - // TODO Need a better way to determinate if a file is really gone but - // TODO without scanning all logs dir - if (sleepMultiplier == this.maxRetriesMultiplier) { + if (ioe.getCause() instanceof NullPointerException) { + // Workaround for race condition in HDFS-4380 + // which throws a NPE if we open a file before any data node has the most recent block + // Just sleep and retry. Will require re-reading compressed HLogs for compressionContext. + LOG.warn("Got NPE opening reader, will retry."); + } else if (sleepMultiplier == this.maxRetriesMultiplier) { + // TODO Need a better way to determine if a file is really gone but + // TODO without scanning all logs dir LOG.warn("Waited too long for this file, considering dumping"); return !processEndOfFile(); } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationSmallTests.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationSmallTests.java index 374ea6848de..b9219668db4 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationSmallTests.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/replication/TestReplicationSmallTests.java @@ -428,7 +428,7 @@ public class TestReplicationSmallTests extends TestReplicationBase { Result[] res = scanner.next(NB_ROWS_IN_BIG_BATCH); scanner.close(); - assertEquals(NB_ROWS_IN_BATCH *10, res.length); + assertEquals(NB_ROWS_IN_BIG_BATCH, res.length); scan = new Scan();