HDFS-3077. Quorum-based protocol for reading and writing edit logs. Contributed by Todd Lipcon based on initial work from Brandon Li and Hari Mankude.
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/HDFS-3077@1363596 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
23af0493e4
commit
74d4573a23
|
@ -64,6 +64,15 @@ public final class ExitUtil {
|
|||
return firstExitException;
|
||||
}
|
||||
|
||||
/**
|
||||
* Reset the tracking of process termination. This is for use
|
||||
* in unit tests where one test in the suite expects an exit
|
||||
* but others do not.
|
||||
*/
|
||||
public static void resetFirstExitException() {
|
||||
firstExitException = null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Terminate the current process. Note that terminate is the *only* method
|
||||
* that should be used to terminate the daemon processes.
|
||||
|
|
|
@ -0,0 +1,4 @@
|
|||
Changes for HDFS-3077 branch.
|
||||
This will be merged into the main CHANGES.txt when the branch is merged.
|
||||
|
||||
HDFS-3077. Quorum-based protocol for reading and writing edit logs. Contributed by Todd Lipcon based on initial work from Brandon Li and Hari Mankude.
|
|
@ -8,6 +8,9 @@
|
|||
<Match>
|
||||
<Package name="org.apache.hadoop.hdfs.server.namenode.ha.proto" />
|
||||
</Match>
|
||||
<Match>
|
||||
<Package name="org.apache.hadoop.hdfs.qjournal.protocol" />
|
||||
</Match>
|
||||
<Match>
|
||||
<Bug pattern="EI_EXPOSE_REP" />
|
||||
</Match>
|
||||
|
@ -273,4 +276,11 @@
|
|||
<Method name="quit" />
|
||||
<Bug pattern="DM_EXIT" />
|
||||
</Match>
|
||||
|
||||
<!-- More complex cleanup logic confuses findbugs -->
|
||||
<Match>
|
||||
<Class name="org.apache.hadoop.hdfs.qjournal.server.Journal" />
|
||||
<Method name="persistPaxosData" />
|
||||
<Bug pattern="OS_OPEN_STREAM" />
|
||||
</Match>
|
||||
</FindBugsFilter>
|
||||
|
|
|
@ -177,6 +177,25 @@ http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
|||
</sources>
|
||||
</configuration>
|
||||
</execution>
|
||||
<execution>
|
||||
<id>journal</id>
|
||||
<phase>generate-sources</phase>
|
||||
<goals>
|
||||
<goal>compile</goal>
|
||||
</goals>
|
||||
<configuration>
|
||||
<compile>false</compile>
|
||||
<workingDirectory>${project.build.directory}/generated-src/main/jsp</workingDirectory>
|
||||
<webFragmentFile>${project.build.directory}/journal-jsp-servlet-definitions.xml</webFragmentFile>
|
||||
<packageName>org.apache.hadoop.hdfs.server.journalservice</packageName>
|
||||
<sources>
|
||||
<directory>${basedir}/src/main/webapps/journal</directory>
|
||||
<includes>
|
||||
<include>*.jsp</include>
|
||||
</includes>
|
||||
</sources>
|
||||
</configuration>
|
||||
</execution>
|
||||
<execution>
|
||||
<id>datanode</id>
|
||||
<phase>generate-sources</phase>
|
||||
|
@ -284,6 +303,7 @@ http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
|||
<loadfile property="hdfs.servlet.definitions" srcFile="${project.build.directory}/hdfs-jsp-servlet-definitions.xml"/>
|
||||
<loadfile property="secondary.servlet.definitions" srcFile="${project.build.directory}/secondary-jsp-servlet-definitions.xml"/>
|
||||
<loadfile property="datanode.servlet.definitions" srcFile="${project.build.directory}/datanode-jsp-servlet-definitions.xml"/>
|
||||
<loadfile property="journal.servlet.definitions" srcFile="${project.build.directory}/journal-jsp-servlet-definitions.xml"/>
|
||||
<echoproperties destfile="${project.build.directory}/webxml.properties">
|
||||
<propertyset>
|
||||
<propertyref regex=".*.servlet.definitions"/>
|
||||
|
@ -299,6 +319,9 @@ http://maven.apache.org/xsd/maven-4.0.0.xsd">
|
|||
<copy file="${basedir}/src/main/webapps/proto-datanode-web.xml"
|
||||
tofile="${project.build.directory}/webapps/datanode/WEB-INF/web.xml"
|
||||
filtering="true"/>
|
||||
<copy file="${basedir}/src/main/webapps/proto-journal-web.xml"
|
||||
tofile="${project.build.directory}/webapps/journal/WEB-INF/web.xml"
|
||||
filtering="true"/>
|
||||
<copy toDir="${project.build.directory}/webapps">
|
||||
<fileset dir="${basedir}/src/main/webapps">
|
||||
<exclude name="**/*.jsp"/>
|
||||
|
|
|
@ -30,6 +30,7 @@ function print_usage(){
|
|||
echo " namenode -format format the DFS filesystem"
|
||||
echo " secondarynamenode run the DFS secondary namenode"
|
||||
echo " namenode run the DFS namenode"
|
||||
echo " journalnode run the DFS journalnode"
|
||||
echo " zkfc run the ZK Failover Controller daemon"
|
||||
echo " datanode run a DFS datanode"
|
||||
echo " dfsadmin run a DFS admin client"
|
||||
|
@ -90,6 +91,9 @@ elif [ "$COMMAND" = "datanode" ] ; then
|
|||
else
|
||||
HADOOP_OPTS="$HADOOP_OPTS -server $HADOOP_DATANODE_OPTS"
|
||||
fi
|
||||
elif [ "$COMMAND" = "journalnode" ] ; then
|
||||
CLASS='org.apache.hadoop.hdfs.qjournal.server.JournalNode'
|
||||
HADOOP_OPTS="$HADOOP_OPTS $HADOOP_JOURNALNODE_OPTS"
|
||||
elif [ "$COMMAND" = "dfs" ] ; then
|
||||
CLASS=org.apache.hadoop.fs.FsShell
|
||||
HADOOP_OPTS="$HADOOP_OPTS $HADOOP_CLIENT_OPTS"
|
||||
|
|
|
@ -366,4 +366,37 @@ public class DFSConfigKeys extends CommonConfigurationKeys {
|
|||
public static final boolean DFS_HA_AUTO_FAILOVER_ENABLED_DEFAULT = false;
|
||||
public static final String DFS_HA_ZKFC_PORT_KEY = "dfs.ha.zkfc.port";
|
||||
public static final int DFS_HA_ZKFC_PORT_DEFAULT = 8019;
|
||||
|
||||
// Journal-node related configs. These are read on the JN side.
|
||||
public static final String DFS_JOURNALNODE_EDITS_DIR_KEY = "dfs.journalnode.edits.dir";
|
||||
public static final String DFS_JOURNALNODE_EDITS_DIR_DEFAULT = "/tmp/hadoop/dfs/journalnode/";
|
||||
public static final String DFS_JOURNALNODE_RPC_ADDRESS_KEY = "dfs.journalnode.rpc-address";
|
||||
public static final int DFS_JOURNALNODE_RPC_PORT_DEFAULT = 8485;
|
||||
public static final String DFS_JOURNALNODE_RPC_ADDRESS_DEFAULT = "0.0.0.0:" + DFS_JOURNALNODE_RPC_PORT_DEFAULT;
|
||||
|
||||
public static final String DFS_JOURNALNODE_HTTP_ADDRESS_KEY = "dfs.journalnode.http-address";
|
||||
public static final int DFS_JOURNALNODE_HTTP_PORT_DEFAULT = 8480;
|
||||
public static final String DFS_JOURNALNODE_HTTP_ADDRESS_DEFAULT = "0.0.0.0:" + DFS_JOURNALNODE_HTTP_PORT_DEFAULT;
|
||||
|
||||
public static final String DFS_JOURNALNODE_KEYTAB_FILE_KEY = "dfs.journalnode.keytab.file";
|
||||
public static final String DFS_JOURNALNODE_USER_NAME_KEY = "dfs.journalnode.kerberos.principal";
|
||||
public static final String DFS_JOURNALNODE_INTERNAL_SPNEGO_USER_NAME_KEY = "dfs.journalnode.kerberos.internal.spnego.principal";
|
||||
|
||||
// Journal-node related configs for the client side.
|
||||
public static final String DFS_QJOURNAL_QUEUE_SIZE_LIMIT_KEY = "dfs.qjournal.queued-edits.limit.mb";
|
||||
public static final int DFS_QJOURNAL_QUEUE_SIZE_LIMIT_DEFAULT = 10;
|
||||
|
||||
// Quorum-journal timeouts for various operations. Unlikely to need
|
||||
// to be tweaked, but configurable just in case.
|
||||
public static final String DFS_QJOURNAL_START_SEGMENT_TIMEOUT_KEY = "dfs.qjournal.start-segment.timeout.ms";
|
||||
public static final String DFS_QJOURNAL_PREPARE_RECOVERY_TIMEOUT_KEY = "dfs.qjournal.prepare-recovery.timeout.ms";
|
||||
public static final String DFS_QJOURNAL_ACCEPT_RECOVERY_TIMEOUT_KEY = "dfs.qjournal.accept-recovery.timeout.ms";
|
||||
public static final String DFS_QJOURNAL_FINALIZE_SEGMENT_TIMEOUT_KEY = "dfs.qjournal.finalize-segment.timeout.ms";
|
||||
public static final String DFS_QJOURNAL_SELECT_INPUT_STREAMS_TIMEOUT_KEY = "dfs.qjournal.select-input-streams.timeout.ms";
|
||||
public static final int DFS_QJOURNAL_START_SEGMENT_TIMEOUT_DEFAULT = 20000;
|
||||
public static final int DFS_QJOURNAL_PREPARE_RECOVERY_TIMEOUT_DEFAULT = 20000;
|
||||
public static final int DFS_QJOURNAL_ACCEPT_RECOVERY_TIMEOUT_DEFAULT = 60000;
|
||||
public static final int DFS_QJOURNAL_FINALIZE_SEGMENT_TIMEOUT_DEFAULT = 20000;
|
||||
public static final int DFS_QJOURNAL_SELECT_INPUT_STREAMS_TIMEOUT_DEFAULT = 20000;
|
||||
}
|
||||
|
||||
|
|
|
@ -328,12 +328,15 @@ public class PBHelper {
|
|||
}
|
||||
|
||||
public static RemoteEditLogProto convert(RemoteEditLog log) {
|
||||
return RemoteEditLogProto.newBuilder().setEndTxId(log.getEndTxId())
|
||||
.setStartTxId(log.getStartTxId()).build();
|
||||
return RemoteEditLogProto.newBuilder()
|
||||
.setStartTxId(log.getStartTxId())
|
||||
.setEndTxId(log.getEndTxId())
|
||||
.setIsInProgress(log.isInProgress()).build();
|
||||
}
|
||||
|
||||
public static RemoteEditLog convert(RemoteEditLogProto l) {
|
||||
return new RemoteEditLog(l.getStartTxId(), l.getEndTxId());
|
||||
return new RemoteEditLog(l.getStartTxId(), l.getEndTxId(),
|
||||
l.getIsInProgress());
|
||||
}
|
||||
|
||||
public static RemoteEditLogManifestProto convert(
|
||||
|
|
|
@ -0,0 +1,114 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hdfs.qjournal.client;
|
||||
|
||||
import java.net.URL;
|
||||
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocol;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.GetEditLogManifestResponseProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.GetJournalStateResponseProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.NewEpochResponseProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.PrepareRecoveryResponseProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.SegmentStateProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.RequestInfo;
|
||||
|
||||
import com.google.common.util.concurrent.ListenableFuture;
|
||||
|
||||
/**
|
||||
* Interface for a remote log which is only communicated with asynchronously.
|
||||
* This is essentially a wrapper around {@link QJournalProtocol} with the key
|
||||
* differences being:
|
||||
*
|
||||
* <ul>
|
||||
* <li>All methods return {@link ListenableFuture}s instead of synchronous
|
||||
* objects.</li>
|
||||
* <li>The {@link RequestInfo} objects are created by the underlying
|
||||
* implementation.</li>
|
||||
* </ul>
|
||||
*/
|
||||
interface AsyncLogger {
|
||||
|
||||
/**
|
||||
* Send a batch of edits to the logger.
|
||||
* @param firstTxnId the first txid of the edits.
|
||||
* @param numTxns the number of transactions in the batch
|
||||
* @param data the actual data to be sent
|
||||
*/
|
||||
public ListenableFuture<Void> sendEdits(
|
||||
final long firstTxnId, final int numTxns, final byte[] data);
|
||||
|
||||
/**
|
||||
* Begin writing a new log segment.
|
||||
*
|
||||
* @param txid the first txid to be written to the new log
|
||||
*/
|
||||
public ListenableFuture<Void> startLogSegment(long txid);
|
||||
|
||||
/**
|
||||
* Finalize a log segment.
|
||||
*
|
||||
* @param startTxId the first txid that was written to the segment
|
||||
* @param endTxId the last txid that was written to the segment
|
||||
*/
|
||||
public ListenableFuture<Void> finalizeLogSegment(
|
||||
long startTxId, long endTxId);
|
||||
|
||||
/**
|
||||
* @return the state of the last epoch on the target node.
|
||||
*/
|
||||
public ListenableFuture<GetJournalStateResponseProto> getJournalState();
|
||||
|
||||
/**
|
||||
* Begin a new epoch on the target node.
|
||||
*/
|
||||
public ListenableFuture<NewEpochResponseProto> newEpoch(long epoch);
|
||||
|
||||
/**
|
||||
* Fetch the list of edit logs available on the remote node.
|
||||
*/
|
||||
public ListenableFuture<GetEditLogManifestResponseProto> getEditLogManifest(
|
||||
long fromTxnId);
|
||||
|
||||
/**
|
||||
* Prepare recovery. See the HDFS-3077 design document for details.
|
||||
*/
|
||||
public ListenableFuture<PrepareRecoveryResponseProto> prepareRecovery(
|
||||
long segmentTxId);
|
||||
|
||||
/**
|
||||
* Accept a recovery proposal. See the HDFS-3077 design document for details.
|
||||
*/
|
||||
public ListenableFuture<Void> acceptRecovery(SegmentStateProto log,
|
||||
URL fromUrl);
|
||||
|
||||
/**
|
||||
* Set the epoch number used for all future calls.
|
||||
*/
|
||||
public void setEpoch(long e);
|
||||
|
||||
/**
|
||||
* Build an HTTP URL to fetch the log segment with the given startTxId.
|
||||
*/
|
||||
public URL buildURLToFetchLogs(long segmentTxId);
|
||||
|
||||
/**
|
||||
* Tear down any resources, connections, etc. The proxy may not be used
|
||||
* after this point, and any in-flight RPCs may throw an exception.
|
||||
*/
|
||||
public void close();
|
||||
}
|
|
@ -0,0 +1,272 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hdfs.qjournal.client;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URL;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.TimeoutException;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.GetEditLogManifestResponseProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.GetJournalStateResponseProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.NewEpochResponseProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.PrepareRecoveryResponseProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.SegmentStateProto;
|
||||
import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
|
||||
|
||||
import com.google.common.annotations.VisibleForTesting;
|
||||
import com.google.common.base.Preconditions;
|
||||
import com.google.common.collect.ImmutableList;
|
||||
import com.google.common.collect.Maps;
|
||||
import com.google.common.util.concurrent.ListenableFuture;
|
||||
|
||||
/**
|
||||
* Wrapper around a set of Loggers, taking care of fanning out
|
||||
* calls to the underlying loggers and constructing corresponding
|
||||
* {@link QuorumCall} instances.
|
||||
*/
|
||||
class AsyncLoggerSet {
|
||||
static final Log LOG = LogFactory.getLog(AsyncLoggerSet.class);
|
||||
|
||||
private static final int NEWEPOCH_TIMEOUT_MS = 10000;
|
||||
|
||||
private final List<AsyncLogger> loggers;
|
||||
|
||||
private static final long INVALID_EPOCH = -1;
|
||||
private long myEpoch = INVALID_EPOCH;
|
||||
|
||||
public AsyncLoggerSet(List<AsyncLogger> loggers) {
|
||||
this.loggers = ImmutableList.copyOf(loggers);
|
||||
}
|
||||
|
||||
/**
|
||||
* Fence any previous writers, and obtain a unique epoch number
|
||||
* for write-access to the journal nodes.
|
||||
*
|
||||
* @param nsInfo the expected namespace information. If the remote
|
||||
* node does not match with this namespace, the request will be rejected.
|
||||
* @return the new, unique epoch number
|
||||
* @throws IOException
|
||||
*/
|
||||
Map<AsyncLogger, NewEpochResponseProto> createNewUniqueEpoch(
|
||||
NamespaceInfo nsInfo) throws IOException {
|
||||
Preconditions.checkState(myEpoch == -1,
|
||||
"epoch already created: epoch=" + myEpoch);
|
||||
|
||||
Map<AsyncLogger, GetJournalStateResponseProto> lastPromises =
|
||||
waitForWriteQuorum(getJournalState(), NEWEPOCH_TIMEOUT_MS);
|
||||
|
||||
long maxPromised = Long.MIN_VALUE;
|
||||
for (GetJournalStateResponseProto resp : lastPromises.values()) {
|
||||
maxPromised = Math.max(maxPromised, resp.getLastPromisedEpoch());
|
||||
}
|
||||
assert maxPromised >= 0;
|
||||
|
||||
long myEpoch = maxPromised + 1;
|
||||
Map<AsyncLogger, NewEpochResponseProto> resps =
|
||||
waitForWriteQuorum(newEpoch(nsInfo, myEpoch), NEWEPOCH_TIMEOUT_MS);
|
||||
this.myEpoch = myEpoch;
|
||||
setEpoch(myEpoch);
|
||||
return resps;
|
||||
}
|
||||
|
||||
private void setEpoch(long e) {
|
||||
for (AsyncLogger l : loggers) {
|
||||
l.setEpoch(e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the epoch number for this writer. This may only be called after
|
||||
* a successful call to {@link #createNewUniqueEpoch(NamespaceInfo)}.
|
||||
*/
|
||||
long getEpoch() {
|
||||
Preconditions.checkState(myEpoch != INVALID_EPOCH,
|
||||
"No epoch created yet");
|
||||
return myEpoch;
|
||||
}
|
||||
|
||||
/**
|
||||
* Close all of the underlying loggers.
|
||||
*/
|
||||
void close() {
|
||||
for (AsyncLogger logger : loggers) {
|
||||
logger.close();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Wait for a quorum of loggers to respond to the given call. If a quorum
|
||||
* can't be achieved, throws a QuorumException.
|
||||
* @param q the quorum call
|
||||
* @param timeoutMs the number of millis to wait
|
||||
* @return a map of successful results
|
||||
* @throws QuorumException if a quorum doesn't respond with success
|
||||
* @throws IOException if the thread is interrupted or times out
|
||||
*/
|
||||
<V> Map<AsyncLogger, V> waitForWriteQuorum(QuorumCall<AsyncLogger, V> q,
|
||||
int timeoutMs) throws IOException {
|
||||
int majority = getMajoritySize();
|
||||
try {
|
||||
q.waitFor(
|
||||
loggers.size(), // either all respond
|
||||
majority, // or we get a majority successes
|
||||
majority, // or we get a majority failures,
|
||||
timeoutMs);
|
||||
} catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
throw new IOException("Interrupted waiting for quorum results");
|
||||
} catch (TimeoutException e) {
|
||||
throw new IOException("Timed out waiting " + timeoutMs + " for write quorum");
|
||||
}
|
||||
|
||||
if (q.countSuccesses() < majority) {
|
||||
q.rethrowException("Got too many exceptions to achieve quorum size " +
|
||||
getMajorityString());
|
||||
}
|
||||
|
||||
return q.getResults();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the number of nodes which are required to obtain a quorum.
|
||||
*/
|
||||
int getMajoritySize() {
|
||||
return loggers.size() / 2 + 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return a textual description of the majority size (eg "2/3" or "3/5")
|
||||
*/
|
||||
String getMajorityString() {
|
||||
return getMajoritySize() + "/" + loggers.size();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the number of loggers behind this set
|
||||
*/
|
||||
int size() {
|
||||
return loggers.size();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the (mutable) list of loggers, for use in tests to
|
||||
* set up spies
|
||||
*/
|
||||
@VisibleForTesting
|
||||
List<AsyncLogger> getLoggersForTests() {
|
||||
return loggers;
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
// The rest of this file is simply boilerplate wrappers which fan-out the
|
||||
// various IPC calls to the underlying AsyncLoggers and wrap the result
|
||||
// in a QuorumCall.
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
private QuorumCall<AsyncLogger, GetJournalStateResponseProto> getJournalState() {
|
||||
Map<AsyncLogger, ListenableFuture<GetJournalStateResponseProto>> calls =
|
||||
Maps.newHashMap();
|
||||
for (AsyncLogger logger : loggers) {
|
||||
calls.put(logger, logger.getJournalState());
|
||||
}
|
||||
return QuorumCall.create(calls);
|
||||
}
|
||||
|
||||
private QuorumCall<AsyncLogger,NewEpochResponseProto> newEpoch(
|
||||
NamespaceInfo nsInfo,
|
||||
long epoch) {
|
||||
Map<AsyncLogger, ListenableFuture<NewEpochResponseProto>> calls =
|
||||
Maps.newHashMap();
|
||||
for (AsyncLogger logger : loggers) {
|
||||
calls.put(logger, logger.newEpoch(epoch));
|
||||
}
|
||||
return QuorumCall.create(calls);
|
||||
}
|
||||
|
||||
public QuorumCall<AsyncLogger, Void> startLogSegment(
|
||||
long txid) {
|
||||
Map<AsyncLogger, ListenableFuture<Void>> calls = Maps.newHashMap();
|
||||
for (AsyncLogger logger : loggers) {
|
||||
calls.put(logger, logger.startLogSegment(txid));
|
||||
}
|
||||
return QuorumCall.create(calls);
|
||||
}
|
||||
|
||||
public QuorumCall<AsyncLogger, Void> finalizeLogSegment(long firstTxId,
|
||||
long lastTxId) {
|
||||
Map<AsyncLogger, ListenableFuture<Void>> calls = Maps.newHashMap();
|
||||
for (AsyncLogger logger : loggers) {
|
||||
calls.put(logger, logger.finalizeLogSegment(firstTxId, lastTxId));
|
||||
}
|
||||
return QuorumCall.create(calls);
|
||||
}
|
||||
|
||||
public QuorumCall<AsyncLogger, Void> sendEdits(
|
||||
long firstTxnId, int numTxns, byte[] data) {
|
||||
Map<AsyncLogger, ListenableFuture<Void>> calls = Maps.newHashMap();
|
||||
for (AsyncLogger logger : loggers) {
|
||||
ListenableFuture<Void> future =
|
||||
logger.sendEdits(firstTxnId, numTxns, data);
|
||||
calls.put(logger, future);
|
||||
}
|
||||
return QuorumCall.create(calls);
|
||||
}
|
||||
|
||||
public QuorumCall<AsyncLogger,GetEditLogManifestResponseProto>
|
||||
getEditLogManifest(long fromTxnId) {
|
||||
Map<AsyncLogger,
|
||||
ListenableFuture<GetEditLogManifestResponseProto>> calls
|
||||
= Maps.newHashMap();
|
||||
for (AsyncLogger logger : loggers) {
|
||||
ListenableFuture<GetEditLogManifestResponseProto> future =
|
||||
logger.getEditLogManifest(fromTxnId);
|
||||
calls.put(logger, future);
|
||||
}
|
||||
return QuorumCall.create(calls);
|
||||
}
|
||||
|
||||
QuorumCall<AsyncLogger, PrepareRecoveryResponseProto>
|
||||
prepareRecovery(long segmentTxId) {
|
||||
Map<AsyncLogger,
|
||||
ListenableFuture<PrepareRecoveryResponseProto>> calls
|
||||
= Maps.newHashMap();
|
||||
for (AsyncLogger logger : loggers) {
|
||||
ListenableFuture<PrepareRecoveryResponseProto> future =
|
||||
logger.prepareRecovery(segmentTxId);
|
||||
calls.put(logger, future);
|
||||
}
|
||||
return QuorumCall.create(calls);
|
||||
}
|
||||
|
||||
QuorumCall<AsyncLogger,Void>
|
||||
acceptRecovery(SegmentStateProto log, URL fromURL) {
|
||||
Map<AsyncLogger, ListenableFuture<Void>> calls
|
||||
= Maps.newHashMap();
|
||||
for (AsyncLogger logger : loggers) {
|
||||
ListenableFuture<Void> future =
|
||||
logger.acceptRecovery(log, fromURL);
|
||||
calls.put(logger, future);
|
||||
}
|
||||
return QuorumCall.create(calls);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,330 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hdfs.qjournal.client;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.InetSocketAddress;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.util.concurrent.Callable;
|
||||
import java.util.concurrent.ExecutionException;
|
||||
import java.util.concurrent.Executors;
|
||||
|
||||
import org.apache.hadoop.classification.InterfaceAudience;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.hdfs.DFSConfigKeys;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocol;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.GetEditLogManifestResponseProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.GetJournalStateResponseProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.NewEpochResponseProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.PrepareRecoveryResponseProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.SegmentStateProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.RequestInfo;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocolPB.QJournalProtocolPB;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocolPB.QJournalProtocolTranslatorPB;
|
||||
import org.apache.hadoop.hdfs.qjournal.server.GetJournalEditServlet;
|
||||
import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
|
||||
import org.apache.hadoop.ipc.ProtobufRpcEngine;
|
||||
import org.apache.hadoop.ipc.RPC;
|
||||
|
||||
import com.google.common.annotations.VisibleForTesting;
|
||||
import com.google.common.base.Preconditions;
|
||||
import com.google.common.util.concurrent.FutureCallback;
|
||||
import com.google.common.util.concurrent.Futures;
|
||||
import com.google.common.util.concurrent.ListenableFuture;
|
||||
import com.google.common.util.concurrent.ListeningExecutorService;
|
||||
import com.google.common.util.concurrent.MoreExecutors;
|
||||
import com.google.common.util.concurrent.ThreadFactoryBuilder;
|
||||
import com.google.common.util.concurrent.UncaughtExceptionHandlers;
|
||||
|
||||
/**
|
||||
* Channel to a remote JournalNode using Hadoop IPC.
|
||||
* All of the calls are run on a separate thread, and return
|
||||
* {@link ListenableFuture} instances to wait for their result.
|
||||
* This allows calls to be bound together using the {@link QuorumCall}
|
||||
* class.
|
||||
*/
|
||||
@InterfaceAudience.Private
|
||||
public class IPCLoggerChannel implements AsyncLogger {
|
||||
|
||||
private final Configuration conf;
|
||||
private final InetSocketAddress addr;
|
||||
private QJournalProtocol proxy;
|
||||
|
||||
private final ListeningExecutorService executor;
|
||||
private long ipcSerial = 0;
|
||||
private long epoch = -1;
|
||||
private final String journalId;
|
||||
private final NamespaceInfo nsInfo;
|
||||
private int httpPort = -1;
|
||||
|
||||
/**
|
||||
* The number of bytes of edits data still in the queue.
|
||||
*/
|
||||
private int queuedEditsSizeBytes = 0;
|
||||
|
||||
/**
|
||||
* The maximum number of bytes that can be pending in the queue.
|
||||
* This keeps the writer from hitting OOME if one of the loggers
|
||||
* starts responding really slowly. Eventually, the queue
|
||||
* overflows and it starts to treat the logger as having errored.
|
||||
*/
|
||||
private final int queueSizeLimitBytes;
|
||||
|
||||
public IPCLoggerChannel(Configuration conf,
|
||||
NamespaceInfo nsInfo,
|
||||
String journalId,
|
||||
InetSocketAddress addr) {
|
||||
this.conf = conf;
|
||||
this.nsInfo = nsInfo;
|
||||
this.journalId = journalId;
|
||||
this.addr = addr;
|
||||
|
||||
this.queueSizeLimitBytes = 1024 * 1024 * conf.getInt(
|
||||
DFSConfigKeys.DFS_QJOURNAL_QUEUE_SIZE_LIMIT_KEY,
|
||||
DFSConfigKeys.DFS_QJOURNAL_QUEUE_SIZE_LIMIT_DEFAULT);
|
||||
|
||||
executor = MoreExecutors.listeningDecorator(
|
||||
Executors.newSingleThreadExecutor(
|
||||
new ThreadFactoryBuilder()
|
||||
.setDaemon(true)
|
||||
.setNameFormat("Logger channel to " + addr)
|
||||
.setUncaughtExceptionHandler(
|
||||
UncaughtExceptionHandlers.systemExit())
|
||||
.build()));
|
||||
}
|
||||
|
||||
@Override
|
||||
public synchronized void setEpoch(long epoch) {
|
||||
this.epoch = epoch;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() {
|
||||
// No more tasks may be submitted after this point.
|
||||
executor.shutdown();
|
||||
if (proxy != null) {
|
||||
// TODO: this can hang for quite some time if the client
|
||||
// is currently in the middle of a call to a downed JN.
|
||||
// We should instead do this asynchronously, and just stop
|
||||
// making any more calls after this point (eg clear the queue)
|
||||
RPC.stopProxy(proxy);
|
||||
}
|
||||
}
|
||||
|
||||
protected QJournalProtocol getProxy() throws IOException {
|
||||
if (proxy != null) return proxy;
|
||||
|
||||
RPC.setProtocolEngine(conf,
|
||||
QJournalProtocolPB.class, ProtobufRpcEngine.class);
|
||||
QJournalProtocolPB pbproxy = RPC.getProxy(
|
||||
QJournalProtocolPB.class,
|
||||
RPC.getProtocolVersion(QJournalProtocolPB.class),
|
||||
addr, conf);
|
||||
proxy = new QJournalProtocolTranslatorPB(pbproxy);
|
||||
return proxy;
|
||||
}
|
||||
|
||||
@Override
|
||||
public URL buildURLToFetchLogs(long segmentTxId) {
|
||||
Preconditions.checkArgument(segmentTxId > 0,
|
||||
"Invalid segment: %s", segmentTxId);
|
||||
Preconditions.checkState(httpPort != -1,
|
||||
"HTTP port not set yet");
|
||||
|
||||
try {
|
||||
String path = GetJournalEditServlet.buildPath(
|
||||
journalId, segmentTxId, nsInfo);
|
||||
return new URL("http", addr.getHostName(), httpPort, path.toString());
|
||||
} catch (MalformedURLException e) {
|
||||
// should never get here.
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private synchronized RequestInfo createReqInfo() {
|
||||
Preconditions.checkState(epoch > 0, "bad epoch: " + epoch);
|
||||
return new RequestInfo(journalId, epoch, ipcSerial++);
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
synchronized long getNextIpcSerial() {
|
||||
return ipcSerial;
|
||||
}
|
||||
|
||||
public synchronized int getQueuedEditsSize() {
|
||||
return queuedEditsSizeBytes;
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
void waitForAllPendingCalls() throws InterruptedException {
|
||||
try {
|
||||
executor.submit(new Runnable() {
|
||||
@Override
|
||||
public void run() {
|
||||
}
|
||||
}).get();
|
||||
} catch (ExecutionException e) {
|
||||
// This can't happen!
|
||||
throw new AssertionError(e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public ListenableFuture<GetJournalStateResponseProto> getJournalState() {
|
||||
return executor.submit(new Callable<GetJournalStateResponseProto>() {
|
||||
@Override
|
||||
public GetJournalStateResponseProto call() throws IOException {
|
||||
GetJournalStateResponseProto ret =
|
||||
getProxy().getJournalState(journalId);
|
||||
httpPort = ret.getHttpPort();
|
||||
return ret;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
@Override
|
||||
public ListenableFuture<NewEpochResponseProto> newEpoch(
|
||||
final long epoch) {
|
||||
return executor.submit(new Callable<NewEpochResponseProto>() {
|
||||
@Override
|
||||
public NewEpochResponseProto call() throws IOException {
|
||||
return getProxy().newEpoch(journalId, nsInfo, epoch);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
@Override
|
||||
public ListenableFuture<Void> sendEdits(
|
||||
final long firstTxnId, final int numTxns, final byte[] data) {
|
||||
try {
|
||||
reserveQueueSpace(data.length);
|
||||
} catch (LoggerTooFarBehindException e) {
|
||||
return Futures.immediateFailedFuture(e);
|
||||
}
|
||||
ListenableFuture<Void> ret = null;
|
||||
try {
|
||||
ret = executor.submit(new Callable<Void>() {
|
||||
@Override
|
||||
public Void call() throws IOException {
|
||||
getProxy().journal(createReqInfo(), firstTxnId, numTxns, data);
|
||||
return null;
|
||||
}
|
||||
});
|
||||
} finally {
|
||||
if (ret == null) {
|
||||
// it didn't successfully get submitted,
|
||||
// so adjust the queue size back down.
|
||||
unreserveQueueSpace(data.length);
|
||||
} else {
|
||||
// It was submitted to the queue, so adjust the length
|
||||
// once the call completes, regardless of whether it
|
||||
// succeeds or fails.
|
||||
Futures.addCallback(ret, new FutureCallback<Void>() {
|
||||
@Override
|
||||
public void onFailure(Throwable t) {
|
||||
unreserveQueueSpace(data.length);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onSuccess(Void t) {
|
||||
unreserveQueueSpace(data.length);
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
private synchronized void reserveQueueSpace(int size)
|
||||
throws LoggerTooFarBehindException {
|
||||
Preconditions.checkArgument(size >= 0);
|
||||
if (queuedEditsSizeBytes + size > queueSizeLimitBytes &&
|
||||
queuedEditsSizeBytes > 0) {
|
||||
throw new LoggerTooFarBehindException();
|
||||
}
|
||||
queuedEditsSizeBytes += size;
|
||||
}
|
||||
|
||||
private synchronized void unreserveQueueSpace(int size) {
|
||||
Preconditions.checkArgument(size >= 0);
|
||||
queuedEditsSizeBytes -= size;
|
||||
}
|
||||
|
||||
@Override
|
||||
public ListenableFuture<Void> startLogSegment(final long txid) {
|
||||
return executor.submit(new Callable<Void>() {
|
||||
@Override
|
||||
public Void call() throws IOException {
|
||||
getProxy().startLogSegment(createReqInfo(), txid);
|
||||
return null;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
@Override
|
||||
public ListenableFuture<Void> finalizeLogSegment(
|
||||
final long startTxId, final long endTxId) {
|
||||
return executor.submit(new Callable<Void>() {
|
||||
@Override
|
||||
public Void call() throws IOException {
|
||||
getProxy().finalizeLogSegment(createReqInfo(), startTxId, endTxId);
|
||||
return null;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
@Override
|
||||
public ListenableFuture<GetEditLogManifestResponseProto> getEditLogManifest(
|
||||
final long fromTxnId) {
|
||||
return executor.submit(new Callable<GetEditLogManifestResponseProto>() {
|
||||
@Override
|
||||
public GetEditLogManifestResponseProto call() throws IOException {
|
||||
return getProxy().getEditLogManifest(journalId, fromTxnId);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
@Override
|
||||
public ListenableFuture<PrepareRecoveryResponseProto> prepareRecovery(
|
||||
final long segmentTxId) {
|
||||
return executor.submit(new Callable<PrepareRecoveryResponseProto>() {
|
||||
@Override
|
||||
public PrepareRecoveryResponseProto call() throws IOException {
|
||||
return getProxy().prepareRecovery(createReqInfo(), segmentTxId);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
@Override
|
||||
public ListenableFuture<Void> acceptRecovery(
|
||||
final SegmentStateProto log, final URL url) {
|
||||
return executor.submit(new Callable<Void>() {
|
||||
@Override
|
||||
public Void call() throws IOException {
|
||||
getProxy().acceptRecovery(createReqInfo(), log, url);
|
||||
return null;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "Channel to journal node " + addr;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,25 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hdfs.qjournal.client;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
class LoggerTooFarBehindException extends IOException {
|
||||
private static final long serialVersionUID = 1L;
|
||||
|
||||
}
|
|
@ -0,0 +1,166 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hdfs.qjournal.client;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.Map.Entry;
|
||||
import java.util.concurrent.TimeoutException;
|
||||
|
||||
import org.apache.hadoop.util.Time;
|
||||
|
||||
import com.google.common.base.Preconditions;
|
||||
import com.google.common.collect.Maps;
|
||||
import com.google.common.util.concurrent.FutureCallback;
|
||||
import com.google.common.util.concurrent.Futures;
|
||||
import com.google.common.util.concurrent.ListenableFuture;
|
||||
import com.google.protobuf.Message;
|
||||
import com.google.protobuf.TextFormat;
|
||||
|
||||
/**
|
||||
* Represents a set of calls for which a quorum of results is needed.
|
||||
* @param <KEY> a key used to identify each of the outgoing calls
|
||||
* @param <RESULT> the type of the call result
|
||||
*/
|
||||
class QuorumCall<KEY, RESULT> {
|
||||
private final Map<KEY, RESULT> successes = Maps.newHashMap();
|
||||
private final Map<KEY, Throwable> exceptions = Maps.newHashMap();
|
||||
|
||||
static <KEY, RESULT> QuorumCall<KEY, RESULT> create(
|
||||
Map<KEY, ? extends ListenableFuture<RESULT>> calls) {
|
||||
final QuorumCall<KEY, RESULT> qr = new QuorumCall<KEY, RESULT>();
|
||||
for (final Entry<KEY, ? extends ListenableFuture<RESULT>> e : calls.entrySet()) {
|
||||
Preconditions.checkArgument(e.getValue() != null,
|
||||
"null future for key: " + e.getKey());
|
||||
Futures.addCallback(e.getValue(), new FutureCallback<RESULT>() {
|
||||
@Override
|
||||
public void onFailure(Throwable t) {
|
||||
qr.addException(e.getKey(), t);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void onSuccess(RESULT res) {
|
||||
qr.addResult(e.getKey(), res);
|
||||
}
|
||||
});
|
||||
}
|
||||
return qr;
|
||||
}
|
||||
|
||||
private QuorumCall() {
|
||||
// Only instantiated from factory method above
|
||||
}
|
||||
|
||||
/**
|
||||
* Wait for the quorum to achieve a certain number of responses.
|
||||
*
|
||||
* Note that, even after this returns, more responses may arrive,
|
||||
* causing the return value of other methods in this class to change.
|
||||
*
|
||||
* @param minResponses return as soon as this many responses have been
|
||||
* received, regardless of whether they are successes or exceptions
|
||||
* @param minSuccesses return as soon as this many successful (non-exception)
|
||||
* responses have been received
|
||||
* @param maxExceptions return as soon as this many exception responses
|
||||
* have been received. Pass 0 to return immediately if any exception is
|
||||
* received.
|
||||
* @param millis the number of milliseconds to wait for
|
||||
* @throws InterruptedException if the thread is interrupted while waiting
|
||||
* @throws TimeoutException if the specified timeout elapses before
|
||||
* achieving the desired conditions
|
||||
*/
|
||||
public synchronized void waitFor(
|
||||
int minResponses, int minSuccesses, int maxExceptions,
|
||||
int millis)
|
||||
throws InterruptedException, TimeoutException {
|
||||
long et = Time.monotonicNow() + millis;
|
||||
while (true) {
|
||||
if (minResponses > 0 && countResponses() >= minResponses) return;
|
||||
if (minSuccesses > 0 && countSuccesses() >= minSuccesses) return;
|
||||
if (maxExceptions >= 0 && countExceptions() > maxExceptions) return;
|
||||
long rem = et - Time.monotonicNow();
|
||||
if (rem <= 0) {
|
||||
throw new TimeoutException();
|
||||
}
|
||||
wait(rem);
|
||||
}
|
||||
}
|
||||
|
||||
private synchronized void addResult(KEY k, RESULT res) {
|
||||
successes.put(k, res);
|
||||
notifyAll();
|
||||
}
|
||||
|
||||
private synchronized void addException(KEY k, Throwable t) {
|
||||
exceptions.put(k, t);
|
||||
notifyAll();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the total number of calls for which a response has been received,
|
||||
* regardless of whether it threw an exception or returned a successful
|
||||
* result.
|
||||
*/
|
||||
public synchronized int countResponses() {
|
||||
return successes.size() + exceptions.size();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the number of calls for which a non-exception response has been
|
||||
* received.
|
||||
*/
|
||||
public synchronized int countSuccesses() {
|
||||
return successes.size();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the number of calls for which an exception response has been
|
||||
* received.
|
||||
*/
|
||||
public synchronized int countExceptions() {
|
||||
return exceptions.size();
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the map of successful responses. A copy is made such that this
|
||||
* map will not be further mutated, even if further results arrive for the
|
||||
* quorum.
|
||||
*/
|
||||
public synchronized Map<KEY, RESULT> getResults() {
|
||||
return Maps.newHashMap(successes);
|
||||
}
|
||||
|
||||
public synchronized void rethrowException(String msg) throws QuorumException {
|
||||
Preconditions.checkState(!exceptions.isEmpty());
|
||||
throw QuorumException.create(msg, successes, exceptions);
|
||||
}
|
||||
|
||||
public static <K> String mapToString(
|
||||
Map<K, ? extends Message> map) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
boolean first = true;
|
||||
for (Map.Entry<K, ? extends Message> e : map.entrySet()) {
|
||||
if (!first) {
|
||||
sb.append("\n");
|
||||
}
|
||||
first = false;
|
||||
sb.append(e.getKey()).append(": ")
|
||||
.append(TextFormat.shortDebugString(e.getValue()));
|
||||
}
|
||||
return sb.toString();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,85 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hdfs.qjournal.client;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.hadoop.util.StringUtils;
|
||||
|
||||
import com.google.common.base.Joiner;
|
||||
import com.google.common.base.Preconditions;
|
||||
|
||||
/**
|
||||
* Exception thrown when too many exceptions occur while gathering
|
||||
* responses to a quorum call.
|
||||
*/
|
||||
class QuorumException extends IOException {
|
||||
|
||||
/**
|
||||
* Create a QuorumException instance with a descriptive message detailing
|
||||
* the underlying exceptions, as well as any successful responses which
|
||||
* were returned.
|
||||
* @param <K> the keys for the quorum calls
|
||||
* @param <V> the success response type
|
||||
* @param successes any successful responses returned
|
||||
* @param exceptions the exceptions returned
|
||||
*/
|
||||
public static <K, V> QuorumException create(
|
||||
String simpleMsg,
|
||||
Map<K, V> successes,
|
||||
Map<K, Throwable> exceptions) {
|
||||
Preconditions.checkArgument(!exceptions.isEmpty(),
|
||||
"Must pass exceptions");
|
||||
|
||||
StringBuilder msg = new StringBuilder();
|
||||
msg.append(simpleMsg).append(". ");
|
||||
if (!successes.isEmpty()) {
|
||||
msg.append(successes.size()).append(" successful responses:\n");
|
||||
Joiner.on("\n")
|
||||
.useForNull("null")
|
||||
.withKeyValueSeparator(": ")
|
||||
.appendTo(msg, successes);
|
||||
msg.append("\n");
|
||||
}
|
||||
msg.append(exceptions.size() + " exceptions thrown:\n");
|
||||
boolean isFirst = true;
|
||||
|
||||
for (Map.Entry<K, Throwable> e : exceptions.entrySet()) {
|
||||
if (!isFirst) {
|
||||
msg.append("\n");
|
||||
}
|
||||
isFirst = false;
|
||||
|
||||
msg.append(e.getKey()).append(": ");
|
||||
|
||||
if (e.getValue() instanceof RuntimeException) {
|
||||
msg.append(StringUtils.stringifyException(e.getValue()));
|
||||
} else {
|
||||
msg.append(e.getValue().getLocalizedMessage());
|
||||
}
|
||||
}
|
||||
return new QuorumException(msg.toString());
|
||||
}
|
||||
|
||||
private QuorumException(String msg) {
|
||||
super(msg);
|
||||
}
|
||||
|
||||
private static final long serialVersionUID = 1L;
|
||||
}
|
|
@ -0,0 +1,382 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hdfs.qjournal.client;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.InetSocketAddress;
|
||||
import java.net.URI;
|
||||
import java.net.URL;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Map.Entry;
|
||||
import java.util.PriorityQueue;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.classification.InterfaceAudience;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.hdfs.DFSConfigKeys;
|
||||
import org.apache.hadoop.hdfs.protocolPB.PBHelper;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.GetEditLogManifestResponseProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.NewEpochResponseProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.PrepareRecoveryResponseProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.SegmentStateProto;
|
||||
import org.apache.hadoop.hdfs.server.namenode.EditLogFileInputStream;
|
||||
import org.apache.hadoop.hdfs.server.namenode.EditLogInputStream;
|
||||
import org.apache.hadoop.hdfs.server.namenode.EditLogOutputStream;
|
||||
import org.apache.hadoop.hdfs.server.namenode.JournalManager;
|
||||
import org.apache.hadoop.hdfs.server.namenode.JournalSet;
|
||||
import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
|
||||
import org.apache.hadoop.hdfs.server.protocol.RemoteEditLog;
|
||||
import org.apache.hadoop.hdfs.server.protocol.RemoteEditLogManifest;
|
||||
import org.apache.hadoop.net.NetUtils;
|
||||
import org.apache.hadoop.util.StringUtils;
|
||||
|
||||
import com.google.common.annotations.VisibleForTesting;
|
||||
import com.google.common.base.Preconditions;
|
||||
import com.google.common.collect.ComparisonChain;
|
||||
import com.google.common.collect.Lists;
|
||||
|
||||
/**
|
||||
* A JournalManager that writes to a set of remote JournalNodes,
|
||||
* requiring a quorum of nodes to ack each write.
|
||||
*/
|
||||
@InterfaceAudience.Private
|
||||
public class QuorumJournalManager implements JournalManager {
|
||||
static final Log LOG = LogFactory.getLog(QuorumJournalManager.class);
|
||||
|
||||
// Timeouts for which the QJM will wait for each of the following actions.
|
||||
private final int startSegmentTimeoutMs;
|
||||
private final int prepareRecoveryTimeoutMs;
|
||||
private final int acceptRecoveryTimeoutMs;
|
||||
private final int finalizeSegmentTimeoutMs;
|
||||
private final int selectInputStreamsTimeoutMs;
|
||||
|
||||
private final Configuration conf;
|
||||
private final URI uri;
|
||||
private final NamespaceInfo nsInfo;
|
||||
private boolean isActiveWriter;
|
||||
|
||||
private final AsyncLoggerSet loggers;
|
||||
|
||||
public QuorumJournalManager(Configuration conf,
|
||||
URI uri, NamespaceInfo nsInfo) throws IOException {
|
||||
Preconditions.checkArgument(conf != null, "must be configured");
|
||||
|
||||
this.conf = conf;
|
||||
this.uri = uri;
|
||||
this.nsInfo = nsInfo;
|
||||
this.loggers = new AsyncLoggerSet(createLoggers());
|
||||
|
||||
// Configure timeouts.
|
||||
this.startSegmentTimeoutMs = conf.getInt(
|
||||
DFSConfigKeys.DFS_QJOURNAL_START_SEGMENT_TIMEOUT_KEY,
|
||||
DFSConfigKeys.DFS_QJOURNAL_START_SEGMENT_TIMEOUT_DEFAULT);
|
||||
this.prepareRecoveryTimeoutMs = conf.getInt(
|
||||
DFSConfigKeys.DFS_QJOURNAL_PREPARE_RECOVERY_TIMEOUT_KEY,
|
||||
DFSConfigKeys.DFS_QJOURNAL_PREPARE_RECOVERY_TIMEOUT_DEFAULT);
|
||||
this.acceptRecoveryTimeoutMs = conf.getInt(
|
||||
DFSConfigKeys.DFS_QJOURNAL_ACCEPT_RECOVERY_TIMEOUT_KEY,
|
||||
DFSConfigKeys.DFS_QJOURNAL_ACCEPT_RECOVERY_TIMEOUT_DEFAULT);
|
||||
this.finalizeSegmentTimeoutMs = conf.getInt(
|
||||
DFSConfigKeys.DFS_QJOURNAL_FINALIZE_SEGMENT_TIMEOUT_KEY,
|
||||
DFSConfigKeys.DFS_QJOURNAL_FINALIZE_SEGMENT_TIMEOUT_DEFAULT);
|
||||
this.selectInputStreamsTimeoutMs = conf.getInt(
|
||||
DFSConfigKeys.DFS_QJOURNAL_SELECT_INPUT_STREAMS_TIMEOUT_KEY,
|
||||
DFSConfigKeys.DFS_QJOURNAL_SELECT_INPUT_STREAMS_TIMEOUT_DEFAULT);
|
||||
|
||||
}
|
||||
|
||||
static String parseJournalId(URI uri) {
|
||||
String path = uri.getPath();
|
||||
Preconditions.checkArgument(path != null && !path.isEmpty(),
|
||||
"Bad URI '%s': must identify journal in path component",
|
||||
uri);
|
||||
String journalId = path.substring(1);
|
||||
checkJournalId(journalId);
|
||||
return journalId;
|
||||
}
|
||||
|
||||
public static void checkJournalId(String jid) {
|
||||
Preconditions.checkArgument(jid != null &&
|
||||
!jid.isEmpty() &&
|
||||
!jid.contains("/") &&
|
||||
!jid.startsWith("."),
|
||||
"bad journal id: " + jid);
|
||||
}
|
||||
|
||||
/**
|
||||
* Run recovery/synchronization for a specific segment.
|
||||
* Postconditions:
|
||||
* <ul>
|
||||
* <li>This segment will be finalized on a majority
|
||||
* of nodes.</li>
|
||||
* <li>All nodes which contain the finalized segment will
|
||||
* agree on the length.</li>
|
||||
* </ul>
|
||||
*
|
||||
* @param segmentTxId the starting txid of the segment
|
||||
* @throws IOException
|
||||
*/
|
||||
private void recoverUnclosedSegment(long segmentTxId) throws IOException {
|
||||
Preconditions.checkArgument(segmentTxId > 0);
|
||||
LOG.info("Beginning recovery of unclosed segment starting at txid " +
|
||||
segmentTxId);
|
||||
|
||||
// Step 1. Prepare recovery
|
||||
QuorumCall<AsyncLogger,PrepareRecoveryResponseProto> prepare =
|
||||
loggers.prepareRecovery(segmentTxId);
|
||||
Map<AsyncLogger, PrepareRecoveryResponseProto> prepareResponses=
|
||||
loggers.waitForWriteQuorum(prepare, prepareRecoveryTimeoutMs);
|
||||
LOG.info("Recovery prepare phase complete. Responses: " +
|
||||
QuorumCall.mapToString(prepareResponses));
|
||||
|
||||
// Determine the logger who either:
|
||||
// a) Has already accepted a previous proposal that's higher than any
|
||||
// other
|
||||
//
|
||||
// OR, if no such logger exists:
|
||||
//
|
||||
// b) Has the longest log starting at this transaction ID
|
||||
|
||||
// TODO: we should collect any "ties" and pass the URL for all of them
|
||||
// when syncing, so we can tolerate failure during recovery better.
|
||||
Entry<AsyncLogger, PrepareRecoveryResponseProto> bestEntry = Collections.max(
|
||||
prepareResponses.entrySet(), RECOVERY_COMPARATOR);
|
||||
AsyncLogger bestLogger = bestEntry.getKey();
|
||||
PrepareRecoveryResponseProto bestResponse = bestEntry.getValue();
|
||||
|
||||
// Log the above decision, check invariants.
|
||||
if (bestResponse.hasAcceptedInEpoch()) {
|
||||
LOG.info("Using already-accepted recovery for segment " +
|
||||
"starting at txid " + segmentTxId + ": " +
|
||||
bestEntry);
|
||||
} else if (bestResponse.hasSegmentState()) {
|
||||
LOG.info("Using longest log: " + bestEntry);
|
||||
} else {
|
||||
// TODO: can we get here? what about the following case:
|
||||
// - 3 JNs, JN1, JN2, JN3
|
||||
// - writer starts segment 101 on JN1, then crashes
|
||||
// - during newEpoch(), we saw the segment on JN1 and decide to recover segment 101
|
||||
// - during prepare(), JN1 has actually crashed, and we only talk to JN2 and JN3,
|
||||
// neither of which has any entry for this log.
|
||||
// Write a test case.
|
||||
throw new AssertionError("None of the responses " +
|
||||
"had a log to recover: " + QuorumCall.mapToString(prepareResponses));
|
||||
}
|
||||
|
||||
|
||||
// TODO: check that md5s match up between any "tied" logs
|
||||
|
||||
SegmentStateProto logToSync = bestResponse.getSegmentState();
|
||||
assert segmentTxId == logToSync.getStartTxId();
|
||||
|
||||
URL syncFromUrl = bestLogger.buildURLToFetchLogs(segmentTxId);
|
||||
|
||||
QuorumCall<AsyncLogger,Void> accept = loggers.acceptRecovery(logToSync, syncFromUrl);
|
||||
loggers.waitForWriteQuorum(accept, acceptRecoveryTimeoutMs);
|
||||
|
||||
// TODO:
|
||||
// we should only try to finalize loggers who successfully synced above
|
||||
// eg if a logger was down, we don't want to send the finalize request.
|
||||
// write a test for this!
|
||||
|
||||
QuorumCall<AsyncLogger, Void> finalize =
|
||||
loggers.finalizeLogSegment(logToSync.getStartTxId(), logToSync.getEndTxId());
|
||||
loggers.waitForWriteQuorum(finalize, finalizeSegmentTimeoutMs);
|
||||
}
|
||||
|
||||
private static final Comparator<Entry<AsyncLogger, PrepareRecoveryResponseProto>> RECOVERY_COMPARATOR =
|
||||
new Comparator<Entry<AsyncLogger, PrepareRecoveryResponseProto>>() {
|
||||
@Override
|
||||
public int compare(
|
||||
Entry<AsyncLogger, PrepareRecoveryResponseProto> a,
|
||||
Entry<AsyncLogger, PrepareRecoveryResponseProto> b) {
|
||||
|
||||
PrepareRecoveryResponseProto r1 = a.getValue();
|
||||
PrepareRecoveryResponseProto r2 = b.getValue();
|
||||
|
||||
if (r1.hasSegmentState() && r2.hasSegmentState()) {
|
||||
assert r1.getSegmentState().getStartTxId() ==
|
||||
r2.getSegmentState().getStartTxId() : "bad args: " + r1 + ", " + r2;
|
||||
}
|
||||
|
||||
return ComparisonChain.start()
|
||||
// If one of them has accepted something and the other hasn't,
|
||||
// use the one with an accepted recovery
|
||||
.compare(r1.hasAcceptedInEpoch(), r2.hasAcceptedInEpoch())
|
||||
// If they both accepted, use the one that's more recent
|
||||
.compare(r1.getAcceptedInEpoch(),
|
||||
r2.getAcceptedInEpoch())
|
||||
// Otherwise, choose based on which log is longer
|
||||
.compare(r1.hasSegmentState(), r2.hasSegmentState())
|
||||
.compare(r1.getSegmentState().getEndTxId(), r2.getSegmentState().getEndTxId())
|
||||
.result();
|
||||
}
|
||||
};
|
||||
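// Worked example of the ordering above (the txids and epochs are hypothetical):
// given response r1 with acceptedInEpoch=2 and a segment ending at txid 120,
// r2 with no accepted recovery and a segment ending at txid 150, and
// r3 with no accepted recovery and a segment ending at txid 120, the comparator
// ranks r1 highest (an accepted recovery always wins), then r2 (longer log),
// then r3, so Collections.max() picks r1's logger as the recovery source.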
|
||||
protected List<AsyncLogger> createLoggers() throws IOException {
|
||||
return createLoggers(conf, uri, nsInfo);
|
||||
}
|
||||
|
||||
static List<AsyncLogger> createLoggers(Configuration conf,
|
||||
URI uri, NamespaceInfo nsInfo) throws IOException {
|
||||
List<AsyncLogger> ret = Lists.newArrayList();
|
||||
List<InetSocketAddress> addrs = getLoggerAddresses(uri);
|
||||
String jid = parseJournalId(uri);
|
||||
for (InetSocketAddress addr : addrs) {
|
||||
ret.add(new IPCLoggerChannel(conf, nsInfo, jid, addr));
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
private static List<InetSocketAddress> getLoggerAddresses(URI uri)
|
||||
throws IOException {
|
||||
String authority = uri.getAuthority();
|
||||
Preconditions.checkArgument(authority != null && !authority.isEmpty(),
|
||||
"URI has no authority: " + uri);
|
||||
|
||||
String[] parts = StringUtils.split(authority, ';');
|
||||
for (int i = 0; i < parts.length; i++) {
|
||||
parts[i] = parts[i].trim();
|
||||
}
|
||||
|
||||
if (parts.length % 2 == 0) {
|
||||
LOG.warn("Quorum journal URI '" + uri + "' has an even number " +
|
||||
"of Journal Nodes specified. This is not recommended!");
|
||||
}
|
||||
|
||||
List<InetSocketAddress> addrs = Lists.newArrayList();
|
||||
for (String addr : parts) {
|
||||
addrs.add(NetUtils.createSocketAddr(
|
||||
addr, DFSConfigKeys.DFS_JOURNALNODE_RPC_PORT_DEFAULT));
|
||||
}
|
||||
return addrs;
|
||||
}
|
||||
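// Illustrative example (the hostnames and journal id are hypothetical): for a URI
// such as
//   qjournal://jn1.example.com:8485;jn2.example.com:8485;jn3.example.com:8485/myjournal
// getLoggerAddresses() returns the three socket addresses taken from the
// semicolon-separated authority, and parseJournalId() returns "myjournal" from
// the path component.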
|
||||
@Override
|
||||
public EditLogOutputStream startLogSegment(long txId) throws IOException {
|
||||
Preconditions.checkState(isActiveWriter,
|
||||
"must recover segments before starting a new one");
|
||||
QuorumCall<AsyncLogger,Void> q = loggers.startLogSegment(txId);
|
||||
loggers.waitForWriteQuorum(q, startSegmentTimeoutMs);
|
||||
return new QuorumOutputStream(loggers);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void finalizeLogSegment(long firstTxId, long lastTxId)
|
||||
throws IOException {
|
||||
QuorumCall<AsyncLogger,Void> q = loggers.finalizeLogSegment(
|
||||
firstTxId, lastTxId);
|
||||
loggers.waitForWriteQuorum(q, finalizeSegmentTimeoutMs);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setOutputBufferCapacity(int size) {
|
||||
// TODO Auto-generated method stub
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void purgeLogsOlderThan(long minTxIdToKeep) throws IOException {
|
||||
// TODO Auto-generated method stub
|
||||
}
|
||||
|
||||
@Override
|
||||
public void recoverUnfinalizedSegments() throws IOException {
|
||||
Preconditions.checkState(!isActiveWriter, "already active writer");
|
||||
|
||||
Map<AsyncLogger, NewEpochResponseProto> resps =
|
||||
loggers.createNewUniqueEpoch(nsInfo);
|
||||
LOG.info("newEpoch(" + loggers.getEpoch() + ") responses:\n" +
|
||||
QuorumCall.mapToString(resps));
|
||||
|
||||
long mostRecentSegmentTxId = Long.MIN_VALUE;
|
||||
for (NewEpochResponseProto r : resps.values()) {
|
||||
if (r.hasLastSegmentTxId()) {
|
||||
mostRecentSegmentTxId = Math.max(mostRecentSegmentTxId,
|
||||
r.getLastSegmentTxId());
|
||||
}
|
||||
}
|
||||
|
||||
// On a completely fresh system, none of the journals have any
|
||||
// segments, so there's nothing to recover.
|
||||
if (mostRecentSegmentTxId != Long.MIN_VALUE) {
|
||||
recoverUnclosedSegment(mostRecentSegmentTxId);
|
||||
}
|
||||
isActiveWriter = true;
|
||||
}
|
||||
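// Illustrative writer lifecycle (the local variables and txids are hypothetical;
// recovery must complete before a new segment is started, as enforced by the
// checkState calls above):
//
//   QuorumJournalManager qjm = ...;     // constructed with conf, URI, nsInfo
//   qjm.recoverUnfinalizedSegments();   // new epoch + recovery of any unclosed segment
//   EditLogOutputStream out = qjm.startLogSegment(101);
//   // ... write edits via out, then setReadyToFlush()/flush() ...
//   qjm.finalizeLogSegment(101, 150);
//   qjm.close();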
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
loggers.close();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void selectInputStreams(Collection<EditLogInputStream> streams,
|
||||
long fromTxnId, boolean inProgressOk) {
|
||||
|
||||
QuorumCall<AsyncLogger,GetEditLogManifestResponseProto> q =
|
||||
loggers.getEditLogManifest(fromTxnId);
|
||||
Map<AsyncLogger, GetEditLogManifestResponseProto> resps;
|
||||
try {
|
||||
resps = loggers.waitForWriteQuorum(q, selectInputStreamsTimeoutMs);
|
||||
} catch (IOException ioe) {
|
||||
// TODO: can we do better here?
|
||||
throw new RuntimeException(ioe);
|
||||
}
|
||||
|
||||
LOG.info("selectInputStream manifests:\n" +
|
||||
QuorumCall.mapToString(resps));
|
||||
|
||||
final PriorityQueue<EditLogInputStream> allStreams =
|
||||
new PriorityQueue<EditLogInputStream>(64,
|
||||
JournalSet.EDIT_LOG_INPUT_STREAM_COMPARATOR);
|
||||
for (Map.Entry<AsyncLogger, GetEditLogManifestResponseProto> e : resps.entrySet()) {
|
||||
AsyncLogger logger = e.getKey();
|
||||
GetEditLogManifestResponseProto response = e.getValue();
|
||||
RemoteEditLogManifest manifest = PBHelper.convert(response.getManifest());
|
||||
|
||||
for (RemoteEditLog remoteLog : manifest.getLogs()) {
|
||||
URL url = logger.buildURLToFetchLogs(remoteLog.getStartTxId());
|
||||
LOG.info("URL: " + url);
|
||||
|
||||
EditLogInputStream elis = EditLogFileInputStream.fromUrl(
|
||||
url, remoteLog.getStartTxId(), remoteLog.getEndTxId(),
|
||||
remoteLog.isInProgress());
|
||||
allStreams.add(elis);
|
||||
}
|
||||
}
|
||||
JournalSet.chainAndMakeRedundantStreams(
|
||||
streams, allStreams, fromTxnId, inProgressOk);
|
||||
}
|
||||
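// Illustrative example (segment boundaries are hypothetical): if one logger's
// manifest reports finalized segments [1-100, 101-150] and another reports only
// [1-100], a stream is built for every reported segment, and
// chainAndMakeRedundantStreams() then groups the two copies of 1-100 so that
// either journal can serve that range, followed by the 101-150 stream.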
|
||||
@Override
|
||||
public String toString() {
|
||||
return "Quorum journal manager " + uri;
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
AsyncLoggerSet getLoggerSetForTests() {
|
||||
return loggers;
|
||||
}
|
||||
|
||||
}
|
|
@@ -0,0 +1,103 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hdfs.qjournal.client;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.hadoop.hdfs.server.namenode.EditLogOutputStream;
|
||||
import org.apache.hadoop.hdfs.server.namenode.EditsDoubleBuffer;
|
||||
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp;
|
||||
import org.apache.hadoop.io.DataOutputBuffer;
|
||||
|
||||
/**
|
||||
* EditLogOutputStream implementation that writes to a quorum of
|
||||
* remote journals.
|
||||
*/
|
||||
class QuorumOutputStream extends EditLogOutputStream {
|
||||
private final AsyncLoggerSet loggers;
|
||||
private EditsDoubleBuffer buf;
|
||||
|
||||
public QuorumOutputStream(AsyncLoggerSet loggers) throws IOException {
|
||||
super();
|
||||
this.buf = new EditsDoubleBuffer(256*1024); // TODO: conf
|
||||
this.loggers = loggers;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void write(FSEditLogOp op) throws IOException {
|
||||
buf.writeOp(op);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void writeRaw(byte[] bytes, int offset, int length) throws IOException {
|
||||
buf.writeRaw(bytes, offset, length);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void create() throws IOException {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
if (buf != null) {
|
||||
buf.close();
|
||||
buf = null;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void abort() throws IOException {
|
||||
QuorumJournalManager.LOG.warn("Aborting " + this);
|
||||
buf = null;
|
||||
close();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setReadyToFlush() throws IOException {
|
||||
buf.setReadyToFlush();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void flushAndSync() throws IOException {
|
||||
int numReadyBytes = buf.countReadyBytes();
|
||||
if (numReadyBytes > 0) {
|
||||
int numReadyTxns = buf.countReadyTxns();
|
||||
long firstTxToFlush = buf.getFirstReadyTxId();
|
||||
|
||||
assert numReadyTxns > 0;
|
||||
|
||||
// Copy from our double-buffer into a new byte array. This is for
// two reasons:
// 1) The IPC code has no way of specifying to send only a slice of
// a larger array.
// 2) Because the calls to the underlying nodes are asynchronous, we
// need a defensive copy to avoid accidentally mutating the buffer
// before it is sent.
|
||||
DataOutputBuffer bufToSend = new DataOutputBuffer(numReadyBytes);
|
||||
buf.flushTo(bufToSend);
|
||||
assert bufToSend.getLength() == numReadyBytes;
|
||||
byte[] data = bufToSend.getData();
|
||||
assert data.length == bufToSend.getLength();
|
||||
|
||||
QuorumCall<AsyncLogger, Void> qcall = loggers.sendEdits(
|
||||
firstTxToFlush, numReadyTxns, data);
|
||||
loggers.waitForWriteQuorum(qcall, 20000); // TODO: configurable timeout
|
||||
}
|
||||
}
|
||||
}
|
|
@@ -0,0 +1,114 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hdfs.qjournal.protocol;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URL;
|
||||
|
||||
import org.apache.hadoop.classification.InterfaceAudience;
|
||||
import org.apache.hadoop.hdfs.DFSConfigKeys;
|
||||
import org.apache.hadoop.hdfs.qjournal.client.QuorumJournalManager;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.GetEditLogManifestResponseProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.GetJournalStateResponseProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.NewEpochResponseProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.PrepareRecoveryResponseProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.SegmentStateProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.server.JournalNode;
|
||||
import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
|
||||
import org.apache.hadoop.security.KerberosInfo;
|
||||
|
||||
/**
|
||||
* Protocol used to communicate between {@link QuorumJournalManager}
|
||||
* and each {@link JournalNode}.
|
||||
*
|
||||
* This is responsible for sending edits as well as coordinating
|
||||
* recovery of the nodes.
|
||||
*/
|
||||
@KerberosInfo(
|
||||
serverPrincipal = DFSConfigKeys.DFS_JOURNALNODE_USER_NAME_KEY,
|
||||
clientPrincipal = DFSConfigKeys.DFS_NAMENODE_USER_NAME_KEY)
|
||||
@InterfaceAudience.Private
|
||||
public interface QJournalProtocol {
|
||||
public static final long versionID = 1L;
|
||||
|
||||
/**
|
||||
* Get the current state of the journal, including the most recent
|
||||
* epoch number and the HTTP port.
|
||||
*/
|
||||
public GetJournalStateResponseProto getJournalState(String journalId)
|
||||
throws IOException;
|
||||
|
||||
/**
|
||||
* Begin a new epoch. See the HDFS-3077 design doc for details.
|
||||
*/
|
||||
public NewEpochResponseProto newEpoch(String journalId,
|
||||
NamespaceInfo nsInfo, long epoch) throws IOException;
|
||||
|
||||
/**
|
||||
* Journal edit records.
|
||||
* This message is sent by the active name-node to the JournalNodes
|
||||
* to write edits to their local logs.
|
||||
*/
|
||||
public void journal(RequestInfo reqInfo,
|
||||
long firstTxnId,
|
||||
int numTxns,
|
||||
byte[] records) throws IOException;
|
||||
|
||||
/**
|
||||
* Start writing to a new log segment on the JournalNode.
|
||||
* Before calling this, one should finalize the previous segment
|
||||
* using {@link #finalizeLogSegment(RequestInfo, long, long)}.
|
||||
*
|
||||
* @param txid the first txid in the new log
|
||||
*/
|
||||
public void startLogSegment(RequestInfo reqInfo,
|
||||
long txid) throws IOException;
|
||||
|
||||
/**
|
||||
* Finalize the given log segment on the JournalNode. The segment
|
||||
* is expected to be in-progress and starting at the given startTxId.
|
||||
*
|
||||
* @param startTxId the starting transaction ID of the log
|
||||
* @param endTxId the expected last transaction in the given log
|
||||
* @throws IOException if no such segment exists
|
||||
*/
|
||||
public void finalizeLogSegment(RequestInfo reqInfo,
|
||||
long startTxId, long endTxId) throws IOException;
|
||||
|
||||
/**
|
||||
* @param jid the journal from which to enumerate edits
|
||||
* @param sinceTxId the first transaction which the client cares about
|
||||
* @return a list of edit log segments since the given transaction ID.
|
||||
*/
|
||||
public GetEditLogManifestResponseProto getEditLogManifest(
|
||||
String jid, long sinceTxId) throws IOException;
|
||||
|
||||
/**
|
||||
* Begin the recovery process for a given segment. See the HDFS-3077
|
||||
* design document for details.
|
||||
*/
|
||||
public PrepareRecoveryResponseProto prepareRecovery(RequestInfo reqInfo,
|
||||
long segmentTxId) throws IOException;
|
||||
|
||||
/**
|
||||
* Accept a proposed recovery for the given transaction ID.
|
||||
*/
|
||||
public void acceptRecovery(RequestInfo reqInfo,
|
||||
SegmentStateProto stateToAccept, URL fromUrl) throws IOException;
|
||||
|
||||
}
|
|
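// Sketch of the call sequence a single writer drives against each JournalNode
// via this interface (the journal id, epoch and txids are hypothetical, and
// proto construction is elided):
//
//   getJournalState("myjournal");            // learn last promised epoch + HTTP port
//   newEpoch("myjournal", nsInfo, 5);        // promise: reject writers with epoch < 5
//   RequestInfo req = new RequestInfo("myjournal", 5, 1);
//   startLogSegment(req, 101);
//   journal(req, 101, 3, serializedEdits);   // txids 101-103
//   finalizeLogSegment(req, 101, 103);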
@@ -0,0 +1,54 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hdfs.qjournal.protocol;

import org.apache.hadoop.classification.InterfaceAudience;

@InterfaceAudience.Private
public class RequestInfo {
  private String jid;
  private long epoch;
  private long ipcSerialNumber;

  public RequestInfo(String jid, long epoch, long ipcSerialNumber) {
    this.jid = jid;
    this.epoch = epoch;
    this.ipcSerialNumber = ipcSerialNumber;
  }

  public long getEpoch() {
    return epoch;
  }

  public void setEpoch(long epoch) {
    this.epoch = epoch;
  }

  public String getJournalId() {
    return jid;
  }

  public long getIpcSerialNumber() {
    return ipcSerialNumber;
  }

  public void setIpcSerialNumber(long ipcSerialNumber) {
    this.ipcSerialNumber = ipcSerialNumber;
  }

}
|
|
@@ -0,0 +1,41 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hdfs.qjournal.protocolPB;

import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.QJournalProtocolService;
import org.apache.hadoop.ipc.ProtocolInfo;
import org.apache.hadoop.security.KerberosInfo;

/**
 * Protocol used to journal edits to a JournalNode participating
 * in the quorum journal.
 * Note: This extends the protocol buffer service-based interface to
 * add annotations required for security.
 */
@KerberosInfo(
    serverPrincipal = DFSConfigKeys.DFS_JOURNALNODE_USER_NAME_KEY,
    clientPrincipal = DFSConfigKeys.DFS_NAMENODE_USER_NAME_KEY)
@ProtocolInfo(protocolName =
    "org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocol",
    protocolVersion = 1)
@InterfaceAudience.Private
public interface QJournalProtocolPB extends
    QJournalProtocolService.BlockingInterface {
}
|
|
@@ -0,0 +1,177 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hdfs.qjournal.protocolPB;
|
||||
|
||||
import org.apache.hadoop.classification.InterfaceAudience;
|
||||
import org.apache.hadoop.hdfs.protocolPB.PBHelper;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocol;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.FinalizeLogSegmentRequestProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.FinalizeLogSegmentResponseProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.GetEditLogManifestRequestProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.GetEditLogManifestResponseProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.GetJournalStateRequestProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.GetJournalStateResponseProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.JournalIdProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.JournalRequestProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.JournalResponseProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.NewEpochRequestProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.NewEpochResponseProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.AcceptRecoveryRequestProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.AcceptRecoveryResponseProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.PrepareRecoveryRequestProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.PrepareRecoveryResponseProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.StartLogSegmentRequestProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.StartLogSegmentResponseProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.RequestInfo;
|
||||
|
||||
import com.google.protobuf.RpcController;
|
||||
import com.google.protobuf.ServiceException;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URL;
|
||||
|
||||
/**
|
||||
* Implementation for protobuf service that forwards requests
|
||||
* received on {@link QJournalProtocolPB} to the
* {@link QJournalProtocol} server implementation.
|
||||
*/
|
||||
@InterfaceAudience.Private
|
||||
public class QJournalProtocolServerSideTranslatorPB implements QJournalProtocolPB {
|
||||
/** Server side implementation to delegate the requests to */
|
||||
private final QJournalProtocol impl;
|
||||
|
||||
public QJournalProtocolServerSideTranslatorPB(QJournalProtocol impl) {
|
||||
this.impl = impl;
|
||||
}
|
||||
|
||||
@Override
|
||||
public GetJournalStateResponseProto getJournalState(RpcController controller,
|
||||
GetJournalStateRequestProto request) throws ServiceException {
|
||||
try {
|
||||
return impl.getJournalState(
|
||||
convert(request.getJid()));
|
||||
} catch (IOException ioe) {
|
||||
throw new ServiceException(ioe);
|
||||
}
|
||||
}
|
||||
|
||||
private String convert(JournalIdProto jid) {
|
||||
return jid.getIdentifier();
|
||||
}
|
||||
|
||||
@Override
|
||||
public NewEpochResponseProto newEpoch(RpcController controller,
|
||||
NewEpochRequestProto request) throws ServiceException {
|
||||
try {
|
||||
return impl.newEpoch(
|
||||
request.getJid().getIdentifier(),
|
||||
PBHelper.convert(request.getNsInfo()),
|
||||
request.getEpoch());
|
||||
} catch (IOException ioe) {
|
||||
throw new ServiceException(ioe);
|
||||
}
|
||||
}
|
||||
|
||||
/** @see JournalProtocol#journal */
|
||||
@Override
|
||||
public JournalResponseProto journal(RpcController unused,
|
||||
JournalRequestProto req) throws ServiceException {
|
||||
try {
|
||||
impl.journal(convert(req.getReqInfo()),
|
||||
req.getFirstTxnId(), req.getNumTxns(), req.getRecords()
|
||||
.toByteArray());
|
||||
} catch (IOException e) {
|
||||
throw new ServiceException(e);
|
||||
}
|
||||
return JournalResponseProto.newBuilder().build();
|
||||
}
|
||||
|
||||
/** @see JournalProtocol#startLogSegment */
|
||||
@Override
|
||||
public StartLogSegmentResponseProto startLogSegment(RpcController controller,
|
||||
StartLogSegmentRequestProto req) throws ServiceException {
|
||||
try {
|
||||
impl.startLogSegment(convert(req.getReqInfo()),
|
||||
req.getTxid());
|
||||
} catch (IOException e) {
|
||||
throw new ServiceException(e);
|
||||
}
|
||||
return StartLogSegmentResponseProto.newBuilder().build();
|
||||
}
|
||||
|
||||
@Override
|
||||
public FinalizeLogSegmentResponseProto finalizeLogSegment(
|
||||
RpcController controller, FinalizeLogSegmentRequestProto req)
|
||||
throws ServiceException {
|
||||
try {
|
||||
impl.finalizeLogSegment(convert(req.getReqInfo()),
|
||||
req.getStartTxId(), req.getEndTxId());
|
||||
} catch (IOException e) {
|
||||
throw new ServiceException(e);
|
||||
}
|
||||
return FinalizeLogSegmentResponseProto.newBuilder().build();
|
||||
}
|
||||
|
||||
@Override
|
||||
public GetEditLogManifestResponseProto getEditLogManifest(
|
||||
RpcController controller, GetEditLogManifestRequestProto request)
|
||||
throws ServiceException {
|
||||
try {
|
||||
return impl.getEditLogManifest(
|
||||
request.getJid().getIdentifier(),
|
||||
request.getSinceTxId());
|
||||
} catch (IOException e) {
|
||||
throw new ServiceException(e);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public PrepareRecoveryResponseProto prepareRecovery(RpcController controller,
|
||||
PrepareRecoveryRequestProto request) throws ServiceException {
|
||||
try {
|
||||
return impl.prepareRecovery(convert(request.getReqInfo()),
|
||||
request.getSegmentTxId());
|
||||
} catch (IOException e) {
|
||||
throw new ServiceException(e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public AcceptRecoveryResponseProto acceptRecovery(RpcController controller,
|
||||
AcceptRecoveryRequestProto request) throws ServiceException {
|
||||
try {
|
||||
impl.acceptRecovery(convert(request.getReqInfo()),
|
||||
request.getStateToAccept(),
|
||||
new URL(request.getFromURL()));
|
||||
return AcceptRecoveryResponseProto.getDefaultInstance();
|
||||
} catch (IOException e) {
|
||||
throw new ServiceException(e);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private RequestInfo convert(
|
||||
QJournalProtocolProtos.RequestInfoProto reqInfo) {
|
||||
return new RequestInfo(
|
||||
reqInfo.getJournalId().getIdentifier(),
|
||||
reqInfo.getEpoch(),
|
||||
reqInfo.getIpcSerialNumber());
|
||||
}
|
||||
}
|
|
@@ -0,0 +1,217 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hdfs.qjournal.protocolPB;
|
||||
|
||||
import java.io.Closeable;
|
||||
import java.io.IOException;
|
||||
import java.net.URL;
|
||||
|
||||
import org.apache.hadoop.classification.InterfaceAudience;
|
||||
import org.apache.hadoop.classification.InterfaceStability;
|
||||
import org.apache.hadoop.hdfs.protocol.proto.HdfsProtos.RemoteEditLogProto;
|
||||
import org.apache.hadoop.hdfs.protocolPB.PBHelper;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocol;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.FinalizeLogSegmentRequestProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.GetEditLogManifestRequestProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.GetEditLogManifestResponseProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.GetJournalStateRequestProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.GetJournalStateResponseProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.JournalIdProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.JournalRequestProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.NewEpochRequestProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.NewEpochResponseProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.AcceptRecoveryRequestProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.PrepareRecoveryRequestProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.PrepareRecoveryResponseProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.SegmentStateProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.StartLogSegmentRequestProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.RequestInfo;
|
||||
import org.apache.hadoop.hdfs.server.protocol.JournalProtocol;
|
||||
import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
|
||||
import org.apache.hadoop.io.MD5Hash;
|
||||
import org.apache.hadoop.ipc.ProtobufHelper;
|
||||
import org.apache.hadoop.ipc.ProtocolMetaInterface;
|
||||
import org.apache.hadoop.ipc.RPC;
|
||||
import org.apache.hadoop.ipc.RpcClientUtil;
|
||||
|
||||
import com.google.protobuf.ByteString;
|
||||
import com.google.protobuf.RpcController;
|
||||
import com.google.protobuf.ServiceException;
|
||||
|
||||
/**
|
||||
* This class is the client-side translator that converts requests made on the
* {@link QJournalProtocol} interface to calls on the RPC server implementing
* {@link QJournalProtocolPB}.
|
||||
*/
|
||||
@InterfaceAudience.Private
|
||||
@InterfaceStability.Stable
|
||||
public class QJournalProtocolTranslatorPB implements ProtocolMetaInterface,
|
||||
QJournalProtocol, Closeable {
|
||||
/** RpcController is not used and hence is set to null */
|
||||
private final static RpcController NULL_CONTROLLER = null;
|
||||
private final QJournalProtocolPB rpcProxy;
|
||||
|
||||
public QJournalProtocolTranslatorPB(QJournalProtocolPB rpcProxy) {
|
||||
this.rpcProxy = rpcProxy;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() {
|
||||
RPC.stopProxy(rpcProxy);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public GetJournalStateResponseProto getJournalState(String jid)
|
||||
throws IOException {
|
||||
try {
|
||||
GetJournalStateRequestProto req = GetJournalStateRequestProto.newBuilder()
|
||||
.setJid(convertJournalId(jid))
|
||||
.build();
|
||||
return rpcProxy.getJournalState(NULL_CONTROLLER, req);
|
||||
} catch (ServiceException e) {
|
||||
throw ProtobufHelper.getRemoteException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private JournalIdProto convertJournalId(String jid) {
|
||||
return JournalIdProto.newBuilder()
|
||||
.setIdentifier(jid)
|
||||
.build();
|
||||
}
|
||||
|
||||
@Override
|
||||
public NewEpochResponseProto newEpoch(String jid, NamespaceInfo nsInfo,
|
||||
long epoch) throws IOException {
|
||||
try {
|
||||
NewEpochRequestProto req = NewEpochRequestProto.newBuilder()
|
||||
.setJid(convertJournalId(jid))
|
||||
.setNsInfo(PBHelper.convert(nsInfo))
|
||||
.setEpoch(epoch)
|
||||
.build();
|
||||
return rpcProxy.newEpoch(NULL_CONTROLLER, req);
|
||||
} catch (ServiceException e) {
|
||||
throw ProtobufHelper.getRemoteException(e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void journal(RequestInfo reqInfo, long firstTxnId, int numTxns,
|
||||
byte[] records) throws IOException {
|
||||
JournalRequestProto req = JournalRequestProto.newBuilder()
|
||||
.setReqInfo(convert(reqInfo))
|
||||
.setFirstTxnId(firstTxnId)
|
||||
.setNumTxns(numTxns)
|
||||
.setRecords(PBHelper.getByteString(records))
|
||||
.build();
|
||||
try {
|
||||
rpcProxy.journal(NULL_CONTROLLER, req);
|
||||
} catch (ServiceException e) {
|
||||
throw ProtobufHelper.getRemoteException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private QJournalProtocolProtos.RequestInfoProto convert(
|
||||
RequestInfo reqInfo) {
|
||||
return QJournalProtocolProtos.RequestInfoProto.newBuilder()
|
||||
.setJournalId(convertJournalId(reqInfo.getJournalId()))
|
||||
.setEpoch(reqInfo.getEpoch())
|
||||
.setIpcSerialNumber(reqInfo.getIpcSerialNumber())
|
||||
.build();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void startLogSegment(RequestInfo reqInfo, long txid)
|
||||
throws IOException {
|
||||
StartLogSegmentRequestProto req = StartLogSegmentRequestProto.newBuilder()
|
||||
.setReqInfo(convert(reqInfo))
|
||||
.setTxid(txid)
|
||||
.build();
|
||||
try {
|
||||
rpcProxy.startLogSegment(NULL_CONTROLLER, req);
|
||||
} catch (ServiceException e) {
|
||||
throw ProtobufHelper.getRemoteException(e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void finalizeLogSegment(RequestInfo reqInfo, long startTxId,
|
||||
long endTxId) throws IOException {
|
||||
FinalizeLogSegmentRequestProto req =
|
||||
FinalizeLogSegmentRequestProto.newBuilder()
|
||||
.setReqInfo(convert(reqInfo))
|
||||
.setStartTxId(startTxId)
|
||||
.setEndTxId(endTxId)
|
||||
.build();
|
||||
try {
|
||||
rpcProxy.finalizeLogSegment(NULL_CONTROLLER, req);
|
||||
} catch (ServiceException e) {
|
||||
throw ProtobufHelper.getRemoteException(e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public GetEditLogManifestResponseProto getEditLogManifest(String jid,
|
||||
long sinceTxId) throws IOException {
|
||||
try {
|
||||
return rpcProxy.getEditLogManifest(NULL_CONTROLLER,
|
||||
GetEditLogManifestRequestProto.newBuilder()
|
||||
.setJid(convertJournalId(jid))
|
||||
.setSinceTxId(sinceTxId)
|
||||
.build());
|
||||
} catch (ServiceException e) {
|
||||
throw ProtobufHelper.getRemoteException(e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public PrepareRecoveryResponseProto prepareRecovery(RequestInfo reqInfo,
|
||||
long segmentTxId) throws IOException {
|
||||
try {
|
||||
return rpcProxy.prepareRecovery(NULL_CONTROLLER,
|
||||
PrepareRecoveryRequestProto.newBuilder()
|
||||
.setReqInfo(convert(reqInfo))
|
||||
.setSegmentTxId(segmentTxId)
|
||||
.build());
|
||||
} catch (ServiceException e) {
|
||||
throw ProtobufHelper.getRemoteException(e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void acceptRecovery(RequestInfo reqInfo,
|
||||
SegmentStateProto stateToAccept, URL fromUrl) throws IOException {
|
||||
try {
|
||||
rpcProxy.acceptRecovery(NULL_CONTROLLER,
|
||||
AcceptRecoveryRequestProto.newBuilder()
|
||||
.setReqInfo(convert(reqInfo))
|
||||
.setStateToAccept(stateToAccept)
|
||||
.setFromURL(fromUrl.toExternalForm())
|
||||
.build());
|
||||
} catch (ServiceException e) {
|
||||
throw ProtobufHelper.getRemoteException(e);
|
||||
}
|
||||
}
|
||||
|
||||
public boolean isMethodSupported(String methodName) throws IOException {
|
||||
return RpcClientUtil.isMethodSupported(rpcProxy,
|
||||
QJournalProtocolPB.class, RPC.RpcKind.RPC_PROTOCOL_BUFFER,
|
||||
RPC.getProtocolVersion(QJournalProtocolPB.class), methodName);
|
||||
}
|
||||
}
|
|
@@ -0,0 +1,208 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hdfs.qjournal.server;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.UnsupportedEncodingException;
|
||||
import java.net.URLEncoder;
|
||||
|
||||
import javax.servlet.ServletContext;
|
||||
import javax.servlet.ServletException;
|
||||
import javax.servlet.http.HttpServlet;
|
||||
import javax.servlet.http.HttpServletRequest;
|
||||
import javax.servlet.http.HttpServletResponse;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.classification.InterfaceAudience;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.hdfs.DFSConfigKeys;
|
||||
import org.apache.hadoop.hdfs.qjournal.client.QuorumJournalManager;
|
||||
import org.apache.hadoop.hdfs.server.common.JspHelper;
|
||||
import org.apache.hadoop.hdfs.server.namenode.FileJournalManager;
|
||||
import org.apache.hadoop.hdfs.server.namenode.FileJournalManager.EditLogFile;
|
||||
import org.apache.hadoop.hdfs.server.namenode.GetImageServlet;
|
||||
import org.apache.hadoop.hdfs.server.namenode.NameNode;
|
||||
import org.apache.hadoop.hdfs.server.namenode.TransferFsImage;
|
||||
import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
|
||||
import org.apache.hadoop.hdfs.util.DataTransferThrottler;
|
||||
import org.apache.hadoop.security.SecurityUtil;
|
||||
import org.apache.hadoop.security.UserGroupInformation;
|
||||
import org.apache.hadoop.util.ServletUtil;
|
||||
import org.apache.hadoop.util.StringUtils;
|
||||
|
||||
/**
|
||||
* This servlet is used in two cases:
|
||||
* <ul>
|
||||
* <li>The QuorumJournalManager, when reading edits, fetches the edit streams
|
||||
* from the journal nodes.</li>
|
||||
* <li>During edits synchronization, one journal node will fetch edits from
|
||||
* another journal node.</li>
|
||||
* </ul>
|
||||
*/
|
||||
@InterfaceAudience.Private
|
||||
public class GetJournalEditServlet extends HttpServlet {
|
||||
|
||||
private static final long serialVersionUID = -4635891628211723009L;
|
||||
private static final Log LOG = LogFactory.getLog(GetJournalEditServlet.class);
|
||||
|
||||
static final String STORAGEINFO_PARAM = "storageInfo";
|
||||
static final String JOURNAL_ID_PARAM = "jid";
|
||||
static final String SEGMENT_TXID_PARAM = "segmentTxId";
|
||||
|
||||
// TODO: create security tests
|
||||
protected boolean isValidRequestor(String remoteUser, Configuration conf)
|
||||
throws IOException {
|
||||
if (remoteUser == null) { // This really shouldn't happen...
|
||||
LOG.warn("Received null remoteUser while authorizing access to " +
|
||||
"GetJournalEditServlet");
|
||||
return false;
|
||||
}
|
||||
|
||||
String[] validRequestors = {
|
||||
SecurityUtil.getServerPrincipal(conf
|
||||
.get(DFSConfigKeys.DFS_NAMENODE_USER_NAME_KEY), NameNode
|
||||
.getAddress(conf).getHostName()),
|
||||
SecurityUtil.getServerPrincipal(conf
|
||||
.get(DFSConfigKeys.DFS_JOURNALNODE_USER_NAME_KEY),
|
||||
NameNode.getAddress(conf).getHostName()) };
|
||||
// TODO: above principal is not correct, since each JN will have a
|
||||
// different hostname.
|
||||
|
||||
for (String v : validRequestors) {
|
||||
if (v != null && v.equals(remoteUser)) {
|
||||
if (LOG.isDebugEnabled())
|
||||
LOG.debug("isValidRequestor is allowing: " + remoteUser);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
if (LOG.isDebugEnabled())
|
||||
LOG.debug("isValidRequestor is rejecting: " + remoteUser);
|
||||
return false;
|
||||
}
|
||||
|
||||
private boolean checkRequestorOrSendError(Configuration conf,
|
||||
HttpServletRequest request, HttpServletResponse response)
|
||||
throws IOException {
|
||||
if (UserGroupInformation.isSecurityEnabled()
|
||||
&& !isValidRequestor(request.getRemoteUser(), conf)) {
|
||||
response.sendError(HttpServletResponse.SC_FORBIDDEN,
|
||||
"Only Namenode and another JournalNode may access this servlet");
|
||||
LOG.warn("Received non-NN/JN request for edits from "
|
||||
+ request.getRemoteHost());
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private boolean checkStorageInfoOrSendError(JNStorage storage,
|
||||
HttpServletRequest request, HttpServletResponse response)
|
||||
throws IOException {
|
||||
String myStorageInfoString = storage.toColonSeparatedString();
|
||||
String theirStorageInfoString = request.getParameter(STORAGEINFO_PARAM);
|
||||
|
||||
if (theirStorageInfoString != null
|
||||
&& !myStorageInfoString.equals(theirStorageInfoString)) {
|
||||
response.sendError(HttpServletResponse.SC_FORBIDDEN,
|
||||
"This node has storage info " + myStorageInfoString
|
||||
+ " but the requesting node expected "
|
||||
+ theirStorageInfoString);
|
||||
LOG.warn("Received an invalid request file transfer request "
|
||||
+ " with storage info " + theirStorageInfoString);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void doGet(final HttpServletRequest request,
|
||||
final HttpServletResponse response) throws ServletException, IOException {
|
||||
try {
|
||||
final ServletContext context = getServletContext();
|
||||
final Configuration conf = (Configuration) getServletContext()
|
||||
.getAttribute(JspHelper.CURRENT_CONF);
|
||||
final String journalId = request.getParameter(JOURNAL_ID_PARAM);
|
||||
QuorumJournalManager.checkJournalId(journalId);
|
||||
final JNStorage storage = JournalNodeHttpServer
|
||||
.getJournalFromContext(context, journalId).getStorage();
|
||||
|
||||
// Check security
|
||||
if (!checkRequestorOrSendError(conf, request, response)) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Check that the namespace info is correct
|
||||
if (!checkStorageInfoOrSendError(storage, request, response)) {
|
||||
return;
|
||||
}
|
||||
|
||||
long segmentTxId = ServletUtil.parseLongParam(request,
|
||||
SEGMENT_TXID_PARAM);
|
||||
|
||||
FileJournalManager fjm = storage.getJournalManager();
|
||||
File editFile;
|
||||
FileInputStream editFileIn;
|
||||
|
||||
synchronized (fjm) {
|
||||
// Synchronize on the FJM so that the file doesn't get finalized
|
||||
// out from underneath us while we're in the process of opening
|
||||
// it up.
|
||||
EditLogFile elf = fjm.getLogFile(
|
||||
segmentTxId);
|
||||
if (elf == null) {
|
||||
response.sendError(HttpServletResponse.SC_NOT_FOUND,
|
||||
"No edit log found starting at txid " + segmentTxId);
|
||||
return;
|
||||
}
|
||||
editFile = elf.getFile();
|
||||
GetImageServlet.setVerificationHeaders(response, editFile);
|
||||
GetImageServlet.setFileNameHeaders(response, editFile);
|
||||
editFileIn = new FileInputStream(editFile);
|
||||
}
|
||||
|
||||
DataTransferThrottler throttler = GetImageServlet.getThrottler(conf);
|
||||
|
||||
// send edits
|
||||
TransferFsImage.getFileServer(response, editFile, editFileIn, throttler);
|
||||
|
||||
} catch (Throwable t) {
|
||||
String errMsg = "getedit failed. " + StringUtils.stringifyException(t);
|
||||
response.sendError(HttpServletResponse.SC_INTERNAL_SERVER_ERROR, errMsg);
|
||||
throw new IOException(errMsg);
|
||||
}
|
||||
}
|
||||
|
||||
public static String buildPath(String journalId, long segmentTxId,
|
||||
NamespaceInfo nsInfo) {
|
||||
StringBuilder path = new StringBuilder("/getJournal?");
|
||||
try {
|
||||
path.append(JOURNAL_ID_PARAM).append("=")
|
||||
.append(URLEncoder.encode(journalId, "UTF-8"));
|
||||
path.append("&" + SEGMENT_TXID_PARAM).append("=")
|
||||
.append(segmentTxId);
|
||||
path.append("&" + STORAGEINFO_PARAM).append("=")
|
||||
.append(URLEncoder.encode(nsInfo.toColonSeparatedString(), "UTF-8"));
|
||||
} catch (UnsupportedEncodingException e) {
|
||||
// Never get here -- everyone supports UTF-8
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
return path.toString();
|
||||
}
|
||||
}
|
|
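// Illustrative result (the parameter values are hypothetical):
// buildPath("myjournal", 101, nsInfo) produces a path of the form
//   /getJournal?jid=myjournal&segmentTxId=101&storageInfo=<URL-encoded colon-separated storage string>
// which the client resolves against a JournalNode's HTTP address (see
// AsyncLogger#buildURLToFetchLogs) to stream the corresponding edit segment.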
@@ -0,0 +1,160 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hdfs.qjournal.server;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.NodeType;
|
||||
import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption;
|
||||
import org.apache.hadoop.hdfs.server.common.Storage;
|
||||
import org.apache.hadoop.hdfs.server.common.StorageErrorReporter;
|
||||
import org.apache.hadoop.hdfs.server.namenode.FileJournalManager;
|
||||
import org.apache.hadoop.hdfs.server.namenode.NNStorage;
|
||||
import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
|
||||
|
||||
/**
|
||||
* A {@link Storage} implementation for the {@link JournalNode}.
|
||||
*
|
||||
* The JN has a storage directory for each namespace for which it stores
|
||||
* metadata. There is only a single directory per JN in the current design.
|
||||
*/
|
||||
class JNStorage extends Storage {
|
||||
|
||||
private final FileJournalManager fjm;
|
||||
private final StorageDirectory sd;
|
||||
private boolean lazyInitted = false;
|
||||
|
||||
/**
|
||||
* @param logDir the path to the directory in which data will be stored
|
||||
* @param errorReporter a callback to report errors
|
||||
*/
|
||||
protected JNStorage(File logDir, StorageErrorReporter errorReporter) {
|
||||
super(NodeType.JOURNAL_NODE);
|
||||
|
||||
sd = new StorageDirectory(logDir);
|
||||
this.addStorageDir(sd);
|
||||
this.fjm = new FileJournalManager(sd, errorReporter);
|
||||
}
|
||||
|
||||
FileJournalManager getJournalManager() {
|
||||
return fjm;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isPreUpgradableLayout(StorageDirectory sd) throws IOException {
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Find an edits file spanning the given transaction ID range.
|
||||
* If no such file exists, an exception is thrown.
|
||||
*/
|
||||
File findFinalizedEditsFile(long startTxId, long endTxId) throws IOException {
|
||||
File ret = new File(sd.getCurrentDir(),
|
||||
NNStorage.getFinalizedEditsFileName(startTxId, endTxId));
|
||||
if (!ret.exists()) {
|
||||
throw new IOException(
|
||||
"No edits file for range " + startTxId + "-" + endTxId);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the path for an in-progress edits file starting at the given
|
||||
* transaction ID. This does not verify existence of the file.
|
||||
*/
|
||||
File getInProgressEditLog(long startTxId) {
|
||||
return new File(sd.getCurrentDir(),
|
||||
NNStorage.getInProgressEditsFileName(startTxId));
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the path for the file which contains persisted data for the
|
||||
* paxos-like recovery process for the given log segment.
|
||||
*/
|
||||
File getPaxosFile(long segmentTxId) {
|
||||
return new File(getPaxosDir(), String.valueOf(segmentTxId));
|
||||
}
|
||||
|
||||
private File getPaxosDir() {
|
||||
return new File(sd.getCurrentDir(), "paxos");
|
||||
}
|
||||
|
||||
void format(NamespaceInfo nsInfo) throws IOException {
|
||||
setStorageInfo(nsInfo);
|
||||
LOG.info("Formatting journal storage directory " +
|
||||
sd + " with nsid: " + getNamespaceID());
|
||||
sd.clearDirectory();
|
||||
writeProperties(sd);
|
||||
if (!getPaxosDir().mkdirs()) {
|
||||
throw new IOException("Could not create paxos dir: " + getPaxosDir());
|
||||
}
|
||||
}
|
||||
|
||||
void analyzeStorage(NamespaceInfo nsInfo) throws IOException {
|
||||
if (lazyInitted) {
|
||||
checkConsistentNamespace(nsInfo);
|
||||
return;
|
||||
}
|
||||
|
||||
StorageState state = sd.analyzeStorage(StartupOption.REGULAR, this);
|
||||
switch (state) {
|
||||
case NON_EXISTENT:
|
||||
case NOT_FORMATTED:
|
||||
format(nsInfo);
|
||||
// In the NORMAL case below, analyzeStorage() has already locked the
|
||||
// directory for us. But in the case that we format it, we have to
|
||||
// lock it here.
|
||||
// The directory is unlocked in close() when the node shuts down.
|
||||
sd.lock();
|
||||
break;
|
||||
case NORMAL:
|
||||
// Storage directory is already locked by analyzeStorage() - no
|
||||
// need to lock it here.
|
||||
readProperties(sd);
|
||||
checkConsistentNamespace(nsInfo);
|
||||
break;
|
||||
|
||||
default:
|
||||
LOG.warn("TODO: unhandled state for storage dir " + sd + ": " + state);
|
||||
}
|
||||
lazyInitted = true;
|
||||
}
|
||||
|
||||
private void checkConsistentNamespace(NamespaceInfo nsInfo)
|
||||
throws IOException {
|
||||
if (nsInfo.getNamespaceID() != getNamespaceID()) {
|
||||
throw new IOException("Incompatible namespaceID for journal " +
|
||||
this.sd + ": NameNode has nsId " + nsInfo.getNamespaceID() +
|
||||
" but storage has nsId " + getNamespaceID());
|
||||
}
|
||||
|
||||
if (!nsInfo.getClusterID().equals(getClusterID())) {
|
||||
throw new IOException("Incompatible clusterID for journal " +
|
||||
this.sd + ": NameNode has clusterId '" + nsInfo.getClusterID() +
|
||||
"' but storage has clusterId '" + getClusterID() + "'");
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
public void close() throws IOException {
|
||||
LOG.info("Closing journal storage for " + sd);
|
||||
unlockAll();
|
||||
}
|
||||
}
|
|
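// Illustrative on-disk layout for a formatted journal directory (the txids and
// exact file-name padding are examples; the edits file names come from the
// NNStorage helpers used above, and last-promised-epoch is maintained by the
// Journal class below):
//
//   <logDir>/current/VERSION
//   <logDir>/current/last-promised-epoch
//   <logDir>/current/edits_0000000000000000001-0000000000000000100   (finalized)
//   <logDir>/current/edits_inprogress_0000000000000000101
//   <logDir>/current/paxos/101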
@@ -0,0 +1,493 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hdfs.qjournal.server;
|
||||
|
||||
import java.io.Closeable;
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStreamWriter;
|
||||
import java.net.URL;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocol;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.NewEpochResponseProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.PersistedRecoveryPaxosData;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.PrepareRecoveryResponseProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.SegmentStateProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.RequestInfo;
|
||||
import org.apache.hadoop.hdfs.server.common.StorageErrorReporter;
|
||||
import org.apache.hadoop.hdfs.server.namenode.EditLogOutputStream;
|
||||
import org.apache.hadoop.hdfs.server.namenode.FileJournalManager;
|
||||
import org.apache.hadoop.hdfs.server.namenode.FileJournalManager.EditLogFile;
|
||||
import org.apache.hadoop.hdfs.server.namenode.TransferFsImage;
|
||||
import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
|
||||
import org.apache.hadoop.hdfs.server.protocol.RemoteEditLogManifest;
|
||||
import org.apache.hadoop.hdfs.util.AtomicFileOutputStream;
|
||||
import org.apache.hadoop.hdfs.util.PersistentLongFile;
|
||||
import org.apache.hadoop.io.IOUtils;
|
||||
|
||||
import com.google.common.base.Preconditions;
|
||||
import com.google.protobuf.ByteString;
|
||||
import com.google.protobuf.TextFormat;
|
||||
|
||||
/**
|
||||
* A JournalNode can manage journals for several clusters at once.
|
||||
* Each such journal is entirely independent despite being hosted by
|
||||
* the same JVM.
|
||||
*/
|
||||
class Journal implements Closeable {
|
||||
static final Log LOG = LogFactory.getLog(Journal.class);
|
||||
|
||||
|
||||
// Current writing state
|
||||
private EditLogOutputStream curSegment;
|
||||
private long curSegmentTxId = HdfsConstants.INVALID_TXID;
|
||||
private long nextTxId = HdfsConstants.INVALID_TXID;
|
||||
|
||||
private final JNStorage storage;
|
||||
|
||||
/**
|
||||
* When a new writer comes along, it asks each node to promise
|
||||
* to ignore requests from any previous writer, as identified
|
||||
* by epoch number. In order to make such a promise, the epoch
|
||||
* number of that writer is stored persistently on disk.
|
||||
*/
|
||||
private PersistentLongFile lastPromisedEpoch;
|
||||
private static final String LAST_PROMISED_FILENAME = "last-promised-epoch";
|
||||
|
||||
private final FileJournalManager fjm;
|
||||
|
||||
Journal(File logDir, StorageErrorReporter errorReporter) {
|
||||
storage = new JNStorage(logDir, errorReporter);
|
||||
|
||||
File currentDir = storage.getSingularStorageDir().getCurrentDir();
|
||||
this.lastPromisedEpoch = new PersistentLongFile(
|
||||
new File(currentDir, LAST_PROMISED_FILENAME), 0);
|
||||
|
||||
this.fjm = storage.getJournalManager();
|
||||
}
|
||||
|
||||
/**
|
||||
* Iterate over the edit logs stored locally, and set
|
||||
* {@link #curSegmentTxId} to refer to the most recently written
|
||||
* one.
|
||||
*/
|
||||
private synchronized void scanStorage() throws IOException {
|
||||
if (!fjm.getStorageDirectory().getCurrentDir().exists()) {
|
||||
return;
|
||||
}
|
||||
LOG.info("Scanning storage " + fjm);
|
||||
List<EditLogFile> files = fjm.getLogFiles(0);
|
||||
if (!files.isEmpty()) {
|
||||
EditLogFile latestLog = files.get(files.size() - 1);
|
||||
LOG.info("Latest log is " + latestLog);
|
||||
curSegmentTxId = latestLog.getFirstTxId();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Format the local storage with the given namespace.
|
||||
*/
|
||||
void format(NamespaceInfo nsInfo) throws IOException {
|
||||
Preconditions.checkState(nsInfo.getNamespaceID() != 0,
|
||||
"can't format with uninitialized namespace info: %s",
|
||||
nsInfo);
|
||||
storage.format(nsInfo);
|
||||
}
|
||||
|
||||
/**
|
||||
* Unlock and release resources.
|
||||
*/
|
||||
@Override // Closeable
|
||||
public void close() throws IOException {
|
||||
storage.close();
|
||||
}
|
||||
|
||||
JNStorage getStorage() {
|
||||
return storage;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the last epoch which this node has promised not to accept
|
||||
* any lower epoch, or 0 if no promises have been made.
|
||||
*/
|
||||
synchronized long getLastPromisedEpoch() throws IOException {
|
||||
return lastPromisedEpoch.get();
|
||||
}
|
||||
|
||||
/**
|
||||
* Try to create a new epoch for this journal.
|
||||
* @param nsInfo the namespace, which is verified for consistency or used to
|
||||
* format, if the Journal has not yet been written to.
|
||||
* @param epoch the epoch to start
|
||||
* @return the status information necessary to begin recovery
|
||||
* @throws IOException if the node has already made a promise to another
|
||||
* writer with a higher epoch number, if the namespace is inconsistent,
|
||||
* or if a disk error occurs.
|
||||
*/
|
||||
synchronized NewEpochResponseProto newEpoch(
|
||||
NamespaceInfo nsInfo, long epoch) throws IOException {
|
||||
|
||||
// If the storage is unformatted, format it with this NS.
|
||||
// Otherwise, check that the NN's nsinfo matches the storage.
|
||||
storage.analyzeStorage(nsInfo);
|
||||
|
||||
if (epoch <= getLastPromisedEpoch()) {
|
||||
throw new IOException("Proposed epoch " + epoch + " <= last promise " +
|
||||
getLastPromisedEpoch());
|
||||
}
|
||||
|
||||
lastPromisedEpoch.set(epoch);
|
||||
if (curSegment != null) {
|
||||
curSegment.close();
|
||||
curSegment = null;
|
||||
}
|
||||
|
||||
NewEpochResponseProto.Builder builder =
|
||||
NewEpochResponseProto.newBuilder();
|
||||
|
||||
// TODO: we only need to do this once, not on writer switchover.
|
||||
scanStorage();
|
||||
|
||||
if (curSegmentTxId != HdfsConstants.INVALID_TXID) {
|
||||
builder.setLastSegmentTxId(curSegmentTxId);
|
||||
}
|
||||
|
||||
return builder.build();
|
||||
}
|
||||
|
||||
/**
|
||||
* Write a batch of edits to the journal.
|
||||
* @see QJournalProtocol#journal(RequestInfo, long, int, byte[])
|
||||
*/
|
||||
synchronized void journal(RequestInfo reqInfo, long firstTxnId,
|
||||
int numTxns, byte[] records) throws IOException {
|
||||
checkRequest(reqInfo);
|
||||
|
||||
// TODO: if a JN goes down and comes back up, then it will throw
|
||||
// this exception on every edit. We should instead send back
|
||||
// a response indicating the log needs to be rolled, which would
|
||||
// mark the logger on the client side as "pending" -- and have the
|
||||
// NN code look for this condition and trigger a roll when it happens.
|
||||
// That way the node can catch back up and rejoin
|
||||
Preconditions.checkState(curSegment != null,
|
||||
"Can't write, no segment open");
|
||||
Preconditions.checkState(nextTxId == firstTxnId,
|
||||
"Can't write txid " + firstTxnId + " expecting nextTxId=" + nextTxId);
|
||||
|
||||
if (LOG.isTraceEnabled()) {
|
||||
LOG.trace("Writing txid " + firstTxnId + "-" + (firstTxnId + numTxns - 1));
|
||||
}
|
||||
curSegment.writeRaw(records, 0, records.length);
|
||||
curSegment.setReadyToFlush();
|
||||
curSegment.flush();
|
||||
nextTxId += numTxns;
|
||||
}
|
||||
|
||||
/**
|
||||
* Ensure that the given request comes from the correct writer and arrives in order.
|
||||
* @param reqInfo the request info
|
||||
* @throws IOException if the request is invalid.
|
||||
*/
|
||||
private synchronized void checkRequest(RequestInfo reqInfo) throws IOException {
|
||||
// Invariant 25 from ZAB paper
|
||||
if (reqInfo.getEpoch() < lastPromisedEpoch.get()) {
|
||||
throw new IOException("IPC's epoch " + reqInfo.getEpoch() +
|
||||
" is less than the last promised epoch " +
|
||||
lastPromisedEpoch.get());
|
||||
}
|
||||
|
||||
// TODO: should other requests check the _exact_ epoch instead of
|
||||
// the <= check? <= should probably only be necessary for the
|
||||
// first calls
|
||||
|
||||
// TODO: some check on serial number that they only increase from a given
|
||||
// client
|
||||
}
|
||||
|
||||
/**
|
||||
* Start a new segment at the given txid. The previous segment
|
||||
* must have already been finalized.
|
||||
*/
|
||||
public synchronized void startLogSegment(RequestInfo reqInfo, long txid)
|
||||
throws IOException {
|
||||
assert fjm != null;
|
||||
checkRequest(reqInfo);
|
||||
|
||||
Preconditions.checkState(curSegment == null,
|
||||
"Can't start a log segment, already writing " + curSegment);
|
||||
Preconditions.checkState(nextTxId == txid || nextTxId == HdfsConstants.INVALID_TXID,
|
||||
"Can't start log segment " + txid + " expecting nextTxId=" + nextTxId);
|
||||
curSegment = fjm.startLogSegment(txid);
|
||||
curSegmentTxId = txid;
|
||||
nextTxId = txid;
|
||||
}
|
||||
|
||||
/**
|
||||
* Finalize the log segment at the given transaction ID.
|
||||
*/
|
||||
public synchronized void finalizeLogSegment(RequestInfo reqInfo, long startTxId,
|
||||
long endTxId) throws IOException {
|
||||
checkRequest(reqInfo);
|
||||
|
||||
if (startTxId == curSegmentTxId) {
|
||||
if (curSegment != null) {
|
||||
curSegment.close();
|
||||
curSegment = null;
|
||||
}
|
||||
}
|
||||
|
||||
FileJournalManager.EditLogFile elf = fjm.getLogFile(startTxId);
|
||||
if (elf.isInProgress()) {
|
||||
// TODO: this is slow to validate when in non-recovery cases
|
||||
// we already know the length here!
|
||||
|
||||
LOG.info("Validating log about to be finalized: " + elf);
|
||||
elf.validateLog();
|
||||
|
||||
Preconditions.checkState(elf.getLastTxId() == endTxId,
|
||||
"Trying to finalize log %s-%s, but current state of log" +
|
||||
"is %s", startTxId, endTxId, elf);
|
||||
fjm.finalizeLogSegment(startTxId, endTxId);
|
||||
} else {
|
||||
Preconditions.checkArgument(endTxId == elf.getLastTxId(),
|
||||
"Trying to re-finalize already finalized log " +
|
||||
elf + " with different endTxId " + endTxId);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @see QJournalProtocol#getEditLogManifest(String, long)
|
||||
*/
|
||||
public RemoteEditLogManifest getEditLogManifest(long sinceTxId)
|
||||
throws IOException {
|
||||
// TODO: check fencing info?
|
||||
RemoteEditLogManifest manifest = new RemoteEditLogManifest(
|
||||
fjm.getRemoteEditLogs(sinceTxId));
|
||||
return manifest;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the current state of the given segment, or null if the
|
||||
* segment does not exist.
|
||||
*/
|
||||
private SegmentStateProto getSegmentInfo(long segmentTxId)
|
||||
throws IOException {
|
||||
EditLogFile elf = fjm.getLogFile(segmentTxId);
|
||||
if (elf == null) {
|
||||
return null;
|
||||
}
|
||||
if (elf.isInProgress()) {
|
||||
elf.validateLog();
|
||||
}
|
||||
if (elf.getLastTxId() == HdfsConstants.INVALID_TXID) {
|
||||
// no transactions in file
|
||||
throw new AssertionError("TODO: no transactions in file " +
|
||||
elf);
|
||||
}
|
||||
SegmentStateProto ret = SegmentStateProto.newBuilder()
|
||||
.setStartTxId(segmentTxId)
|
||||
.setEndTxId(elf.getLastTxId())
|
||||
.setIsInProgress(elf.isInProgress())
|
||||
.setMd5Sum(ByteString.EMPTY) // TODO
|
||||
.build();
|
||||
LOG.info("getSegmentInfo(" + segmentTxId + "): " + elf + " -> " +
|
||||
TextFormat.shortDebugString(ret));
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* @see QJournalProtocol#prepareRecovery(RequestInfo, long)
|
||||
*/
|
||||
public synchronized PrepareRecoveryResponseProto prepareRecovery(
|
||||
RequestInfo reqInfo, long segmentTxId) throws IOException {
|
||||
checkRequest(reqInfo);
|
||||
|
||||
PrepareRecoveryResponseProto.Builder builder =
|
||||
PrepareRecoveryResponseProto.newBuilder();
|
||||
|
||||
PersistedRecoveryPaxosData previouslyAccepted = getPersistedPaxosData(segmentTxId);
|
||||
if (previouslyAccepted != null) {
|
||||
builder.setAcceptedInEpoch(previouslyAccepted.getAcceptedInEpoch())
|
||||
.setSegmentState(previouslyAccepted.getSegmentState());
|
||||
} else {
|
||||
SegmentStateProto segInfo = getSegmentInfo(segmentTxId);
|
||||
if (segInfo != null) {
|
||||
builder.setSegmentState(segInfo);
|
||||
}
|
||||
}
|
||||
|
||||
PrepareRecoveryResponseProto resp = builder.build();
|
||||
LOG.info("Prepared recovery for segment " + segmentTxId + ": " +
|
||||
TextFormat.shortDebugString(resp));
|
||||
return resp;
|
||||
}
|
||||
|
||||
/**
|
||||
* @see QJournalProtocol#acceptRecovery(RequestInfo, SegmentStateProto, URL)
|
||||
*/
|
||||
public synchronized void acceptRecovery(RequestInfo reqInfo,
|
||||
SegmentStateProto segment, URL fromUrl)
|
||||
throws IOException {
|
||||
checkRequest(reqInfo);
|
||||
long segmentTxId = segment.getStartTxId();
|
||||
|
||||
// TODO: right now, a recovery of a segment when the log is
|
||||
// completely empty (i.e. startLogSegment() but no txns)
|
||||
// will fail this assertion here, since endTxId < startTxId
|
||||
Preconditions.checkArgument(segment.getEndTxId() > 0 &&
|
||||
segment.getEndTxId() >= segmentTxId,
|
||||
"bad recovery state for segment %s: %s",
|
||||
segmentTxId, TextFormat.shortDebugString(segment));
|
||||
|
||||
PersistedRecoveryPaxosData oldData = getPersistedPaxosData(segmentTxId);
|
||||
PersistedRecoveryPaxosData newData = PersistedRecoveryPaxosData.newBuilder()
|
||||
.setAcceptedInEpoch(reqInfo.getEpoch())
|
||||
.setSegmentState(segment)
|
||||
.build();
|
||||
if (oldData != null) {
|
||||
Preconditions.checkState(oldData.getAcceptedInEpoch() <= reqInfo.getEpoch(),
|
||||
"Bad paxos transition, out-of-order epochs.\nOld: %s\nNew: %s\n",
|
||||
oldData, newData);
|
||||
}
|
||||
|
||||
SegmentStateProto currentSegment = getSegmentInfo(segmentTxId);
|
||||
// TODO: this can be null, in the case that one of the loggers started
|
||||
// the next segment, but others did not! add regression test and null
|
||||
// check in next condition below.
|
||||
|
||||
// TODO: what if they have the same length but one is finalized and the
|
||||
// other isn't! cover that case.
|
||||
if (currentSegment.getEndTxId() != segment.getEndTxId()) {
|
||||
syncLog(reqInfo, segment, fromUrl);
|
||||
} else {
|
||||
LOG.info("Skipping download of log " +
|
||||
TextFormat.shortDebugString(segment) +
|
||||
": already have up-to-date logs");
|
||||
}
|
||||
|
||||
// TODO: is it OK that this is non-atomic?
|
||||
// we might be left with an older epoch recorded, but a newer log
|
||||
|
||||
persistPaxosData(segmentTxId, newData);
|
||||
LOG.info("Accepted recovery for segment " + segmentTxId + ": " +
|
||||
TextFormat.shortDebugString(newData));
|
||||
}
|
||||
|
||||
/**
|
||||
* Synchronize a log segment from another JournalNode.
|
||||
* @param reqInfo the request info for the recovery IPC
|
||||
* @param segment the desired state of the segment being recovered
|
||||
* @param url the HTTP URL of the JournalNode from which the segment can be downloaded
|
||||
* @throws IOException if the download fails or the segment cannot be installed
|
||||
*/
|
||||
private void syncLog(RequestInfo reqInfo,
|
||||
SegmentStateProto segment, URL url) throws IOException {
|
||||
String tmpFileName =
|
||||
"synclog_" + segment.getStartTxId() + "_" +
|
||||
reqInfo.getEpoch() + "." + reqInfo.getIpcSerialNumber();
|
||||
|
||||
List<File> localPaths = storage.getFiles(null, tmpFileName);
|
||||
assert localPaths.size() == 1;
|
||||
File tmpFile = localPaths.get(0);
|
||||
|
||||
boolean success = false;
|
||||
|
||||
LOG.info("Synchronizing log " +
|
||||
TextFormat.shortDebugString(segment) + " from " + url);
|
||||
TransferFsImage.doGetUrl(url, localPaths, storage, true);
|
||||
assert tmpFile.exists();
|
||||
try {
|
||||
success = tmpFile.renameTo(storage.getInProgressEditLog(
|
||||
segment.getStartTxId()));
|
||||
if (success) {
|
||||
// If we're synchronizing the latest segment, update our cached
|
||||
// info.
|
||||
// TODO: can this be done more generally?
|
||||
if (curSegmentTxId == segment.getStartTxId()) {
|
||||
nextTxId = segment.getEndTxId() + 1;
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
if (!success) {
|
||||
if (!tmpFile.delete()) {
|
||||
LOG.warn("Failed to delete temporary file " + tmpFile);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieve from disk the previously persisted recovery data for the given segment.
|
||||
*/
|
||||
private PersistedRecoveryPaxosData getPersistedPaxosData(long segmentTxId)
|
||||
throws IOException {
|
||||
File f = storage.getPaxosFile(segmentTxId);
|
||||
if (!f.exists()) {
|
||||
// Default instance has no fields filled in (they're optional)
|
||||
return null;
|
||||
}
|
||||
|
||||
InputStream in = new FileInputStream(f);
|
||||
try {
|
||||
PersistedRecoveryPaxosData ret = PersistedRecoveryPaxosData.parseDelimitedFrom(in);
|
||||
Preconditions.checkState(ret != null &&
|
||||
ret.getSegmentState().getStartTxId() == segmentTxId,
|
||||
"Bad persisted data for segment %s: %s",
|
||||
segmentTxId, ret);
|
||||
return ret;
|
||||
} finally {
|
||||
IOUtils.closeStream(in);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Persist to disk the data needed to recover the given segment.
|
||||
*/
|
||||
private void persistPaxosData(long segmentTxId,
|
||||
PersistedRecoveryPaxosData newData) throws IOException {
|
||||
File f = storage.getPaxosFile(segmentTxId);
|
||||
boolean success = false;
|
||||
AtomicFileOutputStream fos = new AtomicFileOutputStream(f);
|
||||
try {
|
||||
newData.writeDelimitedTo(fos);
|
||||
fos.write('\n');
|
||||
// Write human-readable data after the protobuf. This is only
|
||||
// to assist in debugging -- it's not parsed at all.
|
||||
OutputStreamWriter writer = new OutputStreamWriter(fos);
|
||||
|
||||
writer.write(String.valueOf(newData));
|
||||
writer.write('\n');
|
||||
writer.flush();
|
||||
|
||||
fos.flush();
|
||||
success = true;
|
||||
} finally {
|
||||
if (success) {
|
||||
IOUtils.closeStream(fos);
|
||||
} else {
|
||||
fos.abort();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
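The epoch handling above, newEpoch() together with checkRequest(), is the fencing half of the quorum protocol: a JournalNode promises never again to accept traffic from a writer with a lower epoch, and persists that promise so it survives restarts. Below is a standalone sketch of that rule, stripped of the storage and protobuf plumbing; everything in it is invented for illustration and is not part of the patch.

import java.io.IOException;

/**
 * Standalone illustration (not part of the patch) of the epoch-based fencing
 * rule implemented by Journal#newEpoch and Journal#checkRequest above.
 */
class EpochFencingSketch {
  private long lastPromisedEpoch = 0; // the real Journal persists this value on disk

  /** Mirrors Journal#newEpoch: only strictly newer epochs may be promised. */
  synchronized void newEpoch(long epoch) throws IOException {
    if (epoch <= lastPromisedEpoch) {
      throw new IOException("Proposed epoch " + epoch +
          " <= last promise " + lastPromisedEpoch);
    }
    lastPromisedEpoch = epoch; // the real code also closes any open segment here
  }

  /** Mirrors Journal#checkRequest: requests from older writers are rejected. */
  synchronized void checkRequest(long requestEpoch) throws IOException {
    if (requestEpoch < lastPromisedEpoch) {
      throw new IOException("IPC's epoch " + requestEpoch +
          " is less than the last promised epoch " + lastPromisedEpoch);
    }
  }

  public static void main(String[] args) throws IOException {
    EpochFencingSketch jn = new EpochFencingSketch();
    jn.newEpoch(1);     // writer A becomes the active writer
    jn.checkRequest(1); // writer A's edits are accepted
    jn.newEpoch(2);     // writer B takes over and fences writer A
    try {
      jn.checkRequest(1); // writer A is now rejected
    } catch (IOException expected) {
      System.out.println("stale writer fenced: " + expected.getMessage());
    }
  }
}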
@ -0,0 +1,202 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hdfs.qjournal.server;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.net.InetSocketAddress;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.classification.InterfaceAudience;
|
||||
import org.apache.hadoop.conf.Configurable;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.hdfs.DFSConfigKeys;
|
||||
import org.apache.hadoop.hdfs.HdfsConfiguration;
|
||||
import org.apache.hadoop.hdfs.qjournal.client.QuorumJournalManager;
|
||||
import org.apache.hadoop.hdfs.server.common.StorageErrorReporter;
|
||||
import org.apache.hadoop.io.IOUtils;
|
||||
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
|
||||
import org.apache.hadoop.metrics2.source.JvmMetrics;
|
||||
import org.apache.hadoop.util.Tool;
|
||||
import org.apache.hadoop.util.ToolRunner;
|
||||
|
||||
import com.google.common.base.Preconditions;
|
||||
import com.google.common.collect.Maps;
|
||||
|
||||
/**
|
||||
* The JournalNode is a daemon which allows namenodes using
|
||||
* the QuorumJournalManager to log and retrieve edits stored
|
||||
* remotely. It is a thin wrapper around a local edit log
|
||||
* directory with the addition of facilities to participate
|
||||
* in the quorum protocol.
|
||||
*/
|
||||
@InterfaceAudience.Private
|
||||
public class JournalNode implements Tool, Configurable {
|
||||
public static final Log LOG = LogFactory.getLog(JournalNode.class);
|
||||
private Configuration conf;
|
||||
private JournalNodeRpcServer rpcServer;
|
||||
private JournalNodeHttpServer httpServer;
|
||||
private Map<String, Journal> journalsById = Maps.newHashMap();
|
||||
|
||||
static {
|
||||
HdfsConfiguration.init();
|
||||
}
|
||||
|
||||
/**
|
||||
* When stopped, the daemon will exit with this code.
|
||||
*/
|
||||
private int resultCode = 0;
|
||||
|
||||
synchronized Journal getOrCreateJournal(String jid) {
|
||||
QuorumJournalManager.checkJournalId(jid);
|
||||
|
||||
Journal journal = journalsById.get(jid);
|
||||
if (journal == null) {
|
||||
File logDir = getLogDir(jid);
|
||||
LOG.info("Initializing journal in directory " + logDir);
|
||||
journal = new Journal(logDir, new ErrorReporter());
|
||||
journalsById.put(jid, journal);
|
||||
}
|
||||
|
||||
return journal;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void setConf(Configuration conf) {
|
||||
this.conf = conf;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Configuration getConf() {
|
||||
return conf;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int run(String[] args) throws Exception {
|
||||
start();
|
||||
return join();
|
||||
}
|
||||
|
||||
/**
|
||||
* Start listening for edits via RPC.
|
||||
*/
|
||||
public void start() throws IOException {
|
||||
Preconditions.checkState(!isStarted(), "JN already running");
|
||||
|
||||
DefaultMetricsSystem.initialize("JournalNode");
|
||||
JvmMetrics.create("JournalNode",
|
||||
conf.get(DFSConfigKeys.DFS_METRICS_SESSION_ID_KEY),
|
||||
DefaultMetricsSystem.instance());
|
||||
|
||||
httpServer = new JournalNodeHttpServer(conf, this);
|
||||
httpServer.start();
|
||||
|
||||
rpcServer = new JournalNodeRpcServer(conf, this);
|
||||
rpcServer.start();
|
||||
}
|
||||
|
||||
public boolean isStarted() {
|
||||
return rpcServer != null;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the address the IPC server is bound to
|
||||
*/
|
||||
public InetSocketAddress getBoundIpcAddress() {
|
||||
return rpcServer.getAddress();
|
||||
}
|
||||
|
||||
|
||||
public InetSocketAddress getBoundHttpAddress() {
|
||||
return httpServer.getAddress();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Stop the daemon with the given status code
|
||||
* @param rc the status code with which to exit (non-zero
|
||||
* should indicate an error)
|
||||
*/
|
||||
public void stop(int rc) {
|
||||
this.resultCode = rc;
|
||||
|
||||
if (rpcServer != null) {
|
||||
rpcServer.stop();
|
||||
}
|
||||
|
||||
if (httpServer != null) {
|
||||
try {
|
||||
httpServer.stop();
|
||||
} catch (IOException ioe) {
|
||||
LOG.warn("Unable to stop HTTP server for " + this, ioe);
|
||||
}
|
||||
}
|
||||
|
||||
for (Journal j : journalsById.values()) {
|
||||
IOUtils.cleanup(LOG, j);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Wait for the daemon to exit.
|
||||
* @return the result code (non-zero if error)
|
||||
*/
|
||||
int join() throws InterruptedException {
|
||||
if (rpcServer != null) {
|
||||
rpcServer.join();
|
||||
}
|
||||
return resultCode;
|
||||
}
|
||||
|
||||
public void stopAndJoin(int rc) throws InterruptedException {
|
||||
stop(rc);
|
||||
join();
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the directory inside our configured storage
|
||||
* dir which corresponds to a given journal.
|
||||
* @param jid the journal identifier
|
||||
* @return the file, which may or may not exist yet
|
||||
*/
|
||||
private File getLogDir(String jid) {
|
||||
String dir = conf.get(DFSConfigKeys.DFS_JOURNALNODE_EDITS_DIR_KEY,
|
||||
DFSConfigKeys.DFS_JOURNALNODE_EDITS_DIR_DEFAULT);
|
||||
Preconditions.checkArgument(jid != null &&
|
||||
!jid.isEmpty(),
|
||||
"bad journal identifier: %s", jid);
|
||||
return new File(new File(dir), jid);
|
||||
}
|
||||
|
||||
|
||||
private class ErrorReporter implements StorageErrorReporter {
|
||||
@Override
|
||||
public void reportErrorOnFile(File f) {
|
||||
LOG.fatal("Error reported on file " + f + "... exiting",
|
||||
new Exception());
|
||||
stop(1);
|
||||
}
|
||||
}
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
System.exit(ToolRunner.run(new JournalNode(), args));
|
||||
}
|
||||
}
|
|
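MiniJournalCluster, later in this patch, drives a JournalNode exactly through the public methods shown here: construct it, call setConf() and start(), then read back the ephemeral bind addresses. A minimal embedded-usage sketch follows, assuming a writable local edits directory and the HDFS webapp resources on the classpath; the path below is invented.

import java.net.InetSocketAddress;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.qjournal.server.JournalNode;

public class JournalNodeEmbeddedSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Hypothetical local path; each journal id gets its own subdirectory under it.
    conf.set(DFSConfigKeys.DFS_JOURNALNODE_EDITS_DIR_KEY, "/tmp/jn-sketch");

    JournalNode jn = new JournalNode();
    jn.setConf(conf);
    jn.start();

    // Both servers may bind ephemeral ports, so read the actual addresses back.
    InetSocketAddress ipc = jn.getBoundIpcAddress();
    InetSocketAddress http = jn.getBoundHttpAddress();
    System.out.println("JournalNode RPC at " + ipc + ", HTTP at " + http);

    // Shut down cleanly; a non-zero code would indicate an error.
    jn.stopAndJoin(0);
  }
}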
@ -0,0 +1,126 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hdfs.qjournal.server;
|
||||
|
||||
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_ADMIN;
|
||||
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_JOURNALNODE_KEYTAB_FILE_KEY;
|
||||
import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_JOURNALNODE_USER_NAME_KEY;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.InetSocketAddress;
|
||||
|
||||
import javax.servlet.ServletContext;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.classification.InterfaceAudience;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.hdfs.DFSConfigKeys;
|
||||
import org.apache.hadoop.hdfs.server.common.JspHelper;
|
||||
import org.apache.hadoop.http.HttpServer;
|
||||
import org.apache.hadoop.net.NetUtils;
|
||||
import org.apache.hadoop.security.UserGroupInformation;
|
||||
import org.apache.hadoop.security.authorize.AccessControlList;
|
||||
|
||||
/**
|
||||
* Encapsulates the HTTP server started by the JournalNode.
|
||||
*/
|
||||
@InterfaceAudience.Private
|
||||
public class JournalNodeHttpServer {
|
||||
public static final Log LOG = LogFactory.getLog(
|
||||
JournalNodeHttpServer.class);
|
||||
|
||||
public static final String JN_ATTRIBUTE_KEY = "localjournal";
|
||||
|
||||
private HttpServer httpServer;
|
||||
private int infoPort;
|
||||
private JournalNode localJournalNode;
|
||||
|
||||
private final Configuration conf;
|
||||
|
||||
JournalNodeHttpServer(Configuration conf, JournalNode jn) {
|
||||
this.conf = conf;
|
||||
this.localJournalNode = jn;
|
||||
}
|
||||
|
||||
void start() throws IOException {
|
||||
final InetSocketAddress bindAddr = getAddress(conf);
|
||||
|
||||
// initialize the webserver for uploading/downloading files.
|
||||
LOG.info("Starting web server as: "
|
||||
+ UserGroupInformation.getCurrentUser().getUserName());
|
||||
|
||||
int tmpInfoPort = bindAddr.getPort();
|
||||
httpServer = new HttpServer("journal", bindAddr.getHostName(),
|
||||
tmpInfoPort, tmpInfoPort == 0, conf, new AccessControlList(conf
|
||||
.get(DFS_ADMIN, " "))) {
|
||||
{
|
||||
if (UserGroupInformation.isSecurityEnabled()) {
|
||||
initSpnego(conf, DFS_JOURNALNODE_USER_NAME_KEY,
|
||||
DFS_JOURNALNODE_KEYTAB_FILE_KEY);
|
||||
}
|
||||
}
|
||||
};
|
||||
httpServer.setAttribute(JN_ATTRIBUTE_KEY, localJournalNode);
|
||||
httpServer.setAttribute(JspHelper.CURRENT_CONF, conf);
|
||||
httpServer.addInternalServlet("getJournal", "/getJournal",
|
||||
GetJournalEditServlet.class, true);
|
||||
httpServer.start();
|
||||
|
||||
// The web-server port can be ephemeral... ensure we have the correct info
|
||||
infoPort = httpServer.getPort();
|
||||
|
||||
LOG.info("Journal Web-server up at: " + bindAddr + ":" + infoPort);
|
||||
}
|
||||
|
||||
void stop() throws IOException {
|
||||
if (httpServer != null) {
|
||||
try {
|
||||
httpServer.stop();
|
||||
} catch (Exception e) {
|
||||
throw new IOException(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the actual address bound to by the running server.
|
||||
*/
|
||||
public InetSocketAddress getAddress() {
|
||||
InetSocketAddress addr = httpServer.getListenerAddress();
|
||||
assert addr.getPort() != 0;
|
||||
return addr;
|
||||
}
|
||||
|
||||
private static InetSocketAddress getAddress(Configuration conf) {
|
||||
String addr = conf.get(DFSConfigKeys.DFS_JOURNALNODE_HTTP_ADDRESS_KEY,
|
||||
DFSConfigKeys.DFS_JOURNALNODE_HTTP_ADDRESS_DEFAULT);
|
||||
return NetUtils.createSocketAddr(addr,
|
||||
DFSConfigKeys.DFS_JOURNALNODE_HTTP_PORT_DEFAULT,
|
||||
DFSConfigKeys.DFS_JOURNALNODE_HTTP_ADDRESS_KEY);
|
||||
}
|
||||
|
||||
public static Journal getJournalFromContext(ServletContext context, String jid) {
|
||||
JournalNode jn = (JournalNode)context.getAttribute(JN_ATTRIBUTE_KEY);
|
||||
return jn.getOrCreateJournal(jid);
|
||||
}
|
||||
|
||||
public static Configuration getConfFromContext(ServletContext context) {
|
||||
return (Configuration) context.getAttribute(JspHelper.CURRENT_CONF);
|
||||
}
|
||||
}
|
|
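Servlets hosted by this server, such as the GetJournalEditServlet registered above, reach their Journal through the two static accessors at the bottom of the class. A hypothetical servlet sketch, assuming a "jid" request parameter (invented here) naming the journal:

package org.apache.hadoop.hdfs.qjournal.server;

import java.io.IOException;

import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

/** Hypothetical servlet showing how JournalNode servlets locate their Journal. */
public class JournalInfoSketchServlet extends HttpServlet {
  private static final long serialVersionUID = 1L;

  @Override
  protected void doGet(HttpServletRequest req, HttpServletResponse resp)
      throws IOException {
    // "jid" is an invented request parameter used only in this sketch.
    String jid = req.getParameter("jid");
    // Look up the JournalNode stashed in the servlet context under JN_ATTRIBUTE_KEY.
    Journal journal =
        JournalNodeHttpServer.getJournalFromContext(getServletContext(), jid);
    resp.getWriter().println("last promised epoch: " +
        journal.getLastPromisedEpoch());
  }
}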
@ -0,0 +1,160 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hdfs.qjournal.server;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.InetSocketAddress;
|
||||
import java.net.URL;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.hdfs.DFSConfigKeys;
|
||||
import org.apache.hadoop.hdfs.protocolPB.PBHelper;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocol;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.GetEditLogManifestResponseProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.GetJournalStateResponseProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.NewEpochResponseProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.PrepareRecoveryResponseProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.QJournalProtocolService;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.SegmentStateProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.RequestInfo;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocolPB.QJournalProtocolPB;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocolPB.QJournalProtocolServerSideTranslatorPB;
|
||||
import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
|
||||
import org.apache.hadoop.hdfs.server.protocol.RemoteEditLogManifest;
|
||||
import org.apache.hadoop.ipc.ProtobufRpcEngine;
|
||||
import org.apache.hadoop.ipc.RPC;
|
||||
import org.apache.hadoop.ipc.RPC.Server;
|
||||
import org.apache.hadoop.net.NetUtils;
|
||||
|
||||
import com.google.protobuf.BlockingService;
|
||||
|
||||
class JournalNodeRpcServer implements QJournalProtocol {
|
||||
|
||||
private static final int HANDLER_COUNT = 5;
|
||||
private JournalNode jn;
|
||||
private Server server;
|
||||
|
||||
JournalNodeRpcServer(Configuration conf, JournalNode jn) throws IOException {
|
||||
this.jn = jn;
|
||||
|
||||
InetSocketAddress addr = getAddress(conf);
|
||||
RPC.setProtocolEngine(conf, QJournalProtocolPB.class,
|
||||
ProtobufRpcEngine.class);
|
||||
QJournalProtocolServerSideTranslatorPB translator =
|
||||
new QJournalProtocolServerSideTranslatorPB(this);
|
||||
BlockingService service = QJournalProtocolService
|
||||
.newReflectiveBlockingService(translator);
|
||||
|
||||
this.server = RPC.getServer(
|
||||
QJournalProtocolPB.class,
|
||||
service, addr.getHostName(),
|
||||
addr.getPort(), HANDLER_COUNT, false, conf,
|
||||
null /*secretManager*/);
|
||||
}
|
||||
|
||||
void start() {
|
||||
this.server.start();
|
||||
}
|
||||
|
||||
public InetSocketAddress getAddress() {
|
||||
return server.getListenerAddress();
|
||||
}
|
||||
|
||||
void join() throws InterruptedException {
|
||||
this.server.join();
|
||||
}
|
||||
|
||||
void stop() {
|
||||
this.server.stop();
|
||||
}
|
||||
|
||||
private static InetSocketAddress getAddress(Configuration conf) {
|
||||
String addr = conf.get(
|
||||
DFSConfigKeys.DFS_JOURNALNODE_RPC_ADDRESS_KEY,
|
||||
DFSConfigKeys.DFS_JOURNALNODE_RPC_ADDRESS_DEFAULT);
|
||||
return NetUtils.createSocketAddr(addr, 0,
|
||||
DFSConfigKeys.DFS_JOURNALNODE_RPC_ADDRESS_KEY);
|
||||
}
|
||||
|
||||
@Override
|
||||
public GetJournalStateResponseProto getJournalState(String journalId)
|
||||
throws IOException {
|
||||
long epoch = jn.getOrCreateJournal(journalId).getLastPromisedEpoch();
|
||||
return GetJournalStateResponseProto.newBuilder()
|
||||
.setLastPromisedEpoch(epoch)
|
||||
.setHttpPort(jn.getBoundHttpAddress().getPort())
|
||||
.build();
|
||||
}
|
||||
|
||||
@Override
|
||||
public NewEpochResponseProto newEpoch(String journalId,
|
||||
NamespaceInfo nsInfo,
|
||||
long epoch) throws IOException {
|
||||
return jn.getOrCreateJournal(journalId).newEpoch(nsInfo, epoch);
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void journal(RequestInfo reqInfo, long firstTxnId,
|
||||
int numTxns, byte[] records) throws IOException {
|
||||
jn.getOrCreateJournal(reqInfo.getJournalId())
|
||||
.journal(reqInfo, firstTxnId, numTxns, records);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void startLogSegment(RequestInfo reqInfo, long txid)
|
||||
throws IOException {
|
||||
jn.getOrCreateJournal(reqInfo.getJournalId())
|
||||
.startLogSegment(reqInfo, txid);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void finalizeLogSegment(RequestInfo reqInfo, long startTxId,
|
||||
long endTxId) throws IOException {
|
||||
jn.getOrCreateJournal(reqInfo.getJournalId())
|
||||
.finalizeLogSegment(reqInfo, startTxId, endTxId);
|
||||
}
|
||||
|
||||
@Override
|
||||
public GetEditLogManifestResponseProto getEditLogManifest(String jid,
|
||||
long sinceTxId) throws IOException {
|
||||
|
||||
RemoteEditLogManifest manifest = jn.getOrCreateJournal(jid)
|
||||
.getEditLogManifest(sinceTxId);
|
||||
|
||||
return GetEditLogManifestResponseProto.newBuilder()
|
||||
.setManifest(PBHelper.convert(manifest))
|
||||
.setHttpPort(jn.getBoundHttpAddress().getPort())
|
||||
.build();
|
||||
}
|
||||
|
||||
@Override
|
||||
public PrepareRecoveryResponseProto prepareRecovery(RequestInfo reqInfo,
|
||||
long segmentTxId) throws IOException {
|
||||
return jn.getOrCreateJournal(reqInfo.getJournalId())
|
||||
.prepareRecovery(reqInfo, segmentTxId);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void acceptRecovery(RequestInfo reqInfo, SegmentStateProto log,
|
||||
URL fromUrl) throws IOException {
|
||||
jn.getOrCreateJournal(reqInfo.getJournalId())
|
||||
.acceptRecovery(reqInfo, log, fromUrl);
|
||||
}
|
||||
|
||||
}
|
|
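Both the RPC server here and the HTTP server above resolve their bind addresses the same way: read the dfs.journalnode address key, fall back to the default, and parse the result with NetUtils.createSocketAddr. A small sketch of overriding those keys before starting a JournalNode; the 0.0.0.0 addresses and port numbers below are illustrative assumptions, not values taken from this patch.

import java.net.InetSocketAddress;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.net.NetUtils;

public class JournalNodeAddressSketch {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // Hypothetical overrides; without them the DFS_JOURNALNODE_*_ADDRESS_DEFAULT
    // values referenced in the code above are used.
    conf.set(DFSConfigKeys.DFS_JOURNALNODE_RPC_ADDRESS_KEY, "0.0.0.0:8485");
    conf.set(DFSConfigKeys.DFS_JOURNALNODE_HTTP_ADDRESS_KEY, "0.0.0.0:8480");

    // Mirrors JournalNodeRpcServer#getAddress(Configuration).
    InetSocketAddress rpc = NetUtils.createSocketAddr(
        conf.get(DFSConfigKeys.DFS_JOURNALNODE_RPC_ADDRESS_KEY), 0,
        DFSConfigKeys.DFS_JOURNALNODE_RPC_ADDRESS_KEY);
    System.out.println("JournalNode RPC server would bind to " + rpc);
  }
}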
@ -39,7 +39,8 @@ public final class HdfsServerConstants {
|
|||
*/
|
||||
static public enum NodeType {
|
||||
NAME_NODE,
|
||||
DATA_NODE;
|
||||
DATA_NODE,
|
||||
JOURNAL_NODE;
|
||||
}
|
||||
|
||||
/** Startup options */
|
||||
|
|
|
@ -41,6 +41,8 @@ import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.StartupOption;
|
|||
import org.apache.hadoop.fs.FileUtil;
|
||||
import org.apache.hadoop.util.VersionInfo;
|
||||
|
||||
import com.google.common.base.Preconditions;
|
||||
|
||||
|
||||
|
||||
/**
|
||||
|
@ -75,7 +77,7 @@ public abstract class Storage extends StorageInfo {
|
|||
/** Layout versions of 0.20.203 release */
|
||||
public static final int[] LAYOUT_VERSIONS_203 = {-19, -31};
|
||||
|
||||
private static final String STORAGE_FILE_LOCK = "in_use.lock";
|
||||
public static final String STORAGE_FILE_LOCK = "in_use.lock";
|
||||
protected static final String STORAGE_FILE_VERSION = "VERSION";
|
||||
public static final String STORAGE_DIR_CURRENT = "current";
|
||||
public static final String STORAGE_DIR_PREVIOUS = "previous";
|
||||
|
@ -719,6 +721,15 @@ public abstract class Storage extends StorageInfo {
|
|||
return storageDirs.get(idx);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the storage directory, with the precondition that this storage
|
||||
* has exactly one storage directory
|
||||
*/
|
||||
public StorageDirectory getSingularStorageDir() {
|
||||
Preconditions.checkState(storageDirs.size() == 1);
|
||||
return storageDirs.get(0);
|
||||
}
|
||||
|
||||
protected void addStorageDir(StorageDirectory sd) {
|
||||
storageDirs.add(sd);
|
||||
}
|
||||
|
|
|
@ -20,6 +20,7 @@ package org.apache.hadoop.hdfs.server.namenode;
|
|||
import java.io.IOException;
|
||||
import java.io.OutputStream;
|
||||
|
||||
import org.apache.hadoop.classification.InterfaceAudience;
|
||||
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
|
||||
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp.Writer;
|
||||
import org.apache.hadoop.io.DataOutputBuffer;
|
||||
|
@ -34,7 +35,8 @@ import com.google.common.base.Preconditions;
|
|||
* to progress concurrently to flushes without allocating new buffers each
|
||||
* time.
|
||||
*/
|
||||
class EditsDoubleBuffer {
|
||||
@InterfaceAudience.Private
|
||||
public class EditsDoubleBuffer {
|
||||
|
||||
private TxnBuffer bufCurrent; // current buffer for writing
|
||||
private TxnBuffer bufReady; // buffer ready for flushing
|
||||
|
@ -51,11 +53,11 @@ class EditsDoubleBuffer {
|
|||
bufCurrent.writeOp(op);
|
||||
}
|
||||
|
||||
void writeRaw(byte[] bytes, int offset, int length) throws IOException {
|
||||
public void writeRaw(byte[] bytes, int offset, int length) throws IOException {
|
||||
bufCurrent.write(bytes, offset, length);
|
||||
}
|
||||
|
||||
void close() throws IOException {
|
||||
public void close() throws IOException {
|
||||
Preconditions.checkNotNull(bufCurrent);
|
||||
Preconditions.checkNotNull(bufReady);
|
||||
|
||||
|
@ -69,7 +71,7 @@ class EditsDoubleBuffer {
|
|||
bufCurrent = bufReady = null;
|
||||
}
|
||||
|
||||
void setReadyToFlush() {
|
||||
public void setReadyToFlush() {
|
||||
assert isFlushed() : "previous data not flushed yet";
|
||||
TxnBuffer tmp = bufReady;
|
||||
bufReady = bufCurrent;
|
||||
|
@ -80,12 +82,12 @@ class EditsDoubleBuffer {
|
|||
* Writes the content of the "ready" buffer to the given output stream,
|
||||
* and resets it. Does not swap any buffers.
|
||||
*/
|
||||
void flushTo(OutputStream out) throws IOException {
|
||||
public void flushTo(OutputStream out) throws IOException {
|
||||
bufReady.writeTo(out); // write data to file
|
||||
bufReady.reset(); // erase all data in the buffer
|
||||
}
|
||||
|
||||
boolean shouldForceSync() {
|
||||
public boolean shouldForceSync() {
|
||||
return bufCurrent.size() >= initBufferSize;
|
||||
}
|
||||
|
||||
|
@ -120,6 +122,12 @@ class EditsDoubleBuffer {
|
|||
return bufReady.numTxns;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the number of bytes that are ready to be flushed
|
||||
*/
|
||||
public int countReadyBytes() {
|
||||
return bufReady.size();
|
||||
}
|
||||
|
||||
private static class TxnBuffer extends DataOutputBuffer {
|
||||
long firstTxId;
|
||||
|
|
|
@ -1136,6 +1136,7 @@ public class FSEditLog {
|
|||
journalSet.recoverUnfinalizedSegments();
|
||||
} catch (IOException ex) {
|
||||
// All journals have failed, it is handled in logSync.
|
||||
// TODO: are we sure this is OK?
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -29,6 +29,7 @@ import java.util.Collections;
|
|||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.hadoop.classification.InterfaceAudience;
|
||||
import org.apache.hadoop.fs.FileUtil;
|
||||
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
|
||||
import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
|
||||
|
@ -39,6 +40,7 @@ import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeFile;
|
|||
import org.apache.hadoop.hdfs.server.protocol.RemoteEditLog;
|
||||
|
||||
import com.google.common.annotations.VisibleForTesting;
|
||||
import com.google.common.base.Joiner;
|
||||
import com.google.common.base.Preconditions;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.collect.ComparisonChain;
|
||||
|
@ -50,7 +52,8 @@ import com.google.common.collect.ComparisonChain;
|
|||
* Note: this class is not thread-safe and should be externally
|
||||
* synchronized.
|
||||
*/
|
||||
class FileJournalManager implements JournalManager {
|
||||
@InterfaceAudience.Private
|
||||
public class FileJournalManager implements JournalManager {
|
||||
private static final Log LOG = LogFactory.getLog(FileJournalManager.class);
|
||||
|
||||
private final StorageDirectory sd;
|
||||
|
@ -147,7 +150,7 @@ class FileJournalManager implements JournalManager {
|
|||
* @return a list of remote edit logs
|
||||
* @throws IOException if edit logs cannot be listed.
|
||||
*/
|
||||
List<RemoteEditLog> getRemoteEditLogs(long firstTxId) throws IOException {
|
||||
public List<RemoteEditLog> getRemoteEditLogs(long firstTxId) throws IOException {
|
||||
File currentDir = sd.getCurrentDir();
|
||||
List<EditLogFile> allLogFiles = matchEditLogs(currentDir);
|
||||
List<RemoteEditLog> ret = Lists.newArrayListWithCapacity(
|
||||
|
@ -165,6 +168,8 @@ class FileJournalManager implements JournalManager {
|
|||
}
|
||||
}
|
||||
|
||||
Collections.sort(ret);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -178,7 +183,7 @@ class FileJournalManager implements JournalManager {
|
|||
* @throws IOException
|
||||
* IOException thrown for invalid logDir
|
||||
*/
|
||||
static List<EditLogFile> matchEditLogs(File logDir) throws IOException {
|
||||
public static List<EditLogFile> matchEditLogs(File logDir) throws IOException {
|
||||
return matchEditLogs(FileUtil.listFiles(logDir));
|
||||
}
|
||||
|
||||
|
@ -206,7 +211,7 @@ class FileJournalManager implements JournalManager {
|
|||
try {
|
||||
long startTxId = Long.valueOf(inProgressEditsMatch.group(1));
|
||||
ret.add(
|
||||
new EditLogFile(f, startTxId, startTxId, true));
|
||||
new EditLogFile(f, startTxId, HdfsConstants.INVALID_TXID, true));
|
||||
} catch (NumberFormatException nfe) {
|
||||
LOG.error("In-progress edits file " + f + " has improperly " +
|
||||
"formatted transaction ID");
|
||||
|
@ -304,7 +309,7 @@ class FileJournalManager implements JournalManager {
|
|||
}
|
||||
}
|
||||
|
||||
List<EditLogFile> getLogFiles(long fromTxId) throws IOException {
|
||||
public List<EditLogFile> getLogFiles(long fromTxId) throws IOException {
|
||||
File currentDir = sd.getCurrentDir();
|
||||
List<EditLogFile> allLogFiles = matchEditLogs(currentDir);
|
||||
List<EditLogFile> logFiles = Lists.newArrayList();
|
||||
|
@ -321,6 +326,32 @@ class FileJournalManager implements JournalManager {
|
|||
return logFiles;
|
||||
}
|
||||
|
||||
public EditLogFile getLogFile(long startTxId) throws IOException {
|
||||
return getLogFile(sd.getCurrentDir(), startTxId);
|
||||
}
|
||||
|
||||
public static EditLogFile getLogFile(File dir, long startTxId)
|
||||
throws IOException {
|
||||
List<EditLogFile> files = matchEditLogs(dir);
|
||||
List<EditLogFile> ret = Lists.newLinkedList();
|
||||
for (EditLogFile elf : files) {
|
||||
if (elf.getFirstTxId() == startTxId) {
|
||||
ret.add(elf);
|
||||
}
|
||||
}
|
||||
|
||||
if (ret.isEmpty()) {
|
||||
// no matches
|
||||
return null;
|
||||
} else if (ret.size() == 1) {
|
||||
return ret.get(0);
|
||||
} else {
|
||||
throw new IllegalStateException("More than one log segment in " +
|
||||
dir + " starting at txid " + startTxId + ": " +
|
||||
Joiner.on(", ").join(ret));
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return String.format("FileJournalManager(root=%s)", sd.getRoot());
|
||||
|
@ -329,7 +360,8 @@ class FileJournalManager implements JournalManager {
|
|||
/**
|
||||
* Record of an edit log that has been located and had its filename parsed.
|
||||
*/
|
||||
static class EditLogFile {
|
||||
@InterfaceAudience.Private
|
||||
public static class EditLogFile {
|
||||
private File file;
|
||||
private final long firstTxId;
|
||||
private long lastTxId;
|
||||
|
@ -362,17 +394,20 @@ class FileJournalManager implements JournalManager {
|
|||
assert (firstTxId > 0) || (firstTxId == HdfsConstants.INVALID_TXID);
|
||||
assert file != null;
|
||||
|
||||
Preconditions.checkArgument(!isInProgress ||
|
||||
lastTxId == HdfsConstants.INVALID_TXID);
|
||||
|
||||
this.firstTxId = firstTxId;
|
||||
this.lastTxId = lastTxId;
|
||||
this.file = file;
|
||||
this.isInProgress = isInProgress;
|
||||
}
|
||||
|
||||
long getFirstTxId() {
|
||||
public long getFirstTxId() {
|
||||
return firstTxId;
|
||||
}
|
||||
|
||||
long getLastTxId() {
|
||||
public long getLastTxId() {
|
||||
return lastTxId;
|
||||
}
|
||||
|
||||
|
@ -385,17 +420,17 @@ class FileJournalManager implements JournalManager {
|
|||
* This will update the lastTxId of the EditLogFile or
|
||||
* mark it as corrupt if it is.
|
||||
*/
|
||||
void validateLog() throws IOException {
|
||||
public void validateLog() throws IOException {
|
||||
EditLogValidation val = EditLogFileInputStream.validateEditLog(file);
|
||||
this.lastTxId = val.getEndTxId();
|
||||
this.hasCorruptHeader = val.hasCorruptHeader();
|
||||
}
|
||||
|
||||
boolean isInProgress() {
|
||||
public boolean isInProgress() {
|
||||
return isInProgress;
|
||||
}
|
||||
|
||||
File getFile() {
|
||||
public File getFile() {
|
||||
return file;
|
||||
}
|
||||
|
||||
|
|
|
@ -229,6 +229,13 @@ public class JournalSet implements JournalManager {
|
|||
}
|
||||
jas.getManager().selectInputStreams(allStreams, fromTxId, inProgressOk);
|
||||
}
|
||||
chainAndMakeRedundantStreams(streams, allStreams, fromTxId, inProgressOk);
|
||||
}
|
||||
|
||||
public static void chainAndMakeRedundantStreams(
|
||||
Collection<EditLogInputStream> outStreams,
|
||||
PriorityQueue<EditLogInputStream> allStreams,
|
||||
long fromTxId, boolean inProgressOk) {
|
||||
// We want to group together all the streams that start on the same start
|
||||
// transaction ID. To do this, we maintain an accumulator (acc) of all
|
||||
// the streams we've seen at a given start transaction ID. When we see a
|
||||
|
@ -246,7 +253,7 @@ public class JournalSet implements JournalManager {
|
|||
if (accFirstTxId == elis.getFirstTxId()) {
|
||||
acc.add(elis);
|
||||
} else if (accFirstTxId < elis.getFirstTxId()) {
|
||||
streams.add(new RedundantEditLogInputStream(acc, fromTxId));
|
||||
outStreams.add(new RedundantEditLogInputStream(acc, fromTxId));
|
||||
acc.clear();
|
||||
acc.add(elis);
|
||||
} else if (accFirstTxId > elis.getFirstTxId()) {
|
||||
|
@ -257,7 +264,7 @@ public class JournalSet implements JournalManager {
|
|||
}
|
||||
}
|
||||
if (!acc.isEmpty()) {
|
||||
streams.add(new RedundantEditLogInputStream(acc, fromTxId));
|
||||
outStreams.add(new RedundantEditLogInputStream(acc, fromTxId));
|
||||
acc.clear();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -42,6 +42,14 @@ final class NameNodeResourcePolicy {
|
|||
Collection<? extends CheckableNameNodeResource> resources,
|
||||
int minimumRedundantResources) {
|
||||
|
||||
// TODO: workaround:
|
||||
// - during startup, if there are no edits dirs on disk, then there is
|
||||
// a call to areResourcesAvailable() with no dirs at all, which was
|
||||
// previously causing the NN to enter safemode
|
||||
if (resources.isEmpty()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
int requiredResourceCount = 0;
|
||||
int redundantResourceCount = 0;
|
||||
int disabledRedundantResourceCount = 0;
|
||||
|
|
|
@ -17,18 +17,15 @@
|
|||
*/
|
||||
package org.apache.hadoop.hdfs.server.protocol;
|
||||
|
||||
import java.io.DataInput;
|
||||
import java.io.DataOutput;
|
||||
import java.io.IOException;
|
||||
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
|
||||
import org.apache.hadoop.io.Writable;
|
||||
|
||||
import com.google.common.base.Function;
|
||||
import com.google.common.collect.ComparisonChain;
|
||||
|
||||
public class RemoteEditLog implements Writable, Comparable<RemoteEditLog> {
|
||||
public class RemoteEditLog implements Comparable<RemoteEditLog> {
|
||||
private long startTxId = HdfsConstants.INVALID_TXID;
|
||||
private long endTxId = HdfsConstants.INVALID_TXID;
|
||||
private boolean isInProgress = false;
|
||||
|
||||
public RemoteEditLog() {
|
||||
}
|
||||
|
@ -36,6 +33,13 @@ public class RemoteEditLog implements Writable, Comparable<RemoteEditLog> {
|
|||
public RemoteEditLog(long startTxId, long endTxId) {
|
||||
this.startTxId = startTxId;
|
||||
this.endTxId = endTxId;
|
||||
this.isInProgress = (endTxId == HdfsConstants.INVALID_TXID);
|
||||
}
|
||||
|
||||
public RemoteEditLog(long startTxId, long endTxId, boolean inProgress) {
|
||||
this.startTxId = startTxId;
|
||||
this.endTxId = endTxId;
|
||||
this.isInProgress = inProgress;
|
||||
}
|
||||
|
||||
public long getStartTxId() {
|
||||
|
@ -46,21 +50,17 @@ public class RemoteEditLog implements Writable, Comparable<RemoteEditLog> {
|
|||
return endTxId;
|
||||
}
|
||||
|
||||
public boolean isInProgress() {
|
||||
return isInProgress;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
if (!isInProgress) {
|
||||
return "[" + startTxId + "," + endTxId + "]";
|
||||
} else {
|
||||
return "[" + startTxId + "-? (in-progress)]";
|
||||
}
|
||||
|
||||
@Override
|
||||
public void write(DataOutput out) throws IOException {
|
||||
out.writeLong(startTxId);
|
||||
out.writeLong(endTxId);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void readFields(DataInput in) throws IOException {
|
||||
startTxId = in.readLong();
|
||||
endTxId = in.readLong();
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -0,0 +1,186 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
option java_package = "org.apache.hadoop.hdfs.qjournal.protocol";
|
||||
option java_outer_classname = "QJournalProtocolProtos";
|
||||
option java_generic_services = true;
|
||||
option java_generate_equals_and_hash = true;
|
||||
|
||||
import "hdfs.proto";
|
||||
|
||||
message JournalIdProto {
|
||||
required string identifier = 1;
|
||||
}
|
||||
|
||||
message RequestInfoProto {
|
||||
required JournalIdProto journalId = 1;
|
||||
required uint64 epoch = 2;
|
||||
required uint64 ipcSerialNumber = 3;
|
||||
}
|
||||
|
||||
message SegmentStateProto {
|
||||
required uint64 startTxId = 1;
|
||||
required uint64 endTxId = 2;
|
||||
required bool isInProgress = 3;
|
||||
required bytes md5sum = 4;
|
||||
}
|
||||
|
||||
/**
|
||||
* The storage format used on local disk for previously
|
||||
* accepted decisions.
|
||||
*/
|
||||
message PersistedRecoveryPaxosData {
|
||||
required SegmentStateProto segmentState = 1;
|
||||
required uint64 acceptedInEpoch = 2;
|
||||
}
|
||||
|
||||
/**
|
||||
* journal()
|
||||
*/
|
||||
|
||||
message JournalRequestProto {
|
||||
required RequestInfoProto reqInfo = 1;
|
||||
required uint64 firstTxnId = 2;
|
||||
required uint32 numTxns = 3;
|
||||
required bytes records = 4;
|
||||
}
|
||||
|
||||
message JournalResponseProto {
|
||||
}
|
||||
|
||||
/**
|
||||
* startLogSegment()
|
||||
*/
|
||||
message StartLogSegmentRequestProto {
|
||||
required RequestInfoProto reqInfo = 1;
|
||||
required uint64 txid = 2; // Transaction ID
|
||||
}
|
||||
|
||||
message StartLogSegmentResponseProto {
|
||||
}
|
||||
|
||||
/**
|
||||
* finalizeLogSegment()
|
||||
*/
|
||||
message FinalizeLogSegmentRequestProto {
|
||||
required RequestInfoProto reqInfo = 1;
|
||||
required uint64 startTxId = 2;
|
||||
required uint64 endTxId = 3;
|
||||
}
|
||||
|
||||
message FinalizeLogSegmentResponseProto {
|
||||
}
|
||||
|
||||
/**
|
||||
* getJournalState()
|
||||
*/
|
||||
message GetJournalStateRequestProto {
|
||||
required JournalIdProto jid = 1;
|
||||
}
|
||||
|
||||
message GetJournalStateResponseProto {
|
||||
required uint64 lastPromisedEpoch = 1;
|
||||
required uint32 httpPort = 2;
|
||||
}
|
||||
|
||||
/**
|
||||
* newEpoch()
|
||||
*/
|
||||
message NewEpochRequestProto {
|
||||
required JournalIdProto jid = 1;
|
||||
required NamespaceInfoProto nsInfo = 2;
|
||||
required uint64 epoch = 3;
|
||||
}
|
||||
|
||||
message NewEpochResponseProto {
|
||||
optional uint64 lastSegmentTxId = 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* getEditLogManifest()
|
||||
*/
|
||||
message GetEditLogManifestRequestProto {
|
||||
required JournalIdProto jid = 1;
|
||||
required uint64 sinceTxId = 2; // Transaction ID
|
||||
}
|
||||
|
||||
message GetEditLogManifestResponseProto {
|
||||
required RemoteEditLogManifestProto manifest = 1;
|
||||
required uint32 httpPort = 2;
|
||||
|
||||
// TODO: we should add nsinfo somewhere
|
||||
// to verify that it matches up with our expectation
|
||||
// required NamespaceInfoProto nsInfo = 2;
|
||||
}
|
||||
|
||||
/**
|
||||
* prepareRecovery()
|
||||
*/
|
||||
message PrepareRecoveryRequestProto {
|
||||
required RequestInfoProto reqInfo = 1;
|
||||
required uint64 segmentTxId = 2;
|
||||
}
|
||||
|
||||
message PrepareRecoveryResponseProto {
|
||||
optional SegmentStateProto segmentState = 1;
|
||||
optional uint64 acceptedInEpoch = 2;
|
||||
}
|
||||
|
||||
/**
|
||||
* acceptRecovery()
|
||||
*/
|
||||
message AcceptRecoveryRequestProto {
|
||||
required RequestInfoProto reqInfo = 1;
|
||||
|
||||
/** Details on the segment to recover */
|
||||
required SegmentStateProto stateToAccept = 2;
|
||||
|
||||
/** The URL from which the log may be copied */
|
||||
required string fromURL = 3;
|
||||
}
|
||||
|
||||
message AcceptRecoveryResponseProto {
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Protocol used to journal edits to a JournalNode.
|
||||
* See the request and response for details of rpc call.
|
||||
*/
|
||||
service QJournalProtocolService {
|
||||
rpc getJournalState(GetJournalStateRequestProto) returns (GetJournalStateResponseProto);
|
||||
|
||||
rpc newEpoch(NewEpochRequestProto) returns (NewEpochResponseProto);
|
||||
|
||||
rpc journal(JournalRequestProto) returns (JournalResponseProto);
|
||||
|
||||
rpc startLogSegment(StartLogSegmentRequestProto)
|
||||
returns (StartLogSegmentResponseProto);
|
||||
|
||||
rpc finalizeLogSegment(FinalizeLogSegmentRequestProto)
|
||||
returns (FinalizeLogSegmentResponseProto);
|
||||
|
||||
rpc getEditLogManifest(GetEditLogManifestRequestProto)
|
||||
returns (GetEditLogManifestResponseProto);
|
||||
|
||||
rpc prepareRecovery(PrepareRecoveryRequestProto)
|
||||
returns (PrepareRecoveryResponseProto);
|
||||
|
||||
rpc acceptRecovery(AcceptRecoveryRequestProto)
|
||||
returns (AcceptRecoveryResponseProto);
|
||||
}
|
|
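The generated QJournalProtocolProtos classes give each of these messages a standard protobuf builder, which is what the server-side translator converts to and from the plain Java arguments of Journal. A small sketch of constructing a journal() request from these messages; the journal id, epoch, and transaction values are invented, and the builder method names follow ordinary protobuf-java code generation conventions, which is assumed here rather than shown in the patch.

import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.JournalIdProto;
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.JournalRequestProto;
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.RequestInfoProto;

import com.google.protobuf.ByteString;

public class QJournalRequestSketch {
  public static void main(String[] args) {
    // Every request carries the journal id, the writer's epoch, and an IPC serial number.
    RequestInfoProto reqInfo = RequestInfoProto.newBuilder()
        .setJournalId(JournalIdProto.newBuilder()
            .setIdentifier("myjournal"))   // invented journal id
        .setEpoch(1)                       // epoch claimed via newEpoch()
        .setIpcSerialNumber(1)             // increases monotonically per client
        .build();

    // A batch of one transaction starting at txid 1; the bytes are placeholder data.
    JournalRequestProto req = JournalRequestProto.newBuilder()
        .setReqInfo(reqInfo)
        .setFirstTxnId(1)
        .setNumTxns(1)
        .setRecords(ByteString.copyFromUtf8("serialized edit ops go here"))
        .build();

    System.out.println(req);
  }
}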
@ -290,6 +290,7 @@ message BlocksWithLocationsProto {
|
|||
message RemoteEditLogProto {
|
||||
required uint64 startTxId = 1; // Starting available edit log transaction
|
||||
required uint64 endTxId = 2; // Ending available edit log transaction
|
||||
optional bool isInProgress = 3 [default = false];
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -240,6 +240,11 @@
|
|||
</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>dfs.namenode.edits.journal-plugin.qjournal</name>
|
||||
<value>org.apache.hadoop.hdfs.qjournal.client.QuorumJournalManager</value>
|
||||
</property>
|
||||
|
||||
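This plugin key appears intended to let an edits directory be given as a qjournal:// URI: the URI scheme is looked up under dfs.namenode.edits.journal-plugin.&lt;scheme&gt; and resolved to QuorumJournalManager. A hedged sketch of what such a configuration might look like in code follows; the URI format mirrors MiniJournalCluster#getQuorumJournalURI, but the hosts, ports, journal id, and the choice of the NameNode edits-dir key as the receiving setting are assumptions made for illustration.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.HdfsConfiguration;

public class QuorumJournalConfSketch {
  public static void main(String[] args) {
    Configuration conf = new HdfsConfiguration();
    // Hypothetical quorum URI: semicolon-separated JournalNode addresses plus a
    // journal id, in the same shape MiniJournalCluster#getQuorumJournalURI builds.
    conf.set(DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_KEY,
        "qjournal://jn1.example.com:8485;jn2.example.com:8485;jn3.example.com:8485/myjournal");
    System.out.println(conf.get(DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_KEY));
  }
}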
<property>
|
||||
<name>dfs.permissions.enabled</name>
|
||||
<value>true</value>
|
||||
|
|
|
@ -0,0 +1,29 @@
|
|||
<meta HTTP-EQUIV="REFRESH" content="0;url=journalstatus.jsp"/>
|
||||
<html>
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
<head><title>Hadoop Administration</title></head>
|
||||
|
||||
<body>
|
||||
<h1>Hadoop Administration</h1>
|
||||
|
||||
<ul>
|
||||
<li><a href="journalstatus.jsp">Status</a></li>
|
||||
</ul>
|
||||
|
||||
</body>
|
||||
</html>
|
|
@ -0,0 +1,42 @@
|
|||
<%
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
%>
|
||||
<%@ page
|
||||
contentType="text/html; charset=UTF-8"
|
||||
import="org.apache.hadoop.hdfs.server.common.JspHelper"
|
||||
import="org.apache.hadoop.util.ServletUtil"
|
||||
%>
|
||||
<%!
|
||||
//for java.io.Serializable
|
||||
private static final long serialVersionUID = 1L;
|
||||
%>
|
||||
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<link rel="stylesheet" type="text/css" href="/static/hadoop.css">
|
||||
<title>Hadoop JournalNode</title>
|
||||
|
||||
<body>
|
||||
<h1>JournalNode</h1>
|
||||
<%= JspHelper.getVersionTable() %>
|
||||
<hr />
|
||||
|
||||
<br />
|
||||
<b><a href="/logs/">Logs</a></b>
|
||||
<%= ServletUtil.htmlFooter() %>
|
|
@ -0,0 +1,17 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!--
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License. See accompanying LICENSE file.
|
||||
-->
|
||||
<web-app version="2.4" xmlns="http://java.sun.com/xml/ns/j2ee">
|
||||
@journal.servlet.definitions@
|
||||
</web-app>
|
|
@ -85,6 +85,7 @@ import org.apache.hadoop.security.UserGroupInformation;
|
|||
import org.apache.hadoop.security.token.Token;
|
||||
import org.apache.hadoop.util.VersionInfo;
|
||||
|
||||
import com.google.common.base.Charsets;
|
||||
import com.google.common.base.Joiner;
|
||||
|
||||
/** Utilities for HDFS tests */
|
||||
|
@ -586,12 +587,21 @@ public class DFSTestUtil {
|
|||
IOUtils.copyBytes(is, os, s.length(), true);
|
||||
}
|
||||
|
||||
// Returns url content as string.
|
||||
/**
|
||||
* @return url content as string (UTF-8 encoding assumed)
|
||||
*/
|
||||
public static String urlGet(URL url) throws IOException {
|
||||
return new String(urlGetBytes(url), Charsets.UTF_8);
|
||||
}
|
||||
|
||||
/**
|
||||
* @return URL contents as a byte array
|
||||
*/
|
||||
public static byte[] urlGetBytes(URL url) throws IOException {
|
||||
URLConnection conn = url.openConnection();
|
||||
ByteArrayOutputStream out = new ByteArrayOutputStream();
|
||||
IOUtils.copyBytes(conn.getInputStream(), out, 4096, true);
|
||||
return out.toString();
|
||||
return out.toByteArray();
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -0,0 +1,194 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hdfs.qjournal;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.net.InetSocketAddress;
|
||||
import java.net.URI;
|
||||
import java.net.URISyntaxException;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileUtil;
|
||||
import org.apache.hadoop.hdfs.DFSConfigKeys;
|
||||
import org.apache.hadoop.hdfs.MiniDFSCluster;
|
||||
import org.apache.hadoop.hdfs.qjournal.server.JournalNode;
|
||||
|
||||
import com.google.common.base.Joiner;
|
||||
import com.google.common.collect.Lists;
|
||||
|
||||
public class MiniJournalCluster {
|
||||
public static class Builder {
|
||||
private String baseDir;
|
||||
private int numJournalNodes = 3;
|
||||
private boolean format = true;
|
||||
private Configuration conf;
|
||||
|
||||
public Builder(Configuration conf) {
|
||||
this.conf = conf;
|
||||
}
|
||||
|
||||
public Builder baseDir(String d) {
|
||||
this.baseDir = d;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder numJournalNodes(int n) {
|
||||
this.numJournalNodes = n;
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder format(boolean f) {
|
||||
this.format = f;
|
||||
return this;
|
||||
}
|
||||
|
||||
public MiniJournalCluster build() throws IOException {
|
||||
return new MiniJournalCluster(this);
|
||||
}
|
||||
}
|
||||
|
||||
private static final Log LOG = LogFactory.getLog(MiniJournalCluster.class);
|
||||
private File baseDir;
|
||||
private JournalNode nodes[];
|
||||
private InetSocketAddress ipcAddrs[];
|
||||
private InetSocketAddress httpAddrs[];
|
||||
|
||||
private MiniJournalCluster(Builder b) throws IOException {
|
||||
LOG.info("Starting MiniJournalCluster with " +
|
||||
b.numJournalNodes + " journal nodes");
|
||||
|
||||
if (b.baseDir != null) {
|
||||
this.baseDir = new File(b.baseDir);
|
||||
} else {
|
||||
this.baseDir = new File(MiniDFSCluster.getBaseDirectory());
|
||||
}
|
||||
|
||||
nodes = new JournalNode[b.numJournalNodes];
|
||||
ipcAddrs = new InetSocketAddress[b.numJournalNodes];
|
||||
httpAddrs = new InetSocketAddress[b.numJournalNodes];
|
||||
for (int i = 0; i < b.numJournalNodes; i++) {
|
||||
if (b.format) {
|
||||
File dir = getStorageDir(i);
|
||||
LOG.debug("Fully deleting JN directory " + dir);
|
||||
FileUtil.fullyDelete(dir);
|
||||
}
|
||||
nodes[i] = new JournalNode();
|
||||
nodes[i].setConf(createConfForNode(b, i));
|
||||
nodes[i].start();
|
||||
|
||||
ipcAddrs[i] = nodes[i].getBoundIpcAddress();
|
||||
httpAddrs[i] = nodes[i].getBoundHttpAddress();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Set up the given Configuration object to point to the set of JournalNodes
|
||||
* in this cluster.
|
||||
*/
|
||||
public URI getQuorumJournalURI(String jid) {
|
||||
List<String> addrs = Lists.newArrayList();
|
||||
for (InetSocketAddress addr : ipcAddrs) {
|
||||
addrs.add("127.0.0.1:" + addr.getPort());
|
||||
}
|
||||
String addrsVal = Joiner.on(";").join(addrs);
|
||||
LOG.debug("Setting logger addresses to: " + addrsVal);
|
||||
try {
|
||||
return new URI("qjournal://" + addrsVal + "/" + jid);
|
||||
} catch (URISyntaxException e) {
|
||||
throw new AssertionError(e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Start the JournalNodes in the cluster.
|
||||
*/
|
||||
public void start() throws IOException {
|
||||
for (JournalNode jn : nodes) {
|
||||
jn.start();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Shutdown all of the JournalNodes in the cluster.
|
||||
* @throws IOException if one or more nodes failed to stop
|
||||
*/
|
||||
public void shutdown() throws IOException {
|
||||
boolean failed = false;
|
||||
for (JournalNode jn : nodes) {
|
||||
try {
|
||||
jn.stopAndJoin(0);
|
||||
} catch (Exception e) {
|
||||
failed = true;
|
||||
LOG.warn("Unable to stop journal node " + jn, e);
|
||||
}
|
||||
}
|
||||
if (failed) {
|
||||
throw new IOException("Unable to shut down. Check log for details");
|
||||
}
|
||||
}
|
||||
|
||||
private Configuration createConfForNode(Builder b, int idx) {
|
||||
Configuration conf = new Configuration(b.conf);
|
||||
File logDir = getStorageDir(idx);
|
||||
conf.set(DFSConfigKeys.DFS_JOURNALNODE_EDITS_DIR_KEY, logDir.toString());
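// Bind the RPC and HTTP servers to port 0 so the OS assigns free ephemeral
// ports, letting several JournalNodes coexist on one host.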
|
||||
conf.set(DFSConfigKeys.DFS_JOURNALNODE_RPC_ADDRESS_KEY, "0.0.0.0:0");
|
||||
conf.set(DFSConfigKeys.DFS_JOURNALNODE_HTTP_ADDRESS_KEY, "0.0.0.0:0");
|
||||
return conf;
|
||||
}
|
||||
|
||||
public File getStorageDir(int idx) {
|
||||
return new File(baseDir, "journalnode-" + idx);
|
||||
}
|
||||
|
||||
public File getCurrentDir(int idx, String jid) {
|
||||
return new File(new File(getStorageDir(idx), jid), "current");
|
||||
}
|
||||
|
||||
public JournalNode getJournalNode(int i) {
|
||||
return nodes[i];
|
||||
}
|
||||
|
||||
public void restartJournalNode(int i) throws InterruptedException, IOException {
|
||||
Configuration conf = new Configuration(nodes[i].getConf());
|
||||
if (nodes[i].isStarted()) {
|
||||
nodes[i].stopAndJoin(0);
|
||||
}
|
||||
|
||||
conf.set(DFSConfigKeys.DFS_JOURNALNODE_RPC_ADDRESS_KEY, "127.0.0.1:" +
|
||||
ipcAddrs[i].getPort());
|
||||
conf.set(DFSConfigKeys.DFS_JOURNALNODE_HTTP_ADDRESS_KEY, "127.0.0.1:" +
|
||||
httpAddrs[i].getPort());
|
||||
|
||||
JournalNode jn = new JournalNode();
|
||||
jn.setConf(conf);
|
||||
jn.start();
|
||||
}
|
||||
|
||||
public int getQuorumSize() {
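// A quorum is a strict majority of the JournalNodes: floor(n/2) + 1.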
|
||||
return nodes.length / 2 + 1;
|
||||
}
|
||||
|
||||
public int getNumNodes() {
|
||||
return nodes.length;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,41 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hdfs.qjournal;
|
||||
|
||||
import java.util.Arrays;
|
||||
|
||||
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp;
|
||||
import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
|
||||
import org.apache.hadoop.io.DataOutputBuffer;
|
||||
|
||||
public abstract class QJMTestUtil {
|
||||
|
||||
public static byte[] createTxnData(int startTxn, int numTxns) throws Exception {
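// Serialize numTxns dummy mkdir ops, with txids starting at startTxn, into
// an edit-log byte buffer for use as journal payloads in tests.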
|
||||
DataOutputBuffer buf = new DataOutputBuffer();
|
||||
FSEditLogOp.Writer writer = new FSEditLogOp.Writer(buf);
|
||||
|
||||
for (long txid = startTxn; txid < startTxn + numTxns; txid++) {
|
||||
FSEditLogOp op = NameNodeAdapter.createMkdirOp("tx " + txid);
|
||||
op.setTransactionId(txid);
|
||||
writer.writeOp(op);
|
||||
}
|
||||
|
||||
return Arrays.copyOf(buf.getData(), buf.getLength());
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,51 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hdfs.qjournal;
|
||||
|
||||
import static org.junit.Assert.*;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URI;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.hdfs.DFSConfigKeys;
|
||||
import org.apache.hadoop.hdfs.MiniDFSCluster;
|
||||
import org.apache.hadoop.hdfs.qjournal.server.JournalNode;
|
||||
import org.junit.Test;
|
||||
|
||||
|
||||
public class TestMiniJournalCluster {
|
||||
@Test
|
||||
public void testStartStop() throws IOException {
|
||||
Configuration conf = new Configuration();
|
||||
MiniJournalCluster c = new MiniJournalCluster.Builder(conf)
|
||||
.build();
|
||||
try {
|
||||
URI uri = c.getQuorumJournalURI("myjournal");
|
||||
String[] addrs = uri.getAuthority().split(";");
|
||||
assertEquals(3, addrs.length);
|
||||
|
||||
JournalNode node = c.getJournalNode(0);
|
||||
String dir = node.getConf().get(DFSConfigKeys.DFS_JOURNALNODE_EDITS_DIR_KEY);
|
||||
assertEquals(MiniDFSCluster.getBaseDirectory() + "journalnode-0",
|
||||
dir);
|
||||
} finally {
|
||||
c.shutdown();
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,180 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hdfs.qjournal;
|
||||
|
||||
import static org.junit.Assert.assertTrue;
|
||||
import static org.junit.Assert.fail;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.FileUtil;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.hdfs.DFSConfigKeys;
|
||||
import org.apache.hadoop.hdfs.HdfsConfiguration;
|
||||
import org.apache.hadoop.hdfs.MiniDFSCluster;
|
||||
import org.apache.hadoop.ipc.RemoteException;
|
||||
import org.apache.hadoop.test.GenericTestUtils;
|
||||
import org.apache.hadoop.util.ExitUtil;
|
||||
import org.junit.After;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
public class TestNNWithQJM {
|
||||
Configuration conf = new HdfsConfiguration();
|
||||
private MiniJournalCluster mjc;
|
||||
private Path TEST_PATH = new Path("/test-dir");
|
||||
private Path TEST_PATH_2 = new Path("/test-dir-2");
|
||||
|
||||
@Before
|
||||
public void resetSystemExit() {
|
||||
ExitUtil.resetFirstExitException();
|
||||
}
|
||||
|
||||
@Before
|
||||
public void startJNs() throws Exception {
|
||||
mjc = new MiniJournalCluster.Builder(conf).build();
|
||||
}
|
||||
|
||||
@After
|
||||
public void stopJNs() throws Exception {
|
||||
if (mjc != null) {
|
||||
mjc.shutdown();
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testLogAndRestart() throws IOException {
|
||||
conf.set(DFSConfigKeys.DFS_NAMENODE_NAME_DIR_KEY,
|
||||
MiniDFSCluster.getBaseDirectory() + "/TestNNWithQJM/image");
|
||||
conf.set(DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_KEY,
|
||||
mjc.getQuorumJournalURI("myjournal").toString());
|
||||
|
||||
MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
|
||||
.numDataNodes(0)
|
||||
.manageNameDfsDirs(false)
|
||||
.build();
|
||||
try {
|
||||
cluster.getFileSystem().mkdirs(TEST_PATH);
|
||||
|
||||
// Restart the NN and make sure the edit was persisted
|
||||
// and loaded again
|
||||
cluster.restartNameNode();
|
||||
|
||||
assertTrue(cluster.getFileSystem().exists(TEST_PATH));
|
||||
cluster.getFileSystem().mkdirs(TEST_PATH_2);
|
||||
|
||||
// Restart the NN again and make sure both edits are persisted.
|
||||
cluster.restartNameNode();
|
||||
assertTrue(cluster.getFileSystem().exists(TEST_PATH));
|
||||
assertTrue(cluster.getFileSystem().exists(TEST_PATH_2));
|
||||
} finally {
|
||||
cluster.shutdown();
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNewNamenodeTakesOverWriter() throws Exception {
|
||||
File nn1Dir = new File(
|
||||
MiniDFSCluster.getBaseDirectory() + "/TestNNWithQJM/image-nn1");
|
||||
File nn2Dir = new File(
|
||||
MiniDFSCluster.getBaseDirectory() + "/TestNNWithQJM/image-nn2");
|
||||
|
||||
conf.set(DFSConfigKeys.DFS_NAMENODE_NAME_DIR_KEY,
|
||||
nn1Dir.getAbsolutePath());
|
||||
conf.set(DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_KEY,
|
||||
mjc.getQuorumJournalURI("myjournal").toString());
|
||||
|
||||
MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
|
||||
.numDataNodes(0)
|
||||
.manageNameDfsDirs(false)
|
||||
.checkExitOnShutdown(false)
|
||||
.build();
|
||||
|
||||
try {
|
||||
cluster.getFileSystem().mkdirs(TEST_PATH);
|
||||
|
||||
// Start a second NN pointed to the same quorum.
|
||||
// We need to copy the image dir from the first NN -- or else
|
||||
// the new NN will just be rejected because of Namespace mismatch.
|
||||
FileUtil.fullyDelete(nn2Dir);
|
||||
FileUtil.copy(nn1Dir, FileSystem.getLocal(conf).getRaw(),
|
||||
new Path(nn2Dir.getAbsolutePath()), false, conf);
|
||||
|
||||
Configuration conf2 = new Configuration();
|
||||
conf2.set(DFSConfigKeys.DFS_NAMENODE_NAME_DIR_KEY,
|
||||
nn2Dir.getAbsolutePath());
|
||||
conf2.set(DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_KEY,
|
||||
mjc.getQuorumJournalURI("myjournal").toString());
|
||||
MiniDFSCluster cluster2 = new MiniDFSCluster.Builder(conf2)
|
||||
.numDataNodes(0)
|
||||
.format(false)
|
||||
.manageNameDfsDirs(false)
|
||||
.build();
|
||||
|
||||
// Check that the new cluster sees the edits made on the old cluster
|
||||
try {
|
||||
assertTrue(cluster2.getFileSystem().exists(TEST_PATH));
|
||||
} finally {
|
||||
cluster2.shutdown();
|
||||
}
|
||||
|
||||
// Check that, if we try to write to the old NN,
// it aborts.
|
||||
try {
|
||||
cluster.getFileSystem().mkdirs(new Path("/x"));
|
||||
fail("Did not abort trying to write to a fenced NN");
|
||||
} catch (RemoteException re) {
|
||||
GenericTestUtils.assertExceptionContains(
|
||||
"Could not sync enough journals to persistent storage", re);
|
||||
}
|
||||
} finally {
|
||||
//cluster.shutdown();
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMismatchedNNIsRejected() throws Exception {
|
||||
conf.set(DFSConfigKeys.DFS_NAMENODE_NAME_DIR_KEY,
|
||||
MiniDFSCluster.getBaseDirectory() + "/TestNNWithQJM/image");
|
||||
conf.set(DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_KEY,
|
||||
mjc.getQuorumJournalURI("myjournal").toString());
|
||||
|
||||
// Start a NN, so the storage is formatted with its namespace info.
|
||||
MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
|
||||
.numDataNodes(0)
|
||||
.manageNameDfsDirs(false)
|
||||
.build();
|
||||
cluster.shutdown();
|
||||
|
||||
// Create a new (freshly-formatted) NN, which should not be able to
|
||||
// reuse the same journal, since its journal ID would not match.
|
||||
try {
|
||||
cluster = new MiniDFSCluster.Builder(conf)
|
||||
.numDataNodes(0)
|
||||
.manageNameDfsDirs(false)
|
||||
.build();
|
||||
fail("New NN with different namespace should have been rejected");
|
||||
} catch (IOException ioe) {
|
||||
GenericTestUtils.assertExceptionContains(
|
||||
"Unable to start log segment 1: too few journals", ioe);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,130 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hdfs.qjournal.client;
|
||||
|
||||
import static org.junit.Assert.*;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URI;
|
||||
import java.util.List;
|
||||
import java.util.Random;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.hdfs.qjournal.MiniJournalCluster;
|
||||
import org.apache.hadoop.hdfs.qjournal.MiniJournalCluster.Builder;
|
||||
import org.apache.hadoop.hdfs.qjournal.client.AsyncLogger;
|
||||
import org.apache.hadoop.hdfs.qjournal.client.AsyncLoggerSet;
|
||||
import org.apache.hadoop.hdfs.qjournal.client.QuorumJournalManager;
|
||||
import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
|
||||
import org.junit.Test;
|
||||
import org.mockito.Mockito;
|
||||
import org.mockito.invocation.InvocationOnMock;
|
||||
import org.mockito.stubbing.Answer;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.util.concurrent.Futures;
|
||||
import com.google.common.util.concurrent.ListenableFuture;
|
||||
|
||||
|
||||
public class TestEpochsAreUnique {
|
||||
private static final Log LOG = LogFactory.getLog(TestEpochsAreUnique.class);
|
||||
private static final String JID = "testEpochsAreUnique-jid";
|
||||
private static final NamespaceInfo FAKE_NSINFO = new NamespaceInfo(
|
||||
12345, "mycluster", "my-bp", 0L, 0);
|
||||
private Random r = new Random();
|
||||
|
||||
@Test
|
||||
public void testSingleThreaded() throws IOException {
|
||||
Configuration conf = new Configuration();
|
||||
MiniJournalCluster cluster = new MiniJournalCluster.Builder(conf).build();
|
||||
URI uri = cluster.getQuorumJournalURI(JID);
|
||||
try {
|
||||
// With no failures or contention, epochs should increase one-by-one
|
||||
for (int i = 0; i < 5; i++) {
|
||||
AsyncLoggerSet als = new AsyncLoggerSet(
|
||||
QuorumJournalManager.createLoggers(conf, uri, FAKE_NSINFO));
|
||||
als.createNewUniqueEpoch(FAKE_NSINFO);
|
||||
assertEquals(i + 1, als.getEpoch());
|
||||
}
|
||||
|
||||
long prevEpoch = 5;
|
||||
// With some failures injected, it should still always increase, perhaps
|
||||
// skipping some
|
||||
for (int i = 0; i < 20; i++) {
|
||||
AsyncLoggerSet als = new AsyncLoggerSet(
|
||||
makeFaulty(QuorumJournalManager.createLoggers(conf, uri, FAKE_NSINFO)));
|
||||
long newEpoch = -1;
|
||||
while (true) {
|
||||
try {
|
||||
als.createNewUniqueEpoch(FAKE_NSINFO);
|
||||
newEpoch = als.getEpoch();
|
||||
break;
|
||||
} catch (IOException ioe) {
|
||||
// It's OK to fail to create an epoch, since we randomly inject
|
||||
// faults. It's possible we'll inject faults in too many of the
|
||||
// underlying nodes, and a failure is expected in that case
|
||||
}
|
||||
}
|
||||
LOG.info("Created epoch " + newEpoch);
|
||||
assertTrue("New epoch " + newEpoch + " should be greater than previous " +
|
||||
prevEpoch, newEpoch > prevEpoch);
|
||||
prevEpoch = newEpoch;
|
||||
}
|
||||
} finally {
|
||||
cluster.shutdown();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private List<AsyncLogger> makeFaulty(List<AsyncLogger> loggers) {
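// Wrap each logger in a Mockito spy that fails getJournalState() ~10% of
// the time and newEpoch() ~40% of the time, to simulate flaky nodes.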
|
||||
List<AsyncLogger> ret = Lists.newArrayList();
|
||||
for (AsyncLogger l : loggers) {
|
||||
AsyncLogger spy = Mockito.spy(l);
|
||||
Mockito.doAnswer(new SometimesFaulty<Long>(0.10f))
|
||||
.when(spy).getJournalState();
|
||||
Mockito.doAnswer(new SometimesFaulty<Void>(0.40f))
|
||||
.when(spy).newEpoch(Mockito.anyLong());
|
||||
ret.add(spy);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
private class SometimesFaulty<T> implements Answer<ListenableFuture<T>> {
|
||||
private float faultProbability;
|
||||
|
||||
public SometimesFaulty(float faultProbability) {
|
||||
this.faultProbability = faultProbability;
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
@Override
|
||||
public ListenableFuture<T> answer(InvocationOnMock invocation)
|
||||
throws Throwable {
|
||||
if (r.nextFloat() < faultProbability) {
|
||||
return Futures.immediateFailedFuture(
|
||||
new IOException("Injected fault"));
|
||||
}
|
||||
return (ListenableFuture<T>)invocation.callRealMethod();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
}
|
|
@ -0,0 +1,130 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hdfs.qjournal.client;
|
||||
|
||||
import static org.junit.Assert.*;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.InetSocketAddress;
|
||||
import java.util.concurrent.ExecutionException;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.hdfs.DFSConfigKeys;
|
||||
import org.apache.hadoop.hdfs.qjournal.client.IPCLoggerChannel;
|
||||
import org.apache.hadoop.hdfs.qjournal.client.LoggerTooFarBehindException;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocol;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.RequestInfo;
|
||||
import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
|
||||
import org.apache.hadoop.test.GenericTestUtils;
|
||||
import org.apache.hadoop.test.GenericTestUtils.DelayAnswer;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
import org.mockito.Mockito;
|
||||
|
||||
import com.google.common.base.Supplier;
|
||||
|
||||
public class TestIPCLoggerChannel {
|
||||
private static final Log LOG = LogFactory.getLog(
|
||||
TestIPCLoggerChannel.class);
|
||||
|
||||
private Configuration conf = new Configuration();
|
||||
private static final NamespaceInfo FAKE_NSINFO = new NamespaceInfo(
|
||||
12345, "mycluster", "my-bp", 0L, 0);
|
||||
private static final String JID = "test-journalid";
|
||||
private static final InetSocketAddress FAKE_ADDR =
|
||||
new InetSocketAddress(0);
|
||||
private static final byte[] FAKE_DATA = new byte[4096];
|
||||
|
||||
private QJournalProtocol mockProxy = Mockito.mock(QJournalProtocol.class);
|
||||
private IPCLoggerChannel ch;
|
||||
|
||||
private static final int LIMIT_QUEUE_SIZE_MB = 1;
|
||||
private static final int LIMIT_QUEUE_SIZE_BYTES =
|
||||
LIMIT_QUEUE_SIZE_MB * 1024 * 1024;
|
||||
|
||||
@Before
|
||||
public void setupMock() {
|
||||
conf.setInt(DFSConfigKeys.DFS_QJOURNAL_QUEUE_SIZE_LIMIT_KEY,
|
||||
LIMIT_QUEUE_SIZE_MB);
|
||||
|
||||
// Channel to the mock object instead of a real IPC proxy.
|
||||
ch = new IPCLoggerChannel(conf, FAKE_NSINFO, JID, FAKE_ADDR) {
|
||||
@Override
|
||||
protected QJournalProtocol getProxy() throws IOException {
|
||||
return mockProxy;
|
||||
}
|
||||
};
|
||||
|
||||
ch.setEpoch(1);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSimpleCall() throws Exception {
|
||||
ch.sendEdits(1, 3, FAKE_DATA).get();
|
||||
Mockito.verify(mockProxy).journal(Mockito.<RequestInfo>any(),
|
||||
Mockito.eq(1L), Mockito.eq(3), Mockito.same(FAKE_DATA));
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Test that, once the queue exceeds the configured size limit,
|
||||
* calls to journal more data are rejected.
|
||||
*/
|
||||
@Test
|
||||
public void testQueueLimiting() throws Exception {
|
||||
|
||||
// Block the underlying fake proxy from actually completing any calls.
|
||||
DelayAnswer delayer = new DelayAnswer(LOG);
|
||||
Mockito.doAnswer(delayer).when(mockProxy).journal(
|
||||
Mockito.<RequestInfo>any(),
|
||||
Mockito.eq(1L), Mockito.eq(1), Mockito.same(FAKE_DATA));
|
||||
|
||||
// Queue up the maximum number of calls.
|
||||
int numToQueue = LIMIT_QUEUE_SIZE_BYTES / FAKE_DATA.length;
|
||||
for (int i = 1; i <= numToQueue; i++) {
|
||||
ch.sendEdits((long)i, 1, FAKE_DATA);
|
||||
}
|
||||
|
||||
// The accounting should show the correct total number queued.
|
||||
assertEquals(LIMIT_QUEUE_SIZE_BYTES, ch.getQueuedEditsSize());
|
||||
|
||||
// Trying to queue any more should fail.
|
||||
try {
|
||||
ch.sendEdits(numToQueue + 1, 1, FAKE_DATA).get(1, TimeUnit.SECONDS);
|
||||
fail("Did not fail to queue more calls after queue was full");
|
||||
} catch (ExecutionException ee) {
|
||||
if (!(ee.getCause() instanceof LoggerTooFarBehindException)) {
|
||||
throw ee;
|
||||
}
|
||||
}
|
||||
|
||||
delayer.proceed();
|
||||
|
||||
// After we allow it to proceed, it should chug through the original queue
|
||||
GenericTestUtils.waitFor(new Supplier<Boolean>() {
|
||||
@Override
|
||||
public Boolean get() {
|
||||
return ch.getQueuedEditsSize() == 0;
|
||||
}
|
||||
}, 10, 1000);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,69 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hdfs.qjournal.client;
|
||||
|
||||
import static org.junit.Assert.*;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.TreeMap;
|
||||
import java.util.concurrent.TimeoutException;
|
||||
|
||||
import org.apache.hadoop.hdfs.qjournal.client.QuorumCall;
|
||||
import org.junit.Test;
|
||||
|
||||
import com.google.common.base.Joiner;
|
||||
import com.google.common.collect.ImmutableMap;
|
||||
import com.google.common.util.concurrent.SettableFuture;
|
||||
|
||||
public class TestQuorumCall {
|
||||
@Test(timeout=10000)
|
||||
public void testQuorums() throws Exception {
|
||||
Map<String, SettableFuture<String>> futures = ImmutableMap.of(
|
||||
"f1", SettableFuture.<String>create(),
|
||||
"f2", SettableFuture.<String>create(),
|
||||
"f3", SettableFuture.<String>create());
|
||||
|
||||
QuorumCall<String, String> q = QuorumCall.create(futures);
|
||||
assertEquals(0, q.countResponses());
|
||||
|
||||
futures.get("f1").set("first future");
|
||||
q.waitFor(1, 0, 0, 100000); // wait for 1 response
|
||||
q.waitFor(0, 1, 0, 100000); // wait for 1 success
|
||||
assertEquals(1, q.countResponses());
|
||||
|
||||
|
||||
futures.get("f2").setException(new Exception("error"));
|
||||
assertEquals(2, q.countResponses());
|
||||
|
||||
futures.get("f3").set("second future");
|
||||
q.waitFor(3, 0, 100, 100000); // wait for 3 responses
|
||||
q.waitFor(0, 2, 100, 100000); // 2 successes
|
||||
|
||||
assertEquals(3, q.countResponses());
|
||||
assertEquals("f1=first future,f3=second future",
|
||||
Joiner.on(",").withKeyValueSeparator("=").join(
|
||||
new TreeMap<String, String>(q.getResults())));
|
||||
|
||||
try {
|
||||
q.waitFor(0, 4, 100, 10);
|
||||
fail("Didn't time out waiting for more responses than came back");
|
||||
} catch (TimeoutException te) {
|
||||
// expected
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,361 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hdfs.qjournal.client;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
import static org.junit.Assert.fail;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.net.URISyntaxException;
|
||||
import java.util.List;
|
||||
import java.util.SortedSet;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.hdfs.qjournal.MiniJournalCluster;
|
||||
import org.apache.hadoop.hdfs.qjournal.client.AsyncLogger;
|
||||
import org.apache.hadoop.hdfs.qjournal.client.IPCLoggerChannel;
|
||||
import org.apache.hadoop.hdfs.qjournal.client.QuorumException;
|
||||
import org.apache.hadoop.hdfs.qjournal.client.QuorumJournalManager;
|
||||
import org.apache.hadoop.hdfs.server.namenode.EditLogOutputStream;
|
||||
import org.apache.hadoop.hdfs.server.namenode.FileJournalManager;
|
||||
import org.apache.hadoop.hdfs.server.namenode.FileJournalManager.EditLogFile;
|
||||
import org.apache.hadoop.hdfs.server.namenode.NNStorage;
|
||||
import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
|
||||
import org.apache.hadoop.test.GenericTestUtils;
|
||||
import org.junit.After;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
import org.mockito.Mockito;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.collect.Sets;
|
||||
|
||||
/**
|
||||
* Functional tests for QuorumJournalManager.
|
||||
* For true unit tests, see {@link TestQuorumJournalManagerUnit}.
|
||||
*/
|
||||
public class TestQuorumJournalManager {
|
||||
private static final Log LOG = LogFactory.getLog(
|
||||
TestQuorumJournalManager.class);
|
||||
|
||||
private static final NamespaceInfo FAKE_NSINFO = new NamespaceInfo(
|
||||
12345, "mycluster", "my-bp", 0L, 0);
|
||||
private static final String JID = "testQuorumJournalManager";
|
||||
private MiniJournalCluster cluster;
|
||||
private Configuration conf;
|
||||
private QuorumJournalManager qjm;
|
||||
private List<AsyncLogger> spies;
|
||||
|
||||
@Before
|
||||
public void setup() throws Exception {
|
||||
conf = new Configuration();
|
||||
cluster = new MiniJournalCluster.Builder(conf)
|
||||
.build();
|
||||
|
||||
qjm = createSpyingQJM();
|
||||
spies = qjm.getLoggerSetForTests().getLoggersForTests();
|
||||
|
||||
qjm.recoverUnfinalizedSegments();
|
||||
assertEquals(1, qjm.getLoggerSetForTests().getEpoch());
|
||||
}
|
||||
|
||||
@After
|
||||
public void shutdown() throws IOException {
|
||||
cluster.shutdown();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSingleWriter() throws Exception {
|
||||
writeSegment(qjm, 1, 3, true);
|
||||
|
||||
// Should be finalized
|
||||
checkRecovery(cluster, 1, 3);
|
||||
|
||||
// Start a new segment
|
||||
writeSegment(qjm, 4, 1, true);
|
||||
|
||||
// Should be finalized
|
||||
checkRecovery(cluster, 4, 4);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testOrchestratedFailures() throws Exception {
|
||||
writeSegment(qjm, 1, 3, true);
|
||||
writeSegment(qjm, 4, 3, true);
|
||||
|
||||
SortedSet<Long> serials = Sets.newTreeSet();
|
||||
for (AsyncLogger l : qjm.getLoggerSetForTests().getLoggersForTests()) {
|
||||
IPCLoggerChannel ch = (IPCLoggerChannel)l;
|
||||
ch.waitForAllPendingCalls();
|
||||
serials.add(ch.getNextIpcSerial());
|
||||
}
|
||||
|
||||
// All of the loggers should have sent the same number of RPCs, since there
|
||||
// were no failures.
|
||||
assertEquals(1, serials.size());
|
||||
|
||||
long maxSerial = serials.first();
|
||||
LOG.info("Max IPC serial = " + maxSerial);
|
||||
|
||||
cluster.shutdown();
|
||||
|
||||
cluster = new MiniJournalCluster.Builder(conf)
|
||||
.build();
|
||||
qjm = createSpyingQJM();
|
||||
spies = qjm.getLoggerSetForTests().getLoggersForTests();
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Test case where a new writer picks up from an old one with no failures
|
||||
* and the previous unfinalized segment is entirely consistent -- i.e. all
|
||||
* the JournalNodes end at the same transaction ID.
|
||||
*/
|
||||
@Test
|
||||
public void testChangeWritersLogsInSync() throws Exception {
|
||||
writeSegment(qjm, 1, 3, false);
|
||||
assertExistsInQuorum(cluster,
|
||||
NNStorage.getInProgressEditsFileName(1));
|
||||
|
||||
// Make a new QJM
|
||||
qjm = new QuorumJournalManager(
|
||||
conf, cluster.getQuorumJournalURI(JID), FAKE_NSINFO);
|
||||
qjm.recoverUnfinalizedSegments();
|
||||
checkRecovery(cluster, 1, 3);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test case where a new writer picks up from an old one which crashed
|
||||
* with the three loggers at different txnids
|
||||
*/
|
||||
@Test
|
||||
public void testChangeWritersLogsOutOfSync1() throws Exception {
|
||||
// Journal states: [3, 4, 5]
|
||||
// During recovery: [x, 4, 5]
|
||||
// Should recover to txn 5
|
||||
doOutOfSyncTest(0, 5L);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testChangeWritersLogsOutOfSync2() throws Exception {
|
||||
// Journal states: [3, 4, 5]
|
||||
// During recovery: [3, x, 5]
|
||||
// Should recover to txn 5
|
||||
doOutOfSyncTest(1, 5L);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testChangeWritersLogsOutOfSync3() throws Exception {
|
||||
// Journal states: [3, 4, 5]
|
||||
// During recovery: [3, 4, x]
|
||||
// Should recover to txn 4
|
||||
doOutOfSyncTest(2, 4L);
|
||||
}
|
||||
|
||||
|
||||
private void doOutOfSyncTest(int missingOnRecoveryIdx,
|
||||
long expectedRecoveryTxnId) throws Exception {
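// Write txns 1-5 so the three journals end at txids 3, 4 and 5 respectively,
// stop the node at missingOnRecoveryIdx, then verify that recovery
// finalizes the segment at expectedRecoveryTxnId.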
|
||||
EditLogOutputStream stm = qjm.startLogSegment(1);
|
||||
|
||||
failLoggerAtTxn(spies.get(0), 4);
|
||||
failLoggerAtTxn(spies.get(1), 5);
|
||||
|
||||
writeTxns(stm, 1, 3);
|
||||
|
||||
// This should succeed to 2/3 loggers
|
||||
writeTxns(stm, 4, 1);
|
||||
|
||||
// This should only succeed to 1 logger (index 2). Hence it should
|
||||
// fail
|
||||
try {
|
||||
writeTxns(stm, 5, 1);
|
||||
fail("Did not fail to write when only a minority succeeded");
|
||||
} catch (QuorumException qe) {
|
||||
GenericTestUtils.assertExceptionContains(
|
||||
"too many exceptions to achieve quorum size 2/3",
|
||||
qe);
|
||||
}
|
||||
|
||||
assertExistsInQuorum(cluster,
|
||||
NNStorage.getInProgressEditsFileName(1));
|
||||
|
||||
// Shut down the specified JN, so it's not present during recovery.
|
||||
cluster.getJournalNode(missingOnRecoveryIdx).stopAndJoin(0);
|
||||
|
||||
// Make a new QJM
|
||||
qjm = createSpyingQJM();
|
||||
|
||||
qjm.recoverUnfinalizedSegments();
|
||||
checkRecovery(cluster, 1, expectedRecoveryTxnId);
|
||||
}
|
||||
|
||||
|
||||
private void failLoggerAtTxn(AsyncLogger spy, long txid) {
|
||||
TestQuorumJournalManagerUnit.futureThrows(new IOException("mock failure"))
|
||||
.when(spy).sendEdits(
|
||||
Mockito.eq(txid), Mockito.eq(1), Mockito.<byte[]>any());
|
||||
}
|
||||
|
||||
/**
|
||||
* edit lengths [3,4,5]
|
||||
* first recovery:
|
||||
* - sees [3,4,x]
|
||||
* - picks length 4 for recoveryEndTxId
|
||||
* - calls acceptRecovery()
|
||||
* - crashes before finalizing
|
||||
* second recovery:
|
||||
* - sees [x, 4, 5]
|
||||
* - should pick recovery length 4, even though it saw
|
||||
* a larger txid, because a previous recovery accepted it
|
||||
*/
|
||||
@Test
|
||||
public void testRecoverAfterIncompleteRecovery() throws Exception {
|
||||
EditLogOutputStream stm = qjm.startLogSegment(1);
|
||||
|
||||
failLoggerAtTxn(spies.get(0), 4);
|
||||
failLoggerAtTxn(spies.get(1), 5);
|
||||
|
||||
writeTxns(stm, 1, 3);
|
||||
|
||||
// This should succeed to 2/3 loggers
|
||||
writeTxns(stm, 4, 1);
|
||||
|
||||
// This should only succeed to 1 logger (index 2). Hence it should
|
||||
// fail
|
||||
try {
|
||||
writeTxns(stm, 5, 1);
|
||||
fail("Did not fail to write when only a minority succeeded");
|
||||
} catch (QuorumException qe) {
|
||||
GenericTestUtils.assertExceptionContains(
|
||||
"too many exceptions to achieve quorum size 2/3",
|
||||
qe);
|
||||
}
|
||||
|
||||
// Shut down the logger that has length = 5
|
||||
cluster.getJournalNode(2).stopAndJoin(0);
|
||||
|
||||
qjm = createSpyingQJM();
|
||||
spies = qjm.getLoggerSetForTests().getLoggersForTests();
|
||||
|
||||
// Allow no logger to finalize
|
||||
for (AsyncLogger spy : spies) {
|
||||
TestQuorumJournalManagerUnit.futureThrows(new IOException("injected"))
|
||||
.when(spy).finalizeLogSegment(Mockito.eq(1L),
|
||||
Mockito.eq(4L));
|
||||
}
|
||||
try {
|
||||
qjm.recoverUnfinalizedSegments();
|
||||
fail("Should have failed recovery since no finalization occurred");
|
||||
} catch (IOException ioe) {
|
||||
GenericTestUtils.assertExceptionContains("injected", ioe);
|
||||
}
|
||||
|
||||
// Now bring back the logger that had 5, and run recovery again.
|
||||
// We should recover to 4, even though there's a longer log.
|
||||
cluster.getJournalNode(0).stopAndJoin(0);
|
||||
cluster.restartJournalNode(2);
|
||||
|
||||
qjm = createSpyingQJM();
|
||||
spies = qjm.getLoggerSetForTests().getLoggersForTests();
|
||||
qjm.recoverUnfinalizedSegments();
|
||||
checkRecovery(cluster, 1, 4);
|
||||
}
|
||||
|
||||
|
||||
private QuorumJournalManager createSpyingQJM()
|
||||
throws IOException, URISyntaxException {
|
||||
return new QuorumJournalManager(
|
||||
conf, cluster.getQuorumJournalURI(JID), FAKE_NSINFO) {
|
||||
@Override
|
||||
protected List<AsyncLogger> createLoggers() throws IOException {
|
||||
LOG.info("===> make spies");
|
||||
List<AsyncLogger> realLoggers = super.createLoggers();
|
||||
List<AsyncLogger> spies = Lists.newArrayList();
|
||||
for (AsyncLogger logger : realLoggers) {
|
||||
spies.add(Mockito.spy(logger));
|
||||
}
|
||||
return spies;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
private void writeSegment(QuorumJournalManager qjm,
|
||||
int startTxId, int numTxns, boolean finalize) throws IOException {
|
||||
EditLogOutputStream stm = qjm.startLogSegment(startTxId);
|
||||
// Should create in-progress
|
||||
assertExistsInQuorum(cluster,
|
||||
NNStorage.getInProgressEditsFileName(startTxId));
|
||||
|
||||
writeTxns(stm, startTxId, numTxns);
|
||||
if (finalize) {
|
||||
stm.close();
|
||||
qjm.finalizeLogSegment(startTxId, startTxId + numTxns - 1);
|
||||
}
|
||||
}
|
||||
|
||||
private void writeTxns(EditLogOutputStream stm, int startTxId, int numTxns)
|
||||
throws IOException {
|
||||
for (long txid = startTxId; txid < startTxId + numTxns; txid++) {
|
||||
TestQuorumJournalManagerUnit.writeOp(stm, txid);
|
||||
}
|
||||
stm.setReadyToFlush();
|
||||
stm.flush();
|
||||
}
|
||||
|
||||
private void assertExistsInQuorum(MiniJournalCluster cluster,
|
||||
String fname) {
|
||||
int count = 0;
|
||||
for (int i = 0; i < 3; i++) {
|
||||
File dir = cluster.getCurrentDir(i, JID);
|
||||
if (new File(dir, fname).exists()) {
|
||||
count++;
|
||||
}
|
||||
}
|
||||
assertTrue("File " + fname + " should exist in a quorum of dirs",
|
||||
count >= cluster.getQuorumSize());
|
||||
}
|
||||
|
||||
private void checkRecovery(MiniJournalCluster cluster,
|
||||
long segmentTxId, long expectedEndTxId)
|
||||
throws IOException {
|
||||
int numFinalized = 0;
|
||||
for (int i = 0; i < cluster.getNumNodes(); i++) {
|
||||
File logDir = cluster.getCurrentDir(i, JID);
|
||||
EditLogFile elf = FileJournalManager.getLogFile(logDir, segmentTxId);
|
||||
if (elf == null) {
|
||||
continue;
|
||||
}
|
||||
if (!elf.isInProgress()) {
|
||||
numFinalized++;
|
||||
if (elf.getLastTxId() != expectedEndTxId) {
|
||||
fail("File " + elf + " finalized to wrong txid, expected " +
|
||||
expectedEndTxId);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (numFinalized < cluster.getQuorumSize()) {
|
||||
fail("Did not find a quorum of finalized logs starting at " +
|
||||
segmentTxId);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,201 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hdfs.qjournal.client;
|
||||
|
||||
import static org.junit.Assert.fail;
|
||||
import static org.mockito.Matchers.eq;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URI;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.logging.impl.Log4JLogger;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.hdfs.qjournal.client.AsyncLogger;
|
||||
import org.apache.hadoop.hdfs.qjournal.client.QuorumException;
|
||||
import org.apache.hadoop.hdfs.qjournal.client.QuorumJournalManager;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.GetJournalStateResponseProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.NewEpochResponseProto;
|
||||
import org.apache.hadoop.hdfs.server.namenode.EditLogOutputStream;
|
||||
import org.apache.hadoop.hdfs.server.namenode.FSEditLogOp;
|
||||
import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
|
||||
import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
|
||||
import org.apache.hadoop.test.GenericTestUtils;
|
||||
import org.apache.log4j.Level;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
import org.mockito.Mockito;
|
||||
import org.mockito.stubbing.Stubber;
|
||||
|
||||
import com.google.common.collect.ImmutableList;
|
||||
import com.google.common.util.concurrent.Futures;
|
||||
import com.google.common.util.concurrent.ListenableFuture;
|
||||
import com.google.common.util.concurrent.SettableFuture;
|
||||
|
||||
/**
|
||||
* True unit tests for QuorumJournalManager
|
||||
*/
|
||||
public class TestQuorumJournalManagerUnit {
|
||||
static {
|
||||
((Log4JLogger)QuorumJournalManager.LOG).getLogger().setLevel(Level.ALL);
|
||||
}
|
||||
private static final NamespaceInfo FAKE_NSINFO = new NamespaceInfo(
|
||||
12345, "mycluster", "my-bp", 0L, 0);
|
||||
|
||||
private Configuration conf = new Configuration();
|
||||
private List<AsyncLogger> spyLoggers;
|
||||
private QuorumJournalManager qjm;
|
||||
|
||||
@Before
|
||||
public void setup() throws Exception {
|
||||
spyLoggers = ImmutableList.of(
|
||||
mockLogger(),
|
||||
mockLogger(),
|
||||
mockLogger());
|
||||
|
||||
qjm = new QuorumJournalManager(conf, new URI("qjournal://host/jid"), FAKE_NSINFO) {
|
||||
@Override
|
||||
protected List<AsyncLogger> createLoggers() {
|
||||
return spyLoggers;
|
||||
}
|
||||
};
|
||||
|
||||
for (AsyncLogger logger : spyLoggers) {
|
||||
futureReturns(GetJournalStateResponseProto.newBuilder()
|
||||
.setLastPromisedEpoch(0)
|
||||
.setHttpPort(-1)
|
||||
.build())
|
||||
.when(logger).getJournalState();
|
||||
|
||||
futureReturns(
|
||||
NewEpochResponseProto.newBuilder().build()
|
||||
).when(logger).newEpoch(Mockito.anyLong());
|
||||
}
|
||||
|
||||
qjm.recoverUnfinalizedSegments();
|
||||
}
|
||||
|
||||
private AsyncLogger mockLogger() {
|
||||
return Mockito.mock(AsyncLogger.class);
|
||||
}
|
||||
|
||||
static <V> Stubber futureReturns(V value) {
|
||||
ListenableFuture<V> ret = Futures.immediateFuture(value);
|
||||
return Mockito.doReturn(ret);
|
||||
}
|
||||
|
||||
static Stubber futureThrows(Throwable t) {
|
||||
ListenableFuture<?> ret = Futures.immediateFailedFuture(t);
|
||||
return Mockito.doReturn(ret);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testAllLoggersStartOk() throws Exception {
|
||||
futureReturns(null).when(spyLoggers.get(0)).startLogSegment(Mockito.anyLong());
|
||||
futureReturns(null).when(spyLoggers.get(1)).startLogSegment(Mockito.anyLong());
|
||||
futureReturns(null).when(spyLoggers.get(2)).startLogSegment(Mockito.anyLong());
|
||||
qjm.startLogSegment(1);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testQuorumOfLoggersStartOk() throws Exception {
|
||||
futureReturns(null).when(spyLoggers.get(0)).startLogSegment(Mockito.anyLong());
|
||||
futureReturns(null).when(spyLoggers.get(1)).startLogSegment(Mockito.anyLong());
|
||||
futureThrows(new IOException("logger failed"))
|
||||
.when(spyLoggers.get(2)).startLogSegment(Mockito.anyLong());
|
||||
qjm.startLogSegment(1);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testQuorumOfLoggersFail() throws Exception {
|
||||
futureReturns(null).when(spyLoggers.get(0)).startLogSegment(Mockito.anyLong());
|
||||
futureThrows(new IOException("logger failed"))
|
||||
.when(spyLoggers.get(1)).startLogSegment(Mockito.anyLong());
|
||||
futureThrows(new IOException("logger failed"))
|
||||
.when(spyLoggers.get(2)).startLogSegment(Mockito.anyLong());
|
||||
try {
|
||||
qjm.startLogSegment(1);
|
||||
fail("Did not throw when quorum failed");
|
||||
} catch (QuorumException qe) {
|
||||
GenericTestUtils.assertExceptionContains("logger failed", qe);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testWriteEdits() throws Exception {
|
||||
EditLogOutputStream stm = createLogSegment();
|
||||
writeOp(stm, 1);
|
||||
writeOp(stm, 2);
|
||||
|
||||
stm.setReadyToFlush();
|
||||
writeOp(stm, 3);
|
||||
|
||||
// The flush should log txn 1-2
|
||||
futureReturns(null).when(spyLoggers.get(0)).sendEdits(
|
||||
eq(1L), eq(2), Mockito.<byte[]>any());
|
||||
futureReturns(null).when(spyLoggers.get(1)).sendEdits(
|
||||
eq(1L), eq(2), Mockito.<byte[]>any());
|
||||
futureReturns(null).when(spyLoggers.get(2)).sendEdits(
|
||||
eq(1L), eq(2), Mockito.<byte[]>any());
|
||||
stm.flush();
|
||||
|
||||
// Another flush should now log txn #3
|
||||
stm.setReadyToFlush();
|
||||
futureReturns(null).when(spyLoggers.get(0)).sendEdits(
|
||||
eq(3L), eq(1), Mockito.<byte[]>any());
|
||||
futureReturns(null).when(spyLoggers.get(1)).sendEdits(
|
||||
eq(3L), eq(1), Mockito.<byte[]>any());
|
||||
futureReturns(null).when(spyLoggers.get(2)).sendEdits(
|
||||
eq(3L), eq(1), Mockito.<byte[]>any());
|
||||
stm.flush();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testWriteEditsOneSlow() throws Exception {
|
||||
EditLogOutputStream stm = createLogSegment();
|
||||
writeOp(stm, 1);
|
||||
stm.setReadyToFlush();
|
||||
|
||||
// Make the first two logs respond immediately
|
||||
futureReturns(null).when(spyLoggers.get(0)).sendEdits(
|
||||
eq(1L), eq(1), Mockito.<byte[]>any());
|
||||
futureReturns(null).when(spyLoggers.get(1)).sendEdits(
|
||||
eq(1L), eq(1), Mockito.<byte[]>any());
|
||||
|
||||
// And make the third log not respond
|
||||
SettableFuture<Void> slowLog = SettableFuture.<Void>create();
|
||||
Mockito.doReturn(slowLog).when(spyLoggers.get(2)).sendEdits(
|
||||
eq(1L), eq(1), Mockito.<byte[]>any());
|
||||
stm.flush();
|
||||
}
|
||||
|
||||
private EditLogOutputStream createLogSegment() throws IOException {
|
||||
futureReturns(null).when(spyLoggers.get(0)).startLogSegment(Mockito.anyLong());
|
||||
futureReturns(null).when(spyLoggers.get(1)).startLogSegment(Mockito.anyLong());
|
||||
futureReturns(null).when(spyLoggers.get(2)).startLogSegment(Mockito.anyLong());
|
||||
EditLogOutputStream stm = qjm.startLogSegment(1);
|
||||
return stm;
|
||||
}
|
||||
|
||||
static void writeOp(EditLogOutputStream stm, long txid) throws IOException {
|
||||
FSEditLogOp op = NameNodeAdapter.createMkdirOp("tx " + txid);
|
||||
op.setTransactionId(txid);
|
||||
stm.write(op);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,166 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hdfs.qjournal.server;
|
||||
|
||||
import static org.junit.Assert.*;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.hadoop.fs.FileUtil;
|
||||
import org.apache.hadoop.hdfs.MiniDFSCluster;
|
||||
import org.apache.hadoop.hdfs.qjournal.QJMTestUtil;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.RequestInfo;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.NewEpochResponseProto;
|
||||
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.NewEpochResponseProtoOrBuilder;
|
||||
import org.apache.hadoop.hdfs.qjournal.server.Journal;
|
||||
import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
|
||||
import org.apache.hadoop.hdfs.server.common.Storage;
|
||||
import org.apache.hadoop.hdfs.server.common.StorageErrorReporter;
|
||||
import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
|
||||
import org.apache.hadoop.test.GenericTestUtils;
|
||||
import org.junit.After;
|
||||
import org.junit.Assume;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
import org.mockito.Mockito;
|
||||
|
||||
public class TestJournal {
|
||||
private static final NamespaceInfo FAKE_NSINFO = new NamespaceInfo(
|
||||
12345, "mycluster", "my-bp", 0L, 0);
|
||||
private static final NamespaceInfo FAKE_NSINFO_2 = new NamespaceInfo(
|
||||
6789, "mycluster", "my-bp", 0L, 0);
|
||||
|
||||
private static final String JID = "test-journal";
|
||||
|
||||
private static final File TEST_LOG_DIR = new File(
|
||||
new File(MiniDFSCluster.getBaseDirectory()), "TestJournal");
|
||||
|
||||
private StorageErrorReporter mockErrorReporter = Mockito.mock(
|
||||
StorageErrorReporter.class);
|
||||
|
||||
private Journal journal;
|
||||
|
||||
|
||||
@Before
|
||||
public void setup() throws Exception {
|
||||
FileUtil.fullyDelete(TEST_LOG_DIR);
|
||||
journal = new Journal(TEST_LOG_DIR, mockErrorReporter);
|
||||
}
|
||||
|
||||
@After
|
||||
public void verifyNoStorageErrors() throws Exception {
|
||||
Mockito.verify(mockErrorReporter, Mockito.never())
|
||||
.reportErrorOnFile(Mockito.<File>any());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testEpochHandling() throws Exception {
|
||||
assertEquals(0, journal.getLastPromisedEpoch());
|
||||
NewEpochResponseProto newEpoch =
|
||||
journal.newEpoch(FAKE_NSINFO, 1);
|
||||
assertFalse(newEpoch.hasLastSegmentTxId());
|
||||
assertEquals(1, journal.getLastPromisedEpoch());
|
||||
journal.newEpoch(FAKE_NSINFO, 3);
|
||||
assertFalse(newEpoch.hasLastSegmentTxId());
|
||||
assertEquals(3, journal.getLastPromisedEpoch());
|
||||
try {
|
||||
journal.newEpoch(FAKE_NSINFO, 3);
|
||||
fail("Should have failed to promise same epoch twice");
|
||||
} catch (IOException ioe) {
|
||||
GenericTestUtils.assertExceptionContains(
|
||||
"Proposed epoch 3 <= last promise 3", ioe);
|
||||
}
|
||||
try {
|
||||
journal.startLogSegment(new RequestInfo(JID, 1L, 1L),
|
||||
12345L);
|
||||
fail("Should have rejected call from prior epoch");
|
||||
} catch (IOException ioe) {
|
||||
GenericTestUtils.assertExceptionContains(
|
||||
"epoch 1 is less than the last promised epoch 3", ioe);
|
||||
}
|
||||
try {
|
||||
journal.journal(new RequestInfo(JID, 1L, 1L),
|
||||
100L, 0, new byte[0]);
|
||||
fail("Should have rejected call from prior epoch");
|
||||
} catch (IOException ioe) {
|
||||
GenericTestUtils.assertExceptionContains(
|
||||
"epoch 1 is less than the last promised epoch 3", ioe);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRestartJournal() throws Exception {
|
||||
journal.newEpoch(FAKE_NSINFO, 1);
|
||||
journal.startLogSegment(new RequestInfo("j", 1, 1), 1);
|
||||
journal.journal(new RequestInfo("j", 1, 2), 1, 2,
|
||||
QJMTestUtil.createTxnData(1, 2));
|
||||
// Don't finalize.
|
||||
|
||||
journal.close(); // close to unlock the storage dir
|
||||
|
||||
// Now re-instantiate, make sure history is still there
|
||||
journal = new Journal(TEST_LOG_DIR, mockErrorReporter);
|
||||
assertEquals(1, journal.getLastPromisedEpoch());
|
||||
NewEpochResponseProtoOrBuilder newEpoch = journal.newEpoch(FAKE_NSINFO, 2);
|
||||
assertEquals(1, newEpoch.getLastSegmentTxId());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testJournalLocking() throws Exception {
|
||||
StorageDirectory sd = journal.getStorage().getStorageDir(0);
|
||||
File lockFile = new File(sd.getRoot(), Storage.STORAGE_FILE_LOCK);
|
||||
|
||||
// Journal should not be locked, since we lazily initialize it.
|
||||
assertFalse(lockFile.exists());
|
||||
|
||||
journal.newEpoch(FAKE_NSINFO, 1);
|
||||
Assume.assumeTrue(journal.getStorage().getStorageDir(0).isLockSupported());
|
||||
|
||||
// Journal should be locked
|
||||
GenericTestUtils.assertExists(lockFile);
|
||||
|
||||
Journal journal2 = new Journal(TEST_LOG_DIR, mockErrorReporter);
|
||||
try {
|
||||
journal2.newEpoch(FAKE_NSINFO, 2);
|
||||
fail("Did not fail to create another journal in same dir");
|
||||
} catch (IOException ioe) {
|
||||
GenericTestUtils.assertExceptionContains(
|
||||
"Cannot lock storage", ioe);
|
||||
}
|
||||
|
||||
journal.close();
|
||||
|
||||
// Journal should no longer be locked after the close() call.
|
||||
journal2.newEpoch(FAKE_NSINFO, 2);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNamespaceVerification() throws Exception {
|
||||
journal.newEpoch(FAKE_NSINFO, 1);
|
||||
|
||||
try {
|
||||
journal.newEpoch(FAKE_NSINFO_2, 2);
|
||||
fail("Did not fail newEpoch() when namespaces mismatched");
|
||||
} catch (IOException ioe) {
|
||||
GenericTestUtils.assertExceptionContains(
|
||||
"Incompatible namespaceID", ioe);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
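The epoch tests above pin down the promise rule the rest of the quorum protocol leans on: a Journal only grants newEpoch(N) when N is strictly greater than the last epoch it promised, and any startLogSegment()/journal() request stamped with an older epoch is refused with an IOException. A minimal sketch of that guard, using illustrative field and method names rather than the actual Journal internals:

// Sketch only: lastPromisedEpoch and checkRequest() are illustrative names,
// not the real Journal implementation.
private long lastPromisedEpoch = 0;

synchronized void promiseEpoch(long proposedEpoch) throws IOException {
  if (proposedEpoch <= lastPromisedEpoch) {
    throw new IOException("Proposed epoch " + proposedEpoch +
        " <= last promise " + lastPromisedEpoch);
  }
  lastPromisedEpoch = proposedEpoch;  // persisted before acknowledging
}

synchronized void checkRequest(RequestInfo reqInfo) throws IOException {
  if (reqInfo.getEpoch() < lastPromisedEpoch) {
    throw new IOException("epoch " + reqInfo.getEpoch() +
        " is less than the last promised epoch " + lastPromisedEpoch);
  }
}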

@@ -0,0 +1,241 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hdfs.qjournal.server;

import static org.junit.Assert.assertArrayEquals;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

import java.net.HttpURLConnection;
import java.net.InetSocketAddress;
import java.net.URL;
import java.util.concurrent.ExecutionException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.DFSConfigKeys;
import org.apache.hadoop.hdfs.DFSTestUtil;
import org.apache.hadoop.hdfs.protocol.HdfsConstants;
import org.apache.hadoop.hdfs.qjournal.QJMTestUtil;
import org.apache.hadoop.hdfs.qjournal.client.IPCLoggerChannel;
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.NewEpochResponseProto;
import org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.PrepareRecoveryResponseProto;
import org.apache.hadoop.hdfs.qjournal.server.Journal;
import org.apache.hadoop.hdfs.qjournal.server.JournalNode;
import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
import org.apache.hadoop.test.GenericTestUtils;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

import com.google.common.base.Charsets;
import com.google.common.primitives.Bytes;
import com.google.common.primitives.Ints;


public class TestJournalNode {
  private static final NamespaceInfo FAKE_NSINFO = new NamespaceInfo(
      12345, "mycluster", "my-bp", 0L, 0);
  private static final String JID = "test-journalid";

  private JournalNode jn;
  private Journal journal;
  private Configuration conf = new Configuration();
  private IPCLoggerChannel ch;

  static {
    // Avoid an error when we double-initialize JvmMetrics
    DefaultMetricsSystem.setMiniClusterMode(true);
  }

  @Before
  public void setup() throws Exception {
    conf.set(DFSConfigKeys.DFS_JOURNALNODE_RPC_ADDRESS_KEY,
        "0.0.0.0:0");
    jn = new JournalNode();
    jn.setConf(conf);
    jn.start();
    journal = jn.getOrCreateJournal(JID);
    journal.format(FAKE_NSINFO);

    ch = new IPCLoggerChannel(conf, FAKE_NSINFO, JID, jn.getBoundIpcAddress());
  }

  @After
  public void teardown() throws Exception {
    jn.stop(0);
  }

  @Test
  public void testJournal() throws Exception {
    IPCLoggerChannel ch = new IPCLoggerChannel(
        conf, FAKE_NSINFO, JID, jn.getBoundIpcAddress());
    ch.newEpoch(1).get();
    ch.setEpoch(1);
    ch.startLogSegment(1).get();
    ch.sendEdits(1, 1, "hello".getBytes(Charsets.UTF_8)).get();
  }

  @Test
  public void testReturnsSegmentInfoAtEpochTransition() throws Exception {
    ch.newEpoch(1).get();
    ch.setEpoch(1);
    ch.startLogSegment(1).get();
    ch.sendEdits(1, 2, QJMTestUtil.createTxnData(1, 2)).get();

    // Switch to a new epoch without closing earlier segment
    NewEpochResponseProto response = ch.newEpoch(2).get();
    ch.setEpoch(2);
    assertEquals(1, response.getLastSegmentTxId());

    ch.finalizeLogSegment(1, 2).get();

    // Switch to a new epoch after just closing the earlier segment.
    response = ch.newEpoch(3).get();
    ch.setEpoch(3);
    assertEquals(1, response.getLastSegmentTxId());

    // Start a segment but don't write anything, check newEpoch segment info
    ch.startLogSegment(3).get();
    response = ch.newEpoch(4).get();
    ch.setEpoch(4);
    assertEquals(3, response.getLastSegmentTxId());
  }
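
testReturnsSegmentInfoAtEpochTransition asserts a property the writer side depends on: the newEpoch() response reports the txid at which the most recently started segment begins, whether or not that segment was ever finalized. A hedged sketch of how a client could use that field when deciding which segment may need recovery; the responses collection and the -1 sentinel are assumptions for illustration, not the actual QuorumJournalManager code:

// Sketch: pick the highest reported segment start txid across a quorum of
// newEpoch() responses; -1 means no logger reported an unrecovered segment.
long segmentToRecover = -1;
for (NewEpochResponseProto r : responses) {   // responses: assumed Iterable
  if (r.hasLastSegmentTxId()) {
    segmentToRecover = Math.max(segmentToRecover, r.getLastSegmentTxId());
  }
}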

  @Test
  public void testHttpServer() throws Exception {
    InetSocketAddress addr = jn.getBoundHttpAddress();
    assertTrue(addr.getPort() > 0);

    String urlRoot = "http://localhost:" + addr.getPort();

    // Check default servlets.
    String pageContents = DFSTestUtil.urlGet(new URL(urlRoot + "/jmx"));
    assertTrue("Bad contents: " + pageContents,
        pageContents.contains(
            "Hadoop:service=JournalNode,name=JvmMetrics"));

    // Check JSP page.
    pageContents = DFSTestUtil.urlGet(
        new URL(urlRoot + "/journalstatus.jsp"));
    assertTrue(pageContents.contains("JournalNode"));

    // Create some edits on server side
    byte[] EDITS_DATA = QJMTestUtil.createTxnData(1, 3);
    IPCLoggerChannel ch = new IPCLoggerChannel(
        conf, FAKE_NSINFO, JID, jn.getBoundIpcAddress());
    ch.newEpoch(1).get();
    ch.setEpoch(1);
    ch.startLogSegment(1).get();
    ch.sendEdits(1, 3, EDITS_DATA).get();
    ch.finalizeLogSegment(1, 3).get();

    // Attempt to retrieve via HTTP, ensure we get the data back
    // including the header we expected
    byte[] retrievedViaHttp = DFSTestUtil.urlGetBytes(new URL(urlRoot +
        "/getJournal?segmentTxId=1&jid=" + JID));
    byte[] expected = Bytes.concat(
        Ints.toByteArray(HdfsConstants.LAYOUT_VERSION),
        EDITS_DATA);

    assertArrayEquals(expected, retrievedViaHttp);

    // Attempt to fetch a non-existent file, check that we get an
    // error status code
    URL badUrl = new URL(urlRoot + "/getJournal?segmentTxId=12345&jid=" + JID);
    HttpURLConnection connection = (HttpURLConnection)badUrl.openConnection();
    try {
      assertEquals(404, connection.getResponseCode());
    } finally {
      connection.disconnect();
    }
  }
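
The HTTP test above also documents the framing of the getJournal servlet response: a 4-byte layout version followed by the raw edit records. A small sketch of how a consumer could split that response back apart; the helper name and placement are illustrative only, not the code path the real edit-log tailer uses:

  // Sketch of the framing asserted above: 4-byte layout version header,
  // then the serialized edits. Helper name is hypothetical.
  static byte[] stripLayoutVersionHeader(byte[] response) throws java.io.IOException {
    java.io.DataInputStream in = new java.io.DataInputStream(
        new java.io.ByteArrayInputStream(response));
    int layoutVersion = in.readInt();   // should equal HdfsConstants.LAYOUT_VERSION
    byte[] edits = new byte[response.length - 4];
    in.readFully(edits);
    return edits;
  }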

  /**
   * Test that the JournalNode performs correctly as a Paxos
   * <em>Acceptor</em> process.
   */
  @Test
  public void testAcceptRecoveryBehavior() throws Exception {
    // We need to run newEpoch() first, or else we have no way to distinguish
    // different proposals for the same decision.
    try {
      ch.prepareRecovery(1L).get();
      fail("Did not throw IllegalState when trying to run paxos without an epoch");
    } catch (ExecutionException ise) {
      GenericTestUtils.assertExceptionContains("bad epoch", ise);
    }

    ch.newEpoch(1).get();
    ch.setEpoch(1);

    // prepare() with no previously accepted value and no logs present
    PrepareRecoveryResponseProto prep = ch.prepareRecovery(1L).get();
    System.err.println("Prep: " + prep);
    assertFalse(prep.hasAcceptedInEpoch());
    assertFalse(prep.hasSegmentState());

    // Make a log segment, and prepare again -- this time should see the
    // segment existing.
    ch.startLogSegment(1L).get();
    ch.sendEdits(1L, 1, QJMTestUtil.createTxnData(1, 1)).get();

    prep = ch.prepareRecovery(1L).get();
    System.err.println("Prep: " + prep);
    assertFalse(prep.hasAcceptedInEpoch());
    assertTrue(prep.hasSegmentState());

    // accept() should save the accepted value in persistent storage
    // TODO: should be able to accept without a URL here
    ch.acceptRecovery(prep.getSegmentState(), new URL("file:///dev/null")).get();

    // So another prepare() call from a new epoch would return this value
    ch.newEpoch(2);
    ch.setEpoch(2);
    prep = ch.prepareRecovery(1L).get();
    assertEquals(1L, prep.getAcceptedInEpoch());
    assertEquals(1L, prep.getSegmentState().getEndTxId());

    // A prepare() or accept() call from an earlier epoch should now be rejected
    ch.setEpoch(1);
    try {
      ch.prepareRecovery(1L).get();
      fail("prepare from earlier epoch not rejected");
    } catch (ExecutionException ioe) {
      GenericTestUtils.assertExceptionContains(
          "epoch 1 is less than the last promised epoch 2",
          ioe);
    }
    try {
      ch.acceptRecovery(prep.getSegmentState(), new URL("file:///dev/null")).get();
      fail("accept from earlier epoch not rejected");
    } catch (ExecutionException ioe) {
      GenericTestUtils.assertExceptionContains(
          "epoch 1 is less than the last promised epoch 2",
          ioe);
    }
  }
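
The acceptor contract this test exercises is: accepted recovery values are remembered durably and handed back to any later prepare(), and both prepare() and accept() are refused from stale epochs. A sketch under those assumptions; field and method names (and the SegmentStateProto shorthand for the QJournalProtocolProtos message) are illustrative, not the actual Journal code, and the sketch omits the case where an on-disk segment is reported when nothing has been accepted yet:

  // Sketch only: illustrative names, not the real Journal implementation.
  private long acceptedInEpoch = -1;        // -1 means nothing accepted yet
  private SegmentStateProto acceptedState;  // must survive restarts

  synchronized PrepareRecoveryResponseProto prepareRecovery(RequestInfo req)
      throws IOException {
    checkRequest(req);                      // stale epoch -> IOException
    PrepareRecoveryResponseProto.Builder resp =
        PrepareRecoveryResponseProto.newBuilder();
    if (acceptedState != null) {
      resp.setAcceptedInEpoch(acceptedInEpoch);
      resp.setSegmentState(acceptedState);
    }
    return resp.build();
  }

  synchronized void acceptRecovery(RequestInfo req, SegmentStateProto state)
      throws IOException {
    checkRequest(req);
    acceptedInEpoch = req.getEpoch();
    acceptedState = state;                  // written to disk before replying
  }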

  // TODO:
  // - add a test that checks formatting behavior
  // - add a test that checks newEpoch is rejected if nsinfo doesn't match

}

@@ -17,11 +17,16 @@
 */
package org.apache.hadoop.hdfs.server.namenode;

import static org.mockito.Matchers.anyInt;
import static org.mockito.Mockito.doNothing;
import static org.mockito.Mockito.spy;

import java.io.File;
import java.io.IOException;
import java.util.concurrent.locks.ReentrantReadWriteLock;

import org.apache.hadoop.fs.UnresolvedLinkException;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.fs.permission.PermissionStatus;
import org.apache.hadoop.hdfs.protocol.DatanodeID;
import org.apache.hadoop.hdfs.protocol.HdfsFileStatus;

@@ -183,6 +188,15 @@ public class NameNodeAdapter {
    }
  }

  public static FSEditLogOp createMkdirOp(String path) {
    MkdirOp op = MkdirOp.getInstance(new FSEditLogOp.OpInstanceCache())
      .setPath(path)
      .setTimestamp(0)
      .setPermissionStatus(new PermissionStatus(
          "testuser", "testgroup", FsPermission.getDefault()));
    return op;
  }
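
The new createMkdirOp() helper lets a test mint a valid, standalone edit-log operation without going through a running namesystem. A hypothetical usage sketch; writeOp() is a placeholder for whatever edit-log writer the test is exercising, not a real API, and stamping the txid here is an assumption about how the caller wants to use the op:

  // Illustrative only: writeOp() is a stand-in for the writer under test.
  for (int txid = 1; txid <= 3; txid++) {
    FSEditLogOp op = NameNodeAdapter.createMkdirOp("/fake-dir-" + txid);
    op.setTransactionId(txid);   // assumption: the test assigns txids itself
    writeOp(op);
  }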

  /**
   * @return the number of blocks marked safe by safemode, or -1
   * if safemode is not running.