Add checksumming and versions to the Translog's Checkpoint files (#19797)

This prepares the infrastructure to be able to extend the checkpoint file to store more information.
This commit is contained in:
Boaz Leskes 2016-08-04 20:42:12 +02:00 committed by GitHub
parent bca9ad86c6
commit 7010082112
3 changed files with 91 additions and 41 deletions

View File

@ -18,16 +18,19 @@
*/
package org.elasticsearch.index.translog;
import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.codecs.CodecUtil;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.OutputStreamIndexOutput;
import org.apache.lucene.store.SimpleFSDirectory;
import org.elasticsearch.common.io.Channels;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.OpenOption;
import java.nio.file.Path;
@ -35,38 +38,46 @@ import java.nio.file.Path;
*/
class Checkpoint {
static final int BUFFER_SIZE = Integer.BYTES // ops
+ Long.BYTES // offset
+ Long.BYTES;// generation
final long offset;
final int numOps;
final long generation;
// Version passed to CodecUtil.writeHeader / CodecUtil.checkHeader for checksummed checkpoint files.
private static final int INITIAL_VERSION = 1; // start with 1, just to recognize there was some magic serialization logic before
// Codec name passed to CodecUtil when writing and checking the file header.
private static final String CHECKPOINT_CODEC = "ckp";
// Total size in bytes of a checksummed checkpoint file: codec header + the three payload fields + codec footer.
static final int FILE_SIZE = CodecUtil.headerLength(CHECKPOINT_CODEC)
+ Integer.BYTES // ops
+ Long.BYTES // offset
+ Long.BYTES // generation
+ CodecUtil.footerLength();
// Size in bytes of a checkpoint file without header and footer (payload fields only).
// read(Path) compares the file length against this value to detect files written before checksumming was added.
static final int LEGACY_NON_CHECKSUMMED_FILE_LENGTH = Integer.BYTES // ops
+ Long.BYTES // offset
+ Long.BYTES; // generation
/**
 * Creates a checkpoint from its three components.
 *
 * @param offset     value stored in the {@code offset} field
 * @param numOps     value stored in the {@code numOps} field
 * @param generation value stored in the {@code generation} field
 */
Checkpoint(long offset, int numOps, long generation) {
    this.generation = generation;
    this.numOps = numOps;
    this.offset = offset;
}
Checkpoint(DataInput in) throws IOException {
offset = in.readLong();
numOps = in.readInt();
generation = in.readLong();
}
/**
 * Serializes this checkpoint into a {@code BUFFER_SIZE}-byte array and hands that array to
 * {@code Channels.writeToChannel} together with the given channel.
 *
 * @param channel the channel the serialized bytes are written to
 * @throws IOException if serialization or the channel write fails
 */
private void write(FileChannel channel) throws IOException {
    final byte[] serialized = new byte[BUFFER_SIZE];
    write(new ByteArrayDataOutput(serialized));
    Channels.writeToChannel(serialized, channel);
}
void write(DataOutput out) throws IOException {
private void write(DataOutput out) throws IOException {
// Field order (offset as long, numOps as int, generation as long) is the same order the
// read* methods consume; keep both sides in sync when changing it.
out.writeLong(offset);
out.writeInt(numOps);
out.writeLong(generation);
}
/**
 * Reads the payload of a checksummed checkpoint, the format introduced in ES 5.0.0.
 * Consumes a long (offset), an int (numOps) and a long (generation), in that order.
 * This method only parses the three fields; it performs no header or checksum handling itself.
 *
 * @param in the input positioned at the start of the payload
 * @return the checkpoint built from the values read
 * @throws IOException if any of the reads fails
 */
static Checkpoint readChecksummedV1(DataInput in) throws IOException {
    final long off = in.readLong();
    final int ops = in.readInt();
    final long gen = in.readLong();
    return new Checkpoint(off, ops, gen);
}
/**
 * Reads a checkpoint written by ES &lt; 5.0.0, i.e. a file without checksum.
 * Consumes a long (offset), an int (numOps) and a long (generation), in that order.
 *
 * @param in the input to read the three fields from
 * @return the checkpoint built from the values read
 * @throws IOException if any of the reads fails
 */
static Checkpoint readNonChecksummed(DataInput in) throws IOException {
    final long off = in.readLong();
    final int ops = in.readInt();
    final long gen = in.readLong();
    return new Checkpoint(off, ops, gen);
}
@Override
public String toString() {
return "Checkpoint{" +
@ -77,27 +88,67 @@ class Checkpoint {
}
/**
 * Reads the checkpoint stored at {@code path}.
 * <p>
 * A file whose length equals {@code LEGACY_NON_CHECKSUMMED_FILE_LENGTH} is parsed with
 * {@code readNonChecksummed}. Any other file is first passed to
 * {@code CodecUtil.checksumEntireFile}, then its header is checked against
 * {@code CHECKPOINT_CODEC} / {@code INITIAL_VERSION}, and finally the payload is parsed with
 * {@code readChecksummedV1}.
 *
 * @param path location of the checkpoint file
 * @return the checkpoint read from the file
 * @throws IOException if the file cannot be opened or read, or if one of the CodecUtil checks fails
 */
public static Checkpoint read(Path path) throws IOException {
try (InputStream in = Files.newInputStream(path)) {
return new Checkpoint(new InputStreamDataInput(in));
// The file is opened as an IndexInput through a Directory rooted at the parent folder,
// which is the input type the CodecUtil calls below operate on.
try (Directory dir = new SimpleFSDirectory(path.getParent())) {
try (final IndexInput indexInput = dir.openInput(path.getFileName().toString(), IOContext.DEFAULT)) {
if (indexInput.length() == LEGACY_NON_CHECKSUMMED_FILE_LENGTH) {
// OLD unchecksummed file that was written < ES 5.0.0
return Checkpoint.readNonChecksummed(indexInput);
}
// We checksum the entire file before we even go and parse it. If it's corrupted we barf right here.
CodecUtil.checksumEntireFile(indexInput);
// Min and max accepted version are both INITIAL_VERSION. The returned version is not used
// further in this method; the payload is always parsed as V1.
final int fileVersion = CodecUtil.checkHeader(indexInput, CHECKPOINT_CODEC, INITIAL_VERSION, INITIAL_VERSION);
return Checkpoint.readChecksummedV1(indexInput);
}
}
}
/**
 * Writes {@code checkpoint} to {@code checkpointFile}.
 * <p>
 * The checkpoint is first serialized (codec header, payload, codec footer) into an in-memory
 * buffer. The file is then opened through {@code factory} with the given {@code options}, the
 * buffer is written to the channel, and the channel is forced without metadata.
 *
 * @param factory        used to open the channel for {@code checkpointFile}
 * @param checkpointFile the file to write to
 * @param checkpoint     the checkpoint to serialize
 * @param options        open options passed to {@code factory.open}
 * @throws IOException if serialization, opening the file, writing or forcing fails
 */
public static void write(ChannelFactory factory, Path checkpointFile, Checkpoint checkpoint, OpenOption... options) throws IOException {
// In-memory target sized to FILE_SIZE. toByteArray() is overridden to hand out the backing
// array itself instead of a copy.
// NOTE(review): the whole backing array is returned, so this relies on exactly FILE_SIZE bytes
// being written (checked by the first assert below, which only runs with assertions enabled).
final ByteArrayOutputStream byteOutputStream = new ByteArrayOutputStream(FILE_SIZE) {
@Override
public synchronized byte[] toByteArray() {
// don't clone
return buf;
}
};
final String resourceDesc = "checkpoint(path=\"" + checkpointFile + "\", gen=" + checkpoint + ")";
try (final OutputStreamIndexOutput indexOutput =
new OutputStreamIndexOutput(resourceDesc, checkpointFile.toString(), byteOutputStream, FILE_SIZE)) {
CodecUtil.writeHeader(indexOutput, CHECKPOINT_CODEC, INITIAL_VERSION);
checkpoint.write(indexOutput);
CodecUtil.writeFooter(indexOutput);
// Sanity checks: the number of bytes produced must match the precomputed FILE_SIZE and stay below 512.
assert indexOutput.getFilePointer() == FILE_SIZE :
"get you number straights. Bytes written: " + indexOutput.getFilePointer() + " buffer size: " + FILE_SIZE;
assert indexOutput.getFilePointer() < 512 :
"checkpoint files have to be smaller 512b for atomic writes. size: " + indexOutput.getFilePointer();
}
// now go and write to the channel, in one go.
try (FileChannel channel = factory.open(checkpointFile, options)) {
checkpoint.write(channel);
Channels.writeToChannel(byteOutputStream.toByteArray(), channel);
// no need to force metadata, file size stays the same and we did the full fsync
// when we first created the file, so the directory entry doesn't change as well
channel.force(false);
}
}
/**
 * Two checkpoints are equal when they are of the same class and their {@code offset},
 * {@code numOps} and {@code generation} values all match.
 */
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
if (this == o) {
return true;
}
// Exact class match is required; null and instances of other classes are never equal.
if (o == null || getClass() != o.getClass()) {
return false;
}
Checkpoint that = (Checkpoint) o;
if (offset != that.offset) return false;
if (numOps != that.numOps) return false;
if (offset != that.offset) {
return false;
}
if (numOps != that.numOps) {
return false;
}
return generation == that.generation;
}

View File

@ -200,7 +200,9 @@ public class Translog extends AbstractIndexShardComponent implements IndexShardC
Files.createDirectories(location);
final long generation = 1;
Checkpoint checkpoint = new Checkpoint(0, 0, generation);
Checkpoint.write(getChannelFactory(), location.resolve(CHECKPOINT_FILE_NAME), checkpoint, StandardOpenOption.WRITE, StandardOpenOption.CREATE_NEW);
final Path checkpointFile = location.resolve(CHECKPOINT_FILE_NAME);
Checkpoint.write(getChannelFactory(), checkpointFile, checkpoint, StandardOpenOption.WRITE, StandardOpenOption.CREATE_NEW);
IOUtils.fsync(checkpointFile, false);
current = createWriter(generation);
this.lastCommittedTranslogFileGeneration = NOT_SET_GENERATION;

View File

@ -36,11 +36,9 @@ import org.apache.lucene.util.IOUtils;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.cli.SettingCommand;
import org.elasticsearch.cli.Terminal;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.SuppressForbidden;
import org.elasticsearch.common.io.PathUtils;
import org.elasticsearch.index.IndexNotFoundException;
import org.elasticsearch.index.translog.Checkpoint;
import java.io.IOException;
import java.nio.channels.Channels;
@ -168,12 +166,11 @@ public class TruncateTranslogCommand extends SettingCommand {
/**
 * Write a checkpoint file to the given location with the given generation.
 * The checkpoint is created with {@code translogLength} as its offset and zero operations.
 *
 * @param filename           path of the checkpoint file to create
 * @param translogLength     value used as the checkpoint's offset
 * @param translogGeneration value used as the checkpoint's generation
 * @throws IOException if the file cannot be created, written or fsynced
 */
public static void writeEmptyCheckpoint(Path filename, int translogLength, long translogGeneration) throws IOException {
try (FileChannel fc = FileChannel.open(filename, StandardOpenOption.WRITE, StandardOpenOption.READ, StandardOpenOption.CREATE_NEW);
OutputStreamDataOutput out = new OutputStreamDataOutput(Channels.newOutputStream(fc))) {
Checkpoint emptyCheckpoint = new Checkpoint(translogLength, 0, translogGeneration);
emptyCheckpoint.write(out);
fc.force(true);
}
// CREATE_NEW is among the open options, so the file is opened in create-new mode.
Checkpoint.write(FileChannel::open, filename, emptyCheckpoint,
StandardOpenOption.WRITE, StandardOpenOption.READ, StandardOpenOption.CREATE_NEW);
// fsync with metadata here to make sure.
IOUtils.fsync(filename, false);
}
/**