HADOOP-1381. The distance between sync blocks in SequenceFiles should be configurable rather than hard coded to 2000 bytes. Contributed by Harsh J.

This commit is contained in:
Harsh J 2016-10-26 20:04:33 +05:30
parent ee3d437a33
commit 07825f2b49
2 changed files with 146 additions and 41 deletions

View File

@ -24,6 +24,7 @@ import java.util.*;
import java.rmi.server.UID; import java.rmi.server.UID;
import java.security.MessageDigest; import java.security.MessageDigest;
import com.google.common.annotations.VisibleForTesting;
import org.apache.commons.logging.*; import org.apache.commons.logging.*;
import org.apache.hadoop.util.Options; import org.apache.hadoop.util.Options;
import org.apache.hadoop.fs.*; import org.apache.hadoop.fs.*;
@ -146,7 +147,7 @@ import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IO_SKIP_CHECKSU
* </ul> * </ul>
* </li> * </li>
* <li> * <li>
* A sync-marker every few <code>100</code> bytes or so. * A sync-marker every few <code>100</code> kilobytes or so.
* </li> * </li>
* </ul> * </ul>
* *
@ -165,7 +166,7 @@ import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IO_SKIP_CHECKSU
* </ul> * </ul>
* </li> * </li>
* <li> * <li>
* A sync-marker every few <code>100</code> bytes or so. * A sync-marker every few <code>100</code> kilobytes or so.
* </li> * </li>
* </ul> * </ul>
* *
@ -217,8 +218,11 @@ public class SequenceFile {
private static final int SYNC_HASH_SIZE = 16; // number of bytes in hash private static final int SYNC_HASH_SIZE = 16; // number of bytes in hash
private static final int SYNC_SIZE = 4+SYNC_HASH_SIZE; // escape + hash private static final int SYNC_SIZE = 4+SYNC_HASH_SIZE; // escape + hash
/** The number of bytes between sync points.*/ /**
public static final int SYNC_INTERVAL = 100*SYNC_SIZE; * The number of bytes between sync points. 100 KB, default.
* Computed as 5 KB * 20 = 100 KB
*/
public static final int SYNC_INTERVAL = 5 * 1024 * SYNC_SIZE; // 5KB*(16+4)
/** /**
* The compression type used to compress key/value pairs in the * The compression type used to compress key/value pairs in the
@ -856,6 +860,9 @@ public class SequenceFile {
// starts and ends by scanning for this value. // starts and ends by scanning for this value.
long lastSyncPos; // position of last sync long lastSyncPos; // position of last sync
byte[] sync; // 16 random bytes byte[] sync; // 16 random bytes
@VisibleForTesting
int syncInterval;
{ {
try { try {
MessageDigest digester = MessageDigest.getInstance("MD5"); MessageDigest digester = MessageDigest.getInstance("MD5");
@ -988,6 +995,15 @@ public class SequenceFile {
return new SequenceFile.Writer.FileSystemOption(fs); return new SequenceFile.Writer.FileSystemOption(fs);
} }
private static class SyncIntervalOption extends Options.IntegerOption
implements Option {
SyncIntervalOption(int val) {
// If a negative sync interval is provided,
// fall back to the default sync interval.
super(val < 0 ? SYNC_INTERVAL : val);
}
}
public static Option bufferSize(int value) { public static Option bufferSize(int value) {
return new BufferSizeOption(value); return new BufferSizeOption(value);
} }
@ -1033,10 +1049,14 @@ public class SequenceFile {
return new CompressionOption(value, codec); return new CompressionOption(value, codec);
} }
public static Option syncInterval(int value) {
return new SyncIntervalOption(value);
}
/** /**
* Construct a uncompressed writer from a set of options. * Construct a uncompressed writer from a set of options.
* @param conf the configuration to use * @param conf the configuration to use
* @param options the options used when creating the writer * @param opts the options used when creating the writer
* @throws IOException if it fails * @throws IOException if it fails
*/ */
Writer(Configuration conf, Writer(Configuration conf,
@ -1062,6 +1082,8 @@ public class SequenceFile {
Options.getOption(MetadataOption.class, opts); Options.getOption(MetadataOption.class, opts);
CompressionOption compressionTypeOption = CompressionOption compressionTypeOption =
Options.getOption(CompressionOption.class, opts); Options.getOption(CompressionOption.class, opts);
SyncIntervalOption syncIntervalOption =
Options.getOption(SyncIntervalOption.class, opts);
// check consistency of options // check consistency of options
if ((fileOption == null) == (streamOption == null)) { if ((fileOption == null) == (streamOption == null)) {
throw new IllegalArgumentException("file or stream must be specified"); throw new IllegalArgumentException("file or stream must be specified");
@ -1163,7 +1185,12 @@ public class SequenceFile {
"GzipCodec without native-hadoop " + "GzipCodec without native-hadoop " +
"code!"); "code!");
} }
init(conf, out, ownStream, keyClass, valueClass, codec, metadata); this.syncInterval = (syncIntervalOption == null) ?
SYNC_INTERVAL :
syncIntervalOption.getValue();
init(
conf, out, ownStream, keyClass, valueClass,
codec, metadata, syncInterval);
} }
/** Create the named file. /** Create the named file.
@ -1176,7 +1203,7 @@ public class SequenceFile {
Class keyClass, Class valClass) throws IOException { Class keyClass, Class valClass) throws IOException {
this.compress = CompressionType.NONE; this.compress = CompressionType.NONE;
init(conf, fs.create(name), true, keyClass, valClass, null, init(conf, fs.create(name), true, keyClass, valClass, null,
new Metadata()); new Metadata(), SYNC_INTERVAL);
} }
/** Create the named file with write-progress reporter. /** Create the named file with write-progress reporter.
@ -1190,7 +1217,7 @@ public class SequenceFile {
Progressable progress, Metadata metadata) throws IOException { Progressable progress, Metadata metadata) throws IOException {
this.compress = CompressionType.NONE; this.compress = CompressionType.NONE;
init(conf, fs.create(name, progress), true, keyClass, valClass, init(conf, fs.create(name, progress), true, keyClass, valClass,
null, metadata); null, metadata, SYNC_INTERVAL);
} }
/** Create the named file with write-progress reporter. /** Create the named file with write-progress reporter.
@ -1206,7 +1233,7 @@ public class SequenceFile {
this.compress = CompressionType.NONE; this.compress = CompressionType.NONE;
init(conf, init(conf,
fs.create(name, true, bufferSize, replication, blockSize, progress), fs.create(name, true, bufferSize, replication, blockSize, progress),
true, keyClass, valClass, null, metadata); true, keyClass, valClass, null, metadata, SYNC_INTERVAL);
} }
boolean isCompressed() { return compress != CompressionType.NONE; } boolean isCompressed() { return compress != CompressionType.NONE; }
@ -1234,18 +1261,21 @@ public class SequenceFile {
/** Initialize. */ /** Initialize. */
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
void init(Configuration conf, FSDataOutputStream out, boolean ownStream, void init(Configuration config, FSDataOutputStream outStream,
Class keyClass, Class valClass, boolean ownStream, Class key, Class val,
CompressionCodec codec, Metadata metadata) CompressionCodec compCodec, Metadata meta,
int syncIntervalVal)
throws IOException { throws IOException {
this.conf = conf; this.conf = config;
this.out = out; this.out = outStream;
this.ownOutputStream = ownStream; this.ownOutputStream = ownStream;
this.keyClass = keyClass; this.keyClass = key;
this.valClass = valClass; this.valClass = val;
this.codec = codec; this.codec = compCodec;
this.metadata = metadata; this.metadata = meta;
SerializationFactory serializationFactory = new SerializationFactory(conf); this.syncInterval = syncIntervalVal;
SerializationFactory serializationFactory =
new SerializationFactory(config);
this.keySerializer = serializationFactory.getSerializer(keyClass); this.keySerializer = serializationFactory.getSerializer(keyClass);
if (this.keySerializer == null) { if (this.keySerializer == null) {
throw new IOException( throw new IOException(
@ -1366,7 +1396,7 @@ public class SequenceFile {
synchronized void checkAndWriteSync() throws IOException { synchronized void checkAndWriteSync() throws IOException {
if (sync != null && if (sync != null &&
out.getPos() >= lastSyncPos+SYNC_INTERVAL) { // time to emit sync out.getPos() >= lastSyncPos+this.syncInterval) { // time to emit sync
sync(); sync();
} }
} }

View File

@ -27,13 +27,15 @@ import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile.CompressionType;
import org.apache.hadoop.test.GenericTestUtils; import org.apache.hadoop.test.GenericTestUtils;
import org.junit.Test; import org.junit.Test;
/** Tests sync based seek reads/write intervals inside SequenceFiles. */
public class TestSequenceFileSync { public class TestSequenceFileSync {
private static final int NUMRECORDS = 2000; private static final int NUMRECORDS = 2000;
private static final int RECORDSIZE = 80; private static final int RECORDSIZE = 80;
private static final Random rand = new Random(); private static final Random RAND = new Random();
private final static String REC_FMT = "%d RECORDID %d : "; private final static String REC_FMT = "%d RECORDID %d : ";
@ -46,19 +48,83 @@ public class TestSequenceFileSync {
reader.next(key, val); reader.next(key, val);
assertEquals(key.get(), expectedRecord); assertEquals(key.get(), expectedRecord);
final String test = String.format(REC_FMT, expectedRecord, expectedRecord); final String test = String.format(REC_FMT, expectedRecord, expectedRecord);
assertEquals("Invalid value " + val, 0, val.find(test, 0)); assertEquals(
"Invalid value in iter " + iter + ": " + val,
0,
val.find(test, 0));
} }
@Test @Test
public void testLowSyncpoint() throws IOException { public void testDefaultSyncInterval() throws IOException {
// Uses the default sync interval of 100 KB
final Configuration conf = new Configuration(); final Configuration conf = new Configuration();
final FileSystem fs = FileSystem.getLocal(conf); final FileSystem fs = FileSystem.getLocal(conf);
final Path path = new Path(GenericTestUtils.getTempPath( final Path path = new Path(GenericTestUtils.getTempPath(
"sequencefile.sync.test")); "sequencefile.sync.test"));
final IntWritable input = new IntWritable(); final IntWritable input = new IntWritable();
final Text val = new Text(); final Text val = new Text();
SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path, SequenceFile.Writer writer = new SequenceFile.Writer(
IntWritable.class, Text.class); conf,
SequenceFile.Writer.file(path),
SequenceFile.Writer.compression(CompressionType.NONE),
SequenceFile.Writer.keyClass(IntWritable.class),
SequenceFile.Writer.valueClass(Text.class)
);
try {
writeSequenceFile(writer, NUMRECORDS*4);
for (int i = 0; i < 5; i++) {
final SequenceFile.Reader reader;
//try different SequenceFile.Reader constructors
if (i % 2 == 0) {
final int buffersize = conf.getInt("io.file.buffer.size", 4096);
reader = new SequenceFile.Reader(conf,
SequenceFile.Reader.file(path),
SequenceFile.Reader.bufferSize(buffersize));
} else {
final FSDataInputStream in = fs.open(path);
final long length = fs.getFileStatus(path).getLen();
reader = new SequenceFile.Reader(conf,
SequenceFile.Reader.stream(in),
SequenceFile.Reader.start(0L),
SequenceFile.Reader.length(length));
}
try {
forOffset(reader, input, val, i, 0, 0);
forOffset(reader, input, val, i, 65, 0);
// There would be over 1000 records within
// this sync interval
forOffset(reader, input, val, i, 2000, 1101);
forOffset(reader, input, val, i, 0, 0);
} finally {
reader.close();
}
}
} finally {
fs.delete(path, false);
}
}
@Test
public void testLowSyncpoint() throws IOException {
// Uses a smaller sync interval of 2000 bytes
final Configuration conf = new Configuration();
final FileSystem fs = FileSystem.getLocal(conf);
final Path path = new Path(GenericTestUtils.getTempPath(
"sequencefile.sync.test"));
final IntWritable input = new IntWritable();
final Text val = new Text();
SequenceFile.Writer writer = new SequenceFile.Writer(
conf,
SequenceFile.Writer.file(path),
SequenceFile.Writer.compression(CompressionType.NONE),
SequenceFile.Writer.keyClass(IntWritable.class),
SequenceFile.Writer.valueClass(Text.class),
SequenceFile.Writer.syncInterval(20*100)
);
// Ensure the custom sync interval value is set
assertEquals(writer.syncInterval, 20*100);
try { try {
writeSequenceFile(writer, NUMRECORDS); writeSequenceFile(writer, NUMRECORDS);
for (int i = 0; i < 5; i++) { for (int i = 0; i < 5; i++) {
@ -66,17 +132,26 @@ public class TestSequenceFileSync {
//try different SequenceFile.Reader constructors //try different SequenceFile.Reader constructors
if (i % 2 == 0) { if (i % 2 == 0) {
reader = new SequenceFile.Reader(fs, path, conf); final int bufferSize = conf.getInt("io.file.buffer.size", 4096);
reader = new SequenceFile.Reader(
conf,
SequenceFile.Reader.file(path),
SequenceFile.Reader.bufferSize(bufferSize));
} else { } else {
final FSDataInputStream in = fs.open(path); final FSDataInputStream in = fs.open(path);
final long length = fs.getFileStatus(path).getLen(); final long length = fs.getFileStatus(path).getLen();
final int buffersize = conf.getInt("io.file.buffer.size", 4096); reader = new SequenceFile.Reader(
reader = new SequenceFile.Reader(in, buffersize, 0L, length, conf); conf,
SequenceFile.Reader.stream(in),
SequenceFile.Reader.start(0L),
SequenceFile.Reader.length(length));
} }
try { try {
forOffset(reader, input, val, i, 0, 0); forOffset(reader, input, val, i, 0, 0);
forOffset(reader, input, val, i, 65, 0); forOffset(reader, input, val, i, 65, 0);
// There would be only a few records within
// this sync interval
forOffset(reader, input, val, i, 2000, 21); forOffset(reader, input, val, i, 2000, 21);
forOffset(reader, input, val, i, 0, 0); forOffset(reader, input, val, i, 0, 0);
} finally { } finally {
@ -88,7 +163,7 @@ public class TestSequenceFileSync {
} }
} }
public static void writeSequenceFile(SequenceFile.Writer writer, private static void writeSequenceFile(SequenceFile.Writer writer,
int numRecords) throws IOException { int numRecords) throws IOException {
final IntWritable key = new IntWritable(); final IntWritable key = new IntWritable();
final Text val = new Text(); final Text val = new Text();
@ -100,13 +175,13 @@ public class TestSequenceFileSync {
writer.close(); writer.close();
} }
static void randomText(Text val, int id, int recordSize) { private static void randomText(Text val, int id, int recordSize) {
val.clear(); val.clear();
final StringBuilder ret = new StringBuilder(recordSize); final StringBuilder ret = new StringBuilder(recordSize);
ret.append(String.format(REC_FMT, id, id)); ret.append(String.format(REC_FMT, id, id));
recordSize -= ret.length(); recordSize -= ret.length();
for (int i = 0; i < recordSize; ++i) { for (int i = 0; i < recordSize; ++i) {
ret.append(rand.nextInt(9)); ret.append(RAND.nextInt(9));
} }
val.set(ret.toString()); val.set(ret.toString());
} }