HADOOP-1381. The distance between sync blocks in SequenceFiles should be configurable rather than hard coded to 2000 bytes. Contributed by Harsh J.
This commit is contained in:
parent
ee3d437a33
commit
07825f2b49
|
@ -24,6 +24,7 @@ import java.util.*;
|
||||||
import java.rmi.server.UID;
|
import java.rmi.server.UID;
|
||||||
import java.security.MessageDigest;
|
import java.security.MessageDigest;
|
||||||
|
|
||||||
|
import com.google.common.annotations.VisibleForTesting;
|
||||||
import org.apache.commons.logging.*;
|
import org.apache.commons.logging.*;
|
||||||
import org.apache.hadoop.util.Options;
|
import org.apache.hadoop.util.Options;
|
||||||
import org.apache.hadoop.fs.*;
|
import org.apache.hadoop.fs.*;
|
||||||
|
@ -146,7 +147,7 @@ import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IO_SKIP_CHECKSU
|
||||||
* </ul>
|
* </ul>
|
||||||
* </li>
|
* </li>
|
||||||
* <li>
|
* <li>
|
||||||
* A sync-marker every few <code>100</code> bytes or so.
|
* A sync-marker every few <code>100</code> kilobytes or so.
|
||||||
* </li>
|
* </li>
|
||||||
* </ul>
|
* </ul>
|
||||||
*
|
*
|
||||||
|
@ -165,7 +166,7 @@ import static org.apache.hadoop.fs.CommonConfigurationKeysPublic.IO_SKIP_CHECKSU
|
||||||
* </ul>
|
* </ul>
|
||||||
* </li>
|
* </li>
|
||||||
* <li>
|
* <li>
|
||||||
* A sync-marker every few <code>100</code> bytes or so.
|
* A sync-marker every few <code>100</code> kilobytes or so.
|
||||||
* </li>
|
* </li>
|
||||||
* </ul>
|
* </ul>
|
||||||
*
|
*
|
||||||
|
@ -217,8 +218,11 @@ public class SequenceFile {
|
||||||
private static final int SYNC_HASH_SIZE = 16; // number of bytes in hash
|
private static final int SYNC_HASH_SIZE = 16; // number of bytes in hash
|
||||||
private static final int SYNC_SIZE = 4+SYNC_HASH_SIZE; // escape + hash
|
private static final int SYNC_SIZE = 4+SYNC_HASH_SIZE; // escape + hash
|
||||||
|
|
||||||
/** The number of bytes between sync points.*/
|
/**
|
||||||
public static final int SYNC_INTERVAL = 100*SYNC_SIZE;
|
* The number of bytes between sync points. 100 KB, default.
|
||||||
|
* Computed as 5 KB * 20 = 100 KB
|
||||||
|
*/
|
||||||
|
public static final int SYNC_INTERVAL = 5 * 1024 * SYNC_SIZE; // 5KB*(16+4)
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The compression type used to compress key/value pairs in the
|
* The compression type used to compress key/value pairs in the
|
||||||
|
@ -856,6 +860,9 @@ public class SequenceFile {
|
||||||
// starts and ends by scanning for this value.
|
// starts and ends by scanning for this value.
|
||||||
long lastSyncPos; // position of last sync
|
long lastSyncPos; // position of last sync
|
||||||
byte[] sync; // 16 random bytes
|
byte[] sync; // 16 random bytes
|
||||||
|
@VisibleForTesting
|
||||||
|
int syncInterval;
|
||||||
|
|
||||||
{
|
{
|
||||||
try {
|
try {
|
||||||
MessageDigest digester = MessageDigest.getInstance("MD5");
|
MessageDigest digester = MessageDigest.getInstance("MD5");
|
||||||
|
@ -988,6 +995,15 @@ public class SequenceFile {
|
||||||
return new SequenceFile.Writer.FileSystemOption(fs);
|
return new SequenceFile.Writer.FileSystemOption(fs);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static class SyncIntervalOption extends Options.IntegerOption
|
||||||
|
implements Option {
|
||||||
|
SyncIntervalOption(int val) {
|
||||||
|
// If a negative sync interval is provided,
|
||||||
|
// fall back to the default sync interval.
|
||||||
|
super(val < 0 ? SYNC_INTERVAL : val);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public static Option bufferSize(int value) {
|
public static Option bufferSize(int value) {
|
||||||
return new BufferSizeOption(value);
|
return new BufferSizeOption(value);
|
||||||
}
|
}
|
||||||
|
@ -1033,10 +1049,14 @@ public class SequenceFile {
|
||||||
return new CompressionOption(value, codec);
|
return new CompressionOption(value, codec);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static Option syncInterval(int value) {
|
||||||
|
return new SyncIntervalOption(value);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Construct a uncompressed writer from a set of options.
|
* Construct a uncompressed writer from a set of options.
|
||||||
* @param conf the configuration to use
|
* @param conf the configuration to use
|
||||||
* @param options the options used when creating the writer
|
* @param opts the options used when creating the writer
|
||||||
* @throws IOException if it fails
|
* @throws IOException if it fails
|
||||||
*/
|
*/
|
||||||
Writer(Configuration conf,
|
Writer(Configuration conf,
|
||||||
|
@ -1062,6 +1082,8 @@ public class SequenceFile {
|
||||||
Options.getOption(MetadataOption.class, opts);
|
Options.getOption(MetadataOption.class, opts);
|
||||||
CompressionOption compressionTypeOption =
|
CompressionOption compressionTypeOption =
|
||||||
Options.getOption(CompressionOption.class, opts);
|
Options.getOption(CompressionOption.class, opts);
|
||||||
|
SyncIntervalOption syncIntervalOption =
|
||||||
|
Options.getOption(SyncIntervalOption.class, opts);
|
||||||
// check consistency of options
|
// check consistency of options
|
||||||
if ((fileOption == null) == (streamOption == null)) {
|
if ((fileOption == null) == (streamOption == null)) {
|
||||||
throw new IllegalArgumentException("file or stream must be specified");
|
throw new IllegalArgumentException("file or stream must be specified");
|
||||||
|
@ -1163,7 +1185,12 @@ public class SequenceFile {
|
||||||
"GzipCodec without native-hadoop " +
|
"GzipCodec without native-hadoop " +
|
||||||
"code!");
|
"code!");
|
||||||
}
|
}
|
||||||
init(conf, out, ownStream, keyClass, valueClass, codec, metadata);
|
this.syncInterval = (syncIntervalOption == null) ?
|
||||||
|
SYNC_INTERVAL :
|
||||||
|
syncIntervalOption.getValue();
|
||||||
|
init(
|
||||||
|
conf, out, ownStream, keyClass, valueClass,
|
||||||
|
codec, metadata, syncInterval);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Create the named file.
|
/** Create the named file.
|
||||||
|
@ -1176,7 +1203,7 @@ public class SequenceFile {
|
||||||
Class keyClass, Class valClass) throws IOException {
|
Class keyClass, Class valClass) throws IOException {
|
||||||
this.compress = CompressionType.NONE;
|
this.compress = CompressionType.NONE;
|
||||||
init(conf, fs.create(name), true, keyClass, valClass, null,
|
init(conf, fs.create(name), true, keyClass, valClass, null,
|
||||||
new Metadata());
|
new Metadata(), SYNC_INTERVAL);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Create the named file with write-progress reporter.
|
/** Create the named file with write-progress reporter.
|
||||||
|
@ -1190,7 +1217,7 @@ public class SequenceFile {
|
||||||
Progressable progress, Metadata metadata) throws IOException {
|
Progressable progress, Metadata metadata) throws IOException {
|
||||||
this.compress = CompressionType.NONE;
|
this.compress = CompressionType.NONE;
|
||||||
init(conf, fs.create(name, progress), true, keyClass, valClass,
|
init(conf, fs.create(name, progress), true, keyClass, valClass,
|
||||||
null, metadata);
|
null, metadata, SYNC_INTERVAL);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Create the named file with write-progress reporter.
|
/** Create the named file with write-progress reporter.
|
||||||
|
@ -1206,7 +1233,7 @@ public class SequenceFile {
|
||||||
this.compress = CompressionType.NONE;
|
this.compress = CompressionType.NONE;
|
||||||
init(conf,
|
init(conf,
|
||||||
fs.create(name, true, bufferSize, replication, blockSize, progress),
|
fs.create(name, true, bufferSize, replication, blockSize, progress),
|
||||||
true, keyClass, valClass, null, metadata);
|
true, keyClass, valClass, null, metadata, SYNC_INTERVAL);
|
||||||
}
|
}
|
||||||
|
|
||||||
boolean isCompressed() { return compress != CompressionType.NONE; }
|
boolean isCompressed() { return compress != CompressionType.NONE; }
|
||||||
|
@ -1234,18 +1261,21 @@ public class SequenceFile {
|
||||||
|
|
||||||
/** Initialize. */
|
/** Initialize. */
|
||||||
@SuppressWarnings("unchecked")
|
@SuppressWarnings("unchecked")
|
||||||
void init(Configuration conf, FSDataOutputStream out, boolean ownStream,
|
void init(Configuration config, FSDataOutputStream outStream,
|
||||||
Class keyClass, Class valClass,
|
boolean ownStream, Class key, Class val,
|
||||||
CompressionCodec codec, Metadata metadata)
|
CompressionCodec compCodec, Metadata meta,
|
||||||
|
int syncIntervalVal)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
this.conf = conf;
|
this.conf = config;
|
||||||
this.out = out;
|
this.out = outStream;
|
||||||
this.ownOutputStream = ownStream;
|
this.ownOutputStream = ownStream;
|
||||||
this.keyClass = keyClass;
|
this.keyClass = key;
|
||||||
this.valClass = valClass;
|
this.valClass = val;
|
||||||
this.codec = codec;
|
this.codec = compCodec;
|
||||||
this.metadata = metadata;
|
this.metadata = meta;
|
||||||
SerializationFactory serializationFactory = new SerializationFactory(conf);
|
this.syncInterval = syncIntervalVal;
|
||||||
|
SerializationFactory serializationFactory =
|
||||||
|
new SerializationFactory(config);
|
||||||
this.keySerializer = serializationFactory.getSerializer(keyClass);
|
this.keySerializer = serializationFactory.getSerializer(keyClass);
|
||||||
if (this.keySerializer == null) {
|
if (this.keySerializer == null) {
|
||||||
throw new IOException(
|
throw new IOException(
|
||||||
|
@ -1366,7 +1396,7 @@ public class SequenceFile {
|
||||||
|
|
||||||
synchronized void checkAndWriteSync() throws IOException {
|
synchronized void checkAndWriteSync() throws IOException {
|
||||||
if (sync != null &&
|
if (sync != null &&
|
||||||
out.getPos() >= lastSyncPos+SYNC_INTERVAL) { // time to emit sync
|
out.getPos() >= lastSyncPos+this.syncInterval) { // time to emit sync
|
||||||
sync();
|
sync();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -27,13 +27,15 @@ import org.apache.hadoop.conf.Configuration;
|
||||||
import org.apache.hadoop.fs.FSDataInputStream;
|
import org.apache.hadoop.fs.FSDataInputStream;
|
||||||
import org.apache.hadoop.fs.FileSystem;
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
import org.apache.hadoop.fs.Path;
|
import org.apache.hadoop.fs.Path;
|
||||||
|
import org.apache.hadoop.io.SequenceFile.CompressionType;
|
||||||
import org.apache.hadoop.test.GenericTestUtils;
|
import org.apache.hadoop.test.GenericTestUtils;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
|
/** Tests sync based seek reads/write intervals inside SequenceFiles. */
|
||||||
public class TestSequenceFileSync {
|
public class TestSequenceFileSync {
|
||||||
private static final int NUMRECORDS = 2000;
|
private static final int NUMRECORDS = 2000;
|
||||||
private static final int RECORDSIZE = 80;
|
private static final int RECORDSIZE = 80;
|
||||||
private static final Random rand = new Random();
|
private static final Random RAND = new Random();
|
||||||
|
|
||||||
private final static String REC_FMT = "%d RECORDID %d : ";
|
private final static String REC_FMT = "%d RECORDID %d : ";
|
||||||
|
|
||||||
|
@ -46,19 +48,83 @@ public class TestSequenceFileSync {
|
||||||
reader.next(key, val);
|
reader.next(key, val);
|
||||||
assertEquals(key.get(), expectedRecord);
|
assertEquals(key.get(), expectedRecord);
|
||||||
final String test = String.format(REC_FMT, expectedRecord, expectedRecord);
|
final String test = String.format(REC_FMT, expectedRecord, expectedRecord);
|
||||||
assertEquals("Invalid value " + val, 0, val.find(test, 0));
|
assertEquals(
|
||||||
|
"Invalid value in iter " + iter + ": " + val,
|
||||||
|
0,
|
||||||
|
val.find(test, 0));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testLowSyncpoint() throws IOException {
|
public void testDefaultSyncInterval() throws IOException {
|
||||||
|
// Uses the default sync interval of 100 KB
|
||||||
final Configuration conf = new Configuration();
|
final Configuration conf = new Configuration();
|
||||||
final FileSystem fs = FileSystem.getLocal(conf);
|
final FileSystem fs = FileSystem.getLocal(conf);
|
||||||
final Path path = new Path(GenericTestUtils.getTempPath(
|
final Path path = new Path(GenericTestUtils.getTempPath(
|
||||||
"sequencefile.sync.test"));
|
"sequencefile.sync.test"));
|
||||||
final IntWritable input = new IntWritable();
|
final IntWritable input = new IntWritable();
|
||||||
final Text val = new Text();
|
final Text val = new Text();
|
||||||
SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, path,
|
SequenceFile.Writer writer = new SequenceFile.Writer(
|
||||||
IntWritable.class, Text.class);
|
conf,
|
||||||
|
SequenceFile.Writer.file(path),
|
||||||
|
SequenceFile.Writer.compression(CompressionType.NONE),
|
||||||
|
SequenceFile.Writer.keyClass(IntWritable.class),
|
||||||
|
SequenceFile.Writer.valueClass(Text.class)
|
||||||
|
);
|
||||||
|
try {
|
||||||
|
writeSequenceFile(writer, NUMRECORDS*4);
|
||||||
|
for (int i = 0; i < 5; i++) {
|
||||||
|
final SequenceFile.Reader reader;
|
||||||
|
|
||||||
|
//try different SequenceFile.Reader constructors
|
||||||
|
if (i % 2 == 0) {
|
||||||
|
final int buffersize = conf.getInt("io.file.buffer.size", 4096);
|
||||||
|
reader = new SequenceFile.Reader(conf,
|
||||||
|
SequenceFile.Reader.file(path),
|
||||||
|
SequenceFile.Reader.bufferSize(buffersize));
|
||||||
|
} else {
|
||||||
|
final FSDataInputStream in = fs.open(path);
|
||||||
|
final long length = fs.getFileStatus(path).getLen();
|
||||||
|
reader = new SequenceFile.Reader(conf,
|
||||||
|
SequenceFile.Reader.stream(in),
|
||||||
|
SequenceFile.Reader.start(0L),
|
||||||
|
SequenceFile.Reader.length(length));
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
forOffset(reader, input, val, i, 0, 0);
|
||||||
|
forOffset(reader, input, val, i, 65, 0);
|
||||||
|
// There would be over 1000 records within
|
||||||
|
// this sync interval
|
||||||
|
forOffset(reader, input, val, i, 2000, 1101);
|
||||||
|
forOffset(reader, input, val, i, 0, 0);
|
||||||
|
} finally {
|
||||||
|
reader.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} finally {
|
||||||
|
fs.delete(path, false);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testLowSyncpoint() throws IOException {
|
||||||
|
// Uses a smaller sync interval of 2000 bytes
|
||||||
|
final Configuration conf = new Configuration();
|
||||||
|
final FileSystem fs = FileSystem.getLocal(conf);
|
||||||
|
final Path path = new Path(GenericTestUtils.getTempPath(
|
||||||
|
"sequencefile.sync.test"));
|
||||||
|
final IntWritable input = new IntWritable();
|
||||||
|
final Text val = new Text();
|
||||||
|
SequenceFile.Writer writer = new SequenceFile.Writer(
|
||||||
|
conf,
|
||||||
|
SequenceFile.Writer.file(path),
|
||||||
|
SequenceFile.Writer.compression(CompressionType.NONE),
|
||||||
|
SequenceFile.Writer.keyClass(IntWritable.class),
|
||||||
|
SequenceFile.Writer.valueClass(Text.class),
|
||||||
|
SequenceFile.Writer.syncInterval(20*100)
|
||||||
|
);
|
||||||
|
// Ensure the custom sync interval value is set
|
||||||
|
assertEquals(writer.syncInterval, 20*100);
|
||||||
try {
|
try {
|
||||||
writeSequenceFile(writer, NUMRECORDS);
|
writeSequenceFile(writer, NUMRECORDS);
|
||||||
for (int i = 0; i < 5; i++) {
|
for (int i = 0; i < 5; i++) {
|
||||||
|
@ -66,17 +132,26 @@ public class TestSequenceFileSync {
|
||||||
|
|
||||||
//try different SequenceFile.Reader constructors
|
//try different SequenceFile.Reader constructors
|
||||||
if (i % 2 == 0) {
|
if (i % 2 == 0) {
|
||||||
reader = new SequenceFile.Reader(fs, path, conf);
|
final int bufferSize = conf.getInt("io.file.buffer.size", 4096);
|
||||||
|
reader = new SequenceFile.Reader(
|
||||||
|
conf,
|
||||||
|
SequenceFile.Reader.file(path),
|
||||||
|
SequenceFile.Reader.bufferSize(bufferSize));
|
||||||
} else {
|
} else {
|
||||||
final FSDataInputStream in = fs.open(path);
|
final FSDataInputStream in = fs.open(path);
|
||||||
final long length = fs.getFileStatus(path).getLen();
|
final long length = fs.getFileStatus(path).getLen();
|
||||||
final int buffersize = conf.getInt("io.file.buffer.size", 4096);
|
reader = new SequenceFile.Reader(
|
||||||
reader = new SequenceFile.Reader(in, buffersize, 0L, length, conf);
|
conf,
|
||||||
|
SequenceFile.Reader.stream(in),
|
||||||
|
SequenceFile.Reader.start(0L),
|
||||||
|
SequenceFile.Reader.length(length));
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
forOffset(reader, input, val, i, 0, 0);
|
forOffset(reader, input, val, i, 0, 0);
|
||||||
forOffset(reader, input, val, i, 65, 0);
|
forOffset(reader, input, val, i, 65, 0);
|
||||||
|
// There would be only a few records within
|
||||||
|
// this sync interval
|
||||||
forOffset(reader, input, val, i, 2000, 21);
|
forOffset(reader, input, val, i, 2000, 21);
|
||||||
forOffset(reader, input, val, i, 0, 0);
|
forOffset(reader, input, val, i, 0, 0);
|
||||||
} finally {
|
} finally {
|
||||||
|
@ -88,7 +163,7 @@ public class TestSequenceFileSync {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void writeSequenceFile(SequenceFile.Writer writer,
|
private static void writeSequenceFile(SequenceFile.Writer writer,
|
||||||
int numRecords) throws IOException {
|
int numRecords) throws IOException {
|
||||||
final IntWritable key = new IntWritable();
|
final IntWritable key = new IntWritable();
|
||||||
final Text val = new Text();
|
final Text val = new Text();
|
||||||
|
@ -100,13 +175,13 @@ public class TestSequenceFileSync {
|
||||||
writer.close();
|
writer.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
static void randomText(Text val, int id, int recordSize) {
|
private static void randomText(Text val, int id, int recordSize) {
|
||||||
val.clear();
|
val.clear();
|
||||||
final StringBuilder ret = new StringBuilder(recordSize);
|
final StringBuilder ret = new StringBuilder(recordSize);
|
||||||
ret.append(String.format(REC_FMT, id, id));
|
ret.append(String.format(REC_FMT, id, id));
|
||||||
recordSize -= ret.length();
|
recordSize -= ret.length();
|
||||||
for (int i = 0; i < recordSize; ++i) {
|
for (int i = 0; i < recordSize; ++i) {
|
||||||
ret.append(rand.nextInt(9));
|
ret.append(RAND.nextInt(9));
|
||||||
}
|
}
|
||||||
val.set(ret.toString());
|
val.set(ret.toString());
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue