HADOOP-8449. hadoop fs -text fails with compressed sequence files with the codec file extension (backported from trunk) (harsh)

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-2@1355637 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Harsh J 2012-06-30 05:05:41 +00:00
parent b51d32719d
commit 6f299e4e25
3 changed files with 42 additions and 12 deletions

View File

@ -47,6 +47,9 @@ Release 2.0.1-alpha - UNRELEASED
HADOOP-8524. Allow users to get source of a Configuration
parameter (harsh)
HADOOP-8449. hadoop fs -text fails with compressed sequence files
with the codec file extension (harsh)
BUG FIXES
HADOOP-8372. NetUtils.normalizeHostName() incorrectly handles hostname

View File

@ -108,30 +108,37 @@ public static class Text extends Cat {
@Override
protected InputStream getInputStream(PathData item) throws IOException {
FSDataInputStream i = (FSDataInputStream)super.getInputStream(item);
// check codecs
CompressionCodecFactory cf = new CompressionCodecFactory(getConf());
CompressionCodec codec = cf.getCodec(item.path);
if (codec != null) {
return codec.createInputStream(i);
}
// Check type of stream first
switch(i.readShort()) {
case 0x1f8b: { // RFC 1952
// Must be gzip
i.seek(0);
return new GZIPInputStream(i);
}
case 0x5345: { // 'S' 'E'
// Might be a SequenceFile
if (i.readByte() == 'Q') {
i.close();
return new TextRecordInputStream(item.stat);
}
}
default: {
// Check the type of compression instead, depending on Codec class's
// own detection methods, based on the provided path.
CompressionCodecFactory cf = new CompressionCodecFactory(getConf());
CompressionCodec codec = cf.getCodec(item.path);
if (codec != null) {
return codec.createInputStream(i);
}
break;
}
}
// File is non-compressed, or not a file container we know.
i.seek(0);
return i;
}
}
}
protected class TextRecordInputStream extends InputStream {

View File

@ -48,6 +48,8 @@
import org.apache.hadoop.hdfs.server.datanode.DataNodeTestUtils;
import org.apache.hadoop.hdfs.tools.DFSAdmin;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.security.UserGroupInformation;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.ToolRunner;
@ -545,7 +547,7 @@ public void testText() throws Exception {
textTest(new Path("/texttest").makeQualified(dfs.getUri(),
dfs.getWorkingDirectory()), conf);
conf.set("fs.default.name", dfs.getUri().toString());
conf.set("fs.defaultFS", dfs.getUri().toString());
final FileSystem lfs = FileSystem.getLocal(conf);
textTest(new Path(TEST_ROOT_DIR, "texttest").makeQualified(lfs.getUri(),
lfs.getWorkingDirectory()), conf);
@ -564,6 +566,7 @@ private void textTest(Path root, Configuration conf) throws Exception {
OutputStream zout = new GZIPOutputStream(
fs.create(new Path(root, "file.gz")));
Random r = new Random();
bak = System.out;
ByteArrayOutputStream file = new ByteArrayOutputStream();
for (int i = 0; i < 1024; ++i) {
char c = Character.forDigit(r.nextInt(26) + 10, 36);
@ -572,7 +575,6 @@ private void textTest(Path root, Configuration conf) throws Exception {
}
zout.close();
bak = System.out;
ByteArrayOutputStream out = new ByteArrayOutputStream();
System.setOut(new PrintStream(out));
@ -581,10 +583,28 @@ private void textTest(Path root, Configuration conf) throws Exception {
argv[1] = new Path(root, "file.gz").toString();
int ret = ToolRunner.run(new FsShell(conf), argv);
assertEquals("'-text " + argv[1] + " returned " + ret, 0, ret);
file.reset();
out.reset();
assertTrue("Output doesn't match input",
Arrays.equals(file.toByteArray(), out.toByteArray()));
// Create a sequence file with a gz extension, to test proper
// container detection
SequenceFile.Writer writer = SequenceFile.createWriter(
conf,
SequenceFile.Writer.file(new Path(root, "file.gz")),
SequenceFile.Writer.keyClass(Text.class),
SequenceFile.Writer.valueClass(Text.class));
writer.append(new Text("Foo"), new Text("Bar"));
writer.close();
out = new ByteArrayOutputStream();
System.setOut(new PrintStream(out));
argv = new String[2];
argv[0] = "-text";
argv[1] = new Path(root, "file.gz").toString();
ret = ToolRunner.run(new FsShell(conf), argv);
assertEquals("'-text " + argv[1] + " returned " + ret, 0, ret);
assertTrue("Output doesn't match input",
Arrays.equals("Foo\tBar\n".getBytes(), out.toByteArray()));
out.reset();
} finally {
if (null != bak) {
System.setOut(bak);