HADOOP-8449. hadoop fs -text fails with compressed sequence files with the codec file extension (backported from trunk) (harsh)

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-2@1355637 13f79535-47bb-0310-9956-ffa450edef68
Harsh J 2012-06-30 05:05:41 +00:00
parent b51d32719d
commit 6f299e4e25
3 changed files with 42 additions and 12 deletions
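In short: the -text handler in Display picked a decompression codec purely from the file extension, so a SequenceFile that happened to be named with a codec extension such as .gz was handed to the gzip codec and the command failed. The patch below flips the order: the stream's leading magic bytes (gzip's 0x1f8b, SequenceFile's 'SEQ') are checked first, and extension-based codec lookup becomes the fallback.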

hadoop-common-project/hadoop-common/CHANGES.txt

@@ -47,6 +47,9 @@ Release 2.0.1-alpha - UNRELEASED
     HADOOP-8524. Allow users to get source of a Configuration
     parameter (harsh)
 
+    HADOOP-8449. hadoop fs -text fails with compressed sequence files
+    with the codec file extension (harsh)
+
   BUG FIXES
 
     HADOOP-8372. NetUtils.normalizeHostName() incorrectly handles hostname

hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/shell/Display.java

@@ -109,26 +109,33 @@ public static class Text extends Cat {
     protected InputStream getInputStream(PathData item) throws IOException {
       FSDataInputStream i = (FSDataInputStream)super.getInputStream(item);
 
-      // check codecs
+      // Check type of stream first
+      switch(i.readShort()) {
+        case 0x1f8b: { // RFC 1952
+          // Must be gzip
+          i.seek(0);
+          return new GZIPInputStream(i);
+        }
+        case 0x5345: { // 'S' 'E'
+          // Might be a SequenceFile
+          if (i.readByte() == 'Q') {
+            i.close();
+            return new TextRecordInputStream(item.stat);
+          }
+        }
+        default: {
+          // Check the type of compression instead, depending on Codec class's
+          // own detection methods, based on the provided path.
           CompressionCodecFactory cf = new CompressionCodecFactory(getConf());
           CompressionCodec codec = cf.getCodec(item.path);
           if (codec != null) {
            return codec.createInputStream(i);
           }
-      switch(i.readShort()) {
-        case 0x1f8b: { // RFC 1952
-          i.seek(0);
-          return new GZIPInputStream(i);
-        }
-        case 0x5345: { // 'S' 'E'
-          if (i.readByte() == 'Q') {
-            i.close();
-            return new TextRecordInputStream(item.stat);
-          }
           break;
         }
       }
 
+      // File is non-compressed, or not a file container we know.
       i.seek(0);
       return i;
     }
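The crux of the fix is the new detection order: the stream's leading bytes decide the format, and the extension-based codec lookup only runs in the default case. Below is a minimal standalone sketch of the same sniffing idea using plain java.io; the class and helper names (FormatSniffer, sniffFormat) are made up for illustration and this is not the patched Hadoop code itself:

import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.IOException;

public class FormatSniffer {
  // Classify a file by its leading bytes, in the same order the patched
  // getInputStream() uses: container magic first, extension checks last.
  static String sniffFormat(String path) throws IOException {
    try (DataInputStream in = new DataInputStream(new FileInputStream(path))) {
      int magic = in.readUnsignedShort();        // first two bytes, big-endian
      if (magic == 0x1f8b) {                     // RFC 1952 gzip magic
        return "gzip";
      }
      if (magic == 0x5345 && in.read() == 'Q') { // 'S' 'E' 'Q' header
        return "sequencefile";
      }
      return "unknown"; // only now would an extension-based codec lookup run
    }
  }

  public static void main(String[] args) throws IOException {
    System.out.println(sniffFormat(args[0]));
  }
}

With this order, a SequenceFile named file.gz is recognized as a SequenceFile; under the old order, the ".gz" extension alone routed it to the gzip codec.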

hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDFSShell.java

@@ -48,6 +48,8 @@
 import org.apache.hadoop.hdfs.server.datanode.DataNodeTestUtils;
 import org.apache.hadoop.hdfs.tools.DFSAdmin;
 import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.io.SequenceFile;
+import org.apache.hadoop.io.Text;
 import org.apache.hadoop.security.UserGroupInformation;
 import org.apache.hadoop.util.StringUtils;
 import org.apache.hadoop.util.ToolRunner;
@@ -545,7 +547,7 @@ public void testText() throws Exception {
     textTest(new Path("/texttest").makeQualified(dfs.getUri(),
         dfs.getWorkingDirectory()), conf);
 
-    conf.set("fs.default.name", dfs.getUri().toString());
+    conf.set("fs.defaultFS", dfs.getUri().toString());
     final FileSystem lfs = FileSystem.getLocal(conf);
     textTest(new Path(TEST_ROOT_DIR, "texttest").makeQualified(lfs.getUri(),
         lfs.getWorkingDirectory()), conf);
@@ -564,6 +566,7 @@ private void textTest(Path root, Configuration conf) throws Exception {
       OutputStream zout = new GZIPOutputStream(
           fs.create(new Path(root, "file.gz")));
       Random r = new Random();
+      bak = System.out;
       ByteArrayOutputStream file = new ByteArrayOutputStream();
       for (int i = 0; i < 1024; ++i) {
         char c = Character.forDigit(r.nextInt(26) + 10, 36);
@@ -572,7 +575,6 @@ private void textTest(Path root, Configuration conf) throws Exception {
       }
       zout.close();
 
-      bak = System.out;
       ByteArrayOutputStream out = new ByteArrayOutputStream();
       System.setOut(new PrintStream(out));
@@ -581,10 +583,28 @@ private void textTest(Path root, Configuration conf) throws Exception {
       argv[1] = new Path(root, "file.gz").toString();
       int ret = ToolRunner.run(new FsShell(conf), argv);
       assertEquals("'-text " + argv[1] + " returned " + ret, 0, ret);
-      file.reset();
-      out.reset();
       assertTrue("Output doesn't match input",
           Arrays.equals(file.toByteArray(), out.toByteArray()));
+
+      // Create a sequence file with a gz extension, to test proper
+      // container detection
+      SequenceFile.Writer writer = SequenceFile.createWriter(
+          conf,
+          SequenceFile.Writer.file(new Path(root, "file.gz")),
+          SequenceFile.Writer.keyClass(Text.class),
+          SequenceFile.Writer.valueClass(Text.class));
+      writer.append(new Text("Foo"), new Text("Bar"));
+      writer.close();
+      out = new ByteArrayOutputStream();
+      System.setOut(new PrintStream(out));
+      argv = new String[2];
+      argv[0] = "-text";
+      argv[1] = new Path(root, "file.gz").toString();
+      ret = ToolRunner.run(new FsShell(conf), argv);
+      assertEquals("'-text " + argv[1] + " returned " + ret, 0, ret);
+      assertTrue("Output doesn't match input",
+          Arrays.equals("Foo\tBar\n".getBytes(), out.toByteArray()));
+      out.reset();
     } finally {
       if (null != bak) {
         System.setOut(bak);
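With the patch, the second half of this test passes: -text on a SequenceFile deliberately named file.gz prints its single record as "Foo\tBar" instead of failing with a gzip decompression error. Dropping the old file.reset()/out.reset() calls also matters: resetting both buffers right before the Arrays.equals check had left the gzip assertion comparing two empty arrays, so it could never fail.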