From 061c05cc05ff6257b14c5c4f25cbcec2d184cda7 Mon Sep 17 00:00:00 2001 From: Akira Ajisaka Date: Fri, 18 Dec 2015 13:58:28 +0900 Subject: [PATCH] HADOOP-12657. Add an option to skip newline on empty files with getMerge -nl. Contributed by Kanaka Kumar Avvaru. --- .../hadoop-common/CHANGES.txt | 3 ++ .../apache/hadoop/fs/shell/CopyCommands.java | 40 ++++++++++++------- .../src/site/markdown/FileSystemShell.md | 1 + .../org/apache/hadoop/fs/TestFsShellCopy.java | 24 +++++++++-- .../src/test/resources/testConf.xml | 7 +++- 5 files changed, 55 insertions(+), 20 deletions(-) diff --git a/hadoop-common-project/hadoop-common/CHANGES.txt b/hadoop-common-project/hadoop-common/CHANGES.txt index 9bd4d6e17ab..6263f74cc95 100644 --- a/hadoop-common-project/hadoop-common/CHANGES.txt +++ b/hadoop-common-project/hadoop-common/CHANGES.txt @@ -688,6 +688,9 @@ Release 2.8.0 - UNRELEASED HADOOP-10300. Allowed deferred sending of call responses. (Daryn Sharp via yliu) + HADOOP-12657. Add an option to skip newline on empty files with getMerge -nl. + (Kanaka Kumar Avvaru via aajisaka) + IMPROVEMENTS HADOOP-12458. 
Retries is typoed to spell Retires in parts of diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/shell/CopyCommands.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/shell/CopyCommands.java index c4e42c94b93..e2fad7560e3 100644 --- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/shell/CopyCommands.java +++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/fs/shell/CopyCommands.java @@ -53,24 +53,29 @@ class CopyCommands { /** merge multiple files together */ public static class Merge extends FsCommand { public static final String NAME = "getmerge"; - public static final String USAGE = "[-nl] "; + public static final String USAGE = "[-nl] [-skip-empty-file] " + + " "; public static final String DESCRIPTION = - "Get all the files in the directories that " + - "match the source file pattern and merge and sort them to only " + - "one file on local fs. is kept.\n" + - "-nl: Add a newline character at the end of each file."; + "Get all the files in the directories that " + + "match the source file pattern and merge and sort them to only " + + "one file on local fs. is kept.\n" + + "-nl: Add a newline character at the end of each file.\n" + + "-skip-empty-file: Do not add new line character for empty file."; protected PathData dst = null; protected String delimiter = null; + private boolean skipEmptyFileDelimiter; protected List srcs = null; @Override protected void processOptions(LinkedList args) throws IOException { try { - CommandFormat cf = new CommandFormat(2, Integer.MAX_VALUE, "nl"); + CommandFormat cf = new CommandFormat(2, Integer.MAX_VALUE, "nl", + "skip-empty-file"); cf.parse(args); delimiter = cf.getOpt("nl") ? 
"\n" : null; + skipEmptyFileDelimiter = cf.getOpt("skip-empty-file"); dst = new PathData(new URI(args.removeLast()), getConf()); if (dst.exists && dst.stat.isDirectory()) { @@ -92,21 +97,26 @@ class CopyCommands { FSDataOutputStream out = dst.fs.create(dst.path); try { for (PathData src : srcs) { - FSDataInputStream in = src.fs.open(src.path); - try { - IOUtils.copyBytes(in, out, getConf(), false); - if (delimiter != null) { - out.write(delimiter.getBytes("UTF-8")); + if (src.stat.getLen() != 0) { + try (FSDataInputStream in = src.fs.open(src.path)) { + IOUtils.copyBytes(in, out, getConf(), false); + writeDelimiter(out); } - } finally { - in.close(); + } else if (!skipEmptyFileDelimiter) { + writeDelimiter(out); } } } finally { out.close(); - } + } } - + + private void writeDelimiter(FSDataOutputStream out) throws IOException { + if (delimiter != null) { + out.write(delimiter.getBytes("UTF-8")); + } + } + @Override protected void processNonexistentPath(PathData item) throws IOException { exitCode = 1; // flag that a path is bad diff --git a/hadoop-common-project/hadoop-common/src/site/markdown/FileSystemShell.md b/hadoop-common-project/hadoop-common/src/site/markdown/FileSystemShell.md index e243deafbbe..d32156e6cd1 100644 --- a/hadoop-common-project/hadoop-common/src/site/markdown/FileSystemShell.md +++ b/hadoop-common-project/hadoop-common/src/site/markdown/FileSystemShell.md @@ -375,6 +375,7 @@ getmerge Usage: `hadoop fs -getmerge [-nl] ` Takes a source directory and a destination file as input and concatenates files in src into the destination local file. Optionally -nl can be set to enable adding a newline character (LF) at the end of each file. +-skip-empty-file can be used to avoid unwanted newline characters in case of empty files. 
Examples: diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/TestFsShellCopy.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/TestFsShellCopy.java index 1d626f98030..6b5de745c0a 100644 --- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/TestFsShellCopy.java +++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/fs/TestFsShellCopy.java @@ -318,6 +318,7 @@ public class TestFsShellCopy { Path f1 = new Path(root, "f1"); Path f2 = new Path(root, "f2"); Path f3 = new Path(root, "f3"); + Path empty = new Path(root, "empty"); Path fnf = new Path(root, "fnf"); Path d = new Path(root, "dir"); Path df1 = new Path(d, "df1"); @@ -325,7 +326,8 @@ public class TestFsShellCopy { Path df3 = new Path(d, "df3"); createFile(f1, f2, f3, df1, df2, df3); - + createEmptyFile(empty); + int exit; // one file, kind of silly exit = shell.run(new String[]{ @@ -366,6 +368,13 @@ public class TestFsShellCopy { assertEquals(0, exit); assertEquals("f1\nf2\n", readFile("out")); + exit = shell.run(new String[]{ + "-getmerge", "-nl", "-skip-empty-file", + f1.toString(), f2.toString(), empty.toString(), + "out" }); + assertEquals(0, exit); + assertEquals("f1\nf2\n", readFile("out")); + // glob three files shell.run(new String[]{ "-getmerge", "-nl", @@ -374,13 +383,13 @@ public class TestFsShellCopy { assertEquals(0, exit); assertEquals("f1\nf2\nf3\n", readFile("out")); - // directory with 3 files, should skip subdir + // directory with 1 empty + 3 non empty files, should skip subdir shell.run(new String[]{ "-getmerge", "-nl", root.toString(), "out" }); assertEquals(0, exit); - assertEquals("f1\nf2\nf3\n", readFile("out")); + assertEquals("\nf1\nf2\nf3\n", readFile("out")); // subdir shell.run(new String[]{ @@ -538,7 +547,14 @@ public class TestFsShellCopy { out.close(); } } - + + private void createEmptyFile(Path ... 
paths) throws IOException { + for (Path path : paths) { + FSDataOutputStream out = lfs.create(path); + out.close(); + } + } + private String readFile(String out) throws IOException { Path path = new Path(out); FileStatus stat = lfs.getFileStatus(path); diff --git a/hadoop-common-project/hadoop-common/src/test/resources/testConf.xml b/hadoop-common-project/hadoop-common/src/test/resources/testConf.xml index fd71034a040..79ab282c4a5 100644 --- a/hadoop-common-project/hadoop-common/src/test/resources/testConf.xml +++ b/hadoop-common-project/hadoop-common/src/test/resources/testConf.xml @@ -601,7 +601,7 @@ RegexpComparator - ^-getmerge \[-nl\] <src> <localdst> :\s* + ^-getmerge \[-nl\] \[-skip-empty-file\] <src> <localdst> :\s* RegexpComparator @@ -615,6 +615,11 @@ RegexpComparator ^( |\t)*-nl\s+Add a newline character at the end of each file.( )* + + RegexpComparator + ^( |\t)*-skip-empty-file\s+Do not add new line character for empty file.( )* + +