HADOOP-18029: Update CompressionCodecFactory to handle uppercase file extensions (#3739)

Co-authored-by: Desmond Sisson <sissonde@amazon.com>
This commit is contained in:
Desmond Sisson 2021-12-01 15:36:54 -08:00 committed by GitHub
parent 4dea4a7b67
commit df4197592f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 110 additions and 79 deletions

View File

@ -57,7 +57,7 @@ public class CompressionCodecFactory {
private Map<String, CompressionCodec> codecsByName = null; private Map<String, CompressionCodec> codecsByName = null;
/** /**
* A map from class names to the codecs * A map from class names to the codecs.
*/ */
private HashMap<String, CompressionCodec> codecsByClassName = null; private HashMap<String, CompressionCodec> codecsByClassName = null;
@ -110,8 +110,8 @@ public class CompressionCodecFactory {
*/ */
public static List<Class<? extends CompressionCodec>> getCodecClasses( public static List<Class<? extends CompressionCodec>> getCodecClasses(
Configuration conf) { Configuration conf) {
List<Class<? extends CompressionCodec>> result List<Class<? extends CompressionCodec>> result =
= new ArrayList<Class<? extends CompressionCodec>>(); new ArrayList<Class<? extends CompressionCodec>>();
// Add codec classes discovered via service loading // Add codec classes discovered via service loading
synchronized (CODEC_PROVIDERS) { synchronized (CODEC_PROVIDERS) {
// CODEC_PROVIDERS is a lazy collection. Synchronize so it is // CODEC_PROVIDERS is a lazy collection. Synchronize so it is
@ -200,11 +200,13 @@ public class CompressionCodecFactory {
String filename = file.getName(); String filename = file.getName();
String reversedFilename = String reversedFilename =
new StringBuilder(filename).reverse().toString(); new StringBuilder(filename).reverse().toString();
String lowerReversedFilename =
StringUtils.toLowerCase(reversedFilename);
SortedMap<String, CompressionCodec> subMap = SortedMap<String, CompressionCodec> subMap =
codecs.headMap(reversedFilename); codecs.headMap(lowerReversedFilename);
if (!subMap.isEmpty()) { if (!subMap.isEmpty()) {
String potentialSuffix = subMap.lastKey(); String potentialSuffix = subMap.lastKey();
if (reversedFilename.startsWith(potentialSuffix)) { if (lowerReversedFilename.startsWith(potentialSuffix)) {
result = codecs.get(potentialSuffix); result = codecs.get(potentialSuffix);
} }
} }
@ -323,8 +325,12 @@ public class CompressionCodecFactory {
len = in.read(buffer); len = in.read(buffer);
} }
} finally { } finally {
if(out != null) { out.close(); } if(out != null) {
if(in != null) { in.close(); } out.close();
}
if(in != null) {
in.close();
}
} }
} else { } else {
CompressionInputStream in = null; CompressionInputStream in = null;
@ -338,7 +344,9 @@ public class CompressionCodecFactory {
len = in.read(buffer); len = in.read(buffer);
} }
} finally { } finally {
if(in != null) { in.close(); } if(in != null) {
in.close();
}
} }
} }
} }

View File

@ -29,6 +29,7 @@ import org.apache.hadoop.conf.Configuration;
import org.junit.Test; import org.junit.Test;
import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.fail; import static org.junit.Assert.fail;
public class TestCodecFactory { public class TestCodecFactory {
@ -125,7 +126,7 @@ public class TestCodecFactory {
} }
/** /**
* Returns a factory for a given set of codecs * Returns a factory for a given set of codecs.
* @param classes the codec classes to include * @param classes the codec classes to include
* @return a new factory * @return a new factory
*/ */
@ -137,10 +138,16 @@ public class TestCodecFactory {
private static void checkCodec(String msg, private static void checkCodec(String msg,
Class expected, CompressionCodec actual) { Class expected, CompressionCodec actual) {
if (expected == null) {
assertNull(msg, actual);
} else if (actual == null) {
fail(msg + " result was null");
} else {
assertEquals(msg + " unexpected codec found", assertEquals(msg + " unexpected codec found",
expected.getName(), expected.getName(),
actual.getClass().getName()); actual.getClass().getName());
} }
}
@Test @Test
public void testFinding() { public void testFinding() {
@ -153,6 +160,8 @@ public class TestCodecFactory {
codec = factory.getCodec(new Path("/tmp/foo.gz")); codec = factory.getCodec(new Path("/tmp/foo.gz"));
checkCodec("default factory for .gz", GzipCodec.class, codec); checkCodec("default factory for .gz", GzipCodec.class, codec);
codec = factory.getCodec(new Path("/tmp/foo.GZ"));
checkCodec("default factory for .GZ", GzipCodec.class, codec);
codec = factory.getCodecByClassName(GzipCodec.class.getCanonicalName()); codec = factory.getCodecByClassName(GzipCodec.class.getCanonicalName());
checkCodec("default factory for gzip codec", GzipCodec.class, codec); checkCodec("default factory for gzip codec", GzipCodec.class, codec);
codec = factory.getCodecByName("gzip"); codec = factory.getCodecByName("gzip");
@ -168,6 +177,8 @@ public class TestCodecFactory {
codec = factory.getCodec(new Path("/tmp/foo.bz2")); codec = factory.getCodec(new Path("/tmp/foo.bz2"));
checkCodec("default factory for .bz2", BZip2Codec.class, codec); checkCodec("default factory for .bz2", BZip2Codec.class, codec);
codec = factory.getCodec(new Path("/tmp/foo.BZ2"));
checkCodec("default factory for .BZ2", BZip2Codec.class, codec);
codec = factory.getCodecByClassName(BZip2Codec.class.getCanonicalName()); codec = factory.getCodecByClassName(BZip2Codec.class.getCanonicalName());
checkCodec("default factory for bzip2 codec", BZip2Codec.class, codec); checkCodec("default factory for bzip2 codec", BZip2Codec.class, codec);
codec = factory.getCodecByName("bzip2"); codec = factory.getCodecByName("bzip2");
@ -221,16 +232,22 @@ public class TestCodecFactory {
FooBarCodec.class}); FooBarCodec.class});
codec = factory.getCodec(new Path("/tmp/.foo.bar.gz")); codec = factory.getCodec(new Path("/tmp/.foo.bar.gz"));
checkCodec("full factory gz codec", GzipCodec.class, codec); checkCodec("full factory gz codec", GzipCodec.class, codec);
codec = factory.getCodec(new Path("/tmp/.foo.bar.GZ"));
checkCodec("full factory GZ codec", GzipCodec.class, codec);
codec = factory.getCodecByClassName(GzipCodec.class.getCanonicalName()); codec = factory.getCodecByClassName(GzipCodec.class.getCanonicalName());
checkCodec("full codec gz codec", GzipCodec.class, codec); checkCodec("full codec gz codec", GzipCodec.class, codec);
codec = factory.getCodec(new Path("/tmp/foo.bz2")); codec = factory.getCodec(new Path("/tmp/foo.bz2"));
checkCodec("full factory for .bz2", BZip2Codec.class, codec); checkCodec("full factory for .bz2", BZip2Codec.class, codec);
codec = factory.getCodec(new Path("/tmp/foo.BZ2"));
checkCodec("full factory for .BZ2", BZip2Codec.class, codec);
codec = factory.getCodecByClassName(BZip2Codec.class.getCanonicalName()); codec = factory.getCodecByClassName(BZip2Codec.class.getCanonicalName());
checkCodec("full codec bzip2 codec", BZip2Codec.class, codec); checkCodec("full codec bzip2 codec", BZip2Codec.class, codec);
codec = factory.getCodec(new Path("/tmp/foo.bar")); codec = factory.getCodec(new Path("/tmp/foo.bar"));
checkCodec("full factory bar codec", BarCodec.class, codec); checkCodec("full factory bar codec", BarCodec.class, codec);
codec = factory.getCodec(new Path("/tmp/foo.BAR"));
checkCodec("full factory BAR codec", BarCodec.class, codec);
codec = factory.getCodecByClassName(BarCodec.class.getCanonicalName()); codec = factory.getCodecByClassName(BarCodec.class.getCanonicalName());
checkCodec("full factory bar codec", BarCodec.class, codec); checkCodec("full factory bar codec", BarCodec.class, codec);
codec = factory.getCodecByName("bar"); codec = factory.getCodecByName("bar");
@ -240,6 +257,8 @@ public class TestCodecFactory {
codec = factory.getCodec(new Path("/tmp/foo/baz.foo.bar")); codec = factory.getCodec(new Path("/tmp/foo/baz.foo.bar"));
checkCodec("full factory foo bar codec", FooBarCodec.class, codec); checkCodec("full factory foo bar codec", FooBarCodec.class, codec);
codec = factory.getCodec(new Path("/tmp/foo/baz.FOO.bar"));
checkCodec("full factory FOO bar codec", FooBarCodec.class, codec);
codec = factory.getCodecByClassName(FooBarCodec.class.getCanonicalName()); codec = factory.getCodecByClassName(FooBarCodec.class.getCanonicalName());
checkCodec("full factory foo bar codec", FooBarCodec.class, codec); checkCodec("full factory foo bar codec", FooBarCodec.class, codec);
codec = factory.getCodecByName("foobar"); codec = factory.getCodecByName("foobar");
@ -249,6 +268,8 @@ public class TestCodecFactory {
codec = factory.getCodec(new Path("/tmp/foo.foo")); codec = factory.getCodec(new Path("/tmp/foo.foo"));
checkCodec("full factory foo codec", FooCodec.class, codec); checkCodec("full factory foo codec", FooCodec.class, codec);
codec = factory.getCodec(new Path("/tmp/FOO.FOO"));
checkCodec("full factory FOO codec", FooCodec.class, codec);
codec = factory.getCodecByClassName(FooCodec.class.getCanonicalName()); codec = factory.getCodecByClassName(FooCodec.class.getCanonicalName());
checkCodec("full factory foo codec", FooCodec.class, codec); checkCodec("full factory foo codec", FooCodec.class, codec);
codec = factory.getCodecByName("foo"); codec = factory.getCodecByName("foo");
@ -259,6 +280,8 @@ public class TestCodecFactory {
factory = setClasses(new Class[]{NewGzipCodec.class}); factory = setClasses(new Class[]{NewGzipCodec.class});
codec = factory.getCodec(new Path("/tmp/foo.gz")); codec = factory.getCodec(new Path("/tmp/foo.gz"));
checkCodec("overridden factory for .gz", NewGzipCodec.class, codec); checkCodec("overridden factory for .gz", NewGzipCodec.class, codec);
codec = factory.getCodec(new Path("/tmp/foo.GZ"));
checkCodec("overridden factory for .GZ", NewGzipCodec.class, codec);
codec = factory.getCodecByClassName(NewGzipCodec.class.getCanonicalName()); codec = factory.getCodecByClassName(NewGzipCodec.class.getCanonicalName());
checkCodec("overridden factory for gzip codec", NewGzipCodec.class, codec); checkCodec("overridden factory for gzip codec", NewGzipCodec.class, codec);