HADOOP-18029: Update CompressionCodecFactory to handle uppercase file extensions (#3739)

Co-authored-by: Desmond Sisson <sissonde@amazon.com>
This commit is contained in:
Desmond Sisson 2021-12-01 15:36:54 -08:00 committed by GitHub
parent 4dea4a7b67
commit df4197592f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 110 additions and 79 deletions

View File

@ -40,7 +40,7 @@ public class CompressionCodecFactory {
LoggerFactory.getLogger(CompressionCodecFactory.class.getName()); LoggerFactory.getLogger(CompressionCodecFactory.class.getName());
private static final ServiceLoader<CompressionCodec> CODEC_PROVIDERS = private static final ServiceLoader<CompressionCodec> CODEC_PROVIDERS =
ServiceLoader.load(CompressionCodec.class); ServiceLoader.load(CompressionCodec.class);
/** /**
* A map from the reversed filename suffixes to the codecs. * A map from the reversed filename suffixes to the codecs.
@ -49,15 +49,15 @@ public class CompressionCodecFactory {
*/ */
private SortedMap<String, CompressionCodec> codecs = null; private SortedMap<String, CompressionCodec> codecs = null;
/** /**
* A map from the reversed filename suffixes to the codecs. * A map from the reversed filename suffixes to the codecs.
* This is probably overkill, because the maps should be small, but it * This is probably overkill, because the maps should be small, but it
* automatically supports finding the longest matching suffix. * automatically supports finding the longest matching suffix.
*/ */
private Map<String, CompressionCodec> codecsByName = null; private Map<String, CompressionCodec> codecsByName = null;
/** /**
* A map from class names to the codecs * A map from class names to the codecs.
*/ */
private HashMap<String, CompressionCodec> codecsByClassName = null; private HashMap<String, CompressionCodec> codecsByClassName = null;
@ -80,8 +80,8 @@ public class CompressionCodecFactory {
@Override @Override
public String toString() { public String toString() {
StringBuilder buf = new StringBuilder(); StringBuilder buf = new StringBuilder();
Iterator<Map.Entry<String, CompressionCodec>> itr = Iterator<Map.Entry<String, CompressionCodec>> itr =
codecs.entrySet().iterator(); codecs.entrySet().iterator();
buf.append("{ "); buf.append("{ ");
if (itr.hasNext()) { if (itr.hasNext()) {
Map.Entry<String, CompressionCodec> entry = itr.next(); Map.Entry<String, CompressionCodec> entry = itr.next();
@ -110,8 +110,8 @@ public class CompressionCodecFactory {
*/ */
public static List<Class<? extends CompressionCodec>> getCodecClasses( public static List<Class<? extends CompressionCodec>> getCodecClasses(
Configuration conf) { Configuration conf) {
List<Class<? extends CompressionCodec>> result List<Class<? extends CompressionCodec>> result =
= new ArrayList<Class<? extends CompressionCodec>>(); new ArrayList<Class<? extends CompressionCodec>>();
// Add codec classes discovered via service loading // Add codec classes discovered via service loading
synchronized (CODEC_PROVIDERS) { synchronized (CODEC_PROVIDERS) {
// CODEC_PROVIDERS is a lazy collection. Synchronize so it is // CODEC_PROVIDERS is a lazy collection. Synchronize so it is
@ -200,11 +200,13 @@ public class CompressionCodecFactory {
String filename = file.getName(); String filename = file.getName();
String reversedFilename = String reversedFilename =
new StringBuilder(filename).reverse().toString(); new StringBuilder(filename).reverse().toString();
SortedMap<String, CompressionCodec> subMap = String lowerReversedFilename =
codecs.headMap(reversedFilename); StringUtils.toLowerCase(reversedFilename);
SortedMap<String, CompressionCodec> subMap =
codecs.headMap(lowerReversedFilename);
if (!subMap.isEmpty()) { if (!subMap.isEmpty()) {
String potentialSuffix = subMap.lastKey(); String potentialSuffix = subMap.lastKey();
if (reversedFilename.startsWith(potentialSuffix)) { if (lowerReversedFilename.startsWith(potentialSuffix)) {
result = codecs.get(potentialSuffix); result = codecs.get(potentialSuffix);
} }
} }
@ -224,57 +226,57 @@ public class CompressionCodecFactory {
return codecsByClassName.get(classname); return codecsByClassName.get(classname);
} }
/** /**
* Find the relevant compression codec for the codec's canonical class name * Find the relevant compression codec for the codec's canonical class name
* or by codec alias. * or by codec alias.
* <p> * <p>
* Codec aliases are case insensitive. * Codec aliases are case insensitive.
* <p> * <p>
* The code alias is the short class name (without the package name). * The code alias is the short class name (without the package name).
* If the short class name ends with 'Codec', then there are two aliases for * If the short class name ends with 'Codec', then there are two aliases for
* the codec, the complete short class name and the short class name without * the codec, the complete short class name and the short class name without
* the 'Codec' ending. For example for the 'GzipCodec' codec class name the * the 'Codec' ending. For example for the 'GzipCodec' codec class name the
* alias are 'gzip' and 'gzipcodec'. * alias are 'gzip' and 'gzipcodec'.
* *
* @param codecName the canonical class name of the codec * @param codecName the canonical class name of the codec
* @return the codec object * @return the codec object
*/ */
public CompressionCodec getCodecByName(String codecName) { public CompressionCodec getCodecByName(String codecName) {
if (codecsByClassName == null) { if (codecsByClassName == null) {
return null; return null;
}
CompressionCodec codec = getCodecByClassName(codecName);
if (codec == null) {
// trying to get the codec by name in case the name was specified
// instead a class
codec = codecsByName.get(StringUtils.toLowerCase(codecName));
}
return codec;
} }
CompressionCodec codec = getCodecByClassName(codecName);
if (codec == null) {
// trying to get the codec by name in case the name was specified
// instead a class
codec = codecsByName.get(StringUtils.toLowerCase(codecName));
}
return codec;
}
/** /**
* Find the relevant compression codec for the codec's canonical class name * Find the relevant compression codec for the codec's canonical class name
* or by codec alias and returns its implemetation class. * or by codec alias and returns its implemetation class.
* <p> * <p>
* Codec aliases are case insensitive. * Codec aliases are case insensitive.
* <p> * <p>
* The code alias is the short class name (without the package name). * The code alias is the short class name (without the package name).
* If the short class name ends with 'Codec', then there are two aliases for * If the short class name ends with 'Codec', then there are two aliases for
* the codec, the complete short class name and the short class name without * the codec, the complete short class name and the short class name without
* the 'Codec' ending. For example for the 'GzipCodec' codec class name the * the 'Codec' ending. For example for the 'GzipCodec' codec class name the
* alias are 'gzip' and 'gzipcodec'. * alias are 'gzip' and 'gzipcodec'.
* *
* @param codecName the canonical class name of the codec * @param codecName the canonical class name of the codec
* @return the codec class * @return the codec class
*/ */
public Class<? extends CompressionCodec> getCodecClassByName( public Class<? extends CompressionCodec> getCodecClassByName(
String codecName) { String codecName) {
CompressionCodec codec = getCodecByName(codecName); CompressionCodec codec = getCodecByName(codecName);
if (codec == null) { if (codec == null) {
return null; return null;
}
return codec.getClass();
} }
return codec.getClass();
}
/** /**
* Removes a suffix from a filename, if it has it. * Removes a suffix from a filename, if it has it.
@ -323,8 +325,12 @@ public class CompressionCodecFactory {
len = in.read(buffer); len = in.read(buffer);
} }
} finally { } finally {
if(out != null) { out.close(); } if(out != null) {
if(in != null) { in.close(); } out.close();
}
if(in != null) {
in.close();
}
} }
} else { } else {
CompressionInputStream in = null; CompressionInputStream in = null;
@ -338,7 +344,9 @@ public class CompressionCodecFactory {
len = in.read(buffer); len = in.read(buffer);
} }
} finally { } finally {
if(in != null) { in.close(); } if(in != null) {
in.close();
}
} }
} }
} }

View File

@ -29,6 +29,7 @@ import org.apache.hadoop.conf.Configuration;
import org.junit.Test; import org.junit.Test;
import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.fail; import static org.junit.Assert.fail;
public class TestCodecFactory { public class TestCodecFactory {
@ -45,8 +46,8 @@ public class TestCodecFactory {
} }
@Override @Override
public CompressionOutputStream createOutputStream(OutputStream out) public CompressionOutputStream createOutputStream(OutputStream out)
throws IOException { throws IOException {
return null; return null;
} }
@ -62,21 +63,21 @@ public class TestCodecFactory {
@Override @Override
public CompressionInputStream createInputStream(InputStream in, public CompressionInputStream createInputStream(InputStream in,
Decompressor decompressor) Decompressor decompressor)
throws IOException { throws IOException {
return null; return null;
} }
@Override @Override
public CompressionInputStream createInputStream(InputStream in) public CompressionInputStream createInputStream(InputStream in)
throws IOException { throws IOException {
return null; return null;
} }
@Override @Override
public CompressionOutputStream createOutputStream(OutputStream out, public CompressionOutputStream createOutputStream(OutputStream out,
Compressor compressor) Compressor compressor)
throws IOException { throws IOException {
return null; return null;
} }
@ -125,7 +126,7 @@ public class TestCodecFactory {
} }
/** /**
* Returns a factory for a given set of codecs * Returns a factory for a given set of codecs.
* @param classes the codec classes to include * @param classes the codec classes to include
* @return a new factory * @return a new factory
*/ */
@ -137,15 +138,21 @@ public class TestCodecFactory {
private static void checkCodec(String msg, private static void checkCodec(String msg,
Class expected, CompressionCodec actual) { Class expected, CompressionCodec actual) {
assertEquals(msg + " unexpected codec found", if (expected == null) {
expected.getName(), assertNull(msg, actual);
actual.getClass().getName()); } else if (actual == null) {
fail(msg + " result was null");
} else {
assertEquals(msg + " unexpected codec found",
expected.getName(),
actual.getClass().getName());
}
} }
@Test @Test
public void testFinding() { public void testFinding() {
CompressionCodecFactory factory = CompressionCodecFactory factory =
new CompressionCodecFactory(new Configuration()); new CompressionCodecFactory(new Configuration());
CompressionCodec codec = factory.getCodec(new Path("/tmp/foo.bar")); CompressionCodec codec = factory.getCodec(new Path("/tmp/foo.bar"));
assertEquals("default factory foo codec", null, codec); assertEquals("default factory foo codec", null, codec);
codec = factory.getCodecByClassName(BarCodec.class.getCanonicalName()); codec = factory.getCodecByClassName(BarCodec.class.getCanonicalName());
@ -153,6 +160,8 @@ public class TestCodecFactory {
codec = factory.getCodec(new Path("/tmp/foo.gz")); codec = factory.getCodec(new Path("/tmp/foo.gz"));
checkCodec("default factory for .gz", GzipCodec.class, codec); checkCodec("default factory for .gz", GzipCodec.class, codec);
codec = factory.getCodec(new Path("/tmp/foo.GZ"));
checkCodec("default factory for .GZ", GzipCodec.class, codec);
codec = factory.getCodecByClassName(GzipCodec.class.getCanonicalName()); codec = factory.getCodecByClassName(GzipCodec.class.getCanonicalName());
checkCodec("default factory for gzip codec", GzipCodec.class, codec); checkCodec("default factory for gzip codec", GzipCodec.class, codec);
codec = factory.getCodecByName("gzip"); codec = factory.getCodecByName("gzip");
@ -168,6 +177,8 @@ public class TestCodecFactory {
codec = factory.getCodec(new Path("/tmp/foo.bz2")); codec = factory.getCodec(new Path("/tmp/foo.bz2"));
checkCodec("default factory for .bz2", BZip2Codec.class, codec); checkCodec("default factory for .bz2", BZip2Codec.class, codec);
codec = factory.getCodec(new Path("/tmp/foo.BZ2"));
checkCodec("default factory for .BZ2", BZip2Codec.class, codec);
codec = factory.getCodecByClassName(BZip2Codec.class.getCanonicalName()); codec = factory.getCodecByClassName(BZip2Codec.class.getCanonicalName());
checkCodec("default factory for bzip2 codec", BZip2Codec.class, codec); checkCodec("default factory for bzip2 codec", BZip2Codec.class, codec);
codec = factory.getCodecByName("bzip2"); codec = factory.getCodecByName("bzip2");
@ -221,16 +232,22 @@ public class TestCodecFactory {
FooBarCodec.class}); FooBarCodec.class});
codec = factory.getCodec(new Path("/tmp/.foo.bar.gz")); codec = factory.getCodec(new Path("/tmp/.foo.bar.gz"));
checkCodec("full factory gz codec", GzipCodec.class, codec); checkCodec("full factory gz codec", GzipCodec.class, codec);
codec = factory.getCodec(new Path("/tmp/.foo.bar.GZ"));
checkCodec("full factory GZ codec", GzipCodec.class, codec);
codec = factory.getCodecByClassName(GzipCodec.class.getCanonicalName()); codec = factory.getCodecByClassName(GzipCodec.class.getCanonicalName());
checkCodec("full codec gz codec", GzipCodec.class, codec); checkCodec("full codec gz codec", GzipCodec.class, codec);
codec = factory.getCodec(new Path("/tmp/foo.bz2")); codec = factory.getCodec(new Path("/tmp/foo.bz2"));
checkCodec("full factory for .bz2", BZip2Codec.class, codec); checkCodec("full factory for .bz2", BZip2Codec.class, codec);
codec = factory.getCodec(new Path("/tmp/foo.BZ2"));
checkCodec("full factory for .BZ2", BZip2Codec.class, codec);
codec = factory.getCodecByClassName(BZip2Codec.class.getCanonicalName()); codec = factory.getCodecByClassName(BZip2Codec.class.getCanonicalName());
checkCodec("full codec bzip2 codec", BZip2Codec.class, codec); checkCodec("full codec bzip2 codec", BZip2Codec.class, codec);
codec = factory.getCodec(new Path("/tmp/foo.bar")); codec = factory.getCodec(new Path("/tmp/foo.bar"));
checkCodec("full factory bar codec", BarCodec.class, codec); checkCodec("full factory bar codec", BarCodec.class, codec);
codec = factory.getCodec(new Path("/tmp/foo.BAR"));
checkCodec("full factory BAR codec", BarCodec.class, codec);
codec = factory.getCodecByClassName(BarCodec.class.getCanonicalName()); codec = factory.getCodecByClassName(BarCodec.class.getCanonicalName());
checkCodec("full factory bar codec", BarCodec.class, codec); checkCodec("full factory bar codec", BarCodec.class, codec);
codec = factory.getCodecByName("bar"); codec = factory.getCodecByName("bar");
@ -240,6 +257,8 @@ public class TestCodecFactory {
codec = factory.getCodec(new Path("/tmp/foo/baz.foo.bar")); codec = factory.getCodec(new Path("/tmp/foo/baz.foo.bar"));
checkCodec("full factory foo bar codec", FooBarCodec.class, codec); checkCodec("full factory foo bar codec", FooBarCodec.class, codec);
codec = factory.getCodec(new Path("/tmp/foo/baz.FOO.bar"));
checkCodec("full factory FOO bar codec", FooBarCodec.class, codec);
codec = factory.getCodecByClassName(FooBarCodec.class.getCanonicalName()); codec = factory.getCodecByClassName(FooBarCodec.class.getCanonicalName());
checkCodec("full factory foo bar codec", FooBarCodec.class, codec); checkCodec("full factory foo bar codec", FooBarCodec.class, codec);
codec = factory.getCodecByName("foobar"); codec = factory.getCodecByName("foobar");
@ -249,6 +268,8 @@ public class TestCodecFactory {
codec = factory.getCodec(new Path("/tmp/foo.foo")); codec = factory.getCodec(new Path("/tmp/foo.foo"));
checkCodec("full factory foo codec", FooCodec.class, codec); checkCodec("full factory foo codec", FooCodec.class, codec);
codec = factory.getCodec(new Path("/tmp/FOO.FOO"));
checkCodec("full factory FOO codec", FooCodec.class, codec);
codec = factory.getCodecByClassName(FooCodec.class.getCanonicalName()); codec = factory.getCodecByClassName(FooCodec.class.getCanonicalName());
checkCodec("full factory foo codec", FooCodec.class, codec); checkCodec("full factory foo codec", FooCodec.class, codec);
codec = factory.getCodecByName("foo"); codec = factory.getCodecByName("foo");
@ -259,6 +280,8 @@ public class TestCodecFactory {
factory = setClasses(new Class[]{NewGzipCodec.class}); factory = setClasses(new Class[]{NewGzipCodec.class});
codec = factory.getCodec(new Path("/tmp/foo.gz")); codec = factory.getCodec(new Path("/tmp/foo.gz"));
checkCodec("overridden factory for .gz", NewGzipCodec.class, codec); checkCodec("overridden factory for .gz", NewGzipCodec.class, codec);
codec = factory.getCodec(new Path("/tmp/foo.GZ"));
checkCodec("overridden factory for .GZ", NewGzipCodec.class, codec);
codec = factory.getCodecByClassName(NewGzipCodec.class.getCanonicalName()); codec = factory.getCodecByClassName(NewGzipCodec.class.getCanonicalName());
checkCodec("overridden factory for gzip codec", NewGzipCodec.class, codec); checkCodec("overridden factory for gzip codec", NewGzipCodec.class, codec);