HADOOP-18029: Update CompressionCodecFactory to handle uppercase file extensions (#3739)
Co-authored-by: Desmond Sisson <sissonde@amazon.com>
This commit is contained in:
parent
4dea4a7b67
commit
df4197592f
|
@ -40,7 +40,7 @@ public class CompressionCodecFactory {
|
||||||
LoggerFactory.getLogger(CompressionCodecFactory.class.getName());
|
LoggerFactory.getLogger(CompressionCodecFactory.class.getName());
|
||||||
|
|
||||||
private static final ServiceLoader<CompressionCodec> CODEC_PROVIDERS =
|
private static final ServiceLoader<CompressionCodec> CODEC_PROVIDERS =
|
||||||
ServiceLoader.load(CompressionCodec.class);
|
ServiceLoader.load(CompressionCodec.class);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A map from the reversed filename suffixes to the codecs.
|
* A map from the reversed filename suffixes to the codecs.
|
||||||
|
@ -49,15 +49,15 @@ public class CompressionCodecFactory {
|
||||||
*/
|
*/
|
||||||
private SortedMap<String, CompressionCodec> codecs = null;
|
private SortedMap<String, CompressionCodec> codecs = null;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A map from the reversed filename suffixes to the codecs.
|
* A map from the reversed filename suffixes to the codecs.
|
||||||
* This is probably overkill, because the maps should be small, but it
|
* This is probably overkill, because the maps should be small, but it
|
||||||
* automatically supports finding the longest matching suffix.
|
* automatically supports finding the longest matching suffix.
|
||||||
*/
|
*/
|
||||||
private Map<String, CompressionCodec> codecsByName = null;
|
private Map<String, CompressionCodec> codecsByName = null;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A map from class names to the codecs
|
* A map from class names to the codecs.
|
||||||
*/
|
*/
|
||||||
private HashMap<String, CompressionCodec> codecsByClassName = null;
|
private HashMap<String, CompressionCodec> codecsByClassName = null;
|
||||||
|
|
||||||
|
@ -80,8 +80,8 @@ public class CompressionCodecFactory {
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
StringBuilder buf = new StringBuilder();
|
StringBuilder buf = new StringBuilder();
|
||||||
Iterator<Map.Entry<String, CompressionCodec>> itr =
|
Iterator<Map.Entry<String, CompressionCodec>> itr =
|
||||||
codecs.entrySet().iterator();
|
codecs.entrySet().iterator();
|
||||||
buf.append("{ ");
|
buf.append("{ ");
|
||||||
if (itr.hasNext()) {
|
if (itr.hasNext()) {
|
||||||
Map.Entry<String, CompressionCodec> entry = itr.next();
|
Map.Entry<String, CompressionCodec> entry = itr.next();
|
||||||
|
@ -110,8 +110,8 @@ public class CompressionCodecFactory {
|
||||||
*/
|
*/
|
||||||
public static List<Class<? extends CompressionCodec>> getCodecClasses(
|
public static List<Class<? extends CompressionCodec>> getCodecClasses(
|
||||||
Configuration conf) {
|
Configuration conf) {
|
||||||
List<Class<? extends CompressionCodec>> result
|
List<Class<? extends CompressionCodec>> result =
|
||||||
= new ArrayList<Class<? extends CompressionCodec>>();
|
new ArrayList<Class<? extends CompressionCodec>>();
|
||||||
// Add codec classes discovered via service loading
|
// Add codec classes discovered via service loading
|
||||||
synchronized (CODEC_PROVIDERS) {
|
synchronized (CODEC_PROVIDERS) {
|
||||||
// CODEC_PROVIDERS is a lazy collection. Synchronize so it is
|
// CODEC_PROVIDERS is a lazy collection. Synchronize so it is
|
||||||
|
@ -200,11 +200,13 @@ public class CompressionCodecFactory {
|
||||||
String filename = file.getName();
|
String filename = file.getName();
|
||||||
String reversedFilename =
|
String reversedFilename =
|
||||||
new StringBuilder(filename).reverse().toString();
|
new StringBuilder(filename).reverse().toString();
|
||||||
SortedMap<String, CompressionCodec> subMap =
|
String lowerReversedFilename =
|
||||||
codecs.headMap(reversedFilename);
|
StringUtils.toLowerCase(reversedFilename);
|
||||||
|
SortedMap<String, CompressionCodec> subMap =
|
||||||
|
codecs.headMap(lowerReversedFilename);
|
||||||
if (!subMap.isEmpty()) {
|
if (!subMap.isEmpty()) {
|
||||||
String potentialSuffix = subMap.lastKey();
|
String potentialSuffix = subMap.lastKey();
|
||||||
if (reversedFilename.startsWith(potentialSuffix)) {
|
if (lowerReversedFilename.startsWith(potentialSuffix)) {
|
||||||
result = codecs.get(potentialSuffix);
|
result = codecs.get(potentialSuffix);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -224,57 +226,57 @@ public class CompressionCodecFactory {
|
||||||
return codecsByClassName.get(classname);
|
return codecsByClassName.get(classname);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Find the relevant compression codec for the codec's canonical class name
|
* Find the relevant compression codec for the codec's canonical class name
|
||||||
* or by codec alias.
|
* or by codec alias.
|
||||||
* <p>
|
* <p>
|
||||||
* Codec aliases are case insensitive.
|
* Codec aliases are case insensitive.
|
||||||
* <p>
|
* <p>
|
||||||
* The code alias is the short class name (without the package name).
|
* The code alias is the short class name (without the package name).
|
||||||
* If the short class name ends with 'Codec', then there are two aliases for
|
* If the short class name ends with 'Codec', then there are two aliases for
|
||||||
* the codec, the complete short class name and the short class name without
|
* the codec, the complete short class name and the short class name without
|
||||||
* the 'Codec' ending. For example for the 'GzipCodec' codec class name the
|
* the 'Codec' ending. For example for the 'GzipCodec' codec class name the
|
||||||
* alias are 'gzip' and 'gzipcodec'.
|
* alias are 'gzip' and 'gzipcodec'.
|
||||||
*
|
*
|
||||||
* @param codecName the canonical class name of the codec
|
* @param codecName the canonical class name of the codec
|
||||||
* @return the codec object
|
* @return the codec object
|
||||||
*/
|
*/
|
||||||
public CompressionCodec getCodecByName(String codecName) {
|
public CompressionCodec getCodecByName(String codecName) {
|
||||||
if (codecsByClassName == null) {
|
if (codecsByClassName == null) {
|
||||||
return null;
|
return null;
|
||||||
}
|
|
||||||
CompressionCodec codec = getCodecByClassName(codecName);
|
|
||||||
if (codec == null) {
|
|
||||||
// trying to get the codec by name in case the name was specified
|
|
||||||
// instead a class
|
|
||||||
codec = codecsByName.get(StringUtils.toLowerCase(codecName));
|
|
||||||
}
|
|
||||||
return codec;
|
|
||||||
}
|
}
|
||||||
|
CompressionCodec codec = getCodecByClassName(codecName);
|
||||||
|
if (codec == null) {
|
||||||
|
// trying to get the codec by name in case the name was specified
|
||||||
|
// instead a class
|
||||||
|
codec = codecsByName.get(StringUtils.toLowerCase(codecName));
|
||||||
|
}
|
||||||
|
return codec;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Find the relevant compression codec for the codec's canonical class name
|
* Find the relevant compression codec for the codec's canonical class name
|
||||||
* or by codec alias and returns its implemetation class.
|
* or by codec alias and returns its implemetation class.
|
||||||
* <p>
|
* <p>
|
||||||
* Codec aliases are case insensitive.
|
* Codec aliases are case insensitive.
|
||||||
* <p>
|
* <p>
|
||||||
* The code alias is the short class name (without the package name).
|
* The code alias is the short class name (without the package name).
|
||||||
* If the short class name ends with 'Codec', then there are two aliases for
|
* If the short class name ends with 'Codec', then there are two aliases for
|
||||||
* the codec, the complete short class name and the short class name without
|
* the codec, the complete short class name and the short class name without
|
||||||
* the 'Codec' ending. For example for the 'GzipCodec' codec class name the
|
* the 'Codec' ending. For example for the 'GzipCodec' codec class name the
|
||||||
* alias are 'gzip' and 'gzipcodec'.
|
* alias are 'gzip' and 'gzipcodec'.
|
||||||
*
|
*
|
||||||
* @param codecName the canonical class name of the codec
|
* @param codecName the canonical class name of the codec
|
||||||
* @return the codec class
|
* @return the codec class
|
||||||
*/
|
*/
|
||||||
public Class<? extends CompressionCodec> getCodecClassByName(
|
public Class<? extends CompressionCodec> getCodecClassByName(
|
||||||
String codecName) {
|
String codecName) {
|
||||||
CompressionCodec codec = getCodecByName(codecName);
|
CompressionCodec codec = getCodecByName(codecName);
|
||||||
if (codec == null) {
|
if (codec == null) {
|
||||||
return null;
|
return null;
|
||||||
}
|
|
||||||
return codec.getClass();
|
|
||||||
}
|
}
|
||||||
|
return codec.getClass();
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Removes a suffix from a filename, if it has it.
|
* Removes a suffix from a filename, if it has it.
|
||||||
|
@ -323,8 +325,12 @@ public class CompressionCodecFactory {
|
||||||
len = in.read(buffer);
|
len = in.read(buffer);
|
||||||
}
|
}
|
||||||
} finally {
|
} finally {
|
||||||
if(out != null) { out.close(); }
|
if(out != null) {
|
||||||
if(in != null) { in.close(); }
|
out.close();
|
||||||
|
}
|
||||||
|
if(in != null) {
|
||||||
|
in.close();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
CompressionInputStream in = null;
|
CompressionInputStream in = null;
|
||||||
|
@ -338,7 +344,9 @@ public class CompressionCodecFactory {
|
||||||
len = in.read(buffer);
|
len = in.read(buffer);
|
||||||
}
|
}
|
||||||
} finally {
|
} finally {
|
||||||
if(in != null) { in.close(); }
|
if(in != null) {
|
||||||
|
in.close();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -29,6 +29,7 @@ import org.apache.hadoop.conf.Configuration;
|
||||||
|
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
import static org.junit.Assert.assertEquals;
|
import static org.junit.Assert.assertEquals;
|
||||||
|
import static org.junit.Assert.assertNull;
|
||||||
import static org.junit.Assert.fail;
|
import static org.junit.Assert.fail;
|
||||||
|
|
||||||
public class TestCodecFactory {
|
public class TestCodecFactory {
|
||||||
|
@ -45,8 +46,8 @@ public class TestCodecFactory {
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public CompressionOutputStream createOutputStream(OutputStream out)
|
public CompressionOutputStream createOutputStream(OutputStream out)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -62,21 +63,21 @@ public class TestCodecFactory {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public CompressionInputStream createInputStream(InputStream in,
|
public CompressionInputStream createInputStream(InputStream in,
|
||||||
Decompressor decompressor)
|
Decompressor decompressor)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public CompressionInputStream createInputStream(InputStream in)
|
public CompressionInputStream createInputStream(InputStream in)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public CompressionOutputStream createOutputStream(OutputStream out,
|
public CompressionOutputStream createOutputStream(OutputStream out,
|
||||||
Compressor compressor)
|
Compressor compressor)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -125,7 +126,7 @@ public class TestCodecFactory {
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns a factory for a given set of codecs
|
* Returns a factory for a given set of codecs.
|
||||||
* @param classes the codec classes to include
|
* @param classes the codec classes to include
|
||||||
* @return a new factory
|
* @return a new factory
|
||||||
*/
|
*/
|
||||||
|
@ -137,15 +138,21 @@ public class TestCodecFactory {
|
||||||
|
|
||||||
private static void checkCodec(String msg,
|
private static void checkCodec(String msg,
|
||||||
Class expected, CompressionCodec actual) {
|
Class expected, CompressionCodec actual) {
|
||||||
assertEquals(msg + " unexpected codec found",
|
if (expected == null) {
|
||||||
expected.getName(),
|
assertNull(msg, actual);
|
||||||
actual.getClass().getName());
|
} else if (actual == null) {
|
||||||
|
fail(msg + " result was null");
|
||||||
|
} else {
|
||||||
|
assertEquals(msg + " unexpected codec found",
|
||||||
|
expected.getName(),
|
||||||
|
actual.getClass().getName());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testFinding() {
|
public void testFinding() {
|
||||||
CompressionCodecFactory factory =
|
CompressionCodecFactory factory =
|
||||||
new CompressionCodecFactory(new Configuration());
|
new CompressionCodecFactory(new Configuration());
|
||||||
CompressionCodec codec = factory.getCodec(new Path("/tmp/foo.bar"));
|
CompressionCodec codec = factory.getCodec(new Path("/tmp/foo.bar"));
|
||||||
assertEquals("default factory foo codec", null, codec);
|
assertEquals("default factory foo codec", null, codec);
|
||||||
codec = factory.getCodecByClassName(BarCodec.class.getCanonicalName());
|
codec = factory.getCodecByClassName(BarCodec.class.getCanonicalName());
|
||||||
|
@ -153,6 +160,8 @@ public class TestCodecFactory {
|
||||||
|
|
||||||
codec = factory.getCodec(new Path("/tmp/foo.gz"));
|
codec = factory.getCodec(new Path("/tmp/foo.gz"));
|
||||||
checkCodec("default factory for .gz", GzipCodec.class, codec);
|
checkCodec("default factory for .gz", GzipCodec.class, codec);
|
||||||
|
codec = factory.getCodec(new Path("/tmp/foo.GZ"));
|
||||||
|
checkCodec("default factory for .GZ", GzipCodec.class, codec);
|
||||||
codec = factory.getCodecByClassName(GzipCodec.class.getCanonicalName());
|
codec = factory.getCodecByClassName(GzipCodec.class.getCanonicalName());
|
||||||
checkCodec("default factory for gzip codec", GzipCodec.class, codec);
|
checkCodec("default factory for gzip codec", GzipCodec.class, codec);
|
||||||
codec = factory.getCodecByName("gzip");
|
codec = factory.getCodecByName("gzip");
|
||||||
|
@ -168,6 +177,8 @@ public class TestCodecFactory {
|
||||||
|
|
||||||
codec = factory.getCodec(new Path("/tmp/foo.bz2"));
|
codec = factory.getCodec(new Path("/tmp/foo.bz2"));
|
||||||
checkCodec("default factory for .bz2", BZip2Codec.class, codec);
|
checkCodec("default factory for .bz2", BZip2Codec.class, codec);
|
||||||
|
codec = factory.getCodec(new Path("/tmp/foo.BZ2"));
|
||||||
|
checkCodec("default factory for .BZ2", BZip2Codec.class, codec);
|
||||||
codec = factory.getCodecByClassName(BZip2Codec.class.getCanonicalName());
|
codec = factory.getCodecByClassName(BZip2Codec.class.getCanonicalName());
|
||||||
checkCodec("default factory for bzip2 codec", BZip2Codec.class, codec);
|
checkCodec("default factory for bzip2 codec", BZip2Codec.class, codec);
|
||||||
codec = factory.getCodecByName("bzip2");
|
codec = factory.getCodecByName("bzip2");
|
||||||
|
@ -221,16 +232,22 @@ public class TestCodecFactory {
|
||||||
FooBarCodec.class});
|
FooBarCodec.class});
|
||||||
codec = factory.getCodec(new Path("/tmp/.foo.bar.gz"));
|
codec = factory.getCodec(new Path("/tmp/.foo.bar.gz"));
|
||||||
checkCodec("full factory gz codec", GzipCodec.class, codec);
|
checkCodec("full factory gz codec", GzipCodec.class, codec);
|
||||||
|
codec = factory.getCodec(new Path("/tmp/.foo.bar.GZ"));
|
||||||
|
checkCodec("full factory GZ codec", GzipCodec.class, codec);
|
||||||
codec = factory.getCodecByClassName(GzipCodec.class.getCanonicalName());
|
codec = factory.getCodecByClassName(GzipCodec.class.getCanonicalName());
|
||||||
checkCodec("full codec gz codec", GzipCodec.class, codec);
|
checkCodec("full codec gz codec", GzipCodec.class, codec);
|
||||||
|
|
||||||
codec = factory.getCodec(new Path("/tmp/foo.bz2"));
|
codec = factory.getCodec(new Path("/tmp/foo.bz2"));
|
||||||
checkCodec("full factory for .bz2", BZip2Codec.class, codec);
|
checkCodec("full factory for .bz2", BZip2Codec.class, codec);
|
||||||
|
codec = factory.getCodec(new Path("/tmp/foo.BZ2"));
|
||||||
|
checkCodec("full factory for .BZ2", BZip2Codec.class, codec);
|
||||||
codec = factory.getCodecByClassName(BZip2Codec.class.getCanonicalName());
|
codec = factory.getCodecByClassName(BZip2Codec.class.getCanonicalName());
|
||||||
checkCodec("full codec bzip2 codec", BZip2Codec.class, codec);
|
checkCodec("full codec bzip2 codec", BZip2Codec.class, codec);
|
||||||
|
|
||||||
codec = factory.getCodec(new Path("/tmp/foo.bar"));
|
codec = factory.getCodec(new Path("/tmp/foo.bar"));
|
||||||
checkCodec("full factory bar codec", BarCodec.class, codec);
|
checkCodec("full factory bar codec", BarCodec.class, codec);
|
||||||
|
codec = factory.getCodec(new Path("/tmp/foo.BAR"));
|
||||||
|
checkCodec("full factory BAR codec", BarCodec.class, codec);
|
||||||
codec = factory.getCodecByClassName(BarCodec.class.getCanonicalName());
|
codec = factory.getCodecByClassName(BarCodec.class.getCanonicalName());
|
||||||
checkCodec("full factory bar codec", BarCodec.class, codec);
|
checkCodec("full factory bar codec", BarCodec.class, codec);
|
||||||
codec = factory.getCodecByName("bar");
|
codec = factory.getCodecByName("bar");
|
||||||
|
@ -240,6 +257,8 @@ public class TestCodecFactory {
|
||||||
|
|
||||||
codec = factory.getCodec(new Path("/tmp/foo/baz.foo.bar"));
|
codec = factory.getCodec(new Path("/tmp/foo/baz.foo.bar"));
|
||||||
checkCodec("full factory foo bar codec", FooBarCodec.class, codec);
|
checkCodec("full factory foo bar codec", FooBarCodec.class, codec);
|
||||||
|
codec = factory.getCodec(new Path("/tmp/foo/baz.FOO.bar"));
|
||||||
|
checkCodec("full factory FOO bar codec", FooBarCodec.class, codec);
|
||||||
codec = factory.getCodecByClassName(FooBarCodec.class.getCanonicalName());
|
codec = factory.getCodecByClassName(FooBarCodec.class.getCanonicalName());
|
||||||
checkCodec("full factory foo bar codec", FooBarCodec.class, codec);
|
checkCodec("full factory foo bar codec", FooBarCodec.class, codec);
|
||||||
codec = factory.getCodecByName("foobar");
|
codec = factory.getCodecByName("foobar");
|
||||||
|
@ -249,6 +268,8 @@ public class TestCodecFactory {
|
||||||
|
|
||||||
codec = factory.getCodec(new Path("/tmp/foo.foo"));
|
codec = factory.getCodec(new Path("/tmp/foo.foo"));
|
||||||
checkCodec("full factory foo codec", FooCodec.class, codec);
|
checkCodec("full factory foo codec", FooCodec.class, codec);
|
||||||
|
codec = factory.getCodec(new Path("/tmp/FOO.FOO"));
|
||||||
|
checkCodec("full factory FOO codec", FooCodec.class, codec);
|
||||||
codec = factory.getCodecByClassName(FooCodec.class.getCanonicalName());
|
codec = factory.getCodecByClassName(FooCodec.class.getCanonicalName());
|
||||||
checkCodec("full factory foo codec", FooCodec.class, codec);
|
checkCodec("full factory foo codec", FooCodec.class, codec);
|
||||||
codec = factory.getCodecByName("foo");
|
codec = factory.getCodecByName("foo");
|
||||||
|
@ -259,6 +280,8 @@ public class TestCodecFactory {
|
||||||
factory = setClasses(new Class[]{NewGzipCodec.class});
|
factory = setClasses(new Class[]{NewGzipCodec.class});
|
||||||
codec = factory.getCodec(new Path("/tmp/foo.gz"));
|
codec = factory.getCodec(new Path("/tmp/foo.gz"));
|
||||||
checkCodec("overridden factory for .gz", NewGzipCodec.class, codec);
|
checkCodec("overridden factory for .gz", NewGzipCodec.class, codec);
|
||||||
|
codec = factory.getCodec(new Path("/tmp/foo.GZ"));
|
||||||
|
checkCodec("overridden factory for .GZ", NewGzipCodec.class, codec);
|
||||||
codec = factory.getCodecByClassName(NewGzipCodec.class.getCanonicalName());
|
codec = factory.getCodecByClassName(NewGzipCodec.class.getCanonicalName());
|
||||||
checkCodec("overridden factory for gzip codec", NewGzipCodec.class, codec);
|
checkCodec("overridden factory for gzip codec", NewGzipCodec.class, codec);
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue