HADOOP-7323. Add capability to resolve compression codec based on codec name. Contributed by Alejandro Abdelnur.
git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1133125 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
cde5b33ba2
commit
8576c3984b
|
@ -199,6 +199,9 @@ Trunk (unreleased changes)
|
||||||
HADOOP-7316. Add public javadocs to FSDataInputStream and
|
HADOOP-7316. Add public javadocs to FSDataInputStream and
|
||||||
FSDataOutputStream. (eli)
|
FSDataOutputStream. (eli)
|
||||||
|
|
||||||
|
HADOOP-7323. Add capability to resolve compression codec based on codec
|
||||||
|
name. (Alejandro Abdelnur via tomwhite)
|
||||||
|
|
||||||
OPTIMIZATIONS
|
OPTIMIZATIONS
|
||||||
|
|
||||||
HADOOP-7333. Performance improvement in PureJavaCrc32. (Eric Caspole
|
HADOOP-7333. Performance improvement in PureJavaCrc32. (Eric Caspole
|
||||||
|
|
|
@ -174,7 +174,7 @@
|
||||||
|
|
||||||
<property>
|
<property>
|
||||||
<name>io.compression.codecs</name>
|
<name>io.compression.codecs</name>
|
||||||
<value>org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.BZip2Codec</value>
|
<value>org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.BZip2Codec,org.apache.hadoop.io.compress.DeflateCodec</value>
|
||||||
<description>A list of the compression codec classes that can be used
|
<description>A list of the compression codec classes that can be used
|
||||||
for compression/decompression.</description>
|
for compression/decompression.</description>
|
||||||
</property>
|
</property>
|
||||||
|
|
|
@ -43,7 +43,14 @@ public class CompressionCodecFactory {
|
||||||
* automatically supports finding the longest matching suffix.
|
* automatically supports finding the longest matching suffix.
|
||||||
*/
|
*/
|
||||||
private SortedMap<String, CompressionCodec> codecs = null;
|
private SortedMap<String, CompressionCodec> codecs = null;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A map from lower-cased codec aliases to the codecs. Each codec is
|
||||||
|
* registered under its simple class name and, when that name ends with
|
||||||
|
* 'Codec', also under the simple class name with that ending removed.
|
||||||
|
*/
|
||||||
|
private Map<String, CompressionCodec> codecsByName = null;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A map from class names to the codecs
|
* A map from class names to the codecs
|
||||||
*/
|
*/
|
||||||
|
@ -53,8 +60,15 @@ public class CompressionCodecFactory {
|
||||||
String suffix = codec.getDefaultExtension();
|
String suffix = codec.getDefaultExtension();
|
||||||
codecs.put(new StringBuilder(suffix).reverse().toString(), codec);
|
codecs.put(new StringBuilder(suffix).reverse().toString(), codec);
|
||||||
codecsByClassName.put(codec.getClass().getCanonicalName(), codec);
|
codecsByClassName.put(codec.getClass().getCanonicalName(), codec);
|
||||||
|
|
||||||
|
String codecName = codec.getClass().getSimpleName();
|
||||||
|
codecsByName.put(codecName.toLowerCase(), codec);
|
||||||
|
if (codecName.endsWith("Codec")) {
|
||||||
|
codecName = codecName.substring(0, codecName.length() - "Codec".length());
|
||||||
|
codecsByName.put(codecName.toLowerCase(), codec);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Print the extension map out as a string.
|
* Print the extension map out as a string.
|
||||||
*/
|
*/
|
||||||
|
@ -142,6 +156,7 @@ public class CompressionCodecFactory {
|
||||||
public CompressionCodecFactory(Configuration conf) {
|
public CompressionCodecFactory(Configuration conf) {
|
||||||
codecs = new TreeMap<String, CompressionCodec>();
|
codecs = new TreeMap<String, CompressionCodec>();
|
||||||
codecsByClassName = new HashMap<String, CompressionCodec>();
|
codecsByClassName = new HashMap<String, CompressionCodec>();
|
||||||
|
codecsByName = new HashMap<String, CompressionCodec>();
|
||||||
List<Class<? extends CompressionCodec>> codecClasses = getCodecClasses(conf);
|
List<Class<? extends CompressionCodec>> codecClasses = getCodecClasses(conf);
|
||||||
if (codecClasses == null) {
|
if (codecClasses == null) {
|
||||||
addCodec(new GzipCodec());
|
addCodec(new GzipCodec());
|
||||||
|
@ -190,6 +205,56 @@ public class CompressionCodecFactory {
|
||||||
return codecsByClassName.get(classname);
|
return codecsByClassName.get(classname);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Find the relevant compression codec for the codec's canonical class name
|
||||||
|
* or by codec alias.
|
||||||
|
* <p/>
|
||||||
|
* Codec aliases are case insensitive.
|
||||||
|
* <p/>
|
||||||
|
* The codec alias is the short class name (without the package name).
|
||||||
|
* If the short class name ends with 'Codec', then there are two aliases for
|
||||||
|
* the codec, the complete short class name and the short class name without
|
||||||
|
* the 'Codec' ending. For example for the 'GzipCodec' codec class name the
|
||||||
|
* aliases are 'gzip' and 'gzipcodec'.
|
||||||
|
*
|
||||||
|
* @param codecName the canonical class name of the codec, or a codec alias
|
||||||
|
* @return the codec object, or null if no codec matches
|
||||||
|
*/
|
||||||
|
public CompressionCodec getCodecByName(String codecName) {
|
||||||
|
if (codecsByClassName == null) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
CompressionCodec codec = getCodecByClassName(codecName);
|
||||||
|
if (codec == null) {
|
||||||
|
// try to get the codec by alias in case an alias was specified instead of a class name
|
||||||
|
codec = codecsByName.get(codecName.toLowerCase());
|
||||||
|
}
|
||||||
|
return codec;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Find the relevant compression codec for the codec's canonical class name
|
||||||
|
* or by codec alias and returns its implementation class.
|
||||||
|
* <p/>
|
||||||
|
* Codec aliases are case insensitive.
|
||||||
|
* <p/>
|
||||||
|
* The codec alias is the short class name (without the package name).
|
||||||
|
* If the short class name ends with 'Codec', then there are two aliases for
|
||||||
|
* the codec, the complete short class name and the short class name without
|
||||||
|
* the 'Codec' ending. For example for the 'GzipCodec' codec class name the
|
||||||
|
* aliases are 'gzip' and 'gzipcodec'.
|
||||||
|
*
|
||||||
|
* @param codecName the canonical class name of the codec, or a codec alias
|
||||||
|
* @return the codec class, or null if no codec matches
|
||||||
|
*/
|
||||||
|
public Class<? extends CompressionCodec> getCodecClassByName(String codecName) {
|
||||||
|
CompressionCodec codec = getCodecByName(codecName);
|
||||||
|
if (codec == null) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
return codec.getClass();
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Removes a suffix from a filename, if it has it.
|
* Removes a suffix from a filename, if it has it.
|
||||||
* @param filename the filename to strip
|
* @param filename the filename to strip
|
||||||
|
|
|
@ -0,0 +1,24 @@
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package org.apache.hadoop.io.compress;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Alias class for DefaultCodec to enable codec discovery by 'deflate' name.
|
||||||
|
*/
|
||||||
|
public class DeflateCodec extends DefaultCodec {
|
||||||
|
}
|
|
@ -97,6 +97,12 @@ public class TestCodec {
|
||||||
codecTest(conf, seed, count, "org.apache.hadoop.io.compress.BZip2Codec");
|
codecTest(conf, seed, count, "org.apache.hadoop.io.compress.BZip2Codec");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testDeflateCodec() throws IOException {
|
||||||
|
codecTest(conf, seed, 0, "org.apache.hadoop.io.compress.DeflateCodec");
|
||||||
|
codecTest(conf, seed, count, "org.apache.hadoop.io.compress.DeflateCodec");
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testGzipCodecWithParam() throws IOException {
|
public void testGzipCodecWithParam() throws IOException {
|
||||||
Configuration conf = new Configuration(this.conf);
|
Configuration conf = new Configuration(this.conf);
|
||||||
|
@ -427,6 +433,13 @@ public class TestCodec {
|
||||||
sequenceFileCodecTest(conf, 200000, "org.apache.hadoop.io.compress.BZip2Codec", 1000000);
|
sequenceFileCodecTest(conf, 200000, "org.apache.hadoop.io.compress.BZip2Codec", 1000000);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testSequenceFileDeflateCodec() throws IOException, ClassNotFoundException,
|
||||||
|
InstantiationException, IllegalAccessException {
|
||||||
|
sequenceFileCodecTest(conf, 100, "org.apache.hadoop.io.compress.DeflateCodec", 100);
|
||||||
|
sequenceFileCodecTest(conf, 200000, "org.apache.hadoop.io.compress.DeflateCodec", 1000000);
|
||||||
|
}
|
||||||
|
|
||||||
private static void sequenceFileCodecTest(Configuration conf, int lines,
|
private static void sequenceFileCodecTest(Configuration conf, int lines,
|
||||||
String codecClass, int blockSize)
|
String codecClass, int blockSize)
|
||||||
throws IOException, ClassNotFoundException, InstantiationException, IllegalAccessException {
|
throws IOException, ClassNotFoundException, InstantiationException, IllegalAccessException {
|
||||||
|
|
|
@ -131,12 +131,41 @@ public class TestCodecFactory extends TestCase {
|
||||||
checkCodec("default factory for .gz", GzipCodec.class, codec);
|
checkCodec("default factory for .gz", GzipCodec.class, codec);
|
||||||
codec = factory.getCodecByClassName(GzipCodec.class.getCanonicalName());
|
codec = factory.getCodecByClassName(GzipCodec.class.getCanonicalName());
|
||||||
checkCodec("default factory for gzip codec", GzipCodec.class, codec);
|
checkCodec("default factory for gzip codec", GzipCodec.class, codec);
|
||||||
|
codec = factory.getCodecByName("gzip");
|
||||||
|
checkCodec("default factory for gzip codec", GzipCodec.class, codec);
|
||||||
|
codec = factory.getCodecByName("GZIP");
|
||||||
|
checkCodec("default factory for gzip codec", GzipCodec.class, codec);
|
||||||
|
codec = factory.getCodecByName("GZIPCodec");
|
||||||
|
checkCodec("default factory for gzip codec", GzipCodec.class, codec);
|
||||||
|
codec = factory.getCodecByName("gzipcodec");
|
||||||
|
checkCodec("default factory for gzip codec", GzipCodec.class, codec);
|
||||||
|
Class klass = factory.getCodecClassByName("gzipcodec");
|
||||||
|
assertEquals(GzipCodec.class, klass);
|
||||||
|
|
||||||
codec = factory.getCodec(new Path("/tmp/foo.bz2"));
|
codec = factory.getCodec(new Path("/tmp/foo.bz2"));
|
||||||
checkCodec("default factory for .bz2", BZip2Codec.class, codec);
|
checkCodec("default factory for .bz2", BZip2Codec.class, codec);
|
||||||
codec = factory.getCodecByClassName(BZip2Codec.class.getCanonicalName());
|
codec = factory.getCodecByClassName(BZip2Codec.class.getCanonicalName());
|
||||||
checkCodec("default factory for bzip2 codec", BZip2Codec.class, codec);
|
checkCodec("default factory for bzip2 codec", BZip2Codec.class, codec);
|
||||||
|
codec = factory.getCodecByName("bzip2");
|
||||||
|
checkCodec("default factory for bzip2 codec", BZip2Codec.class, codec);
|
||||||
|
codec = factory.getCodecByName("bzip2codec");
|
||||||
|
checkCodec("default factory for bzip2 codec", BZip2Codec.class, codec);
|
||||||
|
codec = factory.getCodecByName("BZIP2");
|
||||||
|
checkCodec("default factory for bzip2 codec", BZip2Codec.class, codec);
|
||||||
|
codec = factory.getCodecByName("BZIP2CODEC");
|
||||||
|
checkCodec("default factory for bzip2 codec", BZip2Codec.class, codec);
|
||||||
|
|
||||||
|
codec = factory.getCodecByClassName(DeflateCodec.class.getCanonicalName());
|
||||||
|
checkCodec("default factory for deflate codec", DeflateCodec.class, codec);
|
||||||
|
codec = factory.getCodecByName("deflate");
|
||||||
|
checkCodec("default factory for deflate codec", DeflateCodec.class, codec);
|
||||||
|
codec = factory.getCodecByName("deflatecodec");
|
||||||
|
checkCodec("default factory for deflate codec", DeflateCodec.class, codec);
|
||||||
|
codec = factory.getCodecByName("DEFLATE");
|
||||||
|
checkCodec("default factory for deflate codec", DeflateCodec.class, codec);
|
||||||
|
codec = factory.getCodecByName("DEFLATECODEC");
|
||||||
|
checkCodec("default factory for deflate codec", DeflateCodec.class, codec);
|
||||||
|
|
||||||
factory = setClasses(new Class[0]);
|
factory = setClasses(new Class[0]);
|
||||||
codec = factory.getCodec(new Path("/tmp/foo.bar"));
|
codec = factory.getCodec(new Path("/tmp/foo.bar"));
|
||||||
assertEquals("empty codec bar codec", null, codec);
|
assertEquals("empty codec bar codec", null, codec);
|
||||||
|
@ -164,20 +193,32 @@ public class TestCodecFactory extends TestCase {
|
||||||
assertEquals("full factory for .bz2", null, codec);
|
assertEquals("full factory for .bz2", null, codec);
|
||||||
codec = factory.getCodecByClassName(BZip2Codec.class.getCanonicalName());
|
codec = factory.getCodecByClassName(BZip2Codec.class.getCanonicalName());
|
||||||
assertEquals("full codec bzip2 codec", null, codec);
|
assertEquals("full codec bzip2 codec", null, codec);
|
||||||
|
|
||||||
codec = factory.getCodec(new Path("/tmp/foo.bar"));
|
codec = factory.getCodec(new Path("/tmp/foo.bar"));
|
||||||
checkCodec("full factory bar codec", BarCodec.class, codec);
|
checkCodec("full factory bar codec", BarCodec.class, codec);
|
||||||
codec = factory.getCodecByClassName(BarCodec.class.getCanonicalName());
|
codec = factory.getCodecByClassName(BarCodec.class.getCanonicalName());
|
||||||
checkCodec("full factory bar codec", BarCodec.class, codec);
|
checkCodec("full factory bar codec", BarCodec.class, codec);
|
||||||
|
codec = factory.getCodecByName("bar");
|
||||||
|
checkCodec("full factory bar codec", BarCodec.class, codec);
|
||||||
|
codec = factory.getCodecByName("BAR");
|
||||||
|
checkCodec("full factory bar codec", BarCodec.class, codec);
|
||||||
|
|
||||||
codec = factory.getCodec(new Path("/tmp/foo/baz.foo.bar"));
|
codec = factory.getCodec(new Path("/tmp/foo/baz.foo.bar"));
|
||||||
checkCodec("full factory foo bar codec", FooBarCodec.class, codec);
|
checkCodec("full factory foo bar codec", FooBarCodec.class, codec);
|
||||||
codec = factory.getCodecByClassName(FooBarCodec.class.getCanonicalName());
|
codec = factory.getCodecByClassName(FooBarCodec.class.getCanonicalName());
|
||||||
checkCodec("full factory foo bar codec", FooBarCodec.class, codec);
|
checkCodec("full factory foo bar codec", FooBarCodec.class, codec);
|
||||||
|
codec = factory.getCodecByName("foobar");
|
||||||
|
checkCodec("full factory foo bar codec", FooBarCodec.class, codec);
|
||||||
|
codec = factory.getCodecByName("FOOBAR");
|
||||||
|
checkCodec("full factory foo bar codec", FooBarCodec.class, codec);
|
||||||
|
|
||||||
codec = factory.getCodec(new Path("/tmp/foo.foo"));
|
codec = factory.getCodec(new Path("/tmp/foo.foo"));
|
||||||
checkCodec("full factory foo codec", FooCodec.class, codec);
|
checkCodec("full factory foo codec", FooCodec.class, codec);
|
||||||
codec = factory.getCodecByClassName(FooCodec.class.getCanonicalName());
|
codec = factory.getCodecByClassName(FooCodec.class.getCanonicalName());
|
||||||
checkCodec("full factory foo codec", FooCodec.class, codec);
|
checkCodec("full factory foo codec", FooCodec.class, codec);
|
||||||
|
codec = factory.getCodecByName("foo");
|
||||||
|
checkCodec("full factory foo codec", FooCodec.class, codec);
|
||||||
|
codec = factory.getCodecByName("FOO");
|
||||||
|
checkCodec("full factory foo codec", FooCodec.class, codec);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue