HADOOP-7323. Add capability to resolve compression codec based on codec name. Contributed by Alejandro Abdelnur.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1133125 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Thomas White 2011-06-07 18:31:26 +00:00
parent cde5b33ba2
commit 8576c3984b
6 changed files with 154 additions and 8 deletions

View File

@ -199,6 +199,9 @@ Trunk (unreleased changes)
HADOOP-7316. Add public javadocs to FSDataInputStream and HADOOP-7316. Add public javadocs to FSDataInputStream and
FSDataOutputStream. (eli) FSDataOutputStream. (eli)
HADOOP-7323. Add capability to resolve compression codec based on codec
name. (Alejandro Abdelnur via tomwhite)
OPTIMIZATIONS OPTIMIZATIONS
HADOOP-7333. Performance improvement in PureJavaCrc32. (Eric Caspole HADOOP-7333. Performance improvement in PureJavaCrc32. (Eric Caspole

View File

@ -174,7 +174,7 @@
<property> <property>
<name>io.compression.codecs</name> <name>io.compression.codecs</name>
<value>org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.BZip2Codec</value> <value>org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.BZip2Codec,org.apache.hadoop.io.compress.DeflateCodec</value>
<description>A list of the compression codec classes that can be used <description>A list of the compression codec classes that can be used
for compression/decompression.</description> for compression/decompression.</description>
</property> </property>

View File

@ -43,7 +43,14 @@ public class CompressionCodecFactory {
* automatically supports finding the longest matching suffix. * automatically supports finding the longest matching suffix.
*/ */
private SortedMap<String, CompressionCodec> codecs = null; private SortedMap<String, CompressionCodec> codecs = null;
/**
* A map from codec aliases to the codecs.
* The aliases are the lower-cased simple class name of each codec and,
* when that name ends with 'Codec', the same name without that ending.
*/
private Map<String, CompressionCodec> codecsByName = null;
/** /**
* A map from class names to the codecs * A map from class names to the codecs
*/ */
@ -53,8 +60,15 @@ public class CompressionCodecFactory {
String suffix = codec.getDefaultExtension(); String suffix = codec.getDefaultExtension();
codecs.put(new StringBuilder(suffix).reverse().toString(), codec); codecs.put(new StringBuilder(suffix).reverse().toString(), codec);
codecsByClassName.put(codec.getClass().getCanonicalName(), codec); codecsByClassName.put(codec.getClass().getCanonicalName(), codec);
String codecName = codec.getClass().getSimpleName();
codecsByName.put(codecName.toLowerCase(), codec);
if (codecName.endsWith("Codec")) {
codecName = codecName.substring(0, codecName.length() - "Codec".length());
codecsByName.put(codecName.toLowerCase(), codec);
}
} }
/** /**
* Print the extension map out as a string. * Print the extension map out as a string.
*/ */
@ -142,6 +156,7 @@ public class CompressionCodecFactory {
public CompressionCodecFactory(Configuration conf) { public CompressionCodecFactory(Configuration conf) {
codecs = new TreeMap<String, CompressionCodec>(); codecs = new TreeMap<String, CompressionCodec>();
codecsByClassName = new HashMap<String, CompressionCodec>(); codecsByClassName = new HashMap<String, CompressionCodec>();
codecsByName = new HashMap<String, CompressionCodec>();
List<Class<? extends CompressionCodec>> codecClasses = getCodecClasses(conf); List<Class<? extends CompressionCodec>> codecClasses = getCodecClasses(conf);
if (codecClasses == null) { if (codecClasses == null) {
addCodec(new GzipCodec()); addCodec(new GzipCodec());
@ -190,6 +205,56 @@ public class CompressionCodecFactory {
return codecsByClassName.get(classname); return codecsByClassName.get(classname);
} }
/**
 * Find the relevant compression codec for the codec's canonical class name
 * or by codec alias.
 * <p/>
 * Codec aliases are case insensitive.
 * <p/>
 * The codec alias is the short class name (without the package name).
 * If the short class name ends with 'Codec', then there are two aliases for
 * the codec, the complete short class name and the short class name without
 * the 'Codec' ending. For example for the 'GzipCodec' codec class name the
 * aliases are 'gzip' and 'gzipcodec'.
 *
 * @param codecName the canonical class name of the codec, or one of its
 *                  aliases; may be null
 * @return the codec object, or null if codecName is null or no registered
 *         codec matches it
 */
public CompressionCodec getCodecByName(String codecName) {
  // A null name can never match; bail out instead of failing on toLowerCase().
  // Both lookup maps are checked because both are consulted below.
  if (codecName == null || codecsByClassName == null || codecsByName == null) {
    return null;
  }
  CompressionCodec codec = getCodecByClassName(codecName);
  if (codec == null) {
    // Not a registered class name, so try the name as an alias. Alias keys
    // are stored lower-cased via toLowerCase(), so the lookup key is
    // normalized the same way.
    // NOTE(review): toLowerCase() is locale-sensitive; registration and
    // lookup should move to an explicit Locale together.
    codec = codecsByName.get(codecName.toLowerCase());
  }
  return codec;
}
/**
 * Resolve a codec by its canonical class name or by one of its aliases and
 * return the implementation class of the codec that was found.
 * <p/>
 * Codec aliases are case insensitive.
 * <p/>
 * A codec alias is the short class name (without the package name). When the
 * short class name ends with 'Codec' the codec has two aliases: the complete
 * short class name and the short class name without the 'Codec' ending. For
 * example, the aliases of the 'GzipCodec' class are 'gzip' and 'gzipcodec'.
 *
 * @param codecName the canonical class name of the codec, or one of its
 *                  aliases
 * @return the codec class, or null when no codec is found for the name
 */
public Class<? extends CompressionCodec> getCodecClassByName(String codecName) {
  // Delegate the lookup; only the runtime class of the match is exposed.
  final CompressionCodec resolved = getCodecByName(codecName);
  return (resolved == null) ? null : resolved.getClass();
}
/** /**
* Removes a suffix from a filename, if it has it. * Removes a suffix from a filename, if it has it.
* @param filename the filename to strip * @param filename the filename to strip

View File

@ -0,0 +1,24 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.io.compress;
/**
 * Alias class for DefaultCodec to enable codec discovery by 'deflate' name.
 * <p/>
 * The class declares no members of its own; everything is inherited from
 * DefaultCodec. It exists only so that a codec whose short class name is
 * 'DeflateCodec' can be registered, which lets name-based lookup resolve the
 * case-insensitive aliases 'deflate' and 'deflatecodec'.
 */
public class DeflateCodec extends DefaultCodec {
}

View File

@ -97,6 +97,12 @@ public class TestCodec {
codecTest(conf, seed, count, "org.apache.hadoop.io.compress.BZip2Codec"); codecTest(conf, seed, count, "org.apache.hadoop.io.compress.BZip2Codec");
} }
/**
 * Runs the shared codec test against DeflateCodec twice: once with a count
 * of 0 and once with the class-level count.
 */
@Test
public void testDeflateCodec() throws IOException {
  final String deflateCodecClass = "org.apache.hadoop.io.compress.DeflateCodec";
  for (int records : new int[] {0, count}) {
    codecTest(conf, seed, records, deflateCodecClass);
  }
}
@Test @Test
public void testGzipCodecWithParam() throws IOException { public void testGzipCodecWithParam() throws IOException {
Configuration conf = new Configuration(this.conf); Configuration conf = new Configuration(this.conf);
@ -427,6 +433,13 @@ public class TestCodec {
sequenceFileCodecTest(conf, 200000, "org.apache.hadoop.io.compress.BZip2Codec", 1000000); sequenceFileCodecTest(conf, 200000, "org.apache.hadoop.io.compress.BZip2Codec", 1000000);
} }
/**
 * Runs the shared sequence-file codec test against DeflateCodec, first with
 * 100 lines and a block size of 100, then with 200000 lines and a block size
 * of 1000000.
 */
@Test
public void testSequenceFileDeflateCodec()
    throws IOException, ClassNotFoundException,
           InstantiationException, IllegalAccessException {
  final String deflateCodecClass = "org.apache.hadoop.io.compress.DeflateCodec";
  // Small case.
  sequenceFileCodecTest(conf, 100, deflateCodecClass, 100);
  // Large case.
  sequenceFileCodecTest(conf, 200000, deflateCodecClass, 1000000);
}
private static void sequenceFileCodecTest(Configuration conf, int lines, private static void sequenceFileCodecTest(Configuration conf, int lines,
String codecClass, int blockSize) String codecClass, int blockSize)
throws IOException, ClassNotFoundException, InstantiationException, IllegalAccessException { throws IOException, ClassNotFoundException, InstantiationException, IllegalAccessException {

View File

@ -131,12 +131,41 @@ public class TestCodecFactory extends TestCase {
checkCodec("default factory for .gz", GzipCodec.class, codec); checkCodec("default factory for .gz", GzipCodec.class, codec);
codec = factory.getCodecByClassName(GzipCodec.class.getCanonicalName()); codec = factory.getCodecByClassName(GzipCodec.class.getCanonicalName());
checkCodec("default factory for gzip codec", GzipCodec.class, codec); checkCodec("default factory for gzip codec", GzipCodec.class, codec);
codec = factory.getCodecByName("gzip");
checkCodec("default factory for gzip codec", GzipCodec.class, codec);
codec = factory.getCodecByName("GZIP");
checkCodec("default factory for gzip codec", GzipCodec.class, codec);
codec = factory.getCodecByName("GZIPCodec");
checkCodec("default factory for gzip codec", GzipCodec.class, codec);
codec = factory.getCodecByName("gzipcodec");
checkCodec("default factory for gzip codec", GzipCodec.class, codec);
Class klass = factory.getCodecClassByName("gzipcodec");
assertEquals(GzipCodec.class, klass);
codec = factory.getCodec(new Path("/tmp/foo.bz2")); codec = factory.getCodec(new Path("/tmp/foo.bz2"));
checkCodec("default factory for .bz2", BZip2Codec.class, codec); checkCodec("default factory for .bz2", BZip2Codec.class, codec);
codec = factory.getCodecByClassName(BZip2Codec.class.getCanonicalName()); codec = factory.getCodecByClassName(BZip2Codec.class.getCanonicalName());
checkCodec("default factory for bzip2 codec", BZip2Codec.class, codec); checkCodec("default factory for bzip2 codec", BZip2Codec.class, codec);
codec = factory.getCodecByName("bzip2");
checkCodec("default factory for bzip2 codec", BZip2Codec.class, codec);
codec = factory.getCodecByName("bzip2codec");
checkCodec("default factory for bzip2 codec", BZip2Codec.class, codec);
codec = factory.getCodecByName("BZIP2");
checkCodec("default factory for bzip2 codec", BZip2Codec.class, codec);
codec = factory.getCodecByName("BZIP2CODEC");
checkCodec("default factory for bzip2 codec", BZip2Codec.class, codec);
codec = factory.getCodecByClassName(DeflateCodec.class.getCanonicalName());
checkCodec("default factory for deflate codec", DeflateCodec.class, codec);
codec = factory.getCodecByName("deflate");
checkCodec("default factory for deflate codec", DeflateCodec.class, codec);
codec = factory.getCodecByName("deflatecodec");
checkCodec("default factory for deflate codec", DeflateCodec.class, codec);
codec = factory.getCodecByName("DEFLATE");
checkCodec("default factory for deflate codec", DeflateCodec.class, codec);
codec = factory.getCodecByName("DEFLATECODEC");
checkCodec("default factory for deflate codec", DeflateCodec.class, codec);
factory = setClasses(new Class[0]); factory = setClasses(new Class[0]);
codec = factory.getCodec(new Path("/tmp/foo.bar")); codec = factory.getCodec(new Path("/tmp/foo.bar"));
assertEquals("empty codec bar codec", null, codec); assertEquals("empty codec bar codec", null, codec);
@ -164,20 +193,32 @@ public class TestCodecFactory extends TestCase {
assertEquals("full factory for .bz2", null, codec); assertEquals("full factory for .bz2", null, codec);
codec = factory.getCodecByClassName(BZip2Codec.class.getCanonicalName()); codec = factory.getCodecByClassName(BZip2Codec.class.getCanonicalName());
assertEquals("full codec bzip2 codec", null, codec); assertEquals("full codec bzip2 codec", null, codec);
codec = factory.getCodec(new Path("/tmp/foo.bar")); codec = factory.getCodec(new Path("/tmp/foo.bar"));
checkCodec("full factory bar codec", BarCodec.class, codec); checkCodec("full factory bar codec", BarCodec.class, codec);
codec = factory.getCodecByClassName(BarCodec.class.getCanonicalName()); codec = factory.getCodecByClassName(BarCodec.class.getCanonicalName());
checkCodec("full factory bar codec", BarCodec.class, codec); checkCodec("full factory bar codec", BarCodec.class, codec);
codec = factory.getCodecByName("bar");
checkCodec("full factory bar codec", BarCodec.class, codec);
codec = factory.getCodecByName("BAR");
checkCodec("full factory bar codec", BarCodec.class, codec);
codec = factory.getCodec(new Path("/tmp/foo/baz.foo.bar")); codec = factory.getCodec(new Path("/tmp/foo/baz.foo.bar"));
checkCodec("full factory foo bar codec", FooBarCodec.class, codec); checkCodec("full factory foo bar codec", FooBarCodec.class, codec);
codec = factory.getCodecByClassName(FooBarCodec.class.getCanonicalName()); codec = factory.getCodecByClassName(FooBarCodec.class.getCanonicalName());
checkCodec("full factory foo bar codec", FooBarCodec.class, codec); checkCodec("full factory foo bar codec", FooBarCodec.class, codec);
codec = factory.getCodecByName("foobar");
checkCodec("full factory foo bar codec", FooBarCodec.class, codec);
codec = factory.getCodecByName("FOOBAR");
checkCodec("full factory foo bar codec", FooBarCodec.class, codec);
codec = factory.getCodec(new Path("/tmp/foo.foo")); codec = factory.getCodec(new Path("/tmp/foo.foo"));
checkCodec("full factory foo codec", FooCodec.class, codec); checkCodec("full factory foo codec", FooCodec.class, codec);
codec = factory.getCodecByClassName(FooCodec.class.getCanonicalName()); codec = factory.getCodecByClassName(FooCodec.class.getCanonicalName());
checkCodec("full factory foo codec", FooCodec.class, codec); checkCodec("full factory foo codec", FooCodec.class, codec);
codec = factory.getCodecByName("foo");
checkCodec("full factory foo codec", FooCodec.class, codec);
codec = factory.getCodecByName("FOO");
checkCodec("full factory foo codec", FooCodec.class, codec);
} }
} }