mirror of https://github.com/apache/druid.git
CompressionUtils: Add support for decompressing xz, bz2, zip. (#5586)
Also switch various firehoses to the new method. Fixes #5585.
This commit is contained in:
parent
b86ed99d9a
commit
5ab17668c0
|
@ -91,7 +91,7 @@ public class StaticAzureBlobStoreFirehoseFactory extends PrefetchableTextFilesFi
|
||||||
@Override
|
@Override
|
||||||
protected InputStream wrapObjectStream(AzureBlob object, InputStream stream) throws IOException
|
protected InputStream wrapObjectStream(AzureBlob object, InputStream stream) throws IOException
|
||||||
{
|
{
|
||||||
return object.getPath().endsWith(".gz") ? CompressionUtils.gzipInputStream(stream) : stream;
|
return CompressionUtils.decompress(stream, object.getPath());
|
||||||
}
|
}
|
||||||
|
|
||||||
private static AzureByteSource makeByteSource(AzureStorage azureStorage, AzureBlob object)
|
private static AzureByteSource makeByteSource(AzureStorage azureStorage, AzureBlob object)
|
||||||
|
|
|
@ -101,7 +101,7 @@ public class StaticCloudFilesFirehoseFactory extends PrefetchableTextFilesFireho
|
||||||
@Override
|
@Override
|
||||||
protected InputStream wrapObjectStream(CloudFilesBlob object, InputStream stream) throws IOException
|
protected InputStream wrapObjectStream(CloudFilesBlob object, InputStream stream) throws IOException
|
||||||
{
|
{
|
||||||
return object.getPath().endsWith(".gz") ? CompressionUtils.gzipInputStream(stream) : stream;
|
return CompressionUtils.decompress(stream, object.getPath());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -93,7 +93,7 @@ public class StaticGoogleBlobStoreFirehoseFactory extends PrefetchableTextFilesF
|
||||||
@Override
|
@Override
|
||||||
protected InputStream wrapObjectStream(GoogleBlob object, InputStream stream) throws IOException
|
protected InputStream wrapObjectStream(GoogleBlob object, InputStream stream) throws IOException
|
||||||
{
|
{
|
||||||
return object.getPath().endsWith(".gz") ? CompressionUtils.gzipInputStream(stream) : stream;
|
return CompressionUtils.decompress(stream, object.getPath());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -134,28 +134,14 @@ public final class UriCacheGenerator implements CacheGenerator<UriExtractionName
|
||||||
catch (NumberFormatException ex) {
|
catch (NumberFormatException ex) {
|
||||||
log.debug(ex, "Failed to get last modified timestamp. Assuming no timestamp");
|
log.debug(ex, "Failed to get last modified timestamp. Assuming no timestamp");
|
||||||
}
|
}
|
||||||
final ByteSource source;
|
final ByteSource source = new ByteSource()
|
||||||
if (CompressionUtils.isGz(uriPath)) {
|
|
||||||
// Simple gzip stream
|
|
||||||
log.debug("Loading gz");
|
|
||||||
source = new ByteSource()
|
|
||||||
{
|
{
|
||||||
@Override
|
@Override
|
||||||
public InputStream openStream() throws IOException
|
public InputStream openStream() throws IOException
|
||||||
{
|
{
|
||||||
return CompressionUtils.gzipInputStream(puller.getInputStream(uri));
|
return CompressionUtils.decompress(puller.getInputStream(uri), uri.getPath());
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
} else {
|
|
||||||
source = new ByteSource()
|
|
||||||
{
|
|
||||||
@Override
|
|
||||||
public InputStream openStream() throws IOException
|
|
||||||
{
|
|
||||||
return puller.getInputStream(uri);
|
|
||||||
}
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
final CacheScheduler.VersionedCache versionedCache = scheduler.createVersionedCache(entryId, version);
|
final CacheScheduler.VersionedCache versionedCache = scheduler.createVersionedCache(entryId, version);
|
||||||
try {
|
try {
|
||||||
|
|
|
@ -212,7 +212,7 @@ public class StaticS3FirehoseFactory extends PrefetchableTextFilesFirehoseFactor
|
||||||
@Override
|
@Override
|
||||||
protected InputStream wrapObjectStream(S3ObjectSummary object, InputStream stream) throws IOException
|
protected InputStream wrapObjectStream(S3ObjectSummary object, InputStream stream) throws IOException
|
||||||
{
|
{
|
||||||
return object.getKey().endsWith(".gz") ? CompressionUtils.gzipInputStream(stream) : stream;
|
return CompressionUtils.decompress(stream, object.getKey());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -81,6 +81,14 @@
|
||||||
<groupId>org.mozilla</groupId>
|
<groupId>org.mozilla</groupId>
|
||||||
<artifactId>rhino</artifactId>
|
<artifactId>rhino</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.commons</groupId>
|
||||||
|
<artifactId>commons-compress</artifactId>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.tukaani</groupId>
|
||||||
|
<artifactId>xz</artifactId>
|
||||||
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.jayway.jsonpath</groupId>
|
<groupId>com.jayway.jsonpath</groupId>
|
||||||
<artifactId>json-path</artifactId>
|
<artifactId>json-path</artifactId>
|
||||||
|
|
|
@ -28,14 +28,18 @@ import com.google.common.io.ByteStreams;
|
||||||
import com.google.common.io.Files;
|
import com.google.common.io.Files;
|
||||||
import io.druid.java.util.common.io.NativeIO;
|
import io.druid.java.util.common.io.NativeIO;
|
||||||
import io.druid.java.util.common.logger.Logger;
|
import io.druid.java.util.common.logger.Logger;
|
||||||
|
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
|
||||||
|
import org.apache.commons.compress.compressors.xz.XZCompressorInputStream;
|
||||||
|
|
||||||
import java.io.BufferedInputStream;
|
import java.io.BufferedInputStream;
|
||||||
|
import java.io.ByteArrayInputStream;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileOutputStream;
|
import java.io.FileOutputStream;
|
||||||
import java.io.FilterInputStream;
|
import java.io.FilterInputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.io.OutputStream;
|
import java.io.OutputStream;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.util.Enumeration;
|
import java.util.Enumeration;
|
||||||
import java.util.zip.GZIPInputStream;
|
import java.util.zip.GZIPInputStream;
|
||||||
import java.util.zip.GZIPOutputStream;
|
import java.util.zip.GZIPOutputStream;
|
||||||
|
@ -48,7 +52,9 @@ public class CompressionUtils
|
||||||
{
|
{
|
||||||
private static final Logger log = new Logger(CompressionUtils.class);
|
private static final Logger log = new Logger(CompressionUtils.class);
|
||||||
private static final int DEFAULT_RETRY_COUNT = 3;
|
private static final int DEFAULT_RETRY_COUNT = 3;
|
||||||
|
private static final String BZ2_SUFFIX = ".bz2";
|
||||||
private static final String GZ_SUFFIX = ".gz";
|
private static final String GZ_SUFFIX = ".gz";
|
||||||
|
private static final String XZ_SUFFIX = ".xz";
|
||||||
private static final String ZIP_SUFFIX = ".zip";
|
private static final String ZIP_SUFFIX = ".zip";
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -313,7 +319,7 @@ public class CompressionUtils
|
||||||
*
|
*
|
||||||
* @return A GZIPInputStream that can handle concatenated gzip streams in the input
|
* @return A GZIPInputStream that can handle concatenated gzip streams in the input
|
||||||
*/
|
*/
|
||||||
public static GZIPInputStream gzipInputStream(final InputStream in) throws IOException
|
private static GZIPInputStream gzipInputStream(final InputStream in) throws IOException
|
||||||
{
|
{
|
||||||
return new GZIPInputStream(
|
return new GZIPInputStream(
|
||||||
new FilterInputStream(in)
|
new FilterInputStream(in)
|
||||||
|
@ -516,4 +522,42 @@ public class CompressionUtils
|
||||||
}
|
}
|
||||||
throw new IAE("[%s] is not a valid gz file name", fname);
|
throw new IAE("[%s] is not a valid gz file name", fname);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Decompress an input stream from a file, based on the filename.
|
||||||
|
*/
|
||||||
|
public static InputStream decompress(final InputStream in, final String fileName) throws IOException
|
||||||
|
{
|
||||||
|
if (fileName.endsWith(GZ_SUFFIX)) {
|
||||||
|
return gzipInputStream(in);
|
||||||
|
} else if (fileName.endsWith(BZ2_SUFFIX)) {
|
||||||
|
return new BZip2CompressorInputStream(in, true);
|
||||||
|
} else if (fileName.endsWith(XZ_SUFFIX)) {
|
||||||
|
return new XZCompressorInputStream(in, true);
|
||||||
|
} else if (fileName.endsWith(ZIP_SUFFIX)) {
|
||||||
|
// This reads the first file in the archive.
|
||||||
|
final ZipInputStream zipIn = new ZipInputStream(in, StandardCharsets.UTF_8);
|
||||||
|
try {
|
||||||
|
final ZipEntry nextEntry = zipIn.getNextEntry();
|
||||||
|
if (nextEntry == null) {
|
||||||
|
zipIn.close();
|
||||||
|
|
||||||
|
// No files in the archive - return an empty stream.
|
||||||
|
return new ByteArrayInputStream(new byte[0]);
|
||||||
|
}
|
||||||
|
return zipIn;
|
||||||
|
}
|
||||||
|
catch (IOException e) {
|
||||||
|
try {
|
||||||
|
zipIn.close();
|
||||||
|
}
|
||||||
|
catch (IOException e2) {
|
||||||
|
e.addSuppressed(e2);
|
||||||
|
}
|
||||||
|
throw e;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
return in;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -25,6 +25,8 @@ import com.google.common.io.ByteSink;
|
||||||
import com.google.common.io.ByteSource;
|
import com.google.common.io.ByteSource;
|
||||||
import com.google.common.io.ByteStreams;
|
import com.google.common.io.ByteStreams;
|
||||||
import com.google.common.io.Files;
|
import com.google.common.io.Files;
|
||||||
|
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream;
|
||||||
|
import org.apache.commons.compress.compressors.xz.XZCompressorOutputStream;
|
||||||
import org.junit.Assert;
|
import org.junit.Assert;
|
||||||
import org.junit.Before;
|
import org.junit.Before;
|
||||||
import org.junit.Rule;
|
import org.junit.Rule;
|
||||||
|
@ -53,6 +55,8 @@ import java.util.concurrent.atomic.AtomicLong;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
import java.util.zip.GZIPInputStream;
|
import java.util.zip.GZIPInputStream;
|
||||||
import java.util.zip.GZIPOutputStream;
|
import java.util.zip.GZIPOutputStream;
|
||||||
|
import java.util.zip.ZipEntry;
|
||||||
|
import java.util.zip.ZipOutputStream;
|
||||||
|
|
||||||
public class CompressionUtilsTest
|
public class CompressionUtilsTest
|
||||||
{
|
{
|
||||||
|
@ -221,7 +225,6 @@ public class CompressionUtilsTest
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testGoodGzipByteSource() throws IOException
|
public void testGoodGzipByteSource() throws IOException
|
||||||
{
|
{
|
||||||
|
@ -230,7 +233,7 @@ public class CompressionUtilsTest
|
||||||
Assert.assertFalse(gzFile.exists());
|
Assert.assertFalse(gzFile.exists());
|
||||||
CompressionUtils.gzip(Files.asByteSource(testFile), Files.asByteSink(gzFile), Predicates.<Throwable>alwaysTrue());
|
CompressionUtils.gzip(Files.asByteSource(testFile), Files.asByteSink(gzFile), Predicates.<Throwable>alwaysTrue());
|
||||||
Assert.assertTrue(gzFile.exists());
|
Assert.assertTrue(gzFile.exists());
|
||||||
try (final InputStream inputStream = CompressionUtils.gzipInputStream(new FileInputStream(gzFile))) {
|
try (final InputStream inputStream = CompressionUtils.decompress(new FileInputStream(gzFile), gzFile.getName())) {
|
||||||
assertGoodDataStream(inputStream);
|
assertGoodDataStream(inputStream);
|
||||||
}
|
}
|
||||||
if (!testFile.delete()) {
|
if (!testFile.delete()) {
|
||||||
|
@ -244,6 +247,50 @@ public class CompressionUtilsTest
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testDecompressBzip2() throws IOException
|
||||||
|
{
|
||||||
|
final File tmpDir = temporaryFolder.newFolder("testDecompressBzip2");
|
||||||
|
final File bzFile = new File(tmpDir, testFile.getName() + ".bz2");
|
||||||
|
Assert.assertFalse(bzFile.exists());
|
||||||
|
try (final OutputStream out = new BZip2CompressorOutputStream(new FileOutputStream(bzFile))) {
|
||||||
|
ByteStreams.copy(new FileInputStream(testFile), out);
|
||||||
|
}
|
||||||
|
try (final InputStream inputStream = CompressionUtils.decompress(new FileInputStream(bzFile), bzFile.getName())) {
|
||||||
|
assertGoodDataStream(inputStream);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testDecompressXz() throws IOException
|
||||||
|
{
|
||||||
|
final File tmpDir = temporaryFolder.newFolder("testDecompressXz");
|
||||||
|
final File xzFile = new File(tmpDir, testFile.getName() + ".xz");
|
||||||
|
Assert.assertFalse(xzFile.exists());
|
||||||
|
try (final OutputStream out = new XZCompressorOutputStream(new FileOutputStream(xzFile))) {
|
||||||
|
ByteStreams.copy(new FileInputStream(testFile), out);
|
||||||
|
}
|
||||||
|
try (final InputStream inputStream = CompressionUtils.decompress(new FileInputStream(xzFile), xzFile.getName())) {
|
||||||
|
assertGoodDataStream(inputStream);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testDecompressZip() throws IOException
|
||||||
|
{
|
||||||
|
final File tmpDir = temporaryFolder.newFolder("testDecompressZip");
|
||||||
|
final File zipFile = new File(tmpDir, testFile.getName() + ".zip");
|
||||||
|
Assert.assertFalse(zipFile.exists());
|
||||||
|
try (final ZipOutputStream out = new ZipOutputStream(new FileOutputStream(zipFile))) {
|
||||||
|
out.putNextEntry(new ZipEntry("cool.file"));
|
||||||
|
ByteStreams.copy(new FileInputStream(testFile), out);
|
||||||
|
out.closeEntry();
|
||||||
|
}
|
||||||
|
try (final InputStream inputStream = CompressionUtils.decompress(new FileInputStream(zipFile), zipFile.getName())) {
|
||||||
|
assertGoodDataStream(inputStream);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testGoodGZStream() throws IOException
|
public void testGoodGZStream() throws IOException
|
||||||
{
|
{
|
||||||
|
@ -490,7 +537,7 @@ public class CompressionUtilsTest
|
||||||
}, Predicates.<Throwable>alwaysTrue()
|
}, Predicates.<Throwable>alwaysTrue()
|
||||||
);
|
);
|
||||||
Assert.assertTrue(gzFile.exists());
|
Assert.assertTrue(gzFile.exists());
|
||||||
try (final InputStream inputStream = CompressionUtils.gzipInputStream(new FileInputStream(gzFile))) {
|
try (final InputStream inputStream = CompressionUtils.decompress(new FileInputStream(gzFile), "file.gz")) {
|
||||||
assertGoodDataStream(inputStream);
|
assertGoodDataStream(inputStream);
|
||||||
}
|
}
|
||||||
if (!testFile.delete()) {
|
if (!testFile.delete()) {
|
||||||
|
@ -536,7 +583,7 @@ public class CompressionUtilsTest
|
||||||
Assert.assertFalse(gzFile.exists());
|
Assert.assertFalse(gzFile.exists());
|
||||||
CompressionUtils.gzip(Files.asByteSource(testFile), Files.asByteSink(gzFile), Predicates.<Throwable>alwaysTrue());
|
CompressionUtils.gzip(Files.asByteSource(testFile), Files.asByteSink(gzFile), Predicates.<Throwable>alwaysTrue());
|
||||||
Assert.assertTrue(gzFile.exists());
|
Assert.assertTrue(gzFile.exists());
|
||||||
try (final InputStream inputStream = CompressionUtils.gzipInputStream(new FileInputStream(gzFile))) {
|
try (final InputStream inputStream = CompressionUtils.decompress(new FileInputStream(gzFile), "file.gz")) {
|
||||||
assertGoodDataStream(inputStream);
|
assertGoodDataStream(inputStream);
|
||||||
}
|
}
|
||||||
if (testFile.exists() && !testFile.delete()) {
|
if (testFile.exists() && !testFile.delete()) {
|
||||||
|
|
10
pom.xml
10
pom.xml
|
@ -325,6 +325,16 @@
|
||||||
<artifactId>rhino</artifactId>
|
<artifactId>rhino</artifactId>
|
||||||
<version>1.7R5</version>
|
<version>1.7R5</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.commons</groupId>
|
||||||
|
<artifactId>commons-compress</artifactId>
|
||||||
|
<version>1.16</version>
|
||||||
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.tukaani</groupId>
|
||||||
|
<artifactId>xz</artifactId>
|
||||||
|
<version>1.8</version>
|
||||||
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>com.fasterxml.jackson.core</groupId>
|
<groupId>com.fasterxml.jackson.core</groupId>
|
||||||
<artifactId>jackson-annotations</artifactId>
|
<artifactId>jackson-annotations</artifactId>
|
||||||
|
|
|
@ -105,7 +105,7 @@ public class HttpFirehoseFactory extends PrefetchableTextFilesFirehoseFactory<UR
|
||||||
@Override
|
@Override
|
||||||
protected InputStream wrapObjectStream(URI object, InputStream stream) throws IOException
|
protected InputStream wrapObjectStream(URI object, InputStream stream) throws IOException
|
||||||
{
|
{
|
||||||
return object.getPath().endsWith(".gz") ? CompressionUtils.gzipInputStream(stream) : stream;
|
return CompressionUtils.decompress(stream, object.getPath());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|
|
@ -22,10 +22,10 @@ package io.druid.segment.realtime.firehose;
|
||||||
import com.fasterxml.jackson.annotation.JsonCreator;
|
import com.fasterxml.jackson.annotation.JsonCreator;
|
||||||
import com.fasterxml.jackson.annotation.JsonProperty;
|
import com.fasterxml.jackson.annotation.JsonProperty;
|
||||||
import com.google.common.base.Preconditions;
|
import com.google.common.base.Preconditions;
|
||||||
import io.druid.java.util.emitter.EmittingLogger;
|
|
||||||
import io.druid.data.input.impl.AbstractTextFilesFirehoseFactory;
|
import io.druid.data.input.impl.AbstractTextFilesFirehoseFactory;
|
||||||
import io.druid.data.input.impl.StringInputRowParser;
|
import io.druid.data.input.impl.StringInputRowParser;
|
||||||
import io.druid.java.util.common.CompressionUtils;
|
import io.druid.java.util.common.CompressionUtils;
|
||||||
|
import io.druid.java.util.emitter.EmittingLogger;
|
||||||
import org.apache.commons.io.FileUtils;
|
import org.apache.commons.io.FileUtils;
|
||||||
import org.apache.commons.io.filefilter.TrueFileFilter;
|
import org.apache.commons.io.filefilter.TrueFileFilter;
|
||||||
import org.apache.commons.io.filefilter.WildcardFileFilter;
|
import org.apache.commons.io.filefilter.WildcardFileFilter;
|
||||||
|
@ -97,6 +97,6 @@ public class LocalFirehoseFactory extends AbstractTextFilesFirehoseFactory<File>
|
||||||
@Override
|
@Override
|
||||||
protected InputStream wrapObjectStream(File object, InputStream stream) throws IOException
|
protected InputStream wrapObjectStream(File object, InputStream stream) throws IOException
|
||||||
{
|
{
|
||||||
return object.getPath().endsWith(".gz") ? CompressionUtils.gzipInputStream(stream) : stream;
|
return CompressionUtils.decompress(stream, object.getPath());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue