SOLR-10981: Support for stream.url or stream.file pointing to gzipped data

This commit is contained in:
Andrew Lundgren 2018-10-18 19:53:21 -04:00 committed by David Smiley
parent fd9164801e
commit 1a8188d92b
4 changed files with 230 additions and 79 deletions

View File

@ -209,6 +209,9 @@ Improvements
* SOLR-12806: use autoscaling policies with strict=false to prioritize node allocation (noble)
* SOLR-10981: Support for stream.url or stream.file pointing to gzipped data. It's detected by either a content
encoding header or file extension. (Andrew Lundgren via David Smiley, Jan Høydahl)
================== 7.5.0 ==================
Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this release.

View File

@ -78,6 +78,10 @@ curl -X POST -H 'Content-type: application/json' -d '{"set-property": {"requestD
If `enableRemoteStreaming="true"` is used, be aware that this allows _anyone_ to send a request to any URL or local file. If the <<Debugging Requests,DumpRequestHandler>> is enabled, it will allow anyone to view any file on your system.
====
The source of the data can be compressed using gzip, and Solr will generally detect this.
The detection is based on either the presence of a `Content-Encoding: gzip` HTTP header or the file name ending with `.gz` or `.gzip`.
Gzip detection does not apply to `stream.body`.
== Debugging Requests
The implicit "dump" RequestHandler (see <<implicit-requesthandlers.adoc#implicit-requesthandlers,Implicit RequestHandlers>>) simply outputs the contents of the Solr QueryRequest using the specified writer type `wt`. This is a useful tool to help understand what streams are available to the RequestHandlers.

View File

@ -29,8 +29,13 @@ import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.function.Predicate;
import java.util.zip.GZIPInputStream;
import org.apache.http.entity.ContentType;
import org.apache.solr.client.solrj.SolrRequest;
import org.apache.solr.client.solrj.request.RequestWriter;
@ -42,8 +47,14 @@ import org.apache.solr.client.solrj.request.RequestWriter;
*/
public abstract class ContentStreamBase implements ContentStream
{
/** Name of the UTF-8 charset. */
public static final String DEFAULT_CHARSET = StandardCharsets.UTF_8.name();
/** MIME type reported when the stream name carries one of the {@link #CSV_SUF} suffixes. */
private static final String TEXT_CSV = "text/csv";
/**
 * Content types that do not identify the payload format. When a {@code file:} URL stream
 * reports one of these, URLStream.getContentType() falls back to
 * {@link #attemptToDetermineContentType()}.
 */
private static final List<String> UNHELPFUL_TYPES = Arrays.asList(ContentType.APPLICATION_OCTET_STREAM.getMimeType(), "application/gzip", "content/unknown");
// Suffix lists are lower-case; they are compared against the lower-cased stream name.
/** Name suffixes treated as XML, with and without a gzip extension. */
private static final List<String> XML_SUF = Arrays.asList(".xml", ".xml.gz", ".xml.gzip");
/** Name suffixes treated as JSON, with and without a gzip extension. */
private static final List<String> JSON_SUF = Arrays.asList(".json", ".json.gz", ".json.gzip");
/** Name suffixes treated as CSV, with and without a gzip extension. */
private static final List<String> CSV_SUF = Arrays.asList(".csv", ".csv.gz", ".csv.gzip");
protected String name;
protected String sourceInfo;
protected String contentType;
@ -62,7 +73,46 @@ public abstract class ContentStreamBase implements ContentStream
}
return null;
}
/**
 * Guesses a MIME type for this stream from its {@code name}.
 *
 * <p>The lower-cased name is checked against the XML, JSON and CSV suffix lists, in that
 * order. If no suffix matches, the first character of the stream content is inspected
 * instead.
 *
 * @return the guessed MIME type, or {@code null} when {@code name} is {@code null} or
 *         nothing could be determined
 */
protected String attemptToDetermineContentType() {
  if (name == null) {
    return null;
  }

  final String lowerName = name.toLowerCase(Locale.ROOT);
  if (XML_SUF.stream().anyMatch(lowerName::endsWith)) {
    return ContentType.APPLICATION_XML.getMimeType();
  }
  if (JSON_SUF.stream().anyMatch(lowerName::endsWith)) {
    return ContentType.APPLICATION_JSON.getMimeType();
  }
  if (CSV_SUF.stream().anyMatch(lowerName::endsWith)) {
    return TEXT_CSV;
  }
  // No recognisable extension: fall back to looking at the content itself.
  return attemptToDetermineTypeFromFirstCharacter();
}
/**
 * Last-ditch content sniffing: opens the stream and looks at the first non-whitespace
 * byte. A {@code '<'} is taken to mean XML and a <code>'{'</code> to mean JSON.
 *
 * <p>Detection is best effort. Any exception raised while opening or reading the stream
 * is deliberately ignored and reported as "unknown".
 *
 * @return the XML or JSON MIME type, or {@code null} if the content is neither, the
 *         stream is empty, or it could not be read
 */
private String attemptToDetermineTypeFromFirstCharacter() {
  try (InputStream stream = getStream()) {
    int data = stream.read();
    // Skip all leading ASCII whitespace, not only spaces: pretty-printed or hand-edited
    // payloads frequently start with a newline, tab or carriage return.
    while (data == ' ' || data == '\t' || data == '\n' || data == '\r') {
      data = stream.read();
    }
    // data is -1 at end of stream, which matches neither marker.
    if (data == '<') {
      return ContentType.APPLICATION_XML.getMimeType();
    }
    if (data == '{') {
      return ContentType.APPLICATION_JSON.getMimeType();
    }
  } catch (Exception ignored) {
    // Intentionally swallowed: failing to sniff the content must not fail the request,
    // it simply leaves the content type undetermined.
  }
  return null;
}
//------------------------------------------------------------------------
//------------------------------------------------------------------------
@ -81,14 +131,33 @@ public abstract class ContentStreamBase implements ContentStream
sourceInfo = "url";
}
/**
 * Returns the content type of this URL stream.
 *
 * <p>For {@code file:} URLs whose current content type is one of the uninformative
 * types (e.g. an octet-stream), the type is re-derived from the name or payload and
 * stored when that detection yields a result. All other cases return the stored value
 * as is.
 */
@Override
public String getContentType() {
  boolean isFileUrl = "file".equals(url.getProtocol());
  if (!isFileUrl || !UNHELPFUL_TYPES.contains(contentType)) {
    return contentType;
  }

  // The reported type says nothing useful; try to work out the real payload type.
  String detected = attemptToDetermineContentType();
  if (detected != null) {
    contentType = detected;
  }
  return contentType;
}
/**
 * Opens a connection to the URL and returns its content.
 *
 * <p>As a side effect this refreshes {@code contentType}, {@code name} and {@code size}
 * from the connection. The returned stream is transparently gunzipped when the
 * connection reports a {@code gzip} content encoding (compared case-insensitively) or
 * when the URL's file part ends with {@code .gz} or {@code .gzip}.
 *
 * @throws IOException if the connection cannot be opened or the gzip header is invalid
 */
@Override
public InputStream getStream() throws IOException {
  URLConnection conn = this.url.openConnection();

  contentType = conn.getContentType();
  name = url.toExternalForm();
  size = conn.getContentLengthLong();
  InputStream is = conn.getInputStream();

  String urlFile = url.getFile().toLowerCase(Locale.ROOT);
  boolean gzipped = "gzip".equalsIgnoreCase(conn.getContentEncoding())
      || urlFile.endsWith(".gz")
      || urlFile.endsWith(".gzip");
  if (gzipped) {
    try {
      is = new GZIPInputStream(is);
    } catch (IOException | RuntimeException e) {
      // The GZIP constructor reads the header eagerly; if that fails the caller never
      // receives a stream to close, so release the connection stream here.
      try {
        is.close();
      } catch (IOException closeFailure) {
        e.addSuppressed(closeFailure);
      }
      throw e;
    }
  }
  return is;
}
}
@ -111,30 +180,19 @@ public abstract class ContentStreamBase implements ContentStream
/**
 * Returns the content type of this file stream, detecting it on first use.
 *
 * <p>When no content type has been set, detection is delegated to
 * {@link #attemptToDetermineContentType()} and the outcome is stored in
 * {@code contentType}. A {@code null} outcome means detection runs again on the next
 * call.
 */
@Override
public String getContentType() {
  if (contentType == null) {
    contentType = attemptToDetermineContentType();
  }
  return contentType;
}
/**
 * Opens the underlying file.
 *
 * <p>When the stream name ends with {@code .gz} or {@code .gzip} (case-insensitive) the
 * returned stream is transparently gunzipped.
 *
 * @throws IOException if the file cannot be opened or its gzip header is invalid
 */
@Override
public InputStream getStream() throws IOException {
  // Decide on gzip handling before opening the file so nothing is left open if this fails.
  String lowerName = name.toLowerCase(Locale.ROOT);
  boolean gzipped = lowerName.endsWith(".gz") || lowerName.endsWith(".gzip");

  InputStream is = new FileInputStream(file);
  if (gzipped) {
    try {
      is = new GZIPInputStream(is);
    } catch (IOException | RuntimeException e) {
      // The GZIP constructor reads the header eagerly; close the file if that fails,
      // because the caller never gets a stream to close.
      try {
        is.close();
      } catch (IOException closeFailure) {
        e.addSuppressed(closeFailure);
      }
      throw e;
    }
  }
  return is;
}
}
@ -273,7 +331,7 @@ public abstract class ContentStreamBase implements ContentStream
}
public ByteArrayStream( byte[] bytes, String source, String contentType ) {
this.bytes = bytes;
this.bytes = bytes;
this.contentType = contentType;
name = source;

View File

@ -25,88 +25,174 @@ import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
import org.apache.commons.io.IOUtils;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.core.SolrResourceLoader;
/**
* Tests {@link ContentStream} such as "stream.file".
*/
public class ContentStreamTest extends SolrTestCaseJ4
{
public void testStringStream() throws IOException
{
public class ContentStreamTest extends SolrTestCaseJ4 {
public void testStringStream() throws IOException {
String input = "aads ghaskdgasgldj asl sadg ajdsg &jag # @ hjsakg hsakdg hjkas s";
ContentStreamBase stream = new ContentStreamBase.StringStream( input );
assertEquals( input.length(), stream.getSize().intValue() );
assertEquals( input, IOUtils.toString( stream.getStream(), "UTF-8" ) );
assertEquals( input, IOUtils.toString( stream.getReader() ) );
ContentStreamBase stream = new ContentStreamBase.StringStream(input);
assertEquals(input.length(), stream.getSize().intValue());
assertEquals(input, IOUtils.toString(stream.getStream(), "UTF-8"));
assertEquals(input, IOUtils.toString(stream.getReader()));
}
public void testFileStream() throws IOException
{
File file = null;
try (SolrResourceLoader loader = new SolrResourceLoader();
InputStream is = loader.openResource( "solrj/README" )) {
public void testFileStream() throws IOException {
File file = new File(createTempDir().toFile(), "README");
try (InputStream is = new SolrResourceLoader().openResource("solrj/README");
FileOutputStream os = new FileOutputStream(file)) {
assertNotNull(is);
file = new File(createTempDir().toFile(), "README");
try (FileOutputStream os = new FileOutputStream(file)) {
IOUtils.copy(is, os);
}
IOUtils.copy(is, os);
}
ContentStreamBase stream = new ContentStreamBase.FileStream(file);
InputStream s = stream.getStream();
FileInputStream fis = new FileInputStream(file);
InputStreamReader isr = new InputStreamReader(
new FileInputStream(file), StandardCharsets.UTF_8);
Reader r = stream.getReader();
try {
try (InputStream s = stream.getStream();
FileInputStream fis = new FileInputStream(file);
InputStreamReader isr = new InputStreamReader(
new FileInputStream(file), StandardCharsets.UTF_8);
Reader r = stream.getReader()) {
assertEquals(file.length(), stream.getSize().intValue());
// Test the code that sets content based on < being the 1st character
assertEquals("application/xml", stream.getContentType());
assertTrue(IOUtils.contentEquals(fis, s));
assertTrue(IOUtils.contentEquals(isr, r));
} finally {
s.close();
r.close();
isr.close();
fis.close();
}
}
public void testURLStream() throws IOException
{
File file = null;
FileOutputStream os = null;
public void testFileStreamGZIP() throws IOException {
File file = new File(createTempDir().toFile(), "README.gz");
try (SolrResourceLoader loader = new SolrResourceLoader();
InputStream is = loader.openResource( "solrj/README" )) {
assertNotNull(is);
file = new File(createTempDir().toFile(), "README");
os = new FileOutputStream(file);
IOUtils.copy(is, os);
os.close();
is.close();
try (InputStream is = new SolrResourceLoader().openResource("solrj/README");
FileOutputStream os = new FileOutputStream(file);
GZIPOutputStream zos = new GZIPOutputStream(os)) {
IOUtils.copy(is, zos);
}
ContentStreamBase stream = new ContentStreamBase.URLStream(new URL(file
.toURI().toASCIIString()));
InputStream s = stream.getStream();
FileInputStream fis = new FileInputStream(file);
FileInputStream fis2 = new FileInputStream(file);
InputStreamReader isr = new InputStreamReader(fis, StandardCharsets.UTF_8);
Reader r = stream.getReader();
try {
ContentStreamBase stream = new ContentStreamBase.FileStream(file);
try (InputStream s = stream.getStream();
FileInputStream fis = new FileInputStream(file);
GZIPInputStream zis = new GZIPInputStream(fis);
InputStreamReader isr = new InputStreamReader(zis, StandardCharsets.UTF_8);
FileInputStream fis2 = new FileInputStream(file);
GZIPInputStream zis2 = new GZIPInputStream(fis2);
Reader r = stream.getReader()) {
assertEquals(file.length(), stream.getSize().intValue());
// Test the code that sets content based on < being the 1st character
assertEquals("application/xml", stream.getContentType());
assertTrue(IOUtils.contentEquals(isr, r));
assertTrue(IOUtils.contentEquals(zis2, s));
}
}
public void testURLStream() throws IOException {
File file = new File(createTempDir().toFile(), "README");
try (InputStream is = new SolrResourceLoader().openResource("solrj/README");
FileOutputStream os = new FileOutputStream(file)) {
IOUtils.copy(is, os);
}
ContentStreamBase stream = new ContentStreamBase.URLStream(new URL(file.toURI().toASCIIString()));
try (InputStream s = stream.getStream();
FileInputStream fis = new FileInputStream(file);
FileInputStream fis2 = new FileInputStream(file);
InputStreamReader isr = new InputStreamReader(fis, StandardCharsets.UTF_8);
Reader r = stream.getReader()) {
// For File URLs, the content type is determined automatically by the mime type
// associated with the file extension,
// This is inconsistent from the FileStream as that code tries to guess the content based on the 1st character.
//
// HTTP URLS, the content type is determined by the headers. Those are not tested here.
//
assertEquals("text/html", stream.getContentType());
assertTrue(IOUtils.contentEquals(fis2, s));
assertEquals(file.length(), stream.getSize().intValue());
assertTrue(IOUtils.contentEquals(isr, r));
assertEquals(file.length(), stream.getSize().intValue());
} finally {
r.close();
s.close();
isr.close();
fis.close();
fis2.close();
}
}
public void testURLStreamGZIP() throws IOException {
File file = new File(createTempDir().toFile(), "README.gz");
try (InputStream is = new SolrResourceLoader().openResource("solrj/README");
FileOutputStream os = new FileOutputStream(file);
GZIPOutputStream zos = new GZIPOutputStream(os)) {
IOUtils.copy(is, zos);
}
ContentStreamBase stream = new ContentStreamBase.URLStream(new URL(file.toURI().toASCIIString()));
try (InputStream s = stream.getStream();
FileInputStream fis = new FileInputStream(file);
GZIPInputStream zis = new GZIPInputStream(fis);
InputStreamReader isr = new InputStreamReader(zis, StandardCharsets.UTF_8);
FileInputStream fis2 = new FileInputStream(file);
GZIPInputStream zis2 = new GZIPInputStream(fis2);
Reader r = stream.getReader()) {
// See the non-GZIP test case for an explanation of header handling.
assertEquals("application/xml", stream.getContentType());
assertTrue(IOUtils.contentEquals(isr, r));
assertTrue(IOUtils.contentEquals(zis2, s));
assertEquals(file.length(), stream.getSize().intValue());
}
}
public void testURLStreamCSVGZIPExtention() throws IOException {
File file = new File(createTempDir().toFile(), "README.CSV.gz");
try (InputStream is = new SolrResourceLoader().openResource("solrj/README");
FileOutputStream os = new FileOutputStream(file);
GZIPOutputStream zos = new GZIPOutputStream(os)) {
IOUtils.copy(is, zos);
}
ContentStreamBase stream = new ContentStreamBase.URLStream(new URL(file.toURI().toASCIIString()));
try (InputStream s = stream.getStream();
FileInputStream fis = new FileInputStream(file);
GZIPInputStream zis = new GZIPInputStream(fis);
InputStreamReader isr = new InputStreamReader(zis, StandardCharsets.UTF_8);
FileInputStream fis2 = new FileInputStream(file);
GZIPInputStream zis2 = new GZIPInputStream(fis2);
Reader r = stream.getReader()) {
// See the non-GZIP test case for an explanation of header handling.
assertEquals("text/csv", stream.getContentType());
assertTrue(IOUtils.contentEquals(isr, r));
assertTrue(IOUtils.contentEquals(zis2, s));
assertEquals(file.length(), stream.getSize().intValue());
}
}
public void testURLStreamJSONGZIPExtention() throws IOException {
File file = new File(createTempDir().toFile(), "README.json.gzip");
try (InputStream is = new SolrResourceLoader().openResource("solrj/README");
FileOutputStream os = new FileOutputStream(file);
GZIPOutputStream zos = new GZIPOutputStream(os)) {
IOUtils.copy(is, zos);
}
ContentStreamBase stream = new ContentStreamBase.URLStream(new URL(file.toURI().toASCIIString()));
try (InputStream s = stream.getStream();
FileInputStream fis = new FileInputStream(file);
GZIPInputStream zis = new GZIPInputStream(fis);
InputStreamReader isr = new InputStreamReader(zis, StandardCharsets.UTF_8);
FileInputStream fis2 = new FileInputStream(file);
GZIPInputStream zis2 = new GZIPInputStream(fis2);
Reader r = stream.getReader()) {
// See the non-GZIP test case for an explanation of header handling.
assertEquals("application/json", stream.getContentType());
assertTrue(IOUtils.contentEquals(isr, r));
assertTrue(IOUtils.contentEquals(zis2, s));
assertEquals(file.length(), stream.getSize().intValue());
}
}
}