SOLR-10981: Support for stream.url or stream.file pointing to gzipped data

This commit is contained in:
Andrew Lundgren 2018-10-18 19:53:21 -04:00 committed by David Smiley
parent fd9164801e
commit 1a8188d92b
4 changed files with 230 additions and 79 deletions

View File

@ -209,6 +209,9 @@ Improvements
* SOLR-12806: use autoscaling policies with strict=false to prioritize node allocation (noble) * SOLR-12806: use autoscaling policies with strict=false to prioritize node allocation (noble)
* SOLR-10981: Support for stream.url or stream.file pointing to gzipped data. It's detected by either a content
encoding header or file extension. (Andrew Lundgren via David Smiley, Jan Høydahl)
================== 7.5.0 ================== ================== 7.5.0 ==================
Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this release. Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this release.

View File

@ -78,6 +78,10 @@ curl -X POST -H 'Content-type: application/json' -d '{"set-property": {"requestD
If `enableRemoteStreaming="true"` is used, be aware that this allows _anyone_ to send a request to any URL or local file. If the <<Debugging Requests,DumpRequestHandler>> is enabled, it will allow anyone to view any file on your system. If `enableRemoteStreaming="true"` is used, be aware that this allows _anyone_ to send a request to any URL or local file. If the <<Debugging Requests,DumpRequestHandler>> is enabled, it will allow anyone to view any file on your system.
==== ====
The source of the data can be compressed with gzip, and Solr will generally detect this and decompress it automatically.
Detection is based on either the presence of a `Content-Encoding: gzip` HTTP response header or a file name ending in `.gz` or `.gzip`.
Gzip detection is not applied to `stream.body`.
== Debugging Requests == Debugging Requests
The implicit "dump" RequestHandler (see <<implicit-requesthandlers.adoc#implicit-requesthandlers,Implicit RequestHandlers>>) simply outputs the contents of the Solr QueryRequest using the specified writer type `wt`. This is a useful tool to help understand what streams are available to the RequestHandlers. The implicit "dump" RequestHandler (see <<implicit-requesthandlers.adoc#implicit-requesthandlers,Implicit RequestHandlers>>) simply outputs the contents of the Solr QueryRequest using the specified writer type `wt`. This is a useful tool to help understand what streams are available to the RequestHandlers.

View File

@ -29,8 +29,13 @@ import java.io.UnsupportedEncodingException;
import java.net.URL; import java.net.URL;
import java.net.URLConnection; import java.net.URLConnection;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.List;
import java.util.Locale; import java.util.Locale;
import java.util.function.Predicate;
import java.util.zip.GZIPInputStream;
import org.apache.http.entity.ContentType;
import org.apache.solr.client.solrj.SolrRequest; import org.apache.solr.client.solrj.SolrRequest;
import org.apache.solr.client.solrj.request.RequestWriter; import org.apache.solr.client.solrj.request.RequestWriter;
@ -42,8 +47,14 @@ import org.apache.solr.client.solrj.request.RequestWriter;
*/ */
public abstract class ContentStreamBase implements ContentStream public abstract class ContentStreamBase implements ContentStream
{ {
public static final String DEFAULT_CHARSET = StandardCharsets.UTF_8.name(); public static final String DEFAULT_CHARSET = StandardCharsets.UTF_8.name();
// MIME type returned by attemptToDetermineContentType() when the name matches one of CSV_SUF.
private static final String TEXT_CSV = "text/csv";
// Content types that carry no useful information about the payload; URLStream.getContentType()
// tries to detect a better type when a file: URL reports one of these.
private static final List<String> UNHELPFUL_TYPES = Arrays.asList(ContentType.APPLICATION_OCTET_STREAM.getMimeType(), "application/gzip", "content/unknown");
// Lower-case name suffixes (plain and gzipped) that attemptToDetermineContentType() maps to XML.
private static final List<String> XML_SUF = Arrays.asList(".xml", ".xml.gz", ".xml.gzip");
// Lower-case name suffixes (plain and gzipped) that attemptToDetermineContentType() maps to JSON.
private static final List<String> JSON_SUF = Arrays.asList(".json", ".json.gz", ".json.gzip");
// Lower-case name suffixes (plain and gzipped) that attemptToDetermineContentType() maps to CSV.
private static final List<String> CSV_SUF = Arrays.asList(".csv", ".csv.gz", ".csv.gzip");
protected String name; protected String name;
protected String sourceInfo; protected String sourceInfo;
protected String contentType; protected String contentType;
@ -62,7 +73,46 @@ public abstract class ContentStreamBase implements ContentStream
} }
return null; return null;
} }
/**
 * Tries to work out the content type of this stream from its {@code name}.
 *
 * <p>The name is compared case-insensitively against the known XML, JSON and CSV
 * suffixes (including their {@code .gz} / {@code .gzip} variants). If none match,
 * the first non-whitespace character of the payload is inspected instead.
 *
 * @return the detected MIME type, or {@code null} when {@code name} is {@code null}
 *         or nothing could be detected
 */
protected String attemptToDetermineContentType() {
  if (name == null) {
    // No name to match against; the stream is deliberately not opened in this case.
    return null;
  }

  // Lower-case once up front instead of once per suffix probe.
  final String lowerName = name.toLowerCase(Locale.ROOT);
  final Predicate<String> endsWith = lowerName::endsWith;

  if (XML_SUF.stream().anyMatch(endsWith)) {
    return ContentType.APPLICATION_XML.getMimeType();
  }
  if (JSON_SUF.stream().anyMatch(endsWith)) {
    return ContentType.APPLICATION_JSON.getMimeType();
  }
  if (CSV_SUF.stream().anyMatch(endsWith)) {
    return TEXT_CSV;
  }

  // No recognised suffix: fall back to sniffing the payload itself.
  return attemptToDetermineTypeFromFirstCharacter();
}
/**
 * Last-ditch content sniffing: opens the stream via {@link #getStream()}, skips leading
 * whitespace and looks at the first remaining character.
 *
 * <p>{@code '<'} is taken to mean XML and <code>'{'</code> to mean JSON. This is strictly
 * best effort: any failure while opening or reading the stream is swallowed and reported
 * as "unknown".
 *
 * @return the XML or JSON MIME type, or {@code null} if the payload starts with anything
 *         else, is empty, or could not be read
 */
private String attemptToDetermineTypeFromFirstCharacter() {
  String type = null;
  try (InputStream stream = getStream()) {
    int data = stream.read();
    // Skip every kind of leading whitespace (tabs, line breaks, ...), not only ' ',
    // so that indented or newline-prefixed payloads are still recognised.
    while (data != -1 && Character.isWhitespace(data)) {
      data = stream.read();
    }
    if (data == '<') {
      type = ContentType.APPLICATION_XML.getMimeType();
    } else if (data == '{') {
      type = ContentType.APPLICATION_JSON.getMimeType();
    }
  } catch (Exception ignored) {
    // Intentionally ignored: detection is optional, so on any failure the
    // caller simply keeps whatever content type it already had.
  }
  return type;
}
//------------------------------------------------------------------------ //------------------------------------------------------------------------
//------------------------------------------------------------------------ //------------------------------------------------------------------------
@ -81,14 +131,33 @@ public abstract class ContentStreamBase implements ContentStream
sourceInfo = "url"; sourceInfo = "url";
} }
/**
 * Returns the content type of this URL stream.
 *
 * <p>For {@code file:} URLs whose current content type is one of the uninformative
 * {@code UNHELPFUL_TYPES}, a better type is derived via
 * {@code attemptToDetermineContentType()} and, if found, cached in {@code contentType}.
 */
@Override
public String getContentType() {
  final boolean isFileUrl = "file".equals(url.getProtocol());
  if (isFileUrl && UNHELPFUL_TYPES.contains(contentType)) {
    // The reported mime type says nothing about the payload; try name/payload detection.
    final String detected = attemptToDetermineContentType();
    if (detected != null) {
      contentType = detected;
    }
  }
  return contentType;
}
@Override @Override
public InputStream getStream() throws IOException { public InputStream getStream() throws IOException {
URLConnection conn = this.url.openConnection(); URLConnection conn = this.url.openConnection();
contentType = conn.getContentType(); contentType = conn.getContentType();
name = url.toExternalForm(); name = url.toExternalForm();
size = (long) conn.getContentLength(); size = conn.getContentLengthLong();
return conn.getInputStream(); InputStream is = conn.getInputStream();
String urlFile = url.getFile().toLowerCase(Locale.ROOT);
if( "gzip".equals(conn.getContentEncoding()) || urlFile.endsWith( ".gz" ) || urlFile.endsWith( ".gzip" )){
is = new GZIPInputStream(is);
}
return is;
} }
} }
@ -111,30 +180,19 @@ public abstract class ContentStreamBase implements ContentStream
@Override @Override
public String getContentType() { public String getContentType() {
if(contentType==null) { if(contentType==null) {
// TODO: this is buggy... does not allow for whitespace, JSON comments, etc. contentType = attemptToDetermineContentType();
InputStream stream = null;
try {
stream = new FileInputStream(file);
char first = (char)stream.read();
if(first == '<') {
return "application/xml";
}
if(first == '{') {
return "application/json";
}
} catch(Exception ex) {
} finally {
if (stream != null) try {
stream.close();
} catch (IOException ioe) {}
}
} }
return contentType; return contentType;
} }
@Override @Override
public InputStream getStream() throws IOException { public InputStream getStream() throws IOException {
return new FileInputStream( file ); InputStream is = new FileInputStream( file );
String lowerName = name.toLowerCase(Locale.ROOT);
if(lowerName.endsWith(".gz") || lowerName.endsWith(".gzip")) {
is = new GZIPInputStream(is);
}
return is;
} }
} }
@ -273,7 +331,7 @@ public abstract class ContentStreamBase implements ContentStream
} }
public ByteArrayStream( byte[] bytes, String source, String contentType ) { public ByteArrayStream( byte[] bytes, String source, String contentType ) {
this.bytes = bytes; this.bytes = bytes;
this.contentType = contentType; this.contentType = contentType;
name = source; name = source;

View File

@ -25,88 +25,174 @@ import java.io.InputStreamReader;
import java.io.Reader; import java.io.Reader;
import java.net.URL; import java.net.URL;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.solr.SolrTestCaseJ4; import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.core.SolrResourceLoader; import org.apache.solr.core.SolrResourceLoader;
/** /**
* Tests {@link ContentStream} such as "stream.file".
*/ */
public class ContentStreamTest extends SolrTestCaseJ4 public class ContentStreamTest extends SolrTestCaseJ4 {
{
public void testStringStream() throws IOException public void testStringStream() throws IOException {
{
String input = "aads ghaskdgasgldj asl sadg ajdsg &jag # @ hjsakg hsakdg hjkas s"; String input = "aads ghaskdgasgldj asl sadg ajdsg &jag # @ hjsakg hsakdg hjkas s";
ContentStreamBase stream = new ContentStreamBase.StringStream( input ); ContentStreamBase stream = new ContentStreamBase.StringStream(input);
assertEquals( input.length(), stream.getSize().intValue() ); assertEquals(input.length(), stream.getSize().intValue());
assertEquals( input, IOUtils.toString( stream.getStream(), "UTF-8" ) ); assertEquals(input, IOUtils.toString(stream.getStream(), "UTF-8"));
assertEquals( input, IOUtils.toString( stream.getReader() ) ); assertEquals(input, IOUtils.toString(stream.getReader()));
} }
public void testFileStream() throws IOException public void testFileStream() throws IOException {
{ File file = new File(createTempDir().toFile(), "README");
File file = null; try (InputStream is = new SolrResourceLoader().openResource("solrj/README");
try (SolrResourceLoader loader = new SolrResourceLoader(); FileOutputStream os = new FileOutputStream(file)) {
InputStream is = loader.openResource( "solrj/README" )) {
assertNotNull(is); assertNotNull(is);
file = new File(createTempDir().toFile(), "README"); IOUtils.copy(is, os);
try (FileOutputStream os = new FileOutputStream(file)) {
IOUtils.copy(is, os);
}
} }
ContentStreamBase stream = new ContentStreamBase.FileStream(file); ContentStreamBase stream = new ContentStreamBase.FileStream(file);
InputStream s = stream.getStream(); try (InputStream s = stream.getStream();
FileInputStream fis = new FileInputStream(file); FileInputStream fis = new FileInputStream(file);
InputStreamReader isr = new InputStreamReader( InputStreamReader isr = new InputStreamReader(
new FileInputStream(file), StandardCharsets.UTF_8); new FileInputStream(file), StandardCharsets.UTF_8);
Reader r = stream.getReader(); Reader r = stream.getReader()) {
try {
assertEquals(file.length(), stream.getSize().intValue()); assertEquals(file.length(), stream.getSize().intValue());
// Test the code that sets content based on < being the 1st character
assertEquals("application/xml", stream.getContentType());
assertTrue(IOUtils.contentEquals(fis, s)); assertTrue(IOUtils.contentEquals(fis, s));
assertTrue(IOUtils.contentEquals(isr, r)); assertTrue(IOUtils.contentEquals(isr, r));
} finally {
s.close();
r.close();
isr.close();
fis.close();
} }
} }
public void testURLStream() throws IOException public void testFileStreamGZIP() throws IOException {
{ File file = new File(createTempDir().toFile(), "README.gz");
File file = null;
FileOutputStream os = null;
try (SolrResourceLoader loader = new SolrResourceLoader(); try (InputStream is = new SolrResourceLoader().openResource("solrj/README");
InputStream is = loader.openResource( "solrj/README" )) { FileOutputStream os = new FileOutputStream(file);
assertNotNull(is); GZIPOutputStream zos = new GZIPOutputStream(os)) {
file = new File(createTempDir().toFile(), "README"); IOUtils.copy(is, zos);
os = new FileOutputStream(file);
IOUtils.copy(is, os);
os.close();
is.close();
} }
ContentStreamBase stream = new ContentStreamBase.URLStream(new URL(file ContentStreamBase stream = new ContentStreamBase.FileStream(file);
.toURI().toASCIIString())); try (InputStream s = stream.getStream();
InputStream s = stream.getStream(); FileInputStream fis = new FileInputStream(file);
FileInputStream fis = new FileInputStream(file); GZIPInputStream zis = new GZIPInputStream(fis);
FileInputStream fis2 = new FileInputStream(file); InputStreamReader isr = new InputStreamReader(zis, StandardCharsets.UTF_8);
InputStreamReader isr = new InputStreamReader(fis, StandardCharsets.UTF_8); FileInputStream fis2 = new FileInputStream(file);
Reader r = stream.getReader(); GZIPInputStream zis2 = new GZIPInputStream(fis2);
try { Reader r = stream.getReader()) {
assertEquals(file.length(), stream.getSize().intValue());
// Test the code that sets content based on < being the 1st character
assertEquals("application/xml", stream.getContentType());
assertTrue(IOUtils.contentEquals(isr, r));
assertTrue(IOUtils.contentEquals(zis2, s));
}
}
/**
 * Copies the solrj README into a temp file and checks that a URLStream over its
 * {@code file:} URL reports the expected content type, size, bytes and characters.
 */
public void testURLStream() throws IOException {
  File file = new File(createTempDir().toFile(), "README");

  // The loader is a resource too: declare it here so it is closed along with the streams.
  try (SolrResourceLoader loader = new SolrResourceLoader();
       InputStream is = loader.openResource("solrj/README");
       FileOutputStream os = new FileOutputStream(file)) {
    assertNotNull(is);
    IOUtils.copy(is, os);
  }

  ContentStreamBase stream = new ContentStreamBase.URLStream(new URL(file.toURI().toASCIIString()));

  try (InputStream s = stream.getStream();
       FileInputStream fis = new FileInputStream(file);
       FileInputStream fis2 = new FileInputStream(file);
       InputStreamReader isr = new InputStreamReader(fis, StandardCharsets.UTF_8);
       Reader r = stream.getReader()) {
    // For file URLs, the content type is whatever the URL connection reports
    // (per the original author: the mime type associated with the file extension).
    // This is inconsistent with FileStream, which guesses the content type from the
    // name suffix or the first character of the payload.
    //
    // For HTTP URLs, the content type is determined by the response headers.
    // Those are not tested here.
    assertEquals("text/html", stream.getContentType());
    assertTrue(IOUtils.contentEquals(fis2, s));
    assertEquals(file.length(), stream.getSize().intValue());
    assertTrue(IOUtils.contentEquals(isr, r));
    assertEquals(file.length(), stream.getSize().intValue());
  }
}
s.close();
isr.close(); public void testURLStreamGZIP() throws IOException {
fis.close(); File file = new File(createTempDir().toFile(), "README.gz");
fis2.close();
try (InputStream is = new SolrResourceLoader().openResource("solrj/README");
FileOutputStream os = new FileOutputStream(file);
GZIPOutputStream zos = new GZIPOutputStream(os)) {
IOUtils.copy(is, zos);
}
ContentStreamBase stream = new ContentStreamBase.URLStream(new URL(file.toURI().toASCIIString()));
try (InputStream s = stream.getStream();
FileInputStream fis = new FileInputStream(file);
GZIPInputStream zis = new GZIPInputStream(fis);
InputStreamReader isr = new InputStreamReader(zis, StandardCharsets.UTF_8);
FileInputStream fis2 = new FileInputStream(file);
GZIPInputStream zis2 = new GZIPInputStream(fis2);
Reader r = stream.getReader()) {
// See the non-GZIP test case for an explanation of header handling.
assertEquals("application/xml", stream.getContentType());
assertTrue(IOUtils.contentEquals(isr, r));
assertTrue(IOUtils.contentEquals(zis2, s));
assertEquals(file.length(), stream.getSize().intValue());
}
}
/**
 * Writes a gzipped copy of the solrj README to a temp file named {@code README.CSV.gz}
 * and checks that a URLStream over its {@code file:} URL reports {@code text/csv}
 * (suffix matching is case-insensitive), the decompressed content, and the compressed
 * file size.
 */
public void testURLStreamCSVGZIPExtention() throws IOException {
  File file = new File(createTempDir().toFile(), "README.CSV.gz");

  // The loader is a resource too: declare it here so it is closed along with the streams.
  try (SolrResourceLoader loader = new SolrResourceLoader();
       InputStream is = loader.openResource("solrj/README");
       FileOutputStream os = new FileOutputStream(file);
       GZIPOutputStream zos = new GZIPOutputStream(os)) {
    assertNotNull(is);
    IOUtils.copy(is, zos);
  }

  ContentStreamBase stream = new ContentStreamBase.URLStream(new URL(file.toURI().toASCIIString()));
  try (InputStream s = stream.getStream();
       FileInputStream fis = new FileInputStream(file);
       GZIPInputStream zis = new GZIPInputStream(fis);
       InputStreamReader isr = new InputStreamReader(zis, StandardCharsets.UTF_8);
       FileInputStream fis2 = new FileInputStream(file);
       GZIPInputStream zis2 = new GZIPInputStream(fis2);
       Reader r = stream.getReader()) {
    // See the non-GZIP test case (testURLStream) for an explanation of header handling.
    assertEquals("text/csv", stream.getContentType());
    assertTrue(IOUtils.contentEquals(isr, r));
    assertTrue(IOUtils.contentEquals(zis2, s));
    assertEquals(file.length(), stream.getSize().intValue());
  }
}
public void testURLStreamJSONGZIPExtention() throws IOException {
File file = new File(createTempDir().toFile(), "README.json.gzip");
try (InputStream is = new SolrResourceLoader().openResource("solrj/README");
FileOutputStream os = new FileOutputStream(file);
GZIPOutputStream zos = new GZIPOutputStream(os)) {
IOUtils.copy(is, zos);
}
ContentStreamBase stream = new ContentStreamBase.URLStream(new URL(file.toURI().toASCIIString()));
try (InputStream s = stream.getStream();
FileInputStream fis = new FileInputStream(file);
GZIPInputStream zis = new GZIPInputStream(fis);
InputStreamReader isr = new InputStreamReader(zis, StandardCharsets.UTF_8);
FileInputStream fis2 = new FileInputStream(file);
GZIPInputStream zis2 = new GZIPInputStream(fis2);
Reader r = stream.getReader()) {
// See the non-GZIP test case for an explanation of header handling.
assertEquals("application/json", stream.getContentType());
assertTrue(IOUtils.contentEquals(isr, r));
assertTrue(IOUtils.contentEquals(zis2, s));
assertEquals(file.length(), stream.getSize().intValue());
} }
} }
} }