mirror of https://github.com/apache/lucene.git
SOLR-10981: Support for stream.url or stream.file pointing to gzipped data
This commit is contained in:
parent
fd9164801e
commit
1a8188d92b
|
@ -209,6 +209,9 @@ Improvements
|
|||
|
||||
* SOLR-12806: use autoscaling policies with strict=false to prioritize node allocation (noble)
|
||||
|
||||
* SOLR-10981: Support for stream.url or stream.file pointing to gzipped data. It's detected by either a content
|
||||
encoding header or file extension. (Andrew Lundgren via David Smiley, Jan Høydahl)
|
||||
|
||||
================== 7.5.0 ==================
|
||||
|
||||
Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this release.
|
||||
|
|
|
@ -78,6 +78,10 @@ curl -X POST -H 'Content-type: application/json' -d '{"set-property": {"requestD
|
|||
If `enableRemoteStreaming="true"` is used, be aware that this allows _anyone_ to send a request to any URL or local file. If the <<Debugging Requests,DumpRequestHandler>> is enabled, it will allow anyone to view any file on your system.
|
||||
====
|
||||
|
||||
The source of the data can be compressed using gzip, and Solr will generally detect this.
|
||||
The detection is based on either the presence of a `Content-Encoding: gzip` HTTP header or the file name ending with `.gz` or `.gzip`.
|
||||
Gzip doesn't apply to `stream.body`.
|
||||
|
||||
== Debugging Requests
|
||||
|
||||
The implicit "dump" RequestHandler (see <<implicit-requesthandlers.adoc#implicit-requesthandlers,Implicit RequestHandlers>>) simply outputs the contents of the Solr QueryRequest using the specified writer type `wt`. This is a useful tool to help understand what streams are available to the RequestHandlers.
|
||||
|
|
|
@ -29,8 +29,13 @@ import java.io.UnsupportedEncodingException;
|
|||
import java.net.URL;
|
||||
import java.net.URLConnection;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.function.Predicate;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
|
||||
import org.apache.http.entity.ContentType;
|
||||
import org.apache.solr.client.solrj.SolrRequest;
|
||||
import org.apache.solr.client.solrj.request.RequestWriter;
|
||||
|
||||
|
@ -42,7 +47,13 @@ import org.apache.solr.client.solrj.request.RequestWriter;
|
|||
*/
|
||||
public abstract class ContentStreamBase implements ContentStream
|
||||
{
|
||||
|
||||
// Name of the UTF-8 charset.
public static final String DEFAULT_CHARSET = StandardCharsets.UTF_8.name();
|
||||
// MIME type reported when the stream name carries one of the CSV_SUF suffixes.
private static final String TEXT_CSV = "text/csv";
|
||||
// Content types that say nothing about the payload format; URLStream.getContentType()
// tries to detect a better type for file: URLs when it sees one of these.
private static final List<String> UNHELPFUL_TYPES = Arrays.asList(ContentType.APPLICATION_OCTET_STREAM.getMimeType(), "application/gzip", "content/unknown");
|
||||
// Name suffixes mapped to XML. They are matched against the lower-cased name,
// so every entry must itself be lower case.
private static final List<String> XML_SUF = Arrays.asList(".xml", ".xml.gz", ".xml.gzip");
|
||||
// Name suffixes mapped to JSON (lower case, see XML_SUF).
private static final List<String> JSON_SUF = Arrays.asList(".json", ".json.gz", ".json.gzip");
|
||||
// Name suffixes mapped to CSV (lower case, see XML_SUF).
private static final List<String> CSV_SUF = Arrays.asList(".csv", ".csv.gz", ".csv.gzip");
|
||||
|
||||
protected String name;
|
||||
protected String sourceInfo;
|
||||
|
@ -63,6 +74,45 @@ public abstract class ContentStreamBase implements ContentStream
|
|||
return null;
|
||||
}
|
||||
|
||||
protected String attemptToDetermineContentType() {
|
||||
String type = null;
|
||||
if (name != null) {
|
||||
Predicate<String> endsWith = suffix->name.toLowerCase(Locale.ROOT).endsWith(suffix);
|
||||
|
||||
if (XML_SUF.stream().anyMatch(endsWith)) {
|
||||
type = ContentType.APPLICATION_XML.getMimeType();
|
||||
} else if (JSON_SUF.stream().anyMatch(endsWith)) {
|
||||
type = ContentType.APPLICATION_JSON.getMimeType();
|
||||
} else if (CSV_SUF.stream().anyMatch(endsWith)) {
|
||||
type = TEXT_CSV;
|
||||
} else {
|
||||
type = attemptToDetermineTypeFromFirstCharacter();
|
||||
}
|
||||
}
|
||||
return type;
|
||||
}
|
||||
|
||||
private String attemptToDetermineTypeFromFirstCharacter() {
|
||||
String type = null;
|
||||
try (InputStream stream = getStream()) {
|
||||
// Last ditch effort to determine content, if the first non-white space
|
||||
// is a '<' or '{', assume xml or json.
|
||||
int data = stream.read();
|
||||
while (( data != -1 ) && ( ( (char)data ) == ' ' )) {
|
||||
data = stream.read();
|
||||
}
|
||||
if ((char)data == '<') {
|
||||
type = ContentType.APPLICATION_XML.getMimeType();
|
||||
} else if ((char)data == '{') {
|
||||
type = ContentType.APPLICATION_JSON.getMimeType();
|
||||
}
|
||||
} catch (Exception ex) {
|
||||
// This code just eats, the exception and leaves
|
||||
// the contentType untouched.
|
||||
}
|
||||
return type;
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
//------------------------------------------------------------------------
|
||||
|
||||
|
@ -81,14 +131,33 @@ public abstract class ContentStreamBase implements ContentStream
|
|||
sourceInfo = "url";
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getContentType() {
|
||||
// for file:// streams that are octet-streams, try to determine the payload
|
||||
// type from payload rather than just using the mime type.
|
||||
if ("file".equals(url.getProtocol())) {
|
||||
Predicate<String> equals = mimeType->mimeType.equals(contentType);
|
||||
if (UNHELPFUL_TYPES.stream().anyMatch(equals)) {
|
||||
String type = attemptToDetermineContentType();
|
||||
contentType = ( type != null ) ? type : contentType;
|
||||
}
|
||||
}
|
||||
return contentType;
|
||||
}
|
||||
|
||||
@Override
|
||||
public InputStream getStream() throws IOException {
|
||||
URLConnection conn = this.url.openConnection();
|
||||
|
||||
contentType = conn.getContentType();
|
||||
name = url.toExternalForm();
|
||||
size = (long) conn.getContentLength();
|
||||
return conn.getInputStream();
|
||||
size = conn.getContentLengthLong();
|
||||
InputStream is = conn.getInputStream();
|
||||
String urlFile = url.getFile().toLowerCase(Locale.ROOT);
|
||||
if( "gzip".equals(conn.getContentEncoding()) || urlFile.endsWith( ".gz" ) || urlFile.endsWith( ".gzip" )){
|
||||
is = new GZIPInputStream(is);
|
||||
}
|
||||
return is;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -111,30 +180,19 @@ public abstract class ContentStreamBase implements ContentStream
|
|||
@Override
|
||||
public String getContentType() {
|
||||
if(contentType==null) {
|
||||
// TODO: this is buggy... does not allow for whitespace, JSON comments, etc.
|
||||
InputStream stream = null;
|
||||
try {
|
||||
stream = new FileInputStream(file);
|
||||
char first = (char)stream.read();
|
||||
if(first == '<') {
|
||||
return "application/xml";
|
||||
}
|
||||
if(first == '{') {
|
||||
return "application/json";
|
||||
}
|
||||
} catch(Exception ex) {
|
||||
} finally {
|
||||
if (stream != null) try {
|
||||
stream.close();
|
||||
} catch (IOException ioe) {}
|
||||
}
|
||||
contentType = attemptToDetermineContentType();
|
||||
}
|
||||
return contentType;
|
||||
}
|
||||
|
||||
@Override
|
||||
public InputStream getStream() throws IOException {
|
||||
return new FileInputStream( file );
|
||||
InputStream is = new FileInputStream( file );
|
||||
String lowerName = name.toLowerCase(Locale.ROOT);
|
||||
if(lowerName.endsWith(".gz") || lowerName.endsWith(".gzip")) {
|
||||
is = new GZIPInputStream(is);
|
||||
}
|
||||
return is;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -25,88 +25,174 @@ import java.io.InputStreamReader;
|
|||
import java.io.Reader;
|
||||
import java.net.URL;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.zip.GZIPInputStream;
|
||||
import java.util.zip.GZIPOutputStream;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.solr.SolrTestCaseJ4;
|
||||
import org.apache.solr.core.SolrResourceLoader;
|
||||
|
||||
/**
|
||||
* Tests {@link ContentStream} such as "stream.file".
|
||||
*/
|
||||
public class ContentStreamTest extends SolrTestCaseJ4
|
||||
{
|
||||
public void testStringStream() throws IOException
|
||||
{
|
||||
public class ContentStreamTest extends SolrTestCaseJ4 {
|
||||
|
||||
public void testStringStream() throws IOException {
|
||||
String input = "aads ghaskdgasgldj asl sadg ajdsg &jag # @ hjsakg hsakdg hjkas s";
|
||||
ContentStreamBase stream = new ContentStreamBase.StringStream( input );
|
||||
assertEquals( input.length(), stream.getSize().intValue() );
|
||||
assertEquals( input, IOUtils.toString( stream.getStream(), "UTF-8" ) );
|
||||
assertEquals( input, IOUtils.toString( stream.getReader() ) );
|
||||
ContentStreamBase stream = new ContentStreamBase.StringStream(input);
|
||||
assertEquals(input.length(), stream.getSize().intValue());
|
||||
assertEquals(input, IOUtils.toString(stream.getStream(), "UTF-8"));
|
||||
assertEquals(input, IOUtils.toString(stream.getReader()));
|
||||
}
|
||||
|
||||
public void testFileStream() throws IOException
|
||||
{
|
||||
File file = null;
|
||||
try (SolrResourceLoader loader = new SolrResourceLoader();
|
||||
InputStream is = loader.openResource( "solrj/README" )) {
|
||||
public void testFileStream() throws IOException {
|
||||
File file = new File(createTempDir().toFile(), "README");
|
||||
try (InputStream is = new SolrResourceLoader().openResource("solrj/README");
|
||||
FileOutputStream os = new FileOutputStream(file)) {
|
||||
assertNotNull(is);
|
||||
file = new File(createTempDir().toFile(), "README");
|
||||
try (FileOutputStream os = new FileOutputStream(file)) {
|
||||
IOUtils.copy(is, os);
|
||||
}
|
||||
IOUtils.copy(is, os);
|
||||
}
|
||||
|
||||
ContentStreamBase stream = new ContentStreamBase.FileStream(file);
|
||||
InputStream s = stream.getStream();
|
||||
FileInputStream fis = new FileInputStream(file);
|
||||
InputStreamReader isr = new InputStreamReader(
|
||||
new FileInputStream(file), StandardCharsets.UTF_8);
|
||||
Reader r = stream.getReader();
|
||||
try {
|
||||
try (InputStream s = stream.getStream();
|
||||
FileInputStream fis = new FileInputStream(file);
|
||||
InputStreamReader isr = new InputStreamReader(
|
||||
new FileInputStream(file), StandardCharsets.UTF_8);
|
||||
Reader r = stream.getReader()) {
|
||||
assertEquals(file.length(), stream.getSize().intValue());
|
||||
// Test the code that sets content based on < being the 1st character
|
||||
assertEquals("application/xml", stream.getContentType());
|
||||
assertTrue(IOUtils.contentEquals(fis, s));
|
||||
assertTrue(IOUtils.contentEquals(isr, r));
|
||||
} finally {
|
||||
s.close();
|
||||
r.close();
|
||||
isr.close();
|
||||
fis.close();
|
||||
}
|
||||
}
|
||||
|
||||
public void testFileStreamGZIP() throws IOException {
|
||||
File file = new File(createTempDir().toFile(), "README.gz");
|
||||
|
||||
public void testURLStream() throws IOException
|
||||
{
|
||||
File file = null;
|
||||
FileOutputStream os = null;
|
||||
|
||||
try (SolrResourceLoader loader = new SolrResourceLoader();
|
||||
InputStream is = loader.openResource( "solrj/README" )) {
|
||||
assertNotNull(is);
|
||||
file = new File(createTempDir().toFile(), "README");
|
||||
os = new FileOutputStream(file);
|
||||
IOUtils.copy(is, os);
|
||||
os.close();
|
||||
is.close();
|
||||
try (InputStream is = new SolrResourceLoader().openResource("solrj/README");
|
||||
FileOutputStream os = new FileOutputStream(file);
|
||||
GZIPOutputStream zos = new GZIPOutputStream(os)) {
|
||||
IOUtils.copy(is, zos);
|
||||
}
|
||||
|
||||
ContentStreamBase stream = new ContentStreamBase.URLStream(new URL(file
|
||||
.toURI().toASCIIString()));
|
||||
InputStream s = stream.getStream();
|
||||
FileInputStream fis = new FileInputStream(file);
|
||||
FileInputStream fis2 = new FileInputStream(file);
|
||||
InputStreamReader isr = new InputStreamReader(fis, StandardCharsets.UTF_8);
|
||||
Reader r = stream.getReader();
|
||||
try {
|
||||
ContentStreamBase stream = new ContentStreamBase.FileStream(file);
|
||||
try (InputStream s = stream.getStream();
|
||||
FileInputStream fis = new FileInputStream(file);
|
||||
GZIPInputStream zis = new GZIPInputStream(fis);
|
||||
InputStreamReader isr = new InputStreamReader(zis, StandardCharsets.UTF_8);
|
||||
FileInputStream fis2 = new FileInputStream(file);
|
||||
GZIPInputStream zis2 = new GZIPInputStream(fis2);
|
||||
Reader r = stream.getReader()) {
|
||||
assertEquals(file.length(), stream.getSize().intValue());
|
||||
// Test the code that sets content based on < being the 1st character
|
||||
assertEquals("application/xml", stream.getContentType());
|
||||
assertTrue(IOUtils.contentEquals(isr, r));
|
||||
assertTrue(IOUtils.contentEquals(zis2, s));
|
||||
}
|
||||
}
|
||||
|
||||
public void testURLStream() throws IOException {
|
||||
File file = new File(createTempDir().toFile(), "README");
|
||||
|
||||
try (InputStream is = new SolrResourceLoader().openResource("solrj/README");
|
||||
FileOutputStream os = new FileOutputStream(file)) {
|
||||
IOUtils.copy(is, os);
|
||||
}
|
||||
|
||||
ContentStreamBase stream = new ContentStreamBase.URLStream(new URL(file.toURI().toASCIIString()));
|
||||
|
||||
try (InputStream s = stream.getStream();
|
||||
FileInputStream fis = new FileInputStream(file);
|
||||
FileInputStream fis2 = new FileInputStream(file);
|
||||
InputStreamReader isr = new InputStreamReader(fis, StandardCharsets.UTF_8);
|
||||
Reader r = stream.getReader()) {
|
||||
// For File URLs, the content type is determined automatically by the mime type
|
||||
// associated with the file extension,
|
||||
// This is inconsistent with FileStream, which tries to guess the content type from the 1st character.
|
||||
//
|
||||
// HTTP URLS, the content type is determined by the headers. Those are not tested here.
|
||||
//
|
||||
assertEquals("text/html", stream.getContentType());
|
||||
assertTrue(IOUtils.contentEquals(fis2, s));
|
||||
assertEquals(file.length(), stream.getSize().intValue());
|
||||
assertTrue(IOUtils.contentEquals(isr, r));
|
||||
assertEquals(file.length(), stream.getSize().intValue());
|
||||
} finally {
|
||||
r.close();
|
||||
s.close();
|
||||
isr.close();
|
||||
fis.close();
|
||||
fis2.close();
|
||||
}
|
||||
}
|
||||
|
||||
public void testURLStreamGZIP() throws IOException {
|
||||
File file = new File(createTempDir().toFile(), "README.gz");
|
||||
|
||||
try (InputStream is = new SolrResourceLoader().openResource("solrj/README");
|
||||
FileOutputStream os = new FileOutputStream(file);
|
||||
GZIPOutputStream zos = new GZIPOutputStream(os)) {
|
||||
IOUtils.copy(is, zos);
|
||||
}
|
||||
|
||||
ContentStreamBase stream = new ContentStreamBase.URLStream(new URL(file.toURI().toASCIIString()));
|
||||
try (InputStream s = stream.getStream();
|
||||
FileInputStream fis = new FileInputStream(file);
|
||||
GZIPInputStream zis = new GZIPInputStream(fis);
|
||||
InputStreamReader isr = new InputStreamReader(zis, StandardCharsets.UTF_8);
|
||||
FileInputStream fis2 = new FileInputStream(file);
|
||||
GZIPInputStream zis2 = new GZIPInputStream(fis2);
|
||||
Reader r = stream.getReader()) {
|
||||
// See the non-GZIP test case for an explanation of header handling.
|
||||
assertEquals("application/xml", stream.getContentType());
|
||||
assertTrue(IOUtils.contentEquals(isr, r));
|
||||
assertTrue(IOUtils.contentEquals(zis2, s));
|
||||
assertEquals(file.length(), stream.getSize().intValue());
|
||||
}
|
||||
}
|
||||
|
||||
public void testURLStreamCSVGZIPExtention() throws IOException {
|
||||
File file = new File(createTempDir().toFile(), "README.CSV.gz");
|
||||
|
||||
try (InputStream is = new SolrResourceLoader().openResource("solrj/README");
|
||||
FileOutputStream os = new FileOutputStream(file);
|
||||
GZIPOutputStream zos = new GZIPOutputStream(os)) {
|
||||
IOUtils.copy(is, zos);
|
||||
}
|
||||
|
||||
ContentStreamBase stream = new ContentStreamBase.URLStream(new URL(file.toURI().toASCIIString()));
|
||||
try (InputStream s = stream.getStream();
|
||||
FileInputStream fis = new FileInputStream(file);
|
||||
GZIPInputStream zis = new GZIPInputStream(fis);
|
||||
InputStreamReader isr = new InputStreamReader(zis, StandardCharsets.UTF_8);
|
||||
FileInputStream fis2 = new FileInputStream(file);
|
||||
GZIPInputStream zis2 = new GZIPInputStream(fis2);
|
||||
Reader r = stream.getReader()) {
|
||||
// See the non-GZIP test case for an explanation of header handling.
|
||||
assertEquals("text/csv", stream.getContentType());
|
||||
assertTrue(IOUtils.contentEquals(isr, r));
|
||||
assertTrue(IOUtils.contentEquals(zis2, s));
|
||||
assertEquals(file.length(), stream.getSize().intValue());
|
||||
}
|
||||
}
|
||||
|
||||
public void testURLStreamJSONGZIPExtention() throws IOException {
|
||||
File file = new File(createTempDir().toFile(), "README.json.gzip");
|
||||
|
||||
try (InputStream is = new SolrResourceLoader().openResource("solrj/README");
|
||||
FileOutputStream os = new FileOutputStream(file);
|
||||
GZIPOutputStream zos = new GZIPOutputStream(os)) {
|
||||
IOUtils.copy(is, zos);
|
||||
}
|
||||
|
||||
ContentStreamBase stream = new ContentStreamBase.URLStream(new URL(file.toURI().toASCIIString()));
|
||||
try (InputStream s = stream.getStream();
|
||||
FileInputStream fis = new FileInputStream(file);
|
||||
GZIPInputStream zis = new GZIPInputStream(fis);
|
||||
InputStreamReader isr = new InputStreamReader(zis, StandardCharsets.UTF_8);
|
||||
FileInputStream fis2 = new FileInputStream(file);
|
||||
GZIPInputStream zis2 = new GZIPInputStream(fis2);
|
||||
Reader r = stream.getReader()) {
|
||||
// See the non-GZIP test case for an explanation of header handling.
|
||||
assertEquals("application/json", stream.getContentType());
|
||||
assertTrue(IOUtils.contentEquals(isr, r));
|
||||
assertTrue(IOUtils.contentEquals(zis2, s));
|
||||
assertEquals(file.length(), stream.getSize().intValue());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue