SOLR-10981: Support for stream.url or stream.file pointing to gzipped data

2018-10-18 19:53:21 -04:00 · 2018-10-18 19:53:21 -04:00 · 1a8188d92b
parent fd9164801e
commit 1a8188d92b
4 changed files with 230 additions and 79 deletions
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@ -209,6 +209,9 @@ Improvements

 * SOLR-12806: use autoscaling policies with strict=false to prioritize node allocation (noble)

+* SOLR-10981: Support for stream.url or stream.file pointing to gzipped data.  It's detected by either a content
+  encoding header or file extension. (Andrew Lundgren via David Smiley, Jan Høydahl)
+
 ==================  7.5.0 ==================

 Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this release.
--- a/solr/solr-ref-guide/src/content-streams.adoc
+++ b/solr/solr-ref-guide/src/content-streams.adoc
@ -78,6 +78,10 @@ curl -X POST -H 'Content-type: application/json' -d '{"set-property": {"requestD
 If `enableRemoteStreaming="true"` is used, be aware that this allows _anyone_ to send a request to any URL or local file. If the <<Debugging Requests,DumpRequestHandler>> is enabled, it will allow anyone to view any file on your system.
 ====

+The source of the data can be compressed using gzip, and Solr will generally detect this.
+Detection is based on either a `Content-Encoding: gzip` HTTP header or a file name ending in `.gz` or `.gzip`.
+Gzip decompression does not apply to `stream.body`.
+
 == Debugging Requests

 The implicit "dump" RequestHandler (see <<implicit-requesthandlers.adoc#implicit-requesthandlers,Implicit RequestHandlers>>) simply outputs the contents of the Solr QueryRequest using the specified writer type `wt`. This is a useful tool to help understand what streams are available to the RequestHandlers.
--- a/solr/solrj/src/java/org/apache/solr/common/util/ContentStreamBase.java
+++ b/solr/solrj/src/java/org/apache/solr/common/util/ContentStreamBase.java
@ -29,8 +29,13 @@ import java.io.UnsupportedEncodingException;
 import java.net.URL;
 import java.net.URLConnection;
 import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.List;
 import java.util.Locale;
+import java.util.function.Predicate;
+import java.util.zip.GZIPInputStream;

+import org.apache.http.entity.ContentType;
 import org.apache.solr.client.solrj.SolrRequest;
 import org.apache.solr.client.solrj.request.RequestWriter;

@ -42,8 +47,14 @@ import org.apache.solr.client.solrj.request.RequestWriter;
 */
 public abstract class ContentStreamBase implements ContentStream
 {
+
  public static final String DEFAULT_CHARSET = StandardCharsets.UTF_8.name();
-  
+  private static final String TEXT_CSV = "text/csv";
+  private static final List<String> UNHELPFUL_TYPES = Arrays.asList(ContentType.APPLICATION_OCTET_STREAM.getMimeType(), "application/gzip", "content/unknown");
+  private static final List<String> XML_SUF =  Arrays.asList(".xml", ".xml.gz", ".xml.gzip");
+  private static final List<String> JSON_SUF =  Arrays.asList(".json", ".json.gz", ".json.gzip");
+  private static final List<String> CSV_SUF =  Arrays.asList(".csv", ".csv.gz", ".csv.gzip");
+
  protected String name;
  protected String sourceInfo;
  protected String contentType;
@ -62,7 +73,46 @@ public abstract class ContentStreamBase implements ContentStream
    }
    return null;
  }
-  
+
+  /** Guesses a MIME type from the name's suffix, else from the first character; null if the name is null or nothing matches. */
+  protected String attemptToDetermineContentType() {
+    if (name == null) {
+      return null;
+    }
+    final String lowerName = name.toLowerCase(Locale.ROOT);
+    if (XML_SUF.stream().anyMatch(lowerName::endsWith)) {
+      return ContentType.APPLICATION_XML.getMimeType();
+    }
+    if (JSON_SUF.stream().anyMatch(lowerName::endsWith)) {
+      return ContentType.APPLICATION_JSON.getMimeType();
+    }
+    if (CSV_SUF.stream().anyMatch(lowerName::endsWith)) {
+      return TEXT_CSV;
+    }
+    return attemptToDetermineTypeFromFirstCharacter();
+  }
+
+  /** Last-ditch sniffing: after leading whitespace, {@code <} means XML and '{' means JSON; null otherwise or on any error. */
+  private String attemptToDetermineTypeFromFirstCharacter() {
+    String type = null;
+    try (InputStream stream = getStream()) {
+      // Skip every leading whitespace byte (tabs and line breaks too, not just ' ').
+      int data = stream.read();
+      while (data != -1 && Character.isWhitespace(data)) {
+        data = stream.read();
+      }
+      if (data == '<') {
+        type = ContentType.APPLICATION_XML.getMimeType();
+      } else if (data == '{') {
+        type = ContentType.APPLICATION_JSON.getMimeType();
+      }
+    } catch (Exception ex) {
+      // Deliberately best effort: a stream that cannot be opened or read just
+      // means the type stays undetermined (null); the caller keeps what it had.
+    }
+    return type;
+  }
+
  //------------------------------------------------------------------------
  //------------------------------------------------------------------------
  
@ -81,14 +131,33 @@ public abstract class ContentStreamBase implements ContentStream
      sourceInfo = "url";
    }

+    @Override
+    public String getContentType() {
+      // A file:// URL may report only a generic type (one of UNHELPFUL_TYPES);
+      // in that case try to work out something more specific for the payload.
+      boolean localFile = "file".equals(url.getProtocol());
+      if (localFile && UNHELPFUL_TYPES.contains(contentType)) {
+        String detected = attemptToDetermineContentType();
+        if (detected != null) {
+          contentType = detected;
+        }
+      }
+      return contentType;
+    }
+
    @Override
    public InputStream getStream() throws IOException {
      URLConnection conn = this.url.openConnection();
      
      contentType = conn.getContentType();
      name = url.toExternalForm();
-      size = (long) conn.getContentLength();
-      return conn.getInputStream();
+      size = conn.getContentLengthLong();
+      InputStream is = conn.getInputStream();
+      String urlFile = url.getFile().toLowerCase(Locale.ROOT);
+      if( "gzip".equals(conn.getContentEncoding()) || urlFile.endsWith( ".gz" ) || urlFile.endsWith( ".gzip" )){
+        is = new GZIPInputStream(is);
+      }
+      return is;
    }
  }
  
@ -111,30 +180,19 @@ public abstract class ContentStreamBase implements ContentStream
    @Override
    public String getContentType() {
      if(contentType==null) {
-        // TODO: this is buggy... does not allow for whitespace, JSON comments, etc.
-        InputStream stream = null;
-        try {
-          stream = new FileInputStream(file);
-          char first = (char)stream.read();
-          if(first == '<') {
-            return "application/xml";
-          }
-          if(first == '{') {
-            return "application/json";
-          }
-        } catch(Exception ex) {
-        } finally {
-          if (stream != null) try {
-            stream.close();
-          } catch (IOException ioe) {}
-        }
+        contentType = attemptToDetermineContentType();
      }
      return contentType;
    }

    @Override
    public InputStream getStream() throws IOException {
-      return new FileInputStream( file );
+      InputStream is = new FileInputStream( file );
+      String lowerName = name.toLowerCase(Locale.ROOT);
+      if(lowerName.endsWith(".gz") || lowerName.endsWith(".gzip")) {
+        is = new GZIPInputStream(is);
+      }
+      return is;
    }
  }
  
@ -273,7 +331,7 @@ public abstract class ContentStreamBase implements ContentStream
    }
    
    public ByteArrayStream( byte[] bytes, String source, String contentType ) {
-      this.bytes = bytes; 
+      this.bytes = bytes;
      
      this.contentType = contentType;
      name = source;
--- a/solr/solrj/src/test/org/apache/solr/common/util/ContentStreamTest.java
+++ b/solr/solrj/src/test/org/apache/solr/common/util/ContentStreamTest.java
@ -25,88 +25,174 @@ import java.io.InputStreamReader;
 import java.io.Reader;
 import java.net.URL;
 import java.nio.charset.StandardCharsets;
+import java.util.zip.GZIPInputStream;
+import java.util.zip.GZIPOutputStream;

 import org.apache.commons.io.IOUtils;
 import org.apache.solr.SolrTestCaseJ4;
 import org.apache.solr.core.SolrResourceLoader;

 /**
+ * Tests {@link ContentStream} such as "stream.file".
 */
-public class ContentStreamTest extends SolrTestCaseJ4 
-{  
-  public void testStringStream() throws IOException 
-  {
+public class ContentStreamTest extends SolrTestCaseJ4 {
+
+  public void testStringStream() throws IOException {
    String input = "aads ghaskdgasgldj asl sadg ajdsg &jag # @ hjsakg hsakdg hjkas s";
-    ContentStreamBase stream = new ContentStreamBase.StringStream( input );
-    assertEquals( input.length(), stream.getSize().intValue() );
-    assertEquals( input, IOUtils.toString( stream.getStream(), "UTF-8" ) );
-    assertEquals( input, IOUtils.toString( stream.getReader() ) );
+    ContentStreamBase stream = new ContentStreamBase.StringStream(input);
+    assertEquals(input.length(), stream.getSize().intValue());
+    assertEquals(input, IOUtils.toString(stream.getStream(), "UTF-8"));
+    assertEquals(input, IOUtils.toString(stream.getReader()));
  }

-  public void testFileStream() throws IOException 
-  {
-    File file = null;
-    try (SolrResourceLoader loader = new SolrResourceLoader();
-         InputStream is = loader.openResource( "solrj/README" )) {
+  public void testFileStream() throws IOException {
+    File file = new File(createTempDir().toFile(), "README");
+    try (InputStream is = new SolrResourceLoader().openResource("solrj/README");
+         FileOutputStream os = new FileOutputStream(file)) {
      assertNotNull(is);
-      file = new File(createTempDir().toFile(), "README");
-      try (FileOutputStream os = new FileOutputStream(file)) {
-        IOUtils.copy(is, os);
-      }
+      IOUtils.copy(is, os);
    }

    ContentStreamBase stream = new ContentStreamBase.FileStream(file);
-    InputStream s = stream.getStream();
-    FileInputStream fis = new FileInputStream(file);
-    InputStreamReader isr = new InputStreamReader(
-        new FileInputStream(file), StandardCharsets.UTF_8);
-    Reader r = stream.getReader();
-    try {
+    try (InputStream s = stream.getStream();
+         FileInputStream fis = new FileInputStream(file);
+         InputStreamReader isr = new InputStreamReader(
+             new FileInputStream(file), StandardCharsets.UTF_8);
+         Reader r = stream.getReader()) {
      assertEquals(file.length(), stream.getSize().intValue());
+      // Test the code that sets content based on < being the 1st character
+      assertEquals("application/xml", stream.getContentType());
      assertTrue(IOUtils.contentEquals(fis, s));
      assertTrue(IOUtils.contentEquals(isr, r));
-    } finally {
-      s.close();
-      r.close();
-      isr.close();
-      fis.close();
    }
  }
-  

-  public void testURLStream() throws IOException 
-  {
-    File file = null;
-    FileOutputStream os = null;
+  public void testFileStreamGZIP() throws IOException {
+    File file = new File(createTempDir().toFile(), "README.gz");

-    try (SolrResourceLoader loader = new SolrResourceLoader();
-         InputStream is = loader.openResource( "solrj/README" )) {
-      assertNotNull(is);
-      file = new File(createTempDir().toFile(), "README");
-      os = new FileOutputStream(file);
-      IOUtils.copy(is, os);
-      os.close();
-      is.close();
+    try (InputStream is = new SolrResourceLoader().openResource("solrj/README");
+         FileOutputStream os = new FileOutputStream(file);
+         GZIPOutputStream zos = new GZIPOutputStream(os)) {
+      IOUtils.copy(is, zos);
    }
-    
-    ContentStreamBase stream = new ContentStreamBase.URLStream(new URL(file
-        .toURI().toASCIIString()));
-    InputStream s = stream.getStream();
-    FileInputStream fis = new FileInputStream(file);
-    FileInputStream fis2 = new FileInputStream(file);
-    InputStreamReader isr = new InputStreamReader(fis, StandardCharsets.UTF_8);
-    Reader r = stream.getReader();
-    try {
+
+    ContentStreamBase stream = new ContentStreamBase.FileStream(file);
+    try (InputStream s = stream.getStream();
+         FileInputStream fis = new FileInputStream(file);
+         GZIPInputStream zis = new GZIPInputStream(fis);
+         InputStreamReader isr = new InputStreamReader(zis, StandardCharsets.UTF_8);
+         FileInputStream fis2 = new FileInputStream(file);
+         GZIPInputStream zis2 = new GZIPInputStream(fis2);
+         Reader r = stream.getReader()) {
+      assertEquals(file.length(), stream.getSize().intValue());
+      // Test the code that sets content based on < being the 1st character
+      assertEquals("application/xml", stream.getContentType());
+      assertTrue(IOUtils.contentEquals(isr, r));
+      assertTrue(IOUtils.contentEquals(zis2, s));
+    }
+  }
+
+  public void testURLStream() throws IOException {
+    File file = new File(createTempDir().toFile(), "README");
+
+    try (InputStream is = new SolrResourceLoader().openResource("solrj/README");
+         FileOutputStream os = new FileOutputStream(file)) {
+      IOUtils.copy(is, os);
+    }
+
+    ContentStreamBase stream = new ContentStreamBase.URLStream(new URL(file.toURI().toASCIIString()));
+
+    try (InputStream s = stream.getStream();
+         FileInputStream fis = new FileInputStream(file);
+         FileInputStream fis2 = new FileInputStream(file);
+         InputStreamReader isr = new InputStreamReader(fis, StandardCharsets.UTF_8);
+         Reader r = stream.getReader()) {
+      // For file URLs, the content type is determined automatically from the MIME type
+      // associated with the file extension.
+      // This is inconsistent with FileStream, which tries to guess the content type from the 1st character.
+      //
+      // For HTTP URLs, the content type is determined by the headers.  Those are not tested here.
+      //
+      assertEquals("text/html", stream.getContentType());
      assertTrue(IOUtils.contentEquals(fis2, s));
      assertEquals(file.length(), stream.getSize().intValue());
      assertTrue(IOUtils.contentEquals(isr, r));
      assertEquals(file.length(), stream.getSize().intValue());
-    } finally {
-      r.close();
-      s.close();
-      isr.close();
-      fis.close();
-      fis2.close();
+    }
+  }
+
+  public void testURLStreamGZIP() throws IOException {
+    File file = new File(createTempDir().toFile(), "README.gz");
+
+    try (InputStream is = new SolrResourceLoader().openResource("solrj/README");
+         FileOutputStream os = new FileOutputStream(file);
+         GZIPOutputStream zos = new GZIPOutputStream(os)) {
+      IOUtils.copy(is, zos);
+    }
+
+    ContentStreamBase stream = new ContentStreamBase.URLStream(new URL(file.toURI().toASCIIString()));
+    try (InputStream s = stream.getStream();
+         FileInputStream fis = new FileInputStream(file);
+         GZIPInputStream zis = new GZIPInputStream(fis);
+         InputStreamReader isr = new InputStreamReader(zis, StandardCharsets.UTF_8);
+         FileInputStream fis2 = new FileInputStream(file);
+         GZIPInputStream zis2 = new GZIPInputStream(fis2);
+         Reader r = stream.getReader()) {
+      // README.gz has no .xml/.json/.csv part, so the type presumably comes from sniffing the decompressed content.
+      assertEquals("application/xml", stream.getContentType());
+      assertTrue(IOUtils.contentEquals(isr, r));
+      assertTrue(IOUtils.contentEquals(zis2, s));
+      assertEquals(file.length(), stream.getSize().intValue());
+    }
+  }
+
+  public void testURLStreamCSVGZIPExtention() throws IOException {
+    File file = new File(createTempDir().toFile(), "README.CSV.gz");
+
+    try (InputStream is = new SolrResourceLoader().openResource("solrj/README");
+         FileOutputStream os = new FileOutputStream(file);
+         GZIPOutputStream zos = new GZIPOutputStream(os)) {
+      IOUtils.copy(is, zos);
+    }
+
+    ContentStreamBase stream = new ContentStreamBase.URLStream(new URL(file.toURI().toASCIIString()));
+    try (InputStream s = stream.getStream();
+         FileInputStream fis = new FileInputStream(file);
+         GZIPInputStream zis = new GZIPInputStream(fis);
+         InputStreamReader isr = new InputStreamReader(zis, StandardCharsets.UTF_8);
+         FileInputStream fis2 = new FileInputStream(file);
+         GZIPInputStream zis2 = new GZIPInputStream(fis2);
+         Reader r = stream.getReader()) {
+      // The upper-case ".CSV.gz" suffix should still map to text/csv: the name is lower-cased before suffix matching.
+      assertEquals("text/csv", stream.getContentType());
+      assertTrue(IOUtils.contentEquals(isr, r));
+      assertTrue(IOUtils.contentEquals(zis2, s));
+      assertEquals(file.length(), stream.getSize().intValue());
+    }
+  }
+
+  public void testURLStreamJSONGZIPExtention() throws IOException {
+    File file = new File(createTempDir().toFile(), "README.json.gzip");
+
+    try (InputStream is = new SolrResourceLoader().openResource("solrj/README");
+         FileOutputStream os = new FileOutputStream(file);
+         GZIPOutputStream zos = new GZIPOutputStream(os)) {
+      IOUtils.copy(is, zos);
+    }
+
+    ContentStreamBase stream = new ContentStreamBase.URLStream(new URL(file.toURI().toASCIIString()));
+    try (InputStream s = stream.getStream();
+         FileInputStream fis = new FileInputStream(file);
+         GZIPInputStream zis = new GZIPInputStream(fis);
+         InputStreamReader isr = new InputStreamReader(zis, StandardCharsets.UTF_8);
+         FileInputStream fis2 = new FileInputStream(file);
+         GZIPInputStream zis2 = new GZIPInputStream(fis2);
+         Reader r = stream.getReader()) {
+      // See the non-GZIP test case for an explanation of header handling.
+      assertEquals("application/json", stream.getContentType());
+      assertTrue(IOUtils.contentEquals(isr, r));
+      assertTrue(IOUtils.contentEquals(zis2, s));
+      assertEquals(file.length(), stream.getSize().intValue());
    }
  }
 }