SOLR-10981: Support for stream.url or stream.file pointing to gzipped data

2018-10-18 19:53:21 -04:00 · 2018-10-18 19:53:21 -04:00 · 1a8188d92b
parent fd9164801e
commit 1a8188d92b
4 changed files with 230 additions and 79 deletions
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@ -209,6 +209,9 @@ Improvements
 * SOLR-12806: use autoscaling policies with strict=false to prioritize node allocation (noble)
 * SOLR-10981: Support for stream.url or stream.file pointing to gzipped data.  It's detected by either a content
  encoding header or file extension. (Andrew Lundgren via David Smiley, Jan Høydahl)
 ==================  7.5.0 ==================
 Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this release.
--- a/solr/solr-ref-guide/src/content-streams.adoc
+++ b/solr/solr-ref-guide/src/content-streams.adoc
@ -78,6 +78,10 @@ curl -X POST -H 'Content-type: application/json' -d '{"set-property": {"requestD
 If `enableRemoteStreaming="true"` is used, be aware that this allows _anyone_ to send a request to any URL or local file. If the <<Debugging Requests,DumpRequestHandler>> is enabled, it will allow anyone to view any file on your system.
 ====
 The source of the data can be compressed using gzip, and Solr will generally detect this.
 The detection is based on either the presence of a `Content-Encoding: gzip` HTTP header or the file name ending with `.gz` or `.gzip`.
 Gzip doesn't apply to `stream.body`.
 == Debugging Requests
 The implicit "dump" RequestHandler (see <<implicit-requesthandlers.adoc#implicit-requesthandlers,Implicit RequestHandlers>>) simply outputs the contents of the Solr QueryRequest using the specified writer type `wt`. This is a useful tool to help understand what streams are available to the RequestHandlers.
--- a/solr/solrj/src/java/org/apache/solr/common/util/ContentStreamBase.java
+++ b/solr/solrj/src/java/org/apache/solr/common/util/ContentStreamBase.java
@ -29,8 +29,13 @@ import java.io.UnsupportedEncodingException;
 import java.net.URL;
 import java.net.URLConnection;
 import java.nio.charset.StandardCharsets;
 import java.util.Arrays;
 import java.util.List;
 import java.util.Locale;
 import java.util.function.Predicate;
 import java.util.zip.GZIPInputStream;
 import org.apache.http.entity.ContentType;
 import org.apache.solr.client.solrj.SolrRequest;
 import org.apache.solr.client.solrj.request.RequestWriter;
@ -42,8 +47,14 @@ import org.apache.solr.client.solrj.request.RequestWriter;
 */
 public abstract class ContentStreamBase implements ContentStream
 {
  public static final String DEFAULT_CHARSET = StandardCharsets.UTF_8.name();
-  
+  private static final String TEXT_CSV = "text/csv";
  private static final List<String> UNHELPFUL_TYPES = Arrays.asList(ContentType.APPLICATION_OCTET_STREAM.getMimeType(), "application/gzip", "content/unknown");
  private static final List<String> XML_SUF =  Arrays.asList(".xml", ".xml.gz", ".xml.gzip");
  private static final List<String> JSON_SUF =  Arrays.asList(".json", ".json.gz", ".json.gzip");
  private static final List<String> CSV_SUF =  Arrays.asList(".csv", ".csv.gz", ".csv.gzip");
  protected String name;
  protected String sourceInfo;
  protected String contentType;
@ -62,7 +73,46 @@ public abstract class ContentStreamBase implements ContentStream
    }
    return null;
  }
-  
+
  /**
   * Guesses a MIME type for this stream from the suffix of {@code name}, falling back to
   * {@code attemptToDetermineTypeFromFirstCharacter()} when no known suffix matches.
   *
   * @return the guessed MIME type, or {@code null} when {@code name} is unset or no guess
   *         could be made
   */
  protected String attemptToDetermineContentType() {
    if (name == null) {
      return null;
    }
    // Lower-case once so the suffix comparison is case-insensitive (e.g. "DATA.CSV.gz").
    final String lowerName = name.toLowerCase(Locale.ROOT);
    final Predicate<String> isSuffixOfName = lowerName::endsWith;

    if (XML_SUF.stream().anyMatch(isSuffixOfName)) {
      return ContentType.APPLICATION_XML.getMimeType();
    }
    if (JSON_SUF.stream().anyMatch(isSuffixOfName)) {
      return ContentType.APPLICATION_JSON.getMimeType();
    }
    if (CSV_SUF.stream().anyMatch(isSuffixOfName)) {
      return TEXT_CSV;
    }
    // No recognised extension: sniff the payload instead.
    return attemptToDetermineTypeFromFirstCharacter();
  }
  /**
   * Last-ditch content sniffing: opens the stream via {@code getStream()}, skips leading
   * whitespace and inspects the first remaining byte.
   *
   * @return the XML MIME type if that byte is {@code '<'}, the JSON MIME type if it is
   *         {@code '{'}, otherwise {@code null} (including when the stream is empty or
   *         could not be opened or read)
   */
  private String attemptToDetermineTypeFromFirstCharacter() {
    try (InputStream stream = getStream()) {
      int data = stream.read();
      // Skip every kind of leading whitespace (spaces, tabs, line breaks), not only ' ',
      // so that indented or newline-prefixed documents are still recognised.
      while (data != -1 && Character.isWhitespace(data)) {
        data = stream.read();
      }
      if (data == '<') {
        return ContentType.APPLICATION_XML.getMimeType();
      }
      if (data == '{') {
        return ContentType.APPLICATION_JSON.getMimeType();
      }
    } catch (Exception ignored) {
      // Detection is best effort: the exception is deliberately swallowed and the
      // content type is left undetermined.
    }
    return null;
  }
  //------------------------------------------------------------------------
  //------------------------------------------------------------------------
@ -81,14 +131,33 @@ public abstract class ContentStreamBase implements ContentStream
      sourceInfo = "url";
    }
    /**
     * Returns the content type of this URL stream.
     *
     * <p>For {@code file} URLs whose current content type is one of {@code UNHELPFUL_TYPES},
     * a better type is looked up via {@code attemptToDetermineContentType()} and, if one is
     * found, stored in {@code contentType} before being returned.
     *
     * @return the current (possibly refined) content type; may be {@code null}
     */
    @Override
    public String getContentType() {
      boolean isLocalFile = "file".equals(url.getProtocol());
      if (!isLocalFile || !UNHELPFUL_TYPES.contains(contentType)) {
        // Non-file URLs, and types that are already informative, are reported as-is.
        return contentType;
      }
      String detected = attemptToDetermineContentType();
      if (detected != null) {
        contentType = detected;
      }
      return contentType;
    }
    @Override
    public InputStream getStream() throws IOException {
      URLConnection conn = this.url.openConnection();
      contentType = conn.getContentType();
      name = url.toExternalForm();
-      size = (long) conn.getContentLength();
+      size = conn.getContentLengthLong();
-      return conn.getInputStream();
+      InputStream is = conn.getInputStream();
      String urlFile = url.getFile().toLowerCase(Locale.ROOT);
      if( "gzip".equals(conn.getContentEncoding()) || urlFile.endsWith( ".gz" ) || urlFile.endsWith( ".gzip" )){
        is = new GZIPInputStream(is);
      }
      return is;
    }
  }
@ -111,30 +180,19 @@ public abstract class ContentStreamBase implements ContentStream
    @Override
    public String getContentType() {
      if(contentType==null) {
-        // TODO: this is buggy... does not allow for whitespace, JSON comments, etc.
+        contentType = attemptToDetermineContentType();
        InputStream stream = null;
        try {
          stream = new FileInputStream(file);
          char first = (char)stream.read();
          if(first == '<') {
            return "application/xml";
          }
          if(first == '{') {
            return "application/json";
          }
        } catch(Exception ex) {
        } finally {
          if (stream != null) try {
            stream.close();
          } catch (IOException ioe) {}
        }
      }
      return contentType;
    }
    @Override
    public InputStream getStream() throws IOException {
-      return new FileInputStream( file );
+      InputStream is = new FileInputStream( file );
      String lowerName = name.toLowerCase(Locale.ROOT);
      if(lowerName.endsWith(".gz") || lowerName.endsWith(".gzip")) {
        is = new GZIPInputStream(is);
      }
      return is;
    }
  }
@ -273,7 +331,7 @@ public abstract class ContentStreamBase implements ContentStream
    }
    public ByteArrayStream( byte[] bytes, String source, String contentType ) {
-      this.bytes = bytes; 
+      this.bytes = bytes;
      this.contentType = contentType;
      name = source;
--- a/solr/solrj/src/test/org/apache/solr/common/util/ContentStreamTest.java
+++ b/solr/solrj/src/test/org/apache/solr/common/util/ContentStreamTest.java
@ -25,88 +25,174 @@ import java.io.InputStreamReader;
 import java.io.Reader;
 import java.net.URL;
 import java.nio.charset.StandardCharsets;
 import java.util.zip.GZIPInputStream;
 import java.util.zip.GZIPOutputStream;
 import org.apache.commons.io.IOUtils;
 import org.apache.solr.SolrTestCaseJ4;
 import org.apache.solr.core.SolrResourceLoader;
 /**
 * Tests {@link ContentStream} such as "stream.file".
 */
-public class ContentStreamTest extends SolrTestCaseJ4 
+public class ContentStreamTest extends SolrTestCaseJ4 {
-{  
+
-  public void testStringStream() throws IOException 
+  public void testStringStream() throws IOException {
  {
    String input = "aads ghaskdgasgldj asl sadg ajdsg &jag # @ hjsakg hsakdg hjkas s";
-    ContentStreamBase stream = new ContentStreamBase.StringStream( input );
+    ContentStreamBase stream = new ContentStreamBase.StringStream(input);
-    assertEquals( input.length(), stream.getSize().intValue() );
+    assertEquals(input.length(), stream.getSize().intValue());
-    assertEquals( input, IOUtils.toString( stream.getStream(), "UTF-8" ) );
+    assertEquals(input, IOUtils.toString(stream.getStream(), "UTF-8"));
-    assertEquals( input, IOUtils.toString( stream.getReader() ) );
+    assertEquals(input, IOUtils.toString(stream.getReader()));
  }
-  public void testFileStream() throws IOException 
+  public void testFileStream() throws IOException {
-  {
+    File file = new File(createTempDir().toFile(), "README");
-    File file = null;
+    try (InputStream is = new SolrResourceLoader().openResource("solrj/README");
-    try (SolrResourceLoader loader = new SolrResourceLoader();
+         FileOutputStream os = new FileOutputStream(file)) {
         InputStream is = loader.openResource( "solrj/README" )) {
      assertNotNull(is);
-      file = new File(createTempDir().toFile(), "README");
+      IOUtils.copy(is, os);
      try (FileOutputStream os = new FileOutputStream(file)) {
        IOUtils.copy(is, os);
      }
    }
    ContentStreamBase stream = new ContentStreamBase.FileStream(file);
-    InputStream s = stream.getStream();
+    try (InputStream s = stream.getStream();
-    FileInputStream fis = new FileInputStream(file);
+         FileInputStream fis = new FileInputStream(file);
-    InputStreamReader isr = new InputStreamReader(
+         InputStreamReader isr = new InputStreamReader(
-        new FileInputStream(file), StandardCharsets.UTF_8);
+             new FileInputStream(file), StandardCharsets.UTF_8);
-    Reader r = stream.getReader();
+         Reader r = stream.getReader()) {
    try {
      assertEquals(file.length(), stream.getSize().intValue());
      // Test the code that sets content based on < being the 1st character
      assertEquals("application/xml", stream.getContentType());
      assertTrue(IOUtils.contentEquals(fis, s));
      assertTrue(IOUtils.contentEquals(isr, r));
    } finally {
      s.close();
      r.close();
      isr.close();
      fis.close();
    }
  }
-  public void testURLStream() throws IOException 
+  public void testFileStreamGZIP() throws IOException {
-  {
+    File file = new File(createTempDir().toFile(), "README.gz");
    File file = null;
    FileOutputStream os = null;
-    try (SolrResourceLoader loader = new SolrResourceLoader();
+    try (InputStream is = new SolrResourceLoader().openResource("solrj/README");
-         InputStream is = loader.openResource( "solrj/README" )) {
+         FileOutputStream os = new FileOutputStream(file);
-      assertNotNull(is);
+         GZIPOutputStream zos = new GZIPOutputStream(os)) {
-      file = new File(createTempDir().toFile(), "README");
+      IOUtils.copy(is, zos);
      os = new FileOutputStream(file);
      IOUtils.copy(is, os);
      os.close();
      is.close();
    }
-    
+
-    ContentStreamBase stream = new ContentStreamBase.URLStream(new URL(file
+    ContentStreamBase stream = new ContentStreamBase.FileStream(file);
-        .toURI().toASCIIString()));
+    try (InputStream s = stream.getStream();
-    InputStream s = stream.getStream();
+         FileInputStream fis = new FileInputStream(file);
-    FileInputStream fis = new FileInputStream(file);
+         GZIPInputStream zis = new GZIPInputStream(fis);
-    FileInputStream fis2 = new FileInputStream(file);
+         InputStreamReader isr = new InputStreamReader(zis, StandardCharsets.UTF_8);
-    InputStreamReader isr = new InputStreamReader(fis, StandardCharsets.UTF_8);
+         FileInputStream fis2 = new FileInputStream(file);
-    Reader r = stream.getReader();
+         GZIPInputStream zis2 = new GZIPInputStream(fis2);
-    try {
+         Reader r = stream.getReader()) {
      assertEquals(file.length(), stream.getSize().intValue());
      // Test the code that sets content based on < being the 1st character
      assertEquals("application/xml", stream.getContentType());
      assertTrue(IOUtils.contentEquals(isr, r));
      assertTrue(IOUtils.contentEquals(zis2, s));
    }
  }
  public void testURLStream() throws IOException {
    File file = new File(createTempDir().toFile(), "README");
    try (InputStream is = new SolrResourceLoader().openResource("solrj/README");
         FileOutputStream os = new FileOutputStream(file)) {
      IOUtils.copy(is, os);
    }
    ContentStreamBase stream = new ContentStreamBase.URLStream(new URL(file.toURI().toASCIIString()));
    try (InputStream s = stream.getStream();
         FileInputStream fis = new FileInputStream(file);
         FileInputStream fis2 = new FileInputStream(file);
         InputStreamReader isr = new InputStreamReader(fis, StandardCharsets.UTF_8);
         Reader r = stream.getReader()) {
      // For file URLs, the content type is determined automatically by the MIME type
      // associated with the file extension.
      // This is inconsistent with FileStream, which tries to guess the content type from the first character.
      //
      // For HTTP URLs, the content type is determined by the headers.  Those are not tested here.
      //
      assertEquals("text/html", stream.getContentType());
      assertTrue(IOUtils.contentEquals(fis2, s));
      assertEquals(file.length(), stream.getSize().intValue());
      assertTrue(IOUtils.contentEquals(isr, r));
      assertEquals(file.length(), stream.getSize().intValue());
-    } finally {
+    }
-      r.close();
+  }
-      s.close();
+
-      isr.close();
+  public void testURLStreamGZIP() throws IOException {
-      fis.close();
+    File file = new File(createTempDir().toFile(), "README.gz");
-      fis2.close();
+
    try (InputStream is = new SolrResourceLoader().openResource("solrj/README");
         FileOutputStream os = new FileOutputStream(file);
         GZIPOutputStream zos = new GZIPOutputStream(os)) {
      IOUtils.copy(is, zos);
    }
    ContentStreamBase stream = new ContentStreamBase.URLStream(new URL(file.toURI().toASCIIString()));
    try (InputStream s = stream.getStream();
         FileInputStream fis = new FileInputStream(file);
         GZIPInputStream zis = new GZIPInputStream(fis);
         InputStreamReader isr = new InputStreamReader(zis, StandardCharsets.UTF_8);
         FileInputStream fis2 = new FileInputStream(file);
         GZIPInputStream zis2 = new GZIPInputStream(fis2);
         Reader r = stream.getReader()) {
      // See the non-GZIP test case for an explanation of header handling.
      assertEquals("application/xml", stream.getContentType());
      assertTrue(IOUtils.contentEquals(isr, r));
      assertTrue(IOUtils.contentEquals(zis2, s));
      assertEquals(file.length(), stream.getSize().intValue());
    }
  }
  /**
   * Writes a gzipped copy of the {@code solrj/README} resource to a temp file named
   * {@code README.CSV.gz}, opens it through a file URL and checks that the reported content
   * type is {@code text/csv} (upper-case extension included) and that both the byte stream
   * and the reader match the content obtained by gunzipping the file directly.
   */
  public void testURLStreamCSVGZIPExtention() throws IOException {
    File file = new File(createTempDir().toFile(), "README.CSV.gz");

    // The resource loader is held in try-with-resources so that it is closed, not leaked.
    try (SolrResourceLoader loader = new SolrResourceLoader();
         InputStream is = loader.openResource("solrj/README");
         FileOutputStream os = new FileOutputStream(file);
         GZIPOutputStream zos = new GZIPOutputStream(os)) {
      IOUtils.copy(is, zos);
    }

    ContentStreamBase stream = new ContentStreamBase.URLStream(new URL(file.toURI().toASCIIString()));
    try (InputStream s = stream.getStream();
         FileInputStream fis = new FileInputStream(file);
         GZIPInputStream zis = new GZIPInputStream(fis);
         InputStreamReader isr = new InputStreamReader(zis, StandardCharsets.UTF_8);
         FileInputStream fis2 = new FileInputStream(file);
         GZIPInputStream zis2 = new GZIPInputStream(fis2);
         Reader r = stream.getReader()) {
      // See the non-GZIP test case for an explanation of header handling.
      assertEquals("text/csv", stream.getContentType());
      assertTrue(IOUtils.contentEquals(isr, r));
      assertTrue(IOUtils.contentEquals(zis2, s));
      // The reported size is that of the compressed file on disk.
      assertEquals(file.length(), stream.getSize().intValue());
    }
  }
  /**
   * Writes a gzipped copy of the {@code solrj/README} resource to a temp file named
   * {@code README.json.gzip}, opens it through a file URL and checks that the reported
   * content type is {@code application/json} and that both the byte stream and the reader
   * match the content obtained by gunzipping the file directly.
   */
  public void testURLStreamJSONGZIPExtention() throws IOException {
    File file = new File(createTempDir().toFile(), "README.json.gzip");

    // The resource loader is held in try-with-resources so that it is closed, not leaked.
    try (SolrResourceLoader loader = new SolrResourceLoader();
         InputStream is = loader.openResource("solrj/README");
         FileOutputStream os = new FileOutputStream(file);
         GZIPOutputStream zos = new GZIPOutputStream(os)) {
      IOUtils.copy(is, zos);
    }

    ContentStreamBase stream = new ContentStreamBase.URLStream(new URL(file.toURI().toASCIIString()));
    try (InputStream s = stream.getStream();
         FileInputStream fis = new FileInputStream(file);
         GZIPInputStream zis = new GZIPInputStream(fis);
         InputStreamReader isr = new InputStreamReader(zis, StandardCharsets.UTF_8);
         FileInputStream fis2 = new FileInputStream(file);
         GZIPInputStream zis2 = new GZIPInputStream(fis2);
         Reader r = stream.getReader()) {
      // See the non-GZIP test case for an explanation of header handling.
      assertEquals("application/json", stream.getContentType());
      assertTrue(IOUtils.contentEquals(isr, r));
      assertTrue(IOUtils.contentEquals(zis2, s));
      // The reported size is that of the compressed file on disk.
      assertEquals(file.length(), stream.getSize().intValue());
    }
  }
 }