SOLR-4283: Improvements for URL-Decoding

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1430396 13f79535-47bb-0310-9956-ffa450edef68
2013-01-08 17:22:14 +00:00 · 2013-01-08 17:22:14 +00:00 · 9ee4b4d5fa
parent d65a7fc29a
commit 9ee4b4d5fa
4 changed files with 190 additions and 57 deletions
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@ -195,12 +195,14 @@ New Features
  that can be set to false to not filter. Its useful when there is already a spatial
  filter query but you also need to sort or boost by distance. (David Smiley)

-* SOLR-4265: Solr now parses request parameters (in URL or sent with POST using
-  content-type application/x-www-form-urlencoded) in its dispatcher code. It no
+* SOLR-4265, SOLR-4283: Solr now parses request parameters (in URL or sent with POST
+  using content-type application/x-www-form-urlencoded) in its dispatcher code. It no
  longer relies on special configuration settings in Tomcat or other web containers
-  to enable UTF-8 encoding, which is mandatory for correct Solr behaviour. Also
-  the maximum length of x-www-form-urlencoded POST parameters can now be configured
-  through the requestDispatcher/requestParsers/@formdataUploadLimitInKB setting in
+  to enable UTF-8 encoding, which is mandatory for correct Solr behaviour. Query
+  strings passed in via the URL need to be properly-%-escaped, UTF-8 encoded
+  bytes, otherwise Solr refuses to handle the request. The maximum length of
+  x-www-form-urlencoded POST parameters can now be configured through the
+  requestDispatcher/requestParsers/@formdataUploadLimitInKB setting in
  solrconfig.xml (defaults to 2 MiB). Solr now works out of the box with
  e.g. Tomcat, JBoss,...  (Uwe Schindler, Dawid Weiss, Alex Rocher)

--- a/solr/core/src/java/org/apache/solr/servlet/SolrRequestParsers.java
+++ b/solr/core/src/java/org/apache/solr/servlet/SolrRequestParsers.java
@ -20,9 +20,13 @@ package org.apache.solr.servlet;
 import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
-import java.io.UnsupportedEncodingException;
+import java.io.ByteArrayOutputStream;
+import java.nio.ByteBuffer;
+import java.nio.charset.CharacterCodingException;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CodingErrorAction;
 import java.net.URL;
-import java.net.URLDecoder;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.HashMap;
@ -32,20 +36,20 @@ import java.util.Locale;
 import java.util.Map;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-import org.apache.commons.io.IOUtils;
-import org.apache.commons.io.input.BoundedInputStream;

 import javax.servlet.http.HttpServletRequest;

 import org.apache.commons.fileupload.FileItem;
 import org.apache.commons.fileupload.disk.DiskFileItemFactory;
 import org.apache.commons.fileupload.servlet.ServletFileUpload;
+import org.apache.lucene.util.IOUtils;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.params.CommonParams;
 import org.apache.solr.common.params.MultiMapSolrParams;
 import org.apache.solr.common.params.SolrParams;
 import org.apache.solr.common.util.ContentStream;
 import org.apache.solr.common.util.ContentStreamBase;
+import org.apache.solr.common.util.FastInputStream;
 import org.apache.solr.core.Config;
 import org.apache.solr.core.SolrCore;
 import org.apache.solr.request.SolrQueryRequest;
@ -197,35 +201,138 @@ public class SolrRequestParsers
   */
  public static MultiMapSolrParams parseQueryString(String queryString) {
    Map<String,String[]> map = new HashMap<String, String[]>();
-    parseQueryString(queryString, "UTF-8", map);
+    parseQueryString(queryString, map);
    return new MultiMapSolrParams(map);
  }

  /**
-   * Given a url-encoded query string, map it into the given map
+   * Given a url-encoded query string (UTF-8), map it into the given map
   * @param queryString as given from URL
-   * @param charset to be used to decode %-encoding
   * @param map place all parameters in this map
   */
-  static void parseQueryString(String queryString, String charset, Map<String,String[]> map) {
+  static void parseQueryString(final String queryString, final Map<String,String[]> map) {
    if (queryString != null && queryString.length() > 0) {
      try {
-        for( String kv : queryString.split( "&" ) ) {
-          int idx = kv.indexOf( '=' );
-          if( idx >= 0 ) {
-            String name = URLDecoder.decode( kv.substring( 0, idx ), charset);
-            String value = URLDecoder.decode( kv.substring( idx+1 ), charset);
-            MultiMapSolrParams.addParam( name, value, map );
+        final int len = queryString.length();
+        // this input stream emulates to get the raw bytes from the URL as passed to servlet container, it disallows any byte > 127 and enforces to %-escape them:
+        final InputStream in = new InputStream() {
+          int pos = 0;
+          @Override
+          public int read() {
+            if (pos < len) {
+              final char ch = queryString.charAt(pos);
+              if (ch > 127) {
+                throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "URLDecoder: The query string contains a not-%-escaped byte > 127 at position " + pos);
+              }
+              pos++;
+              return ch;
            } else {
-            String name = URLDecoder.decode( kv, charset );
-            MultiMapSolrParams.addParam( name, "", map );
+              return -1;
+            }
+          }
+        };
+        parseFormDataContent(in, Long.MAX_VALUE, IOUtils.CHARSET_UTF_8, map);
+      } catch (IOException ioe) {
+        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, ioe);
      }
    }
  }
-      catch( UnsupportedEncodingException uex ) {
-        throw new SolrException( SolrException.ErrorCode.SERVER_ERROR, uex );
+  
+  /**
+   * Given a url-encoded form from POST content (as InputStream), map it into the given map.
+   * The given InputStream should be buffered!
+   * @param postContent to be parsed
+   * @param charset to be used to decode resulting bytes after %-decoding
+   * @param map place all parameters in this map
+   */
+  @SuppressWarnings("fallthrough")
+  static long parseFormDataContent(final InputStream postContent, final long maxLen, final Charset charset, final Map<String,String[]> map) throws IOException {
+    final CharsetDecoder charsetDecoder = charset.newDecoder()
+      .onMalformedInput(CodingErrorAction.REPORT)
+      .onUnmappableCharacter(CodingErrorAction.REPORT);
+    long len = 0L, keyPos = 0L, valuePos = 0L;
+    final ByteArrayOutputStream2 keyStream = new ByteArrayOutputStream2(),
+      valueStream = new ByteArrayOutputStream2();
+    ByteArrayOutputStream2 currentStream = keyStream;
+    for(;;) {
+      int b = postContent.read();
+      switch (b) {
+        case -1: // end of stream
+        case '&': // separator
+          if (keyStream.size() > 0) {
+            final String key = decodeChars(keyStream, keyPos, charsetDecoder), value = decodeChars(valueStream, valuePos, charsetDecoder);
+            MultiMapSolrParams.addParam(key, value, map);
+          } else if (valueStream.size() > 0) {
+            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "application/x-www-form-urlencoded invalid: missing key");
+          }
+          keyStream.reset();
+          valueStream.reset();
+          keyPos = valuePos = len + 1;
+          currentStream = keyStream;
+          break;
+        case '+': // space replacement
+          currentStream.write(' ');
+          break;
+        case '%': // escape
+          final int upper = digit16(b = postContent.read());
+          len++;
+          final int lower = digit16(b = postContent.read());
+          len++;
+          currentStream.write(((upper << 4) + lower));
+          break;
+        case '=': // kv separator
+          if (currentStream == keyStream) {
+            valuePos = len + 1;
+            currentStream = valueStream;
+            break;
+          }
+          // fall-through
+        default:
+          currentStream.write(b);
+      }
+      if (b == -1) {
+        break;
+      }
+      len++;
+      if (len > maxLen) {
+        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "application/x-www-form-urlencoded content exceeds upload limit of " + (maxLen/1024L) + " KB");
      }
    }
+    return len;
+  }
+  
+  private static String decodeChars(ByteArrayOutputStream2 stream, long position, CharsetDecoder charsetDecoder) {
+    try {
+      return charsetDecoder.decode(ByteBuffer.wrap(stream.buffer(), 0, stream.size())).toString();
+    } catch (CharacterCodingException cce) {
+      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
+        "URLDecoder: Invalid character encoding detected after position " + position +
+        " of query string / form data (while parsing as " + charsetDecoder.charset().name() + ")"
+      );
+    }
+  }
+  
+  /** Makes the buffer of ByteArrayOutputStream available without copy. */
+  static final class ByteArrayOutputStream2 extends ByteArrayOutputStream {
+    byte[] buffer() {
+      return buf;
+    }
+  }
+  
+  private static int digit16(int b) {
+    if (b == -1) {
+      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "URLDecoder: Incomplete trailing escape (%) pattern");
+    }
+    if (b >= '0' && b <= '9') {
+      return b - '0';
+    }
+    if (b >= 'A' && b <= 'F') {
+      return b - ('A' - 10);
+    }
+    if (b >= 'a' && b <= 'f') {
+      return b - ('a' - 10);
+    }
+    throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "URLDecoder: Invalid digit (" + ((char) b) + ") in escape (%) pattern");
  }
  
  public boolean isHandleSelect() {
@ -404,15 +511,12 @@ class FormDataRequestParser implements SolrRequestParser
      throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, "Not application/x-www-form-urlencoded content: "+req.getContentType() );
    }
    
-    String charset = ContentStreamBase.getCharsetFromContentType(req.getContentType());
-    if (charset == null) charset = "UTF-8";
-    
    final Map<String,String[]> map = new HashMap<String, String[]>();
    
    // also add possible URL parameters and include into the map (parsed using UTF-8):
    final String qs = req.getQueryString();
    if (qs != null) {
-      SolrRequestParsers.parseQueryString(qs, "UTF-8", map);
+      SolrRequestParsers.parseQueryString(qs, map);
    }
    
    // may be -1, so we check again later. But if its already greater we can stop processing!
@ -424,26 +528,21 @@ class FormDataRequestParser implements SolrRequestParser
    }
    
    // get query String from request body, using the charset given in content-type:
-    final InputStream in;
+    final String cs = ContentStreamBase.getCharsetFromContentType(req.getContentType());
+    final Charset charset = (cs == null) ? IOUtils.CHARSET_UTF_8 : Charset.forName(cs);
+    InputStream in = null;
    try {
      in = req.getInputStream();
-    } catch (IllegalStateException ise) {
-      throw (SolrException) getParameterIncompatibilityException().initCause(ise);
-    }
-    try {
-      final String data = IOUtils.toString(new BoundedInputStream(in, maxLength), charset);
-      // if there is remaining data in the underlying stream, throw exception:
-      if (in.read() != -1) {
-        // read remaining data and throw away:
-        while (IOUtils.skip(in, 1024L) > 0);
-        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "application/x-www-form-urlencoded content exceeds upload limit of " + uploadLimitKB + " KB");
-      }
-      if (data.length() == 0 && totalLength > 0L) {
+      final long bytesRead = SolrRequestParsers.parseFormDataContent(FastInputStream.wrap(in), maxLength, charset, map);
+      if (bytesRead == 0L && totalLength > 0L) {
        throw getParameterIncompatibilityException();
      }
-      SolrRequestParsers.parseQueryString(data, charset, map);
+    } catch (IOException ioe) {
+      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, ioe);
+    } catch (IllegalStateException ise) {
+      throw (SolrException) getParameterIncompatibilityException().initCause(ise);
    } finally {
-      IOUtils.closeQuietly(in);
+      IOUtils.closeWhileHandlingException(in);
    }
    
    return new MultiMapSolrParams(map);
--- a/solr/core/src/test/org/apache/solr/servlet/DirectSolrConnectionTest.java
+++ b/solr/core/src/test/org/apache/solr/servlet/DirectSolrConnectionTest.java
@ -17,6 +17,8 @@

 package org.apache.solr.servlet;

+import java.net.URLEncoder;
+
 import org.apache.solr.common.params.CommonParams;
 import org.apache.solr.util.AbstractSolrTestCase;
 import org.junit.BeforeClass;
@ -74,7 +76,7 @@ public class DirectSolrConnectionTest extends AbstractSolrTestCase
    
    // Test using the Stream body parameter
    for( String cmd : cmds ) {
-      direct.request( "/update?"+CommonParams.STREAM_BODY+"="+cmd, null );
+      direct.request( "/update?"+CommonParams.STREAM_BODY+"="+URLEncoder.encode(cmd, "UTF-8"), null );
    }
    String got = direct.request( getIt, null );
    assertTrue( got.indexOf( value ) > 0 );
--- a/solr/core/src/test/org/apache/solr/servlet/SolrRequestParserTest.java
+++ b/solr/core/src/test/org/apache/solr/servlet/SolrRequestParserTest.java
@ -25,7 +25,6 @@ import java.io.ByteArrayInputStream;
 import java.net.HttpURLConnection;
 import java.net.SocketTimeoutException;
 import java.net.URL;
-import java.net.URLConnection;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.HashMap;
@ -115,7 +114,6 @@ public class SolrRequestParserTest extends SolrTestCaseJ4 {
  @Test
  public void testStreamURL() throws Exception
  {
-    boolean ok = false;
    String url = "http://www.apache.org/dist/lucene/solr/";
    byte[] bytes = null;
    try {
@ -152,19 +150,51 @@ public class SolrRequestParserTest extends SolrTestCaseJ4 {
  }
  
  @Test
-  public void testUrlParamParsing()
+  public void testUrlParamParsing() throws Exception
  {
-    String[][] teststr = new String[][] {
+    final String[][] teststr = new String[][] {
      { "this is simple", "this%20is%20simple" },
      { "this is simple", "this+is+simple" },
      { "\u00FC", "%C3%BC" },   // lower-case "u" with diaeresis/umlaut
      { "\u0026", "%26" },      // &
-      { "\u20AC", "%E2%82%AC" } // euro
+      { "", "" },               // empty
+      { "\u20AC", "%E2%82%ac" } // euro, also with lowercase escapes
    };
    
    for( String[] tst : teststr ) {
-      MultiMapSolrParams params = SolrRequestParsers.parseQueryString( "val="+tst[1] );
+      SolrParams params = SolrRequestParsers.parseQueryString( "val="+tst[1] );
      assertEquals( tst[0], params.get( "val" ) );
+      params = SolrRequestParsers.parseQueryString( "val="+tst[1]+"&" );
+      assertEquals( tst[0], params.get( "val" ) );
+      params = SolrRequestParsers.parseQueryString( "&&val="+tst[1]+"&" );
+      assertEquals( tst[0], params.get( "val" ) );
+      params = SolrRequestParsers.parseQueryString( "&&val="+tst[1]+"&&&val="+tst[1]+"&" );
+      assertArrayEquals(new String[]{tst[0],tst[0]}, params.getParams("val") );
+   }
+    
+    SolrParams params = SolrRequestParsers.parseQueryString("val");
+    assertEquals("", params.get("val"));
+    
+    params = SolrRequestParsers.parseQueryString("val&foo=bar=bar&muh&");
+    assertEquals("", params.get("val"));
+    assertEquals("bar=bar", params.get("foo"));
+    assertEquals("", params.get("muh"));
+    
+    final String[] invalid = {
+      "q=h%FCllo",     // non-UTF-8
+      "q=h\u00FCllo",  // encoded string is not pure US-ASCII
+      "q=hallo%",      // incomplete escape
+      "q=hallo%1",     // incomplete escape
+      "q=hallo%XX123", // invalid digit 'X' in escape
+      "=hallo"         // missing key
+    };
+    for (String s : invalid) {
+      try {
+        SolrRequestParsers.parseQueryString(s);
+        fail("Should throw SolrException");
+      } catch (SolrException se) {
+        // pass
+      }
    }
  }
  
@ -172,7 +202,7 @@ public class SolrRequestParserTest extends SolrTestCaseJ4 {
  public void testStandardParseParamsAndFillStreams() throws Exception
  {
    final String getParams = "qt=%C3%BC&dup=foo", postParams = "q=hello&d%75p=bar";
-    final byte[] postBytes = postParams.getBytes("UTF-8");
+    final byte[] postBytes = postParams.getBytes("US-ASCII");
    
    // Set up the expected behavior
    final String[] ct = new String[] {
@ -224,7 +254,7 @@ public class SolrRequestParserTest extends SolrTestCaseJ4 {
    expect(request.getContentLength()).andReturn(-1).anyTimes();
    expect(request.getQueryString()).andReturn(null).anyTimes();
    expect(request.getInputStream()).andReturn(new ServletInputStream() {
-      private final ByteArrayInputStream in = new ByteArrayInputStream(large.toString().getBytes("UTF-8"));
+      private final ByteArrayInputStream in = new ByteArrayInputStream(large.toString().getBytes("US-ASCII"));
      @Override public int read() { return in.read(); }
    });
    replay(request);