From 9ee4b4d5fa1c5d8de3daa118f99d9d671a55f58b Mon Sep 17 00:00:00 2001 From: Uwe Schindler Date: Tue, 8 Jan 2013 17:22:14 +0000 Subject: [PATCH] SOLR-4283: Improvements for URL-Decoding git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1430396 13f79535-47bb-0310-9956-ffa450edef68 --- solr/CHANGES.txt | 12 +- .../solr/servlet/SolrRequestParsers.java | 185 ++++++++++++++---- .../servlet/DirectSolrConnectionTest.java | 4 +- .../solr/servlet/SolrRequestParserTest.java | 46 ++++- 4 files changed, 190 insertions(+), 57 deletions(-) diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 555fc83f3fb..0d7bdf5e1b7 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -195,12 +195,14 @@ New Features that can be set to false to not filter. Its useful when there is already a spatial filter query but you also need to sort or boost by distance. (David Smiley) -* SOLR-4265: Solr now parses request parameters (in URL or sent with POST using - content-type application/x-www-form-urlencoded) in its dispatcher code. It no +* SOLR-4265, SOLR-4283: Solr now parses request parameters (in URL or sent with POST + using content-type application/x-www-form-urlencoded) in its dispatcher code. It no longer relies on special configuration settings in Tomcat or other web containers - to enable UTF-8 encoding, which is mandatory for correct Solr behaviour. Also - the maximum length of x-www-form-urlencoded POST parameters can now be configured - through the requestDispatcher/requestParsers/@formdataUploadLimitInKB setting in + to enable UTF-8 encoding, which is mandatory for correct Solr behaviour. Query + strings passed in via the URL need to be properly-%-escaped, UTF-8 encoded + bytes, otherwise Solr refuses to handle the request. The maximum length of + x-www-form-urlencoded POST parameters can now be configured through the + requestDispatcher/requestParsers/@formdataUploadLimitInKB setting in solrconfig.xml (defaults to 2 MiB). Solr now works out of the box with e.g. Tomcat, JBoss,... (Uwe Schindler, Dawid Weiss, Alex Rocher) diff --git a/solr/core/src/java/org/apache/solr/servlet/SolrRequestParsers.java b/solr/core/src/java/org/apache/solr/servlet/SolrRequestParsers.java index 54783966777..3a7caf35b57 100644 --- a/solr/core/src/java/org/apache/solr/servlet/SolrRequestParsers.java +++ b/solr/core/src/java/org/apache/solr/servlet/SolrRequestParsers.java @@ -20,9 +20,13 @@ package org.apache.solr.servlet; import java.io.File; import java.io.IOException; import java.io.InputStream; -import java.io.UnsupportedEncodingException; +import java.io.ByteArrayOutputStream; +import java.nio.ByteBuffer; +import java.nio.charset.CharacterCodingException; +import java.nio.charset.Charset; +import java.nio.charset.CharsetDecoder; +import java.nio.charset.CodingErrorAction; import java.net.URL; -import java.net.URLDecoder; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; @@ -32,20 +36,20 @@ import java.util.Locale; import java.util.Map; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.commons.io.IOUtils; -import org.apache.commons.io.input.BoundedInputStream; import javax.servlet.http.HttpServletRequest; import org.apache.commons.fileupload.FileItem; import org.apache.commons.fileupload.disk.DiskFileItemFactory; import org.apache.commons.fileupload.servlet.ServletFileUpload; +import org.apache.lucene.util.IOUtils; import org.apache.solr.common.SolrException; import org.apache.solr.common.params.CommonParams; import org.apache.solr.common.params.MultiMapSolrParams; import org.apache.solr.common.params.SolrParams; import org.apache.solr.common.util.ContentStream; import org.apache.solr.common.util.ContentStreamBase; +import org.apache.solr.common.util.FastInputStream; import org.apache.solr.core.Config; import org.apache.solr.core.SolrCore; import org.apache.solr.request.SolrQueryRequest; @@ -71,7 +75,7 @@ public class SolrRequestParsers /** Default instance for e.g. admin requests. Limits to 2 MB uploads and does not allow remote streams. */ public static final SolrRequestParsers DEFAULT = new SolrRequestParsers(); - + /** * Pass in an xml configuration. A null configuration will enable * everything with maximum values. @@ -197,37 +201,140 @@ public class SolrRequestParsers */ public static MultiMapSolrParams parseQueryString(String queryString) { Map map = new HashMap(); - parseQueryString(queryString, "UTF-8", map); + parseQueryString(queryString, map); return new MultiMapSolrParams(map); } /** - * Given a url-encoded query string, map it into the given map + * Given a url-encoded query string (UTF-8), map it into the given map * @param queryString as given from URL - * @param charset to be used to decode %-encoding * @param map place all parameters in this map */ - static void parseQueryString(String queryString, String charset, Map map) { - if( queryString != null && queryString.length() > 0 ) { + static void parseQueryString(final String queryString, final Map map) { + if (queryString != null && queryString.length() > 0) { try { - for( String kv : queryString.split( "&" ) ) { - int idx = kv.indexOf( '=' ); - if( idx >= 0 ) { - String name = URLDecoder.decode( kv.substring( 0, idx ), charset); - String value = URLDecoder.decode( kv.substring( idx+1 ), charset); - MultiMapSolrParams.addParam( name, value, map ); - } else { - String name = URLDecoder.decode( kv, charset ); - MultiMapSolrParams.addParam( name, "", map ); + final int len = queryString.length(); + // this input stream emulates to get the raw bytes from the URL as passed to servlet container, it disallows any byte > 127 and enforces to %-escape them: + final InputStream in = new InputStream() { + int pos = 0; + @Override + public int read() { + if (pos < len) { + final char ch = queryString.charAt(pos); + if (ch > 127) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "URLDecoder: The query string contains a not-%-escaped byte > 127 at position " + pos); + } + pos++; + return ch; + } else { + return -1; + } } - } - } - catch( UnsupportedEncodingException uex ) { - throw new SolrException( SolrException.ErrorCode.SERVER_ERROR, uex ); + }; + parseFormDataContent(in, Long.MAX_VALUE, IOUtils.CHARSET_UTF_8, map); + } catch (IOException ioe) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, ioe); } } } - + + /** + * Given a url-encoded form from POST content (as InputStream), map it into the given map. + * The given InputStream should be buffered! + * @param postContent to be parsed + * @param charset to be used to decode resulting bytes after %-decoding + * @param map place all parameters in this map + */ + @SuppressWarnings("fallthrough") + static long parseFormDataContent(final InputStream postContent, final long maxLen, final Charset charset, final Map map) throws IOException { + final CharsetDecoder charsetDecoder = charset.newDecoder() + .onMalformedInput(CodingErrorAction.REPORT) + .onUnmappableCharacter(CodingErrorAction.REPORT); + long len = 0L, keyPos = 0L, valuePos = 0L; + final ByteArrayOutputStream2 keyStream = new ByteArrayOutputStream2(), + valueStream = new ByteArrayOutputStream2(); + ByteArrayOutputStream2 currentStream = keyStream; + for(;;) { + int b = postContent.read(); + switch (b) { + case -1: // end of stream + case '&': // separator + if (keyStream.size() > 0) { + final String key = decodeChars(keyStream, keyPos, charsetDecoder), value = decodeChars(valueStream, valuePos, charsetDecoder); + MultiMapSolrParams.addParam(key, value, map); + } else if (valueStream.size() > 0) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "application/x-www-form-urlencoded invalid: missing key"); + } + keyStream.reset(); + valueStream.reset(); + keyPos = valuePos = len + 1; + currentStream = keyStream; + break; + case '+': // space replacement + currentStream.write(' '); + break; + case '%': // escape + final int upper = digit16(b = postContent.read()); + len++; + final int lower = digit16(b = postContent.read()); + len++; + currentStream.write(((upper << 4) + lower)); + break; + case '=': // kv separator + if (currentStream == keyStream) { + valuePos = len + 1; + currentStream = valueStream; + break; + } + // fall-through + default: + currentStream.write(b); + } + if (b == -1) { + break; + } + len++; + if (len > maxLen) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "application/x-www-form-urlencoded content exceeds upload limit of " + (maxLen/1024L) + " KB"); + } + } + return len; + } + + private static String decodeChars(ByteArrayOutputStream2 stream, long position, CharsetDecoder charsetDecoder) { + try { + return charsetDecoder.decode(ByteBuffer.wrap(stream.buffer(), 0, stream.size())).toString(); + } catch (CharacterCodingException cce) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, + "URLDecoder: Invalid character encoding detected after position " + position + + " of query string / form data (while parsing as " + charsetDecoder.charset().name() + ")" + ); + } + } + + /** Makes the buffer of ByteArrayOutputStream available without copy. */ + static final class ByteArrayOutputStream2 extends ByteArrayOutputStream { + byte[] buffer() { + return buf; + } + } + + private static int digit16(int b) { + if (b == -1) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "URLDecoder: Incomplete trailing escape (%) pattern"); + } + if (b >= '0' && b <= '9') { + return b - '0'; + } + if (b >= 'A' && b <= 'F') { + return b - ('A' - 10); + } + if (b >= 'a' && b <= 'f') { + return b - ('a' - 10); + } + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "URLDecoder: Invalid digit (" + ((char) b) + ") in escape (%) pattern"); + } + public boolean isHandleSelect() { return handleSelect; } @@ -404,15 +511,12 @@ class FormDataRequestParser implements SolrRequestParser throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, "Not application/x-www-form-urlencoded content: "+req.getContentType() ); } - String charset = ContentStreamBase.getCharsetFromContentType(req.getContentType()); - if (charset == null) charset = "UTF-8"; - final Map map = new HashMap(); // also add possible URL parameters and include into the map (parsed using UTF-8): final String qs = req.getQueryString(); if (qs != null) { - SolrRequestParsers.parseQueryString(qs, "UTF-8", map); + SolrRequestParsers.parseQueryString(qs, map); } // may be -1, so we check again later. But if its already greater we can stop processing! @@ -424,26 +528,21 @@ class FormDataRequestParser implements SolrRequestParser } // get query String from request body, using the charset given in content-type: - final InputStream in; + final String cs = ContentStreamBase.getCharsetFromContentType(req.getContentType()); + final Charset charset = (cs == null) ? IOUtils.CHARSET_UTF_8 : Charset.forName(cs); + InputStream in = null; try { in = req.getInputStream(); - } catch (IllegalStateException ise) { - throw (SolrException) getParameterIncompatibilityException().initCause(ise); - } - try { - final String data = IOUtils.toString(new BoundedInputStream(in, maxLength), charset); - // if there is remaining data in the underlying stream, throw exception: - if (in.read() != -1) { - // read remaining data and throw away: - while (IOUtils.skip(in, 1024L) > 0); - throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "application/x-www-form-urlencoded content exceeds upload limit of " + uploadLimitKB + " KB"); - } - if (data.length() == 0 && totalLength > 0L) { + final long bytesRead = SolrRequestParsers.parseFormDataContent(FastInputStream.wrap(in), maxLength, charset, map); + if (bytesRead == 0L && totalLength > 0L) { throw getParameterIncompatibilityException(); } - SolrRequestParsers.parseQueryString(data, charset, map); + } catch (IOException ioe) { + throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, ioe); + } catch (IllegalStateException ise) { + throw (SolrException) getParameterIncompatibilityException().initCause(ise); } finally { - IOUtils.closeQuietly(in); + IOUtils.closeWhileHandlingException(in); } return new MultiMapSolrParams(map); diff --git a/solr/core/src/test/org/apache/solr/servlet/DirectSolrConnectionTest.java b/solr/core/src/test/org/apache/solr/servlet/DirectSolrConnectionTest.java index 08c3766b593..bd566fcf592 100644 --- a/solr/core/src/test/org/apache/solr/servlet/DirectSolrConnectionTest.java +++ b/solr/core/src/test/org/apache/solr/servlet/DirectSolrConnectionTest.java @@ -17,6 +17,8 @@ package org.apache.solr.servlet; +import java.net.URLEncoder; + import org.apache.solr.common.params.CommonParams; import org.apache.solr.util.AbstractSolrTestCase; import org.junit.BeforeClass; @@ -74,7 +76,7 @@ public class DirectSolrConnectionTest extends AbstractSolrTestCase // Test using the Stream body parameter for( String cmd : cmds ) { - direct.request( "/update?"+CommonParams.STREAM_BODY+"="+cmd, null ); + direct.request( "/update?"+CommonParams.STREAM_BODY+"="+URLEncoder.encode(cmd, "UTF-8"), null ); } String got = direct.request( getIt, null ); assertTrue( got.indexOf( value ) > 0 ); diff --git a/solr/core/src/test/org/apache/solr/servlet/SolrRequestParserTest.java b/solr/core/src/test/org/apache/solr/servlet/SolrRequestParserTest.java index 8776b1f0afe..fb6d83f7a50 100644 --- a/solr/core/src/test/org/apache/solr/servlet/SolrRequestParserTest.java +++ b/solr/core/src/test/org/apache/solr/servlet/SolrRequestParserTest.java @@ -25,7 +25,6 @@ import java.io.ByteArrayInputStream; import java.net.HttpURLConnection; import java.net.SocketTimeoutException; import java.net.URL; -import java.net.URLConnection; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; @@ -115,7 +114,6 @@ public class SolrRequestParserTest extends SolrTestCaseJ4 { @Test public void testStreamURL() throws Exception { - boolean ok = false; String url = "http://www.apache.org/dist/lucene/solr/"; byte[] bytes = null; try { @@ -152,19 +150,51 @@ public class SolrRequestParserTest extends SolrTestCaseJ4 { } @Test - public void testUrlParamParsing() + public void testUrlParamParsing() throws Exception { - String[][] teststr = new String[][] { + final String[][] teststr = new String[][] { { "this is simple", "this%20is%20simple" }, { "this is simple", "this+is+simple" }, { "\u00FC", "%C3%BC" }, // lower-case "u" with diaeresis/umlaut { "\u0026", "%26" }, // & - { "\u20AC", "%E2%82%AC" } // euro + { "", "" }, // empty + { "\u20AC", "%E2%82%ac" } // euro, also with lowercase escapes }; for( String[] tst : teststr ) { - MultiMapSolrParams params = SolrRequestParsers.parseQueryString( "val="+tst[1] ); + SolrParams params = SolrRequestParsers.parseQueryString( "val="+tst[1] ); assertEquals( tst[0], params.get( "val" ) ); + params = SolrRequestParsers.parseQueryString( "val="+tst[1]+"&" ); + assertEquals( tst[0], params.get( "val" ) ); + params = SolrRequestParsers.parseQueryString( "&&val="+tst[1]+"&" ); + assertEquals( tst[0], params.get( "val" ) ); + params = SolrRequestParsers.parseQueryString( "&&val="+tst[1]+"&&&val="+tst[1]+"&" ); + assertArrayEquals(new String[]{tst[0],tst[0]}, params.getParams("val") ); + } + + SolrParams params = SolrRequestParsers.parseQueryString("val"); + assertEquals("", params.get("val")); + + params = SolrRequestParsers.parseQueryString("val&foo=bar=bar&muh&"); + assertEquals("", params.get("val")); + assertEquals("bar=bar", params.get("foo")); + assertEquals("", params.get("muh")); + + final String[] invalid = { + "q=h%FCllo", // non-UTF-8 + "q=h\u00FCllo", // encoded string is not pure US-ASCII + "q=hallo%", // incomplete escape + "q=hallo%1", // incomplete escape + "q=hallo%XX123", // invalid digit 'X' in escape + "=hallo" // missing key + }; + for (String s : invalid) { + try { + SolrRequestParsers.parseQueryString(s); + fail("Should throw SolrException"); + } catch (SolrException se) { + // pass + } } } @@ -172,7 +202,7 @@ public class SolrRequestParserTest extends SolrTestCaseJ4 { public void testStandardParseParamsAndFillStreams() throws Exception { final String getParams = "qt=%C3%BC&dup=foo", postParams = "q=hello&d%75p=bar"; - final byte[] postBytes = postParams.getBytes("UTF-8"); + final byte[] postBytes = postParams.getBytes("US-ASCII"); // Set up the expected behavior final String[] ct = new String[] { @@ -224,7 +254,7 @@ public class SolrRequestParserTest extends SolrTestCaseJ4 { expect(request.getContentLength()).andReturn(-1).anyTimes(); expect(request.getQueryString()).andReturn(null).anyTimes(); expect(request.getInputStream()).andReturn(new ServletInputStream() { - private final ByteArrayInputStream in = new ByteArrayInputStream(large.toString().getBytes("UTF-8")); + private final ByteArrayInputStream in = new ByteArrayInputStream(large.toString().getBytes("US-ASCII")); @Override public int read() { return in.read(); } }); replay(request);