From 8b8bad757224db04d58f785fd65ea78accf918d4 Mon Sep 17 00:00:00 2001 From: kimchy Date: Fri, 7 Jan 2011 14:34:34 +0200 Subject: [PATCH] build on aferreira improvements to handle empty parameters with no =, and improve component decoding logic --- .idea/dictionaries/kimchy.xml | 1 + .../elasticsearch/rest/support/RestUtils.java | 180 +++++++++++++++--- .../rest/util/RestUtilsTests.java | 19 +- 3 files changed, 166 insertions(+), 34 deletions(-) diff --git a/.idea/dictionaries/kimchy.xml b/.idea/dictionaries/kimchy.xml index 2da3534d3bb..e7be07e84ea 100644 --- a/.idea/dictionaries/kimchy.xml +++ b/.idea/dictionaries/kimchy.xml @@ -23,6 +23,7 @@ camelcase canonicalhost charfilter + charsets checksum chunking closeable diff --git a/modules/elasticsearch/src/main/java/org/elasticsearch/rest/support/RestUtils.java b/modules/elasticsearch/src/main/java/org/elasticsearch/rest/support/RestUtils.java index 375c702900e..2959143829e 100644 --- a/modules/elasticsearch/src/main/java/org/elasticsearch/rest/support/RestUtils.java +++ b/modules/elasticsearch/src/main/java/org/elasticsearch/rest/support/RestUtils.java @@ -19,9 +19,9 @@ package org.elasticsearch.rest.support; -import java.io.UnsupportedEncodingException; -import java.net.URLDecoder; -import java.nio.charset.UnsupportedCharsetException; +import org.elasticsearch.common.base.Charsets; + +import java.nio.charset.Charset; import java.util.Map; /** @@ -29,43 +29,171 @@ import java.util.Map; */ public class RestUtils { - public static void decodeQueryString(String queryString, int fromIndex, Map params) { + public static void decodeQueryString(String s, int fromIndex, Map params) { if (fromIndex < 0) { return; } - if (fromIndex >= queryString.length()) { + if (fromIndex >= s.length()) { return; } - int toIndex; - while ((toIndex = queryString.indexOf('&', fromIndex)) >= 0) { - int idx = queryString.indexOf('=', fromIndex); - if (fromIndex < idx && idx < toIndex) { - params.put(decodeComponent(queryString.substring(fromIndex, idx)), decodeComponent(queryString.substring(idx + 1, toIndex))); + + String name = null; + int pos = fromIndex; // Beginning of the unprocessed region + int i; // End of the unprocessed region + char c = 0; // Current character + for (i = fromIndex; i < s.length(); i++) { + c = s.charAt(i); + if (c == '=' && name == null) { + if (pos != i) { + name = decodeComponent(s.substring(pos, i)); + } + pos = i + 1; + } else if (c == '&') { + if (name == null && pos != i) { + // We haven't seen an `=' so far but moved forward. + // Must be a param of the form '&a&' so add it with + // an empty value. + addParam(params, decodeComponent(s.substring(pos, i)), ""); + } else if (name != null) { + addParam(params, name, decodeComponent(s.substring(pos, i))); + name = null; + } + pos = i + 1; } - fromIndex = toIndex + 1; } - int idx = queryString.indexOf('=', fromIndex); - if (idx < 0) { - return; + + if (pos != i) { // Are there characters we haven't dealt with? + if (name == null) { // Yes and we haven't seen any `='. + addParam(params, decodeComponent(s.substring(pos, i)), ""); + } else { // Yes and this must be the last value. + addParam(params, name, decodeComponent(s.substring(pos, i))); + } + } else if (name != null) { // Have we seen a name without value? + addParam(params, name, ""); } - params.put(decodeComponent(queryString.substring(fromIndex, idx)), decodeComponent(queryString.substring(idx + 1))); } - public static String decodeComponent(String s) { + private static void addParam(Map params, String name, String value) { + params.put(name, value); + } + + /** + * Decodes a bit of an URL encoded by a browser. + *

+ * This is equivalent to calling {@link #decodeComponent(String, Charset)} + * with the UTF-8 charset (recommended to comply with RFC 3986, Section 2). + * + * @param s The string to decode (can be empty). + * @return The decoded string, or {@code s} if there's nothing to decode. + * If the string to decode is {@code null}, returns an empty string. + * @throws IllegalArgumentException if the string contains a malformed + * escape sequence. + */ + public static String decodeComponent(final String s) { + return decodeComponent(s, Charsets.UTF_8); + } + + /** + * Decodes a bit of an URL encoded by a browser. + *

+ * The string is expected to be encoded as per RFC 3986, Section 2. + * This is the encoding used by JavaScript functions {@code encodeURI} + * and {@code encodeURIComponent}, but not {@code escape}. For example + * in this encoding, é (in Unicode {@code U+00E9} or in UTF-8 + * {@code 0xC3 0xA9}) is encoded as {@code %C3%A9} or {@code %c3%a9}. + *

+ * This is essentially equivalent to calling + * {@link java.net.URLDecoder URLDecoder}.{@link + * java.net.URLDecoder#decode(String, String)} + * except that it's over 2x faster and generates less garbage for the GC. + * Actually this function doesn't allocate any memory if there's nothing + * to decode, the argument itself is returned. + * + * @param s The string to decode (can be empty). + * @param charset The charset to use to decode the string (should really + * be {@link Charsets#UTF_8}. + * @return The decoded string, or {@code s} if there's nothing to decode. + * If the string to decode is {@code null}, returns an empty string. + * @throws IllegalArgumentException if the string contains a malformed + * escape sequence. + */ + @SuppressWarnings("fallthrough") + public static String decodeComponent(final String s, final Charset charset) { if (s == null) { return ""; } - int numChars = s.length(); - for (int i = 0; i < numChars; i++) { - // do an initial check if it requires decoding do it and return - if (s.charAt(i) == '+' || s.charAt(i) == '%') { - try { - return URLDecoder.decode(s, "UTF8"); - } catch (UnsupportedEncodingException e) { - throw new UnsupportedCharsetException("UTF8"); - } + final int size = s.length(); + boolean modified = false; + for (int i = 0; i < size; i++) { + final char c = s.charAt(i); + switch (c) { + case '%': + i++; // We can skip at least one char, e.g. `%%'. + // Fall through. + case '+': + modified = true; + break; } } - return s; + if (!modified) { + return s; + } + final byte[] buf = new byte[size]; + int pos = 0; // position in `buf'. + for (int i = 0; i < size; i++) { + char c = s.charAt(i); + switch (c) { + case '+': + buf[pos++] = ' '; // "+" -> " " + break; + case '%': + if (i == size - 1) { + throw new IllegalArgumentException("unterminated escape" + + " sequence at end of string: " + s); + } + c = s.charAt(++i); + if (c == '%') { + buf[pos++] = '%'; // "%%" -> "%" + break; + } else if (i == size - 1) { + throw new IllegalArgumentException("partial escape" + + " sequence at end of string: " + s); + } + c = decodeHexNibble(c); + final char c2 = decodeHexNibble(s.charAt(++i)); + if (c == Character.MAX_VALUE || c2 == Character.MAX_VALUE) { + throw new IllegalArgumentException( + "invalid escape sequence `%" + s.charAt(i - 1) + + s.charAt(i) + "' at index " + (i - 2) + + " of: " + s); + } + c = (char) (c * 16 + c2); + // Fall through. + default: + buf[pos++] = (byte) c; + break; + } + } + return new String(buf, 0, pos, charset); + } + + /** + * Helper to decode half of a hexadecimal number from a string. + * + * @param c The ASCII character of the hexadecimal number to decode. + * Must be in the range {@code [0-9a-fA-F]}. + * @return The hexadecimal value represented in the ASCII character + * given, or {@link Character#MAX_VALUE} if the character is invalid. + */ + private static char decodeHexNibble(final char c) { + if ('0' <= c && c <= '9') { + return (char) (c - '0'); + } else if ('a' <= c && c <= 'f') { + return (char) (c - 'a' + 10); + } else if ('A' <= c && c <= 'F') { + return (char) (c - 'A' + 10); + } else { + return Character.MAX_VALUE; + } } } diff --git a/modules/elasticsearch/src/test/java/org/elasticsearch/rest/util/RestUtilsTests.java b/modules/elasticsearch/src/test/java/org/elasticsearch/rest/util/RestUtilsTests.java index e5f06b43368..84414b93a12 100644 --- a/modules/elasticsearch/src/test/java/org/elasticsearch/rest/util/RestUtilsTests.java +++ b/modules/elasticsearch/src/test/java/org/elasticsearch/rest/util/RestUtilsTests.java @@ -84,37 +84,40 @@ public class RestUtilsTests { params.clear(); uri = "something?="; RestUtils.decodeQueryString(uri, uri.indexOf('?') + 1, params); - assertThat(params.size(), equalTo(1)); - assertThat(params.get(""), equalTo("")); + assertThat(params.size(), equalTo(0)); params.clear(); uri = "something?&="; RestUtils.decodeQueryString(uri, uri.indexOf('?') + 1, params); - assertThat(params.size(), equalTo(1)); - assertThat(params.get(""), equalTo("")); + assertThat(params.size(), equalTo(0)); params.clear(); uri = "something?a"; RestUtils.decodeQueryString(uri, uri.indexOf('?') + 1, params); - assertThat(params.size(), equalTo(0)); + assertThat(params.size(), equalTo(1)); + assertThat(params.get("a"), equalTo("")); params.clear(); uri = "something?p=v&a"; RestUtils.decodeQueryString(uri, uri.indexOf('?') + 1, params); - assertThat(params.size(), equalTo(1)); + assertThat(params.size(), equalTo(2)); + assertThat(params.get("a"), equalTo("")); assertThat(params.get("p"), equalTo("v")); params.clear(); uri = "something?p=v&a&p1=v1"; RestUtils.decodeQueryString(uri, uri.indexOf('?') + 1, params); - assertThat(params.size(), equalTo(2)); + assertThat(params.size(), equalTo(3)); + assertThat(params.get("a"), equalTo("")); assertThat(params.get("p"), equalTo("v")); assertThat(params.get("p1"), equalTo("v1")); params.clear(); uri = "something?p=v&a&b&p1=v1"; RestUtils.decodeQueryString(uri, uri.indexOf('?') + 1, params); - assertThat(params.size(), equalTo(2)); + assertThat(params.size(), equalTo(4)); + assertThat(params.get("a"), equalTo("")); + assertThat(params.get("b"), equalTo("")); assertThat(params.get("p"), equalTo("v")); assertThat(params.get("p1"), equalTo("v1")); }