diff --git a/src/main/java/org/apache/commons/lang3/text/translate/NumericEntityUnescaper.java b/src/main/java/org/apache/commons/lang3/text/translate/NumericEntityUnescaper.java index ffd969f3c..3d4c2383f 100644 --- a/src/main/java/org/apache/commons/lang3/text/translate/NumericEntityUnescaper.java +++ b/src/main/java/org/apache/commons/lang3/text/translate/NumericEntityUnescaper.java @@ -20,8 +20,10 @@ import java.io.Writer; /** - * Translate XML numeric entities of the form &#[xX]?\d+; to + * Translate XML numeric entities of the form &#[xX]?\d+;? to * the specific codepoint. + * + * Note that the semi-colon is optional. * * @since 3.0 * @version $Id$ @@ -33,7 +35,9 @@ public class NumericEntityUnescaper extends CharSequenceTranslator { */ @Override public int translate(CharSequence input, int index, Writer out) throws IOException { - if(input.charAt(index) == '&' && index < (input.length() - 1) && input.charAt(index + 1) == '#') { + int seqEnd = input.length(); + // Uses -2 to ensure there is something after the &# + if(input.charAt(index) == '&' && index < seqEnd - 2 && input.charAt(index + 1) == '#') { int start = index + 2; boolean isHex = false; @@ -41,10 +45,19 @@ public int translate(CharSequence input, int index, Writer out) throws IOExcepti if(firstChar == 'x' || firstChar == 'X') { start++; isHex = true; + + // Check there's more than just an x after the &# + if(start == seqEnd) { + return 0; + } } int end = start; - while(input.charAt(end) != ';') { + // Note that this supports character codes without a ; on the end + while(end < seqEnd && ( (input.charAt(end) >= '0' && input.charAt(end) <= '9') || + (input.charAt(end) >= 'a' && input.charAt(end) <= 'f') || + (input.charAt(end) >= 'A' && input.charAt(end) <= 'F') ) ) + { end++; } @@ -56,6 +69,7 @@ public int translate(CharSequence input, int index, Writer out) throws IOExcepti entityValue = Integer.parseInt(input.subSequence(start, end).toString(), 10); } } catch(NumberFormatException nfe) { + System.err.println("FAIL: " + input.subSequence(start, end) + "[" + start +"]["+ end +"]"); return 0; } @@ -66,7 +80,10 @@ public int translate(CharSequence input, int index, Writer out) throws IOExcepti } else { out.write(entityValue); } - return 2 + (end - start) + (isHex ? 1 : 0) + 1; + + boolean semiNext = (end != seqEnd) && (input.charAt(end) == ';'); + + return 2 + (end - start) + (isHex ? 1 : 0) + (semiNext ? 1 : 0); } return 0; } diff --git a/src/test/java/org/apache/commons/lang3/text/translate/NumericEntityUnescaperTest.java b/src/test/java/org/apache/commons/lang3/text/translate/NumericEntityUnescaperTest.java index 9e2d2496c..6cfa772cc 100644 --- a/src/test/java/org/apache/commons/lang3/text/translate/NumericEntityUnescaperTest.java +++ b/src/test/java/org/apache/commons/lang3/text/translate/NumericEntityUnescaperTest.java @@ -36,11 +36,20 @@ public void testSupplementaryUnescaping() { public void testOutOfBounds() { NumericEntityUnescaper neu = new NumericEntityUnescaper(); - String input = "Test &"; - String expected = input; + + assertEquals("Failed to ignore when last character is &", "Test &", neu.translate("Test &")); + assertEquals("Failed to ignore when last character is &", "Test &#", neu.translate("Test &#")); + assertEquals("Failed to ignore when last character is &", "Test &#x", neu.translate("Test &#x")); + assertEquals("Failed to ignore when last character is &", "Test &#X", neu.translate("Test &#X")); + } + + public void testUnfinishedEntity() { + NumericEntityUnescaper neu = new NumericEntityUnescaper(); + String input = "Test 0 not test"; + String expected = "Test \u0030 not test"; String result = neu.translate(input); - assertEquals("Failed to ignore when last character is &", expected, result); + assertEquals("Failed to support unfinished entities (i.e. missing semi-colon", expected, result); } }