Adding a test and a fix for LANG-100. This is a bug in which the randomly created String can sometimes be illegal Unicode, because the code does not take into account the relationships that exist between characters (surrogate pairs). High and low surrogates are now dealt with, but I'm skipping private-use high surrogates because I can't find out what to do with them. I need to go and plod very slowly through the spec. This site was very useful: http://www.alanwood.net/unicode/private_use_high_surrogates.html
git-svn-id: https://svn.apache.org/repos/asf/jakarta/commons/proper/lang/trunk@417319 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
cac478de51
commit
b014341965
|
@ -18,6 +18,14 @@ package org.apache.commons.lang;
|
|||
import java.util.Random;
|
||||
/**
|
||||
* <p>Operations for random <code>String</code>s.</p>
|
||||
* <p>Currently <em>private high surrogate</em> characters are ignored.
|
||||
* These are unicode characters that fall between the values 56192 (db80)
|
||||
* and 56319 (dbff); they are ignored because we don't know how to handle them.
|
||||
* High and low surrogates are correctly dealt with - that is if a
|
||||
* high surrogate is randomly chosen, 55296 (d800) to 56191 (db7f)
|
||||
* then it is followed by a low surrogate. If a low surrogate is chosen,
|
||||
* 56320 (dc00) to 57343 (dfff) then it is placed after a randomly
|
||||
* chosen high surrogate. </p>
|
||||
*
|
||||
* @author GenerationJava Core library
|
||||
* @author <a href="mailto:bayard@generationjava.com">Henri Yandell</a>
|
||||
|
@ -243,8 +251,32 @@ public class RandomStringUtils {
|
|||
}
|
||||
if ((letters && Character.isLetter(ch))
|
||||
|| (numbers && Character.isDigit(ch))
|
||||
|| (!letters && !numbers)) {
|
||||
buffer[count] = ch;
|
||||
|| (!letters && !numbers))
|
||||
{
|
||||
if(ch >= 56320 && ch <= 57343) {
|
||||
if(count == 0) {
|
||||
count++;
|
||||
} else {
|
||||
// low surrogate, insert high surrogate after putting it in
|
||||
buffer[count] = ch;
|
||||
count--;
|
||||
buffer[count] = (char) (55296 + random.nextInt(128));
|
||||
}
|
||||
} else if(ch >= 55296 && ch <= 56191) {
|
||||
if(count == 0) {
|
||||
count++;
|
||||
} else {
|
||||
// high surrogate, insert low surrogate before putting it in
|
||||
buffer[count] = (char) (56320 + random.nextInt(128));
|
||||
count--;
|
||||
buffer[count] = ch;
|
||||
}
|
||||
} else if(ch >= 56192 && ch <= 56319) {
|
||||
// private high surrogate, no effing clue, so skip it
|
||||
count++;
|
||||
} else {
|
||||
buffer[count] = ch;
|
||||
}
|
||||
} else {
|
||||
count++;
|
||||
}
|
||||
|
|
|
@ -315,7 +315,33 @@ public class RandomStringUtilsTest extends junit.framework.TestCase {
|
|||
}
|
||||
return sumSq;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Checks that the string returned by {@link RandomStringUtils#random(int)}
|
||||
* can be converted to UTF-8 and back without loss (regression test for LANG-100).
|
||||
*
|
||||
* @author stefanhoehne@fastmail.fm
|
||||
* @throws Exception declared because the charset-name overloads of getBytes and the String constructor throw a checked exception
|
||||
*/
|
||||
public void testLang100() throws Exception {
|
||||
// length passed to RandomStringUtils.random below
int size = 5000;
|
||||
String encoding = "UTF-8";
|
||||
String orig = RandomStringUtils.random(size);
|
||||
// round trip: encode the random string to bytes, then decode those bytes again
byte[] bytes = orig.getBytes(encoding);
|
||||
String copy = new String(bytes, encoding);
|
||||
|
||||
// verbose compare: walk the common prefix char by char so a failure reports the first differing index
|
||||
for (int i=0; i < orig.length() && i < copy.length(); i++) {
|
||||
char o = orig.charAt(i);
|
||||
char c = copy.charAt(i);
|
||||
// the failure message shows both chars in hex; Character.hashCode() is used to obtain the char's numeric value
assertEquals("differs at " + i + "(" + Integer.toHexString((new Character(o)).hashCode()) + "," +
|
||||
Integer.toHexString((new Character(c)).hashCode()) + ")", o, c);
|
||||
}
|
||||
// the loop above only covers the common prefix, so compare the lengths as well
|
||||
assertEquals(orig.length(), copy.length());
|
||||
// final whole-string equality check, for completeness
|
||||
assertEquals(orig, copy);
|
||||
}
|
||||
|
||||
public static void main(String args[]) {
|
||||
TestRunner.run(suite());
|
||||
|
|
Loading…
Reference in New Issue