mirror of
https://github.com/apache/commons-lang.git
synced 2025-02-08 02:58:33 +00:00
Adding a test and a fix for LANG-100. This is a bug in which the randomly created String can sometimes be illegal Unicode, because the code does not consider the relationships that exist between characters. High and low surrogates are now dealt with, but I'm skipping private high surrogates because I can't find out what to do with them. Need to go plod very slowly through the spec. This site was very useful: http://www.alanwood.net/unicode/private_use_high_surrogates.html
git-svn-id: https://svn.apache.org/repos/asf/jakarta/commons/proper/lang/trunk@417319 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
cac478de51
commit
b014341965
@ -18,6 +18,14 @@
|
|||||||
import java.util.Random;
|
import java.util.Random;
|
||||||
/**
|
/**
|
||||||
* <p>Operations for random <code>String</code>s.</p>
|
* <p>Operations for random <code>String</code>s.</p>
|
||||||
|
* <p>Currently <em>private high surrogate</em> characters are ignored.
|
||||||
|
* These are unicode characters that fall between the values 56192 (db80)
|
||||||
|
* and 56319 (dbff) as we don't know how to handle them.
|
||||||
|
* High and low surrogates are correctly dealt with - that is if a
|
||||||
|
* high surrogate is randomly chosen, 55296 (d800) to 56191 (db7f)
|
||||||
|
* then it is followed by a low surrogate. If a low surrogate is chosen,
|
||||||
|
* 56320 (dc00) to 57343 (dfff) then it is placed after a randomly
|
||||||
|
* chosen high surrogate. </p>
|
||||||
*
|
*
|
||||||
* @author GenerationJava Core library
|
* @author GenerationJava Core library
|
||||||
* @author <a href="mailto:bayard@generationjava.com">Henri Yandell</a>
|
* @author <a href="mailto:bayard@generationjava.com">Henri Yandell</a>
|
||||||
@ -243,8 +251,32 @@ public static String random(int count, int start, int end, boolean letters, bool
|
|||||||
}
|
}
|
||||||
if ((letters && Character.isLetter(ch))
|
if ((letters && Character.isLetter(ch))
|
||||||
|| (numbers && Character.isDigit(ch))
|
|| (numbers && Character.isDigit(ch))
|
||||||
|| (!letters && !numbers)) {
|
|| (!letters && !numbers))
|
||||||
|
{
|
||||||
|
if(ch >= 56320 && ch <= 57343) {
|
||||||
|
if(count == 0) {
|
||||||
|
count++;
|
||||||
|
} else {
|
||||||
|
// low surrogate, insert high surrogate after putting it in
|
||||||
buffer[count] = ch;
|
buffer[count] = ch;
|
||||||
|
count--;
|
||||||
|
buffer[count] = (char) (55296 + random.nextInt(128));
|
||||||
|
}
|
||||||
|
} else if(ch >= 55296 && ch <= 56191) {
|
||||||
|
if(count == 0) {
|
||||||
|
count++;
|
||||||
|
} else {
|
||||||
|
// high surrogate, insert low surrogate before putting it in
|
||||||
|
buffer[count] = (char) (56320 + random.nextInt(128));
|
||||||
|
count--;
|
||||||
|
buffer[count] = ch;
|
||||||
|
}
|
||||||
|
} else if(ch >= 56192 && ch <= 56319) {
|
||||||
|
// private high surrogate, no effing clue, so skip it
|
||||||
|
count++;
|
||||||
|
} else {
|
||||||
|
buffer[count] = ch;
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
count++;
|
count++;
|
||||||
}
|
}
|
||||||
|
@ -316,6 +316,32 @@ private double chiSquare(int[] expected, int[] observed) {
|
|||||||
return sumSq;
|
return sumSq;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Checks if the string got by {@link RandomStringUtils#random(int)}
|
||||||
|
* can be converted to UTF-8 and back without loss.
|
||||||
|
*
|
||||||
|
* @author stefanhoehne@fastmail.fm
|
||||||
|
* @throws Exception
|
||||||
|
*/
|
||||||
|
public void testLang100() throws Exception {
|
||||||
|
int size = 5000;
|
||||||
|
String encoding = "UTF-8";
|
||||||
|
String orig = RandomStringUtils.random(size);
|
||||||
|
byte[] bytes = orig.getBytes(encoding);
|
||||||
|
String copy = new String(bytes, encoding);
|
||||||
|
|
||||||
|
// for a verbose compare:
|
||||||
|
for (int i=0; i < orig.length() && i < copy.length(); i++) {
|
||||||
|
char o = orig.charAt(i);
|
||||||
|
char c = copy.charAt(i);
|
||||||
|
assertEquals("differs at " + i + "(" + Integer.toHexString((new Character(o)).hashCode()) + "," +
|
||||||
|
Integer.toHexString((new Character(c)).hashCode()) + ")", o, c);
|
||||||
|
}
|
||||||
|
// compare length also
|
||||||
|
assertEquals(orig.length(), copy.length());
|
||||||
|
// just to be complete
|
||||||
|
assertEquals(orig, copy);
|
||||||
|
}
|
||||||
|
|
||||||
public static void main(String args[]) {
|
public static void main(String args[]) {
|
||||||
TestRunner.run(suite());
|
TestRunner.run(suite());
|
||||||
|
Loading…
x
Reference in New Issue
Block a user