Adding a test and a fix for LANG-100. This is a bug in which the randomly created String can sometimes be illegal Unicode, because the code does not consider the relationships that exist between characters (for example, a high surrogate must be followed by a low surrogate). High and low surrogates are now dealt with, but I'm skipping private use high surrogates because I can't find out what to do with them. I need to plod very slowly through the spec. This site was very useful: http://www.alanwood.net/unicode/private_use_high_surrogates.html

git-svn-id: https://svn.apache.org/repos/asf/jakarta/commons/proper/lang/trunk@417319 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Henri Yandell 2006-06-27 00:28:43 +00:00
parent cac478de51
commit b014341965
2 changed files with 61 additions and 3 deletions

View File

@ -18,6 +18,14 @@
import java.util.Random; import java.util.Random;
/** /**
* <p>Operations for random <code>String</code>s.</p> * <p>Operations for random <code>String</code>s.</p>
* <p>Currently <em>private high surrogate</em> characters are ignored.
* These are Unicode characters that fall between the values 56192 (db80)
* and 56319 (dbff); they are skipped because we don't know how to handle them.
* High and low surrogates are correctly dealt with - that is, if a
* high surrogate, 55296 (d800) to 56191 (db7f), is randomly chosen
* then it is followed by a low surrogate. If a low surrogate,
* 56320 (dc00) to 57343 (dfff), is chosen then it is placed after a randomly
* chosen high surrogate.</p>
* *
* @author GenerationJava Core library * @author GenerationJava Core library
* @author <a href="mailto:bayard@generationjava.com">Henri Yandell</a> * @author <a href="mailto:bayard@generationjava.com">Henri Yandell</a>
@ -243,8 +251,32 @@ public static String random(int count, int start, int end, boolean letters, bool
} }
if ((letters && Character.isLetter(ch)) if ((letters && Character.isLetter(ch))
|| (numbers && Character.isDigit(ch)) || (numbers && Character.isDigit(ch))
|| (!letters && !numbers)) { || (!letters && !numbers))
buffer[count] = ch; {
if(ch >= 56320 && ch <= 57343) {
if(count == 0) {
count++;
} else {
// low surrogate, insert high surrogate after putting it in
buffer[count] = ch;
count--;
buffer[count] = (char) (55296 + random.nextInt(128));
}
} else if(ch >= 55296 && ch <= 56191) {
if(count == 0) {
count++;
} else {
// high surrogate, insert low surrogate before putting it in
buffer[count] = (char) (56320 + random.nextInt(128));
count--;
buffer[count] = ch;
}
} else if(ch >= 56192 && ch <= 56319) {
// private high surrogate: correct handling is not yet understood, so skip it
count++;
} else {
buffer[count] = ch;
}
} else { } else {
count++; count++;
} }

View File

@ -315,7 +315,33 @@ private double chiSquare(int[] expected, int[] observed) {
} }
return sumSq; return sumSq;
} }
/**
 * Verifies that a string produced by {@link RandomStringUtils#random(int)}
 * survives an encode-to-UTF-8 / decode-from-UTF-8 round trip unchanged
 * (regression test for LANG-100).
 *
 * @author stefanhoehne@fastmail.fm
 * @throws Exception if the named encoding is not supported
 */
public void testLang100() throws Exception {
    final int size = 5000;
    final String encoding = "UTF-8";
    final String orig = RandomStringUtils.random(size);
    final String copy = new String(orig.getBytes(encoding), encoding);

    // Compare char by char first so that a failure reports the exact
    // offset together with both code units in hexadecimal.
    final int common = Math.min(orig.length(), copy.length());
    for (int i = 0; i < common; i++) {
        final char expected = orig.charAt(i);
        final char actual = copy.charAt(i);
        final String message = "differs at " + i + "(" + Integer.toHexString(expected) + ","
                + Integer.toHexString(actual) + ")";
        assertEquals(message, expected, actual);
    }

    // The loop only covers the common prefix, so check the lengths as well.
    assertEquals(orig.length(), copy.length());
    // Final whole-string comparison for completeness.
    assertEquals(orig, copy);
}
public static void main(String args[]) { public static void main(String args[]) {
TestRunner.run(suite()); TestRunner.run(suite());