Fix handling of non-ASCII letters & numbers in RandomStringUtils (#1273)
An optimization introduced in commitf382d61a03
incorrectly assumed that letters and numbers/digits are ASCII. For example, `RandomStringUtils.random(1, 0x4e00, 0x4e01, true, false)` would take too long to terminate and would not output the correct answer, that is the string with a single character `0x4e00`. The issue is that `end` would be replaced by `'z' + 1 = 123` (the latest ASCII letter + 1) there:f382d61a03/src/main/java/org/apache/commons/lang3/RandomStringUtils.java (L266-L270)
Thus, we would have `end < start` and `gap = end - start < 0`. This PR fixes this issue by restricting the optimization to cases where only ASCII characters are requested, that is `end < 0x7f`. This PR also slightly clarifies the code, using constant variables instead of '0', '9', 'A', and 'z'. The PR also includes two regression tests. Co-authored-by: Fabrice Benhamouda <yfabrice@amazon.com>
This commit is contained in:
parent
d1bf9bfe74
commit
fd866a9c2d
|
@ -277,45 +277,50 @@ public class RandomStringUtils {
|
|||
end = Character.MAX_CODE_POINT;
|
||||
}
|
||||
|
||||
// Optimize generation of full alphanumerical characters
|
||||
// Normally, we would need to pick a 7-bit integer, since gap = 'z' - '0' + 1 = 75 > 64
|
||||
// In turn, this would make us reject the sampling with probability 1 - 62 / 2^7 > 1 / 2
|
||||
// Instead we can pick directly from the right set of 62 characters, which requires
|
||||
// picking a 6-bit integer and only rejecting with probability 2 / 64 = 1 / 32
|
||||
if (chars == null && letters && numbers && start <= '0' && end >= 'z' + 1) {
|
||||
return random(count, 0, 0, false, false, ALPHANUMERICAL_CHARS, random);
|
||||
}
|
||||
// Optimizations and tests when chars == null and using ASCII characters (end <= 0x7f)
|
||||
if (chars == null && end <= 0x7f) {
|
||||
final int zeroDigitAscii = 48; // '0'
|
||||
final int lastDigitAscii = 57; // '9'
|
||||
final int firstLetterAscii = 65; // 'A'
|
||||
final int lastLetterAscii = 122; // 'z'
|
||||
|
||||
// Optimize start and end when filtering by letters and/or numbers:
|
||||
// The range provided may be too large since we filter anyway afterward.
|
||||
// Note the use of Math.min/max (as opposed to setting start to '0' for example),
|
||||
// since it is possible the range start/end excludes some of the letters/numbers,
|
||||
// e.g., it is possible that start already is '1' when numbers = true, and start
|
||||
// needs to stay equal to '1' in that case.
|
||||
if (chars == null) {
|
||||
// Optimize generation of full alphanumerical characters
|
||||
// Normally, we would need to pick a 7-bit integer, since gap = 'z' - '0' + 1 = 75 > 64
|
||||
// In turn, this would make us reject the sampling with probability 1 - 62 / 2^7 > 1 / 2
|
||||
// Instead we can pick directly from the right set of 62 characters, which requires
|
||||
// picking a 6-bit integer and only rejecting with probability 2 / 64 = 1 / 32
|
||||
if (letters && numbers && start <= zeroDigitAscii && end >= lastLetterAscii + 1) {
|
||||
return random(count, 0, 0, false, false, ALPHANUMERICAL_CHARS, random);
|
||||
}
|
||||
|
||||
if (numbers && end <= zeroDigitAscii || letters && end <= firstLetterAscii) {
|
||||
throw new IllegalArgumentException(
|
||||
"Parameter end (" + end + ") must be greater then (" + zeroDigitAscii + ") for generating digits "
|
||||
+ "or greater then (" + firstLetterAscii + ") for generating letters.");
|
||||
}
|
||||
|
||||
// Optimize start and end when filtering by letters and/or numbers:
|
||||
// The range provided may be too large since we filter anyway afterward.
|
||||
// Note the use of Math.min/max (as opposed to setting start to '0' for example),
|
||||
// since it is possible the range start/end excludes some of the letters/numbers,
|
||||
// e.g., it is possible that start already is '1' when numbers = true, and start
|
||||
// needs to stay equal to '1' in that case.
|
||||
// Note that because of the above test, we will always have start < end
|
||||
// even after this optimization.
|
||||
if (letters && numbers) {
|
||||
start = Math.max('0', start);
|
||||
end = Math.min('z' + 1, end);
|
||||
start = Math.max(zeroDigitAscii, start);
|
||||
end = Math.min(lastLetterAscii + 1, end);
|
||||
} else if (numbers) {
|
||||
// just numbers, no letters
|
||||
start = Math.max('0', start);
|
||||
end = Math.min('9' + 1, end);
|
||||
start = Math.max(zeroDigitAscii, start);
|
||||
end = Math.min(lastDigitAscii + 1, end);
|
||||
} else if (letters) {
|
||||
// just letters, no numbers
|
||||
start = Math.max('A', start);
|
||||
end = Math.min('z' + 1, end);
|
||||
start = Math.max(firstLetterAscii, start);
|
||||
end = Math.min(lastLetterAscii + 1, end);
|
||||
}
|
||||
}
|
||||
|
||||
final int zeroDigitAscii = 48;
|
||||
final int firstLetterAscii = 65;
|
||||
|
||||
if (chars == null && (numbers && end <= zeroDigitAscii || letters && end <= firstLetterAscii)) {
|
||||
throw new IllegalArgumentException(
|
||||
"Parameter end (" + end + ") must be greater then (" + zeroDigitAscii + ") for generating digits "
|
||||
+ "or greater then (" + firstLetterAscii + ") for generating letters.");
|
||||
}
|
||||
|
||||
final StringBuilder builder = new StringBuilder(count);
|
||||
final int gap = end - start;
|
||||
final int gapBits = Integer.SIZE - Integer.numberOfLeadingZeros(gap);
|
||||
|
|
|
@ -732,4 +732,74 @@ public class RandomStringUtilsTest extends AbstractLangTest {
|
|||
assertNotEquals(r1, r3);
|
||||
assertNotEquals(r2, r3);
|
||||
}
|
||||
|
||||
/**
|
||||
* Test {@code RandomStringUtils.random} works appropriately when letters=true
|
||||
* and the range does not only include ASCII letters.
|
||||
* Fails with probability less than 2^-40 (in practice this never happens).
|
||||
*/
|
||||
@ParameterizedTest
|
||||
@MethodSource("randomProvider")
|
||||
void testNonASCIILetters(final RandomStringUtils rsu) {
|
||||
// Check that the following create a string with 10 characters 0x4e00 (a non-ASCII letter)
|
||||
String r1 = rsu.next(10, 0x4e00, 0x4e01, true, false);
|
||||
assertEquals(10, r1.length(), "wrong length");
|
||||
for (int i = 0; i < r1.length(); i++) {
|
||||
assertEquals(0x4e00, r1.charAt(i), "characters not all equal to 0x4e00");
|
||||
}
|
||||
|
||||
// Same with both letters=true and numbers=true
|
||||
r1 = rsu.next(10, 0x4e00, 0x4e01, true, true);
|
||||
assertEquals(10, r1.length(), "wrong length");
|
||||
for (int i = 0; i < r1.length(); i++) {
|
||||
assertEquals(0x4e00, r1.charAt(i), "characters not all equal to 0x4e00");
|
||||
}
|
||||
|
||||
// Check that at least one letter is not ASCII
|
||||
boolean found = false;
|
||||
r1 = rsu.next(40, 'F', 0x3000, true, false);
|
||||
assertEquals(40, r1.length(), "wrong length");
|
||||
for (int i = 0; i < r1.length(); i++) {
|
||||
assertTrue(Character.isLetter(r1.charAt(i)), "characters not all letters");
|
||||
if (r1.charAt(i) > 0x7f) {
|
||||
found = true;
|
||||
}
|
||||
}
|
||||
assertTrue(found, "no non-ASCII letter generated");
|
||||
}
|
||||
|
||||
/**
|
||||
* Test {@code RandomStringUtils.random} works appropriately when numbers=true
|
||||
* and the range does not only include ASCII numbers/digits.
|
||||
* Fails with probability less than 2^-40 (in practice this never happens).
|
||||
*/
|
||||
@ParameterizedTest
|
||||
@MethodSource("randomProvider")
|
||||
void testNonASCIINumbers(final RandomStringUtils rsu) {
|
||||
// Check that the following create a string with 10 characters 0x0660 (a non-ASCII digit)
|
||||
String r1 = rsu.next(10, 0x0660, 0x0661, false, true);
|
||||
assertEquals(10, r1.length(), "wrong length");
|
||||
for (int i = 0; i < r1.length(); i++) {
|
||||
assertEquals(0x0660, r1.charAt(i), "characters not all equal to 0x0660");
|
||||
}
|
||||
|
||||
// Same with both letters=true and numbers=true
|
||||
r1 = rsu.next(10, 0x0660, 0x0661, true, true);
|
||||
assertEquals(10, r1.length(), "wrong length");
|
||||
for (int i = 0; i < r1.length(); i++) {
|
||||
assertEquals(0x0660, r1.charAt(i), "characters not all equal to 0x0660");
|
||||
}
|
||||
|
||||
// Check that at least one letter is not ASCII
|
||||
boolean found = false;
|
||||
r1 = rsu.next(40, 'F', 0x3000, false, true);
|
||||
assertEquals(40, r1.length(), "wrong length");
|
||||
for (int i = 0; i < r1.length(); i++) {
|
||||
assertTrue(Character.isDigit(r1.charAt(i)), "characters not all numbers");
|
||||
if (r1.charAt(i) > 0x7f) {
|
||||
found = true;
|
||||
}
|
||||
}
|
||||
assertTrue(found, "no non-ASCII number generated");
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue