Fix handling of non-ASCII letters & numbers in RandomStringUtils (#1273)

An optimization introduced in commit
f382d61a03
incorrectly assumed that letters and numbers/digits are ASCII.

For example, `RandomStringUtils.random(1, 0x4e00, 0x4e01, true, false)`
would take too long to terminate and would not output the correct
answer, that is the string with a single character `0x4e00`.
The issue is that `end` would be replaced by `'z' + 1 = 123` (the latest
ASCII letter + 1) there: f382d61a03/src/main/java/org/apache/commons/lang3/RandomStringUtils.java (L266-L270)
 Thus, we would have `end < start` and `gap = end - start < 0`.

This PR fixes this issue by restricting the optimization to cases
where only ASCII characters are requested, that is `end < 0x7f`.
This PR also slightly clarifies the code, using constant variables
instead of '0', '9', 'A', and 'z'.

The PR also includes two regression tests.

Co-authored-by: Fabrice Benhamouda <yfabrice@amazon.com>
This commit is contained in:
Fabrice Benhamouda 2024-09-10 20:47:24 -04:00 committed by GitHub
parent d1bf9bfe74
commit fd866a9c2d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 105 additions and 30 deletions

View File

@ -277,45 +277,50 @@ public class RandomStringUtils {
end = Character.MAX_CODE_POINT;
}
// Optimize generation of full alphanumerical characters
// Normally, we would need to pick a 7-bit integer, since gap = 'z' - '0' + 1 = 75 > 64
// In turn, this would make us reject the sampling with probability 1 - 62 / 2^7 > 1 / 2
// Instead we can pick directly from the right set of 62 characters, which requires
// picking a 6-bit integer and only rejecting with probability 2 / 64 = 1 / 32
if (chars == null && letters && numbers && start <= '0' && end >= 'z' + 1) {
return random(count, 0, 0, false, false, ALPHANUMERICAL_CHARS, random);
}
// Optimizations and tests when chars == null and using ASCII characters (end <= 0x7f)
if (chars == null && end <= 0x7f) {
final int zeroDigitAscii = 48; // '0'
final int lastDigitAscii = 57; // '9'
final int firstLetterAscii = 65; // 'A'
final int lastLetterAscii = 122; // 'z'
// Optimize start and end when filtering by letters and/or numbers:
// The range provided may be too large since we filter anyway afterward.
// Note the use of Math.min/max (as opposed to setting start to '0' for example),
// since it is possible the range start/end excludes some of the letters/numbers,
// e.g., it is possible that start already is '1' when numbers = true, and start
// needs to stay equal to '1' in that case.
if (chars == null) {
// Optimize generation of full alphanumerical characters
// Normally, we would need to pick a 7-bit integer, since gap = 'z' - '0' + 1 = 75 > 64
// In turn, this would make us reject the sampling with probability 1 - 62 / 2^7 > 1 / 2
// Instead we can pick directly from the right set of 62 characters, which requires
// picking a 6-bit integer and only rejecting with probability 2 / 64 = 1 / 32
if (letters && numbers && start <= zeroDigitAscii && end >= lastLetterAscii + 1) {
return random(count, 0, 0, false, false, ALPHANUMERICAL_CHARS, random);
}
if (numbers && end <= zeroDigitAscii || letters && end <= firstLetterAscii) {
throw new IllegalArgumentException(
"Parameter end (" + end + ") must be greater then (" + zeroDigitAscii + ") for generating digits "
+ "or greater then (" + firstLetterAscii + ") for generating letters.");
}
// Optimize start and end when filtering by letters and/or numbers:
// The range provided may be too large since we filter anyway afterward.
// Note the use of Math.min/max (as opposed to setting start to '0' for example),
// since it is possible the range start/end excludes some of the letters/numbers,
// e.g., it is possible that start already is '1' when numbers = true, and start
// needs to stay equal to '1' in that case.
// Note that because of the above test, we will always have start < end
// even after this optimization.
if (letters && numbers) {
start = Math.max('0', start);
end = Math.min('z' + 1, end);
start = Math.max(zeroDigitAscii, start);
end = Math.min(lastLetterAscii + 1, end);
} else if (numbers) {
// just numbers, no letters
start = Math.max('0', start);
end = Math.min('9' + 1, end);
start = Math.max(zeroDigitAscii, start);
end = Math.min(lastDigitAscii + 1, end);
} else if (letters) {
// just letters, no numbers
start = Math.max('A', start);
end = Math.min('z' + 1, end);
start = Math.max(firstLetterAscii, start);
end = Math.min(lastLetterAscii + 1, end);
}
}
final int zeroDigitAscii = 48;
final int firstLetterAscii = 65;
if (chars == null && (numbers && end <= zeroDigitAscii || letters && end <= firstLetterAscii)) {
throw new IllegalArgumentException(
"Parameter end (" + end + ") must be greater then (" + zeroDigitAscii + ") for generating digits "
+ "or greater then (" + firstLetterAscii + ") for generating letters.");
}
final StringBuilder builder = new StringBuilder(count);
final int gap = end - start;
final int gapBits = Integer.SIZE - Integer.numberOfLeadingZeros(gap);

View File

@ -732,4 +732,74 @@ public class RandomStringUtilsTest extends AbstractLangTest {
assertNotEquals(r1, r3);
assertNotEquals(r2, r3);
}
/**
* Test {@code RandomStringUtils.random} works appropriately when letters=true
* and the range does not only include ASCII letters.
* Fails with probability less than 2^-40 (in practice this never happens).
*/
@ParameterizedTest
@MethodSource("randomProvider")
void testNonASCIILetters(final RandomStringUtils rsu) {
// Check that the following create a string with 10 characters 0x4e00 (a non-ASCII letter)
String r1 = rsu.next(10, 0x4e00, 0x4e01, true, false);
assertEquals(10, r1.length(), "wrong length");
for (int i = 0; i < r1.length(); i++) {
assertEquals(0x4e00, r1.charAt(i), "characters not all equal to 0x4e00");
}
// Same with both letters=true and numbers=true
r1 = rsu.next(10, 0x4e00, 0x4e01, true, true);
assertEquals(10, r1.length(), "wrong length");
for (int i = 0; i < r1.length(); i++) {
assertEquals(0x4e00, r1.charAt(i), "characters not all equal to 0x4e00");
}
// Check that at least one letter is not ASCII
boolean found = false;
r1 = rsu.next(40, 'F', 0x3000, true, false);
assertEquals(40, r1.length(), "wrong length");
for (int i = 0; i < r1.length(); i++) {
assertTrue(Character.isLetter(r1.charAt(i)), "characters not all letters");
if (r1.charAt(i) > 0x7f) {
found = true;
}
}
assertTrue(found, "no non-ASCII letter generated");
}
/**
* Test {@code RandomStringUtils.random} works appropriately when numbers=true
* and the range does not only include ASCII numbers/digits.
* Fails with probability less than 2^-40 (in practice this never happens).
*/
@ParameterizedTest
@MethodSource("randomProvider")
void testNonASCIINumbers(final RandomStringUtils rsu) {
// Check that the following create a string with 10 characters 0x0660 (a non-ASCII digit)
String r1 = rsu.next(10, 0x0660, 0x0661, false, true);
assertEquals(10, r1.length(), "wrong length");
for (int i = 0; i < r1.length(); i++) {
assertEquals(0x0660, r1.charAt(i), "characters not all equal to 0x0660");
}
// Same with both letters=true and numbers=true
r1 = rsu.next(10, 0x0660, 0x0661, true, true);
assertEquals(10, r1.length(), "wrong length");
for (int i = 0; i < r1.length(); i++) {
assertEquals(0x0660, r1.charAt(i), "characters not all equal to 0x0660");
}
// Check that at least one letter is not ASCII
boolean found = false;
r1 = rsu.next(40, 'F', 0x3000, false, true);
assertEquals(40, r1.length(), "wrong length");
for (int i = 0; i < r1.length(); i++) {
assertTrue(Character.isDigit(r1.charAt(i)), "characters not all numbers");
if (r1.charAt(i) > 0x7f) {
found = true;
}
}
assertTrue(found, "no non-ASCII number generated");
}
}