Improve CharSet testing
Bug 22095, from Phil Steitz. Rewrite CharSet parsing; much neater and simpler now. git-svn-id: https://svn.apache.org/repos/asf/jakarta/commons/proper/lang/trunk@137565 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
34f6fa8a78
commit
d43b319902
|
@ -67,8 +67,9 @@ import java.util.Set;
|
|||
*
|
||||
* @author Henri Yandell
|
||||
* @author Stephen Colebourne
|
||||
* @author Phil Steitz
|
||||
* @since 1.0
|
||||
* @version $Id: CharSet.java,v 1.10 2003/08/02 18:18:33 scolebourne Exp $
|
||||
* @version $Id: CharSet.java,v 1.11 2003/08/04 00:50:14 scolebourne Exp $
|
||||
*/
|
||||
public class CharSet implements Serializable {
|
||||
|
||||
|
@ -126,10 +127,26 @@ public class CharSet implements Serializable {
|
|||
* - set containing all the characters from the individual sets</li>
|
||||
* </ul>
|
||||
*
|
||||
* <p>The matching order is:</p>
|
||||
* <ol>
|
||||
* <li>Negated multi character range, such as "^a-e"
|
||||
* <li>Ordinary multi character range, such as "a-e"
|
||||
* <li>Negated single character, such as "^a"
|
||||
* <li>Ordinary single character, such as "a"
|
||||
* </ol>
|
||||
* <p>Matching works left to right. Once a match is found the
|
||||
* search starts again from the next character.</p>
|
||||
*
|
||||
* <p>If the same range is defined twice using the same syntax, only
|
||||
* one range will be kept.
|
||||
* Thus, "a-ca-c" creates only one range of "a-c".
|
||||
* However, "a-cabc" creates two ranges as they are defined differently.</p>
|
||||
* Thus, "a-ca-c" creates only one range of "a-c".</p>
|
||||
*
|
||||
* <p>If the start and end of a range are in the wrong order,
|
||||
* they are reversed. Thus "a-e" is the same as "e-a".
|
||||
* As a result, "a-ee-a" would create only one range,
|
||||
* as the "a-e" and "e-a" are the same.</p>
|
||||
*
|
||||
* <p>The set of characters represented is the union of the specified ranges.</p>
|
||||
*
|
||||
* <p>All CharSet objects returned by this method will be immutable.</p>
|
||||
*
|
||||
|
@ -180,71 +197,26 @@ public class CharSet implements Serializable {
|
|||
}
|
||||
|
||||
int len = str.length();
|
||||
switch (len) {
|
||||
case 0:
|
||||
// do nothing
|
||||
break;
|
||||
|
||||
case 1:
|
||||
set.add(new CharRange(str.charAt(0)));
|
||||
break;
|
||||
|
||||
default:
|
||||
int start = -1;
|
||||
boolean negated = false;
|
||||
for (int i = 0; i < len; i++) {
|
||||
char ch = str.charAt(i);
|
||||
if (ch == '-') {
|
||||
if (start == -1) {
|
||||
// dash found not as range separator
|
||||
// treat as ordinary start block char
|
||||
start = ch;
|
||||
} else if (i == len - 1) {
|
||||
// dash is last character, store two single characters
|
||||
set.add(new CharRange((char) start, (char) start, negated));
|
||||
set.add(DASH);
|
||||
start = -1;
|
||||
negated = false;
|
||||
} else {
|
||||
// range block found, store it
|
||||
set.add(new CharRange((char) start, str.charAt(++i), negated));
|
||||
start = -1;
|
||||
negated = false;
|
||||
}
|
||||
} else if (ch == '^') {
|
||||
if (start == -1) {
|
||||
if (negated) {
|
||||
// double negate, treat second as ordinary start block char
|
||||
start = ch;
|
||||
} else {
|
||||
// negate next block
|
||||
negated = true;
|
||||
}
|
||||
} else {
|
||||
// previous block has ended, store it
|
||||
set.add(new CharRange((char) start, (char) start, negated));
|
||||
start = -1;
|
||||
negated = true;
|
||||
}
|
||||
} else {
|
||||
if (start == -1) {
|
||||
// start of block
|
||||
start = ch;
|
||||
} else {
|
||||
// previous block has ended, store it, and start next block
|
||||
set.add(new CharRange((char) start, (char) start, negated));
|
||||
start = ch;
|
||||
negated = false;
|
||||
}
|
||||
}
|
||||
int pos = 0;
|
||||
while (pos < len) {
|
||||
int remainder = (len - pos);
|
||||
if (remainder >= 4 && str.charAt(pos) == '^' && str.charAt(pos + 2) == '-') {
|
||||
// negated range
|
||||
set.add(new CharRange(str.charAt(pos + 1), str.charAt(pos + 3), true));
|
||||
pos += 4;
|
||||
} else if (remainder >= 3 && str.charAt(pos + 1) == '-') {
|
||||
// range
|
||||
set.add(new CharRange(str.charAt(pos), str.charAt(pos + 2)));
|
||||
pos += 3;
|
||||
} else if (remainder >= 2 && str.charAt(pos) == '^') {
|
||||
// negated char
|
||||
set.add(new CharRange(str.charAt(pos + 1), true));
|
||||
pos += 2;
|
||||
} else {
|
||||
// char
|
||||
set.add(new CharRange(str.charAt(pos)));
|
||||
pos += 1;
|
||||
}
|
||||
// handle leftovers
|
||||
if (start != -1) {
|
||||
set.add(new CharRange((char) start, (char) start, negated));
|
||||
} else if (negated) {
|
||||
set.add(NEGATE);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -62,8 +62,9 @@ package org.apache.commons.lang;
|
|||
*
|
||||
* @author <a href="mailto:bayard@generationjava.com">Henri Yandell</a>
|
||||
* @author Stephen Colebourne
|
||||
* @author Phil Steitz
|
||||
* @since 1.0
|
||||
* @version $Id: CharSetUtils.java,v 1.20 2003/08/02 18:18:33 scolebourne Exp $
|
||||
* @version $Id: CharSetUtils.java,v 1.21 2003/08/04 00:50:14 scolebourne Exp $
|
||||
*/
|
||||
public class CharSetUtils {
|
||||
|
||||
|
@ -80,13 +81,12 @@ public class CharSetUtils {
|
|||
// Factory
|
||||
//-----------------------------------------------------------------------
|
||||
/**
|
||||
* <p>Creates a <code>CharSetUtils</code> object which allows a certain amount of
|
||||
* <p>Creates a <code>CharSet</code> instance which allows a certain amount of
|
||||
* set logic to be performed.</p>
|
||||
* <p>The syntax is:</p>
|
||||
* <ul>
|
||||
* <li>"aeio" which implies 'a','e',..</li>
|
||||
* <li>"^e" implies not e. However it only negates, it's not
|
||||
* a set in itself due to the size of that set in unicode.</li>
|
||||
* <li>"^e" implies not e.</li>
|
||||
* <li>"ej-m" implies e,j->m. e,j,k,l,m.</li>
|
||||
* </ul>
|
||||
*
|
||||
|
@ -94,6 +94,7 @@ public class CharSetUtils {
|
|||
* CharSetUtils.evaluateSet(null) = null
|
||||
* CharSetUtils.evaluateSet("") = CharSet matching nothing
|
||||
* CharSetUtils.evaluateSet("a-e") = CharSet matching a,b,c,d,e
|
||||
* CharSetUtils.evaluateSet("abe-g") = CharSet matching a,b,e,f,g
|
||||
* </pre>
|
||||
*
|
||||
* @param set the set, may be null
|
||||
|
@ -109,13 +110,12 @@ public class CharSetUtils {
|
|||
}
|
||||
|
||||
/**
|
||||
* <p>Creates a <code>CharSetUtils</code> object which allows a certain amount of
|
||||
* <p>Creates a <code>CharSet</code> instance which allows a certain amount of
|
||||
* set logic to be performed.</p>
|
||||
* <p>The syntax is:</p>
|
||||
* <ul>
|
||||
* <li>"aeio" which implies 'a','e',..</li>
|
||||
* <li>"^e" implies not e. However it only negates, it's not
|
||||
* a set in itself due to the size of that set in unicode.</li>
|
||||
* <li>"^e" implies not e.</li>
|
||||
* <li>"ej-m" implies e,j->m. e,j,k,l,m.</li>
|
||||
* </ul>
|
||||
*
|
||||
|
|
|
@ -64,7 +64,8 @@ import junit.textui.TestRunner;
|
|||
* Unit tests {@link org.apache.commons.lang.CharSet}.
|
||||
*
|
||||
* @author Stephen Colebourne
|
||||
* @version $Id: CharSetTest.java,v 1.1 2003/08/02 18:18:33 scolebourne Exp $
|
||||
* @author Phil Steitz
|
||||
* @version $Id: CharSetTest.java,v 1.2 2003/08/04 00:50:14 scolebourne Exp $
|
||||
*/
|
||||
public class CharSetTest extends TestCase {
|
||||
|
||||
|
@ -278,57 +279,105 @@ public class CharSetTest extends TestCase {
|
|||
set = CharSet.getInstance("^");
|
||||
array = set.getCharRanges();
|
||||
assertEquals(1, array.length);
|
||||
assertEquals(true, ArrayUtils.contains(array, new CharRange('^')));
|
||||
assertEquals(true, ArrayUtils.contains(array, new CharRange('^'))); // "^"
|
||||
|
||||
set = CharSet.getInstance("^^");
|
||||
array = set.getCharRanges();
|
||||
assertEquals(1, array.length);
|
||||
assertEquals(true, ArrayUtils.contains(array, new CharRange('^', '^', true)));
|
||||
assertEquals(true, ArrayUtils.contains(array, new CharRange('^', '^', true))); // "^^"
|
||||
|
||||
set = CharSet.getInstance("^^^");
|
||||
array = set.getCharRanges();
|
||||
assertEquals(2, array.length);
|
||||
assertEquals(true, ArrayUtils.contains(array, new CharRange('^', '^', true)));
|
||||
assertEquals(true, ArrayUtils.contains(array, new CharRange('^', '^')));
|
||||
assertEquals(true, ArrayUtils.contains(array, new CharRange('^', '^', true))); // "^^"
|
||||
assertEquals(true, ArrayUtils.contains(array, new CharRange('^', '^'))); // "^"
|
||||
|
||||
set = CharSet.getInstance("^^^^");
|
||||
array = set.getCharRanges();
|
||||
assertEquals(1, array.length);
|
||||
assertEquals(true, ArrayUtils.contains(array, new CharRange('^', '^', true)));
|
||||
assertEquals(true, ArrayUtils.contains(array, new CharRange('^', '^', true))); // "^^" x2
|
||||
|
||||
set = CharSet.getInstance("a^");
|
||||
array = set.getCharRanges();
|
||||
assertEquals(2, array.length);
|
||||
assertEquals(true, ArrayUtils.contains(array, new CharRange('a')));
|
||||
assertEquals(true, ArrayUtils.contains(array, new CharRange('^')));
|
||||
assertEquals(true, ArrayUtils.contains(array, new CharRange('a'))); // "a"
|
||||
assertEquals(true, ArrayUtils.contains(array, new CharRange('^'))); // "^"
|
||||
|
||||
set = CharSet.getInstance("^a-");
|
||||
array = set.getCharRanges();
|
||||
assertEquals(2, array.length);
|
||||
assertEquals(true, ArrayUtils.contains(array, new CharRange('a', 'a', true)));
|
||||
assertEquals(true, ArrayUtils.contains(array, new CharRange('-')));
|
||||
assertEquals(true, ArrayUtils.contains(array, new CharRange('a', 'a', true))); // "^a"
|
||||
assertEquals(true, ArrayUtils.contains(array, new CharRange('-'))); // "-"
|
||||
|
||||
set = CharSet.getInstance("^^-c");
|
||||
array = set.getCharRanges();
|
||||
assertEquals(1, array.length);
|
||||
assertEquals(true, ArrayUtils.contains(array, new CharRange('^', 'c', true)));
|
||||
assertEquals(true, ArrayUtils.contains(array, new CharRange('^', 'c', true))); // "^^-c"
|
||||
|
||||
set = CharSet.getInstance("^c-^");
|
||||
array = set.getCharRanges();
|
||||
assertEquals(1, array.length);
|
||||
assertEquals(true, ArrayUtils.contains(array, new CharRange('c', '^', true)));
|
||||
assertEquals(true, ArrayUtils.contains(array, new CharRange('c', '^', true))); // "^c-^"
|
||||
|
||||
set = CharSet.getInstance("^c-^d");
|
||||
array = set.getCharRanges();
|
||||
assertEquals(2, array.length);
|
||||
assertEquals(true, ArrayUtils.contains(array, new CharRange('c', '^', true)));
|
||||
assertEquals(true, ArrayUtils.contains(array, new CharRange('d')));
|
||||
assertEquals(true, ArrayUtils.contains(array, new CharRange('c', '^', true))); // "^c-^"
|
||||
assertEquals(true, ArrayUtils.contains(array, new CharRange('d'))); // "d"
|
||||
|
||||
set = CharSet.getInstance("^^-");
|
||||
array = set.getCharRanges();
|
||||
assertEquals(2, array.length);
|
||||
assertEquals(true, ArrayUtils.contains(array, new CharRange('^', '^', true)));
|
||||
assertEquals(true, ArrayUtils.contains(array, new CharRange('-')));
|
||||
assertEquals(true, ArrayUtils.contains(array, new CharRange('^', '^', true))); // "^^"
|
||||
assertEquals(true, ArrayUtils.contains(array, new CharRange('-'))); // "-"
|
||||
}
|
||||
|
||||
public void testConstructor_String_oddCombinations() {
|
||||
CharSet set;
|
||||
CharRange[] array = null;
|
||||
|
||||
set = CharSet.getInstance("a-^c");
|
||||
array = set.getCharRanges();
|
||||
assertEquals(true, ArrayUtils.contains(array, new CharRange('a', '^'))); // "a-^"
|
||||
assertEquals(true, ArrayUtils.contains(array, new CharRange('c'))); // "c"
|
||||
assertEquals(false, set.contains('b'));
|
||||
assertEquals(true, set.contains('^'));
|
||||
assertEquals(true, set.contains('_')); // between ^ and a
|
||||
assertEquals(true, set.contains('c'));
|
||||
|
||||
set = CharSet.getInstance("^a-^c");
|
||||
array = set.getCharRanges();
|
||||
assertEquals(true, ArrayUtils.contains(array, new CharRange('a', '^', true))); // "^a-^"
|
||||
assertEquals(true, ArrayUtils.contains(array, new CharRange('c'))); // "c"
|
||||
assertEquals(true, set.contains('b'));
|
||||
assertEquals(false, set.contains('^'));
|
||||
assertEquals(false, set.contains('_')); // between ^ and a
|
||||
|
||||
set = CharSet.getInstance("a- ^-- "); //contains everything
|
||||
array = set.getCharRanges();
|
||||
assertEquals(true, ArrayUtils.contains(array, new CharRange('a', ' '))); // "a- "
|
||||
assertEquals(true, ArrayUtils.contains(array, new CharRange('-', ' ', true))); // "^-- "
|
||||
assertEquals(true, set.contains('#'));
|
||||
assertEquals(true, set.contains('^'));
|
||||
assertEquals(true, set.contains('a'));
|
||||
assertEquals(true, set.contains('*'));
|
||||
assertEquals(true, set.contains('A'));
|
||||
|
||||
set = CharSet.getInstance("^-b");
|
||||
array = set.getCharRanges();
|
||||
assertEquals(true, ArrayUtils.contains(array, new CharRange('^','b'))); // "^-b"
|
||||
assertEquals(true, set.contains('b'));
|
||||
assertEquals(true, set.contains('_')); // between ^ and a
|
||||
assertEquals(false, set.contains('A'));
|
||||
assertEquals(true, set.contains('^'));
|
||||
|
||||
set = CharSet.getInstance("b-^");
|
||||
array = set.getCharRanges();
|
||||
assertEquals(true, ArrayUtils.contains(array, new CharRange('^','b'))); // "b-^"
|
||||
assertEquals(true, set.contains('b'));
|
||||
assertEquals(true, set.contains('^'));
|
||||
assertEquals(true, set.contains('a')); // between ^ and b
|
||||
assertEquals(false, set.contains('c'));
|
||||
}
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
|
@ -377,6 +426,7 @@ public class CharSetTest extends TestCase {
|
|||
//-----------------------------------------------------------------------
|
||||
public void testContains_Char() {
|
||||
CharSet btod = CharSet.getInstance("b-d");
|
||||
CharSet dtob = CharSet.getInstance("d-b");
|
||||
CharSet bcd = CharSet.getInstance("bcd");
|
||||
CharSet bd = CharSet.getInstance("bd");
|
||||
CharSet notbtod = CharSet.getInstance("^b-d");
|
||||
|
@ -404,6 +454,16 @@ public class CharSetTest extends TestCase {
|
|||
assertEquals(false, notbtod.contains('c'));
|
||||
assertEquals(false, notbtod.contains('d'));
|
||||
assertEquals(true, notbtod.contains('e'));
|
||||
|
||||
assertEquals(false, dtob.contains('a'));
|
||||
assertEquals(true, dtob.contains('b'));
|
||||
assertEquals(true, dtob.contains('c'));
|
||||
assertEquals(true, dtob.contains('d'));
|
||||
assertEquals(false, dtob.contains('e'));
|
||||
|
||||
CharRange[] array = dtob.getCharRanges();
|
||||
assertEquals("[b-d]", dtob.toString());
|
||||
assertEquals(1, array.length);
|
||||
}
|
||||
|
||||
//-----------------------------------------------------------------------
|
||||
|
|
Loading…
Reference in New Issue