Improve CharSet testing

bug 22095, from Phil Steitz
Rewrite CharSet parsing, much neater and simpler now


git-svn-id: https://svn.apache.org/repos/asf/jakarta/commons/proper/lang/trunk@137565 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Stephen Colebourne 2003-08-04 00:50:14 +00:00
parent 34f6fa8a78
commit d43b319902
3 changed files with 122 additions and 90 deletions

View File

@ -67,8 +67,9 @@ import java.util.Set;
*
* @author Henri Yandell
* @author Stephen Colebourne
* @author Phil Steitz
* @since 1.0
* @version $Id: CharSet.java,v 1.10 2003/08/02 18:18:33 scolebourne Exp $
* @version $Id: CharSet.java,v 1.11 2003/08/04 00:50:14 scolebourne Exp $
*/
public class CharSet implements Serializable {
@ -126,10 +127,26 @@ public class CharSet implements Serializable {
* - set containing all the characters from the individual sets</li>
* </ul>
*
* <p>The matching order is:</p>
* <ol>
* <li>Negated multi character range, such as "^a-e"</li>
* <li>Ordinary multi character range, such as "a-e"</li>
* <li>Negated single character, such as "^a"</li>
* <li>Ordinary single character, such as "a"</li>
* </ol>
* <p>Matching works left to right. Once a match is found the
* search starts again from the next character.</p>
*
* <p>If the same range is defined twice using the same syntax, only
* one range will be kept.
* Thus, "a-ca-c" creates only one range of "a-c".
* However, "a-cabc" creates two ranges as they are defined differently.</p>
* Thus, "a-ca-c" creates only one range of "a-c".</p>
*
* <p>If the start and end of a range are in the wrong order,
* they are reversed. Thus "a-e" is the same as "e-a".
* As a result, "a-ee-a" would create only one range,
* as the "a-e" and "e-a" are the same.</p>
*
* <p>The set of characters represented is the union of the specified ranges.</p>
*
* <p>All CharSet objects returned by this method will be immutable.</p>
*
@ -180,71 +197,26 @@ public class CharSet implements Serializable {
}
int len = str.length();
switch (len) {
case 0:
// do nothing
break;
case 1:
set.add(new CharRange(str.charAt(0)));
break;
default:
int start = -1;
boolean negated = false;
for (int i = 0; i < len; i++) {
char ch = str.charAt(i);
if (ch == '-') {
if (start == -1) {
// dash found not as range separator
// treat as ordinary start block char
start = ch;
} else if (i == len - 1) {
// dash is last character, store two single characters
set.add(new CharRange((char) start, (char) start, negated));
set.add(DASH);
start = -1;
negated = false;
int pos = 0;
while (pos < len) {
int remainder = (len - pos);
if (remainder >= 4 && str.charAt(pos) == '^' && str.charAt(pos + 2) == '-') {
// negated range
set.add(new CharRange(str.charAt(pos + 1), str.charAt(pos + 3), true));
pos += 4;
} else if (remainder >= 3 && str.charAt(pos + 1) == '-') {
// range
set.add(new CharRange(str.charAt(pos), str.charAt(pos + 2)));
pos += 3;
} else if (remainder >= 2 && str.charAt(pos) == '^') {
// negated char
set.add(new CharRange(str.charAt(pos + 1), true));
pos += 2;
} else {
// range block found, store it
set.add(new CharRange((char) start, str.charAt(++i), negated));
start = -1;
negated = false;
// char
set.add(new CharRange(str.charAt(pos)));
pos += 1;
}
} else if (ch == '^') {
if (start == -1) {
if (negated) {
// double negate, treat second as ordinary start block char
start = ch;
} else {
// negate next block
negated = true;
}
} else {
// previous block has ended, store it
set.add(new CharRange((char) start, (char) start, negated));
start = -1;
negated = true;
}
} else {
if (start == -1) {
// start of block
start = ch;
} else {
// previous block has ended, store it, and start next block
set.add(new CharRange((char) start, (char) start, negated));
start = ch;
negated = false;
}
}
}
// handle leftovers
if (start != -1) {
set.add(new CharRange((char) start, (char) start, negated));
} else if (negated) {
set.add(NEGATE);
}
break;
}
}

View File

@ -62,8 +62,9 @@ package org.apache.commons.lang;
*
* @author <a href="mailto:bayard@generationjava.com">Henri Yandell</a>
* @author Stephen Colebourne
* @author Phil Steitz
* @since 1.0
* @version $Id: CharSetUtils.java,v 1.20 2003/08/02 18:18:33 scolebourne Exp $
* @version $Id: CharSetUtils.java,v 1.21 2003/08/04 00:50:14 scolebourne Exp $
*/
public class CharSetUtils {
@ -80,13 +81,12 @@ public class CharSetUtils {
// Factory
//-----------------------------------------------------------------------
/**
* <p>Creates a <code>CharSetUtils</code> object which allows a certain amount of
* <p>Creates a <code>CharSet</code> instance which allows a certain amount of
* set logic to be performed.</p>
* <p>The syntax is:</p>
* <ul>
* <li>&quot;aeio&quot; which implies 'a','e',..</li>
* <li>&quot;^e&quot; implies not e. However it only negates, it's not
* a set in itself due to the size of that set in unicode.</li>
* <li>&quot;^e&quot; implies not e.</li>
* <li>&quot;ej-m&quot; implies e,j->m. e,j,k,l,m.</li>
* </ul>
*
@ -94,6 +94,7 @@ public class CharSetUtils {
* CharSetUtils.evaluateSet(null) = null
* CharSetUtils.evaluateSet("") = CharSet matching nothing
* CharSetUtils.evaluateSet("a-e") = CharSet matching a,b,c,d,e
* CharSetUtils.evaluateSet("abe-g") = CharSet matching a,b,e,f,g
* </pre>
*
* @param set the set, may be null
@ -109,13 +110,12 @@ public class CharSetUtils {
}
/**
* <p>Creates a <code>CharSetUtils</code> object which allows a certain amount of
* <p>Creates a <code>CharSet</code> instance which allows a certain amount of
* set logic to be performed.</p>
* <p>The syntax is:</p>
* <ul>
* <li>&quot;aeio&quot; which implies 'a','e',..</li>
* <li>&quot;^e&quot; implies not e. However it only negates, it's not
* a set in itself due to the size of that set in unicode.</li>
* <li>&quot;^e&quot; implies not e.</li>
* <li>&quot;ej-m&quot; implies e,j->m. e,j,k,l,m.</li>
* </ul>
*

View File

@ -64,7 +64,8 @@ import junit.textui.TestRunner;
* Unit tests {@link org.apache.commons.lang.CharSet}.
*
* @author Stephen Colebourne
* @version $Id: CharSetTest.java,v 1.1 2003/08/02 18:18:33 scolebourne Exp $
* @author Phil Steitz
* @version $Id: CharSetTest.java,v 1.2 2003/08/04 00:50:14 scolebourne Exp $
*/
public class CharSetTest extends TestCase {
@ -278,57 +279,105 @@ public class CharSetTest extends TestCase {
set = CharSet.getInstance("^");
array = set.getCharRanges();
assertEquals(1, array.length);
assertEquals(true, ArrayUtils.contains(array, new CharRange('^')));
assertEquals(true, ArrayUtils.contains(array, new CharRange('^'))); // "^"
set = CharSet.getInstance("^^");
array = set.getCharRanges();
assertEquals(1, array.length);
assertEquals(true, ArrayUtils.contains(array, new CharRange('^', '^', true)));
assertEquals(true, ArrayUtils.contains(array, new CharRange('^', '^', true))); // "^^"
set = CharSet.getInstance("^^^");
array = set.getCharRanges();
assertEquals(2, array.length);
assertEquals(true, ArrayUtils.contains(array, new CharRange('^', '^', true)));
assertEquals(true, ArrayUtils.contains(array, new CharRange('^', '^')));
assertEquals(true, ArrayUtils.contains(array, new CharRange('^', '^', true))); // "^^"
assertEquals(true, ArrayUtils.contains(array, new CharRange('^', '^'))); // "^"
set = CharSet.getInstance("^^^^");
array = set.getCharRanges();
assertEquals(1, array.length);
assertEquals(true, ArrayUtils.contains(array, new CharRange('^', '^', true)));
assertEquals(true, ArrayUtils.contains(array, new CharRange('^', '^', true))); // "^^" x2
set = CharSet.getInstance("a^");
array = set.getCharRanges();
assertEquals(2, array.length);
assertEquals(true, ArrayUtils.contains(array, new CharRange('a')));
assertEquals(true, ArrayUtils.contains(array, new CharRange('^')));
assertEquals(true, ArrayUtils.contains(array, new CharRange('a'))); // "a"
assertEquals(true, ArrayUtils.contains(array, new CharRange('^'))); // "^"
set = CharSet.getInstance("^a-");
array = set.getCharRanges();
assertEquals(2, array.length);
assertEquals(true, ArrayUtils.contains(array, new CharRange('a', 'a', true)));
assertEquals(true, ArrayUtils.contains(array, new CharRange('-')));
assertEquals(true, ArrayUtils.contains(array, new CharRange('a', 'a', true))); // "^a"
assertEquals(true, ArrayUtils.contains(array, new CharRange('-'))); // "-"
set = CharSet.getInstance("^^-c");
array = set.getCharRanges();
assertEquals(1, array.length);
assertEquals(true, ArrayUtils.contains(array, new CharRange('^', 'c', true)));
assertEquals(true, ArrayUtils.contains(array, new CharRange('^', 'c', true))); // "^^-c"
set = CharSet.getInstance("^c-^");
array = set.getCharRanges();
assertEquals(1, array.length);
assertEquals(true, ArrayUtils.contains(array, new CharRange('c', '^', true)));
assertEquals(true, ArrayUtils.contains(array, new CharRange('c', '^', true))); // "^c-^"
set = CharSet.getInstance("^c-^d");
array = set.getCharRanges();
assertEquals(2, array.length);
assertEquals(true, ArrayUtils.contains(array, new CharRange('c', '^', true)));
assertEquals(true, ArrayUtils.contains(array, new CharRange('d')));
assertEquals(true, ArrayUtils.contains(array, new CharRange('c', '^', true))); // "^c-^"
assertEquals(true, ArrayUtils.contains(array, new CharRange('d'))); // "d"
set = CharSet.getInstance("^^-");
array = set.getCharRanges();
assertEquals(2, array.length);
assertEquals(true, ArrayUtils.contains(array, new CharRange('^', '^', true)));
assertEquals(true, ArrayUtils.contains(array, new CharRange('-')));
assertEquals(true, ArrayUtils.contains(array, new CharRange('^', '^', true))); // "^^"
assertEquals(true, ArrayUtils.contains(array, new CharRange('-'))); // "-"
}
public void testConstructor_String_oddCombinations() {
    // Exercises inputs where '^' and '-' appear in unusual positions, checking
    // both the parsed ranges and the membership of the resulting set.

    // "a-^c": the range "a-^" followed by the single character "c"
    CharSet chars = CharSet.getInstance("a-^c");
    CharRange[] ranges = chars.getCharRanges();
    assertEquals(true, ArrayUtils.contains(ranges, new CharRange('a', '^')));
    assertEquals(true, ArrayUtils.contains(ranges, new CharRange('c')));
    assertEquals(false, chars.contains('b'));
    assertEquals(true, chars.contains('^'));
    assertEquals(true, chars.contains('_')); // '_' sits between '^' and 'a'
    assertEquals(true, chars.contains('c'));

    // "^a-^c": the negated range "^a-^" followed by the single character "c"
    chars = CharSet.getInstance("^a-^c");
    ranges = chars.getCharRanges();
    assertEquals(true, ArrayUtils.contains(ranges, new CharRange('a', '^', true)));
    assertEquals(true, ArrayUtils.contains(ranges, new CharRange('c')));
    assertEquals(true, chars.contains('b'));
    assertEquals(false, chars.contains('^'));
    assertEquals(false, chars.contains('_')); // '_' sits between '^' and 'a'

    // "a- ^-- ": the range "a- " plus the negated range "^-- ", which together match everything
    chars = CharSet.getInstance("a- ^-- ");
    ranges = chars.getCharRanges();
    assertEquals(true, ArrayUtils.contains(ranges, new CharRange('a', ' ')));
    assertEquals(true, ArrayUtils.contains(ranges, new CharRange('-', ' ', true)));
    assertEquals(true, chars.contains('#'));
    assertEquals(true, chars.contains('^'));
    assertEquals(true, chars.contains('a'));
    assertEquals(true, chars.contains('*'));
    assertEquals(true, chars.contains('A'));

    // "^-b": an ordinary range that starts at '^', not a negation
    chars = CharSet.getInstance("^-b");
    ranges = chars.getCharRanges();
    assertEquals(true, ArrayUtils.contains(ranges, new CharRange('^', 'b')));
    assertEquals(true, chars.contains('b'));
    assertEquals(true, chars.contains('_')); // '_' sits between '^' and 'b'
    assertEquals(false, chars.contains('A'));
    assertEquals(true, chars.contains('^'));

    // "b-^": the same range written with its end points in reverse order
    chars = CharSet.getInstance("b-^");
    ranges = chars.getCharRanges();
    assertEquals(true, ArrayUtils.contains(ranges, new CharRange('^', 'b')));
    assertEquals(true, chars.contains('b'));
    assertEquals(true, chars.contains('^'));
    assertEquals(true, chars.contains('a')); // 'a' sits between '^' and 'b'
    assertEquals(false, chars.contains('c'));
}
//-----------------------------------------------------------------------
@ -377,6 +426,7 @@ public class CharSetTest extends TestCase {
//-----------------------------------------------------------------------
public void testContains_Char() {
CharSet btod = CharSet.getInstance("b-d");
CharSet dtob = CharSet.getInstance("d-b");
CharSet bcd = CharSet.getInstance("bcd");
CharSet bd = CharSet.getInstance("bd");
CharSet notbtod = CharSet.getInstance("^b-d");
@ -404,6 +454,16 @@ public class CharSetTest extends TestCase {
assertEquals(false, notbtod.contains('c'));
assertEquals(false, notbtod.contains('d'));
assertEquals(true, notbtod.contains('e'));
assertEquals(false, dtob.contains('a'));
assertEquals(true, dtob.contains('b'));
assertEquals(true, dtob.contains('c'));
assertEquals(true, dtob.contains('d'));
assertEquals(false, dtob.contains('e'));
CharRange[] array = dtob.getCharRanges();
assertEquals("[b-d]", dtob.toString());
assertEquals(1, array.length);
}
//-----------------------------------------------------------------------