Improve CharSet testing

bug 22095, from Phil Steitz
Rewrite CharSet parsing, much neater and simpler now


git-svn-id: https://svn.apache.org/repos/asf/jakarta/commons/proper/lang/trunk@137565 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Stephen Colebourne 2003-08-04 00:50:14 +00:00
parent 34f6fa8a78
commit d43b319902
3 changed files with 122 additions and 90 deletions

View File

@ -67,8 +67,9 @@ import java.util.Set;
* *
* @author Henri Yandell * @author Henri Yandell
* @author Stephen Colebourne * @author Stephen Colebourne
* @author Phil Steitz
* @since 1.0 * @since 1.0
* @version $Id: CharSet.java,v 1.10 2003/08/02 18:18:33 scolebourne Exp $ * @version $Id: CharSet.java,v 1.11 2003/08/04 00:50:14 scolebourne Exp $
*/ */
public class CharSet implements Serializable { public class CharSet implements Serializable {
@ -126,10 +127,26 @@ public class CharSet implements Serializable {
* - set containing all the characters from the individual sets</li> * - set containing all the characters from the individual sets</li>
* </ul> * </ul>
* *
* <p>The matching order is:</p>
* <ol>
* <li>Negated multi character range, such as "^a-e"</li>
* <li>Ordinary multi character range, such as "a-e"</li>
* <li>Negated single character, such as "^a"</li>
* <li>Ordinary single character, such as "a"</li>
* </ol>
* <p>Matching works left to right. Once a match is found the
* search starts again from the next character.</p>
*
* <p>If the same range is defined twice using the same syntax, only * <p>If the same range is defined twice using the same syntax, only
* one range will be kept. * one range will be kept.
* Thus, "a-ca-c" creates only one range of "a-c". * Thus, "a-ca-c" creates only one range of "a-c".</p>
* However, "a-cabc" creates two ranges as they are defined differently.</p> *
* <p>If the start and end of a range are in the wrong order,
* they are reversed. Thus "a-e" is the same as "e-a".
* As a result, "a-ee-a" would create only one range,
* as the "a-e" and "e-a" are the same.</p>
*
* <p>The set of characters represented is the union of the specified ranges.</p>
* *
* <p>All CharSet objects returned by this method will be immutable.</p> * <p>All CharSet objects returned by this method will be immutable.</p>
* *
@ -180,71 +197,26 @@ public class CharSet implements Serializable {
} }
int len = str.length(); int len = str.length();
switch (len) { int pos = 0;
case 0: while (pos < len) {
// do nothing int remainder = (len - pos);
break; if (remainder >= 4 && str.charAt(pos) == '^' && str.charAt(pos + 2) == '-') {
// negated range
case 1: set.add(new CharRange(str.charAt(pos + 1), str.charAt(pos + 3), true));
set.add(new CharRange(str.charAt(0))); pos += 4;
break; } else if (remainder >= 3 && str.charAt(pos + 1) == '-') {
// range
default: set.add(new CharRange(str.charAt(pos), str.charAt(pos + 2)));
int start = -1; pos += 3;
boolean negated = false; } else if (remainder >= 2 && str.charAt(pos) == '^') {
for (int i = 0; i < len; i++) { // negated char
char ch = str.charAt(i); set.add(new CharRange(str.charAt(pos + 1), true));
if (ch == '-') { pos += 2;
if (start == -1) { } else {
// dash found not as range separator // char
// treat as ordinary start block char set.add(new CharRange(str.charAt(pos)));
start = ch; pos += 1;
} else if (i == len - 1) {
// dash is last character, store two single characters
set.add(new CharRange((char) start, (char) start, negated));
set.add(DASH);
start = -1;
negated = false;
} else {
// range block found, store it
set.add(new CharRange((char) start, str.charAt(++i), negated));
start = -1;
negated = false;
}
} else if (ch == '^') {
if (start == -1) {
if (negated) {
// double negate, treat second as ordinary start block char
start = ch;
} else {
// negate next block
negated = true;
}
} else {
// previous block has ended, store it
set.add(new CharRange((char) start, (char) start, negated));
start = -1;
negated = true;
}
} else {
if (start == -1) {
// start of block
start = ch;
} else {
// previous block has ended, store it, and start next block
set.add(new CharRange((char) start, (char) start, negated));
start = ch;
negated = false;
}
}
} }
// handle leftovers
if (start != -1) {
set.add(new CharRange((char) start, (char) start, negated));
} else if (negated) {
set.add(NEGATE);
}
break;
} }
} }

View File

@ -62,8 +62,9 @@ package org.apache.commons.lang;
* *
* @author <a href="bayard@generationjava.com">Henri Yandell</a> * @author <a href="bayard@generationjava.com">Henri Yandell</a>
* @author Stephen Colebourne * @author Stephen Colebourne
* @author Phil Steitz
* @since 1.0 * @since 1.0
* @version $Id: CharSetUtils.java,v 1.20 2003/08/02 18:18:33 scolebourne Exp $ * @version $Id: CharSetUtils.java,v 1.21 2003/08/04 00:50:14 scolebourne Exp $
*/ */
public class CharSetUtils { public class CharSetUtils {
@ -80,13 +81,12 @@ public class CharSetUtils {
// Factory // Factory
//----------------------------------------------------------------------- //-----------------------------------------------------------------------
/** /**
* <p>Creates a <code>CharSetUtils</code> object which allows a certain amount of * <p>Creates a <code>CharSet</code> instance which allows a certain amount of
* set logic to be performed.</p> * set logic to be performed.</p>
* <p>The syntax is:</p> * <p>The syntax is:</p>
* <ul> * <ul>
* <li>&quot;aeio&quot; which implies 'a','e',..</li> * <li>&quot;aeio&quot; which implies 'a','e',..</li>
* <li>&quot;^e&quot; implies not e. However it only negates, it's not * <li>&quot;^e&quot; implies not e.</li>
* a set in itself due to the size of that set in unicode.</li>
* <li>&quot;ej-m&quot; implies e,j->m. e,j,k,l,m.</li> * <li>&quot;ej-m&quot; implies e,j->m. e,j,k,l,m.</li>
* </ul> * </ul>
* *
@ -94,6 +94,7 @@ public class CharSetUtils {
* CharSetUtils.evaluateSet(null) = null * CharSetUtils.evaluateSet(null) = null
* CharSetUtils.evaluateSet("") = CharSet matching nothing * CharSetUtils.evaluateSet("") = CharSet matching nothing
* CharSetUtils.evaluateSet("a-e") = CharSet matching a,b,c,d,e * CharSetUtils.evaluateSet("a-e") = CharSet matching a,b,c,d,e
* CharSetUtils.evaluateSet("abe-g") = CharSet matching a,b,e,f,g
* </pre> * </pre>
* *
* @param set the set, may be null * @param set the set, may be null
@ -109,13 +110,12 @@ public class CharSetUtils {
} }
/** /**
* <p>Creates a <code>CharSetUtils</code> object which allows a certain amount of * <p>Creates a <code>CharSet</code> instance which allows a certain amount of
* set logic to be performed.</p> * set logic to be performed.</p>
* <p>The syntax is:</p> * <p>The syntax is:</p>
* <ul> * <ul>
* <li>&quot;aeio&quot; which implies 'a','e',..</li> * <li>&quot;aeio&quot; which implies 'a','e',..</li>
* <li>&quot;^e&quot; implies not e. However it only negates, it's not * <li>&quot;^e&quot; implies not e.</li>
* a set in itself due to the size of that set in unicode.</li>
* <li>&quot;ej-m&quot; implies e,j->m. e,j,k,l,m.</li> * <li>&quot;ej-m&quot; implies e,j->m. e,j,k,l,m.</li>
* </ul> * </ul>
* *

View File

@ -64,7 +64,8 @@ import junit.textui.TestRunner;
* Unit tests {@link org.apache.commons.lang.CharSet}. * Unit tests {@link org.apache.commons.lang.CharSet}.
* *
* @author Stephen Colebourne * @author Stephen Colebourne
* @version $Id: CharSetTest.java,v 1.1 2003/08/02 18:18:33 scolebourne Exp $ * @author Phil Steitz
* @version $Id: CharSetTest.java,v 1.2 2003/08/04 00:50:14 scolebourne Exp $
*/ */
public class CharSetTest extends TestCase { public class CharSetTest extends TestCase {
@ -278,59 +279,107 @@ public class CharSetTest extends TestCase {
set = CharSet.getInstance("^"); set = CharSet.getInstance("^");
array = set.getCharRanges(); array = set.getCharRanges();
assertEquals(1, array.length); assertEquals(1, array.length);
assertEquals(true, ArrayUtils.contains(array, new CharRange('^'))); assertEquals(true, ArrayUtils.contains(array, new CharRange('^'))); // "^"
set = CharSet.getInstance("^^"); set = CharSet.getInstance("^^");
array = set.getCharRanges(); array = set.getCharRanges();
assertEquals(1, array.length); assertEquals(1, array.length);
assertEquals(true, ArrayUtils.contains(array, new CharRange('^', '^', true))); assertEquals(true, ArrayUtils.contains(array, new CharRange('^', '^', true))); // "^^"
set = CharSet.getInstance("^^^"); set = CharSet.getInstance("^^^");
array = set.getCharRanges(); array = set.getCharRanges();
assertEquals(2, array.length); assertEquals(2, array.length);
assertEquals(true, ArrayUtils.contains(array, new CharRange('^', '^', true))); assertEquals(true, ArrayUtils.contains(array, new CharRange('^', '^', true))); // "^^"
assertEquals(true, ArrayUtils.contains(array, new CharRange('^', '^'))); assertEquals(true, ArrayUtils.contains(array, new CharRange('^', '^'))); // "^"
set = CharSet.getInstance("^^^^"); set = CharSet.getInstance("^^^^");
array = set.getCharRanges(); array = set.getCharRanges();
assertEquals(1, array.length); assertEquals(1, array.length);
assertEquals(true, ArrayUtils.contains(array, new CharRange('^', '^', true))); assertEquals(true, ArrayUtils.contains(array, new CharRange('^', '^', true))); // "^^" x2
set = CharSet.getInstance("a^"); set = CharSet.getInstance("a^");
array = set.getCharRanges(); array = set.getCharRanges();
assertEquals(2, array.length); assertEquals(2, array.length);
assertEquals(true, ArrayUtils.contains(array, new CharRange('a'))); assertEquals(true, ArrayUtils.contains(array, new CharRange('a'))); // "a"
assertEquals(true, ArrayUtils.contains(array, new CharRange('^'))); assertEquals(true, ArrayUtils.contains(array, new CharRange('^'))); // "^"
set = CharSet.getInstance("^a-"); set = CharSet.getInstance("^a-");
array = set.getCharRanges(); array = set.getCharRanges();
assertEquals(2, array.length); assertEquals(2, array.length);
assertEquals(true, ArrayUtils.contains(array, new CharRange('a', 'a', true))); assertEquals(true, ArrayUtils.contains(array, new CharRange('a', 'a', true))); // "^a"
assertEquals(true, ArrayUtils.contains(array, new CharRange('-'))); assertEquals(true, ArrayUtils.contains(array, new CharRange('-'))); // "-"
set = CharSet.getInstance("^^-c"); set = CharSet.getInstance("^^-c");
array = set.getCharRanges(); array = set.getCharRanges();
assertEquals(1, array.length); assertEquals(1, array.length);
assertEquals(true, ArrayUtils.contains(array, new CharRange('^', 'c', true))); assertEquals(true, ArrayUtils.contains(array, new CharRange('^', 'c', true))); // "^^-c"
set = CharSet.getInstance("^c-^"); set = CharSet.getInstance("^c-^");
array = set.getCharRanges(); array = set.getCharRanges();
assertEquals(1, array.length); assertEquals(1, array.length);
assertEquals(true, ArrayUtils.contains(array, new CharRange('c', '^', true))); assertEquals(true, ArrayUtils.contains(array, new CharRange('c', '^', true))); // "^c-^"
set = CharSet.getInstance("^c-^d"); set = CharSet.getInstance("^c-^d");
array = set.getCharRanges(); array = set.getCharRanges();
assertEquals(2, array.length); assertEquals(2, array.length);
assertEquals(true, ArrayUtils.contains(array, new CharRange('c', '^', true))); assertEquals(true, ArrayUtils.contains(array, new CharRange('c', '^', true))); // "^c-^"
assertEquals(true, ArrayUtils.contains(array, new CharRange('d'))); assertEquals(true, ArrayUtils.contains(array, new CharRange('d'))); // "d"
set = CharSet.getInstance("^^-"); set = CharSet.getInstance("^^-");
array = set.getCharRanges(); array = set.getCharRanges();
assertEquals(2, array.length); assertEquals(2, array.length);
assertEquals(true, ArrayUtils.contains(array, new CharRange('^', '^', true))); assertEquals(true, ArrayUtils.contains(array, new CharRange('^', '^', true))); // "^^"
assertEquals(true, ArrayUtils.contains(array, new CharRange('-'))); assertEquals(true, ArrayUtils.contains(array, new CharRange('-'))); // "-"
} }
/**
 * Checks set strings in which '^' and '-' appear in unusual positions:
 * '^' used as a range endpoint, a space used as a range endpoint, and
 * ranges written with the higher character first.
 *
 * Each case asserts the expected CharRange objects and then probes
 * membership through contains(). The membership checks rely on the
 * character ordering ' ' (0x20) < '#' < '*' < '-' (0x2D) < 'A' (0x41)
 * < '^' (0x5E) < '_' (0x5F) < 'a' (0x61) < 'b' < 'c'.
 */
public void testConstructor_String_oddCombinations() {
CharSet set;
CharRange[] array = null;
// Case 1: '^' after the dash is taken as the range end, leaving "c" as a single char.
set = CharSet.getInstance("a-^c");
array = set.getCharRanges();
assertEquals(true, ArrayUtils.contains(array, new CharRange('a', '^'))); // "a-^"
assertEquals(true, ArrayUtils.contains(array, new CharRange('c'))); // "c"
assertEquals(false, set.contains('b')); // 'b' is above 'a', so outside '^'..'a'
assertEquals(true, set.contains('^'));
assertEquals(true, set.contains('_')); // between ^ and a
assertEquals(true, set.contains('c'));
// Case 2: same as case 1 but with a leading '^', so the range is negated.
set = CharSet.getInstance("^a-^c");
array = set.getCharRanges();
assertEquals(true, ArrayUtils.contains(array, new CharRange('a', '^', true))); // "^a-^"
assertEquals(true, ArrayUtils.contains(array, new CharRange('c'))); // "c"
assertEquals(true, set.contains('b')); // outside '^'..'a', so matched by the negated range
assertEquals(false, set.contains('^'));
assertEquals(false, set.contains('_')); // between ^ and a
// Case 3: a plain range plus a negated range whose union covers every character.
set = CharSet.getInstance("a- ^-- "); //contains everything
array = set.getCharRanges();
assertEquals(true, ArrayUtils.contains(array, new CharRange('a', ' '))); // "a- "
assertEquals(true, ArrayUtils.contains(array, new CharRange('-', ' ', true))); // "^-- "
assertEquals(true, set.contains('#'));
assertEquals(true, set.contains('^'));
assertEquals(true, set.contains('a'));
assertEquals(true, set.contains('*'));
assertEquals(true, set.contains('A'));
// Case 4: a three-character string starting with '^' is expected to be an ordinary range, not a negation.
set = CharSet.getInstance("^-b");
array = set.getCharRanges();
assertEquals(true, ArrayUtils.contains(array, new CharRange('^','b'))); // "^-b"
assertEquals(true, set.contains('b'));
assertEquals(true, set.contains('_')); // between ^ and b
assertEquals(false, set.contains('A')); // 'A' is below '^'
assertEquals(true, set.contains('^'));
// Case 5: the reversed form of case 4 is expected to equal the same '^'..'b' range.
set = CharSet.getInstance("b-^");
array = set.getCharRanges();
assertEquals(true, ArrayUtils.contains(array, new CharRange('^','b'))); // "b-^"
assertEquals(true, set.contains('b'));
assertEquals(true, set.contains('^'));
assertEquals(true, set.contains('a')); // between ^ and b
assertEquals(false, set.contains('c')); // 'c' is above 'b'
}
//----------------------------------------------------------------------- //-----------------------------------------------------------------------
public void testEquals_Object() { public void testEquals_Object() {
CharSet abc = CharSet.getInstance("abc"); CharSet abc = CharSet.getInstance("abc");
@ -377,6 +426,7 @@ public class CharSetTest extends TestCase {
//----------------------------------------------------------------------- //-----------------------------------------------------------------------
public void testContains_Char() { public void testContains_Char() {
CharSet btod = CharSet.getInstance("b-d"); CharSet btod = CharSet.getInstance("b-d");
CharSet dtob = CharSet.getInstance("d-b");
CharSet bcd = CharSet.getInstance("bcd"); CharSet bcd = CharSet.getInstance("bcd");
CharSet bd = CharSet.getInstance("bd"); CharSet bd = CharSet.getInstance("bd");
CharSet notbtod = CharSet.getInstance("^b-d"); CharSet notbtod = CharSet.getInstance("^b-d");
@ -404,6 +454,16 @@ public class CharSetTest extends TestCase {
assertEquals(false, notbtod.contains('c')); assertEquals(false, notbtod.contains('c'));
assertEquals(false, notbtod.contains('d')); assertEquals(false, notbtod.contains('d'));
assertEquals(true, notbtod.contains('e')); assertEquals(true, notbtod.contains('e'));
assertEquals(false, dtob.contains('a'));
assertEquals(true, dtob.contains('b'));
assertEquals(true, dtob.contains('c'));
assertEquals(true, dtob.contains('d'));
assertEquals(false, dtob.contains('e'));
CharRange[] array = dtob.getCharRanges();
assertEquals("[b-d]", dtob.toString());
assertEquals(1, array.length);
} }
//----------------------------------------------------------------------- //-----------------------------------------------------------------------