From d43b3199020157f47f230f97d626f70846e1e3dc Mon Sep 17 00:00:00 2001 From: Stephen Colebourne Date: Mon, 4 Aug 2003 00:50:14 +0000 Subject: [PATCH] Improve CharSet testing bug 22095, from Phil Steitz Rewrite CharSet parsing, much neater and simpler now git-svn-id: https://svn.apache.org/repos/asf/jakarta/commons/proper/lang/trunk@137565 13f79535-47bb-0310-9956-ffa450edef68 --- src/java/org/apache/commons/lang/CharSet.java | 106 +++++++----------- .../org/apache/commons/lang/CharSetUtils.java | 14 +-- .../org/apache/commons/lang/CharSetTest.java | 92 ++++++++++++--- 3 files changed, 122 insertions(+), 90 deletions(-) diff --git a/src/java/org/apache/commons/lang/CharSet.java b/src/java/org/apache/commons/lang/CharSet.java index c17eafb0b..9d5789b3c 100644 --- a/src/java/org/apache/commons/lang/CharSet.java +++ b/src/java/org/apache/commons/lang/CharSet.java @@ -67,8 +67,9 @@ import java.util.Set; * * @author Henri Yandell * @author Stephen Colebourne + * @author Phil Steitz * @since 1.0 - * @version $Id: CharSet.java,v 1.10 2003/08/02 18:18:33 scolebourne Exp $ + * @version $Id: CharSet.java,v 1.11 2003/08/04 00:50:14 scolebourne Exp $ */ public class CharSet implements Serializable { @@ -126,10 +127,26 @@ public class CharSet implements Serializable { * - set containing all the characters from the individual sets * * + *

The matching order is:

+ *
  1. Negated multi character range, such as "^a-e" + *
  2. Ordinary multi character range, such as "a-e" + *
  3. Negated single character, such as "^a" + *
  4. Ordinary single character, such as "a" + *
+ *

Matching works left to right. Once a match is found the + * search starts again from the next character.

+ * *

If the same range is defined twice using the same syntax, only * one range will be kept. - * Thus, "a-ca-c" creates only one range of "a-c". - * However, "a-cabc" creates two ranges as they are defined differently.

+ * Thus, "a-ca-c" creates only one range of "a-c".

+ * + *

If the start and end of a range are in the wrong order, + * they are reversed. Thus "a-e" is the same as "e-a". + * As a result, "a-ee-a" would create only one range, + * as the "a-e" and "e-a" are the same.

+ * + *

The set of characters represented is the union of the specified ranges.

* *

All CharSet objects returned by this method will be immutable.

* @@ -180,71 +197,26 @@ public class CharSet implements Serializable { } int len = str.length(); - switch (len) { - case 0: - // do nothing - break; - - case 1: - set.add(new CharRange(str.charAt(0))); - break; - - default: - int start = -1; - boolean negated = false; - for (int i = 0; i < len; i++) { - char ch = str.charAt(i); - if (ch == '-') { - if (start == -1) { - // dash found not as range separator - // treat as ordinary start block char - start = ch; - } else if (i == len - 1) { - // dash is last character, store two single characters - set.add(new CharRange((char) start, (char) start, negated)); - set.add(DASH); - start = -1; - negated = false; - } else { - // range block found, store it - set.add(new CharRange((char) start, str.charAt(++i), negated)); - start = -1; - negated = false; - } - } else if (ch == '^') { - if (start == -1) { - if (negated) { - // double negate, treat second as ordinary start block char - start = ch; - } else { - // negate next block - negated = true; - } - } else { - // previous block has ended, store it - set.add(new CharRange((char) start, (char) start, negated)); - start = -1; - negated = true; - } - } else { - if (start == -1) { - // start of block - start = ch; - } else { - // previous block has ended, store it, and start next block - set.add(new CharRange((char) start, (char) start, negated)); - start = ch; - negated = false; - } - } + int pos = 0; + while (pos < len) { + int remainder = (len - pos); + if (remainder >= 4 && str.charAt(pos) == '^' && str.charAt(pos + 2) == '-') { + // negated range + set.add(new CharRange(str.charAt(pos + 1), str.charAt(pos + 3), true)); + pos += 4; + } else if (remainder >= 3 && str.charAt(pos + 1) == '-') { + // range + set.add(new CharRange(str.charAt(pos), str.charAt(pos + 2))); + pos += 3; + } else if (remainder >= 2 && str.charAt(pos) == '^') { + // negated char + set.add(new CharRange(str.charAt(pos + 1), true)); + pos += 2; + } else { + // char + set.add(new 
CharRange(str.charAt(pos))); + pos += 1; } - // handle leftovers - if (start != -1) { - set.add(new CharRange((char) start, (char) start, negated)); - } else if (negated) { - set.add(NEGATE); - } - break; } } diff --git a/src/java/org/apache/commons/lang/CharSetUtils.java b/src/java/org/apache/commons/lang/CharSetUtils.java index b460c6afa..d23fb3d76 100644 --- a/src/java/org/apache/commons/lang/CharSetUtils.java +++ b/src/java/org/apache/commons/lang/CharSetUtils.java @@ -62,8 +62,9 @@ package org.apache.commons.lang; * * @author Henri Yandell * @author Stephen Colebourne + * @author Phil Steitz * @since 1.0 - * @version $Id: CharSetUtils.java,v 1.20 2003/08/02 18:18:33 scolebourne Exp $ + * @version $Id: CharSetUtils.java,v 1.21 2003/08/04 00:50:14 scolebourne Exp $ */ public class CharSetUtils { @@ -80,13 +81,12 @@ public class CharSetUtils { // Factory //----------------------------------------------------------------------- /** - *

Creates a CharSetUtils object which allows a certain amount of + *

Creates a CharSet instance which allows a certain amount of * set logic to be performed.

*

The syntax is:

* * @@ -94,6 +94,7 @@ public class CharSetUtils { * CharSetUtils.evaluateSet(null) = null * CharSetUtils.evaluateSet("") = CharSet matching nothing * CharSetUtils.evaluateSet("a-e") = CharSet matching a,b,c,d,e + * CharSetUtils.evaluateSet("abe-g") = CharSet matching a,b,e,f,g * * * @param set the set, may be null @@ -109,13 +110,12 @@ public class CharSetUtils { } /** - *

Creates a CharSetUtils object which allows a certain amount of + *

Creates a CharSet instance which allows a certain amount of * set logic to be performed.

*

The syntax is:

* * diff --git a/src/test/org/apache/commons/lang/CharSetTest.java b/src/test/org/apache/commons/lang/CharSetTest.java index e55b4a1dc..7d7010513 100644 --- a/src/test/org/apache/commons/lang/CharSetTest.java +++ b/src/test/org/apache/commons/lang/CharSetTest.java @@ -64,7 +64,8 @@ import junit.textui.TestRunner; * Unit tests {@link org.apache.commons.lang.CharSet}. * * @author Stephen Colebourne - * @version $Id: CharSetTest.java,v 1.1 2003/08/02 18:18:33 scolebourne Exp $ + * @author Phil Steitz + * @version $Id: CharSetTest.java,v 1.2 2003/08/04 00:50:14 scolebourne Exp $ */ public class CharSetTest extends TestCase { @@ -278,59 +279,107 @@ public class CharSetTest extends TestCase { set = CharSet.getInstance("^"); array = set.getCharRanges(); assertEquals(1, array.length); - assertEquals(true, ArrayUtils.contains(array, new CharRange('^'))); + assertEquals(true, ArrayUtils.contains(array, new CharRange('^'))); // "^" set = CharSet.getInstance("^^"); array = set.getCharRanges(); assertEquals(1, array.length); - assertEquals(true, ArrayUtils.contains(array, new CharRange('^', '^', true))); + assertEquals(true, ArrayUtils.contains(array, new CharRange('^', '^', true))); // "^^" set = CharSet.getInstance("^^^"); array = set.getCharRanges(); assertEquals(2, array.length); - assertEquals(true, ArrayUtils.contains(array, new CharRange('^', '^', true))); - assertEquals(true, ArrayUtils.contains(array, new CharRange('^', '^'))); + assertEquals(true, ArrayUtils.contains(array, new CharRange('^', '^', true))); // "^^" + assertEquals(true, ArrayUtils.contains(array, new CharRange('^', '^'))); // "^" set = CharSet.getInstance("^^^^"); array = set.getCharRanges(); assertEquals(1, array.length); - assertEquals(true, ArrayUtils.contains(array, new CharRange('^', '^', true))); + assertEquals(true, ArrayUtils.contains(array, new CharRange('^', '^', true))); // "^^" x2 set = CharSet.getInstance("a^"); array = set.getCharRanges(); assertEquals(2, array.length); - 
assertEquals(true, ArrayUtils.contains(array, new CharRange('a'))); - assertEquals(true, ArrayUtils.contains(array, new CharRange('^'))); + assertEquals(true, ArrayUtils.contains(array, new CharRange('a'))); // "a" + assertEquals(true, ArrayUtils.contains(array, new CharRange('^'))); // "^" set = CharSet.getInstance("^a-"); array = set.getCharRanges(); assertEquals(2, array.length); - assertEquals(true, ArrayUtils.contains(array, new CharRange('a', 'a', true))); - assertEquals(true, ArrayUtils.contains(array, new CharRange('-'))); + assertEquals(true, ArrayUtils.contains(array, new CharRange('a', 'a', true))); // "^a" + assertEquals(true, ArrayUtils.contains(array, new CharRange('-'))); // "-" set = CharSet.getInstance("^^-c"); array = set.getCharRanges(); assertEquals(1, array.length); - assertEquals(true, ArrayUtils.contains(array, new CharRange('^', 'c', true))); + assertEquals(true, ArrayUtils.contains(array, new CharRange('^', 'c', true))); // "^^-c" set = CharSet.getInstance("^c-^"); array = set.getCharRanges(); assertEquals(1, array.length); - assertEquals(true, ArrayUtils.contains(array, new CharRange('c', '^', true))); + assertEquals(true, ArrayUtils.contains(array, new CharRange('c', '^', true))); // "^c-^" set = CharSet.getInstance("^c-^d"); array = set.getCharRanges(); assertEquals(2, array.length); - assertEquals(true, ArrayUtils.contains(array, new CharRange('c', '^', true))); - assertEquals(true, ArrayUtils.contains(array, new CharRange('d'))); + assertEquals(true, ArrayUtils.contains(array, new CharRange('c', '^', true))); // "^c-^" + assertEquals(true, ArrayUtils.contains(array, new CharRange('d'))); // "d" set = CharSet.getInstance("^^-"); array = set.getCharRanges(); assertEquals(2, array.length); - assertEquals(true, ArrayUtils.contains(array, new CharRange('^', '^', true))); - assertEquals(true, ArrayUtils.contains(array, new CharRange('-'))); + assertEquals(true, ArrayUtils.contains(array, new CharRange('^', '^', true))); // "^^" + 
assertEquals(true, ArrayUtils.contains(array, new CharRange('-'))); // "-" } + public void testConstructor_String_oddCombinations() { + CharSet set; + CharRange[] array = null; + + set = CharSet.getInstance("a-^c"); + array = set.getCharRanges(); + assertEquals(true, ArrayUtils.contains(array, new CharRange('a', '^'))); // "a-^" + assertEquals(true, ArrayUtils.contains(array, new CharRange('c'))); // "c" + assertEquals(false, set.contains('b')); + assertEquals(true, set.contains('^')); + assertEquals(true, set.contains('_')); // between ^ and a + assertEquals(true, set.contains('c')); + + set = CharSet.getInstance("^a-^c"); + array = set.getCharRanges(); + assertEquals(true, ArrayUtils.contains(array, new CharRange('a', '^', true))); // "^a-^" + assertEquals(true, ArrayUtils.contains(array, new CharRange('c'))); // "c" + assertEquals(true, set.contains('b')); + assertEquals(false, set.contains('^')); + assertEquals(false, set.contains('_')); // between ^ and a + + set = CharSet.getInstance("a- ^-- "); //contains everything + array = set.getCharRanges(); + assertEquals(true, ArrayUtils.contains(array, new CharRange('a', ' '))); // "a- " + assertEquals(true, ArrayUtils.contains(array, new CharRange('-', ' ', true))); // "^-- " + assertEquals(true, set.contains('#')); + assertEquals(true, set.contains('^')); + assertEquals(true, set.contains('a')); + assertEquals(true, set.contains('*')); + assertEquals(true, set.contains('A')); + + set = CharSet.getInstance("^-b"); + array = set.getCharRanges(); + assertEquals(true, ArrayUtils.contains(array, new CharRange('^','b'))); // "^-b" + assertEquals(true, set.contains('b')); + assertEquals(true, set.contains('_')); // between ^ and a + assertEquals(false, set.contains('A')); + assertEquals(true, set.contains('^')); + + set = CharSet.getInstance("b-^"); + array = set.getCharRanges(); + assertEquals(true, ArrayUtils.contains(array, new CharRange('^','b'))); // "b-^" + assertEquals(true, set.contains('b')); + assertEquals(true, 
set.contains('^')); + assertEquals(true, set.contains('a')); // between ^ and b + assertEquals(false, set.contains('c')); + } + //----------------------------------------------------------------------- public void testEquals_Object() { CharSet abc = CharSet.getInstance("abc"); @@ -377,6 +426,7 @@ public class CharSetTest extends TestCase { //----------------------------------------------------------------------- public void testContains_Char() { CharSet btod = CharSet.getInstance("b-d"); + CharSet dtob = CharSet.getInstance("d-b"); CharSet bcd = CharSet.getInstance("bcd"); CharSet bd = CharSet.getInstance("bd"); CharSet notbtod = CharSet.getInstance("^b-d"); @@ -404,6 +454,16 @@ public class CharSetTest extends TestCase { assertEquals(false, notbtod.contains('c')); assertEquals(false, notbtod.contains('d')); assertEquals(true, notbtod.contains('e')); + + assertEquals(false, dtob.contains('a')); + assertEquals(true, dtob.contains('b')); + assertEquals(true, dtob.contains('c')); + assertEquals(true, dtob.contains('d')); + assertEquals(false, dtob.contains('e')); + + CharRange[] array = dtob.getCharRanges(); + assertEquals("[b-d]", dtob.toString()); + assertEquals(1, array.length); } //-----------------------------------------------------------------------