From d43b3199020157f47f230f97d626f70846e1e3dc Mon Sep 17 00:00:00 2001
From: Stephen Colebourne
Date: Mon, 4 Aug 2003 00:50:14 +0000
Subject: [PATCH] Improve CharSet testing bug 22095, from Phil Steitz Rewrite
CharSet parsing, much neater and simpler now
git-svn-id: https://svn.apache.org/repos/asf/jakarta/commons/proper/lang/trunk@137565 13f79535-47bb-0310-9956-ffa450edef68
---
src/java/org/apache/commons/lang/CharSet.java | 106 +++++++-----------
.../org/apache/commons/lang/CharSetUtils.java | 14 +--
.../org/apache/commons/lang/CharSetTest.java | 92 ++++++++++++---
3 files changed, 122 insertions(+), 90 deletions(-)
diff --git a/src/java/org/apache/commons/lang/CharSet.java b/src/java/org/apache/commons/lang/CharSet.java
index c17eafb0b..9d5789b3c 100644
--- a/src/java/org/apache/commons/lang/CharSet.java
+++ b/src/java/org/apache/commons/lang/CharSet.java
@@ -67,8 +67,9 @@ import java.util.Set;
*
* @author Henri Yandell
* @author Stephen Colebourne
+ * @author Phil Steitz
* @since 1.0
- * @version $Id: CharSet.java,v 1.10 2003/08/02 18:18:33 scolebourne Exp $
+ * @version $Id: CharSet.java,v 1.11 2003/08/04 00:50:14 scolebourne Exp $
*/
public class CharSet implements Serializable {
@@ -126,10 +127,26 @@ public class CharSet implements Serializable {
* - set containing all the characters from the individual sets
*
*
+ * The matching order is:
+ * - Negated multi character range, such as "^a-e"
+ * - Ordinary multi character range, such as "a-e"
+ * - Negated single character, such as "^a"
+ * - Ordinary single character, such as "a"
+ *
+ * Matching works left to right. Once a match is found the
+ * search starts again from the next character.
+ *
* If the same range is defined twice using the same syntax, only
* one range will be kept.
- * Thus, "a-ca-c" creates only one range of "a-c".
- * However, "a-cabc" creates two ranges as they are defined differently.
+ * Thus, "a-ca-c" creates only one range of "a-c".
+ *
+ * If the start and end of a range are in the wrong order,
+ * they are reversed. Thus "a-e" is the same as "e-a".
+ * As a result, "a-ee-a" would create only one range,
+ * as the "a-e" and "e-a" are the same.
+ *
+ * The set of characters represented is the union of the specified ranges.
*
* All CharSet objects returned by this method will be immutable.
*
@@ -180,71 +197,26 @@ public class CharSet implements Serializable {
}
int len = str.length();
- switch (len) {
- case 0:
- // do nothing
- break;
-
- case 1:
- set.add(new CharRange(str.charAt(0)));
- break;
-
- default:
- int start = -1;
- boolean negated = false;
- for (int i = 0; i < len; i++) {
- char ch = str.charAt(i);
- if (ch == '-') {
- if (start == -1) {
- // dash found not as range separator
- // treat as ordinary start block char
- start = ch;
- } else if (i == len - 1) {
- // dash is last character, store two single characters
- set.add(new CharRange((char) start, (char) start, negated));
- set.add(DASH);
- start = -1;
- negated = false;
- } else {
- // range block found, store it
- set.add(new CharRange((char) start, str.charAt(++i), negated));
- start = -1;
- negated = false;
- }
- } else if (ch == '^') {
- if (start == -1) {
- if (negated) {
- // double negate, treat second as ordinary start block char
- start = ch;
- } else {
- // negate next block
- negated = true;
- }
- } else {
- // previous block has ended, store it
- set.add(new CharRange((char) start, (char) start, negated));
- start = -1;
- negated = true;
- }
- } else {
- if (start == -1) {
- // start of block
- start = ch;
- } else {
- // previous block has ended, store it, and start next block
- set.add(new CharRange((char) start, (char) start, negated));
- start = ch;
- negated = false;
- }
- }
+ int pos = 0;
+ while (pos < len) {
+ int remainder = (len - pos);
+ if (remainder >= 4 && str.charAt(pos) == '^' && str.charAt(pos + 2) == '-') {
+ // negated range
+ set.add(new CharRange(str.charAt(pos + 1), str.charAt(pos + 3), true));
+ pos += 4;
+ } else if (remainder >= 3 && str.charAt(pos + 1) == '-') {
+ // range
+ set.add(new CharRange(str.charAt(pos), str.charAt(pos + 2)));
+ pos += 3;
+ } else if (remainder >= 2 && str.charAt(pos) == '^') {
+ // negated char
+ set.add(new CharRange(str.charAt(pos + 1), true));
+ pos += 2;
+ } else {
+ // char
+ set.add(new CharRange(str.charAt(pos)));
+ pos += 1;
}
- // handle leftovers
- if (start != -1) {
- set.add(new CharRange((char) start, (char) start, negated));
- } else if (negated) {
- set.add(NEGATE);
- }
- break;
}
}
diff --git a/src/java/org/apache/commons/lang/CharSetUtils.java b/src/java/org/apache/commons/lang/CharSetUtils.java
index b460c6afa..d23fb3d76 100644
--- a/src/java/org/apache/commons/lang/CharSetUtils.java
+++ b/src/java/org/apache/commons/lang/CharSetUtils.java
@@ -62,8 +62,9 @@ package org.apache.commons.lang;
*
* @author Henri Yandell
* @author Stephen Colebourne
+ * @author Phil Steitz
* @since 1.0
- * @version $Id: CharSetUtils.java,v 1.20 2003/08/02 18:18:33 scolebourne Exp $
+ * @version $Id: CharSetUtils.java,v 1.21 2003/08/04 00:50:14 scolebourne Exp $
*/
public class CharSetUtils {
@@ -80,13 +81,12 @@ public class CharSetUtils {
// Factory
//-----------------------------------------------------------------------
/**
- * Creates a CharSetUtils object which allows a certain amount of
+ * Creates a CharSet instance which allows a certain amount of
* set logic to be performed.
* The syntax is:
*
* - "aeio" which implies 'a','e',..
- * - "^e" implies not e. However it only negates, it's not
- * a set in itself due to the size of that set in unicode.
+ * - "^e" implies not e.
* - "ej-m" implies e,j->m. e,j,k,l,m.
*
*
@@ -94,6 +94,7 @@ public class CharSetUtils {
* CharSetUtils.evaluateSet(null) = null
* CharSetUtils.evaluateSet("") = CharSet matching nothing
* CharSetUtils.evaluateSet("a-e") = CharSet matching a,b,c,d,e
+ * CharSetUtils.evaluateSet("abe-g") = CharSet matching a,b,e,f,g
*
*
* @param set the set, may be null
@@ -109,13 +110,12 @@ public class CharSetUtils {
}
/**
- * Creates a CharSetUtils object which allows a certain amount of
+ * Creates a CharSet instance which allows a certain amount of
* set logic to be performed.
* The syntax is:
*
* - "aeio" which implies 'a','e',..
- * - "^e" implies not e. However it only negates, it's not
- * a set in itself due to the size of that set in unicode.
+ * - "^e" implies not e.
* - "ej-m" implies e,j->m. e,j,k,l,m.
*
*
diff --git a/src/test/org/apache/commons/lang/CharSetTest.java b/src/test/org/apache/commons/lang/CharSetTest.java
index e55b4a1dc..7d7010513 100644
--- a/src/test/org/apache/commons/lang/CharSetTest.java
+++ b/src/test/org/apache/commons/lang/CharSetTest.java
@@ -64,7 +64,8 @@ import junit.textui.TestRunner;
* Unit tests {@link org.apache.commons.lang.CharSet}.
*
* @author Stephen Colebourne
- * @version $Id: CharSetTest.java,v 1.1 2003/08/02 18:18:33 scolebourne Exp $
+ * @author Phil Steitz
+ * @version $Id: CharSetTest.java,v 1.2 2003/08/04 00:50:14 scolebourne Exp $
*/
public class CharSetTest extends TestCase {
@@ -278,59 +279,107 @@ public class CharSetTest extends TestCase {
set = CharSet.getInstance("^");
array = set.getCharRanges();
assertEquals(1, array.length);
- assertEquals(true, ArrayUtils.contains(array, new CharRange('^')));
+ assertEquals(true, ArrayUtils.contains(array, new CharRange('^'))); // "^"
set = CharSet.getInstance("^^");
array = set.getCharRanges();
assertEquals(1, array.length);
- assertEquals(true, ArrayUtils.contains(array, new CharRange('^', '^', true)));
+ assertEquals(true, ArrayUtils.contains(array, new CharRange('^', '^', true))); // "^^"
set = CharSet.getInstance("^^^");
array = set.getCharRanges();
assertEquals(2, array.length);
- assertEquals(true, ArrayUtils.contains(array, new CharRange('^', '^', true)));
- assertEquals(true, ArrayUtils.contains(array, new CharRange('^', '^')));
+ assertEquals(true, ArrayUtils.contains(array, new CharRange('^', '^', true))); // "^^"
+ assertEquals(true, ArrayUtils.contains(array, new CharRange('^', '^'))); // "^"
set = CharSet.getInstance("^^^^");
array = set.getCharRanges();
assertEquals(1, array.length);
- assertEquals(true, ArrayUtils.contains(array, new CharRange('^', '^', true)));
+ assertEquals(true, ArrayUtils.contains(array, new CharRange('^', '^', true))); // "^^" x2
set = CharSet.getInstance("a^");
array = set.getCharRanges();
assertEquals(2, array.length);
- assertEquals(true, ArrayUtils.contains(array, new CharRange('a')));
- assertEquals(true, ArrayUtils.contains(array, new CharRange('^')));
+ assertEquals(true, ArrayUtils.contains(array, new CharRange('a'))); // "a"
+ assertEquals(true, ArrayUtils.contains(array, new CharRange('^'))); // "^"
set = CharSet.getInstance("^a-");
array = set.getCharRanges();
assertEquals(2, array.length);
- assertEquals(true, ArrayUtils.contains(array, new CharRange('a', 'a', true)));
- assertEquals(true, ArrayUtils.contains(array, new CharRange('-')));
+ assertEquals(true, ArrayUtils.contains(array, new CharRange('a', 'a', true))); // "^a"
+ assertEquals(true, ArrayUtils.contains(array, new CharRange('-'))); // "-"
set = CharSet.getInstance("^^-c");
array = set.getCharRanges();
assertEquals(1, array.length);
- assertEquals(true, ArrayUtils.contains(array, new CharRange('^', 'c', true)));
+ assertEquals(true, ArrayUtils.contains(array, new CharRange('^', 'c', true))); // "^^-c"
set = CharSet.getInstance("^c-^");
array = set.getCharRanges();
assertEquals(1, array.length);
- assertEquals(true, ArrayUtils.contains(array, new CharRange('c', '^', true)));
+ assertEquals(true, ArrayUtils.contains(array, new CharRange('c', '^', true))); // "^c-^"
set = CharSet.getInstance("^c-^d");
array = set.getCharRanges();
assertEquals(2, array.length);
- assertEquals(true, ArrayUtils.contains(array, new CharRange('c', '^', true)));
- assertEquals(true, ArrayUtils.contains(array, new CharRange('d')));
+ assertEquals(true, ArrayUtils.contains(array, new CharRange('c', '^', true))); // "^c-^"
+ assertEquals(true, ArrayUtils.contains(array, new CharRange('d'))); // "d"
set = CharSet.getInstance("^^-");
array = set.getCharRanges();
assertEquals(2, array.length);
- assertEquals(true, ArrayUtils.contains(array, new CharRange('^', '^', true)));
- assertEquals(true, ArrayUtils.contains(array, new CharRange('-')));
+ assertEquals(true, ArrayUtils.contains(array, new CharRange('^', '^', true))); // "^^"
+ assertEquals(true, ArrayUtils.contains(array, new CharRange('-'))); // "-"
}
+ public void testConstructor_String_oddCombinations() { // inputs where '^' and '-' sit in positions that make their role ambiguous
+ CharSet set;
+ CharRange[] array = null;
+
+ set = CharSet.getInstance("a-^c"); // parsed as range "a-^" plus single char "c"; '^' is not a negation here
+ array = set.getCharRanges();
+ assertEquals(true, ArrayUtils.contains(array, new CharRange('a', '^'))); // "a-^"
+ assertEquals(true, ArrayUtils.contains(array, new CharRange('c'))); // "c"
+ assertEquals(false, set.contains('b')); // outside the ^..a range and not listed on its own
+ assertEquals(true, set.contains('^'));
+ assertEquals(true, set.contains('_')); // between ^ and a
+ assertEquals(true, set.contains('c')); // matched by the single char "c"
+
+ set = CharSet.getInstance("^a-^c"); // parsed as negated range "^a-^" plus single char "c"
+ array = set.getCharRanges();
+ assertEquals(true, ArrayUtils.contains(array, new CharRange('a', '^', true))); // "^a-^"
+ assertEquals(true, ArrayUtils.contains(array, new CharRange('c'))); // "c"
+ assertEquals(true, set.contains('b')); // outside ^..a, so matched by the negated range
+ assertEquals(false, set.contains('^')); // inside ^..a, so excluded by the negation
+ assertEquals(false, set.contains('_')); // between ^ and a
+
+ set = CharSet.getInstance("a- ^-- "); // range "a- " unioned with negated range "^-- ": contains everything
+ array = set.getCharRanges();
+ assertEquals(true, ArrayUtils.contains(array, new CharRange('a', ' '))); // "a- "
+ assertEquals(true, ArrayUtils.contains(array, new CharRange('-', ' ', true))); // "^-- "
+ assertEquals(true, set.contains('#'));
+ assertEquals(true, set.contains('^'));
+ assertEquals(true, set.contains('a'));
+ assertEquals(true, set.contains('*'));
+ assertEquals(true, set.contains('A'));
+
+ set = CharSet.getInstance("^-b"); // only three chars, too short for a negated range: plain range from '^' to 'b'
+ array = set.getCharRanges();
+ assertEquals(true, ArrayUtils.contains(array, new CharRange('^','b'))); // "^-b"
+ assertEquals(true, set.contains('b'));
+ assertEquals(true, set.contains('_')); // between ^ and b
+ assertEquals(false, set.contains('A')); // below '^', outside the range
+ assertEquals(true, set.contains('^'));
+
+ set = CharSet.getInstance("b-^"); // bounds given in reverse order: expected to equal the "^-b" range
+ array = set.getCharRanges();
+ assertEquals(true, ArrayUtils.contains(array, new CharRange('^','b'))); // "b-^"
+ assertEquals(true, set.contains('b'));
+ assertEquals(true, set.contains('^'));
+ assertEquals(true, set.contains('a')); // between ^ and b
+ assertEquals(false, set.contains('c')); // above 'b', outside the range
+ }
+
//-----------------------------------------------------------------------
public void testEquals_Object() {
CharSet abc = CharSet.getInstance("abc");
@@ -377,6 +426,7 @@ public class CharSetTest extends TestCase {
//-----------------------------------------------------------------------
public void testContains_Char() {
CharSet btod = CharSet.getInstance("b-d");
+ CharSet dtob = CharSet.getInstance("d-b");
CharSet bcd = CharSet.getInstance("bcd");
CharSet bd = CharSet.getInstance("bd");
CharSet notbtod = CharSet.getInstance("^b-d");
@@ -404,6 +454,16 @@ public class CharSetTest extends TestCase {
assertEquals(false, notbtod.contains('c'));
assertEquals(false, notbtod.contains('d'));
assertEquals(true, notbtod.contains('e'));
+
+ assertEquals(false, dtob.contains('a'));
+ assertEquals(true, dtob.contains('b'));
+ assertEquals(true, dtob.contains('c'));
+ assertEquals(true, dtob.contains('d'));
+ assertEquals(false, dtob.contains('e'));
+
+ CharRange[] array = dtob.getCharRanges();
+ assertEquals("[b-d]", dtob.toString());
+ assertEquals(1, array.length);
}
//-----------------------------------------------------------------------