LUCENE-2413: consolidate pattern analysis into contrib/analyzers, deprecate the old PatternAnalyzer

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@940813 13f79535-47bb-0310-9956-ffa450edef68
2010-05-04 11:57:21 +00:00 · 2010-05-04 11:57:21 +00:00 · cd320b5f57
parent 6fa480a5ed
commit cd320b5f57
15 changed files with 333 additions and 156 deletions
--- a/lucene/contrib/CHANGES.txt
+++ b/lucene/contrib/CHANGES.txt
@ -92,6 +92,9 @@ API Changes
   stemming. Add Turkish and Romanian stopwords lists to support this.
   (Robert Muir, Uwe Schindler, Simon Willnauer)
   
+ * LUCENE-2413: Deprecated PatternAnalyzer in contrib/analyzers, in favor of the 
+   pattern package (CharFilter, Tokenizer, TokenFilter).  (Robert Muir)
+   
 New features

 * LUCENE-2306: Add NumericRangeFilter and NumericRangeQuery support to XMLQueryParser.
@ -165,6 +168,8 @@ New features
     into subwords and performs optional transformations on subword groups.
   - o.a.l.analysis.miscellaneous.RemoveDuplicatesTokenFilter: TokenFilter which 
     filters out Tokens at the same position and Term text as the previous token.
+   - o.a.l.analysis.pattern: Package for pattern-based analysis, containing a 
+     CharFilter, Tokenizer, and TokenFilter for transforming text with regexes.
   (... in progress)

 Build
--- a/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/PatternAnalyzer.java
+++ b/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/miscellaneous/PatternAnalyzer.java
@ -62,8 +62,10 @@ import org.apache.lucene.util.Version;
 *     pat.tokenStream("content", "James is running round in the woods"), 
 *     "English"));
 * </pre>
- *
+ * @deprecated Use the pattern-based analysis in org.apache.lucene.analysis.pattern instead.
+ * This analyzer will be removed in a future release (4.1).
 */
+@Deprecated
 public final class PatternAnalyzer extends Analyzer {
  
  /** <code>"\\W+"</code>; Divides text at non-letters (NOT Character.isLetter(c)) */
--- a/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceCharFilter.java
+++ b/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceCharFilter.java
@ -15,7 +15,7 @@
 * limitations under the License.
 */

-package org.apache.solr.analysis;
+package org.apache.lucene.analysis.pattern;

 import java.io.IOException;
 import java.util.LinkedList;
@ -45,7 +45,6 @@ import org.apache.lucene.analysis.CharStream;
 * highlight snippet="aa1&lt;em&gt;23bb&lt;/em&gt;"
 * </p>
 * 
- * @version $Id$
 * @since Solr 1.5
 */
 public class PatternReplaceCharFilter extends BaseCharFilter {
--- a/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceFilter.java
+++ b/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pattern/PatternReplaceFilter.java
@ -15,7 +15,7 @@
 * limitations under the License.
 */

-package org.apache.solr.analysis;
+package org.apache.lucene.analysis.pattern;

 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
@ -35,7 +35,6 @@ import java.io.IOException;
 * string.
 * </p>
 * 
- * @version $Id:$
 * @see Pattern
 */
 public final class PatternReplaceFilter extends TokenFilter {
--- a/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java
+++ b/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pattern/PatternTokenizer.java
@ -15,7 +15,7 @@
 * limitations under the License.
 */

-package org.apache.solr.analysis;
+package org.apache.lucene.analysis.pattern;

 import java.io.IOException;
 import java.io.Reader;
@ -24,7 +24,6 @@ import java.util.regex.Pattern;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.commons.io.IOUtils;

 /**
 * This tokenizer uses regex pattern matching to construct distinct tokens
@ -51,7 +50,6 @@ import org.apache.commons.io.IOUtils;
 * </p>
 * <p>NOTE: This Tokenizer does not output tokens that are of zero length.</p>
 *
- * @version $Id$
 * @see Pattern
 */
 public final class PatternTokenizer extends Tokenizer {
@ -59,7 +57,7 @@ public final class PatternTokenizer extends Tokenizer {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

-  private String str;
+  private final StringBuilder str = new StringBuilder();
  private int index;
  
  private final Pattern pattern;
@ -71,7 +69,7 @@ public final class PatternTokenizer extends Tokenizer {
    super(input);
    this.pattern = pattern;
    this.group = group;
-    str = IOUtils.toString(input);
+    fillBuffer(str, input);
    matcher = pattern.matcher(str);
    index = 0;
  }
@ -84,11 +82,11 @@ public final class PatternTokenizer extends Tokenizer {
    
      // match a specific group
      while (matcher.find()) {
-        final String match = matcher.group(group);
-        if (match.length() == 0) continue;
-        termAtt.setEmpty().append(match);
        index = matcher.start(group);
-        offsetAtt.setOffset(correctOffset(index), correctOffset(matcher.end(group)));
+        final int endIndex = matcher.end(group);
+        if (index == endIndex) continue;       
+        termAtt.setEmpty().append(str, index, endIndex);
+        offsetAtt.setOffset(correctOffset(index), correctOffset(endIndex));
        return true;
      }
      
@ -131,9 +129,19 @@ public final class PatternTokenizer extends Tokenizer {
  @Override
  public void reset(Reader input) throws IOException {
    super.reset(input);
-    str = IOUtils.toString(input);
+    fillBuffer(str, input);
    matcher.reset(str);
    index = 0;
  }
-
+  
+  // TODO: we should see if we can make this tokenizer work without reading
+  // the entire document into RAM, perhaps with Matcher.hitEnd/requireEnd ?
+  final char[] buffer = new char[8192];
+  private void fillBuffer(StringBuilder sb, Reader input) throws IOException {
+    int len;
+    sb.setLength(0);
+    while ((len = input.read(buffer)) > 0) {
+      sb.append(buffer, 0, len);
+    }
+  }
 }
--- a/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pattern/package.html
+++ b/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/pattern/package.html
@ -0,0 +1,22 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html><head></head>
+<body>
+Set of components for pattern-based (regex) analysis.
+</body>
+</html>
--- a/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilter.java
+++ b/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceCharFilter.java
@ -15,39 +15,31 @@
 * limitations under the License.
 */

-package org.apache.solr.analysis;
+package org.apache.lucene.analysis.pattern;

 import java.io.IOException;
 import java.io.StringReader;
-import java.util.HashMap;
-import java.util.Map;
 import java.util.regex.Pattern;

+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.CharReader;
 import org.apache.lucene.analysis.CharStream;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.WhitespaceTokenizer;

 /**
- * 
- * @version $Id$
- *
+ * Tests {@link PatternReplaceCharFilter}
 */
-public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
+public class TestPatternReplaceCharFilter extends BaseTokenStreamTestCase {
  
  //           1111
  // 01234567890123
  // this is test.
  public void testNothingChange() throws IOException {
    final String BLOCK = "this is test.";
-    PatternReplaceCharFilterFactory factory = new PatternReplaceCharFilterFactory();
-    Map<String,String> args = new HashMap<String,String>();
-    args.put("pattern", "(aa)\\s+(bb)\\s+(cc)");
-    args.put("replacement", "$1$2$3");
-    factory.init(args);
-    CharStream cs = factory.create(
+    CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1$2$3",
          CharReader.get( new StringReader( BLOCK ) ) );
-    TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs );
+    TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
    assertTokenStreamContents(ts,
        new String[] { "this", "is", "test." },
        new int[] { 0, 5, 8 },
@ -58,13 +50,9 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
  // aa bb cc
  public void testReplaceByEmpty() throws IOException {
    final String BLOCK = "aa bb cc";
-    PatternReplaceCharFilterFactory factory = new PatternReplaceCharFilterFactory();
-    Map<String,String> args = new HashMap<String,String>();
-    args.put("pattern", "(aa)\\s+(bb)\\s+(cc)");
-    factory.init(args);
-    CharStream cs = factory.create(
+    CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "",
          CharReader.get( new StringReader( BLOCK ) ) );
-    TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs );
+    TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
    assertFalse(ts.incrementToken());
  }
  
@ -73,14 +61,9 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
  // aa#bb#cc
  public void test1block1matchSameLength() throws IOException {
    final String BLOCK = "aa bb cc";
-    PatternReplaceCharFilterFactory factory = new PatternReplaceCharFilterFactory();
-    Map<String,String> args = new HashMap<String,String>();
-    args.put("pattern", "(aa)\\s+(bb)\\s+(cc)");
-    args.put("replacement", "$1#$2#$3");
-    factory.init(args);
-    CharStream cs = factory.create(
+    CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1#$2#$3",
          CharReader.get( new StringReader( BLOCK ) ) );
-    TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs );
+    TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
    assertTokenStreamContents(ts,
        new String[] { "aa#bb#cc" },
        new int[] { 0 },
@ -95,7 +78,7 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
    final String BLOCK = "aa bb cc dd";
    CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1##$2###$3",
          CharReader.get( new StringReader( BLOCK ) ) );
-    TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs );
+    TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
    assertTokenStreamContents(ts,
        new String[] { "aa##bb###cc", "dd" },
        new int[] { 0, 9 },
@ -109,7 +92,7 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
    final String BLOCK = " a  a";
    CharStream cs = new PatternReplaceCharFilter( pattern("a"), "aa",
          CharReader.get( new StringReader( BLOCK ) ) );
-    TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs );
+    TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
    assertTokenStreamContents(ts,
        new String[] { "aa", "aa" },
        new int[] { 1, 4 },
@ -124,7 +107,7 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
    final String BLOCK = "aa  bb   cc dd";
    CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1#$2",
          CharReader.get( new StringReader( BLOCK ) ) );
-    TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs );
+    TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
    assertTokenStreamContents(ts,
        new String[] { "aa#bb", "dd" },
        new int[] { 0, 12 },
@ -139,7 +122,7 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
    final String BLOCK = "  aa bb cc --- aa bb aa   bb   cc";
    CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)\\s+(cc)"), "$1  $2  $3",
          CharReader.get( new StringReader( BLOCK ) ) );
-    TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs );
+    TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
    assertTokenStreamContents(ts,
        new String[] { "aa", "bb", "cc", "---", "aa", "bb", "aa", "bb", "cc" },
        new int[] { 2, 6, 9, 11, 15, 18, 21, 25, 29 },
@ -154,7 +137,7 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
    final String BLOCK = "  aa bb cc --- aa bb aa. bb aa   bb cc";
    CharStream cs = new PatternReplaceCharFilter( pattern("(aa)\\s+(bb)"), "$1##$2", ".",
          CharReader.get( new StringReader( BLOCK ) ) );
-    TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs );
+    TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
    assertTokenStreamContents(ts,
        new String[] { "aa##bb", "cc", "---", "aa##bb", "aa.", "bb", "aa##bb", "cc" },
        new int[] { 2, 8, 11, 15, 21, 25, 28, 36 },
@ -171,7 +154,7 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
        CharReader.get( new StringReader( BLOCK ) ) );
    cs = new PatternReplaceCharFilter( pattern("bb"), "b", ".", cs );
    cs = new PatternReplaceCharFilter( pattern("ccc"), "c", ".", cs );
-    TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs );
+    TokenStream ts = new WhitespaceTokenizer(TEST_VERSION_CURRENT, cs );
    assertTokenStreamContents(ts,
        new String[] { "aa", "b", "-", "c", ".", "---", "b", "aa", ".", "c", "c", "b" },
        new int[] { 1, 3, 6, 8, 12, 14, 18, 21, 23, 25, 29, 33 },
--- a/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceFilter.java
+++ b/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/pattern/TestPatternReplaceFilter.java
@ -15,8 +15,9 @@
 * limitations under the License.
 */

-package org.apache.solr.analysis;
+package org.apache.lucene.analysis.pattern;

+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.WhitespaceTokenizer;

@ -26,12 +27,12 @@ import java.util.regex.Pattern;
 /**
 * @version $Id:$
 */
-public class TestPatternReplaceFilter extends BaseTokenTestCase {
+public class TestPatternReplaceFilter extends BaseTokenStreamTestCase {

  public void testReplaceAll() throws Exception {
    String input = "aabfooaabfooabfoob ab caaaaaaaaab";
    TokenStream ts = new PatternReplaceFilter
-            (new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)),
+            (new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)),
                    Pattern.compile("a*b"),
                    "-", true);
    assertTokenStreamContents(ts, 
@ -41,7 +42,7 @@ public class TestPatternReplaceFilter extends BaseTokenTestCase {
  public void testReplaceFirst() throws Exception {
    String input = "aabfooaabfooabfoob ab caaaaaaaaab";
    TokenStream ts = new PatternReplaceFilter
-            (new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)),
+            (new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)),
                    Pattern.compile("a*b"),
                    "-", false);
    assertTokenStreamContents(ts, 
@ -51,7 +52,7 @@ public class TestPatternReplaceFilter extends BaseTokenTestCase {
  public void testStripFirst() throws Exception {
    String input = "aabfooaabfooabfoob ab caaaaaaaaab";
    TokenStream ts = new PatternReplaceFilter
-            (new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)),
+            (new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)),
                    Pattern.compile("a*b"),
                    null, false);
    assertTokenStreamContents(ts,
@ -61,7 +62,7 @@ public class TestPatternReplaceFilter extends BaseTokenTestCase {
  public void testStripAll() throws Exception {
    String input = "aabfooaabfooabfoob ab caaaaaaaaab";
    TokenStream ts = new PatternReplaceFilter
-            (new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)),
+            (new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)),
                    Pattern.compile("a*b"),
                    null, true);
    assertTokenStreamContents(ts,
@ -71,7 +72,7 @@ public class TestPatternReplaceFilter extends BaseTokenTestCase {
  public void testReplaceAllWithBackRef() throws Exception {
    String input = "aabfooaabfooabfoob ab caaaaaaaaab";
    TokenStream ts = new PatternReplaceFilter
-            (new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)),
+            (new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)),
                    Pattern.compile("(a*)b"),
                    "$1\\$", true);
    assertTokenStreamContents(ts,
--- a/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/pattern/TestPatternTokenizer.java
+++ b/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/pattern/TestPatternTokenizer.java
@ -0,0 +1,119 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.pattern;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Pattern;
+
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharReader;
+import org.apache.lucene.analysis.CharStream;
+import org.apache.lucene.analysis.charfilter.MappingCharFilter;
+import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+public class TestPatternTokenizer extends BaseTokenStreamTestCase 
+{
+	public void testSplitting() throws Exception 
+  {
+    String qpattern = "\\'([^\\']+)\\'"; // get stuff between "'"
+    String[][] tests = {
+      // group  pattern        input                    output
+      { "-1",   "--",          "aaa--bbb--ccc",         "aaa bbb ccc" },
+      { "-1",   ":",           "aaa:bbb:ccc",           "aaa bbb ccc" },
+      { "-1",   "\\p{Space}",  "aaa   bbb \t\tccc  ",   "aaa bbb ccc" },
+      { "-1",   ":",           "boo:and:foo",           "boo and foo" },
+      { "-1",   "o",           "boo:and:foo",           "b :and:f" },
+      { "0",    ":",           "boo:and:foo",           ": :" },
+      { "0",    qpattern,      "aaa 'bbb' 'ccc'",       "'bbb' 'ccc'" },
+      { "1",    qpattern,      "aaa 'bbb' 'ccc'",       "bbb ccc" }
+    };
+    
+    for( String[] test : tests ) {     
+      TokenStream stream = new PatternTokenizer(new StringReader(test[2]), Pattern.compile(test[1]), Integer.parseInt(test[0]));
+      String out = tsToString( stream );
+      // System.out.println( test[2] + " ==> " + out );
+
+      assertEquals("pattern: "+test[1]+" with input: "+test[2], test[3], out );
+      
+      // Make sure it is the same as if we called 'split'
+      // test disabled, as we remove empty tokens
+      /*if( "-1".equals( test[0] ) ) {
+        String[] split = test[2].split( test[1] );
+        stream = tokenizer.create( new StringReader( test[2] ) );
+        int i=0;
+        for( Token t = stream.next(); null != t; t = stream.next() ) 
+        {
+          assertEquals( "split: "+test[1] + " "+i, split[i++], new String(t.termBuffer(), 0, t.termLength()) );
+        }
+      }*/
+    } 
+	}
+	
+  public void testOffsetCorrection() throws Exception {
+    final String INPUT = "G&uuml;nther G&uuml;nther is here";
+
+    // create MappingCharFilter
+    List<String> mappingRules = new ArrayList<String>();
+    mappingRules.add( "\"&uuml;\" => \"ü\"" );
+    NormalizeCharMap normMap = new NormalizeCharMap();
+    normMap.add("&uuml;", "ü");
+    CharStream charStream = new MappingCharFilter( normMap, CharReader.get( new StringReader( INPUT ) ) );
+
+    // create PatternTokenizer
+    TokenStream stream = new PatternTokenizer(charStream, Pattern.compile("[,;/\\s]+"), -1);
+    assertTokenStreamContents(stream,
+        new String[] { "Günther", "Günther", "is", "here" },
+        new int[] { 0, 13, 26, 29 },
+        new int[] { 12, 25, 28, 33 });
+    
+    charStream = new MappingCharFilter( normMap, CharReader.get( new StringReader( INPUT ) ) );
+    stream = new PatternTokenizer(charStream, Pattern.compile("Günther"), 0);
+    assertTokenStreamContents(stream,
+        new String[] { "Günther", "Günther" },
+        new int[] { 0, 13 },
+        new int[] { 12, 25 });
+  }
+  
+  /** 
+   * TODO: rewrite tests not to use string comparison.
+   * @deprecated only tests TermAttribute!
+   */
+  private static String tsToString(TokenStream in) throws IOException {
+    StringBuilder out = new StringBuilder();
+    CharTermAttribute termAtt = in.addAttribute(CharTermAttribute.class);
+    // extra safety to enforce, that the state is not preserved and also
+    // assign bogus values
+    in.clearAttributes();
+    termAtt.setEmpty().append("bogusTerm");
+    while (in.incrementToken()) {
+      if (out.length() > 0)
+        out.append(' ');
+      out.append(termAtt.toString());
+      in.clearAttributes();
+      termAtt.setEmpty().append("bogusTerm");
+    }
+
+    in.close();
+    return out.toString();
+  }
+}
--- a/solr/src/java/org/apache/solr/analysis/PatternReplaceCharFilterFactory.java
+++ b/solr/src/java/org/apache/solr/analysis/PatternReplaceCharFilterFactory.java
@ -22,6 +22,7 @@ import java.util.regex.Pattern;
 import java.util.regex.PatternSyntaxException;

 import org.apache.lucene.analysis.CharStream;
+import org.apache.lucene.analysis.pattern.PatternReplaceCharFilter;

 /**
 * 
--- a/solr/src/java/org/apache/solr/analysis/PatternReplaceFilterFactory.java
+++ b/solr/src/java/org/apache/solr/analysis/PatternReplaceFilterFactory.java
@ -17,6 +17,7 @@

 package org.apache.solr.analysis;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.pattern.PatternReplaceFilter;

 import java.util.Map;
 import java.util.regex.Pattern;
--- a/solr/src/java/org/apache/solr/analysis/PatternTokenizerFactory.java
+++ b/solr/src/java/org/apache/solr/analysis/PatternTokenizerFactory.java
@ -27,6 +27,7 @@ import java.util.regex.Pattern;

 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.pattern.PatternTokenizer;
 import org.apache.solr.common.SolrException;


--- a/solr/src/test/org/apache/solr/analysis/TestPatternReplaceCharFilterFactory.java
+++ b/solr/src/test/org/apache/solr/analysis/TestPatternReplaceCharFilterFactory.java
@ -0,0 +1,86 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.CharReader;
+import org.apache.lucene.analysis.CharStream;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
+
+/**
+ * Simple tests to ensure this factory is working
+ */
+public class TestPatternReplaceCharFilterFactory extends BaseTokenTestCase {
+  
+  //           1111
+  // 01234567890123
+  // this is test.
+  public void testNothingChange() throws IOException {
+    final String BLOCK = "this is test.";
+    PatternReplaceCharFilterFactory factory = new PatternReplaceCharFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("pattern", "(aa)\\s+(bb)\\s+(cc)");
+    args.put("replacement", "$1$2$3");
+    factory.init(args);
+    CharStream cs = factory.create(
+          CharReader.get( new StringReader( BLOCK ) ) );
+    TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs );
+    assertTokenStreamContents(ts,
+        new String[] { "this", "is", "test." },
+        new int[] { 0, 5, 8 },
+        new int[] { 4, 7, 13 });
+  }
+  
+  // 012345678
+  // aa bb cc
+  public void testReplaceByEmpty() throws IOException {
+    final String BLOCK = "aa bb cc";
+    PatternReplaceCharFilterFactory factory = new PatternReplaceCharFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("pattern", "(aa)\\s+(bb)\\s+(cc)");
+    factory.init(args);
+    CharStream cs = factory.create(
+          CharReader.get( new StringReader( BLOCK ) ) );
+    TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs );
+    assertFalse(ts.incrementToken());
+  }
+  
+  // 012345678
+  // aa bb cc
+  // aa#bb#cc
+  public void test1block1matchSameLength() throws IOException {
+    final String BLOCK = "aa bb cc";
+    PatternReplaceCharFilterFactory factory = new PatternReplaceCharFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("pattern", "(aa)\\s+(bb)\\s+(cc)");
+    args.put("replacement", "$1#$2#$3");
+    factory.init(args);
+    CharStream cs = factory.create(
+          CharReader.get( new StringReader( BLOCK ) ) );
+    TokenStream ts = new WhitespaceTokenizer(DEFAULT_VERSION, cs );
+    assertTokenStreamContents(ts,
+        new String[] { "aa#bb#cc" },
+        new int[] { 0 },
+        new int[] { 8 });
+  }
+}
--- a/solr/src/test/org/apache/solr/analysis/TestPatternReplaceFilterFactory.java
+++ b/solr/src/test/org/apache/solr/analysis/TestPatternReplaceFilterFactory.java
@ -0,0 +1,45 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.analysis;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
+
+import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Simple tests to ensure this factory is working
+ */
+public class TestPatternReplaceFilterFactory extends BaseTokenTestCase {
+
+  public void testReplaceAll() throws Exception {
+    String input = "aabfooaabfooabfoob ab caaaaaaaaab";
+    PatternReplaceFilterFactory factory = new PatternReplaceFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("pattern", "a*b");
+    args.put("replacement", "-");
+    factory.init(args);
+    TokenStream ts = factory.create
+            (new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader(input)));
+                   
+    assertTokenStreamContents(ts, 
+        new String[] { "-foo-foo-foo-", "-", "c-" });
+  }
+}
--- a/solr/src/test/org/apache/solr/analysis/TestPatternTokenizerFactory.java
+++ b/solr/src/test/org/apache/solr/analysis/TestPatternTokenizerFactory.java
@ -17,120 +17,25 @@

 package org.apache.solr.analysis;

-import java.io.IOException;
 import java.io.StringReader;
-import java.util.ArrayList;
 import java.util.HashMap;
-import java.util.List;
 import java.util.Map;

-import org.apache.lucene.analysis.CharReader;
-import org.apache.lucene.analysis.CharStream;
-import org.apache.lucene.analysis.charfilter.MappingCharFilter;
-import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

+/** Simple tests to ensure this factory is working */
 public class TestPatternTokenizerFactory extends BaseTokenTestCase 
 {
-	public void testSplitting() throws Exception 
-  {
-    String qpattern = "\\'([^\\']+)\\'"; // get stuff between "'"
-    String[][] tests = {
-      // group  pattern        input                    output
-      { "-1",   "--",          "aaa--bbb--ccc",         "aaa bbb ccc" },
-      { "-1",   ":",           "aaa:bbb:ccc",           "aaa bbb ccc" },
-      { "-1",   "\\p{Space}",  "aaa   bbb \t\tccc  ",   "aaa bbb ccc" },
-      { "-1",   ":",           "boo:and:foo",           "boo and foo" },
-      { "-1",   "o",           "boo:and:foo",           "b :and:f" },
-      { "0",    ":",           "boo:and:foo",           ": :" },
-      { "0",    qpattern,      "aaa 'bbb' 'ccc'",       "'bbb' 'ccc'" },
-      { "1",    qpattern,      "aaa 'bbb' 'ccc'",       "bbb ccc" }
-    };
-    
-    
-    Map<String,String> args = new HashMap<String, String>();
-    for( String[] test : tests ) {
-      args.put( PatternTokenizerFactory.GROUP, test[0] );
-      args.put( PatternTokenizerFactory.PATTERN, test[1] );
-
-      PatternTokenizerFactory tokenizer = new PatternTokenizerFactory();
-      tokenizer.init( args );
-      
-      TokenStream stream = tokenizer.create( new StringReader( test[2] ) );
-      String out = tsToString( stream );
-      // System.out.println( test[2] + " ==> " + out );
-
-      assertEquals("pattern: "+test[1]+" with input: "+test[2], test[3], out );
-      
-      // Make sure it is the same as if we called 'split'
-      // test disabled, as we remove empty tokens
-      /*if( "-1".equals( test[0] ) ) {
-        String[] split = test[2].split( test[1] );
-        stream = tokenizer.create( new StringReader( test[2] ) );
-        int i=0;
-        for( Token t = stream.next(); null != t; t = stream.next() ) 
-        {
-          assertEquals( "split: "+test[1] + " "+i, split[i++], new String(t.termBuffer(), 0, t.termLength()) );
-        }
-      }*/
-    } 
-	}
-	
-  public void testOffsetCorrection() throws Exception {
-    final String INPUT = "G&uuml;nther G&uuml;nther is here";
-
-    // create MappingCharFilter
-    MappingCharFilterFactory cfFactory = new MappingCharFilterFactory();
-    List<String> mappingRules = new ArrayList<String>();
-    mappingRules.add( "\"&uuml;\" => \"ü\"" );
-    NormalizeCharMap normMap = new NormalizeCharMap();
-    cfFactory.parseRules( mappingRules, normMap );
-    CharStream charStream = new MappingCharFilter( normMap, CharReader.get( new StringReader( INPUT ) ) );
+  public void testFactory() throws Exception {
+    final String INPUT = "Günther Günther is here";

    // create PatternTokenizer
    Map<String,String> args = new HashMap<String, String>();
    args.put( PatternTokenizerFactory.PATTERN, "[,;/\\s]+" );
    PatternTokenizerFactory tokFactory = new PatternTokenizerFactory();
    tokFactory.init( args );
-    TokenStream stream = tokFactory.create( charStream );
+    TokenStream stream = tokFactory.create( new StringReader(INPUT) );
    assertTokenStreamContents(stream,
-        new String[] { "Günther", "Günther", "is", "here" },
-        new int[] { 0, 13, 26, 29 },
-        new int[] { 12, 25, 28, 33 });
-    
-    charStream = new MappingCharFilter( normMap, CharReader.get( new StringReader( INPUT ) ) );
-    args.put( PatternTokenizerFactory.PATTERN, "Günther" );
-    args.put( PatternTokenizerFactory.GROUP, "0" );
-    tokFactory = new PatternTokenizerFactory();
-    tokFactory.init( args );
-    stream = tokFactory.create( charStream );
-    assertTokenStreamContents(stream,
-        new String[] { "Günther", "Günther" },
-        new int[] { 0, 13 },
-        new int[] { 12, 25 });
-  }
-  
-  /** 
-   * TODO: rewrite tests not to use string comparison.
-   * @deprecated only tests TermAttribute!
-   */
-  private static String tsToString(TokenStream in) throws IOException {
-    StringBuilder out = new StringBuilder();
-    CharTermAttribute termAtt = in.addAttribute(CharTermAttribute.class);
-    // extra safety to enforce, that the state is not preserved and also
-    // assign bogus values
-    in.clearAttributes();
-    termAtt.setEmpty().append("bogusTerm");
-    while (in.incrementToken()) {
-      if (out.length() > 0)
-        out.append(' ');
-      out.append(termAtt.toString());
-      in.clearAttributes();
-      termAtt.setEmpty().append("bogusTerm");
-    }
-
-    in.close();
-    return out.toString();
+        new String[] { "Günther", "Günther", "is", "here" });
  }
 }