LUCENE-2413: consolidate remaining solr tokenstreams into modules/analysis

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@957162 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2010-06-23 11:25:17 +00:00
parent 653c7c160b
commit 8f71031ac8
20 changed files with 604 additions and 199 deletions

View File

@ -27,11 +27,14 @@ New Features
with text contained in the required words (inverse of StopFilter).
- o.a.l.analysis.miscellaneous.HyphenatedWordsFilter: A TokenFilter that puts
hyphenated words broken into two lines back together.
- o.a.l.analysis.miscellaneous.CapitalizationFilter: A TokenFilter that applies
capitalization rules to tokens.
- o.a.l.analysis.pattern: Package for pattern-based analysis, containing a
CharFilter, Tokenizer, and TokenFilter for transforming text with regexes.
- o.a.l.analysis.synonym.SynonymFilter: A synonym filter that supports multi-word
synonyms.
(... in progress)
- o.a.l.analysis.phonetic: Package for phonetic search, containing various
phonetic encoders such as Double Metaphone.
* LUCENE-2413: Consolidated all Lucene analyzers into common.
- o.a.l.analysis.KeywordAnalyzer -> o.a.l.analysis.core.KeywordAnalyzer
@ -60,7 +63,6 @@ New Features
- o.a.l.analysis.ReusableAnalyzerBase -> o.a.l.analysis.util.ReusableAnalyzerBase
- o.a.l.analysis.StopwordAnalyzerBase -> o.a.l.analysis.util.StopwordAnalyzerBase
- o.a.l.analysis.WordListLoader -> o.a.l.analysis.util.WordListLoader
... (in progress)
Build

View File

@ -4,6 +4,10 @@ Copyright 2006 The Apache Software Foundation
This product includes software developed by
The Apache Software Foundation (http://www.apache.org/).
Includes software from other Apache Software Foundation projects,
including, but not limited to:
- Apache Commons
The snowball stemmers in
common/src/java/net/sf/snowball
were developed by Martin Porter and Richard Boulton.

View File

@ -20,7 +20,12 @@ lucene-analyzers-common-XX.jar
lucene-analyzers-icu-XX.jar
An add-on analysis library that provides improved Unicode support via
International Components for Unicode (ICU). Note: this module depends on
the ICU4j jar file (version > 4.4.0)
the ICU4j jar file (version >= 4.4.0)
lucene-analyzers-phonetic-XX.jar
An add-on analysis library that provides phonetic encoders via Apache
Commons-Codec. Note: this module depends on the commons-codec jar
file (version >= 1.4)
lucene-analyzers-smartcn-XX.jar
An add-on analysis library that provides word segmentation for Simplified
@ -32,12 +37,14 @@ lucene-analyzers-stempel-XX.jar
common/src/java
icu/src/java
phonetic/src/java
smartcn/src/java
stempel/src/java
The source code for the four libraries.
The source code for the five libraries.
common/src/test
icu/src/test
phonetic/src/test
smartcn/src/test
stempel/src/test
Unit tests for the four libraries.
Unit tests for the five libraries.

View File

@ -35,6 +35,10 @@
<ant dir="icu" />
</target>
<target name="phonetic">
<ant dir="phonetic" />
</target>
<target name="smartcn">
<ant dir="smartcn" />
</target>
@ -44,29 +48,33 @@
</target>
<target name="default" depends="compile"/>
<target name="compile" depends="common,icu,smartcn,stempel" />
<target name="compile" depends="common,icu,phonetic,smartcn,stempel" />
<target name="clean">
<ant dir="common" target="clean" />
<ant dir="icu" target="clean" />
<ant dir="phonetic" target="clean" />
<ant dir="smartcn" target="clean" />
<ant dir="stempel" target="clean" />
</target>
<target name="compile-core">
<ant dir="common" target="compile-core" />
<ant dir="icu" target="compile-core" />
<ant dir="phonetic" target="compile-core" />
<ant dir="smartcn" target="compile-core" />
<ant dir="stempel" target="compile-core" />
</target>
<target name="compile-test">
<ant dir="common" target="compile-test" />
<ant dir="icu" target="compile-test" />
<ant dir="phonetic" target="compile-test" />
<ant dir="smartcn" target="compile-test" />
<ant dir="stempel" target="compile-test" />
</target>
<target name="test">
<ant dir="common" target="test" />
<ant dir="icu" target="test" />
<ant dir="phonetic" target="test" />
<ant dir="smartcn" target="test" />
<ant dir="stempel" target="test" />
</target>
@ -76,6 +84,7 @@
<target name="dist-maven" depends="default">
<ant dir="common" target="dist-maven" />
<ant dir="icu" target="dist-maven" />
<ant dir="phonetic" target="dist-maven" />
<ant dir="smartcn" target="dist-maven" />
<ant dir="stempel" target="dist-maven" />
</target>
@ -83,6 +92,7 @@
<target name="javadocs">
<ant dir="common" target="javadocs" />
<ant dir="icu" target="javadocs" />
<ant dir="phonetic" target="javadocs" />
<ant dir="smartcn" target="javadocs" />
<ant dir="stempel" target="javadocs" />
</target>
@ -90,6 +100,7 @@
<target name="javadocs-index.html">
<ant dir="common" target="javadocs-index.html" />
<ant dir="icu" target="javadocs-index.html" />
<ant dir="phonetic" target="javadocs-index.html" />
<ant dir="smartcn" target="javadocs-index.html" />
<ant dir="stempel" target="javadocs-index.html" />
</target>

View File

@ -0,0 +1,181 @@
package org.apache.lucene.analysis.miscellaneous;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Collection;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
/**
* A filter to apply normal capitalization rules to Tokens. It will make the first letter
* capital and the rest lower case.
* <p/>
* This filter is particularly useful to build nice looking facet parameters. This filter
* is not appropriate if you intend to use a prefix query.
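* <p>
* A minimal usage sketch (the reader variable and version constant are illustrative
* assumptions, not part of this API):
* <pre>
*   TokenStream stream = new WhitespaceTokenizer(Version.LUCENE_CURRENT, reader);
*   stream = new CapitalizationFilter(stream); // with the defaults, "kiTTEN" becomes "Kitten"
* </pre>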
*/
public final class CapitalizationFilter extends TokenFilter {
public static final int DEFAULT_MAX_WORD_COUNT = Integer.MAX_VALUE;
public static final int DEFAULT_MAX_TOKEN_LENGTH = Integer.MAX_VALUE;
private final boolean onlyFirstWord;
private final CharArraySet keep;
private final boolean forceFirstLetter;
private final Collection<char[]> okPrefix;
private final int minWordLength;
private final int maxWordCount;
private final int maxTokenLength;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
/**
* Creates a CapitalizationFilter with the default parameters.
* <p>
* Calls {@link #CapitalizationFilter(TokenStream, boolean, CharArraySet, boolean, Collection, int, int, int)
* CapitalizationFilter(in, true, null, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH)}
*/
public CapitalizationFilter(TokenStream in) {
this(in, true, null, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);
}
/**
* Creates a CapitalizationFilter with the specified parameters.
* @param in input tokenstream
* @param onlyFirstWord if true, capitalize only the first word; otherwise capitalize each word
* @param keep a keep word list; words contained in this set are left as-is
* @param forceFirstLetter force the first letter to be capitalized even if it is in the keep list
* @param okPrefix do not change word capitalization if a word begins with something in this list
* @param minWordLength how long a word needs to be before capitalization is applied. If
* minWordLength is 3, "and" becomes "And" but "or" stays "or".
* @param maxWordCount if the token contains more than maxWordCount words, the capitalization is
* assumed to be correct and the token is left unchanged.
* @param maxTokenLength tokens at or above this length are passed through unchanged
*/
public CapitalizationFilter(TokenStream in, boolean onlyFirstWord, CharArraySet keep,
boolean forceFirstLetter, Collection<char[]> okPrefix, int minWordLength,
int maxWordCount, int maxTokenLength) {
super(in);
this.onlyFirstWord = onlyFirstWord;
this.keep = keep;
this.forceFirstLetter = forceFirstLetter;
this.okPrefix = okPrefix;
this.minWordLength = minWordLength;
this.maxWordCount = maxWordCount;
this.maxTokenLength = maxTokenLength;
}
@Override
public boolean incrementToken() throws IOException {
if (!input.incrementToken()) return false;
char[] termBuffer = termAtt.buffer();
int termBufferLength = termAtt.length();
char[] backup = null;
if (maxWordCount < DEFAULT_MAX_WORD_COUNT) {
//make a backup in case we exceed the word count
backup = new char[termBufferLength];
System.arraycopy(termBuffer, 0, backup, 0, termBufferLength);
}
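// tokens at or above maxTokenLength are passed through with no capitalization changes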
if (termBufferLength < maxTokenLength) {
int wordCount = 0;
int lastWordStart = 0;
for (int i = 0; i < termBufferLength; i++) {
char c = termBuffer[i];
if (c <= ' ' || c == '.') {
int len = i - lastWordStart;
if (len > 0) {
processWord(termBuffer, lastWordStart, len, wordCount++);
lastWordStart = i + 1;
i++;
}
}
}
// process the last word
if (lastWordStart < termBufferLength) {
processWord(termBuffer, lastWordStart, termBufferLength - lastWordStart, wordCount++);
}
if (wordCount > maxWordCount) {
termAtt.copyBuffer(backup, 0, termBufferLength);
}
}
return true;
}
private void processWord(char[] buffer, int offset, int length, int wordCount) {
if (length < 1) {
return;
}
if (onlyFirstWord && wordCount > 0) {
for (int i = 0; i < length; i++) {
buffer[offset + i] = Character.toLowerCase(buffer[offset + i]);
}
return;
}
if (keep != null && keep.contains(buffer, offset, length)) {
if (wordCount == 0 && forceFirstLetter) {
buffer[offset] = Character.toUpperCase(buffer[offset]);
}
return;
}
if (length < minWordLength) {
return;
}
if (okPrefix != null) {
for (char[] prefix : okPrefix) {
if (length >= prefix.length) { //don't bother checking if the buffer length is less than the prefix
boolean match = true;
for (int i = 0; i < prefix.length; i++) {
if (prefix[i] != buffer[offset + i]) {
match = false;
break;
}
}
if (match) {
return;
}
}
}
}
// We know it has at least one character
/*char[] chars = w.toCharArray();
StringBuilder word = new StringBuilder( w.length() );
word.append( Character.toUpperCase( chars[0] ) );*/
buffer[offset] = Character.toUpperCase(buffer[offset]);
for (int i = 1; i < length; i++) {
buffer[offset + i] = Character.toLowerCase(buffer[offset + i]);
}
//return word.toString();
}
}

View File

@ -0,0 +1,121 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.miscellaneous;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import static org.apache.lucene.analysis.miscellaneous.CapitalizationFilter.*;
/** Tests {@link CapitalizationFilter} */
public class TestCapitalizationFilter extends BaseTokenStreamTestCase {
public void testCapitalization() throws Exception {
CharArraySet keep = new CharArraySet(TEST_VERSION_CURRENT,
Arrays.asList("and", "the", "it", "BIG"), false);
assertCapitalizesTo("kiTTEN", new String[] { "Kitten" },
true, keep, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);
assertCapitalizesTo("and", new String[] { "And" },
true, keep, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);
assertCapitalizesTo("AnD", new String[] { "And" },
true, keep, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);
//first is not forced, but it's not a keep word, either
assertCapitalizesTo("AnD", new String[] { "And" },
true, keep, false, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);
assertCapitalizesTo("big", new String[] { "Big" },
true, keep, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);
assertCapitalizesTo("BIG", new String[] { "BIG" },
true, keep, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);
assertCapitalizesToKeyword("Hello thEre my Name is Ryan", "Hello there my name is ryan",
true, keep, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);
// now each token
assertCapitalizesTo("Hello thEre my Name is Ryan",
new String[] { "Hello", "There", "My", "Name", "Is", "Ryan" },
false, keep, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);
// now only the long words
assertCapitalizesTo("Hello thEre my Name is Ryan",
new String[] { "Hello", "There", "my", "Name", "is", "Ryan" },
false, keep, true, null, 3, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);
// without prefix
assertCapitalizesTo("McKinley",
new String[] { "Mckinley" },
true, keep, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);
// Now try some prefixes
List<char[]> okPrefix = new ArrayList<char[]>();
okPrefix.add("McK".toCharArray());
assertCapitalizesTo("McKinley",
new String[] { "McKinley" },
true, keep, true, okPrefix, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);
// now try some stuff with numbers
assertCapitalizesTo("1st 2nd third",
new String[] { "1st", "2nd", "Third" },
false, keep, false, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);
assertCapitalizesToKeyword("the The the", "The The the",
false, keep, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);
}
static void assertCapitalizesTo(Tokenizer tokenizer, String expected[],
boolean onlyFirstWord, CharArraySet keep, boolean forceFirstLetter,
Collection<char[]> okPrefix, int minWordLength, int maxWordCount,
int maxTokenLength) throws IOException {
CapitalizationFilter filter = new CapitalizationFilter(tokenizer, onlyFirstWord, keep,
forceFirstLetter, okPrefix, minWordLength, maxWordCount, maxTokenLength);
assertTokenStreamContents(filter, expected);
}
static void assertCapitalizesTo(String input, String expected[],
boolean onlyFirstWord, CharArraySet keep, boolean forceFirstLetter,
Collection<char[]> okPrefix, int minWordLength, int maxWordCount,
int maxTokenLength) throws IOException {
assertCapitalizesTo(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(input)),
expected, onlyFirstWord, keep, forceFirstLetter, okPrefix, minWordLength,
maxWordCount, maxTokenLength);
}
static void assertCapitalizesToKeyword(String input, String expected,
boolean onlyFirstWord, CharArraySet keep, boolean forceFirstLetter,
Collection<char[]> okPrefix, int minWordLength, int maxWordCount,
int maxTokenLength) throws IOException {
assertCapitalizesTo(new KeywordTokenizer(new StringReader(input)),
new String[] { expected }, onlyFirstWord, keep, forceFirstLetter, okPrefix,
minWordLength, maxWordCount, maxTokenLength);
}
}

View File

@ -0,0 +1,63 @@
<?xml version="1.0"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<project name="analyzers-phonetic" default="default">
<description>
Provides phonetic encoding support via Apache Commons Codec.
</description>
<property name="build.dir" location="../build/phonetic" />
<property name="dist.dir" location="../dist/phonetic" />
<path id="additional.dependencies">
<fileset dir="lib" includes="commons-codec-*.jar"/>
</path>
<pathconvert property="project.classpath"
targetos="unix"
refid="additional.dependencies"
/>
<import file="../../../lucene/contrib/contrib-build.xml"/>
<module-uptodate name="analysis/common" jarfile="../build/common/lucene-analyzers-common-${version}.jar"
property="analyzers-common.uptodate" classpath.property="analyzers-common.jar"/>
<path id="classpath">
<pathelement path="${analyzers-common.jar}"/>
<path refid="base.classpath"/>
</path>
<path id="test.classpath">
<pathelement path="${analyzers-common.jar}"/>
<path refid="classpath"/>
<pathelement location="../../../lucene/build/classes/test/"/>
<pathelement location="../build/common/classes/test/"/>
<path refid="junit-path"/>
<pathelement location="${build.dir}/classes/java"/>
</path>
<target name="compile-core" depends="build-analyzers-common, common.compile-core" />
<target name="build-analyzers-common" unless="analyzers-common.uptodate">
<echo>phonetic building dependency ${analyzers-common.jar}</echo>
<ant antfile="../common/build.xml" target="default" inheritall="false" dir="../common" />
</target>
</project>

View File

@ -0,0 +1,2 @@
AnyObjectId[458d432da88b0efeab640c229903fb5aad274044] was removed in git history.
Apache SVN contains full history.

View File

@ -0,0 +1,46 @@
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<modelVersion>4.0.0</modelVersion>
<parent>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-contrib</artifactId>
<version>@version@</version>
</parent>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-phonetic</artifactId>
<name>
Lucene Phonetic Filters
</name>
<version>@version@</version>
<description>
Provides phonetic encoding via Commons Codec.
</description>
<packaging>jar</packaging>
<dependencies>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>codec</artifactId>
<version>${codec-version}</version>
</dependency>
</dependencies>
</project>

View File

@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
package org.apache.lucene.analysis.phonetic;
import java.io.IOException;
import java.util.LinkedList;
@ -35,7 +35,7 @@ public final class DoubleMetaphoneFilter extends TokenFilter {
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class);
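/** Creates a DoubleMetaphoneFilter. maxCodeLength bounds the length of each emitted
* phonetic code; when inject is true, the original token is emitted in addition to its
* encoded form. */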
protected DoubleMetaphoneFilter(TokenStream input, int maxCodeLength, boolean inject) {
public DoubleMetaphoneFilter(TokenStream input, int maxCodeLength, boolean inject) {
super(input);
this.encoder.setMaxCodeLen(maxCodeLength);
this.inject = inject;

View File

@ -15,7 +15,7 @@
* limitations under the License.
*/
package org.apache.solr.analysis;
package org.apache.lucene.analysis.phonetic;
import org.apache.commons.codec.Encoder;
import org.apache.lucene.analysis.TokenFilter;
@ -28,23 +28,19 @@ import java.io.IOException;
/**
* Create tokens for phonetic matches. See:
* http://jakarta.apache.org/commons/codec/api-release/org/apache/commons/codec/language/package-summary.html
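* <p>
* A minimal sketch (the tokenizer variable and the choice of the commons-codec
* Metaphone encoder are illustrative assumptions):
* <pre>
*   TokenStream stream = new PhoneticFilter(tokenizer, new Metaphone(), true);
*   // inject=true: each phonetic code is emitted, followed by the original token
* </pre>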
*
* @version $Id$
*/
public final class PhoneticFilter extends TokenFilter
{
protected boolean inject = true;
protected Encoder encoder = null;
protected String name = null;
protected State save = null;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class);
public PhoneticFilter(TokenStream in, Encoder encoder, String name, boolean inject) {
public PhoneticFilter(TokenStream in, Encoder encoder, boolean inject) {
super(in);
this.encoder = encoder;
this.name = name;
this.inject = inject;
}

View File

@ -14,52 +14,53 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.analysis;
package org.apache.lucene.analysis.phonetic;
import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
public class DoubleMetaphoneFilterTest extends BaseTokenTestCase {
public class DoubleMetaphoneFilterTest extends BaseTokenStreamTestCase {
public void testSize4FalseInject() throws Exception {
TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international"));
TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("international"));
TokenStream filter = new DoubleMetaphoneFilter(stream, 4, false);
assertTokenStreamContents(filter, new String[] { "ANTR" });
}
public void testSize4TrueInject() throws Exception {
TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international"));
TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("international"));
TokenStream filter = new DoubleMetaphoneFilter(stream, 4, true);
assertTokenStreamContents(filter, new String[] { "international", "ANTR" });
}
public void testAlternateInjectFalse() throws Exception {
TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Kuczewski"));
TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("Kuczewski"));
TokenStream filter = new DoubleMetaphoneFilter(stream, 4, false);
assertTokenStreamContents(filter, new String[] { "KSSK", "KXFS" });
}
public void testSize8FalseInject() throws Exception {
TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("international"));
TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("international"));
TokenStream filter = new DoubleMetaphoneFilter(stream, 8, false);
assertTokenStreamContents(filter, new String[] { "ANTRNXNL" });
}
public void testNonConvertableStringsWithInject() throws Exception {
TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("12345 #$%@#^%&"));
TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("12345 #$%@#^%&"));
TokenStream filter = new DoubleMetaphoneFilter(stream, 8, true);
assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&" });
}
public void testNonConvertableStringsWithoutInject() throws Exception {
TokenStream stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("12345 #$%@#^%&"));
TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("12345 #$%@#^%&"));
TokenStream filter = new DoubleMetaphoneFilter(stream, 8, false);
assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&" });
// should have something after the stream
stream = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("12345 #$%@#^%& hello"));
stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("12345 #$%@#^%& hello"));
filter = new DoubleMetaphoneFilter(stream, 8, false);
assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&", "HL" });
}

View File

@ -0,0 +1,73 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.phonetic;
import java.io.StringReader;
import org.apache.commons.codec.Encoder;
import org.apache.commons.codec.language.Caverphone;
import org.apache.commons.codec.language.DoubleMetaphone;
import org.apache.commons.codec.language.Metaphone;
import org.apache.commons.codec.language.RefinedSoundex;
import org.apache.commons.codec.language.Soundex;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
/**
* Tests {@link PhoneticFilter}
*/
public class TestPhoneticFilter extends BaseTokenStreamTestCase {
public void testAlgorithms() throws Exception {
assertAlgorithm(new Metaphone(), true, "aaa bbb ccc easgasg",
new String[] { "A", "aaa", "B", "bbb", "KKK", "ccc", "ESKS", "easgasg" });
assertAlgorithm(new Metaphone(), false, "aaa bbb ccc easgasg",
new String[] { "A", "B", "KKK", "ESKS" });
assertAlgorithm(new DoubleMetaphone(), true, "aaa bbb ccc easgasg",
new String[] { "A", "aaa", "PP", "bbb", "KK", "ccc", "ASKS", "easgasg" });
assertAlgorithm(new DoubleMetaphone(), false, "aaa bbb ccc easgasg",
new String[] { "A", "PP", "KK", "ASKS" });
assertAlgorithm(new Soundex(), true, "aaa bbb ccc easgasg",
new String[] { "A000", "aaa", "B000", "bbb", "C000", "ccc", "E220", "easgasg" });
assertAlgorithm(new Soundex(), false, "aaa bbb ccc easgasg",
new String[] { "A000", "B000", "C000", "E220" });
assertAlgorithm(new RefinedSoundex(), true, "aaa bbb ccc easgasg",
new String[] { "A0", "aaa", "B1", "bbb", "C3", "ccc", "E034034", "easgasg" });
assertAlgorithm(new RefinedSoundex(), false, "aaa bbb ccc easgasg",
new String[] { "A0", "B1", "C3", "E034034" });
assertAlgorithm(new Caverphone(), true, "Darda Karleen Datha Carlene",
new String[] { "TTA1111111", "Darda", "KLN1111111", "Karleen",
"TTA1111111", "Datha", "KLN1111111", "Carlene" });
assertAlgorithm(new Caverphone(), false, "Darda Karleen Datha Carlene",
new String[] { "TTA1111111", "KLN1111111", "TTA1111111", "KLN1111111" });
}
static void assertAlgorithm(Encoder encoder, boolean inject, String input,
String[] expected) throws Exception {
Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
new StringReader(input));
PhoneticFilter filter = new PhoneticFilter(tokenizer, encoder, inject);
assertTokenStreamContents(filter, expected);
}
}

View File

@ -147,6 +147,7 @@
<path id="lucene.classpath">
<pathelement location="${common-solr.dir}/../lucene/build/classes/java" />
<pathelement location="${common-solr.dir}/../modules/analysis/build/common/classes/java" />
<pathelement location="${common-solr.dir}/../modules/analysis/build/phonetic/classes/java" />
<pathelement location="${common-solr.dir}/../lucene/build/contrib/highlighter/classes/java" />
<pathelement location="${common-solr.dir}/../lucene/build/contrib/memory/classes/java" />
<pathelement location="${common-solr.dir}/../lucene/build/contrib/misc/classes/java" />
@ -162,6 +163,7 @@
</subant>
<subant target="jar" inheritall="false" failonerror="true">
<fileset dir="../modules/analysis/common" includes="build.xml" />
<fileset dir="../modules/analysis/phonetic" includes="build.xml" />
<fileset dir="../lucene/contrib/highlighter" includes="build.xml" />
<fileset dir="../lucene/contrib/memory" includes="build.xml" />
<fileset dir="../lucene/contrib/misc" includes="build.xml" />
@ -181,6 +183,9 @@
<fileset dir="../modules/analysis/build/common">
<include name="lucene-analyzers-common-${version}.jar" />
</fileset>
<fileset dir="../modules/analysis/build/phonetic">
<include name="lucene-analyzers-phonetic-${version}.jar" />
</fileset>
<fileset dir="../lucene/build/contrib/highlighter">
<include name="lucene-highlighter-${version}.jar" />
</fileset>
@ -206,6 +211,7 @@
<property name="lucene-compiled" value="true"/>
<subant target="default">
<fileset dir="../modules/analysis/common" includes="build.xml"/>
<fileset dir="../modules/analysis/phonetic" includes="build.xml"/>
<fileset dir="../lucene/contrib/highlighter" includes="build.xml"/>
<fileset dir="../lucene/contrib/memory" includes="build.xml"/>
<fileset dir="../lucene/contrib/misc" includes="build.xml"/>

View File

@ -17,11 +17,10 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.CapitalizationFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
@ -29,11 +28,7 @@ import java.util.Map;
import java.util.StringTokenizer;
/**
* A filter to apply normal capitalization rules to Tokens. It will make the first letter
* capital and the rest lower case.
* <p/>
* This filter is particularly useful to build nice looking facet parameters. This filter
* is not appropriate if you intend to use a prefix query.
* Factory for {@link CapitalizationFilter}.
* <p/>
* The factory takes parameters:<br/>
* "onlyFirstWord" - should each word be capitalized or all of the words?<br/>
@ -52,7 +47,6 @@ import java.util.StringTokenizer;
* @since solr 1.3
*/
public class CapitalizationFilterFactory extends BaseTokenFilterFactory {
public static final int DEFAULT_MAX_WORD_COUNT = Integer.MAX_VALUE;
public static final String KEEP = "keep";
public static final String KEEP_IGNORE_CASE = "keepIgnoreCase";
public static final String OK_PREFIX = "okPrefix";
@ -68,8 +62,8 @@ public class CapitalizationFilterFactory extends BaseTokenFilterFactory {
Collection<char[]> okPrefix = Collections.emptyList(); // for example: McK
int minWordLength = 0; // don't modify capitalization for words shorter than this
int maxWordCount = DEFAULT_MAX_WORD_COUNT;
int maxTokenLength = DEFAULT_MAX_WORD_COUNT;
int maxWordCount = CapitalizationFilter.DEFAULT_MAX_WORD_COUNT;
int maxTokenLength = CapitalizationFilter.DEFAULT_MAX_TOKEN_LENGTH;
boolean onlyFirstWord = true;
boolean forceFirstLetter = true; // make sure the first letter is capital even if it is in the keep list
@ -128,116 +122,8 @@ public class CapitalizationFilterFactory extends BaseTokenFilterFactory {
}
}
public void processWord(char[] buffer, int offset, int length, int wordCount) {
if (length < 1) {
return;
}
if (onlyFirstWord && wordCount > 0) {
for (int i = 0; i < length; i++) {
buffer[offset + i] = Character.toLowerCase(buffer[offset + i]);
}
return;
}
if (keep != null && keep.contains(buffer, offset, length)) {
if (wordCount == 0 && forceFirstLetter) {
buffer[offset] = Character.toUpperCase(buffer[offset]);
}
return;
}
if (length < minWordLength) {
return;
}
for (char[] prefix : okPrefix) {
if (length >= prefix.length) { //don't bother checking if the buffer length is less than the prefix
boolean match = true;
for (int i = 0; i < prefix.length; i++) {
if (prefix[i] != buffer[offset + i]) {
match = false;
break;
}
}
if (match == true) {
return;
}
}
}
// We know it has at least one character
/*char[] chars = w.toCharArray();
StringBuilder word = new StringBuilder( w.length() );
word.append( Character.toUpperCase( chars[0] ) );*/
buffer[offset] = Character.toUpperCase(buffer[offset]);
for (int i = 1; i < length; i++) {
buffer[offset + i] = Character.toLowerCase(buffer[offset + i]);
}
//return word.toString();
}
public CapitalizationFilter create(TokenStream input) {
return new CapitalizationFilter(input, this);
return new CapitalizationFilter(input, onlyFirstWord, keep,
forceFirstLetter, okPrefix, minWordLength, maxWordCount, maxTokenLength);
}
}
/**
* This relies on the Factory so that the difficult stuff does not need to be
* re-initialized each time the filter runs.
* <p/>
* This is package protected since it is not useful without the Factory
*/
final class CapitalizationFilter extends TokenFilter {
private final CapitalizationFilterFactory factory;
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
public CapitalizationFilter(TokenStream in, final CapitalizationFilterFactory factory) {
super(in);
this.factory = factory;
}
@Override
public boolean incrementToken() throws IOException {
if (!input.incrementToken()) return false;
char[] termBuffer = termAtt.buffer();
int termBufferLength = termAtt.length();
char[] backup = null;
if (factory.maxWordCount < CapitalizationFilterFactory.DEFAULT_MAX_WORD_COUNT) {
//make a backup in case we exceed the word count
backup = new char[termBufferLength];
System.arraycopy(termBuffer, 0, backup, 0, termBufferLength);
}
if (termBufferLength < factory.maxTokenLength) {
int wordCount = 0;
int lastWordStart = 0;
for (int i = 0; i < termBufferLength; i++) {
char c = termBuffer[i];
if (c <= ' ' || c == '.') {
int len = i - lastWordStart;
if (len > 0) {
factory.processWord(termBuffer, lastWordStart, len, wordCount++);
lastWordStart = i + 1;
i++;
}
}
}
// process the last word
if (lastWordStart < termBufferLength) {
factory.processWord(termBuffer, lastWordStart, termBufferLength - lastWordStart, wordCount++);
}
if (wordCount > factory.maxWordCount) {
termAtt.copyBuffer(backup, 0, termBufferLength);
}
}
return true;
}
}

View File

@ -19,6 +19,7 @@ package org.apache.solr.analysis;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.phonetic.DoubleMetaphoneFilter;
public class DoubleMetaphoneFilterFactory extends BaseTokenFilterFactory
{

View File

@ -29,6 +29,7 @@ import org.apache.commons.codec.language.Metaphone;
import org.apache.commons.codec.language.RefinedSoundex;
import org.apache.commons.codec.language.Soundex;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.phonetic.PhoneticFilter;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.util.StrUtils;
@ -96,6 +97,6 @@ public class PhoneticFilterFactory extends BaseTokenFilterFactory
}
public PhoneticFilter create(TokenStream input) {
return new PhoneticFilter(input,encoder,name,inject);
return new PhoneticFilter(input,encoder,inject);
}
}

View File

@ -22,6 +22,7 @@ import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.phonetic.DoubleMetaphoneFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
public class DoubleMetaphoneFilterFactoryTest extends BaseTokenTestCase {

View File

@ -30,7 +30,7 @@ import org.apache.lucene.analysis.core.WhitespaceTokenizer;
/**
*
*/
public class TestCapitalizationFilter extends BaseTokenTestCase {
public class TestCapitalizationFilterFactory extends BaseTokenTestCase {
public void testCapitalization() throws Exception
{
@ -40,74 +40,78 @@ public class TestCapitalizationFilter extends BaseTokenTestCase {
CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
factory.init( args );
char[] termBuffer;
termBuffer = "kiTTEN".toCharArray();
factory.processWord(termBuffer, 0, termBuffer.length, 0 );
assertEquals( "Kitten", new String(termBuffer, 0, termBuffer.length));
assertTokenStreamContents(factory.create(
new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("kiTTEN"))),
new String[] { "Kitten" });
factory.forceFirstLetter = true;
termBuffer = "and".toCharArray();
factory.processWord(termBuffer, 0, termBuffer.length, 0 );
assertEquals( "And", new String(termBuffer, 0, termBuffer.length));//first is forced
assertTokenStreamContents(factory.create(
new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("and"))),
new String[] { "And" });
termBuffer = "AnD".toCharArray();
factory.processWord(termBuffer, 0, termBuffer.length, 0 );
assertEquals( "And", new String(termBuffer, 0, termBuffer.length));//first is forced, but it's not a keep word, either
//first is forced, but it's not a keep word, either
assertTokenStreamContents(factory.create(
new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("AnD"))),
new String[] { "And" });
factory.forceFirstLetter = false;
termBuffer = "AnD".toCharArray();
factory.processWord(termBuffer, 0, termBuffer.length, 0 );
assertEquals( "And", new String(termBuffer, 0, termBuffer.length)); //first is not forced, but it's not a keep word, either
//first is not forced, but it's not a keep word, either
assertTokenStreamContents(factory.create(
new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("AnD"))),
new String[] { "And" });
factory.forceFirstLetter = true;
termBuffer = "big".toCharArray();
factory.processWord(termBuffer, 0, termBuffer.length, 0 );
assertEquals( "Big", new String(termBuffer, 0, termBuffer.length));
termBuffer = "BIG".toCharArray();
factory.processWord(termBuffer, 0, termBuffer.length, 0 );
assertEquals( "BIG", new String(termBuffer, 0, termBuffer.length));
Tokenizer tokenizer = new KeywordTokenizer(new StringReader("Hello thEre my Name is Ryan"));
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "Hello there my name is ryan" });
assertTokenStreamContents(factory.create(
new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("big"))),
new String[] { "Big" });
assertTokenStreamContents(factory.create(
new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("BIG"))),
new String[] { "BIG" });
assertTokenStreamContents(factory.create(
new KeywordTokenizer(new StringReader("Hello thEre my Name is Ryan"))),
new String[] { "Hello there my name is ryan" });
// now each token
factory.onlyFirstWord = false;
tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Hello thEre my Name is Ryan"));
stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "Hello", "There", "My", "Name", "Is", "Ryan" });
assertTokenStreamContents(factory.create(
new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Hello thEre my Name is Ryan"))),
new String[] { "Hello", "There", "My", "Name", "Is", "Ryan" });
// now only the long words
factory.minWordLength = 3;
tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Hello thEre my Name is Ryan" ));
stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "Hello", "There", "my", "Name", "is", "Ryan" });
assertTokenStreamContents(factory.create(
new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("Hello thEre my Name is Ryan"))),
new String[] { "Hello", "There", "my", "Name", "is", "Ryan" });
// without prefix
tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("McKinley" ));
stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "Mckinley" });
assertTokenStreamContents(factory.create(
new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("McKinley"))),
new String[] { "Mckinley" });
// Now try some prefixes
factory = new CapitalizationFilterFactory();
args.put( "okPrefix", "McK" ); // all words
factory.init( args );
tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("McKinley" ));
stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "McKinley" });
assertTokenStreamContents(factory.create(
new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("McKinley"))),
new String[] { "McKinley" });
// now try some stuff with numbers
factory.forceFirstLetter = false;
factory.onlyFirstWord = false;
tokenizer = new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("1st 2nd third" ));
stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "1st", "2nd", "Third" });
assertTokenStreamContents(factory.create(
new WhitespaceTokenizer(DEFAULT_VERSION, new StringReader("1st 2nd third"))),
new String[] { "1st", "2nd", "Third" });
factory.forceFirstLetter = true;
tokenizer = new KeywordTokenizer(new StringReader("the The the" ));
stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "The The the" });
factory.forceFirstLetter = true;
assertTokenStreamContents(factory.create(
new KeywordTokenizer(new StringReader("the The the"))),
new String[] { "The The the" });
}
public void testKeepIgnoreCase() throws Exception {
@ -118,21 +122,20 @@ public class TestCapitalizationFilter extends BaseTokenTestCase {
CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
factory.init( args );
char[] termBuffer;
termBuffer = "kiTTEN".toCharArray();
factory.forceFirstLetter = true;
factory.processWord(termBuffer, 0, termBuffer.length, 0 );
assertEquals( "KiTTEN", new String(termBuffer, 0, termBuffer.length));
assertTokenStreamContents(factory.create(
new KeywordTokenizer(new StringReader("kiTTEN"))),
new String[] { "KiTTEN" });
factory.forceFirstLetter = false;
termBuffer = "kiTTEN".toCharArray();
factory.processWord(termBuffer, 0, termBuffer.length, 0 );
assertEquals( "kiTTEN", new String(termBuffer, 0, termBuffer.length));
assertTokenStreamContents(factory.create(
new KeywordTokenizer(new StringReader("kiTTEN"))),
new String[] { "kiTTEN" });
factory.keep = null;
termBuffer = "kiTTEN".toCharArray();
factory.processWord(termBuffer, 0, termBuffer.length, 0 );
assertEquals( "Kitten", new String(termBuffer, 0, termBuffer.length));
assertTokenStreamContents(factory.create(
new KeywordTokenizer(new StringReader("kiTTEN"))),
new String[] { "Kitten" });
}
/**

View File

@ -30,7 +30,7 @@ import org.apache.lucene.analysis.core.WhitespaceTokenizer;
/**
* @version $Id$
*/
public class TestPhoneticFilter extends BaseTokenTestCase {
public class TestPhoneticFilterFactory extends BaseTokenTestCase {
public void testFactory()
{