mirror of https://github.com/apache/lucene.git
SOLR-822: Add CharFilter so that characters can be filtered before Tokenizer/TokenFilters.
git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@713902 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
4d7731fc90
commit
eb0ec4a3e2
|
@ -82,6 +82,9 @@ New Features
|
|||
DirectoryProvider will use NIOFSDirectory for better concurrency
|
||||
on non Windows platforms. (Mark Miller, TJ Laurenzo via yonik)
|
||||
|
||||
15. SOLR-822: Add CharFilter so that characters can be filtered (e.g. character normalization)
|
||||
before Tokenizer/TokenFilters. (koji)
|
||||
|
||||
Optimizations
|
||||
----------------------
|
||||
1. SOLR-374: Use IndexReader.reopen to save resources by re-using parts of the
|
||||
|
|
|
@ -0,0 +1,246 @@
|
|||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Syntax:
|
||||
# "source" => "target"
|
||||
# "source".length() > 0 (source cannot be empty.)
|
||||
# "target".length() >= 0 (target can be empty.)
|
||||
|
||||
# example:
|
||||
# "À" => "A"
|
||||
# "\u00C0" => "A"
|
||||
# "\u00C0" => "\u0041"
|
||||
# "ß" => "ss"
|
||||
# "\t" => " "
|
||||
# "\n" => ""
|
||||
|
||||
# À => A
|
||||
"\u00C0" => "A"
|
||||
|
||||
# Á => A
|
||||
"\u00C1" => "A"
|
||||
|
||||
# Â => A
|
||||
"\u00C2" => "A"
|
||||
|
||||
# Ã => A
|
||||
"\u00C3" => "A"
|
||||
|
||||
# Ä => A
|
||||
"\u00C4" => "A"
|
||||
|
||||
# Å => A
|
||||
"\u00C5" => "A"
|
||||
|
||||
# Æ => AE
|
||||
"\u00C6" => "AE"
|
||||
|
||||
# Ç => C
|
||||
"\u00C7" => "C"
|
||||
|
||||
# È => E
|
||||
"\u00C8" => "E"
|
||||
|
||||
# É => E
|
||||
"\u00C9" => "E"
|
||||
|
||||
# Ê => E
|
||||
"\u00CA" => "E"
|
||||
|
||||
# Ë => E
|
||||
"\u00CB" => "E"
|
||||
|
||||
# Ì => I
|
||||
"\u00CC" => "I"
|
||||
|
||||
# Í => I
|
||||
"\u00CD" => "I"
|
||||
|
||||
# Î => I
|
||||
"\u00CE" => "I"
|
||||
|
||||
# Ï => I
|
||||
"\u00CF" => "I"
|
||||
|
||||
# IJ => IJ
|
||||
"\u0132" => "IJ"
|
||||
|
||||
# Ð => D
|
||||
"\u00D0" => "D"
|
||||
|
||||
# Ñ => N
|
||||
"\u00D1" => "N"
|
||||
|
||||
# Ò => O
|
||||
"\u00D2" => "O"
|
||||
|
||||
# Ó => O
|
||||
"\u00D3" => "O"
|
||||
|
||||
# Ô => O
|
||||
"\u00D4" => "O"
|
||||
|
||||
# Õ => O
|
||||
"\u00D5" => "O"
|
||||
|
||||
# Ö => O
|
||||
"\u00D6" => "O"
|
||||
|
||||
# Ø => O
|
||||
"\u00D8" => "O"
|
||||
|
||||
# Œ => OE
|
||||
"\u0152" => "OE"
|
||||
|
||||
# Þ => TH
|
||||
"\u00DE" => "TH"
|
||||
|
||||
# Ù => U
|
||||
"\u00D9" => "U"
|
||||
|
||||
# Ú => U
|
||||
"\u00DA" => "U"
|
||||
|
||||
# Û => U
|
||||
"\u00DB" => "U"
|
||||
|
||||
# Ü => U
|
||||
"\u00DC" => "U"
|
||||
|
||||
# Ý => Y
|
||||
"\u00DD" => "Y"
|
||||
|
||||
# Ÿ => Y
|
||||
"\u0178" => "Y"
|
||||
|
||||
# à => a
|
||||
"\u00E0" => "a"
|
||||
|
||||
# á => a
|
||||
"\u00E1" => "a"
|
||||
|
||||
# â => a
|
||||
"\u00E2" => "a"
|
||||
|
||||
# ã => a
|
||||
"\u00E3" => "a"
|
||||
|
||||
# ä => a
|
||||
"\u00E4" => "a"
|
||||
|
||||
# å => a
|
||||
"\u00E5" => "a"
|
||||
|
||||
# æ => ae
|
||||
"\u00E6" => "ae"
|
||||
|
||||
# ç => c
|
||||
"\u00E7" => "c"
|
||||
|
||||
# è => e
|
||||
"\u00E8" => "e"
|
||||
|
||||
# é => e
|
||||
"\u00E9" => "e"
|
||||
|
||||
# ê => e
|
||||
"\u00EA" => "e"
|
||||
|
||||
# ë => e
|
||||
"\u00EB" => "e"
|
||||
|
||||
# ì => i
|
||||
"\u00EC" => "i"
|
||||
|
||||
# í => i
|
||||
"\u00ED" => "i"
|
||||
|
||||
# î => i
|
||||
"\u00EE" => "i"
|
||||
|
||||
# ï => i
|
||||
"\u00EF" => "i"
|
||||
|
||||
# ij => ij
|
||||
"\u0133" => "ij"
|
||||
|
||||
# ð => d
|
||||
"\u00F0" => "d"
|
||||
|
||||
# ñ => n
|
||||
"\u00F1" => "n"
|
||||
|
||||
# ò => o
|
||||
"\u00F2" => "o"
|
||||
|
||||
# ó => o
|
||||
"\u00F3" => "o"
|
||||
|
||||
# ô => o
|
||||
"\u00F4" => "o"
|
||||
|
||||
# õ => o
|
||||
"\u00F5" => "o"
|
||||
|
||||
# ö => o
|
||||
"\u00F6" => "o"
|
||||
|
||||
# ø => o
|
||||
"\u00F8" => "o"
|
||||
|
||||
# œ => oe
|
||||
"\u0153" => "oe"
|
||||
|
||||
# ß => ss
|
||||
"\u00DF" => "ss"
|
||||
|
||||
# þ => th
|
||||
"\u00FE" => "th"
|
||||
|
||||
# ù => u
|
||||
"\u00F9" => "u"
|
||||
|
||||
# ú => u
|
||||
"\u00FA" => "u"
|
||||
|
||||
# û => u
|
||||
"\u00FB" => "u"
|
||||
|
||||
# ü => u
|
||||
"\u00FC" => "u"
|
||||
|
||||
# ý => y
|
||||
"\u00FD" => "y"
|
||||
|
||||
# ÿ => y
|
||||
"\u00FF" => "y"
|
||||
|
||||
# ff => ff
|
||||
"\uFB00" => "ff"
|
||||
|
||||
# fi => fi
|
||||
"\uFB01" => "fi"
|
||||
|
||||
# fl => fl
|
||||
"\uFB02" => "fl"
|
||||
|
||||
# ffi => ffi
|
||||
"\uFB03" => "ffi"
|
||||
|
||||
# ffl => ffl
|
||||
"\uFB04" => "ffl"
|
||||
|
||||
# ſt => ft
|
||||
"\uFB05" => "ft"
|
||||
|
||||
# st => st
|
||||
"\uFB06" => "st"
|
|
@ -215,6 +215,16 @@
|
|||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
<!-- charFilter + "CharStream aware" WhitespaceTokenizer -->
|
||||
<!--
|
||||
<fieldType name="textCharNorm" class="solr.TextField" positionIncrementGap="100" >
|
||||
<analyzer>
|
||||
<charFilter class="solr.MappingCharFilterFactory" mapping="mapping-ISOLatin1Accent.txt"/>
|
||||
<tokenizer class="solr.CharStreamAwareWhitespaceTokenizerFactory"/>
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
-->
|
||||
|
||||
<!-- This is an example of using the KeywordTokenizer along
|
||||
with various TokenFilterFactories to produce a sortable field
|
||||
that does not include some properties of the source text
|
||||
|
|
|
@ -0,0 +1,67 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
*
|
||||
* @version $Id$
|
||||
* @since Solr 1.4
|
||||
*
|
||||
*/
|
||||
public abstract class BaseCharFilter extends CharFilter {
|
||||
|
||||
protected List<PosCorrectMap> pcmList;
|
||||
|
||||
public BaseCharFilter( CharStream in ){
|
||||
super(in);
|
||||
pcmList = new ArrayList<PosCorrectMap>();
|
||||
}
|
||||
|
||||
protected int correctPosition( int currentPos ){
|
||||
if( pcmList.isEmpty() ) return currentPos;
|
||||
for( int i = pcmList.size() - 1; i >= 0; i-- ){
|
||||
if( currentPos >= pcmList.get( i ).pos )
|
||||
return currentPos + pcmList.get( i ).cumulativeDiff;
|
||||
}
|
||||
return currentPos;
|
||||
}
|
||||
|
||||
protected static class PosCorrectMap {
|
||||
|
||||
protected int pos;
|
||||
protected int cumulativeDiff;
|
||||
|
||||
public PosCorrectMap( int pos, int cumulativeDiff ){
|
||||
this.pos = pos;
|
||||
this.cumulativeDiff = cumulativeDiff;
|
||||
}
|
||||
|
||||
public String toString(){
|
||||
StringBuffer sb = new StringBuffer();
|
||||
sb.append('(');
|
||||
sb.append(pos);
|
||||
sb.append(',');
|
||||
sb.append(cumulativeDiff);
|
||||
sb.append(')');
|
||||
return sb.toString();
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,46 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
/**
|
||||
*
|
||||
* @version $Id$
|
||||
* @since Solr 1.4
|
||||
*
|
||||
*/
|
||||
public abstract class BaseCharFilterFactory implements CharFilterFactory {
|
||||
|
||||
public static final Logger log = LoggerFactory.getLogger(BaseCharFilterFactory.class);
|
||||
|
||||
/** The init args */
|
||||
protected Map<String,String> args;
|
||||
|
||||
public Map<String, String> getArgs() {
|
||||
return args;
|
||||
}
|
||||
|
||||
public void init(Map<String, String> args) {
|
||||
this.args = args;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,63 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
*
|
||||
* Subclasses of CharFilter can be chained to filter CharStream.
|
||||
*
|
||||
* @version $Id$
|
||||
* @since Solr 1.4
|
||||
*
|
||||
*/
|
||||
public abstract class CharFilter extends CharStream {
|
||||
|
||||
protected CharStream input;
|
||||
|
||||
protected CharFilter( CharStream in ){
|
||||
input = in;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* Subclass may want to override to correct the current position.
|
||||
*
|
||||
* @param pos current position
|
||||
* @return corrected position
|
||||
*/
|
||||
protected int correctPosition( int pos ){
|
||||
return pos;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final int correctOffset(int currentOff) {
|
||||
return input.correctOffset( correctPosition( currentOff ) );
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
input.close();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int read(char[] cbuf, int off, int len) throws IOException {
|
||||
return input.read(cbuf, off, len);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,32 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
 * Factory contract for producing {@link CharStream} filters. Implementations
 * are initialized once with configuration arguments and then asked to wrap
 * an incoming CharStream with a filtering CharStream.
 *
 * @version $Id$
 * @since Solr 1.4
 *
 */
public interface CharFilterFactory {
  /** Initializes the factory with its configuration arguments. */
  public void init(Map<String,String> args);
  /** Returns the arguments passed to {@link #init(Map)}. */
  public Map<String,String> getArgs();
  /** Wraps {@code input} with a filtering CharStream. */
  public CharStream create(CharStream input);
}
|
|
@ -0,0 +1,52 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
|
||||
/**
|
||||
* CharReader is a Reader wrapper. It reads chars from Reader and outputs CharStream.
|
||||
*
|
||||
* @version $Id$
|
||||
* @since Solr 1.4
|
||||
*
|
||||
*/
|
||||
public final class CharReader extends CharStream {
|
||||
|
||||
protected Reader input;
|
||||
|
||||
public CharReader( Reader in ){
|
||||
input = in;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int correctOffset(int currentOff) {
|
||||
return currentOff;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
input.close();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int read(char[] cbuf, int off, int len) throws IOException {
|
||||
return input.read(cbuf, off, len );
|
||||
}
|
||||
}
|
|
@ -0,0 +1,38 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import java.io.Reader;
|
||||
|
||||
/**
 * CharStream adds <a href="#correctOffset(int)">correctOffset</a> functionality over Reader.
 * Tokenizers that are "CharStream aware" call correctOffset so that token
 * start/end offsets refer to positions in the original, unfiltered input.
 *
 * @version $Id$
 * @since Solr 1.4
 *
 */
public abstract class CharStream extends Reader {

  /**
   * called by CharFilter(s) and Tokenizer to correct token offset.
   *
   * @param currentOff current offset
   * @return corrected token offset
   */
  public abstract int correctOffset( int currentOff );
}
|
|
@ -0,0 +1,276 @@
|
|||
package org.apache.solr.analysis;
|
||||
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
|
||||
import java.io.Reader;
|
||||
|
||||
|
||||
/**
|
||||
* CJKTokenizer was modified from StopTokenizer which does a decent job for
|
||||
* most European languages. It performs other token methods for double-byte
|
||||
* Characters: the token will return at each two characters with overlap match.<br>
|
||||
* Example: "java C1C2C3C4" will be segment to: "java" "C1C2" "C2C3" "C3C4" it
|
||||
* also need filter filter zero length token ""<br>
|
||||
* for Digit: digit, '+', '#' will token as letter<br>
|
||||
* for more info on Asia language(Chinese Japanese Korean) text segmentation:
|
||||
* please search <a
|
||||
* href="http://www.google.com/search?q=word+chinese+segment">google</a>
|
||||
*
|
||||
*/
|
||||
|
||||
/*
|
||||
* LUCENE-973 is applied
|
||||
*/
|
||||
/**
|
||||
*
|
||||
* @version $Id$
|
||||
* @since Solr 1.4
|
||||
*
|
||||
*/
|
||||
public final class CharStreamAwareCJKTokenizer extends Tokenizer {
|
||||
//~ Static fields/initializers ---------------------------------------------
|
||||
/** Word token type */
|
||||
static final int WORD_TYPE = 0;
|
||||
|
||||
/** Single byte token type */
|
||||
static final int SINGLE_TOKEN_TYPE = 1;
|
||||
|
||||
/** Double byte token type */
|
||||
static final int DOUBLE_TOKEN_TYPE = 2;
|
||||
|
||||
/** Names for token types */
|
||||
static final String[] TOKEN_TYPE_NAMES = { "word", "single", "double" };
|
||||
|
||||
/** Max word length */
|
||||
private static final int MAX_WORD_LEN = 255;
|
||||
|
||||
/** buffer size: */
|
||||
private static final int IO_BUFFER_SIZE = 256;
|
||||
|
||||
//~ Instance fields --------------------------------------------------------
|
||||
|
||||
/** word offset, used to imply which character(in ) is parsed */
|
||||
private int offset = 0;
|
||||
|
||||
/** the index used only for ioBuffer */
|
||||
private int bufferIndex = 0;
|
||||
|
||||
/** data length */
|
||||
private int dataLen = 0;
|
||||
|
||||
/**
|
||||
* character buffer, store the characters which are used to compose <br>
|
||||
* the returned Token
|
||||
*/
|
||||
private final char[] buffer = new char[MAX_WORD_LEN];
|
||||
|
||||
/**
|
||||
* I/O buffer, used to store the content of the input(one of the <br>
|
||||
* members of Tokenizer)
|
||||
*/
|
||||
private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
|
||||
|
||||
/** word type: single=>ASCII double=>non-ASCII word=>default */
|
||||
private int tokenType = WORD_TYPE;
|
||||
|
||||
/**
|
||||
* tag: previous character is a cached double-byte character "C1C2C3C4"
|
||||
* ----(set the C1 isTokened) C1C2 "C2C3C4" ----(set the C2 isTokened)
|
||||
* C1C2 C2C3 "C3C4" ----(set the C3 isTokened) "C1C2 C2C3 C3C4"
|
||||
*/
|
||||
private boolean preIsTokened = false;
|
||||
|
||||
//~ Constructors -----------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Construct a token stream processing the given input.
|
||||
*
|
||||
* @param in I/O reader
|
||||
*/
|
||||
public CharStreamAwareCJKTokenizer(CharStream in) {
|
||||
input = in;
|
||||
}
|
||||
|
||||
//~ Methods ----------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* Returns the next token in the stream, or null at EOS.
|
||||
* See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html
|
||||
* for detail.
|
||||
*
|
||||
* @param reusableToken a reusable token
|
||||
* @return Token
|
||||
*
|
||||
* @throws java.io.IOException - throw IOException when read error <br>
|
||||
* happened in the InputStream
|
||||
*
|
||||
*/
|
||||
public final Token next(final Token reusableToken) throws java.io.IOException {
|
||||
/** how many character(s) has been stored in buffer */
|
||||
assert reusableToken != null;
|
||||
int length = 0;
|
||||
|
||||
/** the position used to create Token */
|
||||
int start = offset;
|
||||
|
||||
while (true) {
|
||||
/** current character */
|
||||
char c;
|
||||
|
||||
/** unicode block of current character for detail */
|
||||
Character.UnicodeBlock ub;
|
||||
|
||||
offset++;
|
||||
|
||||
if (bufferIndex >= dataLen) {
|
||||
dataLen = input.read(ioBuffer);
|
||||
bufferIndex = 0;
|
||||
}
|
||||
|
||||
if (dataLen == -1) {
|
||||
if (length > 0) {
|
||||
if (preIsTokened == true) {
|
||||
length = 0;
|
||||
preIsTokened = false;
|
||||
}
|
||||
|
||||
break;
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
} else {
|
||||
//get current character
|
||||
c = ioBuffer[bufferIndex++];
|
||||
|
||||
//get the UnicodeBlock of the current character
|
||||
ub = Character.UnicodeBlock.of(c);
|
||||
}
|
||||
|
||||
//if the current character is ASCII or Extend ASCII
|
||||
if ((ub == Character.UnicodeBlock.BASIC_LATIN)
|
||||
|| (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS)
|
||||
) {
|
||||
if (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) {
|
||||
// convert HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN
|
||||
int i = (int) c;
|
||||
i = i - 65248;
|
||||
c = (char) i;
|
||||
}
|
||||
|
||||
// if the current character is a letter or "_" "+" "#"
|
||||
if (Character.isLetterOrDigit(c)
|
||||
|| ((c == '_') || (c == '+') || (c == '#'))
|
||||
) {
|
||||
if (length == 0) {
|
||||
// "javaC1C2C3C4linux" <br>
|
||||
// ^--: the current character begin to token the ASCII
|
||||
// letter
|
||||
start = offset - 1;
|
||||
} else if (tokenType == DOUBLE_TOKEN_TYPE) {
|
||||
// "javaC1C2C3C4linux" <br>
|
||||
// ^--: the previous non-ASCII
|
||||
// : the current character
|
||||
offset--;
|
||||
bufferIndex--;
|
||||
|
||||
if (preIsTokened == true) {
|
||||
// there is only one non-ASCII has been stored
|
||||
length = 0;
|
||||
preIsTokened = false;
|
||||
break;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// store the LowerCase(c) in the buffer
|
||||
buffer[length++] = Character.toLowerCase(c);
|
||||
tokenType = SINGLE_TOKEN_TYPE;
|
||||
|
||||
// break the procedure if buffer overflowed!
|
||||
if (length == MAX_WORD_LEN) {
|
||||
break;
|
||||
}
|
||||
} else if (length > 0) {
|
||||
if (preIsTokened == true) {
|
||||
length = 0;
|
||||
preIsTokened = false;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// non-ASCII letter, e.g."C1C2C3C4"
|
||||
if (Character.isLetter(c)) {
|
||||
if (length == 0) {
|
||||
start = offset - 1;
|
||||
buffer[length++] = c;
|
||||
tokenType = DOUBLE_TOKEN_TYPE;
|
||||
} else {
|
||||
if (tokenType == SINGLE_TOKEN_TYPE) {
|
||||
offset--;
|
||||
bufferIndex--;
|
||||
|
||||
//return the previous ASCII characters
|
||||
break;
|
||||
} else {
|
||||
buffer[length++] = c;
|
||||
tokenType = DOUBLE_TOKEN_TYPE;
|
||||
|
||||
if (length == 2) {
|
||||
offset--;
|
||||
bufferIndex--;
|
||||
preIsTokened = true;
|
||||
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if (length > 0) {
|
||||
if (preIsTokened == true) {
|
||||
// empty the buffer
|
||||
length = 0;
|
||||
preIsTokened = false;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (length > 0) {
|
||||
// Because of "CharStream aware" tokenizer, using correctOffset() to
|
||||
// correct start/end offsets
|
||||
return reusableToken.reinit
|
||||
(buffer, 0, length,
|
||||
((CharStream)input).correctOffset( start ),
|
||||
((CharStream)input).correctOffset( start+length ),
|
||||
TOKEN_TYPE_NAMES[tokenType]);
|
||||
} else if (dataLen != -1) {
|
||||
// Don't return an empty string - recurse to get the next token
|
||||
return next(reusableToken);
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,35 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import java.io.Reader;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
/**
|
||||
*
|
||||
* @version $Id$
|
||||
* @since Solr 1.4
|
||||
*
|
||||
*/
|
||||
public class CharStreamAwareCJKTokenizerFactory extends BaseTokenizerFactory {
|
||||
|
||||
public TokenStream create(Reader input) {
|
||||
return new CharStreamAwareCJKTokenizer( (CharStream)input );
|
||||
}
|
||||
}
|
|
@ -0,0 +1,102 @@
|
|||
package org.apache.solr.analysis;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
|
||||
/** An abstract base class for simple, character-oriented tokenizers.
 * Unlike Lucene's CharTokenizer, this variant requires a {@link CharStream}
 * input and passes token start/end offsets through
 * {@link CharStream#correctOffset(int)} so they map back to the original,
 * unfiltered input. */
public abstract class CharStreamAwareCharTokenizer extends Tokenizer {
  public CharStreamAwareCharTokenizer(CharStream input) {
    super(input);
  }

  // offset: absolute position of the start of ioBuffer within the stream;
  // bufferIndex: next unread char in ioBuffer; dataLen: valid chars in ioBuffer
  private int offset = 0, bufferIndex = 0, dataLen = 0;
  private static final int MAX_WORD_LEN = 255;
  private static final int IO_BUFFER_SIZE = 4096;
  private final char[] ioBuffer = new char[IO_BUFFER_SIZE];

  /** Returns true iff a character should be included in a token.  This
   * tokenizer generates as tokens adjacent sequences of characters which
   * satisfy this predicate.  Characters for which this is false are used to
   * define token boundaries and are not included in tokens. */
  protected abstract boolean isTokenChar(char c);

  /** Called on each token character to normalize it before it is added to the
   * token.  The default implementation does nothing.  Subclasses may use this
   * to, e.g., lowercase tokens. */
  protected char normalize(char c) {
    return c;
  }

  /**
   * Returns the next token, or null at end of stream. Offsets on the
   * returned token are corrected via the input CharStream.
   *
   * @param reusableToken a reusable token, must not be null
   * @throws IOException if reading from the input fails
   */
  public final Token next(final Token reusableToken) throws IOException {
    assert reusableToken != null;
    reusableToken.clear();
    int length = 0;
    int start = bufferIndex;
    char[] buffer = reusableToken.termBuffer();
    while (true) {

      if (bufferIndex >= dataLen) {
        // refill: account for the consumed buffer before reading more
        offset += dataLen;
        dataLen = input.read(ioBuffer);
        if (dataLen == -1) {
          if (length > 0)
            break;
          else
            return null;
        }
        bufferIndex = 0;
      }

      final char c = ioBuffer[bufferIndex++];

      if (isTokenChar(c)) {               // if it's a token char

        if (length == 0)                  // start of token
          start = offset + bufferIndex - 1;
        else if (length == buffer.length)
          buffer = reusableToken.resizeTermBuffer(1+length);

        buffer[length++] = normalize(c); // buffer it, normalized

        if (length == MAX_WORD_LEN)      // buffer overflow!
          break;

      } else if (length > 0)             // at non-Letter w/ chars
        break;                           // return 'em
    }

    reusableToken.setTermLength(length);
    // Because of "CharStream aware" tokenizer, using correctOffset() to
    // correct start/end offsets
    reusableToken.setStartOffset(((CharStream)input).correctOffset(start));
    reusableToken.setEndOffset(((CharStream)input).correctOffset(start+length));
    return reusableToken;
  }

  /** Resets the tokenizer to read from a new input, clearing all buffer state. */
  public void reset(Reader input) throws IOException {
    super.reset(input);
    bufferIndex = 0;
    offset = 0;
    dataLen = 0;
  }
}
|
|
@ -0,0 +1,33 @@
|
|||
package org.apache.solr.analysis;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
/** A WhitespaceTokenizer is a tokenizer that divides text at whitespace.
|
||||
* Adjacent sequences of non-Whitespace characters form tokens. */
|
||||
public class CharStreamAwareWhitespaceTokenizer extends CharStreamAwareCharTokenizer {
|
||||
/** Construct a new WhitespaceTokenizer. */
|
||||
public CharStreamAwareWhitespaceTokenizer(CharStream in) {
|
||||
super(in);
|
||||
}
|
||||
|
||||
/** Collects only characters which do not satisfy
|
||||
* {@link Character#isWhitespace(char)}.*/
|
||||
protected boolean isTokenChar(char c) {
|
||||
return !Character.isWhitespace(c);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,35 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import java.io.Reader;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
/**
|
||||
*
|
||||
* @version $Id$
|
||||
* @since Solr 1.4
|
||||
*
|
||||
*/
|
||||
public class CharStreamAwareWhitespaceTokenizerFactory extends BaseTokenizerFactory {
|
||||
|
||||
public TokenStream create(Reader input) {
|
||||
return new CharStreamAwareWhitespaceTokenizer( (CharStream)input );
|
||||
}
|
||||
}
|
|
@ -0,0 +1,136 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.LinkedList;
|
||||
|
||||
/**
|
||||
*
|
||||
* @version $Id$
|
||||
* @since Solr 1.4
|
||||
*
|
||||
*/
|
||||
/**
 * CharFilter that rewrites the character stream using a {@link NormalizeMap}
 * trie: the longest matching input sequence is replaced by its mapped string,
 * and position-correction entries are recorded so downstream tokenizers can
 * map token offsets back to positions in the original text.
 */
public class MappingCharFilter extends BaseCharFilter {

  // Root of the match trie (built e.g. by MappingCharFilterFactory).
  private final NormalizeMap normMap;
  // Pushback buffer holding characters read ahead during a failed or partial
  // trie match; consumed again by nextChar() before the underlying stream.
  private LinkedList<Character> buffer;
  // Replacement string currently being emitted, or null when none is pending.
  private String replacement;
  // Next index to emit from {@link #replacement}.
  private int charPointer;
  // Net count of characters consumed from the input (incremented by
  // nextChar(), decremented by pushChar()); used to compute correction points.
  private int nextCharCounter;

  public MappingCharFilter( NormalizeMap normMap, CharStream in ){
    super( in );
    this.normMap = normMap;
  }

  /**
   * Returns the next normalized character, or -1 at end of stream.
   * Pending replacement characters are drained first; otherwise one input
   * character is read and the trie is consulted for the longest match.
   */
  public int read() throws IOException {
    while( true ){
      // Drain any replacement string produced by a previous match.
      if( replacement != null && charPointer < replacement.length() )
        return replacement.charAt( charPointer++ );

      int firstChar = nextChar();
      if( firstChar == -1 ) return -1;
      NormalizeMap nm = normMap.submap != null ?
        normMap.submap.get( (char)firstChar ) : null;
      // No trie edge for this character: pass it through unchanged.
      if( nm == null ) return firstChar;
      NormalizeMap result = match( nm );
      if( result == null ) return firstChar;
      replacement = result.normStr;
      charPointer = 0;
      // When source and replacement lengths differ, record offset-correction
      // entries. NOTE(review): pcmList and PosCorrectMap come from
      // BaseCharFilter (not visible in this excerpt); cumulativeDiff appears
      // to accumulate the running source-minus-output length difference.
      if( result.diff != 0 ){
        int prevCumulativeDiff = pcmList.isEmpty() ? 0 :
          pcmList.get( pcmList.size() - 1 ).cumulativeDiff;
        if( result.diff < 0 ){
          // Replacement is longer than the source: one entry per extra char.
          for( int i = 0; i < -result.diff ; i++ )
            pcmList.add( new PosCorrectMap( nextCharCounter + i - prevCumulativeDiff, prevCumulativeDiff - 1 - i ) );
        }
        else{
          // Replacement is shorter than the source: a single entry suffices.
          pcmList.add( new PosCorrectMap( nextCharCounter - result.diff - prevCumulativeDiff, prevCumulativeDiff + result.diff ) );
        }
      }
    }
  }

  // Reads one character, preferring the pushback buffer over the raw input,
  // and counts it as consumed.
  private int nextChar() throws IOException {
    nextCharCounter++;
    if( buffer != null && !buffer.isEmpty() )
      return buffer.removeFirst();
    return input.read();
  }

  // Returns a character to the front of the pushback buffer (un-consumes it).
  private void pushChar( int c ){
    nextCharCounter--;
    if( buffer == null )
      buffer = new LinkedList<Character>();
    buffer.addFirst( (char)c );
  }

  // Appends a character to the end of the pushback buffer WITHOUT touching
  // nextCharCounter; used by read(char[],int,int) to pre-load raw input.
  private void pushLastChar( int c ){
    if( buffer == null )
      buffer = new LinkedList<Character>();
    buffer.addLast( (char)c );
  }

  /**
   * Recursively follows the trie for the longest possible match starting at
   * {@code map}. Characters read past the match point are pushed back.
   * Returns the deepest node carrying a replacement, or null if none matched.
   */
  private NormalizeMap match( NormalizeMap map ) throws IOException {
    NormalizeMap result = null;
    if( map.submap != null ){
      int chr = nextChar();
      if( chr != -1 ){
        NormalizeMap subMap = map.submap.get( (char)chr );
        if( subMap != null ){
          result = match( subMap );
        }
        if( result == null )
          pushChar( chr );
      }
    }
    // Fall back to this node's own replacement if no longer match succeeded.
    if( result == null && map.normStr != null )
      result = map;
    return result;
  }

  /**
   * Bulk read: pulls up to {@code len} raw characters into the pushback
   * buffer, then emits normalized characters one at a time via read().
   * Returns the number of characters written to cbuf, or -1 at end of stream.
   */
  public int read( char[] cbuf, int off, int len ) throws IOException {
    char[] tmp = new char[len];
    int l = input.read( tmp, 0, len );
    if( l != -1 ){
      for( int i = 0; i < l; i++ )
        pushLastChar( tmp[i] );
    }
    l = 0;
    for( int i = off; i < off + len; i++ ){
      int c = read();
      if( c == -1 ) break;
      cbuf[i] = (char)c;
      l++;
    }
    return l == 0 ? -1 : l;
  }

  /** Mark/reset is not supported by this filter. */
  public boolean markSupported(){
    return false;
  }

  /** @throws IOException always; mark/reset is not supported. */
  public void mark( int readAheadLimit ) throws IOException {
    throw new IOException( "mark/reset not supported" );
  }

  /** @throws IOException always; mark/reset is not supported. */
  public void reset() throws IOException {
    throw new IOException( "mark/reset not supported" );
  }
}
|
|
@ -0,0 +1,118 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.solr.common.ResourceLoader;
|
||||
import org.apache.solr.common.util.StrUtils;
|
||||
import org.apache.solr.util.plugin.ResourceLoaderAware;
|
||||
|
||||
/**
|
||||
*
|
||||
* @version $Id$
|
||||
* @since Solr 1.4
|
||||
*
|
||||
*/
|
||||
public class MappingCharFilterFactory extends BaseCharFilterFactory implements
|
||||
ResourceLoaderAware {
|
||||
|
||||
protected NormalizeMap normMap;
|
||||
private String mapping;
|
||||
|
||||
public void inform(ResourceLoader loader) {
|
||||
mapping = args.get( "mapping" );
|
||||
|
||||
if( mapping != null ){
|
||||
List<String> wlist = null;
|
||||
try{
|
||||
File mappingFile = new File( mapping );
|
||||
if( mappingFile.exists() ){
|
||||
wlist = loader.getLines( mapping );
|
||||
}
|
||||
else{
|
||||
List<String> files = StrUtils.splitFileNames( mapping );
|
||||
wlist = new ArrayList<String>();
|
||||
for( String file : files ){
|
||||
List<String> lines = loader.getLines( file.trim() );
|
||||
wlist.addAll( lines );
|
||||
}
|
||||
}
|
||||
}
|
||||
catch( IOException e ){
|
||||
throw new RuntimeException( e );
|
||||
}
|
||||
normMap = new NormalizeMap();
|
||||
parseRules( wlist, normMap );
|
||||
}
|
||||
}
|
||||
|
||||
public CharStream create(CharStream input) {
|
||||
return new MappingCharFilter(normMap,input);
|
||||
}
|
||||
|
||||
// "source" => "target"
|
||||
static Pattern p = Pattern.compile( "\"(.*)\"\\s*=>\\s*\"(.*)\"\\s*$" );
|
||||
|
||||
protected void parseRules( List<String> rules, NormalizeMap normMap ){
|
||||
for( String rule : rules ){
|
||||
Matcher m = p.matcher( rule );
|
||||
if( !m.find() )
|
||||
throw new RuntimeException( "Invalid Mapping Rule : [" + rule + "], file = " + mapping );
|
||||
normMap.add( parseString( m.group( 1 ) ), parseString( m.group( 2 ) ) );
|
||||
}
|
||||
}
|
||||
|
||||
char[] out = new char[256];
|
||||
|
||||
protected String parseString( String s ){
|
||||
int readPos = 0;
|
||||
int len = s.length();
|
||||
int writePos = 0;
|
||||
while( readPos < len ){
|
||||
char c = s.charAt( readPos++ );
|
||||
if( c == '\\' ){
|
||||
if( readPos >= len )
|
||||
throw new RuntimeException( "Invalid escaped char in [" + s + "]" );
|
||||
c = s.charAt( readPos++ );
|
||||
switch( c ) {
|
||||
case '\\' : c = '\\'; break;
|
||||
case '"' : c = '"'; break;
|
||||
case 'n' : c = '\n'; break;
|
||||
case 't' : c = '\t'; break;
|
||||
case 'r' : c = '\r'; break;
|
||||
case 'b' : c = '\b'; break;
|
||||
case 'f' : c = '\f'; break;
|
||||
case 'u' :
|
||||
if( readPos + 3 >= len )
|
||||
throw new RuntimeException( "Invalid escaped char in [" + s + "]" );
|
||||
c = (char)Integer.parseInt( s.substring( readPos, readPos + 4 ), 16 );
|
||||
readPos += 4;
|
||||
break;
|
||||
}
|
||||
}
|
||||
out[writePos++] = c;
|
||||
}
|
||||
return new String( out, 0, writePos );
|
||||
}
|
||||
}
|
|
@ -0,0 +1,55 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
*
|
||||
* @version $Id$
|
||||
* @since Solr 1.4
|
||||
*
|
||||
*/
|
||||
/**
 * Trie node mapping source character sequences to replacement strings for
 * {@link MappingCharFilter}. Each node optionally carries a replacement
 * ({@code normStr}) together with the length difference between the matched
 * source and that replacement ({@code diff}).
 *
 * @version $Id$
 * @since Solr 1.4
 */
public class NormalizeMap {

  Map<Character, NormalizeMap> submap;
  String normStr;
  int diff;

  /**
   * Registers a mapping from {@code singleMatch} to {@code replacement},
   * creating intermediate trie nodes on demand.
   *
   * @throws RuntimeException if {@code singleMatch} is already mapped
   */
  public void add( String singleMatch, String replacement ){
    NormalizeMap node = this;
    // Walk (and lazily build) the trie path, one source character per level.
    for( int idx = 0, n = singleMatch.length(); idx < n; idx++ ){
      char key = singleMatch.charAt( idx );
      if( node.submap == null ){
        node.submap = new HashMap<Character, NormalizeMap>( 1 );
      }
      NormalizeMap child = node.submap.get( key );
      if( child == null ){
        child = new NormalizeMap();
        node.submap.put( key, child );
      }
      node = child;
    }
    if( node.normStr != null ){
      throw new RuntimeException( "MappingCharFilter: there is already a mapping for " + singleMatch );
    }
    node.normStr = replacement;
    node.diff = singleMatch.length() - replacement.length();
  }
}
|
|
@ -31,19 +31,37 @@ import java.io.Reader;
|
|||
// create a TokenStream.
|
||||
//
|
||||
public class TokenizerChain extends SolrAnalyzer {
|
||||
final private CharFilterFactory[] charFilters;
|
||||
final private TokenizerFactory tokenizer;
|
||||
final private TokenFilterFactory[] filters;
|
||||
|
||||
public TokenizerChain(TokenizerFactory tokenizer, TokenFilterFactory[] filters) {
|
||||
this(null,tokenizer,filters);
|
||||
}
|
||||
|
||||
public TokenizerChain(CharFilterFactory[] charFilters, TokenizerFactory tokenizer, TokenFilterFactory[] filters) {
|
||||
this.charFilters = charFilters;
|
||||
this.tokenizer = tokenizer;
|
||||
this.filters = filters;
|
||||
}
|
||||
|
||||
public CharFilterFactory[] getCharFilterFactories() { return charFilters; }
|
||||
public TokenizerFactory getTokenizerFactory() { return tokenizer; }
|
||||
public TokenFilterFactory[] getTokenFilterFactories() { return filters; }
|
||||
|
||||
public Reader charStream(Reader reader){
|
||||
if( charFilters != null && charFilters.length > 0 ){
|
||||
CharStream cs = new CharReader( reader );
|
||||
for (int i=0; i<charFilters.length; i++) {
|
||||
cs = charFilters[i].create(cs);
|
||||
}
|
||||
reader = cs;
|
||||
}
|
||||
return reader;
|
||||
}
|
||||
|
||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
TokenStream ts = tokenizer.create(reader);
|
||||
TokenStream ts = tokenizer.create(charStream(reader));
|
||||
for (int i=0; i<filters.length; i++) {
|
||||
ts = filters[i].create(ts);
|
||||
}
|
||||
|
@ -52,6 +70,10 @@ public class TokenizerChain extends SolrAnalyzer {
|
|||
|
||||
public String toString() {
|
||||
StringBuilder sb = new StringBuilder("TokenizerChain(");
|
||||
for (CharFilterFactory filter: charFilters) {
|
||||
sb.append(filter);
|
||||
sb.append(", ");
|
||||
}
|
||||
sb.append(tokenizer);
|
||||
for (TokenFilterFactory filter: filters) {
|
||||
sb.append(", ");
|
||||
|
|
|
@ -37,6 +37,7 @@ import javax.naming.InitialContext;
|
|||
import javax.naming.NamingException;
|
||||
import javax.naming.NoInitialContextException;
|
||||
|
||||
import org.apache.solr.analysis.CharFilterFactory;
|
||||
import org.apache.solr.analysis.TokenFilterFactory;
|
||||
import org.apache.solr.analysis.TokenizerFactory;
|
||||
import org.apache.solr.common.ResourceLoader;
|
||||
|
@ -394,8 +395,9 @@ public class SolrResourceLoader implements ResourceLoader
|
|||
}
|
||||
);
|
||||
|
||||
awareCompatibility.put(
|
||||
awareCompatibility.put(
|
||||
ResourceLoaderAware.class, new Class[] {
|
||||
CharFilterFactory.class,
|
||||
TokenFilterFactory.class,
|
||||
TokenizerFactory.class,
|
||||
FieldType.class
|
||||
|
@ -427,5 +429,5 @@ public class SolrResourceLoader implements ResourceLoader
|
|||
}
|
||||
throw new SolrException( SolrException.ErrorCode.SERVER_ERROR, builder.toString() );
|
||||
}
|
||||
|
||||
|
||||
}
|
|
@ -29,6 +29,7 @@ import org.apache.solr.common.util.DOMUtil;
|
|||
import org.apache.solr.core.SolrConfig;
|
||||
import org.apache.solr.core.Config;
|
||||
import org.apache.solr.core.SolrResourceLoader;
|
||||
import org.apache.solr.analysis.CharFilterFactory;
|
||||
import org.apache.solr.analysis.TokenFilterFactory;
|
||||
import org.apache.solr.analysis.TokenizerChain;
|
||||
import org.apache.solr.analysis.TokenizerFactory;
|
||||
|
@ -739,12 +740,33 @@ public final class IndexSchema {
|
|||
|
||||
XPath xpath = XPathFactory.newInstance().newXPath();
|
||||
|
||||
// Load the CharFilters
|
||||
// --------------------------------------------------------------------------------
|
||||
final ArrayList<CharFilterFactory> charFilters = new ArrayList<CharFilterFactory>();
|
||||
AbstractPluginLoader<CharFilterFactory> charFilterLoader =
|
||||
new AbstractPluginLoader<CharFilterFactory>( "[schema.xml] analyzer/charFilter", false, false )
|
||||
{
|
||||
@Override
|
||||
protected void init(CharFilterFactory plugin, Node node) throws Exception {
|
||||
if( plugin != null ) {
|
||||
plugin.init( DOMUtil.toMapExcept(node.getAttributes(),"class") );
|
||||
charFilters.add( plugin );
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
protected CharFilterFactory register(String name, CharFilterFactory plugin) throws Exception {
|
||||
return null; // used for map registration
|
||||
}
|
||||
};
|
||||
charFilterLoader.load( solrConfig.getResourceLoader(), (NodeList)xpath.evaluate("./charFilter", node, XPathConstants.NODESET) );
|
||||
|
||||
// Load the Tokenizer
|
||||
// Although an analyzer only allows a single Tokenizer, we load a list to make sure
|
||||
// Although an analyzer only allows a single Tokenizer, we load a list to make sure
|
||||
// the configuration is ok
|
||||
// --------------------------------------------------------------------------------
|
||||
final ArrayList<TokenizerFactory> tokenizers = new ArrayList<TokenizerFactory>(1);
|
||||
AbstractPluginLoader<TokenizerFactory> tokenizerLoader =
|
||||
AbstractPluginLoader<TokenizerFactory> tokenizerLoader =
|
||||
new AbstractPluginLoader<TokenizerFactory>( "[schema.xml] analyzer/tokenizer", false, false )
|
||||
{
|
||||
@Override
|
||||
|
@ -790,8 +812,9 @@ public final class IndexSchema {
|
|||
}
|
||||
};
|
||||
filterLoader.load( loader, (NodeList)xpath.evaluate("./filter", node, XPathConstants.NODESET) );
|
||||
|
||||
return new TokenizerChain(tokenizers.get(0), filters.toArray(new TokenFilterFactory[filters.size()]));
|
||||
|
||||
return new TokenizerChain(charFilters.toArray(new CharFilterFactory[charFilters.size()]),
|
||||
tokenizers.get(0), filters.toArray(new TokenFilterFactory[filters.size()]));
|
||||
};
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,52 @@
|
|||
package org.apache.solr.analysis;
|
||||
|
||||
import java.io.StringReader;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
public class TestCharFilter extends TestCase {
|
||||
|
||||
public void testCharFilter1() throws Exception {
|
||||
CharStream cs = new CharFilter1( new CharReader( new StringReader("") ) );
|
||||
assertEquals( "corrected position is invalid", 1, cs.correctOffset( 0 ) );
|
||||
}
|
||||
|
||||
public void testCharFilter2() throws Exception {
|
||||
CharStream cs = new CharFilter2( new CharReader( new StringReader("") ) );
|
||||
assertEquals( "corrected position is invalid", 2, cs.correctOffset( 0 ) );
|
||||
}
|
||||
|
||||
public void testCharFilter12() throws Exception {
|
||||
CharStream cs = new CharFilter2( new CharFilter1( new CharReader( new StringReader("") ) ) );
|
||||
assertEquals( "corrected position is invalid", 3, cs.correctOffset( 0 ) );
|
||||
}
|
||||
|
||||
public void testCharFilter11() throws Exception {
|
||||
CharStream cs = new CharFilter1( new CharFilter1( new CharReader( new StringReader("") ) ) );
|
||||
assertEquals( "corrected position is invalid", 2, cs.correctOffset( 0 ) );
|
||||
}
|
||||
|
||||
static class CharFilter1 extends CharFilter {
|
||||
|
||||
protected CharFilter1(CharStream in) {
|
||||
super(in);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected int correctPosition(int currentPos) {
|
||||
return currentPos + 1;
|
||||
}
|
||||
}
|
||||
|
||||
static class CharFilter2 extends CharFilter {
|
||||
|
||||
protected CharFilter2(CharStream in) {
|
||||
super(in);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected int correctPosition(int currentPos) {
|
||||
return currentPos + 2;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,160 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import java.io.StringReader;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
/**
 * Tests for {@link MappingCharFilter}: verifies replacement text and, via
 * the token-spec strings ("text,posInc,startOffset,endOffset"), that token
 * offsets still point at the ORIGINAL (pre-mapping) text.
 */
public class TestMappingCharFilter extends BaseTokenTestCase {

  NormalizeMap normMap;

  // NOTE(review): does not call super.setUp(); confirm BaseTokenTestCase
  // (not visible here) needs no per-test initialization of its own.
  public void setUp() throws Exception {
    normMap = new NormalizeMap();

    // Shrinking mappings (source longer than target).
    normMap.add( "aa", "a" );
    normMap.add( "bbb", "b" );
    normMap.add( "cccc", "cc" );

    // Same-length and expanding mappings.
    normMap.add( "h", "i" );
    normMap.add( "j", "jj" );
    normMap.add( "k", "kkk" );
    normMap.add( "ll", "llll" );

    // Mapping to the empty string removes the source entirely.
    normMap.add( "empty", "" );
  }

  /** Input with no matching mapping must pass through unchanged. */
  public void testNothingChange() throws Exception {
    CharStream cs = new MappingCharFilter( normMap, new CharReader( new StringReader( "x" ) ) );
    TokenStream ts = new CharStreamAwareWhitespaceTokenizer( cs );
    List<Token> real = getTokens( ts );
    List<Token> expect = tokens( "x" );
    assertTokEqualOff( expect, real );
  }

  /** Same-length replacement: offsets are unaffected. */
  public void test1to1() throws Exception {
    CharStream cs = new MappingCharFilter( normMap, new CharReader( new StringReader( "h" ) ) );
    TokenStream ts = new CharStreamAwareWhitespaceTokenizer( cs );
    List<Token> real = getTokens( ts );
    List<Token> expect = tokens( "i" );
    assertTokEqualOff( expect, real );
  }

  /** 1 char maps to 2; token still spans original offsets 0-1. */
  public void test1to2() throws Exception {
    CharStream cs = new MappingCharFilter( normMap, new CharReader( new StringReader( "j" ) ) );
    TokenStream ts = new CharStreamAwareWhitespaceTokenizer( cs );
    List<Token> real = getTokens( ts );
    List<Token> expect = tokens( "jj,1,0,1" );
    assertTokEqualOff( expect, real );
  }

  /** 1 char maps to 3; token still spans original offsets 0-1. */
  public void test1to3() throws Exception {
    CharStream cs = new MappingCharFilter( normMap, new CharReader( new StringReader( "k" ) ) );
    TokenStream ts = new CharStreamAwareWhitespaceTokenizer( cs );
    List<Token> real = getTokens( ts );
    List<Token> expect = tokens( "kkk,1,0,1" );
    assertTokEqualOff( expect, real );
  }

  /** 2 chars map to 4; token still spans original offsets 0-2. */
  public void test2to4() throws Exception {
    CharStream cs = new MappingCharFilter( normMap, new CharReader( new StringReader( "ll" ) ) );
    TokenStream ts = new CharStreamAwareWhitespaceTokenizer( cs );
    List<Token> real = getTokens( ts );
    List<Token> expect = tokens( "llll,1,0,2" );
    assertTokEqualOff( expect, real );
  }

  /** 2 chars map to 1; token still spans original offsets 0-2. */
  public void test2to1() throws Exception {
    CharStream cs = new MappingCharFilter( normMap, new CharReader( new StringReader( "aa" ) ) );
    TokenStream ts = new CharStreamAwareWhitespaceTokenizer( cs );
    List<Token> real = getTokens( ts );
    List<Token> expect = tokens( "a,1,0,2" );
    assertTokEqualOff( expect, real );
  }

  /** 3 chars map to 1; token still spans original offsets 0-3. */
  public void test3to1() throws Exception {
    CharStream cs = new MappingCharFilter( normMap, new CharReader( new StringReader( "bbb" ) ) );
    TokenStream ts = new CharStreamAwareWhitespaceTokenizer( cs );
    List<Token> real = getTokens( ts );
    List<Token> expect = tokens( "b,1,0,3" );
    assertTokEqualOff( expect, real );
  }

  /** 4 chars map to 2; token still spans original offsets 0-4. */
  public void test4to2() throws Exception {
    CharStream cs = new MappingCharFilter( normMap, new CharReader( new StringReader( "cccc" ) ) );
    TokenStream ts = new CharStreamAwareWhitespaceTokenizer( cs );
    List<Token> real = getTokens( ts );
    List<Token> expect = tokens( "cc,1,0,4" );
    assertTokEqualOff( expect, real );
  }

  /** Mapping to the empty string yields no tokens at all. */
  public void test5to0() throws Exception {
    CharStream cs = new MappingCharFilter( normMap, new CharReader( new StringReader( "empty" ) ) );
    TokenStream ts = new CharStreamAwareWhitespaceTokenizer( cs );
    List<Token> real = getTokens( ts );
    assertEquals( 0, real.size() );
  }

  //
  //        1111111111222
  //  01234567890123456789012
  //(in) h i j k ll cccc bbb aa
  //
  //        1111111111222
  //  01234567890123456789012
  //(out) i i jj kkk llll cc b a
  //
  // h, 0, 1 =>    i, 0, 1
  // i, 2, 3 =>    i, 2, 3
  // j, 4, 5 =>   jj, 4, 5
  // k, 6, 7 =>  kkk, 6, 7
  // ll, 8,10 => llll, 8,10
  // cccc,11,15 =>   cc,11,15
  // bbb,16,19 =>    b,16,19
  // aa,20,22 =>    a,20,22
  //
  /** Full sentence mixing all mapping kinds; see the diagram above. */
  public void testTokenStream() throws Exception {
    CharStream cs = new MappingCharFilter( normMap, new CharReader( new StringReader( "h i j k ll cccc bbb aa" ) ) );
    TokenStream ts = new CharStreamAwareWhitespaceTokenizer( cs );
    List<Token> real = getTokens( ts );
    List<Token> expect = tokens( "i,1,0,1 i,1,2,3 jj,1,4,5 kkk,1,6,7 llll,1,8,10 cc,1,11,15 b,1,16,19 a,1,20,22" );
    assertTokEqualOff( expect, real );
  }

  //
  //
  //        0123456789
  //(in)    aaaa ll h
  //(out-1) aa llll i
  //(out-2) a llllllll i
  //
  // aaaa,0,4 => a,0,4
  //   ll,5,7 => llllllll,5,7
  //    h,8,9 => i,8,9
  /** Two stacked MappingCharFilters: offsets map back through BOTH layers. */
  public void testChained() throws Exception {
    CharStream cs = new MappingCharFilter( normMap,
      new MappingCharFilter( normMap, new CharReader( new StringReader( "aaaa ll h" ) ) ) );
    TokenStream ts = new CharStreamAwareWhitespaceTokenizer( cs );
    List<Token> real = getTokens( ts );
    List<Token> expect = tokens( "a,1,0,4 llllllll,1,5,7 i,1,8,9" );
    assertTokEqualOff( expect, real );
  }
}
|
|
@ -0,0 +1,52 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
public class TestMappingCharFilterFactory extends TestCase {
|
||||
public void testParseString() throws Exception {
|
||||
|
||||
MappingCharFilterFactory f = new MappingCharFilterFactory();
|
||||
|
||||
try {
|
||||
f.parseString( "\\" );
|
||||
fail( "escape character cannot be alone." );
|
||||
}
|
||||
catch( RuntimeException expected ){}
|
||||
|
||||
assertEquals( "unexpected escaped characters",
|
||||
"\\\"\n\t\r\b\f", f.parseString( "\\\\\\\"\\n\\t\\r\\b\\f" ) );
|
||||
assertEquals( "unexpected escaped characters",
|
||||
"A", f.parseString( "\\u0041" ) );
|
||||
assertEquals( "unexpected escaped characters",
|
||||
"AB", f.parseString( "\\u0041\\u0042" ) );
|
||||
|
||||
try {
|
||||
f.parseString( "\\u000" );
|
||||
fail( "invalid length check." );
|
||||
}
|
||||
catch( RuntimeException expected ){}
|
||||
|
||||
try {
|
||||
f.parseString( "\\u123x" );
|
||||
fail( "invalid hex number check." );
|
||||
}
|
||||
catch( NumberFormatException expected ){}
|
||||
}
|
||||
}
|
|
@ -181,9 +181,9 @@
|
|||
TokenizerFactory tfac = tchain.getTokenizerFactory();
|
||||
TokenFilterFactory[] filtfacs = tchain.getTokenFilterFactories();
|
||||
|
||||
TokenStream tstream = tfac.create(reader);
|
||||
TokenStream tstream = tfac.create(tchain.charStream(reader));
|
||||
List<Token> tokens = getTokens(tstream);
|
||||
tstream = tfac.create(reader);
|
||||
tstream = tfac.create(tchain.charStream(reader));
|
||||
if (verbose) {
|
||||
writeHeader(out, tfac.getClass(), tfac.getArgs());
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue