mirror of https://github.com/apache/lucene.git
LUCENE-2906: filter to process output of Standard/ICUTokenizer and create overlapping bigrams for CJK
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1225433 13f79535-47bb-0310-9956-ffa450edef68
Parent: a1a8eda2cd
Commit: b2970db4bc
@@ -102,6 +102,12 @@ New Features

* SOLR-2982: Added phonetic encoders to contrib/analyzers/phonetic:
  Metaphone, Soundex, Caverphone, Beider-Morse, etc.  (Robert Muir)

* LUCENE-2906: Added CJKBigramFilter that forms bigrams from StandardTokenizer or
  ICUTokenizer CJK tokens, and CJKWidthFilter that normalizes halfwidth/fullwidth
  forms. The filter supports Unicode supplementary characters, and you can toggle
  whether bigrams are formed for each of Han/Hiragana/Katakana/Hangul independently.
  Deprecates CJKTokenizer.  (Tom Burton-West, Robert Muir)

API Changes

* LUCENE-3596: DirectoryTaxonomyWriter.openIndexWriter() now takes an
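For reference, a minimal sketch of the chain this entry describes, wired by hand from the classes added in this commit (the field of text and version constant are illustrative; CJKAnalyzer below packages the same chain):

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cjk.CJKBigramFilter;
import org.apache.lucene.analysis.cjk.CJKWidthFilter;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class BigramChainDemo {
  public static void main(String[] args) throws Exception {
    // tokenize, fold widths (before bigramming), lowercase, then bigram the CJK runs
    Tokenizer source = new StandardTokenizer(Version.LUCENE_36, new StringReader("あいうえおabc"));
    TokenStream stream = new CJKWidthFilter(source);
    stream = new LowerCaseFilter(Version.LUCENE_36, stream);
    stream = new CJKBigramFilter(stream);
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      System.out.println(term); // あい, いう, うえ, えお, abc
    }
    stream.end();
    stream.close();
  }
}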
@@ -22,16 +22,19 @@ import java.io.Reader;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
+import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.util.Version;


/**
- * An {@link Analyzer} that tokenizes text with {@link CJKTokenizer} and
- * filters with {@link StopFilter}
- *
+ * An {@link Analyzer} that tokenizes text with {@link StandardTokenizer},
+ * normalizes content with {@link CJKWidthFilter}, folds case with
+ * {@link LowerCaseFilter}, forms bigrams of CJK with {@link CJKBigramFilter},
+ * and filters stopwords with {@link StopFilter}
 */
public final class CJKAnalyzer extends StopwordAnalyzerBase {
  /**

@@ -86,7 +89,16 @@ public final class CJKAnalyzer extends StopwordAnalyzerBase {
  @Override
  protected TokenStreamComponents createComponents(String fieldName,
      Reader reader) {
-    final Tokenizer source = new CJKTokenizer(reader);
-    return new TokenStreamComponents(source, new StopFilter(matchVersion, source, stopwords));
+    if (matchVersion.onOrAfter(Version.LUCENE_36)) {
+      final Tokenizer source = new StandardTokenizer(matchVersion, reader);
+      // run the widthfilter first, before bigramming: it sometimes combines characters.
+      TokenStream result = new CJKWidthFilter(source);
+      result = new LowerCaseFilter(matchVersion, result);
+      result = new CJKBigramFilter(result);
+      return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords));
+    } else {
+      final Tokenizer source = new CJKTokenizer(reader);
+      return new TokenStreamComponents(source, new StopFilter(matchVersion, source, stopwords));
+    }
  }
}
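A hedged usage sketch (field name and sample text are illustrative) showing the matchVersion gate above in action; both chains bigram the sample, as the back-compat tests further down confirm:

import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class CJKAnalyzerVersionDemo {
  public static void main(String[] args) throws Exception {
    // LUCENE_36 selects the new StandardTokenizer+CJKWidthFilter+CJKBigramFilter chain;
    // LUCENE_30 falls back to the deprecated CJKTokenizer for index compatibility.
    for (Version v : new Version[] { Version.LUCENE_36, Version.LUCENE_30 }) {
      Analyzer analyzer = new CJKAnalyzer(v);
      TokenStream ts = analyzer.tokenStream("f", new StringReader("一二三"));
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        System.out.print(term + " "); // both versions print: 一二 二三
      }
      System.out.println();
      ts.end();
      ts.close();
    }
  }
}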
@@ -0,0 +1,307 @@
package org.apache.lucene.analysis.cjk;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.ArrayUtil;

/**
 * Forms bigrams of CJK terms that are generated from StandardTokenizer
 * or ICUTokenizer.
 * <p>
 * CJK types are set by these tokenizers, but you can also use
 * {@link #CJKBigramFilter(TokenStream, int)} to explicitly control which
 * of the CJK scripts are turned into bigrams.
 * <p>
 * In all cases, all non-CJK input is passed through unmodified.
 */
public final class CJKBigramFilter extends TokenFilter {
  // configuration
  /** bigram flag for Han Ideographs */
  public static final int HAN = 1;
  /** bigram flag for Hiragana */
  public static final int HIRAGANA = 2;
  /** bigram flag for Katakana */
  public static final int KATAKANA = 4;
  /** bigram flag for Hangul */
  public static final int HANGUL = 8;

  /** when we emit a bigram, it's marked as this type */
  public static final String DOUBLE_TYPE = "<DOUBLE>";
  /** when we emit a unigram, it's marked as this type */
  public static final String SINGLE_TYPE = "<SINGLE>";

  // the types from standardtokenizer
  private static final String HAN_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC];
  private static final String HIRAGANA_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HIRAGANA];
  private static final String KATAKANA_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.KATAKANA];
  private static final String HANGUL_TYPE = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HANGUL];

  // sentinel value for ignoring a script
  private static final Object NO = new Object();

  // these are set to either their type or NO if we want to pass them thru
  private final Object doHan;
  private final Object doHiragana;
  private final Object doKatakana;
  private final Object doHangul;

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

  // buffers containing codepoints and offsets in parallel
  int buffer[] = new int[8];
  int startOffset[] = new int[8];
  int endOffset[] = new int[8];
  // length of valid buffer
  int bufferLen;
  // current buffer index
  int index;

  // the last end offset, to determine if we should bigram across tokens
  int lastEndOffset;

  private boolean exhausted;

  /**
   * Calls {@link CJKBigramFilter#CJKBigramFilter(TokenStream, int)
   *       CJKBigramFilter(in, HAN | HIRAGANA | KATAKANA | HANGUL)}
   */
  public CJKBigramFilter(TokenStream in) {
    this(in, HAN | HIRAGANA | KATAKANA | HANGUL);
  }

  /**
   * Create a new CJKBigramFilter, specifying which writing systems should be bigrammed.
   * @param flags OR'ed set from {@link CJKBigramFilter#HAN}, {@link CJKBigramFilter#HIRAGANA},
   *        {@link CJKBigramFilter#KATAKANA}, {@link CJKBigramFilter#HANGUL}
   */
  public CJKBigramFilter(TokenStream in, int flags) {
    super(in);
    doHan = (flags & HAN) == 0 ? NO : HAN_TYPE;
    doHiragana = (flags & HIRAGANA) == 0 ? NO : HIRAGANA_TYPE;
    doKatakana = (flags & KATAKANA) == 0 ? NO : KATAKANA_TYPE;
    doHangul = (flags & HANGUL) == 0 ? NO : HANGUL_TYPE;
  }

  /*
   * much of this complexity revolves around handling the special case of a
   * "lone cjk character" where cjktokenizer would output a unigram. this
   * is also the only time we ever have to captureState.
   */
  @Override
  public boolean incrementToken() throws IOException {
    while (true) {
      if (hasBufferedBigram()) {

        // case 1: we have multiple remaining codepoints buffered,
        // so we can emit a bigram here.

        flushBigram();
        return true;
      } else if (doNext()) {

        // case 2: look at the token type. should we form any n-grams?

        String type = typeAtt.type();
        if (type == doHan || type == doHiragana || type == doKatakana || type == doHangul) {

          // acceptable CJK type: we form n-grams from these.
          // as long as the offsets are aligned, we just add these to our current buffer.
          // otherwise, we clear the buffer and start over.

          if (offsetAtt.startOffset() != lastEndOffset) { // unaligned, clear queue
            if (hasBufferedUnigram()) {

              // we have a buffered unigram, and we peeked ahead to see if we could form
              // a bigram, but we can't, because the offsets are unaligned. capture the state
              // of this peeked data to be revisited next time thru the loop, and dump our unigram.

              loneState = captureState();
              flushUnigram();
              return true;
            }
            index = 0;
            bufferLen = 0;
          }
          refill();
        } else {

          // not a CJK type: we just return these as-is.

          if (hasBufferedUnigram()) {

            // we have a buffered unigram, and we peeked ahead to see if we could form
            // a bigram, but we can't, because it's not a CJK type. capture the state
            // of this peeked data to be revisited next time thru the loop, and dump our unigram.

            loneState = captureState();
            flushUnigram();
            return true;
          }
          return true;
        }
      } else {

        // case 3: we have only zero or 1 codepoints buffered,
        // so not enough to form a bigram. But, we also have no
        // more input. So if we have a buffered codepoint, emit
        // a unigram; otherwise, it's end of stream.

        if (hasBufferedUnigram()) {
          flushUnigram(); // flush our remaining unigram
          return true;
        }
        return false;
      }
    }
  }

  private State loneState; // rarely used: only for "lone cjk characters", where we emit unigrams

  /**
   * looks at next input token, returning false if none is available
   */
  private boolean doNext() throws IOException {
    if (loneState != null) {
      restoreState(loneState);
      loneState = null;
      return true;
    } else {
      if (exhausted) {
        return false;
      } else if (input.incrementToken()) {
        return true;
      } else {
        exhausted = true;
        return false;
      }
    }
  }

  /**
   * refills buffers with new data from the current token.
   */
  private void refill() throws IOException {
    // compact buffers to keep them smallish if they become large
    // just a safety check, but technically we only need the last codepoint
    if (bufferLen > 64) {
      int last = bufferLen - 1;
      buffer[0] = buffer[last];
      startOffset[0] = startOffset[last];
      endOffset[0] = endOffset[last];
      bufferLen = 1;
      index -= last;
    }

    char termBuffer[] = termAtt.buffer();
    int len = termAtt.length();
    int start = offsetAtt.startOffset();
    int end = offsetAtt.endOffset();

    int newSize = bufferLen + len;
    buffer = ArrayUtil.grow(buffer, newSize);
    startOffset = ArrayUtil.grow(startOffset, newSize);
    endOffset = ArrayUtil.grow(endOffset, newSize);
    lastEndOffset = end;

    if (end - start != len) {
      // crazy offsets (modified by synonym or charfilter): just preserve
      for (int i = 0, cp = 0; i < len; i += Character.charCount(cp)) {
        cp = buffer[bufferLen] = Character.codePointAt(termBuffer, i, len);
        startOffset[bufferLen] = start;
        endOffset[bufferLen] = end;
        bufferLen++;
      }
    } else {
      // normal offsets
      for (int i = 0, cp = 0, cpLen = 0; i < len; i += cpLen) {
        cp = buffer[bufferLen] = Character.codePointAt(termBuffer, i, len);
        cpLen = Character.charCount(cp);
        startOffset[bufferLen] = start;
        start = endOffset[bufferLen] = start + cpLen;
        bufferLen++;
      }
    }
  }

  /**
   * Flushes a bigram token to output from our buffer.
   * This is the normal case, e.g. ABC -> AB BC
   */
  private void flushBigram() {
    clearAttributes();
    char termBuffer[] = termAtt.resizeBuffer(4); // maximum bigram length in code units (2 supplementaries)
    int len1 = Character.toChars(buffer[index], termBuffer, 0);
    int len2 = len1 + Character.toChars(buffer[index+1], termBuffer, len1);
    termAtt.setLength(len2);
    offsetAtt.setOffset(startOffset[index], endOffset[index+1]);
    typeAtt.setType(DOUBLE_TYPE);
    index++;
  }

  /**
   * Flushes a unigram token to output from our buffer.
   * This happens when we encounter isolated CJK characters: either the whole
   * CJK string is a single character, or we encounter a CJK character surrounded
   * by space, punctuation, English, etc., but not beside any other CJK.
   */
  private void flushUnigram() {
    clearAttributes();
    char termBuffer[] = termAtt.resizeBuffer(2); // maximum unigram length (2 surrogates)
    int len = Character.toChars(buffer[index], termBuffer, 0);
    termAtt.setLength(len);
    offsetAtt.setOffset(startOffset[index], endOffset[index]);
    typeAtt.setType(SINGLE_TYPE);
    index++;
  }

  /**
   * True if we have multiple codepoints sitting in our buffer
   */
  private boolean hasBufferedBigram() {
    return bufferLen - index > 1;
  }

  /**
   * True if we have a single codepoint sitting in our buffer, where its future
   * (whether it is emitted as unigram or forms a bigram) depends upon not-yet-seen
   * inputs.
   */
  private boolean hasBufferedUnigram() {
    return bufferLen == 1 && index == 0;
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    bufferLen = 0;
    index = 0;
    lastEndOffset = 0;
    loneState = null;
    exhausted = false;
  }
}
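A sketch of the flags constructor in use, forming bigrams for Han only so hiragana passes through as unigrams; the flag set and expected output mirror the hiragana="false" factory test further down, and the sample text is borrowed from it:

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cjk.CJKBigramFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class HanOnlyBigramDemo {
  public static void main(String[] args) throws Exception {
    Tokenizer source = new StandardTokenizer(Version.LUCENE_36,
        new StringReader("多くの学生が試験に落ちた。"));
    // leave HIRAGANA out of the flag set: hiragana tokens are passed through as-is
    TokenStream stream = new CJKBigramFilter(source,
        CJKBigramFilter.HAN | CJKBigramFilter.KATAKANA | CJKBigramFilter.HANGUL);
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      System.out.print(term + " "); // 多 く の 学生 が 試験 に 落 ち た
    }
    stream.end();
    stream.close();
  }
}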
@@ -44,7 +44,9 @@ import org.apache.lucene.util.AttributeSource;
 * please search <a
 * href="http://www.google.com/search?q=word+chinese+segment">google</a>
 *
+ * @deprecated Use StandardTokenizer, CJKWidthFilter, CJKBigramFilter, and LowerCaseFilter instead.
 */
+@Deprecated
public final class CJKTokenizer extends Tokenizer {
  //~ Static fields/initializers ---------------------------------------------
  /** Word token type */
@@ -0,0 +1,95 @@
package org.apache.lucene.analysis.cjk;

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.StemmerUtil;

/**
 * A {@link TokenFilter} that normalizes CJK width differences:
 * <ul>
 *   <li>Folds fullwidth ASCII variants into the equivalent Basic Latin
 *   <li>Folds halfwidth Katakana variants into the equivalent kana
 * </ul>
 * <p>
 * NOTE: this filter can be viewed as a (practical) subset of NFKC/NFKD
 * Unicode normalization. See the normalization support in the ICU package
 * for full normalization.
 */
public final class CJKWidthFilter extends TokenFilter {
  private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  /* halfwidth kana mappings: 0xFF65-0xFF9F
   *
   * note: 0xFF9E and 0xFF9F are only mapped to 0x3099 and 0x309A
   * as a fallback when they cannot properly combine with a preceding
   * character into a composed form.
   */
  private static final char KANA_NORM[] = new char[] {
    0x30fb, 0x30f2, 0x30a1, 0x30a3, 0x30a5, 0x30a7, 0x30a9, 0x30e3, 0x30e5,
    0x30e7, 0x30c3, 0x30fc, 0x30a2, 0x30a4, 0x30a6, 0x30a8, 0x30aa, 0x30ab,
    0x30ad, 0x30af, 0x30b1, 0x30b3, 0x30b5, 0x30b7, 0x30b9, 0x30bb, 0x30bd,
    0x30bf, 0x30c1, 0x30c4, 0x30c6, 0x30c8, 0x30ca, 0x30cb, 0x30cc, 0x30cd,
    0x30ce, 0x30cf, 0x30d2, 0x30d5, 0x30d8, 0x30db, 0x30de, 0x30df, 0x30e0,
    0x30e1, 0x30e2, 0x30e4, 0x30e6, 0x30e8, 0x30e9, 0x30ea, 0x30eb, 0x30ec,
    0x30ed, 0x30ef, 0x30f3, 0x3099, 0x309A
  };

  public CJKWidthFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (input.incrementToken()) {
      char text[] = termAtt.buffer();
      int length = termAtt.length();
      for (int i = 0; i < length; i++) {
        final char ch = text[i];
        if (ch >= 0xFF01 && ch <= 0xFF5E) {
          // Fullwidth ASCII variants
          text[i] -= 0xFEE0;
        } else if (ch >= 0xFF65 && ch <= 0xFF9F) {
          // Halfwidth Katakana variants
          if ((ch == 0xFF9E || ch == 0xFF9F) && i > 0 && combine(text, i, length, ch)) {
            length = StemmerUtil.delete(text, i--, length);
          } else {
            text[i] = KANA_NORM[ch - 0xFF65];
          }
        }
      }
      termAtt.setLength(length);
      return true;
    } else {
      return false;
    }
  }

  /* kana combining diffs: 0x30A6-0x30FD */
  private static final byte KANA_COMBINE_VOICED[] = new byte[] {
    78, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
    0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1,
    0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
  };

  private static final byte KANA_COMBINE_HALF_VOICED[] = new byte[] {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 2,
    0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
  };

  /** returns true if we successfully combined the voice mark */
  private static boolean combine(char text[], int pos, int length, char ch) {
    final char prev = text[pos-1];
    if (prev >= 0x30A6 && prev <= 0x30FD) {
      text[pos-1] += (ch == 0xFF9F)
          ? KANA_COMBINE_HALF_VOICED[prev - 0x30A6]
          : KANA_COMBINE_VOICED[prev - 0x30A6];
      return text[pos-1] != prev;
    }
    return false;
  }
}
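The fullwidth fold above is pure arithmetic: the fullwidth forms U+FF01-U+FF5E sit exactly 0xFEE0 above their ASCII counterparts U+0021-U+007E, so a single subtraction recovers the Basic Latin character. A tiny standalone check of that offset (plain Java, no Lucene dependency):

public class WidthFoldCheck {
  public static void main(String[] args) {
    char fullwidthA = 0xFF21;                  // 'Ａ' FULLWIDTH LATIN CAPITAL LETTER A
    char folded = (char) (fullwidthA - 0xFEE0);
    System.out.println(folded);                // A (U+0041)
    // halfwidth katakana needs the KANA_NORM table instead, because the
    // halfwidth block is not a constant distance from the katakana block:
    System.out.println((char) 0xFF76 + " -> " + (char) 0x30AB); // ｶ -> カ (KANA_NORM[0xFF76 - 0xFF65])
  }
}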
@@ -0,0 +1,275 @@
package org.apache.lucene.analysis.cjk;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.util.CharArraySet;

/**
 * Most tests adopted from TestCJKTokenizer
 */
public class TestCJKAnalyzer extends BaseTokenStreamTestCase {
  private Analyzer analyzer = new CJKAnalyzer(TEST_VERSION_CURRENT);

  public void testJa1() throws IOException {
    assertAnalyzesTo(analyzer, "一二三四五六七八九十",
        new String[] { "一二", "二三", "三四", "四五", "五六", "六七", "七八", "八九", "九十" },
        new int[] { 0, 1, 2, 3, 4, 5, 6, 7, 8 },
        new int[] { 2, 3, 4, 5, 6, 7, 8, 9, 10 },
        new String[] { "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>" },
        new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1 });
  }

  public void testJa2() throws IOException {
    assertAnalyzesTo(analyzer, "一 二三四 五六七八九 十",
        new String[] { "一", "二三", "三四", "五六", "六七", "七八", "八九", "十" },
        new int[] { 0, 2, 3, 6, 7, 8, 9, 12 },
        new int[] { 1, 4, 5, 8, 9, 10, 11, 13 },
        new String[] { "<SINGLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<SINGLE>" },
        new int[] { 1, 1, 1, 1, 1, 1, 1, 1 });
  }

  public void testC() throws IOException {
    assertAnalyzesTo(analyzer, "abc defgh ijklmn opqrstu vwxy z",
        new String[] { "abc", "defgh", "ijklmn", "opqrstu", "vwxy", "z" },
        new int[] { 0, 4, 10, 17, 25, 30 },
        new int[] { 3, 9, 16, 24, 29, 31 },
        new String[] { "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>" },
        new int[] { 1, 1, 1, 1, 1, 1 });
  }

  /**
   * LUCENE-2207: wrong offset calculated by end()
   */
  public void testFinalOffset() throws IOException {
    assertAnalyzesTo(analyzer, "あい",
        new String[] { "あい" },
        new int[] { 0 },
        new int[] { 2 },
        new String[] { "<DOUBLE>" },
        new int[] { 1 });

    assertAnalyzesTo(analyzer, "あい   ",
        new String[] { "あい" },
        new int[] { 0 },
        new int[] { 2 },
        new String[] { "<DOUBLE>" },
        new int[] { 1 });

    assertAnalyzesTo(analyzer, "test",
        new String[] { "test" },
        new int[] { 0 },
        new int[] { 4 },
        new String[] { "<ALPHANUM>" },
        new int[] { 1 });

    assertAnalyzesTo(analyzer, "test   ",
        new String[] { "test" },
        new int[] { 0 },
        new int[] { 4 },
        new String[] { "<ALPHANUM>" },
        new int[] { 1 });

    assertAnalyzesTo(analyzer, "あいtest",
        new String[] { "あい", "test" },
        new int[] { 0, 2 },
        new int[] { 2, 6 },
        new String[] { "<DOUBLE>", "<ALPHANUM>" },
        new int[] { 1, 1 });

    assertAnalyzesTo(analyzer, "testあい    ",
        new String[] { "test", "あい" },
        new int[] { 0, 4 },
        new int[] { 4, 6 },
        new String[] { "<ALPHANUM>", "<DOUBLE>" },
        new int[] { 1, 1 });
  }

  public void testMix() throws IOException {
    assertAnalyzesTo(analyzer, "あいうえおabcかきくけこ",
        new String[] { "あい", "いう", "うえ", "えお", "abc", "かき", "きく", "くけ", "けこ" },
        new int[] { 0, 1, 2, 3, 5, 8, 9, 10, 11 },
        new int[] { 2, 3, 4, 5, 8, 10, 11, 12, 13 },
        new String[] { "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<ALPHANUM>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>" },
        new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1 });
  }

  public void testMix2() throws IOException {
    assertAnalyzesTo(analyzer, "あいうえおabんcかきくけ こ",
        new String[] { "あい", "いう", "うえ", "えお", "ab", "ん", "c", "かき", "きく", "くけ", "こ" },
        new int[] { 0, 1, 2, 3, 5, 7, 8, 9, 10, 11, 14 },
        new int[] { 2, 3, 4, 5, 7, 8, 9, 11, 12, 13, 15 },
        new String[] { "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<ALPHANUM>", "<SINGLE>", "<ALPHANUM>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<SINGLE>" },
        new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 });
  }

  /**
   * Non-English text (outside of CJK) is treated normally, according to Unicode rules
   */
  public void testNonIdeographic() throws IOException {
    assertAnalyzesTo(analyzer, "一 روبرت موير",
        new String[] { "一", "روبرت", "موير" },
        new int[] { 0, 2, 8 },
        new int[] { 1, 7, 12 },
        new String[] { "<SINGLE>", "<ALPHANUM>", "<ALPHANUM>" },
        new int[] { 1, 1, 1 });
  }

  /**
   * Same as the above, except with a nonspacing mark to show correctness.
   */
  public void testNonIdeographicNonLetter() throws IOException {
    assertAnalyzesTo(analyzer, "一 رُوبرت موير",
        new String[] { "一", "رُوبرت", "موير" },
        new int[] { 0, 2, 9 },
        new int[] { 1, 8, 13 },
        new String[] { "<SINGLE>", "<ALPHANUM>", "<ALPHANUM>" },
        new int[] { 1, 1, 1 });
  }

  public void testSurrogates() throws IOException {
    assertAnalyzesTo(analyzer, "𩬅艱鍟䇹愯瀛",
        new String[] { "𩬅艱", "艱鍟", "鍟䇹", "䇹愯", "愯瀛" },
        new int[] { 0, 2, 3, 4, 5 },
        new int[] { 3, 4, 5, 6, 7 },
        new String[] { "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>" },
        new int[] { 1, 1, 1, 1, 1 });
  }

  public void testReusableTokenStream() throws IOException {
    assertAnalyzesToReuse(analyzer, "あいうえおabcかきくけこ",
        new String[] { "あい", "いう", "うえ", "えお", "abc", "かき", "きく", "くけ", "けこ" },
        new int[] { 0, 1, 2, 3, 5, 8, 9, 10, 11 },
        new int[] { 2, 3, 4, 5, 8, 10, 11, 12, 13 },
        new String[] { "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<ALPHANUM>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>" },
        new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1 });

    assertAnalyzesToReuse(analyzer, "あいうえおabんcかきくけ こ",
        new String[] { "あい", "いう", "うえ", "えお", "ab", "ん", "c", "かき", "きく", "くけ", "こ" },
        new int[] { 0, 1, 2, 3, 5, 7, 8, 9, 10, 11, 14 },
        new int[] { 2, 3, 4, 5, 7, 8, 9, 11, 12, 13, 15 },
        new String[] { "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<ALPHANUM>", "<SINGLE>", "<ALPHANUM>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<SINGLE>" },
        new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 });
  }

  public void testSingleChar() throws IOException {
    assertAnalyzesTo(analyzer, "一",
        new String[] { "一" },
        new int[] { 0 },
        new int[] { 1 },
        new String[] { "<SINGLE>" },
        new int[] { 1 });
  }

  public void testTokenStream() throws IOException {
    assertAnalyzesTo(analyzer, "一丁丂",
        new String[] { "一丁", "丁丂" },
        new int[] { 0, 1 },
        new int[] { 2, 3 },
        new String[] { "<DOUBLE>", "<DOUBLE>" },
        new int[] { 1, 1 });
  }

  /** test that offsets are correct when mappingcharfilter is previously applied */
  public void testChangedOffsets() throws IOException {
    final NormalizeCharMap norm = new NormalizeCharMap();
    norm.add("a", "一二");
    norm.add("b", "二三");
    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
        return new TokenStreamComponents(tokenizer, new CJKBigramFilter(tokenizer));
      }

      @Override
      protected Reader initReader(Reader reader) {
        return new MappingCharFilter(norm, CharReader.get(reader));
      }
    };

    assertAnalyzesTo(analyzer, "ab",
        new String[] { "一二", "二二", "二三" },
        new int[] { 0, 0, 1 },
        new int[] { 1, 1, 2 });

    // note: offsets are strange since this is how the charfilter maps them...
    // before bigramming, the 4 tokens look like:
    // { 0, 0, 1, 1 } (start offsets),
    // { 0, 1, 1, 2 } (end offsets)
  }

  private static class FakeStandardTokenizer extends TokenFilter {
    final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);

    public FakeStandardTokenizer(TokenStream input) {
      super(input);
    }

    @Override
    public boolean incrementToken() throws IOException {
      if (input.incrementToken()) {
        typeAtt.setType(StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC]);
        return true;
      } else {
        return false;
      }
    }
  }

  public void testSingleChar2() throws Exception {
    Analyzer analyzer = new Analyzer() {

      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        TokenFilter filter = new FakeStandardTokenizer(tokenizer);
        filter = new StopFilter(TEST_VERSION_CURRENT, filter, CharArraySet.EMPTY_SET);
        filter = new CJKBigramFilter(filter);
        return new TokenStreamComponents(tokenizer, filter);
      }
    };

    assertAnalyzesTo(analyzer, "一",
        new String[] { "一" },
        new int[] { 0 },
        new int[] { 1 },
        new String[] { "<SINGLE>" },
        new int[] { 1 });
  }

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    checkRandomData(random, new CJKAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
  }
}
@@ -21,7 +21,10 @@ import java.io.IOException;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.util.Version;

+/** @deprecated Remove when CJKTokenizer is removed (5.0) */
+@Deprecated
public class TestCJKTokenizer extends BaseTokenStreamTestCase {

  class TestToken {

@@ -41,7 +44,7 @@ public class TestCJKTokenizer extends BaseTokenStreamTestCase {
  }

  public void checkCJKToken(final String str, final TestToken[] out_tokens) throws IOException {
-    Analyzer analyzer = new CJKAnalyzer(TEST_VERSION_CURRENT);
+    Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_30);
    String terms[] = new String[out_tokens.length];
    int startOffsets[] = new int[out_tokens.length];
    int endOffsets[] = new int[out_tokens.length];

@@ -56,7 +59,7 @@ public class TestCJKTokenizer extends BaseTokenStreamTestCase {
  }

  public void checkCJKTokenReusable(final Analyzer a, final String str, final TestToken[] out_tokens) throws IOException {
-    Analyzer analyzer = new CJKAnalyzer(TEST_VERSION_CURRENT);
+    Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_30);
    String terms[] = new String[out_tokens.length];
    int startOffsets[] = new int[out_tokens.length];
    int endOffsets[] = new int[out_tokens.length];

@@ -212,13 +215,13 @@ public class TestCJKTokenizer extends BaseTokenStreamTestCase {
  }

  public void testTokenStream() throws Exception {
-    Analyzer analyzer = new CJKAnalyzer(TEST_VERSION_CURRENT);
+    Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_30);
    assertAnalyzesTo(analyzer, "\u4e00\u4e01\u4e02",
        new String[] { "\u4e00\u4e01", "\u4e01\u4e02"});
  }

  public void testReusableTokenStream() throws Exception {
-    Analyzer analyzer = new CJKAnalyzer(TEST_VERSION_CURRENT);
+    Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_30);
    String str = "\u3042\u3044\u3046\u3048\u304aabc\u304b\u304d\u304f\u3051\u3053";

    TestToken[] out_tokens = {

@@ -273,6 +276,6 @@ public class TestCJKTokenizer extends BaseTokenStreamTestCase {

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
-    checkRandomData(random, new CJKAnalyzer(TEST_VERSION_CURRENT), 10000*RANDOM_MULTIPLIER);
+    checkRandomData(random, new CJKAnalyzer(Version.LUCENE_30), 10000*RANDOM_MULTIPLIER);
  }
}
@@ -0,0 +1,67 @@
package org.apache.lucene.analysis.cjk;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Tokenizer;

/**
 * Tests for {@link CJKWidthFilter}
 */
public class TestCJKWidthFilter extends BaseTokenStreamTestCase {
  private Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer source = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
      return new TokenStreamComponents(source, new CJKWidthFilter(source));
    }
  };

  /**
   * Fullwidth ASCII forms normalized to halfwidth (Basic Latin)
   */
  public void testFullWidthASCII() throws IOException {
    assertAnalyzesTo(analyzer, "Ｔｅｓｔ １２３４",
        new String[] { "Test", "1234" },
        new int[] { 0, 5 },
        new int[] { 4, 9 });
  }

  /**
   * Halfwidth katakana forms normalized to standard katakana.
   * A bit trickier in some cases, since half-width forms are decomposed
   * and voice marks need to be recombined with a preceding base form.
   */
  public void testHalfWidthKana() throws IOException {
    assertAnalyzesTo(analyzer, "ｶﾀｶﾅ",
        new String[] { "カタカナ" });
    assertAnalyzesTo(analyzer, "ｳﾞｨｯﾂ",
        new String[] { "ヴィッツ" });
    assertAnalyzesTo(analyzer, "ﾊﾟﾅｿﾆｯｸ",
        new String[] { "パナソニック" });
  }

  public void testRandomData() throws IOException {
    checkRandomData(random, analyzer, 10000*RANDOM_MULTIPLIER);
  }
}
@@ -0,0 +1,226 @@
package org.apache.lucene.analysis.icu.segmentation;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cjk.CJKBigramFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.icu.ICUNormalizer2Filter;
import org.apache.lucene.analysis.util.CharArraySet;

/**
 * Tests ICUTokenizer's ability to work with CJKBigramFilter.
 * Most tests adopted from TestCJKTokenizer
 */
public class TestWithCJKBigramFilter extends BaseTokenStreamTestCase {

  /**
   * ICUTokenizer+CJKBigramFilter
   */
  private Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer source = new ICUTokenizer(reader);
      TokenStream result = new CJKBigramFilter(source);
      return new TokenStreamComponents(source, new StopFilter(TEST_VERSION_CURRENT, result, CharArraySet.EMPTY_SET));
    }
  };

  /**
   * ICUTokenizer+ICUNormalizer2Filter+CJKBigramFilter.
   *
   * ICUNormalizer2Filter uses nfkc_casefold by default, so this is a language-independent
   * superset of CJKWidthFilter's foldings.
   */
  private Analyzer analyzer2 = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
      Tokenizer source = new ICUTokenizer(reader);
      // we put this before the CJKBigramFilter, because the normalization might combine
      // some halfwidth katakana forms, which will affect the bigramming.
      TokenStream result = new ICUNormalizer2Filter(source);
      result = new CJKBigramFilter(result);
      return new TokenStreamComponents(source, new StopFilter(TEST_VERSION_CURRENT, result, CharArraySet.EMPTY_SET));
    }
  };

  public void testJa1() throws IOException {
    assertAnalyzesTo(analyzer, "一二三四五六七八九十",
        new String[] { "一二", "二三", "三四", "四五", "五六", "六七", "七八", "八九", "九十" },
        new int[] { 0, 1, 2, 3, 4, 5, 6, 7, 8 },
        new int[] { 2, 3, 4, 5, 6, 7, 8, 9, 10 },
        new String[] { "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>" },
        new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1 });
  }

  public void testJa2() throws IOException {
    assertAnalyzesTo(analyzer, "一 二三四 五六七八九 十",
        new String[] { "一", "二三", "三四", "五六", "六七", "七八", "八九", "十" },
        new int[] { 0, 2, 3, 6, 7, 8, 9, 12 },
        new int[] { 1, 4, 5, 8, 9, 10, 11, 13 },
        new String[] { "<SINGLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<SINGLE>" },
        new int[] { 1, 1, 1, 1, 1, 1, 1, 1 });
  }

  public void testC() throws IOException {
    assertAnalyzesTo(analyzer, "abc defgh ijklmn opqrstu vwxy z",
        new String[] { "abc", "defgh", "ijklmn", "opqrstu", "vwxy", "z" },
        new int[] { 0, 4, 10, 17, 25, 30 },
        new int[] { 3, 9, 16, 24, 29, 31 },
        new String[] { "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>" },
        new int[] { 1, 1, 1, 1, 1, 1 });
  }

  /**
   * LUCENE-2207: wrong offset calculated by end()
   */
  public void testFinalOffset() throws IOException {
    assertAnalyzesTo(analyzer, "あい",
        new String[] { "あい" },
        new int[] { 0 },
        new int[] { 2 },
        new String[] { "<DOUBLE>" },
        new int[] { 1 });

    assertAnalyzesTo(analyzer, "あい   ",
        new String[] { "あい" },
        new int[] { 0 },
        new int[] { 2 },
        new String[] { "<DOUBLE>" },
        new int[] { 1 });

    assertAnalyzesTo(analyzer, "test",
        new String[] { "test" },
        new int[] { 0 },
        new int[] { 4 },
        new String[] { "<ALPHANUM>" },
        new int[] { 1 });

    assertAnalyzesTo(analyzer, "test   ",
        new String[] { "test" },
        new int[] { 0 },
        new int[] { 4 },
        new String[] { "<ALPHANUM>" },
        new int[] { 1 });

    assertAnalyzesTo(analyzer, "あいtest",
        new String[] { "あい", "test" },
        new int[] { 0, 2 },
        new int[] { 2, 6 },
        new String[] { "<DOUBLE>", "<ALPHANUM>" },
        new int[] { 1, 1 });

    assertAnalyzesTo(analyzer, "testあい    ",
        new String[] { "test", "あい" },
        new int[] { 0, 4 },
        new int[] { 4, 6 },
        new String[] { "<ALPHANUM>", "<DOUBLE>" },
        new int[] { 1, 1 });
  }

  public void testMix() throws IOException {
    assertAnalyzesTo(analyzer, "あいうえおabcかきくけこ",
        new String[] { "あい", "いう", "うえ", "えお", "abc", "かき", "きく", "くけ", "けこ" },
        new int[] { 0, 1, 2, 3, 5, 8, 9, 10, 11 },
        new int[] { 2, 3, 4, 5, 8, 10, 11, 12, 13 },
        new String[] { "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<ALPHANUM>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>" },
        new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1 });
  }

  public void testMix2() throws IOException {
    assertAnalyzesTo(analyzer, "あいうえおabんcかきくけ こ",
        new String[] { "あい", "いう", "うえ", "えお", "ab", "ん", "c", "かき", "きく", "くけ", "こ" },
        new int[] { 0, 1, 2, 3, 5, 7, 8, 9, 10, 11, 14 },
        new int[] { 2, 3, 4, 5, 7, 8, 9, 11, 12, 13, 15 },
        new String[] { "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<ALPHANUM>", "<SINGLE>", "<ALPHANUM>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<SINGLE>" },
        new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 });
  }

  /**
   * Non-English text (outside of CJK) is treated normally, according to Unicode rules
   */
  public void testNonIdeographic() throws IOException {
    assertAnalyzesTo(analyzer, "一 روبرت موير",
        new String[] { "一", "روبرت", "موير" },
        new int[] { 0, 2, 8 },
        new int[] { 1, 7, 12 },
        new String[] { "<SINGLE>", "<ALPHANUM>", "<ALPHANUM>" },
        new int[] { 1, 1, 1 });
  }

  /**
   * Same as the above, except with a nonspacing mark to show correctness.
   */
  public void testNonIdeographicNonLetter() throws IOException {
    assertAnalyzesTo(analyzer, "一 رُوبرت موير",
        new String[] { "一", "رُوبرت", "موير" },
        new int[] { 0, 2, 9 },
        new int[] { 1, 8, 13 },
        new String[] { "<SINGLE>", "<ALPHANUM>", "<ALPHANUM>" },
        new int[] { 1, 1, 1 });
  }

  public void testSurrogates() throws IOException {
    assertAnalyzesTo(analyzer, "𩬅艱鍟䇹愯瀛",
        new String[] { "𩬅艱", "艱鍟", "鍟䇹", "䇹愯", "愯瀛" },
        new int[] { 0, 2, 3, 4, 5 },
        new int[] { 3, 4, 5, 6, 7 },
        new String[] { "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>" },
        new int[] { 1, 1, 1, 1, 1 });
  }

  public void testReusableTokenStream() throws IOException {
    assertAnalyzesToReuse(analyzer, "あいうえおabcかきくけこ",
        new String[] { "あい", "いう", "うえ", "えお", "abc", "かき", "きく", "くけ", "けこ" },
        new int[] { 0, 1, 2, 3, 5, 8, 9, 10, 11 },
        new int[] { 2, 3, 4, 5, 8, 10, 11, 12, 13 },
        new String[] { "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<ALPHANUM>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>" },
        new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1 });

    assertAnalyzesToReuse(analyzer, "あいうえおabんcかきくけ こ",
        new String[] { "あい", "いう", "うえ", "えお", "ab", "ん", "c", "かき", "きく", "くけ", "こ" },
        new int[] { 0, 1, 2, 3, 5, 7, 8, 9, 10, 11, 14 },
        new int[] { 2, 3, 4, 5, 7, 8, 9, 11, 12, 13, 15 },
        new String[] { "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<ALPHANUM>", "<SINGLE>", "<ALPHANUM>", "<DOUBLE>", "<DOUBLE>", "<DOUBLE>", "<SINGLE>" },
        new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 });
  }

  public void testSingleChar() throws IOException {
    assertAnalyzesTo(analyzer, "一",
        new String[] { "一" },
        new int[] { 0 },
        new int[] { 1 },
        new String[] { "<SINGLE>" },
        new int[] { 1 });
  }

  public void testTokenStream() throws IOException {
    assertAnalyzesTo(analyzer, "一丁丂",
        new String[] { "一丁", "丁丂" },
        new int[] { 0, 1 },
        new int[] { 2, 3 },
        new String[] { "<DOUBLE>", "<DOUBLE>" },
        new int[] { 1, 1 });
  }
}
@@ -0,0 +1,64 @@
package org.apache.solr.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKBigramFilter;

/**
 * Factory for {@link CJKBigramFilter}.
 * <pre class="prettyprint">
 * &lt;fieldType name="text_cjk" class="solr.TextField"&gt;
 *   &lt;analyzer&gt;
 *     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
 *     &lt;filter class="solr.CJKWidthFilterFactory"/&gt;
 *     &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
 *     &lt;filter class="solr.CJKBigramFilterFactory"
 *       han="true" hiragana="true"
 *       katakana="true" hangul="true" /&gt;
 *   &lt;/analyzer&gt;
 * &lt;/fieldType&gt;</pre>
 */
public class CJKBigramFilterFactory extends BaseTokenFilterFactory {
  int flags;

  @Override
  public void init(Map<String,String> args) {
    super.init(args);
    flags = 0;
    if (getBoolean("han", true)) {
      flags |= CJKBigramFilter.HAN;
    }
    if (getBoolean("hiragana", true)) {
      flags |= CJKBigramFilter.HIRAGANA;
    }
    if (getBoolean("katakana", true)) {
      flags |= CJKBigramFilter.KATAKANA;
    }
    if (getBoolean("hangul", true)) {
      flags |= CJKBigramFilter.HANGUL;
    }
  }

  @Override
  public TokenStream create(TokenStream input) {
    return new CJKBigramFilter(input, flags);
  }
}
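A sketch of driving the factory programmatically rather than from schema XML, as the factory tests below do (the argument keys come from init() above; the sample text is the one the tests use):

import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;
import org.apache.solr.analysis.CJKBigramFilterFactory;

public class BigramFactoryDemo {
  public static void main(String[] args) {
    CJKBigramFilterFactory factory = new CJKBigramFilterFactory();
    Map<String,String> params = new HashMap<String,String>();
    params.put("hiragana", "false"); // han/katakana/hangul stay enabled by default
    factory.init(params);
    TokenStream stream = factory.create(
        new StandardTokenizer(Version.LUCENE_36, new StringReader("多くの学生が試験に落ちた。")));
    // consume the stream as usual; hiragana now passes through as unigrams
  }
}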
@@ -30,8 +30,9 @@ import java.io.Reader;
 *     &lt;tokenizer class="solr.CJKTokenizerFactory"/&gt;
 *   &lt;/analyzer&gt;
 * &lt;/fieldType&gt;</pre>
- *
+ * @deprecated
 */
+@Deprecated
public class CJKTokenizerFactory extends BaseTokenizerFactory {
  public CJKTokenizer create(Reader in) {
    return new CJKTokenizer(in);
@@ -0,0 +1,42 @@
package org.apache.solr.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKWidthFilter;

/**
 * Factory for {@link CJKWidthFilter}.
 * <pre class="prettyprint">
 * &lt;fieldType name="text_cjk" class="solr.TextField"&gt;
 *   &lt;analyzer&gt;
 *     &lt;tokenizer class="solr.StandardTokenizerFactory"/&gt;
 *     &lt;filter class="solr.CJKWidthFilterFactory"/&gt;
 *     &lt;filter class="solr.LowerCaseFilterFactory"/&gt;
 *     &lt;filter class="solr.CJKBigramFilterFactory"/&gt;
 *   &lt;/analyzer&gt;
 * &lt;/fieldType&gt;</pre>
 */
public class CJKWidthFilterFactory extends BaseTokenFilterFactory {

  @Override
  public TokenStream create(TokenStream input) {
    return new CJKWidthFilter(input);
  }
}
@@ -0,0 +1,52 @@
package org.apache.solr.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Reader;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardTokenizer;

/**
 * Simple tests to ensure the CJK bigram factory is working.
 */
public class TestCJKBigramFilterFactory extends BaseTokenTestCase {
  public void testDefaults() throws Exception {
    Reader reader = new StringReader("多くの学生が試験に落ちた。");
    CJKBigramFilterFactory factory = new CJKBigramFilterFactory();
    factory.init(DEFAULT_VERSION_PARAM);
    TokenStream stream = factory.create(new StandardTokenizer(TEST_VERSION_CURRENT, reader));
    assertTokenStreamContents(stream,
        new String[] { "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた" });
  }

  public void testHanOnly() throws Exception {
    Reader reader = new StringReader("多くの学生が試験に落ちた。");
    CJKBigramFilterFactory factory = new CJKBigramFilterFactory();
    Map<String,String> args = new HashMap<String,String>();
    args.put("hiragana", "false");
    factory.init(args);
    TokenStream stream = factory.create(new StandardTokenizer(TEST_VERSION_CURRENT, reader));
    assertTokenStreamContents(stream,
        new String[] { "多", "く", "の", "学生", "が", "試験", "に", "落", "ち", "た" });
  }
}
@@ -24,7 +24,9 @@ import org.apache.lucene.analysis.TokenStream;

/**
 * Simple tests to ensure the CJK tokenizer factory is working.
+ * @deprecated
 */
+@Deprecated
public class TestCJKTokenizerFactory extends BaseTokenTestCase {
  /**
   * Ensure the tokenizer actually tokenizes CJK text correctly
@@ -0,0 +1,36 @@
package org.apache.solr.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;

/**
 * Simple tests to ensure the CJKWidthFilterFactory is working
 */
public class TestCJKWidthFilterFactory extends BaseTokenTestCase {
  public void test() throws Exception {
    Reader reader = new StringReader("Ｔｅｓｔ １２３４");
    CJKWidthFilterFactory factory = new CJKWidthFilterFactory();
    TokenStream stream = factory.create(new MockTokenizer(reader, MockTokenizer.WHITESPACE, false));
    assertTokenStreamContents(stream, new String[] { "Test", "1234" });
  }
}