mirror of https://github.com/apache/lucene.git
LUCENE-2238: deprecated ChineseAnalyzer / ChineseTokenizer in favor of StandardAnalyzer / StandardTokenizer, which do the same thing
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@904521 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent 49b3a12971
commit 537bb742cd
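A minimal migration sketch of what this deprecation implies for callers (not part of the commit itself), assuming the Lucene 3.x-era TokenStream API; the field name and Version constant below are illustrative:

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;

public class ChineseAnalyzerMigrationSketch {
  public static void main(String[] args) throws Exception {
    // Before (now deprecated): Analyzer analyzer = new ChineseAnalyzer();
    // After: StandardAnalyzer emits the same single-character (unigram) tokens for Chinese text.
    StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
    TokenStream ts = analyzer.tokenStream("content", new StringReader("我是中国人"));
    TermAttribute term = ts.addAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
      System.out.println(term.term()); // prints 我, 是, 中, 国, 人 on separate lines
    }
    ts.close();
  }
}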
@@ -21,15 +21,17 @@ import java.io.Reader;
import org.apache.lucene.analysis.ReusableAnalyzerBase;
import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
import org.apache.lucene.analysis.standard.StandardAnalyzer; // javadoc @link
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;

/**
 * An {@link Analyzer} that tokenizes text with {@link ChineseTokenizer} and
 * filters with {@link ChineseFilter}
 *
 * @deprecated Use {@link StandardAnalyzer} instead, which has the same functionality.
 * This analyzer will be removed in Lucene 4.0
 */
@Deprecated
public final class ChineseAnalyzer extends ReusableAnalyzerBase {

  /**

@@ -23,6 +23,7 @@ import java.util.Arrays;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;

@@ -41,9 +42,10 @@ import org.apache.lucene.util.Version;
 * </ol>
 *
 * @version 1.0
 *
 * @deprecated Use {@link StopFilter} instead, which has the same functionality.
 * This filter will be removed in Lucene 4.0
 */
@Deprecated
public final class ChineseFilter extends TokenFilter {

@@ -21,6 +21,7 @@ package org.apache.lucene.analysis.cn;
import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

@@ -52,9 +53,10 @@ import org.apache.lucene.util.AttributeSource;
 * CJKTokenizer will not work.
 * </p>
 * @version 1.0
 *
 * @deprecated Use {@link StandardTokenizer} instead, which has the same functionality.
 * This filter will be removed in Lucene 4.0
 */
@Deprecated
public final class ChineseTokenizer extends Tokenizer {

@@ -24,14 +24,14 @@ Analyzer for Chinese, which indexes unigrams (individual chinese characters).
<p>
Three analyzers are provided for Chinese, each of which treats Chinese text in a different way.
<ul>
<li>ChineseAnalyzer (in this package): Index unigrams (individual Chinese characters) as a token.
<li>StandardAnalyzer: Index unigrams (individual Chinese characters) as a token.
<li>CJKAnalyzer (in the analyzers/cjk package): Index bigrams (overlapping groups of two adjacent Chinese characters) as tokens.
<li>SmartChineseAnalyzer (in the analyzers/smartcn package): Index words (attempt to segment Chinese text into words) as tokens.
</ul>

Example phrase: "我是中国人"
<ol>
<li>ChineseAnalyzer: 我-是-中-国-人</li>
<li>StandardAnalyzer: 我-是-中-国-人</li>
<li>CJKAnalyzer: 我是-是中-中国-国人</li>
<li>SmartChineseAnalyzer: 我-是-中国-人</li>
</ol>
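The unigram and bigram rows above can be reproduced with a short sketch (illustrative only, not part of this commit); the field name "f" and the Version constant are assumptions for the 3.x-era API:

import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;

public class ChineseTokenizationDemo {
  // Joins the tokens an analyzer produces for the text with '-' separators.
  static String tokens(Analyzer analyzer, String text) throws Exception {
    TokenStream ts = analyzer.tokenStream("f", new StringReader(text));
    TermAttribute term = ts.addAttribute(TermAttribute.class);
    StringBuilder sb = new StringBuilder();
    while (ts.incrementToken()) {
      if (sb.length() > 0) sb.append('-');
      sb.append(term.term());
    }
    ts.close();
    return sb.toString();
  }

  public static void main(String[] args) throws Exception {
    String phrase = "我是中国人";
    System.out.println(tokens(new StandardAnalyzer(Version.LUCENE_30), phrase)); // 我-是-中-国-人
    System.out.println(tokens(new CJKAnalyzer(Version.LUCENE_30), phrase));      // 我是-是中-中国-国人
  }
}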
@@ -24,11 +24,12 @@ import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

/** @deprecated Remove this test when ChineseAnalyzer is removed. */
@Deprecated
public class TestChineseTokenizer extends BaseTokenStreamTestCase
{
  public void testOtherLetterOffset() throws IOException

@@ -33,14 +33,14 @@ in such a case.
<div>
Three analyzers are provided for Chinese, each of which treats Chinese text in a different way.
<ul>
<li>ChineseAnalyzer (in the analyzers/cn package): Index unigrams (individual Chinese characters) as a token.
<li>StandardAnalyzer: Index unigrams (individual Chinese characters) as a token.
<li>CJKAnalyzer (in the analyzers/cjk package): Index bigrams (overlapping groups of two adjacent Chinese characters) as tokens.
<li>SmartChineseAnalyzer (in this package): Index words (attempt to segment Chinese text into words) as tokens.
</ul>

Example phrase: "我是中国人"
<ol>
<li>ChineseAnalyzer: 我-是-中-国-人</li>
<li>StandardAnalyzer: 我-是-中-国-人</li>
<li>CJKAnalyzer: 我是-是中-中国-国人</li>
<li>SmartChineseAnalyzer: 我-是-中国-人</li>
</ol>
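Similarly, the word-segmentation row could be reproduced with SmartChineseAnalyzer (a sketch under the same assumptions; the Version constant and field name are illustrative):

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.Version;

public class SmartChineseSketch {
  public static void main(String[] args) throws Exception {
    // SmartChineseAnalyzer attempts dictionary/statistical word segmentation
    // rather than emitting unigrams or bigrams.
    SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer(Version.LUCENE_30);
    TokenStream ts = analyzer.tokenStream("f", new StringReader("我是中国人"));
    TermAttribute term = ts.addAttribute(TermAttribute.class);
    StringBuilder sb = new StringBuilder();
    while (ts.incrementToken()) {
      if (sb.length() > 0) sb.append('-');
      sb.append(term.term());
    }
    ts.close();
    System.out.println(sb); // expected: 我-是-中国-人
  }
}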