mirror of
https://github.com/apache/lucene.git
synced 2025-02-21 17:46:28 +00:00
LUCENE-1882: move SmartChineseAnalyzer to the 'correct' package ... this commit is based on a sequence of svn commands and a patch provided by Robert Muir in LUCENE-1862
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@810208 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
566aaf28e7
commit
e5cb7f668a
@ -1,50 +0,0 @@
|
|||||||
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
|
|
||||||
<!--
|
|
||||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
|
||||||
contributor license agreements. See the NOTICE file distributed with
|
|
||||||
this work for additional information regarding copyright ownership.
|
|
||||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
|
||||||
(the "License"); you may not use this file except in compliance with
|
|
||||||
the License. You may obtain a copy of the License at
|
|
||||||
|
|
||||||
http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
|
|
||||||
Unless required by applicable law or agreed to in writing, software
|
|
||||||
distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
See the License for the specific language governing permissions and
|
|
||||||
limitations under the License.
|
|
||||||
-->
|
|
||||||
<html>
|
|
||||||
<head>
|
|
||||||
<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
|
|
||||||
</head>
|
|
||||||
<body>
|
|
||||||
<div>
|
|
||||||
Analyzer for Simplified Chinese, which indexes words.
|
|
||||||
</div>
|
|
||||||
<div>
|
|
||||||
<font color="#FF0000">
|
|
||||||
WARNING: The status of the analyzers/smartcn <b>analysis.cn</b> package is experimental. The APIs
|
|
||||||
and file formats introduced here might change in the future and will not be supported anymore
|
|
||||||
in such a case.
|
|
||||||
</font>
|
|
||||||
</div>
|
|
||||||
<div>
|
|
||||||
Three analyzers are provided for Chinese, each of which treats Chinese text in a different way.
|
|
||||||
<ul>
|
|
||||||
<li>ChineseAnalyzer (in the analyzers/cn package): Index unigrams (individual Chinese characters) as tokens.
|
|
||||||
<li>CJKAnalyzer (in the analyzers/cjk package): Index bigrams (overlapping groups of two adjacent Chinese characters) as tokens.
|
|
||||||
<li>SmartChineseAnalyzer (in this package): Index words (attempt to segment Chinese text into words) as tokens.
|
|
||||||
</ul>
|
|
||||||
|
|
||||||
Example phrase: "我是中国人"
|
|
||||||
<ol>
|
|
||||||
<li>ChineseAnalyzer: 我-是-中-国-人</li>
|
|
||||||
<li>CJKAnalyzer: 我是-是中-中国-国人</li>
|
|
||||||
<li>SmartChineseAnalyzer: 我-是-中国-人</li>
|
|
||||||
</ol>
|
|
||||||
</div>
|
|
||||||
|
|
||||||
</body>
|
|
||||||
</html>
|
|
@ -28,7 +28,7 @@ import java.util.Properties;
|
|||||||
* SmartChineseAnalyzer has a built-in dictionary and stopword list out of the box.
|
* SmartChineseAnalyzer has a built-in dictionary and stopword list out of the box.
|
||||||
* </p>
|
* </p>
|
||||||
* <p><font color="#FF0000">
|
* <p><font color="#FF0000">
|
||||||
* WARNING: The status of the analyzers/smartcn <b>analysis.cn</b> package is experimental.
|
* WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental.
|
||||||
* The APIs and file formats introduced here might change in the future and will not be
|
* The APIs and file formats introduced here might change in the future and will not be
|
||||||
* supported anymore in such a case.</font>
|
* supported anymore in such a case.</font>
|
||||||
* </p>
|
* </p>
|
||||||
|
@ -20,7 +20,7 @@ package org.apache.lucene.analysis.cn.smart;
|
|||||||
/**
|
/**
|
||||||
* Internal SmartChineseAnalyzer character type constants.
|
* Internal SmartChineseAnalyzer character type constants.
|
||||||
* <p><font color="#FF0000">
|
* <p><font color="#FF0000">
|
||||||
* WARNING: The status of the analyzers/smartcn <b>analysis.cn</b> package is experimental.
|
* WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental.
|
||||||
* The APIs and file formats introduced here might change in the future and will not be
|
* The APIs and file formats introduced here might change in the future and will not be
|
||||||
* supported anymore in such a case.</font>
|
* supported anymore in such a case.</font>
|
||||||
* </p>
|
* </p>
|
||||||
|
@ -32,7 +32,7 @@ import org.apache.lucene.util.AttributeSource;
|
|||||||
* The output tokens can then be broken into words with {@link WordTokenFilter}
|
* The output tokens can then be broken into words with {@link WordTokenFilter}
|
||||||
* </p>
|
* </p>
|
||||||
* <p><font color="#FF0000">
|
* <p><font color="#FF0000">
|
||||||
* WARNING: The status of the analyzers/smartcn <b>analysis.cn</b> package is experimental.
|
* WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental.
|
||||||
* The APIs and file formats introduced here might change in the future and will not be
|
* The APIs and file formats introduced here might change in the future and will not be
|
||||||
* supported anymore in such a case.</font>
|
* supported anymore in such a case.</font>
|
||||||
* </p>
|
* </p>
|
||||||
|
@ -15,7 +15,7 @@
|
|||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
package org.apache.lucene.analysis.cn;
|
package org.apache.lucene.analysis.cn.smart;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
@ -51,7 +51,7 @@ import org.apache.lucene.analysis.cn.smart.WordTokenFilter;
|
|||||||
* Thanks to ICTCLAS for their hard work, and for contributing the data under the Apache 2 License!
|
* Thanks to ICTCLAS for their hard work, and for contributing the data under the Apache 2 License!
|
||||||
* </p>
|
* </p>
|
||||||
* <p><font color="#FF0000">
|
* <p><font color="#FF0000">
|
||||||
* WARNING: The status of the analyzers/smartcn <b>analysis.cn</b> package is experimental.
|
* WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental.
|
||||||
* The APIs and file formats introduced here might change in the future and will not be
|
* The APIs and file formats introduced here might change in the future and will not be
|
||||||
* supported anymore in such a case.</font>
|
* supported anymore in such a case.</font>
|
||||||
* </p>
|
* </p>
|
@ -22,7 +22,7 @@ import org.apache.lucene.analysis.cn.smart.hhmm.SegTokenFilter; // for javadoc
|
|||||||
/**
|
/**
|
||||||
* SmartChineseAnalyzer utility constants and methods
|
* SmartChineseAnalyzer utility constants and methods
|
||||||
* <p><font color="#FF0000">
|
* <p><font color="#FF0000">
|
||||||
* WARNING: The status of the analyzers/smartcn <b>analysis.cn</b> package is experimental.
|
* WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental.
|
||||||
* The APIs and file formats introduced here might change in the future and will not be
|
* The APIs and file formats introduced here might change in the future and will not be
|
||||||
* supported anymore in such a case.</font>
|
* supported anymore in such a case.</font>
|
||||||
* </p>
|
* </p>
|
||||||
|
@ -27,7 +27,7 @@ import org.apache.lucene.analysis.cn.smart.hhmm.SegTokenFilter;
|
|||||||
/**
|
/**
|
||||||
* Segment a sentence of Chinese text into words.
|
* Segment a sentence of Chinese text into words.
|
||||||
* <p><font color="#FF0000">
|
* <p><font color="#FF0000">
|
||||||
* WARNING: The status of the analyzers/smartcn <b>analysis.cn</b> package is experimental.
|
* WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental.
|
||||||
* The APIs and file formats introduced here might change in the future and will not be
|
* The APIs and file formats introduced here might change in the future and will not be
|
||||||
* supported anymore in such a case.</font>
|
* supported anymore in such a case.</font>
|
||||||
* </p>
|
* </p>
|
||||||
|
@ -31,7 +31,7 @@ import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
|||||||
/**
|
/**
|
||||||
* A {@link TokenFilter} that breaks sentences into words.
|
* A {@link TokenFilter} that breaks sentences into words.
|
||||||
* <p><font color="#FF0000">
|
* <p><font color="#FF0000">
|
||||||
* WARNING: The status of the analyzers/smartcn <b>analysis.cn</b> package is experimental.
|
* WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental.
|
||||||
* The APIs and file formats introduced here might change in the future and will not be
|
* The APIs and file formats introduced here might change in the future and will not be
|
||||||
* supported anymore in such a case.</font>
|
* supported anymore in such a case.</font>
|
||||||
* </p>
|
* </p>
|
||||||
|
@ -20,7 +20,7 @@ package org.apache.lucene.analysis.cn.smart;
|
|||||||
/**
|
/**
|
||||||
* Internal SmartChineseAnalyzer token type constants
|
* Internal SmartChineseAnalyzer token type constants
|
||||||
* <p><font color="#FF0000">
|
* <p><font color="#FF0000">
|
||||||
* WARNING: The status of the analyzers/smartcn <b>analysis.cn</b> package is experimental.
|
* WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental.
|
||||||
* The APIs and file formats introduced here might change in the future and will not be
|
* The APIs and file formats introduced here might change in the future and will not be
|
||||||
* supported anymore in such a case.</font>
|
* supported anymore in such a case.</font>
|
||||||
* </p>
|
* </p>
|
||||||
|
@ -27,7 +27,7 @@ import java.io.UnsupportedEncodingException;
|
|||||||
* Contains methods for dealing with GB2312 encoding.
|
* Contains methods for dealing with GB2312 encoding.
|
||||||
* </p>
|
* </p>
|
||||||
* <p><font color="#FF0000">
|
* <p><font color="#FF0000">
|
||||||
* WARNING: The status of the analyzers/smartcn <b>analysis.cn</b> package is experimental.
|
* WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental.
|
||||||
* The APIs and file formats introduced here might change in the future and will not be
|
* The APIs and file formats introduced here might change in the future and will not be
|
||||||
* supported anymore in such a case.</font>
|
* supported anymore in such a case.</font>
|
||||||
* </p>
|
* </p>
|
||||||
|
@ -32,7 +32,7 @@ import org.apache.lucene.analysis.cn.smart.Utility;
|
|||||||
* For each start offset, a list of possible token pairs is stored.
|
* For each start offset, a list of possible token pairs is stored.
|
||||||
* </p>
|
* </p>
|
||||||
* <p><font color="#FF0000">
|
* <p><font color="#FF0000">
|
||||||
* WARNING: The status of the analyzers/smartcn <b>analysis.cn</b> package is experimental.
|
* WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental.
|
||||||
* The APIs and file formats introduced here might change in the future and will not be
|
* The APIs and file formats introduced here might change in the future and will not be
|
||||||
* supported anymore in such a case.</font>
|
* supported anymore in such a case.</font>
|
||||||
* </p>
|
* </p>
|
||||||
|
@ -35,7 +35,7 @@ import org.apache.lucene.analysis.cn.smart.AnalyzerProfile;
|
|||||||
/**
|
/**
|
||||||
* SmartChineseAnalyzer Bigram dictionary.
|
* SmartChineseAnalyzer Bigram dictionary.
|
||||||
* <p><font color="#FF0000">
|
* <p><font color="#FF0000">
|
||||||
* WARNING: The status of the analyzers/smartcn <b>analysis.cn</b> package is experimental.
|
* WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental.
|
||||||
* The APIs and file formats introduced here might change in the future and will not be
|
* The APIs and file formats introduced here might change in the future and will not be
|
||||||
* supported anymore in such a case.</font>
|
* supported anymore in such a case.</font>
|
||||||
* </p>
|
* </p>
|
||||||
|
@ -27,7 +27,7 @@ import org.apache.lucene.analysis.cn.smart.hhmm.PathNode;//javadoc @link
|
|||||||
/**
|
/**
|
||||||
* Finds the optimal segmentation of a sentence into Chinese words
|
* Finds the optimal segmentation of a sentence into Chinese words
|
||||||
* <p><font color="#FF0000">
|
* <p><font color="#FF0000">
|
||||||
* WARNING: The status of the analyzers/smartcn <b>analysis.cn</b> package is experimental.
|
* WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental.
|
||||||
* The APIs and file formats introduced here might change in the future and will not be
|
* The APIs and file formats introduced here might change in the future and will not be
|
||||||
* supported anymore in such a case.</font>
|
* supported anymore in such a case.</font>
|
||||||
* </p>
|
* </p>
|
||||||
|
@ -23,7 +23,7 @@ package org.apache.lucene.analysis.cn.smart.hhmm;
|
|||||||
* Used by {@link BiSegGraph} to maximize the segmentation with the Viterbi algorithm.
|
* Used by {@link BiSegGraph} to maximize the segmentation with the Viterbi algorithm.
|
||||||
* </p>
|
* </p>
|
||||||
* <p><font color="#FF0000">
|
* <p><font color="#FF0000">
|
||||||
* WARNING: The status of the analyzers/smartcn <b>analysis.cn</b> package is experimental.
|
* WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental.
|
||||||
* The APIs and file formats introduced here might change in the future and will not be
|
* The APIs and file formats introduced here might change in the future and will not be
|
||||||
* supported anymore in such a case.</font>
|
* supported anymore in such a case.</font>
|
||||||
* </p>
|
* </p>
|
||||||
|
@ -29,7 +29,7 @@ import java.util.Map;
|
|||||||
* For each start offset, a list of possible tokens is stored.
|
* For each start offset, a list of possible tokens is stored.
|
||||||
* </p>
|
* </p>
|
||||||
* <p><font color="#FF0000">
|
* <p><font color="#FF0000">
|
||||||
* WARNING: The status of the analyzers/smartcn <b>analysis.cn</b> package is experimental.
|
* WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental.
|
||||||
* The APIs and file formats introduced here might change in the future and will not be
|
* The APIs and file formats introduced here might change in the future and will not be
|
||||||
* supported anymore in such a case.</font>
|
* supported anymore in such a case.</font>
|
||||||
* </p>
|
* </p>
|
||||||
|
@ -24,7 +24,7 @@ import org.apache.lucene.analysis.cn.smart.WordType; // for javadocs
|
|||||||
/**
|
/**
|
||||||
* SmartChineseAnalyzer internal token
|
* SmartChineseAnalyzer internal token
|
||||||
* <p><font color="#FF0000">
|
* <p><font color="#FF0000">
|
||||||
* WARNING: The status of the analyzers/smartcn <b>analysis.cn</b> package is experimental.
|
* WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental.
|
||||||
* The APIs and file formats introduced here might change in the future and will not be
|
* The APIs and file formats introduced here might change in the future and will not be
|
||||||
* supported anymore in such a case.</font>
|
* supported anymore in such a case.</font>
|
||||||
* </p>
|
* </p>
|
||||||
|
@ -26,7 +26,7 @@ import org.apache.lucene.analysis.cn.smart.WordType;
|
|||||||
* Additionally, all punctuation is converted into {@link Utility#COMMON_DELIMITER}
|
* Additionally, all punctuation is converted into {@link Utility#COMMON_DELIMITER}
|
||||||
* </p>
|
* </p>
|
||||||
* <p><font color="#FF0000">
|
* <p><font color="#FF0000">
|
||||||
* WARNING: The status of the analyzers/smartcn <b>analysis.cn</b> package is experimental.
|
* WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental.
|
||||||
* The APIs and file formats introduced here might change in the future and will not be
|
* The APIs and file formats introduced here might change in the future and will not be
|
||||||
* supported anymore in such a case.</font>
|
* supported anymore in such a case.</font>
|
||||||
* </p>
|
* </p>
|
||||||
|
@ -22,7 +22,7 @@ import java.util.Arrays;
|
|||||||
/**
|
/**
|
||||||
* A pair of tokens in {@link SegGraph}
|
* A pair of tokens in {@link SegGraph}
|
||||||
* <p><font color="#FF0000">
|
* <p><font color="#FF0000">
|
||||||
* WARNING: The status of the analyzers/smartcn <b>analysis.cn</b> package is experimental.
|
* WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental.
|
||||||
* The APIs and file formats introduced here might change in the future and will not be
|
* The APIs and file formats introduced here might change in the future and will not be
|
||||||
* supported anymore in such a case.</font>
|
* supported anymore in such a case.</font>
|
||||||
* </p>
|
* </p>
|
||||||
|
@ -37,7 +37,7 @@ import org.apache.lucene.analysis.cn.smart.Utility;
|
|||||||
* SmartChineseAnalyzer Word Dictionary
|
* SmartChineseAnalyzer Word Dictionary
|
||||||
*
|
*
|
||||||
* <p><font color="#FF0000">
|
* <p><font color="#FF0000">
|
||||||
* WARNING: The status of the analyzers/smartcn <b>analysis.cn</b> package is experimental.
|
* WARNING: The status of the analyzers/smartcn <b>analysis.cn.smart</b> package is experimental.
|
||||||
* The APIs and file formats introduced here might change in the future and will not be
|
* The APIs and file formats introduced here might change in the future and will not be
|
||||||
* supported anymore in such a case.</font>
|
* supported anymore in such a case.</font>
|
||||||
* </p>
|
* </p>
|
||||||
|
@ -15,7 +15,7 @@
|
|||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
package org.apache.lucene.analysis.cn;
|
package org.apache.lucene.analysis.cn.smart;
|
||||||
|
|
||||||
import java.io.FileNotFoundException;
|
import java.io.FileNotFoundException;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
Loading…
x
Reference in New Issue
Block a user