mirror of https://github.com/apache/lucene.git
LUCENE-1793: Deprecate custom encoding support in Greek and Russian analyzers
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@806886 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
136f054a05
commit
1ebbe2abd1
|
@ -40,6 +40,10 @@ API Changes
|
|||
The SpanScorer API (now QueryScorer) has also been improved to more closely
|
||||
match the API of the previous QueryScorer implementation. (Mark Miller)
|
||||
|
||||
5. LUCENE-1793: Deprecate the custom encoding support in the Greek and Russian
|
||||
Analyzers. If you need to index text in these encodings, please use Java's
|
||||
character set conversion facilities (InputStreamReader, etc.) during I/O,
|
||||
so that Lucene can analyze this text as Unicode instead. (Robert Muir)
|
||||
|
||||
Bug fixes
|
||||
|
||||
|
|
|
@ -155,6 +155,7 @@ public final class GreekAnalyzer extends Analyzer
|
|||
* Charset for Greek letters.
|
||||
* Represents encoding for 24 lowercase Greek letters.
|
||||
* Predefined charsets can be taken from {@link GreekCharsets} class
|
||||
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
|
||||
*/
|
||||
private char[] charset;
|
||||
|
||||
|
@ -166,6 +167,7 @@ public final class GreekAnalyzer extends Analyzer
|
|||
|
||||
/**
|
||||
* Builds an analyzer.
|
||||
* @deprecated Use {@link #GreekAnalyzer()} instead.
|
||||
*/
|
||||
public GreekAnalyzer(char[] charset)
|
||||
{
|
||||
|
@ -175,6 +177,17 @@ public final class GreekAnalyzer extends Analyzer
|
|||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
* The analyzer uses the Unicode Greek charset ({@link GreekCharsets#UnicodeGreek}).
|
||||
* @param stopwords Array of stopwords to use.
|
||||
*/
|
||||
public GreekAnalyzer(String [] stopwords)
|
||||
{
|
||||
// This constructor takes no charset argument; it always selects Unicode.
charset = GreekCharsets.UnicodeGreek;
|
||||
stopSet = StopFilter.makeStopSet(stopwords);
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
* @deprecated Use {@link #GreekAnalyzer(String[])} instead.
|
||||
*/
|
||||
public GreekAnalyzer(char[] charset, String[] stopwords)
|
||||
{
|
||||
|
@ -182,8 +195,11 @@ public final class GreekAnalyzer extends Analyzer
|
|||
stopSet = StopFilter.makeStopSet(stopwords);
|
||||
}
|
||||
|
||||
// Takes greek stop words and translates them to a String array, using
|
||||
// the given charset
|
||||
/**
|
||||
* Takes greek stop words and translates them to a String array, using
|
||||
* the given charset.
|
||||
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
|
||||
*/
|
||||
private static String[] makeStopWords(char[] charset)
|
||||
{
|
||||
String[] res = new String[GREEK_STOP_WORDS.length];
|
||||
|
@ -203,6 +219,7 @@ public final class GreekAnalyzer extends Analyzer
|
|||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
* @deprecated Use {@link #GreekAnalyzer(Map)} instead.
|
||||
*/
|
||||
public GreekAnalyzer(char[] charset, Map stopwords)
|
||||
{
|
||||
|
@ -210,6 +227,15 @@ public final class GreekAnalyzer extends Analyzer
|
|||
stopSet = new HashSet(stopwords.keySet());
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
* The analyzer uses the Unicode Greek charset ({@link GreekCharsets#UnicodeGreek}).
* @param stopwords Map whose keys are the stop words; the values are not read.
|
||||
*/
|
||||
public GreekAnalyzer(Map stopwords)
|
||||
{
|
||||
// This constructor takes no charset argument; it always selects Unicode.
charset = GreekCharsets.UnicodeGreek;
|
||||
// Copy the keys so the stop set is independent of the caller's map.
stopSet = new HashSet(stopwords.keySet());
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
|
||||
*
|
||||
|
|
|
@ -24,6 +24,7 @@ package org.apache.lucene.analysis.el;
|
|||
* including accented ones. One should be able to add other encoding schemes (see RFC 1947) by adding
|
||||
* the definition of a new charset as well as the required logic in the toLowerCase() method.
|
||||
* </p>
|
||||
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
|
||||
*/
|
||||
public class GreekCharsets
|
||||
{
|
||||
|
|
|
@ -28,10 +28,16 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
|||
*/
|
||||
public final class GreekLowerCaseFilter extends TokenFilter
|
||||
{
|
||||
/**
|
||||
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
|
||||
*/
|
||||
char[] charset;
|
||||
|
||||
private TermAttribute termAtt;
|
||||
|
||||
/**
|
||||
* @deprecated Use {@link #GreekLowerCaseFilter(TokenStream)} instead.
|
||||
*/
|
||||
public GreekLowerCaseFilter(TokenStream in, char[] charset)
|
||||
{
|
||||
super(in);
|
||||
|
@ -39,6 +45,11 @@ public final class GreekLowerCaseFilter extends TokenFilter
|
|||
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
|
||||
}
|
||||
|
||||
/**
* Builds a filter over the given stream that uses the Unicode Greek charset
* ({@link GreekCharsets#UnicodeGreek}).
* @param in the token stream to wrap
*/
public GreekLowerCaseFilter(TokenStream in)
|
||||
{
|
||||
// Delegates to the charset-taking constructor with the Unicode charset.
this(in, GreekCharsets.UnicodeGreek);
|
||||
}
|
||||
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (input.incrementToken()) {
|
||||
char[] chArray = termAtt.termBuffer();
|
||||
|
|
|
@ -190,6 +190,7 @@ public final class RussianAnalyzer extends Analyzer
|
|||
* Charset for Russian letters.
|
||||
* Represents encoding for 32 lowercase Russian letters.
|
||||
* Predefined charsets can be taken from the {@link RussianCharsets} class
|
||||
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
|
||||
*/
|
||||
private char[] charset;
|
||||
|
||||
|
@ -202,6 +203,7 @@ public final class RussianAnalyzer extends Analyzer
|
|||
|
||||
/**
|
||||
* Builds an analyzer.
|
||||
* @deprecated Use {@link #RussianAnalyzer()} instead.
|
||||
*/
|
||||
public RussianAnalyzer(char[] charset)
|
||||
{
|
||||
|
@ -211,6 +213,7 @@ public final class RussianAnalyzer extends Analyzer
|
|||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
* @deprecated Use {@link #RussianAnalyzer(String[])} instead.
|
||||
*/
|
||||
public RussianAnalyzer(char[] charset, String[] stopwords)
|
||||
{
|
||||
|
@ -218,8 +221,19 @@ public final class RussianAnalyzer extends Analyzer
|
|||
stopSet = StopFilter.makeStopSet(stopwords);
|
||||
}
|
||||
|
||||
// Takes russian stop words and translates them to a String array, using
|
||||
// the given charset
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
* The analyzer uses the Unicode Russian charset ({@link RussianCharsets#UnicodeRussian}).
* @param stopwords Array of stopwords to use.
|
||||
*/
|
||||
public RussianAnalyzer(String[] stopwords)
|
||||
{
|
||||
// This constructor takes no charset argument; it always selects Unicode.
this.charset = RussianCharsets.UnicodeRussian;
|
||||
stopSet = StopFilter.makeStopSet(stopwords);
|
||||
}
|
||||
|
||||
/** Takes russian stop words and translates them to a String array, using
|
||||
* the given charset.
|
||||
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
|
||||
*/
|
||||
private static String[] makeStopWords(char[] charset)
|
||||
{
|
||||
String[] res = new String[RUSSIAN_STOP_WORDS.length];
|
||||
|
@ -240,6 +254,7 @@ public final class RussianAnalyzer extends Analyzer
|
|||
/**
|
||||
* Builds an analyzer with the given stop words.
|
||||
* @todo create a Set version of this ctor
|
||||
* @deprecated Use {@link #RussianAnalyzer(Map)} instead.
|
||||
*/
|
||||
public RussianAnalyzer(char[] charset, Map stopwords)
|
||||
{
|
||||
|
@ -247,6 +262,16 @@ public final class RussianAnalyzer extends Analyzer
|
|||
stopSet = new HashSet(stopwords.keySet());
|
||||
}
|
||||
|
||||
/**
|
||||
* Builds an analyzer with the given stop words.
* The analyzer uses the Unicode Russian charset ({@link RussianCharsets#UnicodeRussian}).
* @param stopwords Map whose keys are the stop words; the values are not read.
|
||||
* @todo create a Set version of this ctor
|
||||
*/
|
||||
public RussianAnalyzer(Map stopwords)
|
||||
{
|
||||
// This constructor takes no charset argument; it always selects Unicode.
charset = RussianCharsets.UnicodeRussian;
|
||||
// Copy the keys so the stop set is independent of the caller's map.
stopSet = new HashSet(stopwords.keySet());
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a {@link TokenStream} which tokenizes all the text in the
|
||||
* provided {@link Reader}.
|
||||
|
|
|
@ -24,7 +24,7 @@ package org.apache.lucene.analysis.ru;
|
|||
* One should be able to add other encoding schemes (like ISO-8859-5 or customized) by adding a new charset
|
||||
* and adding logic to toLowerCase() method for that charset.
|
||||
* </p>
|
||||
*
|
||||
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
|
||||
* @version $Id$
|
||||
*/
|
||||
public class RussianCharsets
|
||||
|
|
|
@ -37,21 +37,33 @@ import org.apache.lucene.analysis.LetterTokenizer; // for javadocs
|
|||
|
||||
public class RussianLetterTokenizer extends CharTokenizer
|
||||
{
|
||||
/** Construct a new LetterTokenizer. */
|
||||
/**
|
||||
* Charset this tokenizer uses.
|
||||
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
|
||||
*/
|
||||
private char[] charset;
|
||||
|
||||
/**
|
||||
* Constructs a tokenizer over the given reader that uses the given charset.
* @param in the reader passed to the superclass constructor
* @param charset the charset stored on this tokenizer
* @deprecated Use {@link #RussianLetterTokenizer(Reader)} instead.
|
||||
*/
|
||||
public RussianLetterTokenizer(Reader in, char[] charset)
|
||||
{
|
||||
super(in);
|
||||
this.charset = charset;
|
||||
}
|
||||
|
||||
/**
* Constructs a tokenizer over the given reader that uses the Unicode Russian
* charset ({@link RussianCharsets#UnicodeRussian}).
* @param in the reader to tokenize
*/
public RussianLetterTokenizer(Reader in)
|
||||
{
|
||||
// Delegates to the charset-taking constructor with the Unicode charset.
this(in, RussianCharsets.UnicodeRussian);
|
||||
}
|
||||
|
||||
/**
|
||||
* Collects only characters which satisfy
|
||||
* {@link Character#isLetter(char)}.
|
||||
*/
|
||||
protected boolean isTokenChar(char c)
|
||||
{
|
||||
/* in the next release, this can be implemented as isLetter(c) or [0-9] */
|
||||
if (Character.isLetter(c))
|
||||
return true;
|
||||
for (int i = 0; i < charset.length; i++)
|
||||
|
|
|
@ -31,10 +31,16 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
|||
*/
|
||||
public final class RussianLowerCaseFilter extends TokenFilter
|
||||
{
|
||||
/**
|
||||
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
|
||||
*/
|
||||
char[] charset;
|
||||
|
||||
private TermAttribute termAtt;
|
||||
|
||||
/**
|
||||
* @deprecated Use {@link #RussianLowerCaseFilter(TokenStream)} instead.
|
||||
*/
|
||||
public RussianLowerCaseFilter(TokenStream in, char[] charset)
|
||||
{
|
||||
super(in);
|
||||
|
@ -42,6 +48,11 @@ public final class RussianLowerCaseFilter extends TokenFilter
|
|||
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
|
||||
}
|
||||
|
||||
/**
* Builds a filter over the given stream that uses the Unicode Russian charset
* ({@link RussianCharsets#UnicodeRussian}).
* @param in the token stream to wrap
*/
public RussianLowerCaseFilter(TokenStream in)
|
||||
{
|
||||
// Delegates to the charset-taking constructor with the Unicode charset.
this(in, RussianCharsets.UnicodeRussian);
|
||||
}
|
||||
|
||||
public final boolean incrementToken() throws IOException
|
||||
{
|
||||
if (input.incrementToken()) {
|
||||
|
|
|
@ -42,6 +42,9 @@ public final class RussianStemFilter extends TokenFilter
|
|||
|
||||
private TermAttribute termAtt;
|
||||
|
||||
/**
|
||||
* @deprecated Use {@link #RussianStemFilter(TokenStream)} instead.
|
||||
*/
|
||||
public RussianStemFilter(TokenStream in, char[] charset)
|
||||
{
|
||||
super(in);
|
||||
|
@ -49,6 +52,10 @@ public final class RussianStemFilter extends TokenFilter
|
|||
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
|
||||
}
|
||||
|
||||
/**
* Builds a filter over the given stream that uses the Unicode Russian charset
* ({@link RussianCharsets#UnicodeRussian}).
* @param in the token stream to wrap
*/
public RussianStemFilter(TokenStream in)
|
||||
{
|
||||
// Delegates to the charset-taking constructor with the Unicode charset.
this(in, RussianCharsets.UnicodeRussian);
|
||||
}
|
||||
/**
|
||||
* Returns the next token in the stream, or null at EOS
|
||||
*/
|
||||
|
|
|
@ -25,6 +25,9 @@ package org.apache.lucene.analysis.ru;
|
|||
*/
|
||||
class RussianStemmer
|
||||
{
|
||||
/**
|
||||
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
|
||||
*/
|
||||
private char[] charset;
|
||||
|
||||
// positions of RV, R1 and R2 respectively
|
||||
|
@ -255,6 +258,7 @@ class RussianStemmer
|
|||
|
||||
/**
|
||||
* RussianStemmer constructor comment.
|
||||
* @deprecated Use {@link #RussianStemmer()} instead.
|
||||
*/
|
||||
public RussianStemmer(char[] charset)
|
||||
{
|
||||
|
@ -529,6 +533,7 @@ class RussianStemmer
|
|||
* Insert the method's description here.
|
||||
* Creation date: (16/03/2002 10:58:42 PM)
|
||||
* @param newCharset char[]
|
||||
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
|
||||
*/
|
||||
public void setCharset(char[] newCharset)
|
||||
{
|
||||
|
@ -620,6 +625,7 @@ class RussianStemmer
|
|||
|
||||
/**
|
||||
* Static method for stemming with different charsets
|
||||
* @deprecated Use {@link #stemWord(String)} instead.
|
||||
*/
|
||||
public static String stem(String theWord, char[] charset)
|
||||
{
|
||||
|
@ -627,4 +633,14 @@ class RussianStemmer
|
|||
stemmer.setCharset(charset);
|
||||
return stemmer.stem(theWord);
|
||||
}
|
||||
|
||||
/**
|
||||
* Static method for stemming.
* Creates a fresh {@link RussianStemmer}, sets its charset to
* {@link RussianCharsets#UnicodeRussian}, and stems the given word with it.
* @param theWord the word to stem
* @return the result of {@code stem(theWord)} on the new stemmer
|
||||
*/
|
||||
public static String stemWord(String theWord)
|
||||
{
|
||||
// A new stemmer per call, so no state is shared between invocations.
RussianStemmer stemmer = new RussianStemmer();
|
||||
stemmer.setCharset(RussianCharsets.UnicodeRussian);
|
||||
return stemmer.stem(theWord);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -60,7 +60,7 @@ public class TestRussianAnalyzer extends TestCase
|
|||
|
||||
public void testUnicode() throws IOException
|
||||
{
|
||||
RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.UnicodeRussian);
|
||||
RussianAnalyzer ra = new RussianAnalyzer();
|
||||
inWords =
|
||||
new InputStreamReader(
|
||||
new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/testUTF8.txt")),
|
||||
|
@ -75,8 +75,7 @@ public class TestRussianAnalyzer extends TestCase
|
|||
|
||||
RussianLetterTokenizer sample =
|
||||
new RussianLetterTokenizer(
|
||||
sampleUnicode,
|
||||
RussianCharsets.UnicodeRussian);
|
||||
sampleUnicode);
|
||||
|
||||
TermAttribute text = (TermAttribute) in.getAttribute(TermAttribute.class);
|
||||
TermAttribute sampleText = (TermAttribute) sample.getAttribute(TermAttribute.class);
|
||||
|
|
Loading…
Reference in New Issue