LUCENE-1793: Deprecate custom encoding support in Greek and Russian analyzers

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@806886 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2009-08-22 20:36:06 +00:00
parent 136f054a05
commit 1ebbe2abd1
11 changed files with 121 additions and 9 deletions

View File

@ -40,6 +40,10 @@ API Changes
The SpanScorer API (now QueryScorer) has also been improved to more closely
match the API of the previous QueryScorer implementation. (Mark Miller)
5. LUCENE-1793: Deprecate the custom encoding support in the Greek and Russian
Analyzers. If you need to index text in these encodings, please use Java's
character set conversion facilities (InputStreamReader, etc) during I/O,
so that Lucene can analyze this text as Unicode instead. (Robert Muir)
Bug fixes

View File

@ -155,6 +155,7 @@ public final class GreekAnalyzer extends Analyzer
* Charset for Greek letters.
* Represents encoding for 24 lowercase Greek letters.
* Predefined charsets can be taken from {@link GreekCharsets} class
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
*/
private char[] charset;
@ -166,15 +167,27 @@ public final class GreekAnalyzer extends Analyzer
/**
* Builds an analyzer.
* @deprecated Use {@link #GreekAnalyzer()} instead.
*/
public GreekAnalyzer(char[] charset)
{
this.charset = charset;
stopSet = StopFilter.makeStopSet(makeStopWords(charset));
}
/**
* Builds an analyzer with the given stop words.
* @param stopwords Array of stopwords to use.
*/
public GreekAnalyzer(String [] stopwords)
{
charset = GreekCharsets.UnicodeGreek;
stopSet = StopFilter.makeStopSet(stopwords);
}
/**
* Builds an analyzer with the given stop words.
* @deprecated Use {@link #GreekAnalyzer(String[])} instead.
*/
public GreekAnalyzer(char[] charset, String[] stopwords)
{
@ -182,8 +195,11 @@ public final class GreekAnalyzer extends Analyzer
stopSet = StopFilter.makeStopSet(stopwords);
}
// Takes greek stop words and translates them to a String array, using
// the given charset
/**
* Takes greek stop words and translates them to a String array, using
* the given charset.
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
*/
private static String[] makeStopWords(char[] charset)
{
String[] res = new String[GREEK_STOP_WORDS.length];
@ -203,12 +219,22 @@ public final class GreekAnalyzer extends Analyzer
/**
* Builds an analyzer with the given stop words.
* @deprecated Use {@link #GreekAnalyzer(Map)} instead.
*/
public GreekAnalyzer(char[] charset, Map stopwords)
{
this.charset = charset;
stopSet = new HashSet(stopwords.keySet());
}
/**
* Builds an analyzer with the given stop words.
*/
public GreekAnalyzer(Map stopwords)
{
charset = GreekCharsets.UnicodeGreek;
stopSet = new HashSet(stopwords.keySet());
}
/**
* Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.

View File

@ -24,6 +24,7 @@ package org.apache.lucene.analysis.el;
* including accented ones. One should be able to add other encoding schemes (see RFC 1947) by adding
* the definition of a new charset as well as the required logic in the toLowerCase() method.
* </p>
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
*/
public class GreekCharsets
{

View File

@ -28,16 +28,27 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute;
*/
public final class GreekLowerCaseFilter extends TokenFilter
{
/**
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
*/
char[] charset;
private TermAttribute termAtt;
/**
* @deprecated Use {@link #GreekLowerCaseFilter(TokenStream)} instead.
*/
public GreekLowerCaseFilter(TokenStream in, char[] charset)
{
super(in);
this.charset = charset;
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
public GreekLowerCaseFilter(TokenStream in)
{
this(in, GreekCharsets.UnicodeGreek);
}
public boolean incrementToken() throws IOException {
if (input.incrementToken()) {

View File

@ -190,6 +190,7 @@ public final class RussianAnalyzer extends Analyzer
* Charset for Russian letters.
* Represents encoding for 32 lowercase Russian letters.
* Predefined charsets can be taken from RussianCharSets class
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
*/
private char[] charset;
@ -202,6 +203,7 @@ public final class RussianAnalyzer extends Analyzer
/**
* Builds an analyzer.
* @deprecated Use {@link #RussianAnalyzer()} instead.
*/
public RussianAnalyzer(char[] charset)
{
@ -211,15 +213,27 @@ public final class RussianAnalyzer extends Analyzer
/**
* Builds an analyzer with the given stop words.
* @deprecated Use {@link #RussianAnalyzer(String[])} instead.
*/
public RussianAnalyzer(char[] charset, String[] stopwords)
{
this.charset = charset;
stopSet = StopFilter.makeStopSet(stopwords);
}
/**
* Builds an analyzer with the given stop words.
*/
public RussianAnalyzer(String[] stopwords)
{
this.charset = RussianCharsets.UnicodeRussian;
stopSet = StopFilter.makeStopSet(stopwords);
}
// Takes russian stop words and translates them to a String array, using
// the given charset
/** Takes russian stop words and translates them to a String array, using
* the given charset.
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
*/
private static String[] makeStopWords(char[] charset)
{
String[] res = new String[RUSSIAN_STOP_WORDS.length];
@ -240,12 +254,23 @@ public final class RussianAnalyzer extends Analyzer
/**
* Builds an analyzer with the given stop words.
* @todo create a Set version of this ctor
* @deprecated Use {@link #RussianAnalyzer(Map)} instead.
*/
public RussianAnalyzer(char[] charset, Map stopwords)
{
this.charset = charset;
stopSet = new HashSet(stopwords.keySet());
}
/**
* Builds an analyzer with the given stop words.
* @todo create a Set version of this ctor
*/
public RussianAnalyzer(Map stopwords)
{
charset = RussianCharsets.UnicodeRussian;
stopSet = new HashSet(stopwords.keySet());
}
/**
* Creates a {@link TokenStream} which tokenizes all the text in the

View File

@ -24,7 +24,7 @@ package org.apache.lucene.analysis.ru;
* One should be able to add other encoding schemes (like ISO-8859-5 or customized) by adding a new charset
* and adding logic to toLowerCase() method for that charset.
* </p>
*
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
* @version $Id$
*/
public class RussianCharsets

View File

@ -37,14 +37,25 @@ import org.apache.lucene.analysis.LetterTokenizer; // for javadocs
public class RussianLetterTokenizer extends CharTokenizer
{
/** Construct a new LetterTokenizer. */
/**
* Charset this tokenizer uses.
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
*/
private char[] charset;
/**
* @deprecated Use {@link #RussianLetterTokenizer(Reader)} instead.
*/
public RussianLetterTokenizer(Reader in, char[] charset)
{
super(in);
this.charset = charset;
}
public RussianLetterTokenizer(Reader in)
{
this(in, RussianCharsets.UnicodeRussian);
}
/**
* Collects only characters which satisfy
@ -52,6 +63,7 @@ public class RussianLetterTokenizer extends CharTokenizer
*/
protected boolean isTokenChar(char c)
{
/* in the next release, this can be implemented as isLetter(c) or [0-9] */
if (Character.isLetter(c))
return true;
for (int i = 0; i < charset.length; i++)

View File

@ -31,16 +31,27 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute;
*/
public final class RussianLowerCaseFilter extends TokenFilter
{
/**
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
*/
char[] charset;
private TermAttribute termAtt;
/**
* @deprecated Use {@link #RussianLowerCaseFilter(TokenStream)} instead.
*/
public RussianLowerCaseFilter(TokenStream in, char[] charset)
{
super(in);
this.charset = charset;
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
public RussianLowerCaseFilter(TokenStream in)
{
this(in, RussianCharsets.UnicodeRussian);
}
public final boolean incrementToken() throws IOException
{

View File

@ -42,6 +42,9 @@ public final class RussianStemFilter extends TokenFilter
private TermAttribute termAtt;
/**
* @deprecated Use {@link #RussianStemFilter(TokenStream)} instead.
*/
public RussianStemFilter(TokenStream in, char[] charset)
{
super(in);
@ -49,6 +52,10 @@ public final class RussianStemFilter extends TokenFilter
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
public RussianStemFilter(TokenStream in)
{
this(in, RussianCharsets.UnicodeRussian);
}
/**
* Returns the next token in the stream, or null at EOS
*/

View File

@ -25,6 +25,9 @@ package org.apache.lucene.analysis.ru;
*/
class RussianStemmer
{
/**
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
*/
private char[] charset;
// positions of RV, R1 and R2 respectively
@ -255,6 +258,7 @@ class RussianStemmer
/**
* RussianStemmer constructor comment.
* @deprecated Use {@link #RussianStemmer()} instead.
*/
public RussianStemmer(char[] charset)
{
@ -529,6 +533,7 @@ class RussianStemmer
* Insert the method's description here.
* Creation date: (16/03/2002 10:58:42 PM)
* @param newCharset char[]
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
*/
public void setCharset(char[] newCharset)
{
@ -620,6 +625,7 @@ class RussianStemmer
/**
* Static method for stemming with different charsets
* @deprecated Use {@link #stemWord(String)} instead.
*/
public static String stem(String theWord, char[] charset)
{
@ -627,4 +633,14 @@ class RussianStemmer
stemmer.setCharset(charset);
return stemmer.stem(theWord);
}
/**
* Static method for stemming.
*/
public static String stemWord(String theWord)
{
RussianStemmer stemmer = new RussianStemmer();
stemmer.setCharset(RussianCharsets.UnicodeRussian);
return stemmer.stem(theWord);
}
}

View File

@ -60,7 +60,7 @@ public class TestRussianAnalyzer extends TestCase
public void testUnicode() throws IOException
{
RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.UnicodeRussian);
RussianAnalyzer ra = new RussianAnalyzer();
inWords =
new InputStreamReader(
new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/testUTF8.txt")),
@ -75,8 +75,7 @@ public class TestRussianAnalyzer extends TestCase
RussianLetterTokenizer sample =
new RussianLetterTokenizer(
sampleUnicode,
RussianCharsets.UnicodeRussian);
sampleUnicode);
TermAttribute text = (TermAttribute) in.getAttribute(TermAttribute.class);
TermAttribute sampleText = (TermAttribute) sample.getAttribute(TermAttribute.class);