LUCENE-1793: Deprecate custom encoding support in Greek and Russian analyzers

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@806886 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2009-08-22 20:36:06 +00:00
parent 136f054a05
commit 1ebbe2abd1
11 changed files with 121 additions and 9 deletions

View File

@ -40,6 +40,10 @@ API Changes
The SpanScorer API (now QueryScorer) has also been improved to more closely The SpanScorer API (now QueryScorer) has also been improved to more closely
match the API of the previous QueryScorer implementation. (Mark Miller) match the API of the previous QueryScorer implementation. (Mark Miller)
5. LUCENE-1793: Deprecate the custom encoding support in the Greek and Russian
Analyzers. If you need to index text in these encodings, please use Java's
character set conversion facilities (InputStreamReader, etc) during I/O,
so that Lucene can analyze this text as Unicode instead. (Robert Muir)
Bug fixes Bug fixes

View File

@ -155,6 +155,7 @@ public final class GreekAnalyzer extends Analyzer
* Charset for Greek letters. * Charset for Greek letters.
* Represents encoding for 24 lowercase Greek letters. * Represents encoding for 24 lowercase Greek letters.
* Predefined charsets can be taken from {@link GreekCharsets} class * Predefined charsets can be taken from {@link GreekCharsets} class
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
*/ */
private char[] charset; private char[] charset;
@ -166,6 +167,7 @@ public final class GreekAnalyzer extends Analyzer
/** /**
* Builds an analyzer. * Builds an analyzer.
* @deprecated Use {@link #GreekAnalyzer()} instead.
*/ */
public GreekAnalyzer(char[] charset) public GreekAnalyzer(char[] charset)
{ {
@ -175,6 +177,17 @@ public final class GreekAnalyzer extends Analyzer
/** /**
* Builds an analyzer with the given stop words. * Builds an analyzer with the given stop words.
* @param stopwords Array of stopwords to use.
*/
public GreekAnalyzer(String [] stopwords)
{
    // Non-deprecated replacement ctor: always analyzes Unicode Greek text.
    // The charset field remains only to support the deprecated ctors until 3.0.
    charset = GreekCharsets.UnicodeGreek;
    stopSet = StopFilter.makeStopSet(stopwords);
}
/**
* Builds an analyzer with the given stop words.
* @deprecated Use {@link #GreekAnalyzer(String[])} instead.
*/ */
public GreekAnalyzer(char[] charset, String[] stopwords) public GreekAnalyzer(char[] charset, String[] stopwords)
{ {
@ -182,8 +195,11 @@ public final class GreekAnalyzer extends Analyzer
stopSet = StopFilter.makeStopSet(stopwords); stopSet = StopFilter.makeStopSet(stopwords);
} }
// Takes greek stop words and translates them to a String array, using /**
// the given charset * Takes greek stop words and translates them to a String array, using
* the given charset.
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
*/
private static String[] makeStopWords(char[] charset) private static String[] makeStopWords(char[] charset)
{ {
String[] res = new String[GREEK_STOP_WORDS.length]; String[] res = new String[GREEK_STOP_WORDS.length];
@ -203,6 +219,7 @@ public final class GreekAnalyzer extends Analyzer
/** /**
* Builds an analyzer with the given stop words. * Builds an analyzer with the given stop words.
* @deprecated Use {@link #GreekAnalyzer(Map)} instead.
*/ */
public GreekAnalyzer(char[] charset, Map stopwords) public GreekAnalyzer(char[] charset, Map stopwords)
{ {
@ -210,6 +227,15 @@ public final class GreekAnalyzer extends Analyzer
stopSet = new HashSet(stopwords.keySet()); stopSet = new HashSet(stopwords.keySet());
} }
/**
 * Builds an analyzer with the given stop words, analyzing Unicode Greek text.
 * @param stopwords map whose keys are the stop words to use
 */
public GreekAnalyzer(Map stopwords)
{
    this.stopSet = new HashSet(stopwords.keySet());
    this.charset = GreekCharsets.UnicodeGreek;
}
/** /**
* Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}. * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
* *

View File

@ -24,6 +24,7 @@ package org.apache.lucene.analysis.el;
* including accented ones. One should be able to add other encoding schemes (see RFC 1947) by adding * including accented ones. One should be able to add other encoding schemes (see RFC 1947) by adding
* the definition of a new charset as well as the required logic in the toLowerCase() method. * the definition of a new charset as well as the required logic in the toLowerCase() method.
* </p> * </p>
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
*/ */
public class GreekCharsets public class GreekCharsets
{ {

View File

@ -28,10 +28,16 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute;
*/ */
public final class GreekLowerCaseFilter extends TokenFilter public final class GreekLowerCaseFilter extends TokenFilter
{ {
/**
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
*/
char[] charset; char[] charset;
private TermAttribute termAtt; private TermAttribute termAtt;
/**
* @deprecated Use {@link #GreekLowerCaseFilter(TokenStream)} instead.
*/
public GreekLowerCaseFilter(TokenStream in, char[] charset) public GreekLowerCaseFilter(TokenStream in, char[] charset)
{ {
super(in); super(in);
@ -39,6 +45,11 @@ public final class GreekLowerCaseFilter extends TokenFilter
termAtt = (TermAttribute) addAttribute(TermAttribute.class); termAtt = (TermAttribute) addAttribute(TermAttribute.class);
} }
/**
 * Builds a GreekLowerCaseFilter that expects Unicode Greek input.
 */
public GreekLowerCaseFilter(TokenStream in)
{
    // Delegates to the deprecated charset ctor until non-Unicode support is removed in 3.0.
    this(in, GreekCharsets.UnicodeGreek);
}
public boolean incrementToken() throws IOException { public boolean incrementToken() throws IOException {
if (input.incrementToken()) { if (input.incrementToken()) {
char[] chArray = termAtt.termBuffer(); char[] chArray = termAtt.termBuffer();

View File

@ -190,6 +190,7 @@ public final class RussianAnalyzer extends Analyzer
* Charset for Russian letters. * Charset for Russian letters.
* Represents encoding for 32 lowercase Russian letters. * Represents encoding for 32 lowercase Russian letters.
* Predefined charsets can be taken from RussianCharSets class * Predefined charsets can be taken from RussianCharSets class
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
*/ */
private char[] charset; private char[] charset;
@ -202,6 +203,7 @@ public final class RussianAnalyzer extends Analyzer
/** /**
* Builds an analyzer. * Builds an analyzer.
* @deprecated Use {@link #RussianAnalyzer()} instead.
*/ */
public RussianAnalyzer(char[] charset) public RussianAnalyzer(char[] charset)
{ {
@ -211,6 +213,7 @@ public final class RussianAnalyzer extends Analyzer
/** /**
* Builds an analyzer with the given stop words. * Builds an analyzer with the given stop words.
* @deprecated Use {@link #RussianAnalyzer(String[])} instead.
*/ */
public RussianAnalyzer(char[] charset, String[] stopwords) public RussianAnalyzer(char[] charset, String[] stopwords)
{ {
@ -218,8 +221,19 @@ public final class RussianAnalyzer extends Analyzer
stopSet = StopFilter.makeStopSet(stopwords); stopSet = StopFilter.makeStopSet(stopwords);
} }
// Takes russian stop words and translates them to a String array, using /**
// the given charset * Builds an analyzer with the given stop words.
*/
public RussianAnalyzer(String[] stopwords)
{
    // Non-deprecated replacement ctor: always analyzes Unicode Russian text.
    // The charset field remains only to support the deprecated ctors until 3.0.
    this.charset = RussianCharsets.UnicodeRussian;
    stopSet = StopFilter.makeStopSet(stopwords);
}
/** Takes russian stop words and translates them to a String array, using
* the given charset.
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
*/
private static String[] makeStopWords(char[] charset) private static String[] makeStopWords(char[] charset)
{ {
String[] res = new String[RUSSIAN_STOP_WORDS.length]; String[] res = new String[RUSSIAN_STOP_WORDS.length];
@ -240,6 +254,7 @@ public final class RussianAnalyzer extends Analyzer
/** /**
* Builds an analyzer with the given stop words. * Builds an analyzer with the given stop words.
* @todo create a Set version of this ctor * @todo create a Set version of this ctor
* @deprecated Use {@link #RussianAnalyzer(Map)} instead.
*/ */
public RussianAnalyzer(char[] charset, Map stopwords) public RussianAnalyzer(char[] charset, Map stopwords)
{ {
@ -247,6 +262,16 @@ public final class RussianAnalyzer extends Analyzer
stopSet = new HashSet(stopwords.keySet()); stopSet = new HashSet(stopwords.keySet());
} }
/**
 * Builds an analyzer with the given stop words, analyzing Unicode Russian text.
 * @param stopwords map whose keys are the stop words to use
 * @todo create a Set version of this ctor
 */
public RussianAnalyzer(Map stopwords)
{
    charset = RussianCharsets.UnicodeRussian;
    stopSet = new HashSet(stopwords.keySet());
}
/** /**
* Creates a {@link TokenStream} which tokenizes all the text in the * Creates a {@link TokenStream} which tokenizes all the text in the
* provided {@link Reader}. * provided {@link Reader}.

View File

@ -24,7 +24,7 @@ package org.apache.lucene.analysis.ru;
* One should be able to add other encoding schemes (like ISO-8859-5 or customized) by adding a new charset * One should be able to add other encoding schemes (like ISO-8859-5 or customized) by adding a new charset
* and adding logic to toLowerCase() method for that charset. * and adding logic to toLowerCase() method for that charset.
* </p> * </p>
* * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
* @version $Id$ * @version $Id$
*/ */
public class RussianCharsets public class RussianCharsets

View File

@ -37,21 +37,33 @@ import org.apache.lucene.analysis.LetterTokenizer; // for javadocs
public class RussianLetterTokenizer extends CharTokenizer public class RussianLetterTokenizer extends CharTokenizer
{ {
/** Construct a new LetterTokenizer. */ /**
* Charset this tokenizer uses.
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
*/
private char[] charset; private char[] charset;
/**
* @deprecated Use {@link #RussianLetterTokenizer(Reader)} instead.
*/
public RussianLetterTokenizer(Reader in, char[] charset) public RussianLetterTokenizer(Reader in, char[] charset)
{ {
super(in); super(in);
this.charset = charset; this.charset = charset;
} }
/**
 * Construct a new RussianLetterTokenizer for Unicode Russian text.
 */
public RussianLetterTokenizer(Reader in)
{
    // Delegates to the deprecated charset ctor until non-Unicode support is removed in 3.0.
    this(in, RussianCharsets.UnicodeRussian);
}
/** /**
* Collects only characters which satisfy * Collects only characters which satisfy
* {@link Character#isLetter(char)}. * {@link Character#isLetter(char)}.
*/ */
protected boolean isTokenChar(char c) protected boolean isTokenChar(char c)
{ {
/* in the next release, this can be implemented as isLetter(c) or [0-9] */
if (Character.isLetter(c)) if (Character.isLetter(c))
return true; return true;
for (int i = 0; i < charset.length; i++) for (int i = 0; i < charset.length; i++)

View File

@ -31,10 +31,16 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute;
*/ */
public final class RussianLowerCaseFilter extends TokenFilter public final class RussianLowerCaseFilter extends TokenFilter
{ {
/**
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
*/
char[] charset; char[] charset;
private TermAttribute termAtt; private TermAttribute termAtt;
/**
* @deprecated Use {@link #RussianLowerCaseFilter(TokenStream)} instead.
*/
public RussianLowerCaseFilter(TokenStream in, char[] charset) public RussianLowerCaseFilter(TokenStream in, char[] charset)
{ {
super(in); super(in);
@ -42,6 +48,11 @@ public final class RussianLowerCaseFilter extends TokenFilter
termAtt = (TermAttribute) addAttribute(TermAttribute.class); termAtt = (TermAttribute) addAttribute(TermAttribute.class);
} }
/**
 * Builds a RussianLowerCaseFilter that expects Unicode Russian input.
 */
public RussianLowerCaseFilter(TokenStream in)
{
    // Delegates to the deprecated charset ctor until non-Unicode support is removed in 3.0.
    this(in, RussianCharsets.UnicodeRussian);
}
public final boolean incrementToken() throws IOException public final boolean incrementToken() throws IOException
{ {
if (input.incrementToken()) { if (input.incrementToken()) {

View File

@ -42,6 +42,9 @@ public final class RussianStemFilter extends TokenFilter
private TermAttribute termAtt; private TermAttribute termAtt;
/**
* @deprecated Use {@link #RussianStemFilter(TokenStream)} instead.
*/
public RussianStemFilter(TokenStream in, char[] charset) public RussianStemFilter(TokenStream in, char[] charset)
{ {
super(in); super(in);
@ -49,6 +52,10 @@ public final class RussianStemFilter extends TokenFilter
termAtt = (TermAttribute) addAttribute(TermAttribute.class); termAtt = (TermAttribute) addAttribute(TermAttribute.class);
} }
/**
 * Builds a RussianStemFilter that expects Unicode Russian input.
 */
public RussianStemFilter(TokenStream in)
{
    // Delegates to the deprecated charset ctor until non-Unicode support is removed in 3.0.
    this(in, RussianCharsets.UnicodeRussian);
}
/** /**
* Returns the next token in the stream, or null at EOS * Returns the next token in the stream, or null at EOS
*/ */

View File

@ -25,6 +25,9 @@ package org.apache.lucene.analysis.ru;
*/ */
class RussianStemmer class RussianStemmer
{ {
/**
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
*/
private char[] charset; private char[] charset;
// positions of RV, R1 and R2 respectively // positions of RV, R1 and R2 respectively
@ -255,6 +258,7 @@ class RussianStemmer
/** /**
* RussianStemmer constructor comment. * RussianStemmer constructor comment.
* @deprecated Use {@link #RussianStemmer()} instead.
*/ */
public RussianStemmer(char[] charset) public RussianStemmer(char[] charset)
{ {
@ -529,6 +533,7 @@ class RussianStemmer
* Insert the method's description here. * Insert the method's description here.
* Creation date: (16/03/2002 10:58:42 PM) * Creation date: (16/03/2002 10:58:42 PM)
* @param newCharset char[] * @param newCharset char[]
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
*/ */
public void setCharset(char[] newCharset) public void setCharset(char[] newCharset)
{ {
@ -620,6 +625,7 @@ class RussianStemmer
/** /**
* Static method for stemming with different charsets * Static method for stemming with different charsets
* @deprecated Use {@link #stemWord(String)} instead.
*/ */
public static String stem(String theWord, char[] charset) public static String stem(String theWord, char[] charset)
{ {
@ -627,4 +633,14 @@ class RussianStemmer
stemmer.setCharset(charset); stemmer.setCharset(charset);
return stemmer.stem(theWord); return stemmer.stem(theWord);
} }
/**
 * Static convenience method for stemming a single word of Unicode Russian text.
 * @param theWord the word to stem
 * @return the stemmed form of the word
 */
public static String stemWord(String theWord)
{
    // NOTE(review): the explicit setCharset call may be redundant if the no-arg
    // ctor already defaults to UnicodeRussian — confirm before removing.
    RussianStemmer s = new RussianStemmer();
    s.setCharset(RussianCharsets.UnicodeRussian);
    return s.stem(theWord);
}
} }

View File

@ -60,7 +60,7 @@ public class TestRussianAnalyzer extends TestCase
public void testUnicode() throws IOException public void testUnicode() throws IOException
{ {
RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.UnicodeRussian); RussianAnalyzer ra = new RussianAnalyzer();
inWords = inWords =
new InputStreamReader( new InputStreamReader(
new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/testUTF8.txt")), new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/testUTF8.txt")),
@ -75,8 +75,7 @@ public class TestRussianAnalyzer extends TestCase
RussianLetterTokenizer sample = RussianLetterTokenizer sample =
new RussianLetterTokenizer( new RussianLetterTokenizer(
sampleUnicode, sampleUnicode);
RussianCharsets.UnicodeRussian);
TermAttribute text = (TermAttribute) in.getAttribute(TermAttribute.class); TermAttribute text = (TermAttribute) in.getAttribute(TermAttribute.class);
TermAttribute sampleText = (TermAttribute) sample.getAttribute(TermAttribute.class); TermAttribute sampleText = (TermAttribute) sample.getAttribute(TermAttribute.class);