mirror of https://github.com/apache/lucene.git
LUCENE-1793: Deprecate custom encoding support in Greek and Russian analyzers
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@806886 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
136f054a05
commit
1ebbe2abd1
|
@ -40,6 +40,10 @@ API Changes
|
||||||
The SpanScorer API (now QueryScorer) has also been improved to more closely
|
The SpanScorer API (now QueryScorer) has also been improved to more closely
|
||||||
match the API of the previous QueryScorer implementation. (Mark Miller)
|
match the API of the previous QueryScorer implementation. (Mark Miller)
|
||||||
|
|
||||||
|
5. LUCENE-1793: Deprecate the custom encoding support in the Greek and Russian
|
||||||
|
Analyzers. If you need to index text in these encodings, please use Java's
|
||||||
|
character set conversion facilities (InputStreamReader, etc) during I/O,
|
||||||
|
so that Lucene can analyze this text as Unicode instead. (Robert Muir)
|
||||||
|
|
||||||
Bug fixes
|
Bug fixes
|
||||||
|
|
||||||
|
|
|
@ -155,6 +155,7 @@ public final class GreekAnalyzer extends Analyzer
|
||||||
* Charset for Greek letters.
|
* Charset for Greek letters.
|
||||||
* Represents encoding for 24 lowercase Greek letters.
|
* Represents encoding for 24 lowercase Greek letters.
|
||||||
* Predefined charsets can be taken from {@link GreekCharsets} class
|
* Predefined charsets can be taken from {@link GreekCharsets} class
|
||||||
|
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
|
||||||
*/
|
*/
|
||||||
private char[] charset;
|
private char[] charset;
|
||||||
|
|
||||||
|
@ -166,15 +167,27 @@ public final class GreekAnalyzer extends Analyzer
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Builds an analyzer.
|
* Builds an analyzer.
|
||||||
|
* @deprecated Use {@link #GreekAnalyzer()} instead.
|
||||||
*/
|
*/
|
||||||
public GreekAnalyzer(char[] charset)
|
public GreekAnalyzer(char[] charset)
|
||||||
{
|
{
|
||||||
this.charset = charset;
|
this.charset = charset;
|
||||||
stopSet = StopFilter.makeStopSet(makeStopWords(charset));
|
stopSet = StopFilter.makeStopSet(makeStopWords(charset));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the given stop words.
|
||||||
|
* @param stopwords Array of stopwords to use.
|
||||||
|
*/
|
||||||
|
public GreekAnalyzer(String [] stopwords)
|
||||||
|
{
|
||||||
|
charset = GreekCharsets.UnicodeGreek;
|
||||||
|
stopSet = StopFilter.makeStopSet(stopwords);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Builds an analyzer with the given stop words.
|
* Builds an analyzer with the given stop words.
|
||||||
|
* @deprecated Use {@link #GreekAnalyzer(String[])} instead.
|
||||||
*/
|
*/
|
||||||
public GreekAnalyzer(char[] charset, String[] stopwords)
|
public GreekAnalyzer(char[] charset, String[] stopwords)
|
||||||
{
|
{
|
||||||
|
@ -182,8 +195,11 @@ public final class GreekAnalyzer extends Analyzer
|
||||||
stopSet = StopFilter.makeStopSet(stopwords);
|
stopSet = StopFilter.makeStopSet(stopwords);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Takes greek stop words and translates them to a String array, using
|
/**
|
||||||
// the given charset
|
* Takes greek stop words and translates them to a String array, using
|
||||||
|
* the given charset.
|
||||||
|
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
|
||||||
|
*/
|
||||||
private static String[] makeStopWords(char[] charset)
|
private static String[] makeStopWords(char[] charset)
|
||||||
{
|
{
|
||||||
String[] res = new String[GREEK_STOP_WORDS.length];
|
String[] res = new String[GREEK_STOP_WORDS.length];
|
||||||
|
@ -203,12 +219,22 @@ public final class GreekAnalyzer extends Analyzer
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Builds an analyzer with the given stop words.
|
* Builds an analyzer with the given stop words.
|
||||||
|
* @deprecated Use {@link #GreekAnalyzer(Map)} instead.
|
||||||
*/
|
*/
|
||||||
public GreekAnalyzer(char[] charset, Map stopwords)
|
public GreekAnalyzer(char[] charset, Map stopwords)
|
||||||
{
|
{
|
||||||
this.charset = charset;
|
this.charset = charset;
|
||||||
stopSet = new HashSet(stopwords.keySet());
|
stopSet = new HashSet(stopwords.keySet());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the given stop words.
|
||||||
|
*/
|
||||||
|
public GreekAnalyzer(Map stopwords)
|
||||||
|
{
|
||||||
|
charset = GreekCharsets.UnicodeGreek;
|
||||||
|
stopSet = new HashSet(stopwords.keySet());
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
|
* Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
|
||||||
|
|
|
@ -24,6 +24,7 @@ package org.apache.lucene.analysis.el;
|
||||||
* including accented ones. One should be able to add other encoding schemes (see RFC 1947) by adding
|
* including accented ones. One should be able to add other encoding schemes (see RFC 1947) by adding
|
||||||
* the definition of a new charset as well as the required logic in the toLowerCase() method.
|
* the definition of a new charset as well as the required logic in the toLowerCase() method.
|
||||||
* </p>
|
* </p>
|
||||||
|
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
|
||||||
*/
|
*/
|
||||||
public class GreekCharsets
|
public class GreekCharsets
|
||||||
{
|
{
|
||||||
|
|
|
@ -28,16 +28,27 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||||
*/
|
*/
|
||||||
public final class GreekLowerCaseFilter extends TokenFilter
|
public final class GreekLowerCaseFilter extends TokenFilter
|
||||||
{
|
{
|
||||||
|
/**
|
||||||
|
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
|
||||||
|
*/
|
||||||
char[] charset;
|
char[] charset;
|
||||||
|
|
||||||
private TermAttribute termAtt;
|
private TermAttribute termAtt;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @deprecated Use {@link #GreekLowerCaseFilter(TokenStream)} instead.
|
||||||
|
*/
|
||||||
public GreekLowerCaseFilter(TokenStream in, char[] charset)
|
public GreekLowerCaseFilter(TokenStream in, char[] charset)
|
||||||
{
|
{
|
||||||
super(in);
|
super(in);
|
||||||
this.charset = charset;
|
this.charset = charset;
|
||||||
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
|
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public GreekLowerCaseFilter(TokenStream in)
|
||||||
|
{
|
||||||
|
this(in, GreekCharsets.UnicodeGreek);
|
||||||
|
}
|
||||||
|
|
||||||
public boolean incrementToken() throws IOException {
|
public boolean incrementToken() throws IOException {
|
||||||
if (input.incrementToken()) {
|
if (input.incrementToken()) {
|
||||||
|
|
|
@ -190,6 +190,7 @@ public final class RussianAnalyzer extends Analyzer
|
||||||
* Charset for Russian letters.
|
* Charset for Russian letters.
|
||||||
* Represents encoding for 32 lowercase Russian letters.
|
* Represents encoding for 32 lowercase Russian letters.
|
||||||
* Predefined charsets can be taken from RussianCharSets class
|
* Predefined charsets can be taken from RussianCharSets class
|
||||||
|
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
|
||||||
*/
|
*/
|
||||||
private char[] charset;
|
private char[] charset;
|
||||||
|
|
||||||
|
@ -202,6 +203,7 @@ public final class RussianAnalyzer extends Analyzer
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Builds an analyzer.
|
* Builds an analyzer.
|
||||||
|
* @deprecated Use {@link #RussianAnalyzer()} instead.
|
||||||
*/
|
*/
|
||||||
public RussianAnalyzer(char[] charset)
|
public RussianAnalyzer(char[] charset)
|
||||||
{
|
{
|
||||||
|
@ -211,15 +213,27 @@ public final class RussianAnalyzer extends Analyzer
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Builds an analyzer with the given stop words.
|
* Builds an analyzer with the given stop words.
|
||||||
|
* @deprecated Use {@link #RussianAnalyzer(String[])} instead.
|
||||||
*/
|
*/
|
||||||
public RussianAnalyzer(char[] charset, String[] stopwords)
|
public RussianAnalyzer(char[] charset, String[] stopwords)
|
||||||
{
|
{
|
||||||
this.charset = charset;
|
this.charset = charset;
|
||||||
stopSet = StopFilter.makeStopSet(stopwords);
|
stopSet = StopFilter.makeStopSet(stopwords);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the given stop words.
|
||||||
|
*/
|
||||||
|
public RussianAnalyzer(String[] stopwords)
|
||||||
|
{
|
||||||
|
this.charset = RussianCharsets.UnicodeRussian;
|
||||||
|
stopSet = StopFilter.makeStopSet(stopwords);
|
||||||
|
}
|
||||||
|
|
||||||
// Takes russian stop words and translates them to a String array, using
|
/** Takes russian stop words and translates them to a String array, using
|
||||||
// the given charset
|
* the given charset.
|
||||||
|
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
|
||||||
|
*/
|
||||||
private static String[] makeStopWords(char[] charset)
|
private static String[] makeStopWords(char[] charset)
|
||||||
{
|
{
|
||||||
String[] res = new String[RUSSIAN_STOP_WORDS.length];
|
String[] res = new String[RUSSIAN_STOP_WORDS.length];
|
||||||
|
@ -240,12 +254,23 @@ public final class RussianAnalyzer extends Analyzer
|
||||||
/**
|
/**
|
||||||
* Builds an analyzer with the given stop words.
|
* Builds an analyzer with the given stop words.
|
||||||
* @todo create a Set version of this ctor
|
* @todo create a Set version of this ctor
|
||||||
|
* @deprecated Use {@link #RussianAnalyzer(Map)} instead.
|
||||||
*/
|
*/
|
||||||
public RussianAnalyzer(char[] charset, Map stopwords)
|
public RussianAnalyzer(char[] charset, Map stopwords)
|
||||||
{
|
{
|
||||||
this.charset = charset;
|
this.charset = charset;
|
||||||
stopSet = new HashSet(stopwords.keySet());
|
stopSet = new HashSet(stopwords.keySet());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Builds an analyzer with the given stop words.
|
||||||
|
* @todo create a Set version of this ctor
|
||||||
|
*/
|
||||||
|
public RussianAnalyzer(Map stopwords)
|
||||||
|
{
|
||||||
|
charset = RussianCharsets.UnicodeRussian;
|
||||||
|
stopSet = new HashSet(stopwords.keySet());
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates a {@link TokenStream} which tokenizes all the text in the
|
* Creates a {@link TokenStream} which tokenizes all the text in the
|
||||||
|
|
|
@ -24,7 +24,7 @@ package org.apache.lucene.analysis.ru;
|
||||||
* One should be able to add other encoding schemes (like ISO-8859-5 or customized) by adding a new charset
|
* One should be able to add other encoding schemes (like ISO-8859-5 or customized) by adding a new charset
|
||||||
* and adding logic to toLowerCase() method for that charset.
|
* and adding logic to toLowerCase() method for that charset.
|
||||||
* </p>
|
* </p>
|
||||||
*
|
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
|
||||||
* @version $Id$
|
* @version $Id$
|
||||||
*/
|
*/
|
||||||
public class RussianCharsets
|
public class RussianCharsets
|
||||||
|
|
|
@ -37,14 +37,25 @@ import org.apache.lucene.analysis.LetterTokenizer; // for javadocs
|
||||||
|
|
||||||
public class RussianLetterTokenizer extends CharTokenizer
|
public class RussianLetterTokenizer extends CharTokenizer
|
||||||
{
|
{
|
||||||
/** Construct a new LetterTokenizer. */
|
/**
|
||||||
|
* Charset this tokenizer uses.
|
||||||
|
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
|
||||||
|
*/
|
||||||
private char[] charset;
|
private char[] charset;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @deprecated Use {@link #RussianLetterTokenizer(Reader)} instead.
|
||||||
|
*/
|
||||||
public RussianLetterTokenizer(Reader in, char[] charset)
|
public RussianLetterTokenizer(Reader in, char[] charset)
|
||||||
{
|
{
|
||||||
super(in);
|
super(in);
|
||||||
this.charset = charset;
|
this.charset = charset;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public RussianLetterTokenizer(Reader in)
|
||||||
|
{
|
||||||
|
this(in, RussianCharsets.UnicodeRussian);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Collects only characters which satisfy
|
* Collects only characters which satisfy
|
||||||
|
@ -52,6 +63,7 @@ public class RussianLetterTokenizer extends CharTokenizer
|
||||||
*/
|
*/
|
||||||
protected boolean isTokenChar(char c)
|
protected boolean isTokenChar(char c)
|
||||||
{
|
{
|
||||||
|
/* in the next release, this can be implemented as isLetter(c) or [0-9] */
|
||||||
if (Character.isLetter(c))
|
if (Character.isLetter(c))
|
||||||
return true;
|
return true;
|
||||||
for (int i = 0; i < charset.length; i++)
|
for (int i = 0; i < charset.length; i++)
|
||||||
|
|
|
@ -31,16 +31,27 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||||
*/
|
*/
|
||||||
public final class RussianLowerCaseFilter extends TokenFilter
|
public final class RussianLowerCaseFilter extends TokenFilter
|
||||||
{
|
{
|
||||||
|
/**
|
||||||
|
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
|
||||||
|
*/
|
||||||
char[] charset;
|
char[] charset;
|
||||||
|
|
||||||
private TermAttribute termAtt;
|
private TermAttribute termAtt;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @deprecated Use {@link #RussianLowerCaseFilter(TokenStream)} instead.
|
||||||
|
*/
|
||||||
public RussianLowerCaseFilter(TokenStream in, char[] charset)
|
public RussianLowerCaseFilter(TokenStream in, char[] charset)
|
||||||
{
|
{
|
||||||
super(in);
|
super(in);
|
||||||
this.charset = charset;
|
this.charset = charset;
|
||||||
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
|
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public RussianLowerCaseFilter(TokenStream in)
|
||||||
|
{
|
||||||
|
this(in, RussianCharsets.UnicodeRussian);
|
||||||
|
}
|
||||||
|
|
||||||
public final boolean incrementToken() throws IOException
|
public final boolean incrementToken() throws IOException
|
||||||
{
|
{
|
||||||
|
|
|
@ -42,6 +42,9 @@ public final class RussianStemFilter extends TokenFilter
|
||||||
|
|
||||||
private TermAttribute termAtt;
|
private TermAttribute termAtt;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @deprecated Use {@link #RussianStemFilter(TokenStream)} instead.
|
||||||
|
*/
|
||||||
public RussianStemFilter(TokenStream in, char[] charset)
|
public RussianStemFilter(TokenStream in, char[] charset)
|
||||||
{
|
{
|
||||||
super(in);
|
super(in);
|
||||||
|
@ -49,6 +52,10 @@ public final class RussianStemFilter extends TokenFilter
|
||||||
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
|
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public RussianStemFilter(TokenStream in)
|
||||||
|
{
|
||||||
|
this(in, RussianCharsets.UnicodeRussian);
|
||||||
|
}
|
||||||
/**
|
/**
|
||||||
* Returns the next token in the stream, or null at EOS
|
* Returns the next token in the stream, or null at EOS
|
||||||
*/
|
*/
|
||||||
|
|
|
@ -25,6 +25,9 @@ package org.apache.lucene.analysis.ru;
|
||||||
*/
|
*/
|
||||||
class RussianStemmer
|
class RussianStemmer
|
||||||
{
|
{
|
||||||
|
/**
|
||||||
|
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
|
||||||
|
*/
|
||||||
private char[] charset;
|
private char[] charset;
|
||||||
|
|
||||||
// positions of RV, R1 and R2 respectively
|
// positions of RV, R1 and R2 respectively
|
||||||
|
@ -255,6 +258,7 @@ class RussianStemmer
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* RussianStemmer constructor comment.
|
* RussianStemmer constructor comment.
|
||||||
|
* @deprecated Use {@link #RussianStemmer()} instead.
|
||||||
*/
|
*/
|
||||||
public RussianStemmer(char[] charset)
|
public RussianStemmer(char[] charset)
|
||||||
{
|
{
|
||||||
|
@ -529,6 +533,7 @@ class RussianStemmer
|
||||||
* Insert the method's description here.
|
* Insert the method's description here.
|
||||||
* Creation date: (16/03/2002 10:58:42 PM)
|
* Creation date: (16/03/2002 10:58:42 PM)
|
||||||
* @param newCharset char[]
|
* @param newCharset char[]
|
||||||
|
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
|
||||||
*/
|
*/
|
||||||
public void setCharset(char[] newCharset)
|
public void setCharset(char[] newCharset)
|
||||||
{
|
{
|
||||||
|
@ -620,6 +625,7 @@ class RussianStemmer
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Static method for stemming with different charsets
|
* Static method for stemming with different charsets
|
||||||
|
* @deprecated Use {@link #stemWord(String)} instead.
|
||||||
*/
|
*/
|
||||||
public static String stem(String theWord, char[] charset)
|
public static String stem(String theWord, char[] charset)
|
||||||
{
|
{
|
||||||
|
@ -627,4 +633,14 @@ class RussianStemmer
|
||||||
stemmer.setCharset(charset);
|
stemmer.setCharset(charset);
|
||||||
return stemmer.stem(theWord);
|
return stemmer.stem(theWord);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Static method for stemming.
|
||||||
|
*/
|
||||||
|
public static String stemWord(String theWord)
|
||||||
|
{
|
||||||
|
RussianStemmer stemmer = new RussianStemmer();
|
||||||
|
stemmer.setCharset(RussianCharsets.UnicodeRussian);
|
||||||
|
return stemmer.stem(theWord);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -60,7 +60,7 @@ public class TestRussianAnalyzer extends TestCase
|
||||||
|
|
||||||
public void testUnicode() throws IOException
|
public void testUnicode() throws IOException
|
||||||
{
|
{
|
||||||
RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.UnicodeRussian);
|
RussianAnalyzer ra = new RussianAnalyzer();
|
||||||
inWords =
|
inWords =
|
||||||
new InputStreamReader(
|
new InputStreamReader(
|
||||||
new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/testUTF8.txt")),
|
new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/testUTF8.txt")),
|
||||||
|
@ -75,8 +75,7 @@ public class TestRussianAnalyzer extends TestCase
|
||||||
|
|
||||||
RussianLetterTokenizer sample =
|
RussianLetterTokenizer sample =
|
||||||
new RussianLetterTokenizer(
|
new RussianLetterTokenizer(
|
||||||
sampleUnicode,
|
sampleUnicode);
|
||||||
RussianCharsets.UnicodeRussian);
|
|
||||||
|
|
||||||
TermAttribute text = (TermAttribute) in.getAttribute(TermAttribute.class);
|
TermAttribute text = (TermAttribute) in.getAttribute(TermAttribute.class);
|
||||||
TermAttribute sampleText = (TermAttribute) sample.getAttribute(TermAttribute.class);
|
TermAttribute sampleText = (TermAttribute) sample.getAttribute(TermAttribute.class);
|
||||||
|
|
Loading…
Reference in New Issue