From 1ebbe2abd1457e734846b46a7cac9cb2bec7e844 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Sat, 22 Aug 2009 20:36:06 +0000 Subject: [PATCH] LUCENE-1793: Deprecate custom encoding support in Greek and Russian analyzers git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@806886 13f79535-47bb-0310-9956-ffa450edef68 --- contrib/CHANGES.txt | 4 +++ .../lucene/analysis/el/GreekAnalyzer.java | 30 +++++++++++++++++-- .../lucene/analysis/el/GreekCharsets.java | 1 + .../analysis/el/GreekLowerCaseFilter.java | 11 +++++++ .../lucene/analysis/ru/RussianAnalyzer.java | 29 ++++++++++++++++-- .../lucene/analysis/ru/RussianCharsets.java | 2 +- .../analysis/ru/RussianLetterTokenizer.java | 14 ++++++++- .../analysis/ru/RussianLowerCaseFilter.java | 11 +++++++ .../lucene/analysis/ru/RussianStemFilter.java | 7 +++++ .../lucene/analysis/ru/RussianStemmer.java | 16 ++++++++++ .../analysis/ru/TestRussianAnalyzer.java | 5 ++-- 11 files changed, 121 insertions(+), 9 deletions(-) diff --git a/contrib/CHANGES.txt b/contrib/CHANGES.txt index 4e0f311920a..83f530ad392 100644 --- a/contrib/CHANGES.txt +++ b/contrib/CHANGES.txt @@ -40,6 +40,10 @@ API Changes The SpanScorer API (now QueryScorer) has also been improved to more closely match the API of the previous QueryScorer implementation. (Mark Miller) + 5. LUCENE-1793: Deprecate the custom encoding support in the Greek and Russian + Analyzers. If you need to index text in these encodings, please use Java's + character set conversion facilities (InputStreamReader, etc) during I/O, + so that Lucene can analyze this text as Unicode instead. (Robert Muir) Bug fixes diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java index 4de196e79d6..ddd9073456a 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java @@ -155,6 +155,7 @@ public final class GreekAnalyzer extends Analyzer * Charset for Greek letters. * Represents encoding for 24 lowercase Greek letters. * Predefined charsets can be taken from {@link GreekCharsets} class + * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0 */ private char[] charset; @@ -166,15 +167,27 @@ public final class GreekAnalyzer extends Analyzer /** * Builds an analyzer. + * @deprecated Use {@link #GreekAnalyzer()} instead. */ public GreekAnalyzer(char[] charset) { this.charset = charset; stopSet = StopFilter.makeStopSet(makeStopWords(charset)); } + + /** + * Builds an analyzer with the given stop words. + * @param stopwords Array of stopwords to use. + */ + public GreekAnalyzer(String [] stopwords) + { + charset = GreekCharsets.UnicodeGreek; + stopSet = StopFilter.makeStopSet(stopwords); + } /** * Builds an analyzer with the given stop words. + * @deprecated Use {@link #GreekAnalyzer(String[])} instead. */ public GreekAnalyzer(char[] charset, String[] stopwords) { @@ -182,8 +195,11 @@ public final class GreekAnalyzer extends Analyzer stopSet = StopFilter.makeStopSet(stopwords); } - // Takes greek stop words and translates them to a String array, using - // the given charset + /** + * Takes greek stop words and translates them to a String array, using + * the given charset. 
+ * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0 + */ private static String[] makeStopWords(char[] charset) { String[] res = new String[GREEK_STOP_WORDS.length]; @@ -203,12 +219,22 @@ public final class GreekAnalyzer extends Analyzer /** * Builds an analyzer with the given stop words. + * @deprecated Use {@link #GreekAnalyzer(Map)} instead. */ public GreekAnalyzer(char[] charset, Map stopwords) { this.charset = charset; stopSet = new HashSet(stopwords.keySet()); } + + /** + * Builds an analyzer with the given stop words. + */ + public GreekAnalyzer(Map stopwords) + { + charset = GreekCharsets.UnicodeGreek; + stopSet = new HashSet(stopwords.keySet()); + } /** * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}. diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekCharsets.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekCharsets.java index 80470976fd6..a2416a46690 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekCharsets.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekCharsets.java @@ -24,6 +24,7 @@ package org.apache.lucene.analysis.el; * including accented ones. One should be able to add other encoding schemes (see RFC 1947) by adding * the definition of a new charset as well as the required logic in the toLowerCase() method. *
+ * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0 */ public class GreekCharsets { diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java index 8e94ac55ef1..03c46bd23d3 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java @@ -28,16 +28,27 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute; */ public final class GreekLowerCaseFilter extends TokenFilter { + /** + * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0 + */ char[] charset; private TermAttribute termAtt; + /** + * @deprecated Use {@link #GreekLowerCaseFilter(TokenStream)} instead. + */ public GreekLowerCaseFilter(TokenStream in, char[] charset) { super(in); this.charset = charset; termAtt = (TermAttribute) addAttribute(TermAttribute.class); } + + public GreekLowerCaseFilter(TokenStream in) + { + this(in, GreekCharsets.UnicodeGreek); + } public boolean incrementToken() throws IOException { if (input.incrementToken()) { diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java index a6ab1819c3b..468b9b89cc2 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java @@ -190,6 +190,7 @@ public final class RussianAnalyzer extends Analyzer * Charset for Russian letters. * Represents encoding for 32 lowercase Russian letters. * Predefined charsets can be taken from RussianCharSets class + * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0 */ private char[] charset; @@ -202,6 +203,7 @@ public final class RussianAnalyzer extends Analyzer /** * Builds an analyzer. + * @deprecated Use {@link #RussianAnalyzer()} instead. */ public RussianAnalyzer(char[] charset) { @@ -211,15 +213,27 @@ public final class RussianAnalyzer extends Analyzer /** * Builds an analyzer with the given stop words. + * @deprecated Use {@link #RussianAnalyzer(String[])} instead. */ public RussianAnalyzer(char[] charset, String[] stopwords) { this.charset = charset; stopSet = StopFilter.makeStopSet(stopwords); } + + /** + * Builds an analyzer with the given stop words. + */ + public RussianAnalyzer(String[] stopwords) + { + this.charset = RussianCharsets.UnicodeRussian; + stopSet = StopFilter.makeStopSet(stopwords); + } - // Takes russian stop words and translates them to a String array, using - // the given charset + /** Takes russian stop words and translates them to a String array, using + * the given charset. + * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0 + */ private static String[] makeStopWords(char[] charset) { String[] res = new String[RUSSIAN_STOP_WORDS.length]; @@ -240,12 +254,23 @@ public final class RussianAnalyzer extends Analyzer /** * Builds an analyzer with the given stop words. * @todo create a Set version of this ctor + * @deprecated Use {@link #RussianAnalyzer(Map)} instead. */ public RussianAnalyzer(char[] charset, Map stopwords) { this.charset = charset; stopSet = new HashSet(stopwords.keySet()); } + + /** + * Builds an analyzer with the given stop words. 
+ * @todo create a Set version of this ctor + */ + public RussianAnalyzer(Map stopwords) + { + charset = RussianCharsets.UnicodeRussian; + stopSet = new HashSet(stopwords.keySet()); + } /** * Creates a {@link TokenStream} which tokenizes all the text in the diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianCharsets.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianCharsets.java index bacbd325c06..ba05652f016 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianCharsets.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianCharsets.java @@ -24,7 +24,7 @@ package org.apache.lucene.analysis.ru; * One should be able to add other encoding schemes (like ISO-8859-5 or customized) by adding a new charset * and adding logic to toLowerCase() method for that charset. *
- * + * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0 * @version $Id$ */ public class RussianCharsets diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java index 3a38e3e6799..c3f33f48965 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java @@ -37,14 +37,25 @@ import org.apache.lucene.analysis.LetterTokenizer; // for javadocs public class RussianLetterTokenizer extends CharTokenizer { - /** Construct a new LetterTokenizer. */ + /** + * Charset this tokenizer uses. + * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0 + */ private char[] charset; + /** + * @deprecated Use {@link #RussianLetterTokenizer(Reader)} instead. + */ public RussianLetterTokenizer(Reader in, char[] charset) { super(in); this.charset = charset; } + + public RussianLetterTokenizer(Reader in) + { + this(in, RussianCharsets.UnicodeRussian); + } /** * Collects only characters which satisfy @@ -52,6 +63,7 @@ public class RussianLetterTokenizer extends CharTokenizer */ protected boolean isTokenChar(char c) { + /* in the next release, this can be implemented as isLetter(c) or [0-9] */ if (Character.isLetter(c)) return true; for (int i = 0; i < charset.length; i++) diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java index 41eed6ae91c..1407f08179f 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java @@ -31,16 +31,27 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute; */ public final class RussianLowerCaseFilter extends TokenFilter { + /** + * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0 + */ char[] charset; private TermAttribute termAtt; + /** + * @deprecated Use {@link #RussianLowerCaseFilter(TokenStream)} instead. + */ public RussianLowerCaseFilter(TokenStream in, char[] charset) { super(in); this.charset = charset; termAtt = (TermAttribute) addAttribute(TermAttribute.class); } + + public RussianLowerCaseFilter(TokenStream in) + { + this(in, RussianCharsets.UnicodeRussian); + } public final boolean incrementToken() throws IOException { diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java index 4aed458a364..3e44411abf6 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java @@ -42,6 +42,9 @@ public final class RussianStemFilter extends TokenFilter private TermAttribute termAtt; + /** + * @deprecated Use {@link #RussianStemFilter(TokenStream)} instead. 
+ */ public RussianStemFilter(TokenStream in, char[] charset) { super(in); @@ -49,6 +52,10 @@ public final class RussianStemFilter extends TokenFilter termAtt = (TermAttribute) addAttribute(TermAttribute.class); } + public RussianStemFilter(TokenStream in) + { + this(in, RussianCharsets.UnicodeRussian); + } /** * Returns the next token in the stream, or null at EOS */ diff --git a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemmer.java b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemmer.java index c5b2e98e4de..42f076d9668 100644 --- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemmer.java +++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemmer.java @@ -25,6 +25,9 @@ package org.apache.lucene.analysis.ru; */ class RussianStemmer { + /** + * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0 + */ private char[] charset; // positions of RV, R1 and R2 respectively @@ -255,6 +258,7 @@ class RussianStemmer /** * RussianStemmer constructor comment. + * @deprecated Use {@link #RussianStemmer()} instead. */ public RussianStemmer(char[] charset) { @@ -529,6 +533,7 @@ class RussianStemmer * Insert the method's description here. * Creation date: (16/03/2002 10:58:42 PM) * @param newCharset char[] + * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0 */ public void setCharset(char[] newCharset) { @@ -620,6 +625,7 @@ class RussianStemmer /** * Static method for stemming with different charsets + * @deprecated Use {@link #stemWord(String)} instead. */ public static String stem(String theWord, char[] charset) { @@ -627,4 +633,14 @@ class RussianStemmer stemmer.setCharset(charset); return stemmer.stem(theWord); } + + /** + * Static method for stemming. + */ + public static String stemWord(String theWord) + { + RussianStemmer stemmer = new RussianStemmer(); + stemmer.setCharset(RussianCharsets.UnicodeRussian); + return stemmer.stem(theWord); + } } diff --git a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java index 96c7dc0b719..0683d0b2d61 100644 --- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java +++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java @@ -60,7 +60,7 @@ public class TestRussianAnalyzer extends TestCase public void testUnicode() throws IOException { - RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.UnicodeRussian); + RussianAnalyzer ra = new RussianAnalyzer(); inWords = new InputStreamReader( new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/testUTF8.txt")), @@ -75,8 +75,7 @@ public class TestRussianAnalyzer extends TestCase RussianLetterTokenizer sample = new RussianLetterTokenizer( - sampleUnicode, - RussianCharsets.UnicodeRussian); + sampleUnicode); TermAttribute text = (TermAttribute) in.getAttribute(TermAttribute.class); TermAttribute sampleText = (TermAttribute) sample.getAttribute(TermAttribute.class);
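As the CHANGES.txt entry above suggests, applications that previously handed one of the RussianCharsets or GreekCharsets tables to an analyzer (for example new RussianAnalyzer(RussianCharsets.KOI8)) can instead decode the bytes to Unicode with an InputStreamReader at read time and use the plain Unicode constructors. The following is a minimal sketch against the Lucene 2.9 contrib API, not part of this patch; the file path and field name are hypothetical, and KOI8-R availability depends on the JRE's installed charset providers.

import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.Reader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ru.RussianAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class Koi8rMigrationSketch
{
    public static void main(String[] args) throws Exception
    {
        // Decode the legacy KOI8-R bytes to Unicode during I/O, instead of
        // passing a charset table to the analyzer. The path is hypothetical.
        Reader reader = new InputStreamReader(
            new FileInputStream(new File("docs/resume.koi8")), "KOI8-R");

        // Unicode-only constructor; replaces the deprecated
        // new RussianAnalyzer(RussianCharsets.KOI8) form.
        RussianAnalyzer analyzer = new RussianAnalyzer();
        TokenStream stream = analyzer.tokenStream("contents", reader);

        // New attribute-based TokenStream API used throughout this patch.
        TermAttribute term = (TermAttribute) stream.addAttribute(TermAttribute.class);
        while (stream.incrementToken())
        {
            System.out.println(term.term());
        }
        stream.close();
        reader.close();
    }
}

The same pattern applies to GreekAnalyzer: read ISO-8859-7 or CP1253 input through an InputStreamReader with the appropriate charset name, then analyze the resulting Unicode text with the no-charset constructors.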