LUCENE-1793: Deprecate custom encoding support in Greek and Russian analyzers

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@806886 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2009-08-22 20:36:06 +00:00
parent 136f054a05
commit 1ebbe2abd1
11 changed files with 121 additions and 9 deletions

View File

@ -40,6 +40,10 @@ API Changes
The SpanScorer API (now QueryScorer) has also been improved to more closely The SpanScorer API (now QueryScorer) has also been improved to more closely
match the API of the previous QueryScorer implementation. (Mark Miller) match the API of the previous QueryScorer implementation. (Mark Miller)
5. LUCENE-1793: Deprecate the custom encoding support in the Greek and Russian
Analyzers. If you need to index text in these encodings, please use Java's
character set conversion facilities (InputStreamReader, etc) during I/O,
so that Lucene can analyze this text as Unicode instead. (Robert Muir)
Bug fixes Bug fixes

View File

@ -155,6 +155,7 @@ public final class GreekAnalyzer extends Analyzer
* Charset for Greek letters. * Charset for Greek letters.
* Represents encoding for 24 lowercase Greek letters. * Represents encoding for 24 lowercase Greek letters.
* Predefined charsets can be taken from {@link GreekCharsets} class * Predefined charsets can be taken from {@link GreekCharsets} class
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
*/ */
private char[] charset; private char[] charset;
@ -166,6 +167,7 @@ public final class GreekAnalyzer extends Analyzer
/** /**
* Builds an analyzer. * Builds an analyzer.
* @deprecated Use {@link #GreekAnalyzer()} instead.
*/ */
public GreekAnalyzer(char[] charset) public GreekAnalyzer(char[] charset)
{ {
@ -175,6 +177,17 @@ public final class GreekAnalyzer extends Analyzer
/** /**
* Builds an analyzer with the given stop words. * Builds an analyzer with the given stop words.
* @param stopwords Array of stopwords to use.
*/
public GreekAnalyzer(String [] stopwords)
{
    // Non-deprecated replacement ctor: always analyzes Unicode Greek text.
    // The charset field remains only to support the deprecated ctors until 3.0.
    charset = GreekCharsets.UnicodeGreek;
    stopSet = StopFilter.makeStopSet(stopwords);
}
/**
* Builds an analyzer with the given stop words.
* @deprecated Use {@link #GreekAnalyzer(String[])} instead.
*/ */
public GreekAnalyzer(char[] charset, String[] stopwords) public GreekAnalyzer(char[] charset, String[] stopwords)
{ {
@ -182,8 +195,11 @@ public final class GreekAnalyzer extends Analyzer
stopSet = StopFilter.makeStopSet(stopwords); stopSet = StopFilter.makeStopSet(stopwords);
} }
// Takes greek stop words and translates them to a String array, using /**
// the given charset * Takes greek stop words and translates them to a String array, using
* the given charset.
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
*/
private static String[] makeStopWords(char[] charset) private static String[] makeStopWords(char[] charset)
{ {
String[] res = new String[GREEK_STOP_WORDS.length]; String[] res = new String[GREEK_STOP_WORDS.length];
@ -203,6 +219,7 @@ public final class GreekAnalyzer extends Analyzer
/** /**
* Builds an analyzer with the given stop words. * Builds an analyzer with the given stop words.
* @deprecated Use {@link #GreekAnalyzer(Map)} instead.
*/ */
public GreekAnalyzer(char[] charset, Map stopwords) public GreekAnalyzer(char[] charset, Map stopwords)
{ {
@ -210,6 +227,15 @@ public final class GreekAnalyzer extends Analyzer
stopSet = new HashSet(stopwords.keySet()); stopSet = new HashSet(stopwords.keySet());
} }
/**
 * Builds an analyzer with the given stop words, analyzing Unicode Greek text.
 * @param stopwords map whose keys are the stop words to use
 */
public GreekAnalyzer(Map stopwords)
{
    this.stopSet = new HashSet(stopwords.keySet());
    this.charset = GreekCharsets.UnicodeGreek;
}
/** /**
* Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}. * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
* *

View File

@ -24,6 +24,7 @@ package org.apache.lucene.analysis.el;
* including accented ones. One should be able to add other encoding schemes (see RFC 1947) by adding * including accented ones. One should be able to add other encoding schemes (see RFC 1947) by adding
* the definition of a new charset as well as the required logic in the toLowerCase() method. * the definition of a new charset as well as the required logic in the toLowerCase() method.
* </p> * </p>
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
*/ */
public class GreekCharsets public class GreekCharsets
{ {

View File

@ -28,10 +28,16 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute;
*/ */
public final class GreekLowerCaseFilter extends TokenFilter public final class GreekLowerCaseFilter extends TokenFilter
{ {
/**
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
*/
char[] charset; char[] charset;
private TermAttribute termAtt; private TermAttribute termAtt;
/**
* @deprecated Use {@link #GreekLowerCaseFilter(TokenStream)} instead.
*/
public GreekLowerCaseFilter(TokenStream in, char[] charset) public GreekLowerCaseFilter(TokenStream in, char[] charset)
{ {
super(in); super(in);
@ -39,6 +45,11 @@ public final class GreekLowerCaseFilter extends TokenFilter
termAtt = (TermAttribute) addAttribute(TermAttribute.class); termAtt = (TermAttribute) addAttribute(TermAttribute.class);
} }
/**
 * Builds a GreekLowerCaseFilter that expects Unicode Greek input.
 */
public GreekLowerCaseFilter(TokenStream in)
{
    // Delegates to the deprecated charset ctor until non-Unicode support is removed in 3.0.
    this(in, GreekCharsets.UnicodeGreek);
}
public boolean incrementToken() throws IOException { public boolean incrementToken() throws IOException {
if (input.incrementToken()) { if (input.incrementToken()) {
char[] chArray = termAtt.termBuffer(); char[] chArray = termAtt.termBuffer();

View File

@ -190,6 +190,7 @@ public final class RussianAnalyzer extends Analyzer
* Charset for Russian letters. * Charset for Russian letters.
* Represents encoding for 32 lowercase Russian letters. * Represents encoding for 32 lowercase Russian letters.
* Predefined charsets can be taken from RussianCharSets class * Predefined charsets can be taken from RussianCharSets class
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
*/ */
private char[] charset; private char[] charset;
@ -202,6 +203,7 @@ public final class RussianAnalyzer extends Analyzer
/** /**
* Builds an analyzer. * Builds an analyzer.
* @deprecated Use {@link #RussianAnalyzer()} instead.
*/ */
public RussianAnalyzer(char[] charset) public RussianAnalyzer(char[] charset)
{ {
@ -211,6 +213,7 @@ public final class RussianAnalyzer extends Analyzer
/** /**
* Builds an analyzer with the given stop words. * Builds an analyzer with the given stop words.
* @deprecated Use {@link #RussianAnalyzer(String[])} instead.
*/ */
public RussianAnalyzer(char[] charset, String[] stopwords) public RussianAnalyzer(char[] charset, String[] stopwords)
{ {
@ -218,8 +221,19 @@ public final class RussianAnalyzer extends Analyzer
stopSet = StopFilter.makeStopSet(stopwords); stopSet = StopFilter.makeStopSet(stopwords);
} }
// Takes russian stop words and translates them to a String array, using /**
// the given charset * Builds an analyzer with the given stop words.
*/
public RussianAnalyzer(String[] stopwords)
{
    // Non-deprecated replacement ctor: always analyzes Unicode Russian text.
    // The charset field remains only to support the deprecated ctors until 3.0.
    this.charset = RussianCharsets.UnicodeRussian;
    stopSet = StopFilter.makeStopSet(stopwords);
}
/** Takes russian stop words and translates them to a String array, using
* the given charset.
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
*/
private static String[] makeStopWords(char[] charset) private static String[] makeStopWords(char[] charset)
{ {
String[] res = new String[RUSSIAN_STOP_WORDS.length]; String[] res = new String[RUSSIAN_STOP_WORDS.length];
@ -240,6 +254,7 @@ public final class RussianAnalyzer extends Analyzer
/** /**
* Builds an analyzer with the given stop words. * Builds an analyzer with the given stop words.
* @todo create a Set version of this ctor * @todo create a Set version of this ctor
* @deprecated Use {@link #RussianAnalyzer(Map)} instead.
*/ */
public RussianAnalyzer(char[] charset, Map stopwords) public RussianAnalyzer(char[] charset, Map stopwords)
{ {
@ -247,6 +262,16 @@ public final class RussianAnalyzer extends Analyzer
stopSet = new HashSet(stopwords.keySet()); stopSet = new HashSet(stopwords.keySet());
} }
/**
 * Builds an analyzer with the given stop words, analyzing Unicode Russian text.
 * @param stopwords map whose keys are the stop words to use
 * @todo create a Set version of this ctor
 */
public RussianAnalyzer(Map stopwords)
{
    charset = RussianCharsets.UnicodeRussian;
    stopSet = new HashSet(stopwords.keySet());
}
/** /**
* Creates a {@link TokenStream} which tokenizes all the text in the * Creates a {@link TokenStream} which tokenizes all the text in the
* provided {@link Reader}. * provided {@link Reader}.

View File

@ -24,7 +24,7 @@ package org.apache.lucene.analysis.ru;
* One should be able to add other encoding schemes (like ISO-8859-5 or customized) by adding a new charset * One should be able to add other encoding schemes (like ISO-8859-5 or customized) by adding a new charset
* and adding logic to toLowerCase() method for that charset. * and adding logic to toLowerCase() method for that charset.
* </p> * </p>
* * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
* @version $Id$ * @version $Id$
*/ */
public class RussianCharsets public class RussianCharsets

View File

@ -37,21 +37,33 @@ import org.apache.lucene.analysis.LetterTokenizer; // for javadocs
public class RussianLetterTokenizer extends CharTokenizer public class RussianLetterTokenizer extends CharTokenizer
{ {
/** Construct a new LetterTokenizer. */ /**
* Charset this tokenizer uses.
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
*/
private char[] charset; private char[] charset;
/**
* @deprecated Use {@link #RussianLetterTokenizer(Reader)} instead.
*/
public RussianLetterTokenizer(Reader in, char[] charset) public RussianLetterTokenizer(Reader in, char[] charset)
{ {
super(in); super(in);
this.charset = charset; this.charset = charset;
} }
/**
 * Construct a new RussianLetterTokenizer for Unicode Russian text.
 */
public RussianLetterTokenizer(Reader in)
{
    // Delegates to the deprecated charset ctor until non-Unicode support is removed in 3.0.
    this(in, RussianCharsets.UnicodeRussian);
}
/** /**
* Collects only characters which satisfy * Collects only characters which satisfy
* {@link Character#isLetter(char)}. * {@link Character#isLetter(char)}.
*/ */
protected boolean isTokenChar(char c) protected boolean isTokenChar(char c)
{ {
/* in the next release, this can be implemented as isLetter(c) or [0-9] */
if (Character.isLetter(c)) if (Character.isLetter(c))
return true; return true;
for (int i = 0; i < charset.length; i++) for (int i = 0; i < charset.length; i++)

View File

@ -31,10 +31,16 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute;
*/ */
public final class RussianLowerCaseFilter extends TokenFilter public final class RussianLowerCaseFilter extends TokenFilter
{ {
/**
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
*/
char[] charset; char[] charset;
private TermAttribute termAtt; private TermAttribute termAtt;
/**
* @deprecated Use {@link #RussianLowerCaseFilter(TokenStream)} instead.
*/
public RussianLowerCaseFilter(TokenStream in, char[] charset) public RussianLowerCaseFilter(TokenStream in, char[] charset)
{ {
super(in); super(in);
@ -42,6 +48,11 @@ public final class RussianLowerCaseFilter extends TokenFilter
termAtt = (TermAttribute) addAttribute(TermAttribute.class); termAtt = (TermAttribute) addAttribute(TermAttribute.class);
} }
/**
 * Builds a RussianLowerCaseFilter that expects Unicode Russian input.
 */
public RussianLowerCaseFilter(TokenStream in)
{
    // Delegates to the deprecated charset ctor until non-Unicode support is removed in 3.0.
    this(in, RussianCharsets.UnicodeRussian);
}
public final boolean incrementToken() throws IOException public final boolean incrementToken() throws IOException
{ {
if (input.incrementToken()) { if (input.incrementToken()) {

View File

@ -42,6 +42,9 @@ public final class RussianStemFilter extends TokenFilter
private TermAttribute termAtt; private TermAttribute termAtt;
/**
* @deprecated Use {@link #RussianStemFilter(TokenStream)} instead.
*/
public RussianStemFilter(TokenStream in, char[] charset) public RussianStemFilter(TokenStream in, char[] charset)
{ {
super(in); super(in);
@ -49,6 +52,10 @@ public final class RussianStemFilter extends TokenFilter
termAtt = (TermAttribute) addAttribute(TermAttribute.class); termAtt = (TermAttribute) addAttribute(TermAttribute.class);
} }
/**
 * Builds a RussianStemFilter that expects Unicode Russian input.
 */
public RussianStemFilter(TokenStream in)
{
    // Delegates to the deprecated charset ctor until non-Unicode support is removed in 3.0.
    this(in, RussianCharsets.UnicodeRussian);
}
/** /**
* Returns the next token in the stream, or null at EOS * Returns the next token in the stream, or null at EOS
*/ */

View File

@ -25,6 +25,9 @@ package org.apache.lucene.analysis.ru;
*/ */
class RussianStemmer class RussianStemmer
{ {
/**
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
*/
private char[] charset; private char[] charset;
// positions of RV, R1 and R2 respectively // positions of RV, R1 and R2 respectively
@ -255,6 +258,7 @@ class RussianStemmer
/** /**
* RussianStemmer constructor comment. * RussianStemmer constructor comment.
* @deprecated Use {@link #RussianStemmer()} instead.
*/ */
public RussianStemmer(char[] charset) public RussianStemmer(char[] charset)
{ {
@ -529,6 +533,7 @@ class RussianStemmer
* Insert the method's description here. * Insert the method's description here.
* Creation date: (16/03/2002 10:58:42 PM) * Creation date: (16/03/2002 10:58:42 PM)
* @param newCharset char[] * @param newCharset char[]
* @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
*/ */
public void setCharset(char[] newCharset) public void setCharset(char[] newCharset)
{ {
@ -620,6 +625,7 @@ class RussianStemmer
/** /**
* Static method for stemming with different charsets * Static method for stemming with different charsets
* @deprecated Use {@link #stemWord(String)} instead.
*/ */
public static String stem(String theWord, char[] charset) public static String stem(String theWord, char[] charset)
{ {
@ -627,4 +633,14 @@ class RussianStemmer
stemmer.setCharset(charset); stemmer.setCharset(charset);
return stemmer.stem(theWord); return stemmer.stem(theWord);
} }
/**
 * Static convenience method for stemming a single word of Unicode Russian text.
 * @param theWord the word to stem
 * @return the stemmed form of the word
 */
public static String stemWord(String theWord)
{
    // NOTE(review): the explicit setCharset call may be redundant if the no-arg
    // ctor already defaults to UnicodeRussian — confirm before removing.
    RussianStemmer s = new RussianStemmer();
    s.setCharset(RussianCharsets.UnicodeRussian);
    return s.stem(theWord);
}
} }

View File

@ -60,7 +60,7 @@ public class TestRussianAnalyzer extends TestCase
public void testUnicode() throws IOException public void testUnicode() throws IOException
{ {
RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.UnicodeRussian); RussianAnalyzer ra = new RussianAnalyzer();
inWords = inWords =
new InputStreamReader( new InputStreamReader(
new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/testUTF8.txt")), new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/testUTF8.txt")),
@ -75,8 +75,7 @@ public class TestRussianAnalyzer extends TestCase
RussianLetterTokenizer sample = RussianLetterTokenizer sample =
new RussianLetterTokenizer( new RussianLetterTokenizer(
sampleUnicode, sampleUnicode);
RussianCharsets.UnicodeRussian);
TermAttribute text = (TermAttribute) in.getAttribute(TermAttribute.class); TermAttribute text = (TermAttribute) in.getAttribute(TermAttribute.class);
TermAttribute sampleText = (TermAttribute) sample.getAttribute(TermAttribute.class); TermAttribute sampleText = (TermAttribute) sample.getAttribute(TermAttribute.class);