LUCENE-1793: Deprecate custom encoding support in Greek and Russian analyzers

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@806886 13f79535-47bb-0310-9956-ffa450edef68
2009-08-22 20:36:06 +00:00 · 2009-08-22 20:36:06 +00:00 · 1ebbe2abd1
parent 136f054a05
commit 1ebbe2abd1
11 changed files with 121 additions and 9 deletions
--- a/contrib/CHANGES.txt
+++ b/contrib/CHANGES.txt
@ -40,6 +40,10 @@ API Changes
    The SpanScorer API (now QueryScorer) has also been improved to more closely
    match the API of the previous QueryScorer implementation.  (Mark Miller)  

+ 5. LUCENE-1793: Deprecate the custom encoding support in the Greek and Russian
+    Analyzers. If you need to index text in these encodings, please use Java's
+    character set conversion facilities (InputStreamReader, etc) during I/O, 
+    so that Lucene can analyze this text as Unicode instead.  (Robert Muir)  

 Bug fixes

--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekAnalyzer.java
@ -155,6 +155,7 @@ public final class GreekAnalyzer extends Analyzer
     * Charset for Greek letters.
     * Represents encoding for 24 lowercase Greek letters.
     * Predefined charsets can be taken from {@link GreekCharsets} class
+     * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
     */
    private char[] charset;

@ -166,15 +167,27 @@ public final class GreekAnalyzer extends Analyzer

    /**
     * Builds an analyzer.
+     * @deprecated Use {@link #GreekAnalyzer()} instead.
     */
    public GreekAnalyzer(char[] charset)
    {
        this.charset = charset;
        stopSet = StopFilter.makeStopSet(makeStopWords(charset));
    }
+    
+    /**
+     * Builds an analyzer with the given stop words.
+     * @param stopwords Array of stopwords to use.
+     */
+    public GreekAnalyzer(String [] stopwords)
+    {
+    	charset = GreekCharsets.UnicodeGreek;
+    	stopSet = StopFilter.makeStopSet(stopwords);
+    }

    /**
     * Builds an analyzer with the given stop words.
+     * @deprecated Use {@link #GreekAnalyzer(String[])} instead.
     */
    public GreekAnalyzer(char[] charset, String[] stopwords)
    {
@ -182,8 +195,11 @@ public final class GreekAnalyzer extends Analyzer
        stopSet = StopFilter.makeStopSet(stopwords);
    }

-    // Takes greek stop words and translates them to a String array, using
-    // the given charset
+    /**
+     * Takes greek stop words and translates them to a String array, using
+     * the given charset.
+     * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
+     */
    private static String[] makeStopWords(char[] charset)
    {
        String[] res = new String[GREEK_STOP_WORDS.length];
@ -203,12 +219,22 @@ public final class GreekAnalyzer extends Analyzer

    /**
     * Builds an analyzer with the given stop words.
+     * @deprecated Use {@link #GreekAnalyzer(Map)} instead.
     */
    public GreekAnalyzer(char[] charset, Map stopwords)
    {
        this.charset = charset;
        stopSet = new HashSet(stopwords.keySet());
    }
+    
+    /**
+     * Builds an analyzer with the given stop words.
+     */
+    public GreekAnalyzer(Map stopwords)
+    {
+    	charset = GreekCharsets.UnicodeGreek;
+    	stopSet = new HashSet(stopwords.keySet());
+    }

    /**
     * Creates a {@link TokenStream} which tokenizes all the text in the provided {@link Reader}.
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekCharsets.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekCharsets.java
@ -24,6 +24,7 @@ package org.apache.lucene.analysis.el;
 * including accented ones. One should be able to add other encoding schemes (see RFC 1947) by adding
 * the definition of a new charset as well as the required logic in the toLowerCase() method.
 * </p>
+ * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
 */
 public class GreekCharsets
 {
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java
@ -28,16 +28,27 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 */
 public final class GreekLowerCaseFilter extends TokenFilter
 {
+    /**
+     * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
+     */
    char[] charset;

    private TermAttribute termAtt;
    
+    /**
+     * @deprecated Use {@link #GreekLowerCaseFilter(TokenStream)} instead.
+     */
    public GreekLowerCaseFilter(TokenStream in, char[] charset)
    {
        super(in);
        this.charset = charset;
        termAtt = (TermAttribute) addAttribute(TermAttribute.class);
    }
+    
+    public GreekLowerCaseFilter(TokenStream in)
+    {
+    	this(in, GreekCharsets.UnicodeGreek);
+    }

    public boolean incrementToken() throws IOException {
      if (input.incrementToken()) {
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianAnalyzer.java
@ -190,6 +190,7 @@ public final class RussianAnalyzer extends Analyzer
     * Charset for Russian letters.
     * Represents encoding for 32 lowercase Russian letters.
     * Predefined charsets can be taken from RussianCharSets class
+     * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
     */
    private char[] charset;

@ -202,6 +203,7 @@ public final class RussianAnalyzer extends Analyzer

    /**
     * Builds an analyzer.
+     * @deprecated Use {@link #RussianAnalyzer()} instead.
     */
    public RussianAnalyzer(char[] charset)
    {
@ -211,15 +213,27 @@ public final class RussianAnalyzer extends Analyzer

    /**
     * Builds an analyzer with the given stop words.
+     * @deprecated Use {@link #RussianAnalyzer(String[])} instead.
     */
    public RussianAnalyzer(char[] charset, String[] stopwords)
    {
        this.charset = charset;
        stopSet = StopFilter.makeStopSet(stopwords);
    }
+    
+    /**
+     * Builds an analyzer with the given stop words.
+     */
+    public RussianAnalyzer(String[] stopwords)
+    {
+    	this.charset = RussianCharsets.UnicodeRussian;
+    	stopSet = StopFilter.makeStopSet(stopwords);
+    }

-    // Takes russian stop words and translates them to a String array, using
-    // the given charset
+    /** Takes russian stop words and translates them to a String array, using
+     * the given charset.
+     * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
+     */
    private static String[] makeStopWords(char[] charset)
    {
        String[] res = new String[RUSSIAN_STOP_WORDS.length];
@ -240,12 +254,23 @@ public final class RussianAnalyzer extends Analyzer
    /**
     * Builds an analyzer with the given stop words.
     * @todo create a Set version of this ctor
+     * @deprecated Use {@link #RussianAnalyzer(Map)} instead.
     */
    public RussianAnalyzer(char[] charset, Map stopwords)
    {
        this.charset = charset;
        stopSet = new HashSet(stopwords.keySet());
    }
+    
+    /**
+     * Builds an analyzer with the given stop words.
+     * @todo create a Set version of this ctor
+     */
+    public RussianAnalyzer(Map stopwords)
+    {
+    	charset = RussianCharsets.UnicodeRussian;
+    	stopSet = new HashSet(stopwords.keySet());
+    }

    /**
     * Creates a {@link TokenStream} which tokenizes all the text in the 
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianCharsets.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianCharsets.java
@ -24,7 +24,7 @@ package org.apache.lucene.analysis.ru;
 * One should be able to add other encoding schemes (like ISO-8859-5 or customized) by adding a new charset
 * and adding logic to toLowerCase() method for that charset.
 * </p>
- *
+ * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
 * @version $Id$
 */
 public class RussianCharsets
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLetterTokenizer.java
@ -37,14 +37,25 @@ import org.apache.lucene.analysis.LetterTokenizer; // for javadocs

 public class RussianLetterTokenizer extends CharTokenizer
 {
-    /** Construct a new LetterTokenizer. */
+    /** 
+     * Charset this tokenizer uses.
+     * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
+     */
    private char[] charset;

+    /**
+     * @deprecated Use {@link #RussianLetterTokenizer(Reader)} instead. 
+     */
    public RussianLetterTokenizer(Reader in, char[] charset)
    {
        super(in);
        this.charset = charset;
    }
+    
+    public RussianLetterTokenizer(Reader in)
+    {
+    	this(in, RussianCharsets.UnicodeRussian);
+    }

    /**
     * Collects only characters which satisfy
@ -52,6 +63,7 @@ public class RussianLetterTokenizer extends CharTokenizer
     */
    protected boolean isTokenChar(char c)
    {
+    	/* in the next release, this can be implemented as isLetter(c) or [0-9] */
        if (Character.isLetter(c))
            return true;
        for (int i = 0; i < charset.length; i++)
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianLowerCaseFilter.java
@ -31,16 +31,27 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 */
 public final class RussianLowerCaseFilter extends TokenFilter
 {
+    /**
+     * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
+     */
    char[] charset;

    private TermAttribute termAtt;

+    /**
+     * @deprecated Use {@link #RussianLowerCaseFilter(TokenStream)} instead.
+     */
    public RussianLowerCaseFilter(TokenStream in, char[] charset)
    {
        super(in);
        this.charset = charset;
        termAtt = (TermAttribute) addAttribute(TermAttribute.class);
    }
+    
+    public RussianLowerCaseFilter(TokenStream in)
+    {
+    	this(in, RussianCharsets.UnicodeRussian);
+    }

    public final boolean incrementToken() throws IOException
    {
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemFilter.java
@ -42,6 +42,9 @@ public final class RussianStemFilter extends TokenFilter

    private TermAttribute termAtt;

+    /**
+     * @deprecated Use {@link #RussianStemFilter(TokenStream)} instead.
+     */
    public RussianStemFilter(TokenStream in, char[] charset)
    {
        super(in);
@ -49,6 +52,10 @@ public final class RussianStemFilter extends TokenFilter
        termAtt = (TermAttribute) addAttribute(TermAttribute.class);
    }

+    public RussianStemFilter(TokenStream in)
+    {
+    	this(in, RussianCharsets.UnicodeRussian);
+    }
    /**
     * Returns the next token in the stream, or null at EOS
     */
--- a/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemmer.java
+++ b/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemmer.java
@ -25,6 +25,9 @@ package org.apache.lucene.analysis.ru;
 */
 class RussianStemmer
 {
+    /**
+     * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0 
+     */
    private char[] charset;

    // positions of RV, R1 and R2 respectively
@ -255,6 +258,7 @@ class RussianStemmer

    /**
     * RussianStemmer constructor comment.
+     * @deprecated Use {@link #RussianStemmer()} instead.
     */
    public RussianStemmer(char[] charset)
    {
@ -529,6 +533,7 @@ class RussianStemmer
     * Insert the method's description here.
     * Creation date: (16/03/2002 10:58:42 PM)
     * @param newCharset char[]
+     * @deprecated Support for non-Unicode encodings will be removed in Lucene 3.0
     */
    public void setCharset(char[] newCharset)
    {
@ -620,6 +625,7 @@ class RussianStemmer

    /**
     * Static method for stemming with different charsets
+     * @deprecated Use {@link #stemWord(String)} instead.
     */
    public static String stem(String theWord, char[] charset)
    {
@ -627,4 +633,14 @@ class RussianStemmer
        stemmer.setCharset(charset);
        return stemmer.stem(theWord);
    }
+    
+    /**
+     * Static method for stemming.
+     */
+    public static String stemWord(String theWord)
+    {
+        RussianStemmer stemmer = new RussianStemmer();
+        stemmer.setCharset(RussianCharsets.UnicodeRussian);
+        return stemmer.stem(theWord);
+    }
 }
--- a/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java
+++ b/contrib/analyzers/common/src/test/org/apache/lucene/analysis/ru/TestRussianAnalyzer.java
@ -60,7 +60,7 @@ public class TestRussianAnalyzer extends TestCase

    public void testUnicode() throws IOException
    {
-        RussianAnalyzer ra = new RussianAnalyzer(RussianCharsets.UnicodeRussian);
+        RussianAnalyzer ra = new RussianAnalyzer();
        inWords =
            new InputStreamReader(
                new FileInputStream(new File(dataDir, "/org/apache/lucene/analysis/ru/testUTF8.txt")),
@ -75,8 +75,7 @@ public class TestRussianAnalyzer extends TestCase

        RussianLetterTokenizer sample =
            new RussianLetterTokenizer(
-                sampleUnicode,
-                RussianCharsets.UnicodeRussian);
+                sampleUnicode);

        TermAttribute text = (TermAttribute) in.getAttribute(TermAttribute.class);
        TermAttribute sampleText = (TermAttribute) sample.getAttribute(TermAttribute.class);