LUCENE-2747: Deprecate/remove language-specific tokenizers in favor of StandardTokenizer

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1043114 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2010-12-07 16:19:17 +00:00
parent 2b9726ae81
commit 620b2a0619
12 changed files with 120 additions and 76 deletions

View File

@ -171,6 +171,9 @@ API Changes
new SpanMultiTermQueryWrapper<RegexQuery>(new RegexQuery()) instead. new SpanMultiTermQueryWrapper<RegexQuery>(new RegexQuery()) instead.
(Robert Muir, Uwe Schindler) (Robert Muir, Uwe Schindler)
* LUCENE-2747: Deprecated ArabicLetterTokenizer. StandardTokenizer now tokenizes
most languages correctly including Arabic. (Steven Rowe, Robert Muir)
New features New features
* LUCENE-2306: Add NumericRangeFilter and NumericRangeQuery support to XMLQueryParser. * LUCENE-2306: Add NumericRangeFilter and NumericRangeQuery support to XMLQueryParser.

View File

@ -27,6 +27,7 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader; import org.apache.lucene.analysis.util.WordlistLoader;
@ -132,7 +133,7 @@ public final class ArabicAnalyzer extends StopwordAnalyzerBase {
* used to tokenize all the text in the provided {@link Reader}. * used to tokenize all the text in the provided {@link Reader}.
* *
* @return {@link org.apache.lucene.analysis.util.ReusableAnalyzerBase.TokenStreamComponents} * @return {@link org.apache.lucene.analysis.util.ReusableAnalyzerBase.TokenStreamComponents}
* built from an {@link ArabicLetterTokenizer} filtered with * built from a {@link StandardTokenizer} filtered with
* {@link LowerCaseFilter}, {@link StopFilter}, * {@link LowerCaseFilter}, {@link StopFilter},
* {@link ArabicNormalizationFilter}, {@link KeywordMarkerFilter} * {@link ArabicNormalizationFilter}, {@link KeywordMarkerFilter}
* if a stem exclusion set is provided and {@link ArabicStemFilter}. * if a stem exclusion set is provided and {@link ArabicStemFilter}.
@ -140,7 +141,8 @@ public final class ArabicAnalyzer extends StopwordAnalyzerBase {
@Override @Override
protected TokenStreamComponents createComponents(String fieldName, protected TokenStreamComponents createComponents(String fieldName,
Reader reader) { Reader reader) {
final Tokenizer source = new ArabicLetterTokenizer(matchVersion, reader); final Tokenizer source = matchVersion.onOrAfter(Version.LUCENE_31) ?
new StandardTokenizer(matchVersion, reader) : new ArabicLetterTokenizer(matchVersion, reader);
TokenStream result = new LowerCaseFilter(matchVersion, source); TokenStream result = new LowerCaseFilter(matchVersion, source);
// the order here is important: the stopword list is not normalized! // the order here is important: the stopword list is not normalized!
result = new StopFilter( matchVersion, result, stopwords); result = new StopFilter( matchVersion, result, stopwords);

View File

@ -20,6 +20,7 @@ import java.io.Reader;
import org.apache.lucene.analysis.CharTokenizer; import org.apache.lucene.analysis.CharTokenizer;
import org.apache.lucene.analysis.core.LetterTokenizer; import org.apache.lucene.analysis.core.LetterTokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer; // javadoc @link
import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.Version; import org.apache.lucene.util.Version;
@ -38,7 +39,9 @@ import org.apache.lucene.util.Version;
* detect token characters. See {@link #isTokenChar(int)} and * detect token characters. See {@link #isTokenChar(int)} and
* {@link #normalize(int)} for details.</li> * {@link #normalize(int)} for details.</li>
* </ul> * </ul>
* @deprecated (3.1) Use {@link StandardTokenizer} instead.
*/ */
@Deprecated
public class ArabicLetterTokenizer extends LetterTokenizer { public class ArabicLetterTokenizer extends LetterTokenizer {
/** /**
* Construct a new ArabicLetterTokenizer. * Construct a new ArabicLetterTokenizer.

View File

@ -22,12 +22,14 @@ import java.io.Reader;
import java.util.Set; import java.util.Set;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ar.ArabicLetterTokenizer; import org.apache.lucene.analysis.ar.ArabicLetterTokenizer;
import org.apache.lucene.analysis.ar.ArabicNormalizationFilter; import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.util.Version; import org.apache.lucene.util.Version;
@ -109,14 +111,19 @@ public final class PersianAnalyzer extends StopwordAnalyzerBase {
* used to tokenize all the text in the provided {@link Reader}. * used to tokenize all the text in the provided {@link Reader}.
* *
* @return {@link org.apache.lucene.analysis.util.ReusableAnalyzerBase.TokenStreamComponents} * @return {@link org.apache.lucene.analysis.util.ReusableAnalyzerBase.TokenStreamComponents}
* built from a {@link ArabicLetterTokenizer} filtered with * built from a {@link StandardTokenizer} filtered with
* {@link LowerCaseFilter}, {@link ArabicNormalizationFilter}, * {@link LowerCaseFilter}, {@link ArabicNormalizationFilter},
* {@link PersianNormalizationFilter} and Persian Stop words * {@link PersianNormalizationFilter} and Persian Stop words
*/ */
@Override @Override
protected TokenStreamComponents createComponents(String fieldName, protected TokenStreamComponents createComponents(String fieldName,
Reader reader) { Reader reader) {
final Tokenizer source = new ArabicLetterTokenizer(matchVersion, reader); final Tokenizer source;
if (matchVersion.onOrAfter(Version.LUCENE_31)) {
source = new StandardTokenizer(matchVersion, reader);
} else {
source = new ArabicLetterTokenizer(matchVersion, reader);
}
TokenStream result = new LowerCaseFilter(matchVersion, source); TokenStream result = new LowerCaseFilter(matchVersion, source);
result = new ArabicNormalizationFilter(result); result = new ArabicNormalizationFilter(result);
/* additional persian-specific normalization */ /* additional persian-specific normalization */
@ -127,4 +134,14 @@ public final class PersianAnalyzer extends StopwordAnalyzerBase {
*/ */
return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords)); return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords));
} }
/**
* Wraps the Reader with {@link PersianCharFilter}
*/
@Override
protected Reader initReader(Reader reader) {
return matchVersion.onOrAfter(Version.LUCENE_31) ?
new PersianCharFilter(CharReader.get(reader)) :
reader;
}
} }

View File

@ -0,0 +1,47 @@
package org.apache.lucene.analysis.fa;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.charfilter.CharFilter;
/**
* CharFilter that replaces instances of Zero-width non-joiner with an
* ordinary space.
*/
public class PersianCharFilter extends CharFilter {
public PersianCharFilter(CharStream in) {
super(in);
}
public int read(char[] cbuf, int off, int len) throws IOException {
final int charsRead = super.read(cbuf, off, len);
if (charsRead > 0) {
final int end = off + charsRead;
while (off < end) {
if (cbuf[off] == '\u200C')
cbuf[off] = ' ';
off++;
}
}
return charsRead;
}
}

View File

@ -75,8 +75,9 @@ public abstract class ReusableAnalyzerBase extends Analyzer {
final Reader reader) throws IOException { final Reader reader) throws IOException {
TokenStreamComponents streamChain = (TokenStreamComponents) TokenStreamComponents streamChain = (TokenStreamComponents)
getPreviousTokenStream(); getPreviousTokenStream();
if (streamChain == null || !streamChain.reset(reader)) { final Reader r = initReader(reader);
streamChain = createComponents(fieldName, reader); if (streamChain == null || !streamChain.reset(r)) {
streamChain = createComponents(fieldName, r);
setPreviousTokenStream(streamChain); setPreviousTokenStream(streamChain);
} }
return streamChain.getTokenStream(); return streamChain.getTokenStream();
@ -95,7 +96,14 @@ public abstract class ReusableAnalyzerBase extends Analyzer {
@Override @Override
public final TokenStream tokenStream(final String fieldName, public final TokenStream tokenStream(final String fieldName,
final Reader reader) { final Reader reader) {
return createComponents(fieldName, reader).getTokenStream(); return createComponents(fieldName, initReader(reader)).getTokenStream();
}
/**
* Override this if you want to add a CharFilter chain.
*/
protected Reader initReader(Reader reader) {
return reader;
} }
/** /**

View File

@ -25,7 +25,9 @@ import org.apache.lucene.util.Version;
/** /**
* Testcase for {@link ArabicLetterTokenizer} * Testcase for {@link ArabicLetterTokenizer}
* @deprecated (3.1) Remove in Lucene 5.0
*/ */
@Deprecated
public class TestArabicLetterTokenizer extends BaseTokenStreamTestCase { public class TestArabicLetterTokenizer extends BaseTokenStreamTestCase {
public void testArabicLetterTokenizer() throws IOException { public void testArabicLetterTokenizer() throws IOException {

View File

@ -1,44 +0,0 @@
package org.apache.lucene.analysis.in;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
/**
 * Test IndicTokenizer
 */
public class TestIndicTokenizer extends BaseTokenStreamTestCase {
  /** Test tokenizing Indic vowels, signs, and punctuation */
  public void testBasics() throws IOException {
    final String input = "मुझे हिंदी का और अभ्यास करना होगा ।";
    TokenStream stream =
        new IndicTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
    // the trailing danda (।) is punctuation and must not produce a token
    assertTokenStreamContents(stream,
        new String[] { "मुझे", "हिंदी", "का", "और", "अभ्यास", "करना", "होगा" });
  }

  /** Test that words with format chars such as ZWJ are kept */
  public void testFormat() throws Exception {
    final String input = "शार्‍मा शार्‍मा";
    TokenStream stream =
        new IndicTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
    assertTokenStreamContents(stream, new String[] { "शार्‍मा", "शार्‍मा" });
  }
}

View File

@ -23,7 +23,9 @@ import java.io.Reader;
/** /**
* Factory for {@link ArabicLetterTokenizer} * Factory for {@link ArabicLetterTokenizer}
* @deprecated (3.1) Use StandardTokenizerFactory instead.
**/ **/
@Deprecated
public class ArabicLetterTokenizerFactory extends BaseTokenizerFactory{ public class ArabicLetterTokenizerFactory extends BaseTokenizerFactory{
public ArabicLetterTokenizer create(Reader input) { public ArabicLetterTokenizer create(Reader input) {

View File

@ -17,15 +17,16 @@ package org.apache.solr.analysis;
* limitations under the License. * limitations under the License.
*/ */
import java.io.Reader; import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.fa.PersianCharFilter;
import org.apache.lucene.analysis.Tokenizer; /**
import org.apache.lucene.analysis.in.IndicTokenizer; * Factory for {@link PersianCharFilter}
*/
public class PersianCharFilterFactory extends BaseCharFilterFactory {
/** Factory for {@link IndicTokenizer} */ @Override
public class IndicTokenizerFactory extends BaseTokenizerFactory { public CharStream create(CharStream input) {
public Tokenizer create(Reader input) { return new PersianCharFilter(input);
assureMatchVersion();
return new IndicTokenizer(luceneMatchVersion, input);
} }
} }

View File

@ -20,6 +20,7 @@ package org.apache.solr.analysis;
import java.io.Reader; import java.io.Reader;
import java.io.StringReader; import java.io.StringReader;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.Tokenizer;
@ -29,7 +30,9 @@ import org.apache.lucene.analysis.Tokenizer;
public class TestArabicFilters extends BaseTokenTestCase { public class TestArabicFilters extends BaseTokenTestCase {
/** /**
* Test ArabicLetterTokenizerFactory * Test ArabicLetterTokenizerFactory
* @deprecated (3.1) Remove in Lucene 5.0
*/ */
@Deprecated
public void testTokenizer() throws Exception { public void testTokenizer() throws Exception {
Reader reader = new StringReader("الذين مَلكت أيمانكم"); Reader reader = new StringReader("الذين مَلكت أيمانكم");
ArabicLetterTokenizerFactory factory = new ArabicLetterTokenizerFactory(); ArabicLetterTokenizerFactory factory = new ArabicLetterTokenizerFactory();
@ -43,7 +46,7 @@ public class TestArabicFilters extends BaseTokenTestCase {
*/ */
public void testNormalizer() throws Exception { public void testNormalizer() throws Exception {
Reader reader = new StringReader("الذين مَلكت أيمانكم"); Reader reader = new StringReader("الذين مَلكت أيمانكم");
ArabicLetterTokenizerFactory factory = new ArabicLetterTokenizerFactory(); StandardTokenizerFactory factory = new StandardTokenizerFactory();
ArabicNormalizationFilterFactory filterFactory = new ArabicNormalizationFilterFactory(); ArabicNormalizationFilterFactory filterFactory = new ArabicNormalizationFilterFactory();
factory.init(DEFAULT_VERSION_PARAM); factory.init(DEFAULT_VERSION_PARAM);
filterFactory.init(DEFAULT_VERSION_PARAM); filterFactory.init(DEFAULT_VERSION_PARAM);
@ -57,7 +60,7 @@ public class TestArabicFilters extends BaseTokenTestCase {
*/ */
public void testStemmer() throws Exception { public void testStemmer() throws Exception {
Reader reader = new StringReader("الذين مَلكت أيمانكم"); Reader reader = new StringReader("الذين مَلكت أيمانكم");
ArabicLetterTokenizerFactory factory = new ArabicLetterTokenizerFactory(); StandardTokenizerFactory factory = new StandardTokenizerFactory();
ArabicNormalizationFilterFactory normFactory = new ArabicNormalizationFilterFactory(); ArabicNormalizationFilterFactory normFactory = new ArabicNormalizationFilterFactory();
ArabicStemFilterFactory stemFactory = new ArabicStemFilterFactory(); ArabicStemFilterFactory stemFactory = new ArabicStemFilterFactory();
factory.init(DEFAULT_VERSION_PARAM); factory.init(DEFAULT_VERSION_PARAM);
@ -67,4 +70,16 @@ public class TestArabicFilters extends BaseTokenTestCase {
stream = stemFactory.create(stream); stream = stemFactory.create(stream);
assertTokenStreamContents(stream, new String[] {"ذين", "ملكت", "ايمانكم"}); assertTokenStreamContents(stream, new String[] {"ذين", "ملكت", "ايمانكم"});
} }
/**
* Test PersianCharFilterFactory
*/
public void testPersianCharFilter() throws Exception {
Reader reader = new StringReader("می‌خورد");
PersianCharFilterFactory charfilterFactory = new PersianCharFilterFactory();
StandardTokenizerFactory tokenizerFactory = new StandardTokenizerFactory();
tokenizerFactory.init(DEFAULT_VERSION_PARAM);
TokenStream stream = tokenizerFactory.create(charfilterFactory.create(CharReader.get(reader)));
assertTokenStreamContents(stream, new String[] { "می", "خورد" });
}
} }

View File

@ -27,24 +27,12 @@ import org.apache.lucene.analysis.Tokenizer;
* Simple tests to ensure the Hindi filter Factories are working. * Simple tests to ensure the Hindi filter Factories are working.
*/ */
public class TestHindiFilters extends BaseTokenTestCase { public class TestHindiFilters extends BaseTokenTestCase {
/**
* Test IndicTokenizerFactory
*/
public void testTokenizer() throws Exception {
Reader reader = new StringReader("मुझे हिंदी का और अभ्यास करना होगा ।");
IndicTokenizerFactory factory = new IndicTokenizerFactory();
factory.init(DEFAULT_VERSION_PARAM);
Tokenizer stream = factory.create(reader);
assertTokenStreamContents(stream,
new String[] { "मुझे", "हिंदी", "का", "और", "अभ्यास", "करना", "होगा" });
}
/** /**
* Test IndicNormalizationFilterFactory * Test IndicNormalizationFilterFactory
*/ */
public void testIndicNormalizer() throws Exception { public void testIndicNormalizer() throws Exception {
Reader reader = new StringReader("ত্‍ अाैर"); Reader reader = new StringReader("ত্‍ अाैर");
IndicTokenizerFactory factory = new IndicTokenizerFactory(); StandardTokenizerFactory factory = new StandardTokenizerFactory();
IndicNormalizationFilterFactory filterFactory = new IndicNormalizationFilterFactory(); IndicNormalizationFilterFactory filterFactory = new IndicNormalizationFilterFactory();
factory.init(DEFAULT_VERSION_PARAM); factory.init(DEFAULT_VERSION_PARAM);
filterFactory.init(DEFAULT_VERSION_PARAM); filterFactory.init(DEFAULT_VERSION_PARAM);
@ -58,7 +46,7 @@ public class TestHindiFilters extends BaseTokenTestCase {
*/ */
public void testHindiNormalizer() throws Exception { public void testHindiNormalizer() throws Exception {
Reader reader = new StringReader("क़िताब"); Reader reader = new StringReader("क़िताब");
IndicTokenizerFactory factory = new IndicTokenizerFactory(); StandardTokenizerFactory factory = new StandardTokenizerFactory();
IndicNormalizationFilterFactory indicFilterFactory = new IndicNormalizationFilterFactory(); IndicNormalizationFilterFactory indicFilterFactory = new IndicNormalizationFilterFactory();
HindiNormalizationFilterFactory hindiFilterFactory = new HindiNormalizationFilterFactory(); HindiNormalizationFilterFactory hindiFilterFactory = new HindiNormalizationFilterFactory();
factory.init(DEFAULT_VERSION_PARAM); factory.init(DEFAULT_VERSION_PARAM);
@ -74,7 +62,7 @@ public class TestHindiFilters extends BaseTokenTestCase {
*/ */
public void testStemmer() throws Exception { public void testStemmer() throws Exception {
Reader reader = new StringReader("किताबें"); Reader reader = new StringReader("किताबें");
IndicTokenizerFactory factory = new IndicTokenizerFactory(); StandardTokenizerFactory factory = new StandardTokenizerFactory();
IndicNormalizationFilterFactory indicFilterFactory = new IndicNormalizationFilterFactory(); IndicNormalizationFilterFactory indicFilterFactory = new IndicNormalizationFilterFactory();
HindiNormalizationFilterFactory hindiFilterFactory = new HindiNormalizationFilterFactory(); HindiNormalizationFilterFactory hindiFilterFactory = new HindiNormalizationFilterFactory();
HindiStemFilterFactory stemFactory = new HindiStemFilterFactory(); HindiStemFilterFactory stemFactory = new HindiStemFilterFactory();