LUCENE-2747: Deprecate/remove language-specific tokenizers in favor of StandardTokenizer

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1043114 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2010-12-07 16:19:17 +00:00
parent 2b9726ae81
commit 620b2a0619
12 changed files with 120 additions and 76 deletions

View File

@ -171,6 +171,9 @@ API Changes
new SpanMultiTermQueryWrapper<RegexQuery>(new RegexQuery()) instead.
(Robert Muir, Uwe Schindler)
* LUCENE-2747: Deprecated ArabicLetterTokenizer. StandardTokenizer now tokenizes
most languages correctly including Arabic. (Steven Rowe, Robert Muir)
New features
* LUCENE-2306: Add NumericRangeFilter and NumericRangeQuery support to XMLQueryParser.

View File

@ -27,6 +27,7 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
@ -132,7 +133,7 @@ public final class ArabicAnalyzer extends StopwordAnalyzerBase {
* used to tokenize all the text in the provided {@link Reader}.
*
* @return {@link org.apache.lucene.analysis.util.ReusableAnalyzerBase.TokenStreamComponents}
* built from an {@link ArabicLetterTokenizer} filtered with
* built from an {@link StandardTokenizer} filtered with
* {@link LowerCaseFilter}, {@link StopFilter},
* {@link ArabicNormalizationFilter}, {@link KeywordMarkerFilter}
* if a stem exclusion set is provided and {@link ArabicStemFilter}.
@ -140,7 +141,8 @@ public final class ArabicAnalyzer extends StopwordAnalyzerBase {
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
final Tokenizer source = new ArabicLetterTokenizer(matchVersion, reader);
final Tokenizer source = matchVersion.onOrAfter(Version.LUCENE_31) ?
new StandardTokenizer(matchVersion, reader) : new ArabicLetterTokenizer(matchVersion, reader);
TokenStream result = new LowerCaseFilter(matchVersion, source);
// the order here is important: the stopword list is not normalized!
result = new StopFilter( matchVersion, result, stopwords);

View File

@ -20,6 +20,7 @@ import java.io.Reader;
import org.apache.lucene.analysis.CharTokenizer;
import org.apache.lucene.analysis.core.LetterTokenizer;
import org.apache.lucene.analysis.standard.StandardTokenizer; // javadoc @link
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.Version;
@ -38,7 +39,9 @@ import org.apache.lucene.util.Version;
* detect token characters. See {@link #isTokenChar(int)} and
* {@link #normalize(int)} for details.</li>
* </ul>
* @deprecated (3.1) Use {@link StandardTokenizer} instead.
*/
@Deprecated
public class ArabicLetterTokenizer extends LetterTokenizer {
/**
* Construct a new ArabicLetterTokenizer.

View File

@ -22,12 +22,14 @@ import java.io.Reader;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ar.ArabicLetterTokenizer;
import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.util.Version;
@ -109,14 +111,19 @@ public final class PersianAnalyzer extends StopwordAnalyzerBase {
* used to tokenize all the text in the provided {@link Reader}.
*
* @return {@link org.apache.lucene.analysis.util.ReusableAnalyzerBase.TokenStreamComponents}
* built from a {@link ArabicLetterTokenizer} filtered with
* built from a {@link StandardTokenizer} filtered with
* {@link LowerCaseFilter}, {@link ArabicNormalizationFilter},
* {@link PersianNormalizationFilter} and Persian Stop words
*/
@Override
protected TokenStreamComponents createComponents(String fieldName,
Reader reader) {
final Tokenizer source = new ArabicLetterTokenizer(matchVersion, reader);
final Tokenizer source;
if (matchVersion.onOrAfter(Version.LUCENE_31)) {
source = new StandardTokenizer(matchVersion, reader);
} else {
source = new ArabicLetterTokenizer(matchVersion, reader);
}
TokenStream result = new LowerCaseFilter(matchVersion, source);
result = new ArabicNormalizationFilter(result);
/* additional persian-specific normalization */
@ -127,4 +134,14 @@ public final class PersianAnalyzer extends StopwordAnalyzerBase {
*/
return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords));
}
/**
* Wraps the Reader with {@link PersianCharFilter}
*/
@Override
protected Reader initReader(Reader reader) {
return matchVersion.onOrAfter(Version.LUCENE_31) ?
new PersianCharFilter(CharReader.get(reader)) :
reader;
}
}

View File

@ -0,0 +1,47 @@
package org.apache.lucene.analysis.fa;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.charfilter.CharFilter;
/**
* CharFilter that replaces instances of Zero-width non-joiner with an
* ordinary space.
*/
public class PersianCharFilter extends CharFilter {
public PersianCharFilter(CharStream in) {
super(in);
}
public int read(char[] cbuf, int off, int len) throws IOException {
final int charsRead = super.read(cbuf, off, len);
if (charsRead > 0) {
final int end = off + charsRead;
while (off < end) {
if (cbuf[off] == '\u200C')
cbuf[off] = ' ';
off++;
}
}
return charsRead;
}
}

View File

@ -75,8 +75,9 @@ public abstract class ReusableAnalyzerBase extends Analyzer {
final Reader reader) throws IOException {
TokenStreamComponents streamChain = (TokenStreamComponents)
getPreviousTokenStream();
if (streamChain == null || !streamChain.reset(reader)) {
streamChain = createComponents(fieldName, reader);
final Reader r = initReader(reader);
if (streamChain == null || !streamChain.reset(r)) {
streamChain = createComponents(fieldName, r);
setPreviousTokenStream(streamChain);
}
return streamChain.getTokenStream();
@ -95,7 +96,14 @@ public abstract class ReusableAnalyzerBase extends Analyzer {
@Override
public final TokenStream tokenStream(final String fieldName,
final Reader reader) {
return createComponents(fieldName, reader).getTokenStream();
return createComponents(fieldName, initReader(reader)).getTokenStream();
}
/**
* Override this if you want to add a CharFilter chain.
*/
protected Reader initReader(Reader reader) {
return reader;
}
/**

View File

@ -25,7 +25,9 @@ import org.apache.lucene.util.Version;
/**
* Testcase for {@link ArabicLetterTokenizer}
* @deprecated (3.1) Remove in Lucene 5.0
*/
@Deprecated
public class TestArabicLetterTokenizer extends BaseTokenStreamTestCase {
public void testArabicLetterTokenizer() throws IOException {

View File

@ -1,44 +0,0 @@
package org.apache.lucene.analysis.in;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
/**
 * Test IndicTokenizer
 */
public class TestIndicTokenizer extends BaseTokenStreamTestCase {

  /** Test tokenizing Indic vowels, signs, and punctuation */
  public void testBasics() throws IOException {
    final TokenStream stream = new IndicTokenizer(TEST_VERSION_CURRENT,
        new StringReader("मुझे हिंदी का और अभ्यास करना होगा ।"));
    final String[] expected =
        { "मुझे", "हिंदी", "का", "और", "अभ्यास", "करना", "होगा" };
    assertTokenStreamContents(stream, expected);
  }

  /** Test that words with format chars such as ZWJ are kept */
  public void testFormat() throws Exception {
    final TokenStream stream = new IndicTokenizer(TEST_VERSION_CURRENT,
        new StringReader("शार्‍मा शार्‍मा"));
    final String[] expected = { "शार्‍मा", "शार्‍मा" };
    assertTokenStreamContents(stream, expected);
  }
}

View File

@ -23,7 +23,9 @@ import java.io.Reader;
/**
* Factory for {@link ArabicLetterTokenizer}
* @deprecated (3.1) Use StandardTokenizerFactory instead.
**/
@Deprecated
public class ArabicLetterTokenizerFactory extends BaseTokenizerFactory{
public ArabicLetterTokenizer create(Reader input) {

View File

@ -17,15 +17,16 @@ package org.apache.solr.analysis;
* limitations under the License.
*/
import java.io.Reader;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.fa.PersianCharFilter;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.in.IndicTokenizer;
/**
* Factory for {@link PersianCharFilter}
*/
public class PersianCharFilterFactory extends BaseCharFilterFactory {
/** Factory for {@link IndicTokenizer} */
public class IndicTokenizerFactory extends BaseTokenizerFactory {
public Tokenizer create(Reader input) {
assureMatchVersion();
return new IndicTokenizer(luceneMatchVersion, input);
@Override
public CharStream create(CharStream input) {
return new PersianCharFilter(input);
}
}

View File

@ -20,6 +20,7 @@ package org.apache.solr.analysis;
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
@ -29,7 +30,9 @@ import org.apache.lucene.analysis.Tokenizer;
public class TestArabicFilters extends BaseTokenTestCase {
/**
* Test ArabicLetterTokenizerFactory
* @deprecated (3.1) Remove in Lucene 5.0
*/
@Deprecated
public void testTokenizer() throws Exception {
Reader reader = new StringReader("الذين مَلكت أيمانكم");
ArabicLetterTokenizerFactory factory = new ArabicLetterTokenizerFactory();
@ -43,7 +46,7 @@ public class TestArabicFilters extends BaseTokenTestCase {
*/
public void testNormalizer() throws Exception {
Reader reader = new StringReader("الذين مَلكت أيمانكم");
ArabicLetterTokenizerFactory factory = new ArabicLetterTokenizerFactory();
StandardTokenizerFactory factory = new StandardTokenizerFactory();
ArabicNormalizationFilterFactory filterFactory = new ArabicNormalizationFilterFactory();
factory.init(DEFAULT_VERSION_PARAM);
filterFactory.init(DEFAULT_VERSION_PARAM);
@ -57,7 +60,7 @@ public class TestArabicFilters extends BaseTokenTestCase {
*/
public void testStemmer() throws Exception {
Reader reader = new StringReader("الذين مَلكت أيمانكم");
ArabicLetterTokenizerFactory factory = new ArabicLetterTokenizerFactory();
StandardTokenizerFactory factory = new StandardTokenizerFactory();
ArabicNormalizationFilterFactory normFactory = new ArabicNormalizationFilterFactory();
ArabicStemFilterFactory stemFactory = new ArabicStemFilterFactory();
factory.init(DEFAULT_VERSION_PARAM);
@ -67,4 +70,16 @@ public class TestArabicFilters extends BaseTokenTestCase {
stream = stemFactory.create(stream);
assertTokenStreamContents(stream, new String[] {"ذين", "ملكت", "ايمانكم"});
}
/**
* Test PersianCharFilterFactory
*/
public void testPersianCharFilter() throws Exception {
Reader reader = new StringReader("می‌خورد");
PersianCharFilterFactory charfilterFactory = new PersianCharFilterFactory();
StandardTokenizerFactory tokenizerFactory = new StandardTokenizerFactory();
tokenizerFactory.init(DEFAULT_VERSION_PARAM);
TokenStream stream = tokenizerFactory.create(charfilterFactory.create(CharReader.get(reader)));
assertTokenStreamContents(stream, new String[] { "می", "خورد" });
}
}

View File

@ -27,24 +27,12 @@ import org.apache.lucene.analysis.Tokenizer;
* Simple tests to ensure the Hindi filter Factories are working.
*/
public class TestHindiFilters extends BaseTokenTestCase {
/**
* Test IndicTokenizerFactory
*/
public void testTokenizer() throws Exception {
Reader reader = new StringReader("मुझे हिंदी का और अभ्यास करना होगा ।");
IndicTokenizerFactory factory = new IndicTokenizerFactory();
factory.init(DEFAULT_VERSION_PARAM);
Tokenizer stream = factory.create(reader);
assertTokenStreamContents(stream,
new String[] { "मुझे", "हिंदी", "का", "और", "अभ्यास", "करना", "होगा" });
}
/**
* Test IndicNormalizationFilterFactory
*/
public void testIndicNormalizer() throws Exception {
Reader reader = new StringReader("ত্‍ अाैर");
IndicTokenizerFactory factory = new IndicTokenizerFactory();
StandardTokenizerFactory factory = new StandardTokenizerFactory();
IndicNormalizationFilterFactory filterFactory = new IndicNormalizationFilterFactory();
factory.init(DEFAULT_VERSION_PARAM);
filterFactory.init(DEFAULT_VERSION_PARAM);
@ -58,7 +46,7 @@ public class TestHindiFilters extends BaseTokenTestCase {
*/
public void testHindiNormalizer() throws Exception {
Reader reader = new StringReader("क़िताब");
IndicTokenizerFactory factory = new IndicTokenizerFactory();
StandardTokenizerFactory factory = new StandardTokenizerFactory();
IndicNormalizationFilterFactory indicFilterFactory = new IndicNormalizationFilterFactory();
HindiNormalizationFilterFactory hindiFilterFactory = new HindiNormalizationFilterFactory();
factory.init(DEFAULT_VERSION_PARAM);
@ -74,7 +62,7 @@ public class TestHindiFilters extends BaseTokenTestCase {
*/
public void testStemmer() throws Exception {
Reader reader = new StringReader("किताबें");
IndicTokenizerFactory factory = new IndicTokenizerFactory();
StandardTokenizerFactory factory = new StandardTokenizerFactory();
IndicNormalizationFilterFactory indicFilterFactory = new IndicNormalizationFilterFactory();
HindiNormalizationFilterFactory hindiFilterFactory = new HindiNormalizationFilterFactory();
HindiStemFilterFactory stemFactory = new HindiStemFilterFactory();