mirror of https://github.com/apache/lucene.git
LUCENE-2747: Deprecate/remove language-specific tokenizers in favor of StandardTokenizer
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1043114 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
2b9726ae81
commit
620b2a0619
|
@ -171,6 +171,9 @@ API Changes
|
|||
new SpanMultiTermQueryWrapper<RegexQuery>(new RegexQuery()) instead.
|
||||
(Robert Muir, Uwe Schindler)
|
||||
|
||||
* LUCENE-2747: Deprecated ArabicLetterTokenizer. StandardTokenizer now tokenizes
|
||||
most languages correctly including Arabic. (Steven Rowe, Robert Muir)
|
||||
|
||||
New features
|
||||
|
||||
* LUCENE-2306: Add NumericRangeFilter and NumericRangeQuery support to XMLQueryParser.
|
||||
|
|
|
@ -27,6 +27,7 @@ import org.apache.lucene.analysis.Analyzer;
|
|||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.core.StopFilter;
|
||||
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
import org.apache.lucene.analysis.util.CharArraySet;
|
||||
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
|
||||
import org.apache.lucene.analysis.util.WordlistLoader;
|
||||
|
@ -132,7 +133,7 @@ public final class ArabicAnalyzer extends StopwordAnalyzerBase {
|
|||
* used to tokenize all the text in the provided {@link Reader}.
|
||||
*
|
||||
* @return {@link org.apache.lucene.analysis.util.ReusableAnalyzerBase.TokenStreamComponents}
|
||||
* built from an {@link ArabicLetterTokenizer} filtered with
|
||||
* built from a {@link StandardTokenizer} filtered with
|
||||
* {@link LowerCaseFilter}, {@link StopFilter},
|
||||
* {@link ArabicNormalizationFilter}, {@link KeywordMarkerFilter}
|
||||
* if a stem exclusion set is provided and {@link ArabicStemFilter}.
|
||||
|
@ -140,7 +141,8 @@ public final class ArabicAnalyzer extends StopwordAnalyzerBase {
|
|||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName,
|
||||
Reader reader) {
|
||||
final Tokenizer source = new ArabicLetterTokenizer(matchVersion, reader);
|
||||
final Tokenizer source = matchVersion.onOrAfter(Version.LUCENE_31) ?
|
||||
new StandardTokenizer(matchVersion, reader) : new ArabicLetterTokenizer(matchVersion, reader);
|
||||
TokenStream result = new LowerCaseFilter(matchVersion, source);
|
||||
// the order here is important: the stopword list is not normalized!
|
||||
result = new StopFilter( matchVersion, result, stopwords);
|
||||
|
|
|
@ -20,6 +20,7 @@ import java.io.Reader;
|
|||
|
||||
import org.apache.lucene.analysis.CharTokenizer;
|
||||
import org.apache.lucene.analysis.core.LetterTokenizer;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer; // javadoc @link
|
||||
import org.apache.lucene.util.AttributeSource;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
|
@ -38,7 +39,9 @@ import org.apache.lucene.util.Version;
|
|||
* detect token characters. See {@link #isTokenChar(int)} and
|
||||
* {@link #normalize(int)} for details.</li>
|
||||
* </ul>
|
||||
* @deprecated (3.1) Use {@link StandardTokenizer} instead.
|
||||
*/
|
||||
@Deprecated
|
||||
public class ArabicLetterTokenizer extends LetterTokenizer {
|
||||
/**
|
||||
* Construct a new ArabicLetterTokenizer.
|
||||
|
|
|
@ -22,12 +22,14 @@ import java.io.Reader;
|
|||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.CharReader;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.ar.ArabicLetterTokenizer;
|
||||
import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
|
||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.core.StopFilter;
|
||||
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
|
||||
import org.apache.lucene.util.Version;
|
||||
|
||||
|
@ -109,14 +111,19 @@ public final class PersianAnalyzer extends StopwordAnalyzerBase {
|
|||
* used to tokenize all the text in the provided {@link Reader}.
|
||||
*
|
||||
* @return {@link org.apache.lucene.analysis.util.ReusableAnalyzerBase.TokenStreamComponents}
|
||||
* built from a {@link ArabicLetterTokenizer} filtered with
|
||||
* built from a {@link StandardTokenizer} filtered with
|
||||
* {@link LowerCaseFilter}, {@link ArabicNormalizationFilter},
|
||||
* {@link PersianNormalizationFilter} and Persian Stop words
|
||||
*/
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName,
|
||||
Reader reader) {
|
||||
final Tokenizer source = new ArabicLetterTokenizer(matchVersion, reader);
|
||||
final Tokenizer source;
|
||||
if (matchVersion.onOrAfter(Version.LUCENE_31)) {
|
||||
source = new StandardTokenizer(matchVersion, reader);
|
||||
} else {
|
||||
source = new ArabicLetterTokenizer(matchVersion, reader);
|
||||
}
|
||||
TokenStream result = new LowerCaseFilter(matchVersion, source);
|
||||
result = new ArabicNormalizationFilter(result);
|
||||
/* additional persian-specific normalization */
|
||||
|
@ -127,4 +134,14 @@ public final class PersianAnalyzer extends StopwordAnalyzerBase {
|
|||
*/
|
||||
return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords));
|
||||
}
|
||||
|
||||
/**
|
||||
* Wraps the Reader with {@link PersianCharFilter}
|
||||
*/
|
||||
@Override
|
||||
protected Reader initReader(Reader reader) {
|
||||
return matchVersion.onOrAfter(Version.LUCENE_31) ?
|
||||
new PersianCharFilter(CharReader.get(reader)) :
|
||||
reader;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,47 @@
|
|||
package org.apache.lucene.analysis.fa;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.analysis.CharStream;
|
||||
import org.apache.lucene.analysis.charfilter.CharFilter;
|
||||
|
||||
/**
|
||||
* CharFilter that replaces instances of Zero-width non-joiner with an
|
||||
* ordinary space.
|
||||
*/
|
||||
public class PersianCharFilter extends CharFilter {
|
||||
|
||||
public PersianCharFilter(CharStream in) {
|
||||
super(in);
|
||||
}
|
||||
|
||||
public int read(char[] cbuf, int off, int len) throws IOException {
|
||||
final int charsRead = super.read(cbuf, off, len);
|
||||
if (charsRead > 0) {
|
||||
final int end = off + charsRead;
|
||||
while (off < end) {
|
||||
if (cbuf[off] == '\u200C')
|
||||
cbuf[off] = ' ';
|
||||
off++;
|
||||
}
|
||||
}
|
||||
return charsRead;
|
||||
}
|
||||
}
|
|
@ -75,8 +75,9 @@ public abstract class ReusableAnalyzerBase extends Analyzer {
|
|||
final Reader reader) throws IOException {
|
||||
TokenStreamComponents streamChain = (TokenStreamComponents)
|
||||
getPreviousTokenStream();
|
||||
if (streamChain == null || !streamChain.reset(reader)) {
|
||||
streamChain = createComponents(fieldName, reader);
|
||||
final Reader r = initReader(reader);
|
||||
if (streamChain == null || !streamChain.reset(r)) {
|
||||
streamChain = createComponents(fieldName, r);
|
||||
setPreviousTokenStream(streamChain);
|
||||
}
|
||||
return streamChain.getTokenStream();
|
||||
|
@ -95,7 +96,14 @@ public abstract class ReusableAnalyzerBase extends Analyzer {
|
|||
@Override
|
||||
public final TokenStream tokenStream(final String fieldName,
|
||||
final Reader reader) {
|
||||
return createComponents(fieldName, reader).getTokenStream();
|
||||
return createComponents(fieldName, initReader(reader)).getTokenStream();
|
||||
}
|
||||
|
||||
/**
|
||||
* Override this if you want to add a CharFilter chain.
|
||||
*/
|
||||
protected Reader initReader(Reader reader) {
|
||||
return reader;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -25,7 +25,9 @@ import org.apache.lucene.util.Version;
|
|||
|
||||
/**
|
||||
* Testcase for {@link ArabicLetterTokenizer}
|
||||
* @deprecated (3.1) Remove in Lucene 5.0
|
||||
*/
|
||||
@Deprecated
|
||||
public class TestArabicLetterTokenizer extends BaseTokenStreamTestCase {
|
||||
|
||||
public void testArabicLetterTokenizer() throws IOException {
|
||||
|
|
|
@ -1,44 +0,0 @@
|
|||
package org.apache.lucene.analysis.in;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
/**
|
||||
* Test IndicTokenizer
|
||||
*/
|
||||
public class TestIndicTokenizer extends BaseTokenStreamTestCase {
|
||||
/** Test tokenizing Indic vowels, signs, and punctuation */
|
||||
public void testBasics() throws IOException {
|
||||
TokenStream ts = new IndicTokenizer(TEST_VERSION_CURRENT,
|
||||
new StringReader("मुझे हिंदी का और अभ्यास करना होगा ।"));
|
||||
assertTokenStreamContents(ts,
|
||||
new String[] { "मुझे", "हिंदी", "का", "और", "अभ्यास", "करना", "होगा" });
|
||||
}
|
||||
|
||||
/** Test that words with format chars such as ZWJ are kept */
|
||||
public void testFormat() throws Exception {
|
||||
TokenStream ts = new IndicTokenizer(TEST_VERSION_CURRENT,
|
||||
new StringReader("शार्मा शार्मा"));
|
||||
assertTokenStreamContents(ts, new String[] { "शार्मा", "शार्मा" });
|
||||
}
|
||||
}
|
|
@ -23,7 +23,9 @@ import java.io.Reader;
|
|||
|
||||
/**
|
||||
* Factory for {@link ArabicLetterTokenizer}
|
||||
* @deprecated (3.1) Use StandardTokenizerFactory instead.
|
||||
**/
|
||||
@Deprecated
|
||||
public class ArabicLetterTokenizerFactory extends BaseTokenizerFactory{
|
||||
|
||||
public ArabicLetterTokenizer create(Reader input) {
|
||||
|
|
|
@ -17,15 +17,16 @@ package org.apache.solr.analysis;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
import org.apache.lucene.analysis.CharStream;
|
||||
import org.apache.lucene.analysis.fa.PersianCharFilter;
|
||||
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.in.IndicTokenizer;
|
||||
/**
|
||||
* Factory for {@link PersianCharFilter}
|
||||
*/
|
||||
public class PersianCharFilterFactory extends BaseCharFilterFactory {
|
||||
|
||||
/** Factory for {@link IndicTokenizer} */
|
||||
public class IndicTokenizerFactory extends BaseTokenizerFactory {
|
||||
public Tokenizer create(Reader input) {
|
||||
assureMatchVersion();
|
||||
return new IndicTokenizer(luceneMatchVersion, input);
|
||||
@Override
|
||||
public CharStream create(CharStream input) {
|
||||
return new PersianCharFilter(input);
|
||||
}
|
||||
}
|
|
@ -20,6 +20,7 @@ package org.apache.solr.analysis;
|
|||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.CharReader;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
|
||||
|
@ -29,7 +30,9 @@ import org.apache.lucene.analysis.Tokenizer;
|
|||
public class TestArabicFilters extends BaseTokenTestCase {
|
||||
/**
|
||||
* Test ArabicLetterTokenizerFactory
|
||||
* @deprecated (3.1) Remove in Lucene 5.0
|
||||
*/
|
||||
@Deprecated
|
||||
public void testTokenizer() throws Exception {
|
||||
Reader reader = new StringReader("الذين مَلكت أيمانكم");
|
||||
ArabicLetterTokenizerFactory factory = new ArabicLetterTokenizerFactory();
|
||||
|
@ -43,7 +46,7 @@ public class TestArabicFilters extends BaseTokenTestCase {
|
|||
*/
|
||||
public void testNormalizer() throws Exception {
|
||||
Reader reader = new StringReader("الذين مَلكت أيمانكم");
|
||||
ArabicLetterTokenizerFactory factory = new ArabicLetterTokenizerFactory();
|
||||
StandardTokenizerFactory factory = new StandardTokenizerFactory();
|
||||
ArabicNormalizationFilterFactory filterFactory = new ArabicNormalizationFilterFactory();
|
||||
factory.init(DEFAULT_VERSION_PARAM);
|
||||
filterFactory.init(DEFAULT_VERSION_PARAM);
|
||||
|
@ -57,7 +60,7 @@ public class TestArabicFilters extends BaseTokenTestCase {
|
|||
*/
|
||||
public void testStemmer() throws Exception {
|
||||
Reader reader = new StringReader("الذين مَلكت أيمانكم");
|
||||
ArabicLetterTokenizerFactory factory = new ArabicLetterTokenizerFactory();
|
||||
StandardTokenizerFactory factory = new StandardTokenizerFactory();
|
||||
ArabicNormalizationFilterFactory normFactory = new ArabicNormalizationFilterFactory();
|
||||
ArabicStemFilterFactory stemFactory = new ArabicStemFilterFactory();
|
||||
factory.init(DEFAULT_VERSION_PARAM);
|
||||
|
@ -67,4 +70,16 @@ public class TestArabicFilters extends BaseTokenTestCase {
|
|||
stream = stemFactory.create(stream);
|
||||
assertTokenStreamContents(stream, new String[] {"ذين", "ملكت", "ايمانكم"});
|
||||
}
|
||||
|
||||
/**
|
||||
* Test PersianCharFilterFactory
|
||||
*/
|
||||
public void testPersianCharFilter() throws Exception {
|
||||
Reader reader = new StringReader("میخورد");
|
||||
PersianCharFilterFactory charfilterFactory = new PersianCharFilterFactory();
|
||||
StandardTokenizerFactory tokenizerFactory = new StandardTokenizerFactory();
|
||||
tokenizerFactory.init(DEFAULT_VERSION_PARAM);
|
||||
TokenStream stream = tokenizerFactory.create(charfilterFactory.create(CharReader.get(reader)));
|
||||
assertTokenStreamContents(stream, new String[] { "می", "خورد" });
|
||||
}
|
||||
}
|
||||
|
|
|
@ -27,24 +27,12 @@ import org.apache.lucene.analysis.Tokenizer;
|
|||
* Simple tests to ensure the Hindi filter Factories are working.
|
||||
*/
|
||||
public class TestHindiFilters extends BaseTokenTestCase {
|
||||
/**
|
||||
* Test IndicTokenizerFactory
|
||||
*/
|
||||
public void testTokenizer() throws Exception {
|
||||
Reader reader = new StringReader("मुझे हिंदी का और अभ्यास करना होगा ।");
|
||||
IndicTokenizerFactory factory = new IndicTokenizerFactory();
|
||||
factory.init(DEFAULT_VERSION_PARAM);
|
||||
Tokenizer stream = factory.create(reader);
|
||||
assertTokenStreamContents(stream,
|
||||
new String[] { "मुझे", "हिंदी", "का", "और", "अभ्यास", "करना", "होगा" });
|
||||
}
|
||||
|
||||
/**
|
||||
* Test IndicNormalizationFilterFactory
|
||||
*/
|
||||
public void testIndicNormalizer() throws Exception {
|
||||
Reader reader = new StringReader("ত্ अाैर");
|
||||
IndicTokenizerFactory factory = new IndicTokenizerFactory();
|
||||
StandardTokenizerFactory factory = new StandardTokenizerFactory();
|
||||
IndicNormalizationFilterFactory filterFactory = new IndicNormalizationFilterFactory();
|
||||
factory.init(DEFAULT_VERSION_PARAM);
|
||||
filterFactory.init(DEFAULT_VERSION_PARAM);
|
||||
|
@ -58,7 +46,7 @@ public class TestHindiFilters extends BaseTokenTestCase {
|
|||
*/
|
||||
public void testHindiNormalizer() throws Exception {
|
||||
Reader reader = new StringReader("क़िताब");
|
||||
IndicTokenizerFactory factory = new IndicTokenizerFactory();
|
||||
StandardTokenizerFactory factory = new StandardTokenizerFactory();
|
||||
IndicNormalizationFilterFactory indicFilterFactory = new IndicNormalizationFilterFactory();
|
||||
HindiNormalizationFilterFactory hindiFilterFactory = new HindiNormalizationFilterFactory();
|
||||
factory.init(DEFAULT_VERSION_PARAM);
|
||||
|
@ -74,7 +62,7 @@ public class TestHindiFilters extends BaseTokenTestCase {
|
|||
*/
|
||||
public void testStemmer() throws Exception {
|
||||
Reader reader = new StringReader("किताबें");
|
||||
IndicTokenizerFactory factory = new IndicTokenizerFactory();
|
||||
StandardTokenizerFactory factory = new StandardTokenizerFactory();
|
||||
IndicNormalizationFilterFactory indicFilterFactory = new IndicNormalizationFilterFactory();
|
||||
HindiNormalizationFilterFactory hindiFilterFactory = new HindiNormalizationFilterFactory();
|
||||
HindiStemFilterFactory stemFactory = new HindiStemFilterFactory();
|
||||
|
|
Loading…
Reference in New Issue