mirror of https://github.com/apache/lucene.git
LUCENE-2747: Deprecate/remove language-specific tokenizers in favor of StandardTokenizer
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1043114 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
2b9726ae81
commit
620b2a0619
|
@ -171,6 +171,9 @@ API Changes
|
||||||
new SpanMultiTermQueryWrapper<RegexQuery>(new RegexQuery()) instead.
|
new SpanMultiTermQueryWrapper<RegexQuery>(new RegexQuery()) instead.
|
||||||
(Robert Muir, Uwe Schindler)
|
(Robert Muir, Uwe Schindler)
|
||||||
|
|
||||||
|
* LUCENE-2747: Deprecated ArabicLetterTokenizer. StandardTokenizer now tokenizes
|
||||||
|
most languages correctly including Arabic. (Steven Rowe, Robert Muir)
|
||||||
|
|
||||||
New features
|
New features
|
||||||
|
|
||||||
* LUCENE-2306: Add NumericRangeFilter and NumericRangeQuery support to XMLQueryParser.
|
* LUCENE-2306: Add NumericRangeFilter and NumericRangeQuery support to XMLQueryParser.
|
||||||
|
|
|
@ -27,6 +27,7 @@ import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||||
import org.apache.lucene.analysis.core.StopFilter;
|
import org.apache.lucene.analysis.core.StopFilter;
|
||||||
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
|
import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||||
import org.apache.lucene.analysis.util.CharArraySet;
|
import org.apache.lucene.analysis.util.CharArraySet;
|
||||||
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
|
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
|
||||||
import org.apache.lucene.analysis.util.WordlistLoader;
|
import org.apache.lucene.analysis.util.WordlistLoader;
|
||||||
|
@ -132,7 +133,7 @@ public final class ArabicAnalyzer extends StopwordAnalyzerBase {
|
||||||
* used to tokenize all the text in the provided {@link Reader}.
|
* used to tokenize all the text in the provided {@link Reader}.
|
||||||
*
|
*
|
||||||
* @return {@link org.apache.lucene.analysis.util.ReusableAnalyzerBase.TokenStreamComponents}
|
* @return {@link org.apache.lucene.analysis.util.ReusableAnalyzerBase.TokenStreamComponents}
|
||||||
* built from an {@link ArabicLetterTokenizer} filtered with
|
* built from a {@link StandardTokenizer} filtered with
|
||||||
* {@link LowerCaseFilter}, {@link StopFilter},
|
* {@link LowerCaseFilter}, {@link StopFilter},
|
||||||
* {@link ArabicNormalizationFilter}, {@link KeywordMarkerFilter}
|
* {@link ArabicNormalizationFilter}, {@link KeywordMarkerFilter}
|
||||||
* if a stem exclusion set is provided and {@link ArabicStemFilter}.
|
* if a stem exclusion set is provided and {@link ArabicStemFilter}.
|
||||||
|
@ -140,7 +141,8 @@ public final class ArabicAnalyzer extends StopwordAnalyzerBase {
|
||||||
@Override
|
@Override
|
||||||
protected TokenStreamComponents createComponents(String fieldName,
|
protected TokenStreamComponents createComponents(String fieldName,
|
||||||
Reader reader) {
|
Reader reader) {
|
||||||
final Tokenizer source = new ArabicLetterTokenizer(matchVersion, reader);
|
final Tokenizer source = matchVersion.onOrAfter(Version.LUCENE_31) ?
|
||||||
|
new StandardTokenizer(matchVersion, reader) : new ArabicLetterTokenizer(matchVersion, reader);
|
||||||
TokenStream result = new LowerCaseFilter(matchVersion, source);
|
TokenStream result = new LowerCaseFilter(matchVersion, source);
|
||||||
// the order here is important: the stopword list is not normalized!
|
// the order here is important: the stopword list is not normalized!
|
||||||
result = new StopFilter( matchVersion, result, stopwords);
|
result = new StopFilter( matchVersion, result, stopwords);
|
||||||
|
|
|
@ -20,6 +20,7 @@ import java.io.Reader;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.CharTokenizer;
|
import org.apache.lucene.analysis.CharTokenizer;
|
||||||
import org.apache.lucene.analysis.core.LetterTokenizer;
|
import org.apache.lucene.analysis.core.LetterTokenizer;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardTokenizer; // javadoc @link
|
||||||
import org.apache.lucene.util.AttributeSource;
|
import org.apache.lucene.util.AttributeSource;
|
||||||
import org.apache.lucene.util.Version;
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
|
@ -38,7 +39,9 @@ import org.apache.lucene.util.Version;
|
||||||
* detect token characters. See {@link #isTokenChar(int)} and
|
* detect token characters. See {@link #isTokenChar(int)} and
|
||||||
* {@link #normalize(int)} for details.</li>
|
* {@link #normalize(int)} for details.</li>
|
||||||
* </ul>
|
* </ul>
|
||||||
|
* @deprecated (3.1) Use {@link StandardTokenizer} instead.
|
||||||
*/
|
*/
|
||||||
|
@Deprecated
|
||||||
public class ArabicLetterTokenizer extends LetterTokenizer {
|
public class ArabicLetterTokenizer extends LetterTokenizer {
|
||||||
/**
|
/**
|
||||||
* Construct a new ArabicLetterTokenizer.
|
* Construct a new ArabicLetterTokenizer.
|
||||||
|
|
|
@ -22,12 +22,14 @@ import java.io.Reader;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.analysis.CharReader;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
import org.apache.lucene.analysis.ar.ArabicLetterTokenizer;
|
import org.apache.lucene.analysis.ar.ArabicLetterTokenizer;
|
||||||
import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
|
import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
|
||||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||||
import org.apache.lucene.analysis.core.StopFilter;
|
import org.apache.lucene.analysis.core.StopFilter;
|
||||||
|
import org.apache.lucene.analysis.standard.StandardTokenizer;
|
||||||
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
|
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
|
||||||
import org.apache.lucene.util.Version;
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
|
@ -109,14 +111,19 @@ public final class PersianAnalyzer extends StopwordAnalyzerBase {
|
||||||
* used to tokenize all the text in the provided {@link Reader}.
|
* used to tokenize all the text in the provided {@link Reader}.
|
||||||
*
|
*
|
||||||
* @return {@link org.apache.lucene.analysis.util.ReusableAnalyzerBase.TokenStreamComponents}
|
* @return {@link org.apache.lucene.analysis.util.ReusableAnalyzerBase.TokenStreamComponents}
|
||||||
* built from a {@link ArabicLetterTokenizer} filtered with
|
* built from a {@link StandardTokenizer} filtered with
|
||||||
* {@link LowerCaseFilter}, {@link ArabicNormalizationFilter},
|
* {@link LowerCaseFilter}, {@link ArabicNormalizationFilter},
|
||||||
* {@link PersianNormalizationFilter} and Persian Stop words
|
* {@link PersianNormalizationFilter} and Persian Stop words
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
protected TokenStreamComponents createComponents(String fieldName,
|
protected TokenStreamComponents createComponents(String fieldName,
|
||||||
Reader reader) {
|
Reader reader) {
|
||||||
final Tokenizer source = new ArabicLetterTokenizer(matchVersion, reader);
|
final Tokenizer source;
|
||||||
|
if (matchVersion.onOrAfter(Version.LUCENE_31)) {
|
||||||
|
source = new StandardTokenizer(matchVersion, reader);
|
||||||
|
} else {
|
||||||
|
source = new ArabicLetterTokenizer(matchVersion, reader);
|
||||||
|
}
|
||||||
TokenStream result = new LowerCaseFilter(matchVersion, source);
|
TokenStream result = new LowerCaseFilter(matchVersion, source);
|
||||||
result = new ArabicNormalizationFilter(result);
|
result = new ArabicNormalizationFilter(result);
|
||||||
/* additional persian-specific normalization */
|
/* additional persian-specific normalization */
|
||||||
|
@ -127,4 +134,14 @@ public final class PersianAnalyzer extends StopwordAnalyzerBase {
|
||||||
*/
|
*/
|
||||||
return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords));
|
return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Wraps the Reader with {@link PersianCharFilter}
|
||||||
|
*/
|
||||||
|
@Override
|
||||||
|
protected Reader initReader(Reader reader) {
|
||||||
|
return matchVersion.onOrAfter(Version.LUCENE_31) ?
|
||||||
|
new PersianCharFilter(CharReader.get(reader)) :
|
||||||
|
reader;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,47 @@
|
||||||
|
package org.apache.lucene.analysis.fa;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.CharStream;
|
||||||
|
import org.apache.lucene.analysis.charfilter.CharFilter;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* CharFilter that replaces instances of Zero-width non-joiner (U+200C) with an
|
||||||
|
* ordinary space.
|
||||||
|
*/
|
||||||
|
public class PersianCharFilter extends CharFilter {
|
||||||
|
|
||||||
|
public PersianCharFilter(CharStream in) {
|
||||||
|
super(in);
|
||||||
|
}
|
||||||
|
|
||||||
|
public int read(char[] cbuf, int off, int len) throws IOException {
|
||||||
|
final int charsRead = super.read(cbuf, off, len);
|
||||||
|
if (charsRead > 0) {
|
||||||
|
final int end = off + charsRead;
|
||||||
|
while (off < end) {
|
||||||
|
if (cbuf[off] == '\u200C')
|
||||||
|
cbuf[off] = ' ';
|
||||||
|
off++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return charsRead;
|
||||||
|
}
|
||||||
|
}
|
|
@ -75,8 +75,9 @@ public abstract class ReusableAnalyzerBase extends Analyzer {
|
||||||
final Reader reader) throws IOException {
|
final Reader reader) throws IOException {
|
||||||
TokenStreamComponents streamChain = (TokenStreamComponents)
|
TokenStreamComponents streamChain = (TokenStreamComponents)
|
||||||
getPreviousTokenStream();
|
getPreviousTokenStream();
|
||||||
if (streamChain == null || !streamChain.reset(reader)) {
|
final Reader r = initReader(reader);
|
||||||
streamChain = createComponents(fieldName, reader);
|
if (streamChain == null || !streamChain.reset(r)) {
|
||||||
|
streamChain = createComponents(fieldName, r);
|
||||||
setPreviousTokenStream(streamChain);
|
setPreviousTokenStream(streamChain);
|
||||||
}
|
}
|
||||||
return streamChain.getTokenStream();
|
return streamChain.getTokenStream();
|
||||||
|
@ -95,7 +96,14 @@ public abstract class ReusableAnalyzerBase extends Analyzer {
|
||||||
@Override
|
@Override
|
||||||
public final TokenStream tokenStream(final String fieldName,
|
public final TokenStream tokenStream(final String fieldName,
|
||||||
final Reader reader) {
|
final Reader reader) {
|
||||||
return createComponents(fieldName, reader).getTokenStream();
|
return createComponents(fieldName, initReader(reader)).getTokenStream();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Override this if you want to add a CharFilter chain.
|
||||||
|
*/
|
||||||
|
protected Reader initReader(Reader reader) {
|
||||||
|
return reader;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -25,7 +25,9 @@ import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Testcase for {@link ArabicLetterTokenizer}
|
* Testcase for {@link ArabicLetterTokenizer}
|
||||||
|
* @deprecated (3.1) Remove in Lucene 5.0
|
||||||
*/
|
*/
|
||||||
|
@Deprecated
|
||||||
public class TestArabicLetterTokenizer extends BaseTokenStreamTestCase {
|
public class TestArabicLetterTokenizer extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
public void testArabicLetterTokenizer() throws IOException {
|
public void testArabicLetterTokenizer() throws IOException {
|
||||||
|
|
|
@ -1,44 +0,0 @@
|
||||||
package org.apache.lucene.analysis.in;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
|
||||||
* contributor license agreements. See the NOTICE file distributed with
|
|
||||||
* this work for additional information regarding copyright ownership.
|
|
||||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
|
||||||
* (the "License"); you may not use this file except in compliance with
|
|
||||||
* the License. You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.StringReader;
|
|
||||||
|
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Test IndicTokenizer
|
|
||||||
*/
|
|
||||||
public class TestIndicTokenizer extends BaseTokenStreamTestCase {
|
|
||||||
/** Test tokenizing Indic vowels, signs, and punctuation */
|
|
||||||
public void testBasics() throws IOException {
|
|
||||||
TokenStream ts = new IndicTokenizer(TEST_VERSION_CURRENT,
|
|
||||||
new StringReader("मुझे हिंदी का और अभ्यास करना होगा ।"));
|
|
||||||
assertTokenStreamContents(ts,
|
|
||||||
new String[] { "मुझे", "हिंदी", "का", "और", "अभ्यास", "करना", "होगा" });
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Test that words with format chars such as ZWJ are kept */
|
|
||||||
public void testFormat() throws Exception {
|
|
||||||
TokenStream ts = new IndicTokenizer(TEST_VERSION_CURRENT,
|
|
||||||
new StringReader("शार्मा शार्मा"));
|
|
||||||
assertTokenStreamContents(ts, new String[] { "शार्मा", "शार्मा" });
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -23,7 +23,9 @@ import java.io.Reader;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Factory for {@link ArabicLetterTokenizer}
|
* Factory for {@link ArabicLetterTokenizer}
|
||||||
|
* @deprecated (3.1) Use StandardTokenizerFactory instead.
|
||||||
**/
|
**/
|
||||||
|
@Deprecated
|
||||||
public class ArabicLetterTokenizerFactory extends BaseTokenizerFactory{
|
public class ArabicLetterTokenizerFactory extends BaseTokenizerFactory{
|
||||||
|
|
||||||
public ArabicLetterTokenizer create(Reader input) {
|
public ArabicLetterTokenizer create(Reader input) {
|
||||||
|
|
|
@ -17,15 +17,16 @@ package org.apache.solr.analysis;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.io.Reader;
|
import org.apache.lucene.analysis.CharStream;
|
||||||
|
import org.apache.lucene.analysis.fa.PersianCharFilter;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
/**
|
||||||
import org.apache.lucene.analysis.in.IndicTokenizer;
|
* Factory for {@link PersianCharFilter}
|
||||||
|
*/
|
||||||
|
public class PersianCharFilterFactory extends BaseCharFilterFactory {
|
||||||
|
|
||||||
/** Factory for {@link IndicTokenizer} */
|
@Override
|
||||||
public class IndicTokenizerFactory extends BaseTokenizerFactory {
|
public CharStream create(CharStream input) {
|
||||||
public Tokenizer create(Reader input) {
|
return new PersianCharFilter(input);
|
||||||
assureMatchVersion();
|
|
||||||
return new IndicTokenizer(luceneMatchVersion, input);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -20,6 +20,7 @@ package org.apache.solr.analysis;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
import java.io.StringReader;
|
import java.io.StringReader;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.CharReader;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
|
||||||
|
@ -29,7 +30,9 @@ import org.apache.lucene.analysis.Tokenizer;
|
||||||
public class TestArabicFilters extends BaseTokenTestCase {
|
public class TestArabicFilters extends BaseTokenTestCase {
|
||||||
/**
|
/**
|
||||||
* Test ArabicLetterTokenizerFactory
|
* Test ArabicLetterTokenizerFactory
|
||||||
|
* @deprecated (3.1) Remove in Lucene 5.0
|
||||||
*/
|
*/
|
||||||
|
@Deprecated
|
||||||
public void testTokenizer() throws Exception {
|
public void testTokenizer() throws Exception {
|
||||||
Reader reader = new StringReader("الذين مَلكت أيمانكم");
|
Reader reader = new StringReader("الذين مَلكت أيمانكم");
|
||||||
ArabicLetterTokenizerFactory factory = new ArabicLetterTokenizerFactory();
|
ArabicLetterTokenizerFactory factory = new ArabicLetterTokenizerFactory();
|
||||||
|
@ -43,7 +46,7 @@ public class TestArabicFilters extends BaseTokenTestCase {
|
||||||
*/
|
*/
|
||||||
public void testNormalizer() throws Exception {
|
public void testNormalizer() throws Exception {
|
||||||
Reader reader = new StringReader("الذين مَلكت أيمانكم");
|
Reader reader = new StringReader("الذين مَلكت أيمانكم");
|
||||||
ArabicLetterTokenizerFactory factory = new ArabicLetterTokenizerFactory();
|
StandardTokenizerFactory factory = new StandardTokenizerFactory();
|
||||||
ArabicNormalizationFilterFactory filterFactory = new ArabicNormalizationFilterFactory();
|
ArabicNormalizationFilterFactory filterFactory = new ArabicNormalizationFilterFactory();
|
||||||
factory.init(DEFAULT_VERSION_PARAM);
|
factory.init(DEFAULT_VERSION_PARAM);
|
||||||
filterFactory.init(DEFAULT_VERSION_PARAM);
|
filterFactory.init(DEFAULT_VERSION_PARAM);
|
||||||
|
@ -57,7 +60,7 @@ public class TestArabicFilters extends BaseTokenTestCase {
|
||||||
*/
|
*/
|
||||||
public void testStemmer() throws Exception {
|
public void testStemmer() throws Exception {
|
||||||
Reader reader = new StringReader("الذين مَلكت أيمانكم");
|
Reader reader = new StringReader("الذين مَلكت أيمانكم");
|
||||||
ArabicLetterTokenizerFactory factory = new ArabicLetterTokenizerFactory();
|
StandardTokenizerFactory factory = new StandardTokenizerFactory();
|
||||||
ArabicNormalizationFilterFactory normFactory = new ArabicNormalizationFilterFactory();
|
ArabicNormalizationFilterFactory normFactory = new ArabicNormalizationFilterFactory();
|
||||||
ArabicStemFilterFactory stemFactory = new ArabicStemFilterFactory();
|
ArabicStemFilterFactory stemFactory = new ArabicStemFilterFactory();
|
||||||
factory.init(DEFAULT_VERSION_PARAM);
|
factory.init(DEFAULT_VERSION_PARAM);
|
||||||
|
@ -67,4 +70,16 @@ public class TestArabicFilters extends BaseTokenTestCase {
|
||||||
stream = stemFactory.create(stream);
|
stream = stemFactory.create(stream);
|
||||||
assertTokenStreamContents(stream, new String[] {"ذين", "ملكت", "ايمانكم"});
|
assertTokenStreamContents(stream, new String[] {"ذين", "ملكت", "ايمانكم"});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test PersianCharFilterFactory
|
||||||
|
*/
|
||||||
|
public void testPersianCharFilter() throws Exception {
|
||||||
|
Reader reader = new StringReader("میخورد");
|
||||||
|
PersianCharFilterFactory charfilterFactory = new PersianCharFilterFactory();
|
||||||
|
StandardTokenizerFactory tokenizerFactory = new StandardTokenizerFactory();
|
||||||
|
tokenizerFactory.init(DEFAULT_VERSION_PARAM);
|
||||||
|
TokenStream stream = tokenizerFactory.create(charfilterFactory.create(CharReader.get(reader)));
|
||||||
|
assertTokenStreamContents(stream, new String[] { "می", "خورد" });
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -27,24 +27,12 @@ import org.apache.lucene.analysis.Tokenizer;
|
||||||
* Simple tests to ensure the Hindi filter Factories are working.
|
* Simple tests to ensure the Hindi filter Factories are working.
|
||||||
*/
|
*/
|
||||||
public class TestHindiFilters extends BaseTokenTestCase {
|
public class TestHindiFilters extends BaseTokenTestCase {
|
||||||
/**
|
|
||||||
* Test IndicTokenizerFactory
|
|
||||||
*/
|
|
||||||
public void testTokenizer() throws Exception {
|
|
||||||
Reader reader = new StringReader("मुझे हिंदी का और अभ्यास करना होगा ।");
|
|
||||||
IndicTokenizerFactory factory = new IndicTokenizerFactory();
|
|
||||||
factory.init(DEFAULT_VERSION_PARAM);
|
|
||||||
Tokenizer stream = factory.create(reader);
|
|
||||||
assertTokenStreamContents(stream,
|
|
||||||
new String[] { "मुझे", "हिंदी", "का", "और", "अभ्यास", "करना", "होगा" });
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Test IndicNormalizationFilterFactory
|
* Test IndicNormalizationFilterFactory
|
||||||
*/
|
*/
|
||||||
public void testIndicNormalizer() throws Exception {
|
public void testIndicNormalizer() throws Exception {
|
||||||
Reader reader = new StringReader("ত্ अाैर");
|
Reader reader = new StringReader("ত্ अाैर");
|
||||||
IndicTokenizerFactory factory = new IndicTokenizerFactory();
|
StandardTokenizerFactory factory = new StandardTokenizerFactory();
|
||||||
IndicNormalizationFilterFactory filterFactory = new IndicNormalizationFilterFactory();
|
IndicNormalizationFilterFactory filterFactory = new IndicNormalizationFilterFactory();
|
||||||
factory.init(DEFAULT_VERSION_PARAM);
|
factory.init(DEFAULT_VERSION_PARAM);
|
||||||
filterFactory.init(DEFAULT_VERSION_PARAM);
|
filterFactory.init(DEFAULT_VERSION_PARAM);
|
||||||
|
@ -58,7 +46,7 @@ public class TestHindiFilters extends BaseTokenTestCase {
|
||||||
*/
|
*/
|
||||||
public void testHindiNormalizer() throws Exception {
|
public void testHindiNormalizer() throws Exception {
|
||||||
Reader reader = new StringReader("क़िताब");
|
Reader reader = new StringReader("क़िताब");
|
||||||
IndicTokenizerFactory factory = new IndicTokenizerFactory();
|
StandardTokenizerFactory factory = new StandardTokenizerFactory();
|
||||||
IndicNormalizationFilterFactory indicFilterFactory = new IndicNormalizationFilterFactory();
|
IndicNormalizationFilterFactory indicFilterFactory = new IndicNormalizationFilterFactory();
|
||||||
HindiNormalizationFilterFactory hindiFilterFactory = new HindiNormalizationFilterFactory();
|
HindiNormalizationFilterFactory hindiFilterFactory = new HindiNormalizationFilterFactory();
|
||||||
factory.init(DEFAULT_VERSION_PARAM);
|
factory.init(DEFAULT_VERSION_PARAM);
|
||||||
|
@ -74,7 +62,7 @@ public class TestHindiFilters extends BaseTokenTestCase {
|
||||||
*/
|
*/
|
||||||
public void testStemmer() throws Exception {
|
public void testStemmer() throws Exception {
|
||||||
Reader reader = new StringReader("किताबें");
|
Reader reader = new StringReader("किताबें");
|
||||||
IndicTokenizerFactory factory = new IndicTokenizerFactory();
|
StandardTokenizerFactory factory = new StandardTokenizerFactory();
|
||||||
IndicNormalizationFilterFactory indicFilterFactory = new IndicNormalizationFilterFactory();
|
IndicNormalizationFilterFactory indicFilterFactory = new IndicNormalizationFilterFactory();
|
||||||
HindiNormalizationFilterFactory hindiFilterFactory = new HindiNormalizationFilterFactory();
|
HindiNormalizationFilterFactory hindiFilterFactory = new HindiNormalizationFilterFactory();
|
||||||
HindiStemFilterFactory stemFactory = new HindiStemFilterFactory();
|
HindiStemFilterFactory stemFactory = new HindiStemFilterFactory();
|
||||||
|
|
Loading…
Reference in New Issue