From 620b2a0619a2066f9323d3df71cdc9b6f551f89c Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Tue, 7 Dec 2010 16:19:17 +0000 Subject: [PATCH] LUCENE-2747: Deprecate/remove language-specific tokenizers in favor of StandardTokenizer git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1043114 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/contrib/CHANGES.txt | 3 ++ .../lucene/analysis/ar/ArabicAnalyzer.java | 6 ++- .../analysis/ar/ArabicLetterTokenizer.java | 3 ++ .../lucene/analysis/fa/PersianAnalyzer.java | 21 ++++++++- .../lucene/analysis/fa/PersianCharFilter.java | 47 +++++++++++++++++++ .../analysis/util/ReusableAnalyzerBase.java | 14 ++++-- .../ar/TestArabicLetterTokenizer.java | 2 + .../analysis/in/TestIndicTokenizer.java | 44 ----------------- .../ArabicLetterTokenizerFactory.java | 2 + ...ory.java => PersianCharFilterFactory.java} | 17 +++---- .../solr/analysis/TestArabicFilters.java | 19 +++++++- .../solr/analysis/TestHindiFilters.java | 18 ++----- 12 files changed, 120 insertions(+), 76 deletions(-) create mode 100644 modules/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianCharFilter.java delete mode 100644 modules/analysis/common/src/test/org/apache/lucene/analysis/in/TestIndicTokenizer.java rename solr/src/java/org/apache/solr/analysis/{IndicTokenizerFactory.java => PersianCharFilterFactory.java} (70%) diff --git a/lucene/contrib/CHANGES.txt b/lucene/contrib/CHANGES.txt index 57283cb1608..494b0ef9162 100644 --- a/lucene/contrib/CHANGES.txt +++ b/lucene/contrib/CHANGES.txt @@ -171,6 +171,9 @@ API Changes new SpanMultiTermQueryWrapper(new RegexQuery()) instead. (Robert Muir, Uwe Schindler) + * LUCENE-2747: Deprecated ArabicLetterTokenizer. StandardTokenizer now tokenizes + most languages correctly including Arabic. (Steven Rowe, Robert Muir) + New features * LUCENE-2306: Add NumericRangeFilter and NumericRangeQuery support to XMLQueryParser. diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java index 43d64b9169e..2c2bcd44b48 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicAnalyzer.java @@ -27,6 +27,7 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; +import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.analysis.util.WordlistLoader; @@ -132,7 +133,7 @@ public final class ArabicAnalyzer extends StopwordAnalyzerBase { * used to tokenize all the text in the provided {@link Reader}. * * @return {@link org.apache.lucene.analysis.util.ReusableAnalyzerBase.TokenStreamComponents} - * built from an {@link ArabicLetterTokenizer} filtered with + * built from an {@link StandardTokenizer} filtered with * {@link LowerCaseFilter}, {@link StopFilter}, * {@link ArabicNormalizationFilter}, {@link KeywordMarkerFilter} * if a stem exclusion set is provided and {@link ArabicStemFilter}. @@ -140,7 +141,8 @@ public final class ArabicAnalyzer extends StopwordAnalyzerBase { @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { - final Tokenizer source = new ArabicLetterTokenizer(matchVersion, reader); + final Tokenizer source = matchVersion.onOrAfter(Version.LUCENE_31) ? + new StandardTokenizer(matchVersion, reader) : new ArabicLetterTokenizer(matchVersion, reader); TokenStream result = new LowerCaseFilter(matchVersion, source); // the order here is important: the stopword list is not normalized! result = new StopFilter( matchVersion, result, stopwords); diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicLetterTokenizer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicLetterTokenizer.java index 243b0c7566b..26f06d3ffa0 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicLetterTokenizer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/ar/ArabicLetterTokenizer.java @@ -20,6 +20,7 @@ import java.io.Reader; import org.apache.lucene.analysis.CharTokenizer; import org.apache.lucene.analysis.core.LetterTokenizer; +import org.apache.lucene.analysis.standard.StandardTokenizer; // javadoc @link import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.Version; @@ -38,7 +39,9 @@ import org.apache.lucene.util.Version; * detect token characters. See {@link #isTokenChar(int)} and * {@link #normalize(int)} for details. * + * @deprecated (3.1) Use {@link StandardTokenizer} instead. */ +@Deprecated public class ArabicLetterTokenizer extends LetterTokenizer { /** * Construct a new ArabicLetterTokenizer. diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java index 09e061821c3..3bb166c1537 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianAnalyzer.java @@ -22,12 +22,14 @@ import java.io.Reader; import java.util.Set; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.CharReader; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.ar.ArabicLetterTokenizer; import org.apache.lucene.analysis.ar.ArabicNormalizationFilter; import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.StopFilter; +import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.util.Version; @@ -109,14 +111,19 @@ public final class PersianAnalyzer extends StopwordAnalyzerBase { * used to tokenize all the text in the provided {@link Reader}. * * @return {@link org.apache.lucene.analysis.util.ReusableAnalyzerBase.TokenStreamComponents} - * built from a {@link ArabicLetterTokenizer} filtered with + * built from a {@link StandardTokenizer} filtered with * {@link LowerCaseFilter}, {@link ArabicNormalizationFilter}, * {@link PersianNormalizationFilter} and Persian Stop words */ @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { - final Tokenizer source = new ArabicLetterTokenizer(matchVersion, reader); + final Tokenizer source; + if (matchVersion.onOrAfter(Version.LUCENE_31)) { + source = new StandardTokenizer(matchVersion, reader); + } else { + source = new ArabicLetterTokenizer(matchVersion, reader); + } TokenStream result = new LowerCaseFilter(matchVersion, source); result = new ArabicNormalizationFilter(result); /* additional persian-specific normalization */ @@ -127,4 +134,14 @@ public final class PersianAnalyzer extends StopwordAnalyzerBase { */ return new TokenStreamComponents(source, new StopFilter(matchVersion, result, stopwords)); } + + /** + * Wraps the Reader with {@link PersianCharFilter} + */ + @Override + protected Reader initReader(Reader reader) { + return matchVersion.onOrAfter(Version.LUCENE_31) ? + new PersianCharFilter(CharReader.get(reader)) : + reader; + } } diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianCharFilter.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianCharFilter.java new file mode 100644 index 00000000000..c1ed38acfa5 --- /dev/null +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/fa/PersianCharFilter.java @@ -0,0 +1,47 @@ +package org.apache.lucene.analysis.fa; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.analysis.CharStream; +import org.apache.lucene.analysis.charfilter.CharFilter; + +/** + * CharFilter that replaces instances of Zero-width non-joiner with an + * ordinary space. + */ +public class PersianCharFilter extends CharFilter { + + public PersianCharFilter(CharStream in) { + super(in); + } + + public int read(char[] cbuf, int off, int len) throws IOException { + final int charsRead = super.read(cbuf, off, len); + if (charsRead > 0) { + final int end = off + charsRead; + while (off < end) { + if (cbuf[off] == '\u200C') + cbuf[off] = ' '; + off++; + } + } + return charsRead; + } +} diff --git a/modules/analysis/common/src/java/org/apache/lucene/analysis/util/ReusableAnalyzerBase.java b/modules/analysis/common/src/java/org/apache/lucene/analysis/util/ReusableAnalyzerBase.java index 4555f2257ac..9b7e2a9afef 100644 --- a/modules/analysis/common/src/java/org/apache/lucene/analysis/util/ReusableAnalyzerBase.java +++ b/modules/analysis/common/src/java/org/apache/lucene/analysis/util/ReusableAnalyzerBase.java @@ -75,8 +75,9 @@ public abstract class ReusableAnalyzerBase extends Analyzer { final Reader reader) throws IOException { TokenStreamComponents streamChain = (TokenStreamComponents) getPreviousTokenStream(); - if (streamChain == null || !streamChain.reset(reader)) { - streamChain = createComponents(fieldName, reader); + final Reader r = initReader(reader); + if (streamChain == null || !streamChain.reset(r)) { + streamChain = createComponents(fieldName, r); setPreviousTokenStream(streamChain); } return streamChain.getTokenStream(); @@ -95,7 +96,14 @@ public abstract class ReusableAnalyzerBase extends Analyzer { @Override public final TokenStream tokenStream(final String fieldName, final Reader reader) { - return createComponents(fieldName, reader).getTokenStream(); + return createComponents(fieldName, initReader(reader)).getTokenStream(); + } + + /** + * Override this if you want to add a CharFilter chain. + */ + protected Reader initReader(Reader reader) { + return reader; } /** diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicLetterTokenizer.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicLetterTokenizer.java index ebfae55bb67..cde265fb608 100644 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicLetterTokenizer.java +++ b/modules/analysis/common/src/test/org/apache/lucene/analysis/ar/TestArabicLetterTokenizer.java @@ -25,7 +25,9 @@ import org.apache.lucene.util.Version; /** * Testcase for {@link TestArabicLetterTokenizer} + * @deprecated (3.1) Remove in Lucene 5.0 */ +@Deprecated public class TestArabicLetterTokenizer extends BaseTokenStreamTestCase { public void testArabicLetterTokenizer() throws IOException { diff --git a/modules/analysis/common/src/test/org/apache/lucene/analysis/in/TestIndicTokenizer.java b/modules/analysis/common/src/test/org/apache/lucene/analysis/in/TestIndicTokenizer.java deleted file mode 100644 index 9a2cd81a61e..00000000000 --- a/modules/analysis/common/src/test/org/apache/lucene/analysis/in/TestIndicTokenizer.java +++ /dev/null @@ -1,44 +0,0 @@ -package org.apache.lucene.analysis.in; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; -import java.io.StringReader; - -import org.apache.lucene.analysis.BaseTokenStreamTestCase; -import org.apache.lucene.analysis.TokenStream; - -/** - * Test IndicTokenizer - */ -public class TestIndicTokenizer extends BaseTokenStreamTestCase { - /** Test tokenizing Indic vowels, signs, and punctuation */ - public void testBasics() throws IOException { - TokenStream ts = new IndicTokenizer(TEST_VERSION_CURRENT, - new StringReader("मुझे हिंदी का और अभ्यास करना होगा ।")); - assertTokenStreamContents(ts, - new String[] { "मुझे", "हिंदी", "का", "और", "अभ्यास", "करना", "होगा" }); - } - - /** Test that words with format chars such as ZWJ are kept */ - public void testFormat() throws Exception { - TokenStream ts = new IndicTokenizer(TEST_VERSION_CURRENT, - new StringReader("शार्‍मा शार्‍मा")); - assertTokenStreamContents(ts, new String[] { "शार्‍मा", "शार्‍मा" }); - } -} diff --git a/solr/src/java/org/apache/solr/analysis/ArabicLetterTokenizerFactory.java b/solr/src/java/org/apache/solr/analysis/ArabicLetterTokenizerFactory.java index f22f2232e5e..75e0341aeed 100644 --- a/solr/src/java/org/apache/solr/analysis/ArabicLetterTokenizerFactory.java +++ b/solr/src/java/org/apache/solr/analysis/ArabicLetterTokenizerFactory.java @@ -23,7 +23,9 @@ import java.io.Reader; /** * Factory for {@link ArabicLetterTokenizer} + * @deprecated (3.1) Use StandardTokenizerFactory instead. **/ +@Deprecated public class ArabicLetterTokenizerFactory extends BaseTokenizerFactory{ public ArabicLetterTokenizer create(Reader input) { diff --git a/solr/src/java/org/apache/solr/analysis/IndicTokenizerFactory.java b/solr/src/java/org/apache/solr/analysis/PersianCharFilterFactory.java similarity index 70% rename from solr/src/java/org/apache/solr/analysis/IndicTokenizerFactory.java rename to solr/src/java/org/apache/solr/analysis/PersianCharFilterFactory.java index 0da0c7cb35a..14098634d9c 100644 --- a/solr/src/java/org/apache/solr/analysis/IndicTokenizerFactory.java +++ b/solr/src/java/org/apache/solr/analysis/PersianCharFilterFactory.java @@ -17,15 +17,16 @@ package org.apache.solr.analysis; * limitations under the License. */ -import java.io.Reader; +import org.apache.lucene.analysis.CharStream; +import org.apache.lucene.analysis.fa.PersianCharFilter; -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.in.IndicTokenizer; +/** + * Factory for {@link PersianCharFilter} + */ +public class PersianCharFilterFactory extends BaseCharFilterFactory { -/** Factory for {@link IndicTokenizer} */ -public class IndicTokenizerFactory extends BaseTokenizerFactory { - public Tokenizer create(Reader input) { - assureMatchVersion(); - return new IndicTokenizer(luceneMatchVersion, input); + @Override + public CharStream create(CharStream input) { + return new PersianCharFilter(input); } } diff --git a/solr/src/test/org/apache/solr/analysis/TestArabicFilters.java b/solr/src/test/org/apache/solr/analysis/TestArabicFilters.java index aeecbbd984c..c56dd3e126c 100644 --- a/solr/src/test/org/apache/solr/analysis/TestArabicFilters.java +++ b/solr/src/test/org/apache/solr/analysis/TestArabicFilters.java @@ -20,6 +20,7 @@ package org.apache.solr.analysis; import java.io.Reader; import java.io.StringReader; +import org.apache.lucene.analysis.CharReader; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; @@ -29,7 +30,9 @@ import org.apache.lucene.analysis.Tokenizer; public class TestArabicFilters extends BaseTokenTestCase { /** * Test ArabicLetterTokenizerFactory + * @deprecated (3.1) Remove in Lucene 5.0 */ + @Deprecated public void testTokenizer() throws Exception { Reader reader = new StringReader("الذين مَلكت أيمانكم"); ArabicLetterTokenizerFactory factory = new ArabicLetterTokenizerFactory(); @@ -43,7 +46,7 @@ public class TestArabicFilters extends BaseTokenTestCase { */ public void testNormalizer() throws Exception { Reader reader = new StringReader("الذين مَلكت أيمانكم"); - ArabicLetterTokenizerFactory factory = new ArabicLetterTokenizerFactory(); + StandardTokenizerFactory factory = new StandardTokenizerFactory(); ArabicNormalizationFilterFactory filterFactory = new ArabicNormalizationFilterFactory(); factory.init(DEFAULT_VERSION_PARAM); filterFactory.init(DEFAULT_VERSION_PARAM); @@ -57,7 +60,7 @@ public class TestArabicFilters extends BaseTokenTestCase { */ public void testStemmer() throws Exception { Reader reader = new StringReader("الذين مَلكت أيمانكم"); - ArabicLetterTokenizerFactory factory = new ArabicLetterTokenizerFactory(); + StandardTokenizerFactory factory = new StandardTokenizerFactory(); ArabicNormalizationFilterFactory normFactory = new ArabicNormalizationFilterFactory(); ArabicStemFilterFactory stemFactory = new ArabicStemFilterFactory(); factory.init(DEFAULT_VERSION_PARAM); @@ -67,4 +70,16 @@ public class TestArabicFilters extends BaseTokenTestCase { stream = stemFactory.create(stream); assertTokenStreamContents(stream, new String[] {"ذين", "ملكت", "ايمانكم"}); } + + /** + * Test PersianCharFilterFactory + */ + public void testPersianCharFilter() throws Exception { + Reader reader = new StringReader("می‌خورد"); + PersianCharFilterFactory charfilterFactory = new PersianCharFilterFactory(); + StandardTokenizerFactory tokenizerFactory = new StandardTokenizerFactory(); + tokenizerFactory.init(DEFAULT_VERSION_PARAM); + TokenStream stream = tokenizerFactory.create(charfilterFactory.create(CharReader.get(reader))); + assertTokenStreamContents(stream, new String[] { "می", "خورد" }); + } } diff --git a/solr/src/test/org/apache/solr/analysis/TestHindiFilters.java b/solr/src/test/org/apache/solr/analysis/TestHindiFilters.java index 880be0e1e8e..3063b5f0944 100644 --- a/solr/src/test/org/apache/solr/analysis/TestHindiFilters.java +++ b/solr/src/test/org/apache/solr/analysis/TestHindiFilters.java @@ -27,24 +27,12 @@ import org.apache.lucene.analysis.Tokenizer; * Simple tests to ensure the Hindi filter Factories are working. */ public class TestHindiFilters extends BaseTokenTestCase { - /** - * Test IndicTokenizerFactory - */ - public void testTokenizer() throws Exception { - Reader reader = new StringReader("मुझे हिंदी का और अभ्यास करना होगा ।"); - IndicTokenizerFactory factory = new IndicTokenizerFactory(); - factory.init(DEFAULT_VERSION_PARAM); - Tokenizer stream = factory.create(reader); - assertTokenStreamContents(stream, - new String[] { "मुझे", "हिंदी", "का", "और", "अभ्यास", "करना", "होगा" }); - } - /** * Test IndicNormalizationFilterFactory */ public void testIndicNormalizer() throws Exception { Reader reader = new StringReader("ত্‍ अाैर"); - IndicTokenizerFactory factory = new IndicTokenizerFactory(); + StandardTokenizerFactory factory = new StandardTokenizerFactory(); IndicNormalizationFilterFactory filterFactory = new IndicNormalizationFilterFactory(); factory.init(DEFAULT_VERSION_PARAM); filterFactory.init(DEFAULT_VERSION_PARAM); @@ -58,7 +46,7 @@ public class TestHindiFilters extends BaseTokenTestCase { */ public void testHindiNormalizer() throws Exception { Reader reader = new StringReader("क़िताब"); - IndicTokenizerFactory factory = new IndicTokenizerFactory(); + StandardTokenizerFactory factory = new StandardTokenizerFactory(); IndicNormalizationFilterFactory indicFilterFactory = new IndicNormalizationFilterFactory(); HindiNormalizationFilterFactory hindiFilterFactory = new HindiNormalizationFilterFactory(); factory.init(DEFAULT_VERSION_PARAM); @@ -74,7 +62,7 @@ public class TestHindiFilters extends BaseTokenTestCase { */ public void testStemmer() throws Exception { Reader reader = new StringReader("किताबें"); - IndicTokenizerFactory factory = new IndicTokenizerFactory(); + StandardTokenizerFactory factory = new StandardTokenizerFactory(); IndicNormalizationFilterFactory indicFilterFactory = new IndicNormalizationFilterFactory(); HindiNormalizationFilterFactory hindiFilterFactory = new HindiNormalizationFilterFactory(); HindiStemFilterFactory stemFactory = new HindiStemFilterFactory();