diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 85b4957b69c..8926dd9e7c3 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -196,6 +196,9 @@ Other * LUCENE-5822: Convert README to Markdown (Jason Gerlowski via Mike Drob) +* LUCENE-7773: Remove unused/deprecated token types from StandardTokenizer. + (Ahmet Arslan via Steve Rowe) + ======================= Lucene 6.7.0 ======================= New Features diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/htmlentity.py b/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/htmlentity.py index 94de1ac72f4..3f28e82c50e 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/htmlentity.py +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/htmlentity.py @@ -19,7 +19,7 @@ import re # for inclusion in HTMLStripCharFilter.jflex. def main(): - print get_apache_license() + print(get_apache_license()) codes = {} regex = re.compile(r'\s*= 80: - print output_line + print(output_line) output_line = ' ' output_line += new_entry if key in ('quot','copy','gt','lt','reg','amp'): new_entry = ' | "%s"' % key.upper() if len(output_line) + len(new_entry) >= 80: - print output_line + print(output_line) output_line = ' ' output_line += new_entry - print output_line, ')' + print(output_line, ')') - print '%{' - print ' private static final Map upperCaseVariantsAccepted' - print ' = new HashMap<>();' - print ' static {' - print ' upperCaseVariantsAccepted.put("quot", "QUOT");' - print ' upperCaseVariantsAccepted.put("copy", "COPY");' - print ' upperCaseVariantsAccepted.put("gt", "GT");' - print ' upperCaseVariantsAccepted.put("lt", "LT");' - print ' upperCaseVariantsAccepted.put("reg", "REG");' - print ' upperCaseVariantsAccepted.put("amp", "AMP");' - print ' }' - print ' private static final CharArrayMap entityValues' - print ' = new CharArrayMap<>(%i, false);' % len(keys) - print ' static {' - print ' String[] entities = {' + print('%{') + print(' private static final Map upperCaseVariantsAccepted') + print(' = new HashMap<>();') + print(' static {') + print(' upperCaseVariantsAccepted.put("quot", "QUOT");') + print(' upperCaseVariantsAccepted.put("copy", "COPY");') + print(' upperCaseVariantsAccepted.put("gt", "GT");') + print(' upperCaseVariantsAccepted.put("lt", "LT");') + print(' upperCaseVariantsAccepted.put("reg", "REG");') + print(' upperCaseVariantsAccepted.put("amp", "AMP");') + print(' }') + print(' private static final CharArrayMap entityValues') + print(' = new CharArrayMap<>(%i, false);' % len(keys)) + print(' static {') + print(' String[] entities = {') output_line = ' ' for key in keys: new_entry = ' "%s", "%s",' % (key, codes[key]) if len(output_line) + len(new_entry) >= 80: - print output_line + print(output_line) output_line = ' ' output_line += new_entry - print output_line[:-1] - print ' };' - print ' for (int i = 0 ; i < entities.length ; i += 2) {' - print ' Character value = entities[i + 1].charAt(0);' - print ' entityValues.put(entities[i], value);' - print ' String upperCaseVariant = upperCaseVariantsAccepted.get(entities[i]);' - print ' if (upperCaseVariant != null) {' - print ' entityValues.put(upperCaseVariant, value);' - print ' }' - print ' }' - print " }" - print "%}" + print(output_line[:-1]) + print(' };') + print(' for (int i = 0 ; i < entities.length ; i += 2) {') + print(' Character value = entities[i + 1].charAt(0);') + print(' entityValues.put(entities[i], value);') + print(' String upperCaseVariant = upperCaseVariantsAccepted.get(entities[i]);') + print(' if (upperCaseVariant != null) {') + print(' entityValues.put(upperCaseVariant, value);') + print(' }') + print(' }') + print(" }") + print("%}") def get_entity_text(): # The text below is taken verbatim from diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java index 73114598178..7e5105df3bb 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.java @@ -16,6 +16,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.lucene.analysis.standard; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; @@ -342,17 +343,17 @@ class ClassicTokenizerImpl { /* user code: */ -public static final int ALPHANUM = StandardTokenizer.ALPHANUM; -public static final int APOSTROPHE = StandardTokenizer.APOSTROPHE; -public static final int ACRONYM = StandardTokenizer.ACRONYM; -public static final int COMPANY = StandardTokenizer.COMPANY; -public static final int EMAIL = StandardTokenizer.EMAIL; -public static final int HOST = StandardTokenizer.HOST; -public static final int NUM = StandardTokenizer.NUM; -public static final int CJ = StandardTokenizer.CJ; -public static final int ACRONYM_DEP = StandardTokenizer.ACRONYM_DEP; +public static final int ALPHANUM = ClassicTokenizer.ALPHANUM; +public static final int APOSTROPHE = ClassicTokenizer.APOSTROPHE; +public static final int ACRONYM = ClassicTokenizer.ACRONYM; +public static final int COMPANY = ClassicTokenizer.COMPANY; +public static final int EMAIL = ClassicTokenizer.EMAIL; +public static final int HOST = ClassicTokenizer.HOST; +public static final int NUM = ClassicTokenizer.NUM; +public static final int CJ = ClassicTokenizer.CJ; +public static final int ACRONYM_DEP = ClassicTokenizer.ACRONYM_DEP; -public static final String [] TOKEN_TYPES = StandardTokenizer.TOKEN_TYPES; +public static final String [] TOKEN_TYPES = ClassicTokenizer.TOKEN_TYPES; public final int yychar() { diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex index 4d6ad167811..07d78570f06 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex @@ -17,7 +17,6 @@ package org.apache.lucene.analysis.standard; -import java.io.Reader; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; /** @@ -36,17 +35,17 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; %{ -public static final int ALPHANUM = StandardTokenizer.ALPHANUM; -public static final int APOSTROPHE = StandardTokenizer.APOSTROPHE; -public static final int ACRONYM = StandardTokenizer.ACRONYM; -public static final int COMPANY = StandardTokenizer.COMPANY; -public static final int EMAIL = StandardTokenizer.EMAIL; -public static final int HOST = StandardTokenizer.HOST; -public static final int NUM = StandardTokenizer.NUM; -public static final int CJ = StandardTokenizer.CJ; -public static final int ACRONYM_DEP = StandardTokenizer.ACRONYM_DEP; +public static final int ALPHANUM = ClassicTokenizer.ALPHANUM; +public static final int APOSTROPHE = ClassicTokenizer.APOSTROPHE; +public static final int ACRONYM = ClassicTokenizer.ACRONYM; +public static final int COMPANY = ClassicTokenizer.COMPANY; +public static final int EMAIL = ClassicTokenizer.EMAIL; +public static final int HOST = ClassicTokenizer.HOST; +public static final int NUM = ClassicTokenizer.NUM; +public static final int CJ = ClassicTokenizer.CJ; +public static final int ACRONYM_DEP = ClassicTokenizer.ACRONYM_DEP; -public static final String [] TOKEN_TYPES = StandardTokenizer.TOKEN_TYPES; +public static final String [] TOKEN_TYPES = ClassicTokenizer.TOKEN_TYPES; public final int yychar() { diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java index 6d514d171a8..c9ae2e61e46 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/core/TestAnalyzers.java @@ -121,13 +121,12 @@ public class TestAnalyzers extends BaseTokenStreamTestCase { @SuppressWarnings("unused") public void _testStandardConstants() { int x = StandardTokenizer.ALPHANUM; - x = StandardTokenizer.APOSTROPHE; - x = StandardTokenizer.ACRONYM; - x = StandardTokenizer.COMPANY; - x = StandardTokenizer.EMAIL; - x = StandardTokenizer.HOST; x = StandardTokenizer.NUM; - x = StandardTokenizer.CJ; + x = StandardTokenizer.SOUTHEAST_ASIAN; + x = StandardTokenizer.IDEOGRAPHIC; + x = StandardTokenizer.HIRAGANA; + x = StandardTokenizer.KATAKANA; + x = StandardTokenizer.HANGUL; String[] y = StandardTokenizer.TOKEN_TYPES; } diff --git a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java index ed52f034405..04101246460 100644 --- a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java +++ b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java @@ -40,56 +40,25 @@ public final class StandardTokenizer extends Tokenizer { /** A private instance of the JFlex-constructed scanner */ private StandardTokenizerImpl scanner; - // TODO: how can we remove these old types?! /** Alpha/numeric token type */ - public static final int ALPHANUM = 0; - /** @deprecated (3.1) */ - @Deprecated - public static final int APOSTROPHE = 1; - /** @deprecated (3.1) */ - @Deprecated - public static final int ACRONYM = 2; - /** @deprecated (3.1) */ - @Deprecated - public static final int COMPANY = 3; - /** Email token type */ - public static final int EMAIL = 4; - /** @deprecated (3.1) */ - @Deprecated - public static final int HOST = 5; + public static final int ALPHANUM = 0; /** Numeric token type */ - public static final int NUM = 6; - /** @deprecated (3.1) */ - @Deprecated - public static final int CJ = 7; - - /** @deprecated (3.1) */ - @Deprecated - public static final int ACRONYM_DEP = 8; - + public static final int NUM = 1; /** Southeast Asian token type */ - public static final int SOUTHEAST_ASIAN = 9; - /** Idiographic token type */ - public static final int IDEOGRAPHIC = 10; + public static final int SOUTHEAST_ASIAN = 2; + /** Ideographic token type */ + public static final int IDEOGRAPHIC = 3; /** Hiragana token type */ - public static final int HIRAGANA = 11; + public static final int HIRAGANA = 4; /** Katakana token type */ - public static final int KATAKANA = 12; - + public static final int KATAKANA = 5; /** Hangul token type */ - public static final int HANGUL = 13; + public static final int HANGUL = 6; /** String token types that correspond to token type int constants */ public static final String [] TOKEN_TYPES = new String [] { "", - "", - "", - "", - "", - "", "", - "", - "", "", "", "", diff --git a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java index 5d7b240a4f2..8b288c22c06 100644 --- a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java +++ b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.java @@ -16,6 +16,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + package org.apache.lucene.analysis.standard; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; @@ -435,7 +436,7 @@ public final class StandardTokenizerImpl { */ public static final int SOUTH_EAST_ASIAN_TYPE = StandardTokenizer.SOUTHEAST_ASIAN; - /** Idiographic token type */ + /** Ideographic token type */ public static final int IDEOGRAPHIC_TYPE = StandardTokenizer.IDEOGRAPHIC; /** Hiragana token type */ diff --git a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex index 11b2cbdddfd..a1e7b17b901 100644 --- a/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex +++ b/lucene/core/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex @@ -82,7 +82,7 @@ ComplexContextEx = \p{LB:Complex_Context} */ public static final int SOUTH_EAST_ASIAN_TYPE = StandardTokenizer.SOUTHEAST_ASIAN; - /** Idiographic token type */ + /** Ideographic token type */ public static final int IDEOGRAPHIC_TYPE = StandardTokenizer.IDEOGRAPHIC; /** Hiragana token type */