LUCENE-7773: Remove unused/deprecated token types from StandardTokenizer

Steve Rowe 2017-07-06 19:02:11 -04:00
parent 74b609cf88
commit a31f9e8dd6
8 changed files with 72 additions and 100 deletions

View File

@@ -196,6 +196,9 @@ Other
* LUCENE-5822: Convert README to Markdown (Jason Gerlowski via Mike Drob)
+ * LUCENE-7773: Remove unused/deprecated token types from StandardTokenizer.
+ (Ahmet Arslan via Steve Rowe)
======================= Lucene 6.7.0 =======================
New Features
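
For orientation, the constants this commit drops from StandardTokenizer (APOSTROPHE, ACRONYM, COMPANY, EMAIL, HOST, CJ, ACRONYM_DEP) are leftovers of the pre-3.1 Classic grammar and are not produced by the current UAX#29-based scanner; the surviving types are what callers actually see at runtime. Below is a minimal sketch of how those types surface through TypeAttribute; the class name and sample text are illustrative and not part of this commit.

import java.io.StringReader;

import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

public class TokenTypeDemo {
  public static void main(String[] args) throws Exception {
    // StandardTokenizer tags every token with a type string such as "<ALPHANUM>" or "<NUM>".
    try (StandardTokenizer tokenizer = new StandardTokenizer()) {
      tokenizer.setReader(new StringReader("Lucene 7.0 tokenizes text"));
      CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
      TypeAttribute type = tokenizer.addAttribute(TypeAttribute.class);
      tokenizer.reset();
      while (tokenizer.incrementToken()) {
        // Expected: "Lucene -> <ALPHANUM>", "7.0 -> <NUM>", and so on.
        System.out.println(term + " -> " + type.type());
      }
      tokenizer.end();
    }
  }
}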

View File

@@ -19,7 +19,7 @@ import re
# for inclusion in HTMLStripCharFilter.jflex.
def main():
- print get_apache_license()
+ print(get_apache_license())
codes = {}
regex = re.compile(r'\s*<!ENTITY\s+(\S+)\s+"&(?:#38;)?#(\d+);"')
for line in get_entity_text().split('\n'):
@@ -38,51 +38,51 @@ def main():
new_entry = ('"%s"' if first_entry else ' | "%s"') % key
first_entry = False
if len(output_line) + len(new_entry) >= 80:
- print output_line
+ print(output_line)
output_line = ' '
output_line += new_entry
if key in ('quot','copy','gt','lt','reg','amp'):
new_entry = ' | "%s"' % key.upper()
if len(output_line) + len(new_entry) >= 80:
- print output_line
+ print(output_line)
output_line = ' '
output_line += new_entry
- print output_line, ')'
+ print(output_line, ')')
- print '%{'
- print ' private static final Map<String,String> upperCaseVariantsAccepted'
- print ' = new HashMap<>();'
- print ' static {'
- print ' upperCaseVariantsAccepted.put("quot", "QUOT");'
- print ' upperCaseVariantsAccepted.put("copy", "COPY");'
- print ' upperCaseVariantsAccepted.put("gt", "GT");'
- print ' upperCaseVariantsAccepted.put("lt", "LT");'
- print ' upperCaseVariantsAccepted.put("reg", "REG");'
- print ' upperCaseVariantsAccepted.put("amp", "AMP");'
- print ' }'
- print ' private static final CharArrayMap<Character> entityValues'
- print ' = new CharArrayMap<>(%i, false);' % len(keys)
- print ' static {'
- print ' String[] entities = {'
+ print('%{')
+ print(' private static final Map<String,String> upperCaseVariantsAccepted')
+ print(' = new HashMap<>();')
+ print(' static {')
+ print(' upperCaseVariantsAccepted.put("quot", "QUOT");')
+ print(' upperCaseVariantsAccepted.put("copy", "COPY");')
+ print(' upperCaseVariantsAccepted.put("gt", "GT");')
+ print(' upperCaseVariantsAccepted.put("lt", "LT");')
+ print(' upperCaseVariantsAccepted.put("reg", "REG");')
+ print(' upperCaseVariantsAccepted.put("amp", "AMP");')
+ print(' }')
+ print(' private static final CharArrayMap<Character> entityValues')
+ print(' = new CharArrayMap<>(%i, false);' % len(keys))
+ print(' static {')
+ print(' String[] entities = {')
output_line = ' '
for key in keys:
new_entry = ' "%s", "%s",' % (key, codes[key])
if len(output_line) + len(new_entry) >= 80:
- print output_line
+ print(output_line)
output_line = ' '
output_line += new_entry
- print output_line[:-1]
- print ' };'
- print ' for (int i = 0 ; i < entities.length ; i += 2) {'
- print ' Character value = entities[i + 1].charAt(0);'
- print ' entityValues.put(entities[i], value);'
- print ' String upperCaseVariant = upperCaseVariantsAccepted.get(entities[i]);'
- print ' if (upperCaseVariant != null) {'
- print ' entityValues.put(upperCaseVariant, value);'
- print ' }'
- print ' }'
- print " }"
- print "%}"
+ print(output_line[:-1])
+ print(' };')
+ print(' for (int i = 0 ; i < entities.length ; i += 2) {')
+ print(' Character value = entities[i + 1].charAt(0);')
+ print(' entityValues.put(entities[i], value);')
+ print(' String upperCaseVariant = upperCaseVariantsAccepted.get(entities[i]);')
+ print(' if (upperCaseVariant != null) {')
+ print(' entityValues.put(upperCaseVariant, value);')
+ print(' }')
+ print(' }')
+ print(" }")
+ print("%}")
def get_entity_text():
# The text below is taken verbatim from

View File

@@ -16,6 +16,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.standard;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
@@ -342,17 +343,17 @@ class ClassicTokenizerImpl {
/* user code: */
- public static final int ALPHANUM = StandardTokenizer.ALPHANUM;
- public static final int APOSTROPHE = StandardTokenizer.APOSTROPHE;
- public static final int ACRONYM = StandardTokenizer.ACRONYM;
- public static final int COMPANY = StandardTokenizer.COMPANY;
- public static final int EMAIL = StandardTokenizer.EMAIL;
- public static final int HOST = StandardTokenizer.HOST;
- public static final int NUM = StandardTokenizer.NUM;
- public static final int CJ = StandardTokenizer.CJ;
- public static final int ACRONYM_DEP = StandardTokenizer.ACRONYM_DEP;
+ public static final int ALPHANUM = ClassicTokenizer.ALPHANUM;
+ public static final int APOSTROPHE = ClassicTokenizer.APOSTROPHE;
+ public static final int ACRONYM = ClassicTokenizer.ACRONYM;
+ public static final int COMPANY = ClassicTokenizer.COMPANY;
+ public static final int EMAIL = ClassicTokenizer.EMAIL;
+ public static final int HOST = ClassicTokenizer.HOST;
+ public static final int NUM = ClassicTokenizer.NUM;
+ public static final int CJ = ClassicTokenizer.CJ;
+ public static final int ACRONYM_DEP = ClassicTokenizer.ACRONYM_DEP;
- public static final String [] TOKEN_TYPES = StandardTokenizer.TOKEN_TYPES;
+ public static final String [] TOKEN_TYPES = ClassicTokenizer.TOKEN_TYPES;
public final int yychar()
{

View File

@@ -17,7 +17,6 @@
package org.apache.lucene.analysis.standard;
import java.io.Reader;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
/**
@@ -36,17 +35,17 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
%{
- public static final int ALPHANUM = StandardTokenizer.ALPHANUM;
- public static final int APOSTROPHE = StandardTokenizer.APOSTROPHE;
- public static final int ACRONYM = StandardTokenizer.ACRONYM;
- public static final int COMPANY = StandardTokenizer.COMPANY;
- public static final int EMAIL = StandardTokenizer.EMAIL;
- public static final int HOST = StandardTokenizer.HOST;
- public static final int NUM = StandardTokenizer.NUM;
- public static final int CJ = StandardTokenizer.CJ;
- public static final int ACRONYM_DEP = StandardTokenizer.ACRONYM_DEP;
+ public static final int ALPHANUM = ClassicTokenizer.ALPHANUM;
+ public static final int APOSTROPHE = ClassicTokenizer.APOSTROPHE;
+ public static final int ACRONYM = ClassicTokenizer.ACRONYM;
+ public static final int COMPANY = ClassicTokenizer.COMPANY;
+ public static final int EMAIL = ClassicTokenizer.EMAIL;
+ public static final int HOST = ClassicTokenizer.HOST;
+ public static final int NUM = ClassicTokenizer.NUM;
+ public static final int CJ = ClassicTokenizer.CJ;
+ public static final int ACRONYM_DEP = ClassicTokenizer.ACRONYM_DEP;
- public static final String [] TOKEN_TYPES = StandardTokenizer.TOKEN_TYPES;
+ public static final String [] TOKEN_TYPES = ClassicTokenizer.TOKEN_TYPES;
public final int yychar()
{

View File

@@ -121,13 +121,12 @@ public class TestAnalyzers extends BaseTokenStreamTestCase {
@SuppressWarnings("unused")
public void _testStandardConstants() {
int x = StandardTokenizer.ALPHANUM;
- x = StandardTokenizer.APOSTROPHE;
- x = StandardTokenizer.ACRONYM;
- x = StandardTokenizer.COMPANY;
- x = StandardTokenizer.EMAIL;
- x = StandardTokenizer.HOST;
x = StandardTokenizer.NUM;
- x = StandardTokenizer.CJ;
x = StandardTokenizer.SOUTHEAST_ASIAN;
x = StandardTokenizer.IDEOGRAPHIC;
x = StandardTokenizer.HIRAGANA;
x = StandardTokenizer.KATAKANA;
x = StandardTokenizer.HANGUL;
String[] y = StandardTokenizer.TOKEN_TYPES;
}

View File

@@ -40,56 +40,25 @@ public final class StandardTokenizer extends Tokenizer {
/** A private instance of the JFlex-constructed scanner */
private StandardTokenizerImpl scanner;
- // TODO: how can we remove these old types?!
/** Alpha/numeric token type */
- public static final int ALPHANUM = 0;
- /** @deprecated (3.1) */
- @Deprecated
- public static final int APOSTROPHE = 1;
- /** @deprecated (3.1) */
- @Deprecated
- public static final int ACRONYM = 2;
- /** @deprecated (3.1) */
- @Deprecated
- public static final int COMPANY = 3;
- /** Email token type */
- public static final int EMAIL = 4;
- /** @deprecated (3.1) */
- @Deprecated
- public static final int HOST = 5;
+ public static final int ALPHANUM = 0;
/** Numeric token type */
- public static final int NUM = 6;
- /** @deprecated (3.1) */
- @Deprecated
- public static final int CJ = 7;
- /** @deprecated (3.1) */
- @Deprecated
- public static final int ACRONYM_DEP = 8;
+ public static final int NUM = 1;
/** Southeast Asian token type */
- public static final int SOUTHEAST_ASIAN = 9;
- /** Idiographic token type */
- public static final int IDEOGRAPHIC = 10;
+ public static final int SOUTHEAST_ASIAN = 2;
+ /** Ideographic token type */
+ public static final int IDEOGRAPHIC = 3;
/** Hiragana token type */
- public static final int HIRAGANA = 11;
+ public static final int HIRAGANA = 4;
/** Katakana token type */
- public static final int KATAKANA = 12;
+ public static final int KATAKANA = 5;
/** Hangul token type */
- public static final int HANGUL = 13;
+ public static final int HANGUL = 6;
/** String token types that correspond to token type int constants */
public static final String [] TOKEN_TYPES = new String [] {
"<ALPHANUM>",
"<APOSTROPHE>",
"<ACRONYM>",
"<COMPANY>",
"<EMAIL>",
"<HOST>",
"<NUM>",
"<CJ>",
"<ACRONYM_DEP>",
"<SOUTHEAST_ASIAN>",
"<IDEOGRAPHIC>",
"<HIRAGANA>",

View File

@@ -16,6 +16,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.standard;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
@@ -435,7 +436,7 @@ public final class StandardTokenizerImpl {
*/
public static final int SOUTH_EAST_ASIAN_TYPE = StandardTokenizer.SOUTHEAST_ASIAN;
- /** Idiographic token type */
+ /** Ideographic token type */
public static final int IDEOGRAPHIC_TYPE = StandardTokenizer.IDEOGRAPHIC;
/** Hiragana token type */

View File

@@ -82,7 +82,7 @@ ComplexContextEx = \p{LB:Complex_Context}
*/
public static final int SOUTH_EAST_ASIAN_TYPE = StandardTokenizer.SOUTHEAST_ASIAN;
- /** Idiographic token type */
+ /** Ideographic token type */
public static final int IDEOGRAPHIC_TYPE = StandardTokenizer.IDEOGRAPHIC;
/** Hiragana token type */