mirror of https://github.com/apache/lucene.git
LUCENE-7773: Remove unused/deprecated token types from StandardTokenizer
This commit is contained in:
parent
74b609cf88
commit
a31f9e8dd6
|
@@ -196,6 +196,9 @@ Other
|
|||
|
||||
* LUCENE-5822: Convert README to Markdown (Jason Gerlowski via Mike Drob)
|
||||
|
||||
* LUCENE-7773: Remove unused/deprecated token types from StandardTokenizer.
|
||||
(Ahmet Arslan via Steve Rowe)
|
||||
|
||||
======================= Lucene 6.7.0 =======================
|
||||
|
||||
New Features
|
||||
|
|
|
@@ -19,7 +19,7 @@ import re
|
|||
# for inclusion in HTMLStripCharFilter.jflex.
|
||||
|
||||
def main():
|
||||
print get_apache_license()
|
||||
print(get_apache_license())
|
||||
codes = {}
|
||||
regex = re.compile(r'\s*<!ENTITY\s+(\S+)\s+"&(?:#38;)?#(\d+);"')
|
||||
for line in get_entity_text().split('\n'):
|
||||
|
@@ -38,51 +38,51 @@ def main():
|
|||
new_entry = ('"%s"' if first_entry else ' | "%s"') % key
|
||||
first_entry = False
|
||||
if len(output_line) + len(new_entry) >= 80:
|
||||
print output_line
|
||||
print(output_line)
|
||||
output_line = ' '
|
||||
output_line += new_entry
|
||||
if key in ('quot','copy','gt','lt','reg','amp'):
|
||||
new_entry = ' | "%s"' % key.upper()
|
||||
if len(output_line) + len(new_entry) >= 80:
|
||||
print output_line
|
||||
print(output_line)
|
||||
output_line = ' '
|
||||
output_line += new_entry
|
||||
print output_line, ')'
|
||||
print(output_line, ')')
|
||||
|
||||
print '%{'
|
||||
print ' private static final Map<String,String> upperCaseVariantsAccepted'
|
||||
print ' = new HashMap<>();'
|
||||
print ' static {'
|
||||
print ' upperCaseVariantsAccepted.put("quot", "QUOT");'
|
||||
print ' upperCaseVariantsAccepted.put("copy", "COPY");'
|
||||
print ' upperCaseVariantsAccepted.put("gt", "GT");'
|
||||
print ' upperCaseVariantsAccepted.put("lt", "LT");'
|
||||
print ' upperCaseVariantsAccepted.put("reg", "REG");'
|
||||
print ' upperCaseVariantsAccepted.put("amp", "AMP");'
|
||||
print ' }'
|
||||
print ' private static final CharArrayMap<Character> entityValues'
|
||||
print ' = new CharArrayMap<>(%i, false);' % len(keys)
|
||||
print ' static {'
|
||||
print ' String[] entities = {'
|
||||
print('%{')
|
||||
print(' private static final Map<String,String> upperCaseVariantsAccepted')
|
||||
print(' = new HashMap<>();')
|
||||
print(' static {')
|
||||
print(' upperCaseVariantsAccepted.put("quot", "QUOT");')
|
||||
print(' upperCaseVariantsAccepted.put("copy", "COPY");')
|
||||
print(' upperCaseVariantsAccepted.put("gt", "GT");')
|
||||
print(' upperCaseVariantsAccepted.put("lt", "LT");')
|
||||
print(' upperCaseVariantsAccepted.put("reg", "REG");')
|
||||
print(' upperCaseVariantsAccepted.put("amp", "AMP");')
|
||||
print(' }')
|
||||
print(' private static final CharArrayMap<Character> entityValues')
|
||||
print(' = new CharArrayMap<>(%i, false);' % len(keys))
|
||||
print(' static {')
|
||||
print(' String[] entities = {')
|
||||
output_line = ' '
|
||||
for key in keys:
|
||||
new_entry = ' "%s", "%s",' % (key, codes[key])
|
||||
if len(output_line) + len(new_entry) >= 80:
|
||||
print output_line
|
||||
print(output_line)
|
||||
output_line = ' '
|
||||
output_line += new_entry
|
||||
print output_line[:-1]
|
||||
print ' };'
|
||||
print ' for (int i = 0 ; i < entities.length ; i += 2) {'
|
||||
print ' Character value = entities[i + 1].charAt(0);'
|
||||
print ' entityValues.put(entities[i], value);'
|
||||
print ' String upperCaseVariant = upperCaseVariantsAccepted.get(entities[i]);'
|
||||
print ' if (upperCaseVariant != null) {'
|
||||
print ' entityValues.put(upperCaseVariant, value);'
|
||||
print ' }'
|
||||
print ' }'
|
||||
print " }"
|
||||
print "%}"
|
||||
print(output_line[:-1])
|
||||
print(' };')
|
||||
print(' for (int i = 0 ; i < entities.length ; i += 2) {')
|
||||
print(' Character value = entities[i + 1].charAt(0);')
|
||||
print(' entityValues.put(entities[i], value);')
|
||||
print(' String upperCaseVariant = upperCaseVariantsAccepted.get(entities[i]);')
|
||||
print(' if (upperCaseVariant != null) {')
|
||||
print(' entityValues.put(upperCaseVariant, value);')
|
||||
print(' }')
|
||||
print(' }')
|
||||
print(" }")
|
||||
print("%}")
|
||||
|
||||
def get_entity_text():
|
||||
# The text below is taken verbatim from
|
||||
|
|
|
@@ -16,6 +16,7 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.standard;
|
||||
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
|
@@ -342,17 +343,17 @@ class ClassicTokenizerImpl {
|
|||
|
||||
/* user code: */
|
||||
|
||||
public static final int ALPHANUM = StandardTokenizer.ALPHANUM;
|
||||
public static final int APOSTROPHE = StandardTokenizer.APOSTROPHE;
|
||||
public static final int ACRONYM = StandardTokenizer.ACRONYM;
|
||||
public static final int COMPANY = StandardTokenizer.COMPANY;
|
||||
public static final int EMAIL = StandardTokenizer.EMAIL;
|
||||
public static final int HOST = StandardTokenizer.HOST;
|
||||
public static final int NUM = StandardTokenizer.NUM;
|
||||
public static final int CJ = StandardTokenizer.CJ;
|
||||
public static final int ACRONYM_DEP = StandardTokenizer.ACRONYM_DEP;
|
||||
public static final int ALPHANUM = ClassicTokenizer.ALPHANUM;
|
||||
public static final int APOSTROPHE = ClassicTokenizer.APOSTROPHE;
|
||||
public static final int ACRONYM = ClassicTokenizer.ACRONYM;
|
||||
public static final int COMPANY = ClassicTokenizer.COMPANY;
|
||||
public static final int EMAIL = ClassicTokenizer.EMAIL;
|
||||
public static final int HOST = ClassicTokenizer.HOST;
|
||||
public static final int NUM = ClassicTokenizer.NUM;
|
||||
public static final int CJ = ClassicTokenizer.CJ;
|
||||
public static final int ACRONYM_DEP = ClassicTokenizer.ACRONYM_DEP;
|
||||
|
||||
public static final String [] TOKEN_TYPES = StandardTokenizer.TOKEN_TYPES;
|
||||
public static final String [] TOKEN_TYPES = ClassicTokenizer.TOKEN_TYPES;
|
||||
|
||||
public final int yychar()
|
||||
{
|
||||
|
|
|
@@ -17,7 +17,6 @@
|
|||
|
||||
package org.apache.lucene.analysis.standard;
|
||||
|
||||
import java.io.Reader;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
|
||||
/**
|
||||
|
@@ -36,17 +35,17 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
|||
|
||||
%{
|
||||
|
||||
public static final int ALPHANUM = StandardTokenizer.ALPHANUM;
|
||||
public static final int APOSTROPHE = StandardTokenizer.APOSTROPHE;
|
||||
public static final int ACRONYM = StandardTokenizer.ACRONYM;
|
||||
public static final int COMPANY = StandardTokenizer.COMPANY;
|
||||
public static final int EMAIL = StandardTokenizer.EMAIL;
|
||||
public static final int HOST = StandardTokenizer.HOST;
|
||||
public static final int NUM = StandardTokenizer.NUM;
|
||||
public static final int CJ = StandardTokenizer.CJ;
|
||||
public static final int ACRONYM_DEP = StandardTokenizer.ACRONYM_DEP;
|
||||
public static final int ALPHANUM = ClassicTokenizer.ALPHANUM;
|
||||
public static final int APOSTROPHE = ClassicTokenizer.APOSTROPHE;
|
||||
public static final int ACRONYM = ClassicTokenizer.ACRONYM;
|
||||
public static final int COMPANY = ClassicTokenizer.COMPANY;
|
||||
public static final int EMAIL = ClassicTokenizer.EMAIL;
|
||||
public static final int HOST = ClassicTokenizer.HOST;
|
||||
public static final int NUM = ClassicTokenizer.NUM;
|
||||
public static final int CJ = ClassicTokenizer.CJ;
|
||||
public static final int ACRONYM_DEP = ClassicTokenizer.ACRONYM_DEP;
|
||||
|
||||
public static final String [] TOKEN_TYPES = StandardTokenizer.TOKEN_TYPES;
|
||||
public static final String [] TOKEN_TYPES = ClassicTokenizer.TOKEN_TYPES;
|
||||
|
||||
public final int yychar()
|
||||
{
|
||||
|
|
|
@@ -121,13 +121,12 @@ public class TestAnalyzers extends BaseTokenStreamTestCase {
|
|||
@SuppressWarnings("unused")
|
||||
public void _testStandardConstants() {
|
||||
int x = StandardTokenizer.ALPHANUM;
|
||||
x = StandardTokenizer.APOSTROPHE;
|
||||
x = StandardTokenizer.ACRONYM;
|
||||
x = StandardTokenizer.COMPANY;
|
||||
x = StandardTokenizer.EMAIL;
|
||||
x = StandardTokenizer.HOST;
|
||||
x = StandardTokenizer.NUM;
|
||||
x = StandardTokenizer.CJ;
|
||||
x = StandardTokenizer.SOUTHEAST_ASIAN;
|
||||
x = StandardTokenizer.IDEOGRAPHIC;
|
||||
x = StandardTokenizer.HIRAGANA;
|
||||
x = StandardTokenizer.KATAKANA;
|
||||
x = StandardTokenizer.HANGUL;
|
||||
String[] y = StandardTokenizer.TOKEN_TYPES;
|
||||
}
|
||||
|
||||
|
|
|
@@ -40,56 +40,25 @@ public final class StandardTokenizer extends Tokenizer {
|
|||
/** A private instance of the JFlex-constructed scanner */
|
||||
private StandardTokenizerImpl scanner;
|
||||
|
||||
// TODO: how can we remove these old types?!
|
||||
/** Alpha/numeric token type */
|
||||
public static final int ALPHANUM = 0;
|
||||
/** @deprecated (3.1) */
|
||||
@Deprecated
|
||||
public static final int APOSTROPHE = 1;
|
||||
/** @deprecated (3.1) */
|
||||
@Deprecated
|
||||
public static final int ACRONYM = 2;
|
||||
/** @deprecated (3.1) */
|
||||
@Deprecated
|
||||
public static final int COMPANY = 3;
|
||||
/** Email token type */
|
||||
public static final int EMAIL = 4;
|
||||
/** @deprecated (3.1) */
|
||||
@Deprecated
|
||||
public static final int HOST = 5;
|
||||
public static final int ALPHANUM = 0;
|
||||
/** Numeric token type */
|
||||
public static final int NUM = 6;
|
||||
/** @deprecated (3.1) */
|
||||
@Deprecated
|
||||
public static final int CJ = 7;
|
||||
|
||||
/** @deprecated (3.1) */
|
||||
@Deprecated
|
||||
public static final int ACRONYM_DEP = 8;
|
||||
|
||||
public static final int NUM = 1;
|
||||
/** Southeast Asian token type */
|
||||
public static final int SOUTHEAST_ASIAN = 9;
|
||||
/** Idiographic token type */
|
||||
public static final int IDEOGRAPHIC = 10;
|
||||
public static final int SOUTHEAST_ASIAN = 2;
|
||||
/** Ideographic token type */
|
||||
public static final int IDEOGRAPHIC = 3;
|
||||
/** Hiragana token type */
|
||||
public static final int HIRAGANA = 11;
|
||||
public static final int HIRAGANA = 4;
|
||||
/** Katakana token type */
|
||||
public static final int KATAKANA = 12;
|
||||
|
||||
public static final int KATAKANA = 5;
|
||||
/** Hangul token type */
|
||||
public static final int HANGUL = 13;
|
||||
public static final int HANGUL = 6;
|
||||
|
||||
/** String token types that correspond to token type int constants */
|
||||
public static final String [] TOKEN_TYPES = new String [] {
|
||||
"<ALPHANUM>",
|
||||
"<APOSTROPHE>",
|
||||
"<ACRONYM>",
|
||||
"<COMPANY>",
|
||||
"<EMAIL>",
|
||||
"<HOST>",
|
||||
"<NUM>",
|
||||
"<CJ>",
|
||||
"<ACRONYM_DEP>",
|
||||
"<SOUTHEAST_ASIAN>",
|
||||
"<IDEOGRAPHIC>",
|
||||
"<HIRAGANA>",
|
||||
|
|
|
@@ -16,6 +16,7 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.lucene.analysis.standard;
|
||||
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
|
@@ -435,7 +436,7 @@ public final class StandardTokenizerImpl {
|
|||
*/
|
||||
public static final int SOUTH_EAST_ASIAN_TYPE = StandardTokenizer.SOUTHEAST_ASIAN;
|
||||
|
||||
/** Idiographic token type */
|
||||
/** Ideographic token type */
|
||||
public static final int IDEOGRAPHIC_TYPE = StandardTokenizer.IDEOGRAPHIC;
|
||||
|
||||
/** Hiragana token type */
|
||||
|
|
|
@@ -82,7 +82,7 @@ ComplexContextEx = \p{LB:Complex_Context}
|
|||
*/
|
||||
public static final int SOUTH_EAST_ASIAN_TYPE = StandardTokenizer.SOUTHEAST_ASIAN;
|
||||
|
||||
/** Idiographic token type */
|
||||
/** Ideographic token type */
|
||||
public static final int IDEOGRAPHIC_TYPE = StandardTokenizer.IDEOGRAPHIC;
|
||||
|
||||
/** Hiragana token type */
|
||||
|
|
Loading…
Reference in New Issue