mirror of https://github.com/apache/lucene.git
LUCENE-1068: updated StandardTokenizer, Analyzer to allow for the replaceInvalidAcronym
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@607161 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
93b9adc280
commit
da2a912919
|
@ -197,6 +197,12 @@ Bug fixes
|
||||||
than actually exist , a ConcurrentMidificationException
|
than actually exist , a ConcurrentMidificationException
|
||||||
is thrown. (Doron Cohen)
|
is thrown. (Doron Cohen)
|
||||||
|
|
||||||
|
27. LUCENE-1068: Changed StandardTokenizer to fix an issue with it marking
|
||||||
|
the type of some tokens incorrectly. This is done by adding a new flag named
|
||||||
|
replaceInvalidAcronym which defaults to false, the current, incorrect behavior. Setting
|
||||||
|
this flag to true fixes the problem. This flag is a temporary fix and is already
|
||||||
|
marked as being deprecated. 3.x will implement the correct approach. (Shai Erera via Grant Ingersoll)
|
||||||
|
|
||||||
|
|
||||||
New features
|
New features
|
||||||
|
|
||||||
|
|
|
@ -33,6 +33,16 @@ import java.util.Set;
|
||||||
public class StandardAnalyzer extends Analyzer {
|
public class StandardAnalyzer extends Analyzer {
|
||||||
private Set stopSet;
|
private Set stopSet;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Specifies whether deprecated acronyms should be replaced with HOST type.
|
||||||
|
* This is false by default to support backward compatibility.
|
||||||
|
*
|
||||||
|
* @deprecated this should be removed in the next release (3.0).
|
||||||
|
*
|
||||||
|
* See https://issues.apache.org/jira/browse/LUCENE-1068
|
||||||
|
*/
|
||||||
|
private boolean replaceInvalidAcronym = false;
|
||||||
|
|
||||||
/** An array containing some common English words that are usually not
|
/** An array containing some common English words that are usually not
|
||||||
useful for searching. */
|
useful for searching. */
|
||||||
public static final String[] STOP_WORDS = StopAnalyzer.ENGLISH_STOP_WORDS;
|
public static final String[] STOP_WORDS = StopAnalyzer.ENGLISH_STOP_WORDS;
|
||||||
|
@ -66,10 +76,75 @@ public class StandardAnalyzer extends Analyzer {
|
||||||
stopSet = WordlistLoader.getWordSet(stopwords);
|
stopSet = WordlistLoader.getWordSet(stopwords);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param replaceInvalidAcronym Set to true if this analyzer should replace mischaracterized acronyms in the StandardTokenizer
|
||||||
|
*
|
||||||
|
* See https://issues.apache.org/jira/browse/LUCENE-1068
|
||||||
|
*
|
||||||
|
* @deprecated Remove in 3.X and make true the only valid value
|
||||||
|
*/
|
||||||
|
public StandardAnalyzer(boolean replaceInvalidAcronym) {
|
||||||
|
this.replaceInvalidAcronym = replaceInvalidAcronym;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param stopwords The stopwords to use
|
||||||
|
* @param replaceInvalidAcronym Set to true if this analyzer should replace mischaracterized acronyms in the StandardTokenizer
|
||||||
|
*
|
||||||
|
* See https://issues.apache.org/jira/browse/LUCENE-1068
|
||||||
|
*
|
||||||
|
* @deprecated Remove in 3.X and make true the only valid value
|
||||||
|
*/
|
||||||
|
public StandardAnalyzer(Reader stopwords, boolean replaceInvalidAcronym) throws IOException{
|
||||||
|
this(stopwords);
|
||||||
|
this.replaceInvalidAcronym = replaceInvalidAcronym;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param stopwords The stopwords to use
|
||||||
|
* @param replaceInvalidAcronym Set to true if this analyzer should replace mischaracterized acronyms in the StandardTokenizer
|
||||||
|
*
|
||||||
|
* See https://issues.apache.org/jira/browse/LUCENE-1068
|
||||||
|
*
|
||||||
|
* @deprecated Remove in 3.X and make true the only valid value
|
||||||
|
*/
|
||||||
|
public StandardAnalyzer(File stopwords, boolean replaceInvalidAcronym) throws IOException{
|
||||||
|
this(stopwords);
|
||||||
|
this.replaceInvalidAcronym = replaceInvalidAcronym;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param stopwords The stopwords to use
|
||||||
|
* @param replaceInvalidAcronym Set to true if this analyzer should replace mischaracterized acronyms in the StandardTokenizer
|
||||||
|
*
|
||||||
|
* See https://issues.apache.org/jira/browse/LUCENE-1068
|
||||||
|
*
|
||||||
|
* @deprecated Remove in 3.X and make true the only valid value
|
||||||
|
*/
|
||||||
|
public StandardAnalyzer(String [] stopwords, boolean replaceInvalidAcronym) throws IOException{
|
||||||
|
this(stopwords);
|
||||||
|
this.replaceInvalidAcronym = replaceInvalidAcronym;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param stopwords The stopwords to use
|
||||||
|
* @param replaceInvalidAcronym Set to true if this analyzer should replace mischaracterized acronyms in the StandardTokenizer
|
||||||
|
*
|
||||||
|
* See https://issues.apache.org/jira/browse/LUCENE-1068
|
||||||
|
*
|
||||||
|
* @deprecated Remove in 3.X and make true the only valid value
|
||||||
|
*/
|
||||||
|
public StandardAnalyzer(Set stopwords, boolean replaceInvalidAcronym) throws IOException{
|
||||||
|
this(stopwords);
|
||||||
|
this.replaceInvalidAcronym = replaceInvalidAcronym;
|
||||||
|
}
|
||||||
|
|
||||||
/** Constructs a {@link StandardTokenizer} filtered by a {@link
|
/** Constructs a {@link StandardTokenizer} filtered by a {@link
|
||||||
StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}. */
|
StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}. */
|
||||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||||
TokenStream result = new StandardTokenizer(reader);
|
TokenStream result = new StandardTokenizer(reader, replaceInvalidAcronym);
|
||||||
result = new StandardFilter(result);
|
result = new StandardFilter(result);
|
||||||
result = new LowerCaseFilter(result);
|
result = new LowerCaseFilter(result);
|
||||||
result = new StopFilter(result, stopSet);
|
result = new StopFilter(result, stopSet);
|
||||||
|
@ -90,9 +165,32 @@ public class StandardAnalyzer extends Analyzer {
|
||||||
streams.filteredTokenStream = new StandardFilter(streams.tokenStream);
|
streams.filteredTokenStream = new StandardFilter(streams.tokenStream);
|
||||||
streams.filteredTokenStream = new LowerCaseFilter(streams.filteredTokenStream);
|
streams.filteredTokenStream = new LowerCaseFilter(streams.filteredTokenStream);
|
||||||
streams.filteredTokenStream = new StopFilter(streams.filteredTokenStream, stopSet);
|
streams.filteredTokenStream = new StopFilter(streams.filteredTokenStream, stopSet);
|
||||||
} else
|
} else {
|
||||||
streams.tokenStream.reset(reader);
|
streams.tokenStream.reset(reader);
|
||||||
|
}
|
||||||
|
|
||||||
|
streams.tokenStream.setReplaceInvalidAcronym(replaceInvalidAcronym);
|
||||||
|
|
||||||
return streams.filteredTokenStream;
|
return streams.filteredTokenStream;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @return true if this Analyzer is replacing mischaracterized acronyms in the StandardTokenizer
|
||||||
|
*
|
||||||
|
* See https://issues.apache.org/jira/browse/LUCENE-1068
|
||||||
|
*/
|
||||||
|
public boolean isReplaceInvalidAcronym() {
|
||||||
|
return replaceInvalidAcronym;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param replaceInvalidAcronym Set to true if this Analyzer is replacing mischaracterized acronyms in the StandardTokenizer
|
||||||
|
*
|
||||||
|
* See https://issues.apache.org/jira/browse/LUCENE-1068
|
||||||
|
*/
|
||||||
|
public void setReplaceInvalidAcronym(boolean replaceInvalidAcronym) {
|
||||||
|
this.replaceInvalidAcronym = replaceInvalidAcronym;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -43,6 +43,17 @@ import org.apache.lucene.analysis.Tokenizer;
|
||||||
public class StandardTokenizer extends Tokenizer {
|
public class StandardTokenizer extends Tokenizer {
|
||||||
/** A private instance of the JFlex-constructed scanner */
|
/** A private instance of the JFlex-constructed scanner */
|
||||||
private final StandardTokenizerImpl scanner;
|
private final StandardTokenizerImpl scanner;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Specifies whether deprecated acronyms should be replaced with HOST type.
|
||||||
|
* This is false by default to support backward compatibility.
|
||||||
|
*<p/>
|
||||||
|
* See http://issues.apache.org/jira/browse/LUCENE-1068
|
||||||
|
*
|
||||||
|
* @deprecated this should be removed in the next release (3.0).
|
||||||
|
*/
|
||||||
|
private boolean replaceInvalidAcronym = false;
|
||||||
|
|
||||||
void setInput(Reader reader) {
|
void setInput(Reader reader) {
|
||||||
this.input = reader;
|
this.input = reader;
|
||||||
}
|
}
|
||||||
|
@ -56,7 +67,20 @@ public class StandardTokenizer extends Tokenizer {
|
||||||
this.scanner = new StandardTokenizerImpl(input);
|
this.scanner = new StandardTokenizerImpl(input);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/**
|
||||||
|
* Creates a new instance of the {@link org.apache.lucene.analysis.standard.StandardTokenizer}. Attaches
|
||||||
|
* the <code>input</code> to the newly created JFlex scanner.
|
||||||
|
*
|
||||||
|
* @param input The input reader
|
||||||
|
* @param replaceInvalidAcronym Set to true to replace mischaracterized acronyms with HOST.
|
||||||
|
*
|
||||||
|
* See http://issues.apache.org/jira/browse/LUCENE-1068
|
||||||
|
*/
|
||||||
|
public StandardTokenizer(Reader input, boolean replaceInvalidAcronym) {
|
||||||
|
this.replaceInvalidAcronym = replaceInvalidAcronym;
|
||||||
|
this.input = input;
|
||||||
|
this.scanner = new StandardTokenizerImpl(input);
|
||||||
|
}/*
|
||||||
* (non-Javadoc)
|
* (non-Javadoc)
|
||||||
*
|
*
|
||||||
* @see org.apache.lucene.analysis.TokenStream#next()
|
* @see org.apache.lucene.analysis.TokenStream#next()
|
||||||
|
@ -72,7 +96,19 @@ public class StandardTokenizer extends Tokenizer {
|
||||||
final int start = scanner.yychar();
|
final int start = scanner.yychar();
|
||||||
result.setStartOffset(start);
|
result.setStartOffset(start);
|
||||||
result.setEndOffset(start+result.termLength());
|
result.setEndOffset(start+result.termLength());
|
||||||
|
// This 'if' should be removed in the next release. For now, it converts
|
||||||
|
// invalid acronyms to HOST. When removed, only the 'else' part should
|
||||||
|
// remain.
|
||||||
|
if (tokenType == StandardTokenizerImpl.ACRONYM_DEP) {
|
||||||
|
if (replaceInvalidAcronym) {
|
||||||
|
result.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]);
|
||||||
|
result.setTermLength(result.termLength() - 1); // remove extra '.'
|
||||||
|
} else {
|
||||||
|
result.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
result.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
|
result.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
|
||||||
|
}
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -90,4 +126,26 @@ public class StandardTokenizer extends Tokenizer {
|
||||||
input = reader;
|
input = reader;
|
||||||
reset();
|
reset();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Prior to https://issues.apache.org/jira/browse/LUCENE-1068, StandardTokenizer mischaracterized as acronyms tokens like www.abc.com
|
||||||
|
* when they should have been labeled as hosts instead.
|
||||||
|
* @return true if StandardTokenizer now returns these tokens as Hosts, otherwise false
|
||||||
|
*
|
||||||
|
* @deprecated Remove in 3.X and make true the only valid value
|
||||||
|
*/
|
||||||
|
public boolean isReplaceInvalidAcronym() {
|
||||||
|
return replaceInvalidAcronym;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
* @param replaceInvalidAcronym Set to true to replace mischaracterized acronyms as HOST.
|
||||||
|
* @deprecated Remove in 3.X and make true the only valid value
|
||||||
|
*
|
||||||
|
* See https://issues.apache.org/jira/browse/LUCENE-1068
|
||||||
|
*/
|
||||||
|
public void setReplaceInvalidAcronym(boolean replaceInvalidAcronym) {
|
||||||
|
this.replaceInvalidAcronym = replaceInvalidAcronym;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
/* The following code was generated by JFlex 1.4.1 on 8/9/07 10:15 AM */
|
/* The following code was generated by JFlex 1.4.1 on 12/18/07 9:22 PM */
|
||||||
|
|
||||||
package org.apache.lucene.analysis.standard;
|
package org.apache.lucene.analysis.standard;
|
||||||
|
|
||||||
|
@ -25,8 +25,8 @@ import org.apache.lucene.analysis.Token;
|
||||||
/**
|
/**
|
||||||
* This class is a scanner generated by
|
* This class is a scanner generated by
|
||||||
* <a href="http://www.jflex.de/">JFlex</a> 1.4.1
|
* <a href="http://www.jflex.de/">JFlex</a> 1.4.1
|
||||||
* on 8/9/07 10:15 AM from the specification file
|
* on 12/18/07 9:22 PM from the specification file
|
||||||
* <tt>/tango/mike/src/lucene.tokenfix/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex</tt>
|
* <tt>/Volumes/User/grantingersoll/projects/lucene/java/lucene-clean/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex</tt>
|
||||||
*/
|
*/
|
||||||
class StandardTokenizerImpl {
|
class StandardTokenizerImpl {
|
||||||
|
|
||||||
|
@ -63,12 +63,13 @@ class StandardTokenizerImpl {
|
||||||
private static final int [] ZZ_ACTION = zzUnpackAction();
|
private static final int [] ZZ_ACTION = zzUnpackAction();
|
||||||
|
|
||||||
private static final String ZZ_ACTION_PACKED_0 =
|
private static final String ZZ_ACTION_PACKED_0 =
|
||||||
"\1\0\1\1\4\2\1\3\1\1\14\0\1\4\4\5"+
|
"\1\0\1\1\4\2\1\3\1\1\6\0\2\2\6\0"+
|
||||||
"\2\6\2\0\1\7\1\0\1\7\3\5\6\7\3\5"+
|
"\1\4\4\5\2\6\2\0\1\7\1\0\1\7\3\5"+
|
||||||
"\1\10\4\0\1\10\2\0\2\10\2\5\1\11";
|
"\6\7\3\5\1\10\1\0\1\11\2\0\1\10\1\11"+
|
||||||
|
"\1\0\2\11\2\10\2\5\1\12";
|
||||||
|
|
||||||
private static int [] zzUnpackAction() {
|
private static int [] zzUnpackAction() {
|
||||||
int [] result = new int[57];
|
int [] result = new int[61];
|
||||||
int offset = 0;
|
int offset = 0;
|
||||||
offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
|
offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
|
||||||
return result;
|
return result;
|
||||||
|
@ -96,14 +97,14 @@ class StandardTokenizerImpl {
|
||||||
"\0\0\0\17\0\36\0\55\0\74\0\113\0\17\0\132"+
|
"\0\0\0\17\0\36\0\55\0\74\0\113\0\17\0\132"+
|
||||||
"\0\151\0\170\0\207\0\226\0\245\0\264\0\303\0\322"+
|
"\0\151\0\170\0\207\0\226\0\245\0\264\0\303\0\322"+
|
||||||
"\0\341\0\360\0\377\0\u010e\0\u011d\0\u012c\0\u013b\0\u014a"+
|
"\0\341\0\360\0\377\0\u010e\0\u011d\0\u012c\0\u013b\0\u014a"+
|
||||||
"\0\u0159\0\207\0\u0168\0\u0177\0\u0186\0\u0195\0\u01a4\0\u01b3"+
|
"\0\u0159\0\u0168\0\u0177\0\207\0\u0186\0\u0195\0\u01a4\0\u01b3"+
|
||||||
"\0\u01c2\0\u01d1\0\u01e0\0\u01ef\0\u01fe\0\u020d\0\u021c\0\u022b"+
|
"\0\u01c2\0\u01d1\0\u01e0\0\u01ef\0\u01fe\0\u020d\0\u021c\0\u022b"+
|
||||||
"\0\u023a\0\u0249\0\u0258\0\u0267\0\u0276\0\u0285\0\u0294\0\u02a3"+
|
"\0\u023a\0\u0249\0\u0258\0\u0267\0\u0276\0\u0285\0\u0294\0\u02a3"+
|
||||||
"\0\u02b2\0\u02c1\0\u02d0\0\u02df\0\170\0\377\0\u02ee\0\u02fd"+
|
"\0\u02b2\0\u02c1\0\u02d0\0\u02df\0\u02ee\0\u02fd\0\u012c\0\341"+
|
||||||
"\0\u030c";
|
"\0\170\0\u011d\0\u030c\0\u031b\0\u032a";
|
||||||
|
|
||||||
private static int [] zzUnpackRowMap() {
|
private static int [] zzUnpackRowMap() {
|
||||||
int [] result = new int[57];
|
int [] result = new int[61];
|
||||||
int offset = 0;
|
int offset = 0;
|
||||||
offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
|
offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
|
||||||
return result;
|
return result;
|
||||||
|
@ -127,49 +128,50 @@ class StandardTokenizerImpl {
|
||||||
|
|
||||||
private static final String ZZ_TRANS_PACKED_0 =
|
private static final String ZZ_TRANS_PACKED_0 =
|
||||||
"\10\2\1\3\1\4\1\5\1\6\1\7\1\10\1\2"+
|
"\10\2\1\3\1\4\1\5\1\6\1\7\1\10\1\2"+
|
||||||
"\20\0\1\11\1\12\1\13\1\14\2\15\1\16\1\3"+
|
"\20\0\1\11\1\12\1\13\1\14\2\15\1\16\1\17"+
|
||||||
"\1\4\1\5\1\6\5\0\1\17\1\0\1\20\2\21"+
|
"\1\4\1\20\1\6\5\0\1\21\1\0\1\22\2\23"+
|
||||||
"\1\22\3\4\1\6\4\0\1\11\1\23\1\13\1\14"+
|
"\1\24\3\4\1\6\4\0\1\11\1\25\1\13\1\14"+
|
||||||
"\2\21\1\22\1\5\1\4\1\5\1\6\5\0\1\24"+
|
"\2\23\1\24\1\20\1\4\1\20\1\6\5\0\1\26"+
|
||||||
"\1\0\1\20\2\15\1\16\4\6\21\0\1\2\10\0"+
|
"\1\0\1\22\2\15\1\16\4\6\21\0\1\2\10\0"+
|
||||||
"\1\25\1\0\1\25\14\0\1\26\1\27\1\30\1\31"+
|
"\1\27\1\0\1\27\14\0\1\30\1\31\1\32\1\33"+
|
||||||
"\13\0\1\32\1\0\1\32\14\0\1\33\1\34\1\33"+
|
"\13\0\1\34\1\0\1\34\14\0\1\35\1\36\1\35"+
|
||||||
"\1\34\13\0\1\35\2\36\1\37\13\0\1\16\2\40"+
|
"\1\36\13\0\1\37\2\40\1\41\13\0\1\16\2\42"+
|
||||||
"\14\0\1\41\2\42\1\43\13\0\4\34\13\0\1\44"+
|
"\5\0\1\11\1\26\1\13\1\14\2\15\1\16\1\17"+
|
||||||
"\2\45\1\46\13\0\1\47\2\50\1\51\13\0\1\52"+
|
"\1\4\1\20\1\6\4\0\1\11\1\21\1\13\1\14"+
|
||||||
"\1\42\1\53\1\43\13\0\1\54\2\27\1\31\4\0"+
|
"\2\23\1\24\1\20\1\4\1\20\1\6\13\0\1\43"+
|
||||||
"\1\11\6\0\1\25\1\0\1\25\6\0\1\55\1\0"+
|
"\2\44\1\45\13\0\4\36\13\0\1\46\2\47\1\50"+
|
||||||
"\1\20\2\56\1\0\1\26\1\27\1\30\1\31\5\0"+
|
"\13\0\1\51\2\52\1\53\13\0\1\54\1\44\1\55"+
|
||||||
"\1\57\1\0\1\20\2\60\1\61\3\27\1\31\5\0"+
|
"\1\45\13\0\1\56\2\31\1\33\4\0\1\11\6\0"+
|
||||||
"\1\62\1\0\1\20\2\60\1\61\1\30\1\27\1\30"+
|
"\1\27\1\0\1\27\6\0\1\57\1\0\1\22\2\60"+
|
||||||
"\1\31\5\0\1\63\1\0\1\20\2\56\1\0\4\31"+
|
"\1\0\1\56\2\31\1\33\5\0\1\61\1\0\1\22"+
|
||||||
"\5\0\1\64\2\0\1\64\2\0\1\33\1\34\1\33"+
|
"\2\62\1\63\3\31\1\33\5\0\1\64\1\0\1\22"+
|
||||||
"\1\34\5\0\1\64\2\0\1\64\2\0\4\34\5\0"+
|
"\2\62\1\63\3\31\1\33\5\0\1\65\1\0\1\22"+
|
||||||
"\1\56\1\0\1\20\2\56\1\0\1\35\2\36\1\37"+
|
"\2\60\1\0\4\33\5\0\1\66\2\0\1\66\2\0"+
|
||||||
"\5\0\1\60\1\0\1\20\2\60\1\61\3\36\1\37"+
|
"\1\35\1\36\1\35\1\36\5\0\1\66\2\0\1\66"+
|
||||||
"\5\0\1\56\1\0\1\20\2\56\1\0\4\37\5\0"+
|
"\2\0\4\36\5\0\1\60\1\0\1\22\2\60\1\0"+
|
||||||
"\1\61\2\0\3\61\3\40\6\0\1\24\1\0\1\20"+
|
"\1\37\2\40\1\41\5\0\1\62\1\0\1\22\2\62"+
|
||||||
"\2\15\1\16\1\41\2\42\1\43\5\0\1\17\1\0"+
|
"\1\63\3\40\1\41\5\0\1\60\1\0\1\22\2\60"+
|
||||||
"\1\20\2\21\1\22\3\42\1\43\5\0\1\24\1\0"+
|
"\1\0\4\41\5\0\1\63\2\0\3\63\3\42\6\0"+
|
||||||
"\1\20\2\15\1\16\4\43\5\0\1\15\1\0\1\20"+
|
"\1\67\1\0\1\22\2\15\1\16\1\43\2\44\1\45"+
|
||||||
"\2\15\1\16\1\44\2\45\1\46\5\0\1\21\1\0"+
|
"\5\0\1\70\1\0\1\22\2\23\1\24\3\44\1\45"+
|
||||||
"\1\20\2\21\1\22\3\45\1\46\5\0\1\15\1\0"+
|
"\5\0\1\67\1\0\1\22\2\15\1\16\4\45\5\0"+
|
||||||
"\1\20\2\15\1\16\4\46\5\0\1\16\2\0\3\16"+
|
"\1\15\1\0\1\22\2\15\1\16\1\46\2\47\1\50"+
|
||||||
"\1\47\2\50\1\51\5\0\1\22\2\0\3\22\3\50"+
|
"\5\0\1\23\1\0\1\22\2\23\1\24\3\47\1\50"+
|
||||||
"\1\51\5\0\1\16\2\0\3\16\4\51\5\0\1\65"+
|
"\5\0\1\15\1\0\1\22\2\15\1\16\4\50\5\0"+
|
||||||
"\1\0\1\20\2\15\1\16\1\52\1\42\1\53\1\43"+
|
"\1\16\2\0\3\16\1\51\2\52\1\53\5\0\1\24"+
|
||||||
"\5\0\1\66\1\0\1\20\2\21\1\22\1\53\1\42"+
|
"\2\0\3\24\3\52\1\53\5\0\1\16\2\0\3\16"+
|
||||||
"\1\53\1\43\5\0\1\63\1\0\1\20\2\56\1\0"+
|
"\4\53\5\0\1\71\1\0\1\22\2\15\1\16\1\43"+
|
||||||
"\1\54\2\27\1\31\13\0\1\67\1\31\1\67\1\31"+
|
"\2\44\1\45\5\0\1\72\1\0\1\22\2\23\1\24"+
|
||||||
"\13\0\4\37\13\0\4\43\13\0\4\46\13\0\4\51"+
|
"\3\44\1\45\5\0\1\65\1\0\1\22\2\60\1\0"+
|
||||||
"\13\0\1\70\1\43\1\70\1\43\13\0\4\31\13\0"+
|
"\1\56\2\31\1\33\13\0\1\73\1\33\1\73\1\33"+
|
||||||
"\4\71\5\0\1\55\1\0\1\20\2\56\1\0\1\67"+
|
"\13\0\4\41\13\0\4\45\13\0\4\50\13\0\4\53"+
|
||||||
"\1\31\1\67\1\31\5\0\1\65\1\0\1\20\2\15"+
|
"\13\0\1\74\1\45\1\74\1\45\13\0\4\33\13\0"+
|
||||||
"\1\16\1\70\1\43\1\70\1\43\5\0\1\64\2\0"+
|
"\4\75\5\0\1\57\1\0\1\22\2\60\1\0\4\33"+
|
||||||
"\1\64\2\0\4\71\3\0";
|
"\5\0\1\71\1\0\1\22\2\15\1\16\4\45\5\0"+
|
||||||
|
"\1\66\2\0\1\66\2\0\4\75\3\0";
|
||||||
|
|
||||||
private static int [] zzUnpackTrans() {
|
private static int [] zzUnpackTrans() {
|
||||||
int [] result = new int[795];
|
int [] result = new int[825];
|
||||||
int offset = 0;
|
int offset = 0;
|
||||||
offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
|
offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
|
||||||
return result;
|
return result;
|
||||||
|
@ -207,11 +209,12 @@ class StandardTokenizerImpl {
|
||||||
private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
|
private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
|
||||||
|
|
||||||
private static final String ZZ_ATTRIBUTE_PACKED_0 =
|
private static final String ZZ_ATTRIBUTE_PACKED_0 =
|
||||||
"\1\0\1\11\4\1\1\11\1\1\14\0\7\1\2\0"+
|
"\1\0\1\11\4\1\1\11\1\1\6\0\2\1\6\0"+
|
||||||
"\1\1\1\0\16\1\4\0\1\1\2\0\5\1";
|
"\7\1\2\0\1\1\1\0\16\1\1\0\1\1\2\0"+
|
||||||
|
"\2\1\1\0\7\1";
|
||||||
|
|
||||||
private static int [] zzUnpackAttribute() {
|
private static int [] zzUnpackAttribute() {
|
||||||
int [] result = new int[57];
|
int [] result = new int[61];
|
||||||
int offset = 0;
|
int offset = 0;
|
||||||
offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
|
offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
|
||||||
return result;
|
return result;
|
||||||
|
@ -288,6 +291,12 @@ public static final int EMAIL = 4;
|
||||||
public static final int HOST = 5;
|
public static final int HOST = 5;
|
||||||
public static final int NUM = 6;
|
public static final int NUM = 6;
|
||||||
public static final int CJ = 7;
|
public static final int CJ = 7;
|
||||||
|
/**
|
||||||
|
* @deprecated this solves a bug where HOSTs that end with '.' are identified
|
||||||
|
* as ACRONYMs. It is deprecated and will be removed in the next
|
||||||
|
* release.
|
||||||
|
*/
|
||||||
|
public static final int ACRONYM_DEP = 8;
|
||||||
|
|
||||||
public static final String [] TOKEN_TYPES = new String [] {
|
public static final String [] TOKEN_TYPES = new String [] {
|
||||||
"<ALPHANUM>",
|
"<ALPHANUM>",
|
||||||
|
@ -297,7 +306,8 @@ public static final String [] TOKEN_TYPES = new String [] {
|
||||||
"<EMAIL>",
|
"<EMAIL>",
|
||||||
"<HOST>",
|
"<HOST>",
|
||||||
"<NUM>",
|
"<NUM>",
|
||||||
"<CJ>"
|
"<CJ>",
|
||||||
|
"<ACRONYM_DEP>"
|
||||||
};
|
};
|
||||||
|
|
||||||
public final int yychar()
|
public final int yychar()
|
||||||
|
@ -605,39 +615,43 @@ final void getText(Token t) {
|
||||||
case 5:
|
case 5:
|
||||||
{ return HOST;
|
{ return HOST;
|
||||||
}
|
}
|
||||||
case 10: break;
|
case 11: break;
|
||||||
|
case 9:
|
||||||
|
{ return ACRONYM_DEP;
|
||||||
|
}
|
||||||
|
case 12: break;
|
||||||
case 8:
|
case 8:
|
||||||
{ return ACRONYM;
|
{ return ACRONYM;
|
||||||
}
|
}
|
||||||
case 11: break;
|
case 13: break;
|
||||||
case 1:
|
case 1:
|
||||||
{ /* ignore */
|
{ /* ignore */
|
||||||
}
|
}
|
||||||
case 12: break;
|
case 14: break;
|
||||||
case 7:
|
case 7:
|
||||||
{ return NUM;
|
{ return NUM;
|
||||||
}
|
}
|
||||||
case 13: break;
|
case 15: break;
|
||||||
case 3:
|
case 3:
|
||||||
{ return CJ;
|
{ return CJ;
|
||||||
}
|
}
|
||||||
case 14: break;
|
case 16: break;
|
||||||
case 2:
|
case 2:
|
||||||
{ return ALPHANUM;
|
{ return ALPHANUM;
|
||||||
}
|
}
|
||||||
case 15: break;
|
case 17: break;
|
||||||
case 6:
|
case 6:
|
||||||
{ return COMPANY;
|
{ return COMPANY;
|
||||||
}
|
}
|
||||||
case 16: break;
|
case 18: break;
|
||||||
case 4:
|
case 4:
|
||||||
{ return APOSTROPHE;
|
{ return APOSTROPHE;
|
||||||
}
|
}
|
||||||
case 17: break;
|
case 19: break;
|
||||||
case 9:
|
case 10:
|
||||||
{ return EMAIL;
|
{ return EMAIL;
|
||||||
}
|
}
|
||||||
case 18: break;
|
case 20: break;
|
||||||
default:
|
default:
|
||||||
if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
|
if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
|
||||||
zzAtEOF = true;
|
zzAtEOF = true;
|
||||||
|
|
|
@ -38,6 +38,12 @@ public static final int EMAIL = 4;
|
||||||
public static final int HOST = 5;
|
public static final int HOST = 5;
|
||||||
public static final int NUM = 6;
|
public static final int NUM = 6;
|
||||||
public static final int CJ = 7;
|
public static final int CJ = 7;
|
||||||
|
/**
|
||||||
|
* @deprecated this solves a bug where HOSTs that end with '.' are identified
|
||||||
|
* as ACRONYMs. It is deprecated and will be removed in the next
|
||||||
|
* release.
|
||||||
|
*/
|
||||||
|
public static final int ACRONYM_DEP = 8;
|
||||||
|
|
||||||
public static final String [] TOKEN_TYPES = new String [] {
|
public static final String [] TOKEN_TYPES = new String [] {
|
||||||
"<ALPHANUM>",
|
"<ALPHANUM>",
|
||||||
|
@ -47,7 +53,8 @@ public static final String [] TOKEN_TYPES = new String [] {
|
||||||
"<EMAIL>",
|
"<EMAIL>",
|
||||||
"<HOST>",
|
"<HOST>",
|
||||||
"<NUM>",
|
"<NUM>",
|
||||||
"<CJ>"
|
"<CJ>",
|
||||||
|
"<ACRONYM_DEP>"
|
||||||
};
|
};
|
||||||
|
|
||||||
public final int yychar()
|
public final int yychar()
|
||||||
|
@ -72,7 +79,9 @@ APOSTROPHE = {ALPHA} ("'" {ALPHA})+
|
||||||
|
|
||||||
// acronyms: U.S.A., I.B.M., etc.
|
// acronyms: U.S.A., I.B.M., etc.
|
||||||
// use a post-filter to remove dots
|
// use a post-filter to remove dots
|
||||||
ACRONYM = {ALPHA} "." ({ALPHA} ".")+
|
ACRONYM = {LETTER} "." ({LETTER} ".")+
|
||||||
|
|
||||||
|
ACRONYM_DEP = {ALPHANUM} "." ({ALPHANUM} ".")+
|
||||||
|
|
||||||
// company names like AT&T and Excite@Home.
|
// company names like AT&T and Excite@Home.
|
||||||
COMPANY = {ALPHA} ("&"|"@") {ALPHA}
|
COMPANY = {ALPHA} ("&"|"@") {ALPHA}
|
||||||
|
@ -125,6 +134,7 @@ WHITESPACE = \r\n | [ \r\n\t\f]
|
||||||
{HOST} { return HOST; }
|
{HOST} { return HOST; }
|
||||||
{NUM} { return NUM; }
|
{NUM} { return NUM; }
|
||||||
{CJ} { return CJ; }
|
{CJ} { return CJ; }
|
||||||
|
{ACRONYM_DEP} { return ACRONYM_DEP; }
|
||||||
|
|
||||||
/** Ignore the rest */
|
/** Ignore the rest */
|
||||||
. | {WHITESPACE} { /* ignore */ }
|
. | {WHITESPACE} { /* ignore */ }
|
||||||
|
|
|
@ -92,6 +92,12 @@ public class TestStandardAnalyzer extends LuceneTestCase {
|
||||||
public void testDomainNames() throws Exception {
|
public void testDomainNames() throws Exception {
|
||||||
// domain names
|
// domain names
|
||||||
assertAnalyzesTo(a, "www.nutch.org", new String[]{"www.nutch.org"});
|
assertAnalyzesTo(a, "www.nutch.org", new String[]{"www.nutch.org"});
|
||||||
|
//Notice the trailing . See https://issues.apache.org/jira/browse/LUCENE-1068.
|
||||||
|
//TODO: Remove in 3.x
|
||||||
|
assertAnalyzesTo(a, "www.nutch.org.", new String[]{ "wwwnutchorg" }, new String[] { "<ACRONYM>" });
|
||||||
|
// the following should be recognized as HOST. The code that sets replaceDepAcronym should be removed in the next release.
|
||||||
|
((StandardAnalyzer) a).setReplaceInvalidAcronym(true);
|
||||||
|
assertAnalyzesTo(a, "www.nutch.org.", new String[]{ "www.nutch.org" }, new String[] { "<HOST>" });
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testEMailAddresses() throws Exception {
|
public void testEMailAddresses() throws Exception {
|
||||||
|
@ -195,4 +201,11 @@ public class TestStandardAnalyzer extends LuceneTestCase {
|
||||||
"<ALPHANUM>", "<NUM>", "<HOST>", "<NUM>", "<ALPHANUM>",
|
"<ALPHANUM>", "<NUM>", "<HOST>", "<NUM>", "<ALPHANUM>",
|
||||||
"<ALPHANUM>", "<HOST>"});
|
"<ALPHANUM>", "<HOST>"});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** @deprecated this should be removed in the 3.0. */
|
||||||
|
public void testDeprecatedAcronyms() throws Exception {
|
||||||
|
// test backward compatibility for applications that require the old behavior.
|
||||||
|
// this should be removed once replaceDepAcronym is removed.
|
||||||
|
assertAnalyzesTo(a, "lucene.apache.org.", new String[]{ "luceneapacheorg" }, new String[] { "<ACRONYM>" });
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue