mirror of https://github.com/apache/lucene.git
LUCENE-1118: skip terms > 255 (by default) characters in length in StandardAnalyzer
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@609330 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
f0d5002066
commit
26bc874e62
|
@ -90,6 +90,10 @@ API Changes
|
|||
modified so it is token producer's responsibility
|
||||
to call Token.clear(). (Doron Cohen)
|
||||
|
||||
14. LUCENE-1118: Changed StandardAnalyzer to skip too-long (default >
|
||||
255 characters) tokens. You can increase this limit by calling
|
||||
StandardAnalyzer.setMaxTokenLength(...). (Michael McCandless)
|
||||
|
||||
|
||||
Bug fixes
|
||||
|
||||
|
|
|
@ -144,8 +144,9 @@ public class StandardAnalyzer extends Analyzer {
|
|||
/** Constructs a {@link StandardTokenizer} filtered by a {@link
|
||||
StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}. */
|
||||
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||
TokenStream result = new StandardTokenizer(reader, replaceInvalidAcronym);
|
||||
result = new StandardFilter(result);
|
||||
StandardTokenizer tokenStream = new StandardTokenizer(reader, replaceInvalidAcronym);
|
||||
tokenStream.setMaxTokenLength(maxTokenLength);
|
||||
TokenStream result = new StandardFilter(tokenStream);
|
||||
result = new LowerCaseFilter(result);
|
||||
result = new StopFilter(result, stopSet);
|
||||
return result;
|
||||
|
@ -155,6 +156,28 @@ public class StandardAnalyzer extends Analyzer {
|
|||
StandardTokenizer tokenStream;
|
||||
TokenStream filteredTokenStream;
|
||||
}
|
||||
|
||||
/** Default maximum allowed token length */
|
||||
public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;
|
||||
|
||||
private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
|
||||
|
||||
/**
|
||||
* Set maximum allowed token length. If a token is seen
|
||||
* that exceeds this length then it is discarded. This
|
||||
* setting only takes effect the next time tokenStream or
|
||||
* reusableTokenStream is called.
|
||||
*/
|
||||
public void setMaxTokenLength(int length) {
|
||||
maxTokenLength = length;
|
||||
}
|
||||
|
||||
/**
|
||||
* @see #setMaxTokenLength
|
||||
*/
|
||||
public int getMaxTokenLength() {
|
||||
return maxTokenLength;
|
||||
}
|
||||
|
||||
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
|
||||
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
|
||||
|
@ -168,6 +191,7 @@ public class StandardAnalyzer extends Analyzer {
|
|||
} else {
|
||||
streams.tokenStream.reset(reader);
|
||||
}
|
||||
streams.tokenStream.setMaxTokenLength(maxTokenLength);
|
||||
|
||||
streams.tokenStream.setReplaceInvalidAcronym(replaceInvalidAcronym);
|
||||
|
||||
|
|
|
@ -58,6 +58,19 @@ public class StandardTokenizer extends Tokenizer {
|
|||
this.input = reader;
|
||||
}
|
||||
|
||||
private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
|
||||
|
||||
/** Set the max allowed token length. Any token longer
|
||||
* than this is skipped. */
|
||||
public void setMaxTokenLength(int length) {
|
||||
this.maxTokenLength = length;
|
||||
}
|
||||
|
||||
/** @see #setMaxTokenLength */
|
||||
public int getMaxTokenLength() {
|
||||
return maxTokenLength;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a new instance of the {@link StandardTokenizer}. Attaches the
|
||||
* <code>input</code> to a newly created JFlex scanner.
|
||||
|
@ -80,37 +93,49 @@ public class StandardTokenizer extends Tokenizer {
|
|||
this.replaceInvalidAcronym = replaceInvalidAcronym;
|
||||
this.input = input;
|
||||
this.scanner = new StandardTokenizerImpl(input);
|
||||
}/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see org.apache.lucene.analysis.TokenStream#next()
|
||||
*/
|
||||
public Token next(Token result) throws IOException {
|
||||
}
|
||||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
* @see org.apache.lucene.analysis.TokenStream#next()
|
||||
*/
|
||||
public Token next(Token result) throws IOException {
|
||||
int posIncr = 1;
|
||||
|
||||
while(true) {
|
||||
int tokenType = scanner.getNextToken();
|
||||
|
||||
if (tokenType == StandardTokenizerImpl.YYEOF) {
|
||||
return null;
|
||||
}
|
||||
|
||||
result.clear();
|
||||
scanner.getText(result);
|
||||
final int start = scanner.yychar();
|
||||
result.setStartOffset(start);
|
||||
result.setEndOffset(start+result.termLength());
|
||||
// This 'if' should be removed in the next release. For now, it converts
|
||||
// invalid acronyms to HOST. When removed, only the 'else' part should
|
||||
// remain.
|
||||
if (tokenType == StandardTokenizerImpl.ACRONYM_DEP) {
|
||||
if (replaceInvalidAcronym) {
|
||||
result.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]);
|
||||
result.setTermLength(result.termLength() - 1); // remove extra '.'
|
||||
if (scanner.yylength() <= maxTokenLength) {
|
||||
result.clear();
|
||||
result.setPositionIncrement(posIncr);
|
||||
scanner.getText(result);
|
||||
final int start = scanner.yychar();
|
||||
result.setStartOffset(start);
|
||||
result.setEndOffset(start+result.termLength());
|
||||
// This 'if' should be removed in the next release. For now, it converts
|
||||
// invalid acronyms to HOST. When removed, only the 'else' part should
|
||||
// remain.
|
||||
if (tokenType == StandardTokenizerImpl.ACRONYM_DEP) {
|
||||
if (replaceInvalidAcronym) {
|
||||
result.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]);
|
||||
result.setTermLength(result.termLength() - 1); // remove extra '.'
|
||||
} else {
|
||||
result.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]);
|
||||
}
|
||||
} else {
|
||||
result.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]);
|
||||
result.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
|
||||
}
|
||||
} else {
|
||||
result.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
|
||||
}
|
||||
return result;
|
||||
return result;
|
||||
} else
|
||||
// When we skip a too-long term, we still increment the
|
||||
// position increment
|
||||
posIncr++;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
|
@ -30,6 +30,10 @@ public class TestStandardAnalyzer extends LuceneTestCase {
|
|||
}
|
||||
|
||||
public void assertAnalyzesTo(Analyzer a, String input, String[] expectedImages, String[] expectedTypes) throws Exception {
|
||||
assertAnalyzesTo(a, input, expectedImages, expectedTypes, null);
|
||||
}
|
||||
|
||||
public void assertAnalyzesTo(Analyzer a, String input, String[] expectedImages, String[] expectedTypes, int[] expectedPosIncrs) throws Exception {
|
||||
TokenStream ts = a.tokenStream("dummy", new StringReader(input));
|
||||
for (int i = 0; i < expectedImages.length; i++) {
|
||||
Token t = ts.next();
|
||||
|
@ -38,12 +42,39 @@ public class TestStandardAnalyzer extends LuceneTestCase {
|
|||
if (expectedTypes != null) {
|
||||
assertEquals(expectedTypes[i], t.type());
|
||||
}
|
||||
if (expectedPosIncrs != null) {
|
||||
assertEquals(expectedPosIncrs[i], t.getPositionIncrement());
|
||||
}
|
||||
}
|
||||
assertNull(ts.next());
|
||||
ts.close();
|
||||
}
|
||||
|
||||
|
||||
public void testMaxTermLength() throws Exception {
|
||||
StandardAnalyzer sa = new StandardAnalyzer();
|
||||
sa.setMaxTokenLength(5);
|
||||
assertAnalyzesTo(sa, "ab cd toolong xy z", new String[]{"ab", "cd", "xy", "z"});
|
||||
}
|
||||
|
||||
public void testMaxTermLength2() throws Exception {
|
||||
StandardAnalyzer sa = new StandardAnalyzer();
|
||||
assertAnalyzesTo(sa, "ab cd toolong xy z", new String[]{"ab", "cd", "toolong", "xy", "z"});
|
||||
sa.setMaxTokenLength(5);
|
||||
|
||||
assertAnalyzesTo(sa, "ab cd toolong xy z", new String[]{"ab", "cd", "xy", "z"}, null, new int[]{1, 1, 2, 1});
|
||||
}
|
||||
|
||||
public void testMaxTermLength3() throws Exception {
|
||||
char[] chars = new char[255];
|
||||
for(int i=0;i<255;i++)
|
||||
chars[i] = 'a';
|
||||
String longTerm = new String(chars, 0, 255);
|
||||
|
||||
assertAnalyzesTo(a, "ab cd " + longTerm + " xy z", new String[]{"ab", "cd", longTerm, "xy", "z"});
|
||||
assertAnalyzesTo(a, "ab cd " + longTerm + "a xy z", new String[]{"ab", "cd", "xy", "z"});
|
||||
}
|
||||
|
||||
public void testAlphanumeric() throws Exception {
|
||||
// alphanumeric tokens
|
||||
assertAnalyzesTo(a, "B2B", new String[]{"b2b"});
|
||||
|
|
|
@ -586,7 +586,9 @@ public class TestIndexWriter extends LuceneTestCase
|
|||
// maximum length term, and search on that term:
|
||||
doc = new Document();
|
||||
doc.add(new Field("content", bigTerm, Field.Store.NO, Field.Index.TOKENIZED));
|
||||
writer = new IndexWriter(dir, new StandardAnalyzer());
|
||||
StandardAnalyzer sa = new StandardAnalyzer();
|
||||
sa.setMaxTokenLength(100000);
|
||||
writer = new IndexWriter(dir, sa);
|
||||
writer.addDocument(doc);
|
||||
writer.close();
|
||||
reader = IndexReader.open(dir);
|
||||
|
|
Loading…
Reference in New Issue