LUCENE-1118: skip terms > 255 (by default) characters in length in StandardAnalyzer

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@609330 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2008-01-06 15:37:44 +00:00
parent f0d5002066
commit 26bc874e62
5 changed files with 112 additions and 26 deletions

View File

@@ -90,6 +90,10 @@ API Changes
     modified so it is token producer's responsibility
     to call Token.clear().  (Doron Cohen)
 
+14. LUCENE-1118: Changed StandardAnalyzer to skip too-long (default >
+    255 characters) tokens.  You can increase this limit by calling
+    StandardAnalyzer.setMaxTokenLength(...).  (Michael McCandless)
+
 Bug fixes
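A minimal usage sketch of the new setting; the 10,000-character value below is arbitrary, chosen only to show a limit larger than the 255-character default.

    import org.apache.lucene.analysis.standard.StandardAnalyzer;

    public class RaiseTokenLimitSketch {
      public static StandardAnalyzer newAnalyzer() {
        // StandardAnalyzer now drops any token longer than
        // DEFAULT_MAX_TOKEN_LENGTH (255 chars) unless the limit is raised.
        StandardAnalyzer analyzer = new StandardAnalyzer();
        analyzer.setMaxTokenLength(10000); // arbitrary illustrative limit
        return analyzer;
      }
    }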

View File

@@ -144,8 +144,9 @@ public class StandardAnalyzer extends Analyzer {
   /** Constructs a {@link StandardTokenizer} filtered by a {@link
   StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}. */
   public TokenStream tokenStream(String fieldName, Reader reader) {
-    TokenStream result = new StandardTokenizer(reader, replaceInvalidAcronym);
-    result = new StandardFilter(result);
+    StandardTokenizer tokenStream = new StandardTokenizer(reader, replaceInvalidAcronym);
+    tokenStream.setMaxTokenLength(maxTokenLength);
+    TokenStream result = new StandardFilter(tokenStream);
     result = new LowerCaseFilter(result);
     result = new StopFilter(result, stopSet);
     return result;
@@ -155,6 +156,28 @@ public class StandardAnalyzer extends Analyzer {
     StandardTokenizer tokenStream;
     TokenStream filteredTokenStream;
   }
 
+  /** Default maximum allowed token length */
+  public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;
+
+  private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
+
+  /**
+   * Set maximum allowed token length.  If a token is seen
+   * that exceeds this length then it is discarded.  This
+   * setting only takes effect the next time tokenStream or
+   * reusableTokenStream is called.
+   */
+  public void setMaxTokenLength(int length) {
+    maxTokenLength = length;
+  }
+
+  /**
+   * @see #setMaxTokenLength
+   */
+  public int getMaxTokenLength() {
+    return maxTokenLength;
+  }
+
   public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
     SavedStreams streams = (SavedStreams) getPreviousTokenStream();
@@ -168,6 +191,7 @@ public class StandardAnalyzer extends Analyzer {
     } else {
       streams.tokenStream.reset(reader);
     }
+    streams.tokenStream.setMaxTokenLength(maxTokenLength);
     streams.tokenStream.setReplaceInvalidAcronym(replaceInvalidAcronym);
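A small sketch of what the two hunks above guarantee: whether the stream comes from tokenStream or from a reused reusableTokenStream, the analyzer pushes its current limit onto the tokenizer each time, so a change takes effect on the next stream it produces. The field name and sample text below are made up.

    import java.io.IOException;
    import java.io.StringReader;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;

    public class ReusedStreamLimitSketch {
      static void dump(StandardAnalyzer analyzer, String text) throws IOException {
        TokenStream ts = analyzer.tokenStream("body", new StringReader(text));
        Token t;
        while ((t = ts.next()) != null) {
          System.out.print(t.termText() + " ");
        }
        System.out.println();
        ts.close();
      }

      public static void main(String[] args) throws IOException {
        StandardAnalyzer analyzer = new StandardAnalyzer();
        dump(analyzer, "tiny enormousterm tiny");  // default 255-char limit: all three terms emitted

        analyzer.setMaxTokenLength(4);             // takes effect on the next stream
        dump(analyzer, "tiny enormousterm tiny");  // "enormousterm" is now skipped
      }
    }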

View File

@@ -58,6 +58,19 @@ public class StandardTokenizer extends Tokenizer {
     this.input = reader;
   }
 
+  private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
+
+  /** Set the max allowed token length.  Any token longer
+   *  than this is skipped. */
+  public void setMaxTokenLength(int length) {
+    this.maxTokenLength = length;
+  }
+
+  /** @see #setMaxTokenLength */
+  public int getMaxTokenLength() {
+    return maxTokenLength;
+  }
+
   /**
    * Creates a new instance of the {@link StandardTokenizer}. Attaches the
    * <code>input</code> to a newly created JFlex scanner.
@@ -80,37 +93,49 @@ public class StandardTokenizer extends Tokenizer {
     this.replaceInvalidAcronym = replaceInvalidAcronym;
     this.input = input;
     this.scanner = new StandardTokenizerImpl(input);
-  }/*
+  }
+
+  /*
    * (non-Javadoc)
    *
    * @see org.apache.lucene.analysis.TokenStream#next()
    */
   public Token next(Token result) throws IOException {
+    int posIncr = 1;
+
+    while(true) {
       int tokenType = scanner.getNextToken();
 
       if (tokenType == StandardTokenizerImpl.YYEOF) {
        return null;
       }
 
+      if (scanner.yylength() <= maxTokenLength) {
        result.clear();
+       result.setPositionIncrement(posIncr);
        scanner.getText(result);
        final int start = scanner.yychar();
        result.setStartOffset(start);
        result.setEndOffset(start+result.termLength());
        // This 'if' should be removed in the next release. For now, it converts
        // invalid acronyms to HOST. When removed, only the 'else' part should
        // remain.
        if (tokenType == StandardTokenizerImpl.ACRONYM_DEP) {
          if (replaceInvalidAcronym) {
            result.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]);
            result.setTermLength(result.termLength() - 1); // remove extra '.'
          } else {
            result.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]);
          }
        } else {
          result.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
        }
        return result;
+      } else
+       // When we skip a too-long term, we still increment the
+       // position increment
+       posIncr++;
+    }
   }
 
   /*
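To make the position-increment handling above concrete: a skipped over-long term is never emitted, but the next emitted term carries a larger positionIncrement, so positional queries still see the gap. A rough sketch whose expected output mirrors the testMaxTermLength2 expectations further down:

    import java.io.StringReader;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;

    public class SkippedTermPositionsSketch {
      public static void main(String[] args) throws Exception {
        StandardAnalyzer analyzer = new StandardAnalyzer();
        analyzer.setMaxTokenLength(5);

        // "toolong" (7 chars) exceeds the limit and is skipped; "xy" then
        // reports a position increment of 2, preserving the positional gap.
        TokenStream ts = analyzer.tokenStream("f", new StringReader("ab cd toolong xy z"));
        Token t;
        while ((t = ts.next()) != null) {
          System.out.println(t.termText() + " +" + t.getPositionIncrement());
        }
        ts.close();
        // Expected output (per the tests below): ab +1, cd +1, xy +2, z +1
      }
    }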

View File

@@ -30,6 +30,10 @@ public class TestStandardAnalyzer extends LuceneTestCase {
   }
 
   public void assertAnalyzesTo(Analyzer a, String input, String[] expectedImages, String[] expectedTypes) throws Exception {
+    assertAnalyzesTo(a, input, expectedImages, expectedTypes, null);
+  }
+
+  public void assertAnalyzesTo(Analyzer a, String input, String[] expectedImages, String[] expectedTypes, int[] expectedPosIncrs) throws Exception {
     TokenStream ts = a.tokenStream("dummy", new StringReader(input));
     for (int i = 0; i < expectedImages.length; i++) {
       Token t = ts.next();
@@ -38,12 +42,39 @@ public class TestStandardAnalyzer extends LuceneTestCase {
       if (expectedTypes != null) {
         assertEquals(expectedTypes[i], t.type());
       }
+      if (expectedPosIncrs != null) {
+        assertEquals(expectedPosIncrs[i], t.getPositionIncrement());
+      }
     }
     assertNull(ts.next());
     ts.close();
   }
 
+  public void testMaxTermLength() throws Exception {
+    StandardAnalyzer sa = new StandardAnalyzer();
+    sa.setMaxTokenLength(5);
+    assertAnalyzesTo(sa, "ab cd toolong xy z", new String[]{"ab", "cd", "xy", "z"});
+  }
+
+  public void testMaxTermLength2() throws Exception {
+    StandardAnalyzer sa = new StandardAnalyzer();
+    assertAnalyzesTo(sa, "ab cd toolong xy z", new String[]{"ab", "cd", "toolong", "xy", "z"});
+    sa.setMaxTokenLength(5);
+    assertAnalyzesTo(sa, "ab cd toolong xy z", new String[]{"ab", "cd", "xy", "z"}, null, new int[]{1, 1, 2, 1});
+  }
+
+  public void testMaxTermLength3() throws Exception {
+    char[] chars = new char[255];
+    for(int i=0;i<255;i++)
+      chars[i] = 'a';
+    String longTerm = new String(chars, 0, 255);
+
+    assertAnalyzesTo(a, "ab cd " + longTerm + " xy z", new String[]{"ab", "cd", longTerm, "xy", "z"});
+    assertAnalyzesTo(a, "ab cd " + longTerm + "a xy z", new String[]{"ab", "cd", "xy", "z"});
+  }
+
   public void testAlphanumeric() throws Exception {
     // alphanumeric tokens
     assertAnalyzesTo(a, "B2B", new String[]{"b2b"});

View File

@@ -586,7 +586,9 @@ public class TestIndexWriter extends LuceneTestCase
     // maximum length term, and search on that term:
     doc = new Document();
     doc.add(new Field("content", bigTerm, Field.Store.NO, Field.Index.TOKENIZED));
-    writer = new IndexWriter(dir, new StandardAnalyzer());
+    StandardAnalyzer sa = new StandardAnalyzer();
+    sa.setMaxTokenLength(100000);
+    writer = new IndexWriter(dir, sa);
     writer.addDocument(doc);
     writer.close();
     reader = IndexReader.open(dir);
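A rough end-to-end sketch of the scenario this test exercises, with an in-memory directory and a made-up 1,000-character term: once the analyzer's limit is raised to at least the term's length, the term is indexed whole and can be found with a plain TermQuery.

    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.search.Hits;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.TermQuery;
    import org.apache.lucene.store.RAMDirectory;

    public class LongTermIndexingSketch {
      public static void main(String[] args) throws Exception {
        String longTerm = new String(new char[1000]).replace('\0', 'a'); // illustrative 1,000-char term

        RAMDirectory dir = new RAMDirectory();
        StandardAnalyzer sa = new StandardAnalyzer();
        sa.setMaxTokenLength(longTerm.length()); // without this, the term would be skipped

        IndexWriter writer = new IndexWriter(dir, sa);
        Document doc = new Document();
        doc.add(new Field("content", longTerm, Field.Store.NO, Field.Index.TOKENIZED));
        writer.addDocument(doc);
        writer.close();

        IndexSearcher searcher = new IndexSearcher(dir);
        Hits hits = searcher.search(new TermQuery(new Term("content", longTerm)));
        System.out.println("hits: " + hits.length()); // expect 1
        searcher.close();
      }
    }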