LUCENE-1118: skip terms > 255 (by default) characters in length in StandardAnalyzer

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@609330 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael McCandless 2008-01-06 15:37:44 +00:00
parent f0d5002066
commit 26bc874e62
5 changed files with 112 additions and 26 deletions

View File

@ -90,6 +90,10 @@ API Changes
modified so it is token producer's responsibility
to call Token.clear(). (Doron Cohen)
14. LUCENE-1118: Changed StandardAnalyzer to skip tokens that are too
long (by default, longer than 255 characters). You can change this limit
by calling StandardAnalyzer.setMaxTokenLength(...). (Michael McCandless)
Bug fixes

View File

@ -144,8 +144,9 @@ public class StandardAnalyzer extends Analyzer {
/** Constructs a {@link StandardTokenizer} filtered by a {@link
StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}. */
public TokenStream tokenStream(String fieldName, Reader reader) {
TokenStream result = new StandardTokenizer(reader, replaceInvalidAcronym);
result = new StandardFilter(result);
StandardTokenizer tokenStream = new StandardTokenizer(reader, replaceInvalidAcronym);
tokenStream.setMaxTokenLength(maxTokenLength);
TokenStream result = new StandardFilter(tokenStream);
result = new LowerCaseFilter(result);
result = new StopFilter(result, stopSet);
return result;
@ -155,6 +156,28 @@ public class StandardAnalyzer extends Analyzer {
StandardTokenizer tokenStream;
TokenStream filteredTokenStream;
}
/** Default maximum allowed token length, in characters. */
public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;
/** Current limit; handed to the StandardTokenizer each time tokenStream or reusableTokenStream runs. */
private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
/**
 * Set maximum allowed token length.  If a token is seen
 * that exceeds this length then it is discarded.  This
 * setting only takes effect the next time tokenStream or
 * reusableTokenStream is called.
 *
 * @param length maximum token length; must be at least 1
 * @throws IllegalArgumentException if <code>length</code> is less than 1
 */
public void setMaxTokenLength(int length) {
  // The tokenizer keeps a token only when its length is <= this limit, so a
  // limit below 1 would silently drop every non-empty token.  Fail fast here
  // instead of producing empty token streams later.
  if (length < 1) {
    throw new IllegalArgumentException("maxTokenLength must be greater than zero: " + length);
  }
  maxTokenLength = length;
}
/**
 * Returns the maximum token length currently configured on this analyzer.
 *
 * @see #setMaxTokenLength
 */
public int getMaxTokenLength() {
  return this.maxTokenLength;
}
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
SavedStreams streams = (SavedStreams) getPreviousTokenStream();
@ -168,6 +191,7 @@ public class StandardAnalyzer extends Analyzer {
} else {
streams.tokenStream.reset(reader);
}
streams.tokenStream.setMaxTokenLength(maxTokenLength);
streams.tokenStream.setReplaceInvalidAcronym(replaceInvalidAcronym);

View File

@ -58,6 +58,19 @@ public class StandardTokenizer extends Tokenizer {
this.input = reader;
}
/** Longest token, measured by the scanner's yylength(), that next(Token) will return; longer tokens are skipped. */
private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
/**
 * Set the max allowed token length.  Any token longer
 * than this is skipped.
 *
 * @param length maximum token length; must be at least 1
 * @throws IllegalArgumentException if <code>length</code> is less than 1
 */
public void setMaxTokenLength(int length) {
  // next(Token) only returns tokens whose yylength() is <= this limit, so a
  // value below 1 would make the tokenizer skip every non-empty token
  // without any indication of why.  Reject it up front.
  if (length < 1) {
    throw new IllegalArgumentException("maxTokenLength must be greater than zero: " + length);
  }
  this.maxTokenLength = length;
}
/**
 * Returns the max allowed token length currently set on this tokenizer.
 *
 * @see #setMaxTokenLength
 */
public int getMaxTokenLength() {
  return this.maxTokenLength;
}
/**
* Creates a new instance of the {@link StandardTokenizer}. Attaches the
* <code>input</code> to a newly created JFlex scanner.
@ -80,37 +93,49 @@ public class StandardTokenizer extends Tokenizer {
this.replaceInvalidAcronym = replaceInvalidAcronym;
this.input = input;
this.scanner = new StandardTokenizerImpl(input);
}/*
* (non-Javadoc)
*
* @see org.apache.lucene.analysis.TokenStream#next()
*/
public Token next(Token result) throws IOException {
}
/*
* (non-Javadoc)
*
* @see org.apache.lucene.analysis.TokenStream#next()
*/
public Token next(Token result) throws IOException {
int posIncr = 1;
while(true) {
int tokenType = scanner.getNextToken();
if (tokenType == StandardTokenizerImpl.YYEOF) {
return null;
}
result.clear();
scanner.getText(result);
final int start = scanner.yychar();
result.setStartOffset(start);
result.setEndOffset(start+result.termLength());
// This 'if' should be removed in the next release. For now, it converts
// invalid acronyms to HOST. When removed, only the 'else' part should
// remain.
if (tokenType == StandardTokenizerImpl.ACRONYM_DEP) {
if (replaceInvalidAcronym) {
result.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]);
result.setTermLength(result.termLength() - 1); // remove extra '.'
if (scanner.yylength() <= maxTokenLength) {
result.clear();
result.setPositionIncrement(posIncr);
scanner.getText(result);
final int start = scanner.yychar();
result.setStartOffset(start);
result.setEndOffset(start+result.termLength());
// This 'if' should be removed in the next release. For now, it converts
// invalid acronyms to HOST. When removed, only the 'else' part should
// remain.
if (tokenType == StandardTokenizerImpl.ACRONYM_DEP) {
if (replaceInvalidAcronym) {
result.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]);
result.setTermLength(result.termLength() - 1); // remove extra '.'
} else {
result.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]);
}
} else {
result.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]);
result.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
}
} else {
result.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
}
return result;
return result;
} else
// When we skip a too-long term, we still increment the
// position increment
posIncr++;
}
}
/*

View File

@ -30,6 +30,10 @@ public class TestStandardAnalyzer extends LuceneTestCase {
}
public void assertAnalyzesTo(Analyzer a, String input, String[] expectedImages, String[] expectedTypes) throws Exception {
  // Convenience overload: delegate with no position-increment expectations.
  final int[] noExpectedPosIncrs = null;
  assertAnalyzesTo(a, input, expectedImages, expectedTypes, noExpectedPosIncrs);
}
public void assertAnalyzesTo(Analyzer a, String input, String[] expectedImages, String[] expectedTypes, int[] expectedPosIncrs) throws Exception {
TokenStream ts = a.tokenStream("dummy", new StringReader(input));
for (int i = 0; i < expectedImages.length; i++) {
Token t = ts.next();
@ -38,12 +42,39 @@ public class TestStandardAnalyzer extends LuceneTestCase {
if (expectedTypes != null) {
assertEquals(expectedTypes[i], t.type());
}
if (expectedPosIncrs != null) {
assertEquals(expectedPosIncrs[i], t.getPositionIncrement());
}
}
assertNull(ts.next());
ts.close();
}
public void testMaxTermLength() throws Exception {
  // With the limit lowered to 5, the 7-character term "toolong" must be dropped.
  final StandardAnalyzer analyzer = new StandardAnalyzer();
  analyzer.setMaxTokenLength(5);
  final String[] expectedTerms = {"ab", "cd", "xy", "z"};
  assertAnalyzesTo(analyzer, "ab cd toolong xy z", expectedTerms);
}
public void testMaxTermLength2() throws Exception {
  final String text = "ab cd toolong xy z";
  final StandardAnalyzer analyzer = new StandardAnalyzer();

  // Default limit: every term, including "toolong", is kept.
  final String[] allTerms = {"ab", "cd", "toolong", "xy", "z"};
  assertAnalyzesTo(analyzer, text, allTerms);

  // Limit of 5: "toolong" is skipped, and the term after it ("xy") is
  // expected to carry a position increment of 2.
  analyzer.setMaxTokenLength(5);
  final String[] shortTerms = {"ab", "cd", "xy", "z"};
  final int[] expectedPosIncrs = {1, 1, 2, 1};
  assertAnalyzesTo(analyzer, text, shortTerms, null, expectedPosIncrs);
}
public void testMaxTermLength3() throws Exception {
  // Build a term of exactly 255 'a' characters.
  final char[] filler = new char[255];
  java.util.Arrays.fill(filler, 'a');
  final String longTerm = new String(filler);

  // A 255-character term is expected to come through...
  final String[] withLongTerm = {"ab", "cd", longTerm, "xy", "z"};
  assertAnalyzesTo(a, "ab cd " + longTerm + " xy z", withLongTerm);

  // ...while one more character (256 total) is expected to get it skipped.
  final String[] withoutLongTerm = {"ab", "cd", "xy", "z"};
  assertAnalyzesTo(a, "ab cd " + longTerm + "a xy z", withoutLongTerm);
}
public void testAlphanumeric() throws Exception {
// alphanumeric tokens
assertAnalyzesTo(a, "B2B", new String[]{"b2b"});

View File

@ -586,7 +586,9 @@ public class TestIndexWriter extends LuceneTestCase
// maximum length term, and search on that term:
doc = new Document();
doc.add(new Field("content", bigTerm, Field.Store.NO, Field.Index.TOKENIZED));
writer = new IndexWriter(dir, new StandardAnalyzer());
StandardAnalyzer sa = new StandardAnalyzer();
sa.setMaxTokenLength(100000);
writer = new IndexWriter(dir, sa);
writer.addDocument(doc);
writer.close();
reader = IndexReader.open(dir);