mirror of https://github.com/apache/lucene.git
LUCENE-1118: skip terms > 255 (by default) characters in length in StandardAnalyzer
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@609330 13f79535-47bb-0310-9956-ffa450edef68
parent f0d5002066
commit 26bc874e62
CHANGES.txt
@@ -90,6 +90,10 @@ API Changes
     modified so it is token producer's responsibility
     to call Token.clear(). (Doron Cohen)

+14. LUCENE-1118: Changed StandardAnalyzer to skip too-long (default >
+    255 characters) tokens. You can increase this limit by calling
+    StandardAnalyzer.setMaxTokenLength(...). (Michael McCandless)
+
 Bug fixes

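A minimal usage sketch of the knob described in the CHANGES entry above (not part of this commit's diff; the class name and the 100000 limit are illustrative, chosen to mirror the TestIndexWriter change at the bottom of this commit):

    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.store.RAMDirectory;

    public class LongTermIndexingSketch {
      public static void main(String[] args) throws Exception {
        // Build a single term longer than the new 255-char default limit.
        char[] chars = new char[300];
        for (int i = 0; i < chars.length; i++)
          chars[i] = 'a';
        String bigTerm = new String(chars);

        // By default StandardAnalyzer now silently skips tokens longer than
        // DEFAULT_MAX_TOKEN_LENGTH (255 chars); raise the limit before handing
        // the analyzer to IndexWriter if such terms matter.
        StandardAnalyzer sa = new StandardAnalyzer();
        sa.setMaxTokenLength(100000);

        RAMDirectory dir = new RAMDirectory();
        IndexWriter writer = new IndexWriter(dir, sa);
        Document doc = new Document();
        doc.add(new Field("content", bigTerm, Field.Store.NO, Field.Index.TOKENIZED));
        writer.addDocument(doc);
        writer.close();
      }
    }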
StandardAnalyzer.java
@@ -144,8 +144,9 @@ public class StandardAnalyzer extends Analyzer {
   /** Constructs a {@link StandardTokenizer} filtered by a {@link
   StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}. */
   public TokenStream tokenStream(String fieldName, Reader reader) {
-    TokenStream result = new StandardTokenizer(reader, replaceInvalidAcronym);
-    result = new StandardFilter(result);
+    StandardTokenizer tokenStream = new StandardTokenizer(reader, replaceInvalidAcronym);
+    tokenStream.setMaxTokenLength(maxTokenLength);
+    TokenStream result = new StandardFilter(tokenStream);
     result = new LowerCaseFilter(result);
     result = new StopFilter(result, stopSet);
     return result;
@@ -155,6 +156,28 @@ public class StandardAnalyzer extends Analyzer {
     StandardTokenizer tokenStream;
     TokenStream filteredTokenStream;
   }

+  /** Default maximum allowed token length */
+  public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;
+
+  private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
+
+  /**
+   * Set maximum allowed token length.  If a token is seen
+   * that exceeds this length then it is discarded.  This
+   * setting only takes effect the next time tokenStream or
+   * reusableTokenStream is called.
+   */
+  public void setMaxTokenLength(int length) {
+    maxTokenLength = length;
+  }
+
+  /**
+   * @see #setMaxTokenLength
+   */
+  public int getMaxTokenLength() {
+    return maxTokenLength;
+  }
+
   public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
     SavedStreams streams = (SavedStreams) getPreviousTokenStream();
@@ -168,6 +191,7 @@ public class StandardAnalyzer extends Analyzer {
     } else {
       streams.tokenStream.reset(reader);
     }
+    streams.tokenStream.setMaxTokenLength(maxTokenLength);

     streams.tokenStream.setReplaceInvalidAcronym(replaceInvalidAcronym);

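The hunk above copies maxTokenLength onto the cached tokenizer each time reusableTokenStream is called, which is what makes the new javadoc's "takes effect the next time tokenStream or reusableTokenStream is called" promise hold. A hedged sketch of that behavior (not part of the commit; class and helper names are made up for illustration):

    import java.io.StringReader;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;

    public class ReusableStreamLimitSketch {
      public static void main(String[] args) throws Exception {
        StandardAnalyzer sa = new StandardAnalyzer();

        // First pass: default limit of 255 chars, so "toolong" is emitted.
        print(sa.reusableTokenStream("f", new StringReader("ab toolong cd")));

        // Lower the limit; it is pushed onto the reused tokenizer on the next
        // reusableTokenStream call, so the second pass drops "toolong".
        sa.setMaxTokenLength(5);
        print(sa.reusableTokenStream("f", new StringReader("ab toolong cd")));
      }

      private static void print(TokenStream ts) throws Exception {
        for (Token t = ts.next(); t != null; t = ts.next())
          System.out.println(t.termText());
      }
    }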
StandardTokenizer.java
@@ -58,6 +58,19 @@ public class StandardTokenizer extends Tokenizer {
     this.input = reader;
   }

+  private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
+
+  /** Set the max allowed token length.  Any token longer
+   *  than this is skipped. */
+  public void setMaxTokenLength(int length) {
+    this.maxTokenLength = length;
+  }
+
+  /** @see #setMaxTokenLength */
+  public int getMaxTokenLength() {
+    return maxTokenLength;
+  }
+
   /**
    * Creates a new instance of the {@link StandardTokenizer}. Attaches the
    * <code>input</code> to a newly created JFlex scanner.
@@ -80,37 +93,49 @@ public class StandardTokenizer extends Tokenizer {
     this.replaceInvalidAcronym = replaceInvalidAcronym;
     this.input = input;
     this.scanner = new StandardTokenizerImpl(input);
-  }/*
-   * (non-Javadoc)
-   *
-   * @see org.apache.lucene.analysis.TokenStream#next()
-   */
-  public Token next(Token result) throws IOException {
-    int tokenType = scanner.getNextToken();
-
-    if (tokenType == StandardTokenizerImpl.YYEOF) {
-      return null;
-    }
-
-    result.clear();
-    scanner.getText(result);
-    final int start = scanner.yychar();
-    result.setStartOffset(start);
-    result.setEndOffset(start+result.termLength());
-    // This 'if' should be removed in the next release. For now, it converts
-    // invalid acronyms to HOST. When removed, only the 'else' part should
-    // remain.
-    if (tokenType == StandardTokenizerImpl.ACRONYM_DEP) {
-      if (replaceInvalidAcronym) {
-        result.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]);
-        result.setTermLength(result.termLength() - 1); // remove extra '.'
-      } else {
-        result.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]);
-      }
-    } else {
-      result.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
-    }
-    return result;
+  }
+
+  /*
+   * (non-Javadoc)
+   *
+   * @see org.apache.lucene.analysis.TokenStream#next()
+   */
+  public Token next(Token result) throws IOException {
+    int posIncr = 1;
+
+    while(true) {
+      int tokenType = scanner.getNextToken();
+
+      if (tokenType == StandardTokenizerImpl.YYEOF) {
+        return null;
+      }
+
+      if (scanner.yylength() <= maxTokenLength) {
+        result.clear();
+        result.setPositionIncrement(posIncr);
+        scanner.getText(result);
+        final int start = scanner.yychar();
+        result.setStartOffset(start);
+        result.setEndOffset(start+result.termLength());
+        // This 'if' should be removed in the next release. For now, it converts
+        // invalid acronyms to HOST. When removed, only the 'else' part should
+        // remain.
+        if (tokenType == StandardTokenizerImpl.ACRONYM_DEP) {
+          if (replaceInvalidAcronym) {
+            result.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]);
+            result.setTermLength(result.termLength() - 1); // remove extra '.'
+          } else {
+            result.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]);
+          }
+        } else {
+          result.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
+        }
+        return result;
+      } else
+        // When we skip a too-long term, we still increment the
+        // position increment
+        posIncr++;
+    }
   }

   /*
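To make the position-increment bookkeeping in next() concrete, here is a small sketch (not part of the commit; the class name is illustrative) that prints each surviving token with its increment. With the limit set to 5, "xy" should report an increment of 2 because the skipped "toolong" still advances the position, matching testMaxTermLength2 below.

    import java.io.StringReader;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;

    public class SkipLongTokenSketch {
      public static void main(String[] args) throws Exception {
        StandardAnalyzer sa = new StandardAnalyzer();
        sa.setMaxTokenLength(5);   // anything longer than 5 chars is skipped

        TokenStream ts = sa.tokenStream("f", new StringReader("ab cd toolong xy z"));
        for (Token t = ts.next(); t != null; t = ts.next()) {
          // Expected: ab(1), cd(1), xy(2) -- gap left by "toolong" -- z(1)
          System.out.println(t.termText() + " posIncr=" + t.getPositionIncrement());
        }
        ts.close();
      }
    }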
TestStandardAnalyzer.java
@@ -30,6 +30,10 @@ public class TestStandardAnalyzer extends LuceneTestCase {
   }

   public void assertAnalyzesTo(Analyzer a, String input, String[] expectedImages, String[] expectedTypes) throws Exception {
+    assertAnalyzesTo(a, input, expectedImages, expectedTypes, null);
+  }
+
+  public void assertAnalyzesTo(Analyzer a, String input, String[] expectedImages, String[] expectedTypes, int[] expectedPosIncrs) throws Exception {
     TokenStream ts = a.tokenStream("dummy", new StringReader(input));
     for (int i = 0; i < expectedImages.length; i++) {
       Token t = ts.next();
@@ -38,12 +42,39 @@ public class TestStandardAnalyzer extends LuceneTestCase {
       if (expectedTypes != null) {
        assertEquals(expectedTypes[i], t.type());
       }
+      if (expectedPosIncrs != null) {
+        assertEquals(expectedPosIncrs[i], t.getPositionIncrement());
+      }
     }
     assertNull(ts.next());
     ts.close();
   }

+  public void testMaxTermLength() throws Exception {
+    StandardAnalyzer sa = new StandardAnalyzer();
+    sa.setMaxTokenLength(5);
+    assertAnalyzesTo(sa, "ab cd toolong xy z", new String[]{"ab", "cd", "xy", "z"});
+  }
+
+  public void testMaxTermLength2() throws Exception {
+    StandardAnalyzer sa = new StandardAnalyzer();
+    assertAnalyzesTo(sa, "ab cd toolong xy z", new String[]{"ab", "cd", "toolong", "xy", "z"});
+    sa.setMaxTokenLength(5);
+
+    assertAnalyzesTo(sa, "ab cd toolong xy z", new String[]{"ab", "cd", "xy", "z"}, null, new int[]{1, 1, 2, 1});
+  }
+
+  public void testMaxTermLength3() throws Exception {
+    char[] chars = new char[255];
+    for(int i=0;i<255;i++)
+      chars[i] = 'a';
+    String longTerm = new String(chars, 0, 255);
+
+    assertAnalyzesTo(a, "ab cd " + longTerm + " xy z", new String[]{"ab", "cd", longTerm, "xy", "z"});
+    assertAnalyzesTo(a, "ab cd " + longTerm + "a xy z", new String[]{"ab", "cd", "xy", "z"});
+  }
+
   public void testAlphanumeric() throws Exception {
     // alphanumeric tokens
     assertAnalyzesTo(a, "B2B", new String[]{"b2b"});
TestIndexWriter.java
@@ -586,7 +586,9 @@ public class TestIndexWriter extends LuceneTestCase
     // maximum length term, and search on that term:
     doc = new Document();
     doc.add(new Field("content", bigTerm, Field.Store.NO, Field.Index.TOKENIZED));
-    writer = new IndexWriter(dir, new StandardAnalyzer());
+    StandardAnalyzer sa = new StandardAnalyzer();
+    sa.setMaxTokenLength(100000);
+    writer = new IndexWriter(dir, sa);
     writer.addDocument(doc);
     writer.close();
     reader = IndexReader.open(dir);