From 26bc874e62aaf2b09f7e428b3bf4dae1068e380b Mon Sep 17 00:00:00 2001
From: Michael McCandless
Date: Sun, 6 Jan 2008 15:37:44 +0000
Subject: [PATCH] LUCENE-1118: skip terms > 255 (by default) characters in
 length in StandardAnalyzer

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@609330 13f79535-47bb-0310-9956-ffa450edef68
---
 CHANGES.txt                                    |  4 ++
 .../analysis/standard/StandardAnalyzer.java    | 28 +++++++-
 .../analysis/standard/StandardTokenizer.java   | 71 +++++++++++++------
 .../lucene/analysis/TestStandardAnalyzer.java  | 31 ++++++++
 .../apache/lucene/index/TestIndexWriter.java   |  4 +-
 5 files changed, 112 insertions(+), 26 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 4c2a4c149c9..befd38997ec 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -90,6 +90,10 @@ API Changes
     modified so it is token producer's responsibility to call
     Token.clear(). (Doron Cohen)
 
+14. LUCENE-1118: Changed StandardAnalyzer to skip too-long (default >
+    255 characters) tokens. You can increase this limit by calling
+    StandardAnalyzer.setMaxTokenLength(...). (Michael McCandless)
+
 Bug fixes
 
diff --git a/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java b/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
index c46e42c8277..126b9ab4010 100644
--- a/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
+++ b/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
@@ -144,8 +144,9 @@ public class StandardAnalyzer extends Analyzer {
   /** Constructs a {@link StandardTokenizer} filtered by a {@link
   StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}. */
   public TokenStream tokenStream(String fieldName, Reader reader) {
-    TokenStream result = new StandardTokenizer(reader, replaceInvalidAcronym);
-    result = new StandardFilter(result);
+    StandardTokenizer tokenStream = new StandardTokenizer(reader, replaceInvalidAcronym);
+    tokenStream.setMaxTokenLength(maxTokenLength);
+    TokenStream result = new StandardFilter(tokenStream);
     result = new LowerCaseFilter(result);
     result = new StopFilter(result, stopSet);
     return result;
@@ -155,6 +156,28 @@ public class StandardAnalyzer extends Analyzer {
     StandardTokenizer tokenStream;
     TokenStream filteredTokenStream;
   }
+
+  /** Default maximum allowed token length */
+  public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;
+
+  private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
+
+  /**
+   * Set maximum allowed token length. If a token is seen
+   * that exceeds this length then it is discarded. This
+   * setting only takes effect the next time tokenStream or
+   * reusableTokenStream is called.
+   */
+  public void setMaxTokenLength(int length) {
+    maxTokenLength = length;
+  }
+
+  /**
+   * @see #setMaxTokenLength
+   */
+  public int getMaxTokenLength() {
+    return maxTokenLength;
+  }
 
   public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
     SavedStreams streams = (SavedStreams) getPreviousTokenStream();
@@ -168,6 +191,7 @@ public class StandardAnalyzer extends Analyzer {
     } else {
       streams.tokenStream.reset(reader);
     }
+    streams.tokenStream.setMaxTokenLength(maxTokenLength);
 
     streams.tokenStream.setReplaceInvalidAcronym(replaceInvalidAcronym);
diff --git a/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java b/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
index 8efde277ae8..845b5cfef0b 100644
--- a/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
+++ b/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
@@ -58,6 +58,19 @@ public class StandardTokenizer extends Tokenizer {
     this.input = reader;
   }
 
+  private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
+
+  /** Set the max allowed token length. Any token longer
+   *  than this is skipped. */
+  public void setMaxTokenLength(int length) {
+    this.maxTokenLength = length;
+  }
+
+  /** @see #setMaxTokenLength */
+  public int getMaxTokenLength() {
+    return maxTokenLength;
+  }
+
   /**
    * Creates a new instance of the {@link StandardTokenizer}. Attaches the
    * input to a newly created JFlex scanner.
@@ -80,37 +93,49 @@
     this.replaceInvalidAcronym = replaceInvalidAcronym;
     this.input = input;
     this.scanner = new StandardTokenizerImpl(input);
-  }/*
- * (non-Javadoc)
- *
- * @see org.apache.lucene.analysis.TokenStream#next()
- */
-  public Token next(Token result) throws IOException {
+  }
+
+  /*
+   * (non-Javadoc)
+   *
+   * @see org.apache.lucene.analysis.TokenStream#next()
+   */
+  public Token next(Token result) throws IOException {
+    int posIncr = 1;
+
+    while(true) {
       int tokenType = scanner.getNextToken();
 
       if (tokenType == StandardTokenizerImpl.YYEOF) {
         return null;
       }
 
-      result.clear();
-      scanner.getText(result);
-      final int start = scanner.yychar();
-      result.setStartOffset(start);
-      result.setEndOffset(start+result.termLength());
-      // This 'if' should be removed in the next release. For now, it converts
-      // invalid acronyms to HOST. When removed, only the 'else' part should
-      // remain.
-      if (tokenType == StandardTokenizerImpl.ACRONYM_DEP) {
-        if (replaceInvalidAcronym) {
-          result.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]);
-          result.setTermLength(result.termLength() - 1); // remove extra '.'
+      if (scanner.yylength() <= maxTokenLength) {
+        result.clear();
+        result.setPositionIncrement(posIncr);
+        scanner.getText(result);
+        final int start = scanner.yychar();
+        result.setStartOffset(start);
+        result.setEndOffset(start+result.termLength());
+        // This 'if' should be removed in the next release. For now, it converts
+        // invalid acronyms to HOST. When removed, only the 'else' part should
+        // remain.
+        if (tokenType == StandardTokenizerImpl.ACRONYM_DEP) {
+          if (replaceInvalidAcronym) {
+            result.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]);
+            result.setTermLength(result.termLength() - 1); // remove extra '.'
+          } else {
+            result.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]);
+          }
         } else {
-          result.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]);
+          result.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
-        }
-      return result;
+        return result;
+      } else
+        // When we skip a too-long term, we still increment the
+        // position increment
+        posIncr++;
+    }
   }
 
   /*
diff --git a/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java b/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java
index 14bb48c64ce..6209f3dec7e 100644
--- a/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java
+++ b/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java
@@ -30,6 +30,10 @@ public class TestStandardAnalyzer extends LuceneTestCase {
   }
 
   public void assertAnalyzesTo(Analyzer a, String input, String[] expectedImages, String[] expectedTypes) throws Exception {
+    assertAnalyzesTo(a, input, expectedImages, expectedTypes, null);
+  }
+
+  public void assertAnalyzesTo(Analyzer a, String input, String[] expectedImages, String[] expectedTypes, int[] expectedPosIncrs) throws Exception {
     TokenStream ts = a.tokenStream("dummy", new StringReader(input));
     for (int i = 0; i < expectedImages.length; i++) {
       Token t = ts.next();
@@ -38,12 +42,39 @@ public class TestStandardAnalyzer extends LuceneTestCase {
       if (expectedTypes != null) {
         assertEquals(expectedTypes[i], t.type());
       }
+      if (expectedPosIncrs != null) {
+        assertEquals(expectedPosIncrs[i], t.getPositionIncrement());
+      }
     }
     assertNull(ts.next());
     ts.close();
   }
 
+  public void testMaxTermLength() throws Exception {
+    StandardAnalyzer sa = new StandardAnalyzer();
+    sa.setMaxTokenLength(5);
+    assertAnalyzesTo(sa, "ab cd toolong xy z", new String[]{"ab", "cd", "xy", "z"});
+  }
+
+  public void testMaxTermLength2() throws Exception {
+    StandardAnalyzer sa = new StandardAnalyzer();
+    assertAnalyzesTo(sa, "ab cd toolong xy z", new String[]{"ab", "cd", "toolong", "xy", "z"});
+    sa.setMaxTokenLength(5);
+
+    assertAnalyzesTo(sa, "ab cd toolong xy z", new String[]{"ab", "cd", "xy", "z"}, null, new int[]{1, 1, 2, 1});
+  }
+
+  public void testMaxTermLength3() throws Exception {
+    char[] chars = new char[255];
+    for(int i=0;i<255;i++)
+      chars[i] = 'a';
+    String longTerm = new String(chars, 0, 255);
+
+    assertAnalyzesTo(a, "ab cd " + longTerm + " xy z", new String[]{"ab", "cd", longTerm, "xy", "z"});
+    assertAnalyzesTo(a, "ab cd " + longTerm + "a xy z", new String[]{"ab", "cd", "xy", "z"});
+  }
+
   public void testAlphanumeric() throws Exception {
     // alphanumeric tokens
     assertAnalyzesTo(a, "B2B", new String[]{"b2b"});
diff --git a/src/test/org/apache/lucene/index/TestIndexWriter.java b/src/test/org/apache/lucene/index/TestIndexWriter.java
index 9cb9d52d935..04027a1e94d 100644
--- a/src/test/org/apache/lucene/index/TestIndexWriter.java
+++ b/src/test/org/apache/lucene/index/TestIndexWriter.java
@@ -586,7 +586,9 @@ public class TestIndexWriter extends LuceneTestCase
     // maximum length term, and search on that term:
     doc = new Document();
     doc.add(new Field("content", bigTerm, Field.Store.NO, Field.Index.TOKENIZED));
-    writer = new IndexWriter(dir, new StandardAnalyzer());
+    StandardAnalyzer sa = new StandardAnalyzer();
+    sa.setMaxTokenLength(100000);
+    writer = new IndexWriter(dir, sa);
     writer.addDocument(doc);
     writer.close();
     reader = IndexReader.open(dir);
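
Usage note: the sketch below is not part of the patch. It mirrors testMaxTermLength2 above and shows how the new setMaxTokenLength(...) knob is driven through the 2.3-era TokenStream API; the class name MaxTokenLengthDemo, the field name "f", and the sample text are illustrative only.

    import java.io.StringReader;

    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;

    public class MaxTokenLengthDemo {
      public static void main(String[] args) throws Exception {
        StandardAnalyzer sa = new StandardAnalyzer();
        // Lower the DEFAULT_MAX_TOKEN_LENGTH (255) to 5 chars for the demo;
        // raise it instead (e.g. 100000, as TestIndexWriter does above) to
        // keep very long terms.
        sa.setMaxTokenLength(5);

        // "toolong" exceeds the limit and is silently dropped; the next kept
        // token ("xy") carries a position increment of 2, so phrase and
        // proximity queries still see the hole left by the skipped term.
        TokenStream ts = sa.tokenStream("f", new StringReader("ab cd toolong xy z"));
        Token t;
        while ((t = ts.next()) != null) {
          System.out.println(t.termText() + " +" + t.getPositionIncrement());
        }
        ts.close();
      }
    }

Per the assertions in testMaxTermLength2, this prints: ab +1, cd +1, xy +2, z +1.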