LUCENE-1118: skip terms > 255 (by default) characters in length in StandardAnalyzer

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@609330 13f79535-47bb-0310-9956-ffa450edef68
2008-01-06 15:37:44 +00:00 · 2008-01-06 15:37:44 +00:00 · 26bc874e62
parent f0d5002066
commit 26bc874e62
5 changed files with 112 additions and 26 deletions
--- a/CHANGES.txt
+++ b/CHANGES.txt
@ -90,6 +90,10 @@ API Changes
    modified so it is token producer's responsibility
    to call Token.clear(). (Doron Cohen)   

+14. LUCENE-1118: Changed StandardAnalyzer to skip too-long (default >
+    255 characters) tokens.  You can increase this limit by calling
+    StandardAnalyzer.setMaxTokenLength(...).  (Michael McCandless)
+

 Bug fixes

--- a/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
+++ b/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
@ -144,8 +144,9 @@ public class StandardAnalyzer extends Analyzer {
  /** Constructs a {@link StandardTokenizer} filtered by a {@link
  StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}. */
  public TokenStream tokenStream(String fieldName, Reader reader) {
-    TokenStream result = new StandardTokenizer(reader, replaceInvalidAcronym);
-    result = new StandardFilter(result);
+    StandardTokenizer tokenStream = new StandardTokenizer(reader, replaceInvalidAcronym);
+    tokenStream.setMaxTokenLength(maxTokenLength);
+    TokenStream result = new StandardFilter(tokenStream);
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, stopSet);
    return result;
@ -155,6 +156,28 @@ public class StandardAnalyzer extends Analyzer {
    StandardTokenizer tokenStream;
    TokenStream filteredTokenStream;
  }
+
+  /** Default maximum allowed token length */
+  public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;
+
+  private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
+
+  /**
+   * Set maximum allowed token length.  If a token is seen
+   * that exceeds this length then it is discarded.  This
+   * setting only takes effect the next time tokenStream or
+   * reusableTokenStream is called.
+   */
+  public void setMaxTokenLength(int length) {
+    maxTokenLength = length;
+  }
+    
+  /**
+   * @see #setMaxTokenLength
+   */
+  public int getMaxTokenLength() {
+    return maxTokenLength;
+  }
  
  public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
@ -168,6 +191,7 @@ public class StandardAnalyzer extends Analyzer {
    } else {
      streams.tokenStream.reset(reader);
    }
+    streams.tokenStream.setMaxTokenLength(maxTokenLength);
    
    streams.tokenStream.setReplaceInvalidAcronym(replaceInvalidAcronym);

--- a/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
+++ b/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
@ -58,6 +58,19 @@ public class StandardTokenizer extends Tokenizer {
    this.input = reader;
  }

+  private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
+
+  /** Set the max allowed token length.  Any token longer
+   *  than this is skipped. */
+  public void setMaxTokenLength(int length) {
+    this.maxTokenLength = length;
+  }
+
+  /** @see #setMaxTokenLength */
+  public int getMaxTokenLength() {
+    return maxTokenLength;
+  }
+
    /**
     * Creates a new instance of the {@link StandardTokenizer}. Attaches the
     * <code>input</code> to a newly created JFlex scanner.
@ -80,37 +93,49 @@ public class StandardTokenizer extends Tokenizer {
    this.replaceInvalidAcronym = replaceInvalidAcronym;
    this.input = input;
    this.scanner = new StandardTokenizerImpl(input);
-  }/*
-     * (non-Javadoc)
-     *
-     * @see org.apache.lucene.analysis.TokenStream#next()
-     */
-    public Token next(Token result) throws IOException {
+  }
+
+  /*
+   * (non-Javadoc)
+   *
+   * @see org.apache.lucene.analysis.TokenStream#next()
+   */
+  public Token next(Token result) throws IOException {
+      int posIncr = 1;
+
+      while(true) {
 	int tokenType = scanner.getNextToken();

 	if (tokenType == StandardTokenizerImpl.YYEOF) {
 	    return null;
 	}

-        result.clear();
-        scanner.getText(result);
-        final int start = scanner.yychar();
-        result.setStartOffset(start);
-        result.setEndOffset(start+result.termLength());
-        // This 'if' should be removed in the next release. For now, it converts
-        // invalid acronyms to HOST. When removed, only the 'else' part should
-        // remain.
-        if (tokenType == StandardTokenizerImpl.ACRONYM_DEP) {
-          if (replaceInvalidAcronym) {
-            result.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]);
-            result.setTermLength(result.termLength() - 1); // remove extra '.'
+        if (scanner.yylength() <= maxTokenLength) {
+          result.clear();
+          result.setPositionIncrement(posIncr);
+          scanner.getText(result);
+          final int start = scanner.yychar();
+          result.setStartOffset(start);
+          result.setEndOffset(start+result.termLength());
+          // This 'if' should be removed in the next release. For now, it converts
+          // invalid acronyms to HOST. When removed, only the 'else' part should
+          // remain.
+          if (tokenType == StandardTokenizerImpl.ACRONYM_DEP) {
+            if (replaceInvalidAcronym) {
+              result.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]);
+              result.setTermLength(result.termLength() - 1); // remove extra '.'
+            } else {
+              result.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]);
+            }
          } else {
-            result.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]);
+            result.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
          }
-        } else {
-          result.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
-        }
-        return result;
+          return result;
+        } else
+          // When we skip a too-long term, we still increment the
+          // position increment
+          posIncr++;
+      }
    }

    /*
--- a/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java
+++ b/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java
@ -30,6 +30,10 @@ public class TestStandardAnalyzer extends LuceneTestCase {
  }

  public void assertAnalyzesTo(Analyzer a, String input, String[] expectedImages, String[] expectedTypes) throws Exception {
+    assertAnalyzesTo(a, input, expectedImages, expectedTypes, null);
+  }
+
+  public void assertAnalyzesTo(Analyzer a, String input, String[] expectedImages, String[] expectedTypes, int[] expectedPosIncrs) throws Exception {
    TokenStream ts = a.tokenStream("dummy", new StringReader(input));
    for (int i = 0; i < expectedImages.length; i++) {
      Token t = ts.next();
@ -38,12 +42,39 @@ public class TestStandardAnalyzer extends LuceneTestCase {
      if (expectedTypes != null) {
        assertEquals(expectedTypes[i], t.type());
      }
+      if (expectedPosIncrs != null) {
+        assertEquals(expectedPosIncrs[i], t.getPositionIncrement());
+      }
    }
    assertNull(ts.next());
    ts.close();
  }


+  public void testMaxTermLength() throws Exception {
+    StandardAnalyzer sa = new StandardAnalyzer();
+    sa.setMaxTokenLength(5);
+    assertAnalyzesTo(sa, "ab cd toolong xy z", new String[]{"ab", "cd", "xy", "z"});
+  }
+
+  public void testMaxTermLength2() throws Exception {
+    StandardAnalyzer sa = new StandardAnalyzer();
+    assertAnalyzesTo(sa, "ab cd toolong xy z", new String[]{"ab", "cd", "toolong", "xy", "z"});
+    sa.setMaxTokenLength(5);
+    
+    assertAnalyzesTo(sa, "ab cd toolong xy z", new String[]{"ab", "cd", "xy", "z"}, null, new int[]{1, 1, 2, 1});
+  }
+
+  public void testMaxTermLength3() throws Exception {
+    char[] chars = new char[255];
+    for(int i=0;i<255;i++)
+      chars[i] = 'a';
+    String longTerm = new String(chars, 0, 255);
+    
+    assertAnalyzesTo(a, "ab cd " + longTerm + " xy z", new String[]{"ab", "cd", longTerm, "xy", "z"});
+    assertAnalyzesTo(a, "ab cd " + longTerm + "a xy z", new String[]{"ab", "cd", "xy", "z"});
+  }
+
  public void testAlphanumeric() throws Exception {
    // alphanumeric tokens
    assertAnalyzesTo(a, "B2B", new String[]{"b2b"});
--- a/src/test/org/apache/lucene/index/TestIndexWriter.java
+++ b/src/test/org/apache/lucene/index/TestIndexWriter.java
@ -586,7 +586,9 @@ public class TestIndexWriter extends LuceneTestCase
      // maximum length term, and search on that term:
      doc = new Document();
      doc.add(new Field("content", bigTerm, Field.Store.NO, Field.Index.TOKENIZED));
-      writer  = new IndexWriter(dir, new StandardAnalyzer());
+      StandardAnalyzer sa = new StandardAnalyzer();
+      sa.setMaxTokenLength(100000);
+      writer  = new IndexWriter(dir, sa);
      writer.addDocument(doc);
      writer.close();
      reader = IndexReader.open(dir);