mirror of https://github.com/apache/lucene.git
LUCENE-1118: skip terms > 255 (by default) characters in length in StandardAnalyzer
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@609330 13f79535-47bb-0310-9956-ffa450edef68
parent f0d5002066
commit 26bc874e62
CHANGES.txt
@@ -90,6 +90,10 @@ API Changes
     modified so it is token producer's responsibility
     to call Token.clear(). (Doron Cohen)

+14. LUCENE-1118: Changed StandardAnalyzer to skip too-long (default >
+    255 characters) tokens. You can increase this limit by calling
+    StandardAnalyzer.setMaxTokenLength(...). (Michael McCandless)
+
 Bug fixes

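A minimal usage sketch of the knob described in the CHANGES entry above (not part of this commit's diff; the class name and the 100000 limit are illustrative, chosen to mirror the TestIndexWriter change at the bottom of this commit):

    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.store.RAMDirectory;

    public class LongTermIndexingSketch {
      public static void main(String[] args) throws Exception {
        // Build a single term longer than the new 255-char default limit.
        char[] chars = new char[300];
        for (int i = 0; i < chars.length; i++)
          chars[i] = 'a';
        String bigTerm = new String(chars);

        // By default StandardAnalyzer now silently skips tokens longer than
        // DEFAULT_MAX_TOKEN_LENGTH (255 chars); raise the limit before handing
        // the analyzer to IndexWriter if such terms matter.
        StandardAnalyzer sa = new StandardAnalyzer();
        sa.setMaxTokenLength(100000);

        RAMDirectory dir = new RAMDirectory();
        IndexWriter writer = new IndexWriter(dir, sa);
        Document doc = new Document();
        doc.add(new Field("content", bigTerm, Field.Store.NO, Field.Index.TOKENIZED));
        writer.addDocument(doc);
        writer.close();
      }
    }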
StandardAnalyzer.java
@@ -144,8 +144,9 @@ public class StandardAnalyzer extends Analyzer {
   /** Constructs a {@link StandardTokenizer} filtered by a {@link
   StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}. */
   public TokenStream tokenStream(String fieldName, Reader reader) {
-    TokenStream result = new StandardTokenizer(reader, replaceInvalidAcronym);
-    result = new StandardFilter(result);
+    StandardTokenizer tokenStream = new StandardTokenizer(reader, replaceInvalidAcronym);
+    tokenStream.setMaxTokenLength(maxTokenLength);
+    TokenStream result = new StandardFilter(tokenStream);
     result = new LowerCaseFilter(result);
     result = new StopFilter(result, stopSet);
     return result;
@@ -155,6 +156,28 @@ public class StandardAnalyzer extends Analyzer {
     StandardTokenizer tokenStream;
     TokenStream filteredTokenStream;
   }

+  /** Default maximum allowed token length */
+  public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;
+
+  private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
+
+  /**
+   * Set maximum allowed token length.  If a token is seen
+   * that exceeds this length then it is discarded.  This
+   * setting only takes effect the next time tokenStream or
+   * reusableTokenStream is called.
+   */
+  public void setMaxTokenLength(int length) {
+    maxTokenLength = length;
+  }
+
+  /**
+   * @see #setMaxTokenLength
+   */
+  public int getMaxTokenLength() {
+    return maxTokenLength;
+  }
+
   public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
     SavedStreams streams = (SavedStreams) getPreviousTokenStream();
@@ -168,6 +191,7 @@ public class StandardAnalyzer extends Analyzer {
     } else {
       streams.tokenStream.reset(reader);
     }
+    streams.tokenStream.setMaxTokenLength(maxTokenLength);

     streams.tokenStream.setReplaceInvalidAcronym(replaceInvalidAcronym);

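The hunk above copies maxTokenLength onto the cached tokenizer each time reusableTokenStream is called, which is what makes the new javadoc's "takes effect the next time tokenStream or reusableTokenStream is called" promise hold. A hedged sketch of that behavior (not part of the commit; class and helper names are made up for illustration):

    import java.io.StringReader;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;

    public class ReusableStreamLimitSketch {
      public static void main(String[] args) throws Exception {
        StandardAnalyzer sa = new StandardAnalyzer();

        // First pass: default limit of 255 chars, so "toolong" is emitted.
        print(sa.reusableTokenStream("f", new StringReader("ab toolong cd")));

        // Lower the limit; it is pushed onto the reused tokenizer on the next
        // reusableTokenStream call, so the second pass drops "toolong".
        sa.setMaxTokenLength(5);
        print(sa.reusableTokenStream("f", new StringReader("ab toolong cd")));
      }

      private static void print(TokenStream ts) throws Exception {
        for (Token t = ts.next(); t != null; t = ts.next())
          System.out.println(t.termText());
      }
    }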
StandardTokenizer.java
@@ -58,6 +58,19 @@ public class StandardTokenizer extends Tokenizer {
     this.input = reader;
   }

+  private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
+
+  /** Set the max allowed token length.  Any token longer
+   *  than this is skipped. */
+  public void setMaxTokenLength(int length) {
+    this.maxTokenLength = length;
+  }
+
+  /** @see #setMaxTokenLength */
+  public int getMaxTokenLength() {
+    return maxTokenLength;
+  }
+
   /**
    * Creates a new instance of the {@link StandardTokenizer}. Attaches the
    * <code>input</code> to a newly created JFlex scanner.
@@ -80,37 +93,49 @@ public class StandardTokenizer extends Tokenizer {
     this.replaceInvalidAcronym = replaceInvalidAcronym;
     this.input = input;
     this.scanner = new StandardTokenizerImpl(input);
-  }/*
-   * (non-Javadoc)
-   *
-   * @see org.apache.lucene.analysis.TokenStream#next()
-   */
-  public Token next(Token result) throws IOException {
-    int tokenType = scanner.getNextToken();
-
-    if (tokenType == StandardTokenizerImpl.YYEOF) {
-      return null;
-    }
-
-    result.clear();
-    scanner.getText(result);
-    final int start = scanner.yychar();
-    result.setStartOffset(start);
-    result.setEndOffset(start+result.termLength());
-    // This 'if' should be removed in the next release. For now, it converts
-    // invalid acronyms to HOST. When removed, only the 'else' part should
-    // remain.
-    if (tokenType == StandardTokenizerImpl.ACRONYM_DEP) {
-      if (replaceInvalidAcronym) {
-        result.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]);
-        result.setTermLength(result.termLength() - 1); // remove extra '.'
-      } else {
-        result.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]);
-      }
-    } else {
-      result.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
-    }
-    return result;
+  }
+
+  /*
+   * (non-Javadoc)
+   *
+   * @see org.apache.lucene.analysis.TokenStream#next()
+   */
+  public Token next(Token result) throws IOException {
+    int posIncr = 1;
+
+    while(true) {
+      int tokenType = scanner.getNextToken();
+
+      if (tokenType == StandardTokenizerImpl.YYEOF) {
+        return null;
+      }
+
+      if (scanner.yylength() <= maxTokenLength) {
+        result.clear();
+        result.setPositionIncrement(posIncr);
+        scanner.getText(result);
+        final int start = scanner.yychar();
+        result.setStartOffset(start);
+        result.setEndOffset(start+result.termLength());
+        // This 'if' should be removed in the next release. For now, it converts
+        // invalid acronyms to HOST. When removed, only the 'else' part should
+        // remain.
+        if (tokenType == StandardTokenizerImpl.ACRONYM_DEP) {
+          if (replaceInvalidAcronym) {
+            result.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]);
+            result.setTermLength(result.termLength() - 1); // remove extra '.'
+          } else {
+            result.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]);
+          }
+        } else {
+          result.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
+        }
+        return result;
+      } else
+        // When we skip a too-long term, we still increment the
+        // position increment
+        posIncr++;
+    }
   }

   /*
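To make the position-increment bookkeeping in next() concrete, here is a small sketch (not part of the commit; the class name is illustrative) that prints each surviving token with its increment. With the limit set to 5, "xy" should report an increment of 2 because the skipped "toolong" still advances the position, matching testMaxTermLength2 below.

    import java.io.StringReader;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;

    public class SkipLongTokenSketch {
      public static void main(String[] args) throws Exception {
        StandardAnalyzer sa = new StandardAnalyzer();
        sa.setMaxTokenLength(5);   // anything longer than 5 chars is skipped

        TokenStream ts = sa.tokenStream("f", new StringReader("ab cd toolong xy z"));
        for (Token t = ts.next(); t != null; t = ts.next()) {
          // Expected: ab(1), cd(1), xy(2) -- gap left by "toolong" -- z(1)
          System.out.println(t.termText() + " posIncr=" + t.getPositionIncrement());
        }
        ts.close();
      }
    }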
TestStandardAnalyzer.java
@@ -30,6 +30,10 @@ public class TestStandardAnalyzer extends LuceneTestCase {
   }

   public void assertAnalyzesTo(Analyzer a, String input, String[] expectedImages, String[] expectedTypes) throws Exception {
+    assertAnalyzesTo(a, input, expectedImages, expectedTypes, null);
+  }
+
+  public void assertAnalyzesTo(Analyzer a, String input, String[] expectedImages, String[] expectedTypes, int[] expectedPosIncrs) throws Exception {
     TokenStream ts = a.tokenStream("dummy", new StringReader(input));
     for (int i = 0; i < expectedImages.length; i++) {
       Token t = ts.next();
@@ -38,12 +42,39 @@ public class TestStandardAnalyzer extends LuceneTestCase {
       if (expectedTypes != null) {
        assertEquals(expectedTypes[i], t.type());
       }
+      if (expectedPosIncrs != null) {
+        assertEquals(expectedPosIncrs[i], t.getPositionIncrement());
+      }
     }
     assertNull(ts.next());
     ts.close();
   }

+  public void testMaxTermLength() throws Exception {
+    StandardAnalyzer sa = new StandardAnalyzer();
+    sa.setMaxTokenLength(5);
+    assertAnalyzesTo(sa, "ab cd toolong xy z", new String[]{"ab", "cd", "xy", "z"});
+  }
+
+  public void testMaxTermLength2() throws Exception {
+    StandardAnalyzer sa = new StandardAnalyzer();
+    assertAnalyzesTo(sa, "ab cd toolong xy z", new String[]{"ab", "cd", "toolong", "xy", "z"});
+    sa.setMaxTokenLength(5);
+
+    assertAnalyzesTo(sa, "ab cd toolong xy z", new String[]{"ab", "cd", "xy", "z"}, null, new int[]{1, 1, 2, 1});
+  }
+
+  public void testMaxTermLength3() throws Exception {
+    char[] chars = new char[255];
+    for(int i=0;i<255;i++)
+      chars[i] = 'a';
+    String longTerm = new String(chars, 0, 255);
+
+    assertAnalyzesTo(a, "ab cd " + longTerm + " xy z", new String[]{"ab", "cd", longTerm, "xy", "z"});
+    assertAnalyzesTo(a, "ab cd " + longTerm + "a xy z", new String[]{"ab", "cd", "xy", "z"});
+  }
+
   public void testAlphanumeric() throws Exception {
     // alphanumeric tokens
     assertAnalyzesTo(a, "B2B", new String[]{"b2b"});
TestIndexWriter.java
@@ -586,7 +586,9 @@ public class TestIndexWriter extends LuceneTestCase
     // maximum length term, and search on that term:
     doc = new Document();
     doc.add(new Field("content", bigTerm, Field.Store.NO, Field.Index.TOKENIZED));
-    writer = new IndexWriter(dir, new StandardAnalyzer());
+    StandardAnalyzer sa = new StandardAnalyzer();
+    sa.setMaxTokenLength(100000);
+    writer = new IndexWriter(dir, sa);
     writer.addDocument(doc);
     writer.close();
     reader = IndexReader.open(dir);