diff --git a/CHANGES.txt b/CHANGES.txt
index ff6a1fa649b..86718b34e3b 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -150,6 +150,9 @@ Optimizations
 
 Documentation
 
+* LUCENE-2008: Javadoc improvements for TokenStream/Tokenizer/Token
+  (Luke Nezda via Mike McCandless)
+
 Build
 
 * LUCENE-486: Remove test->demo dependencies. (Michael Busch)
diff --git a/src/java/org/apache/lucene/analysis/CharArraySet.java b/src/java/org/apache/lucene/analysis/CharArraySet.java
index 5d350caa11c..99cb0317f74 100644
--- a/src/java/org/apache/lucene/analysis/CharArraySet.java
+++ b/src/java/org/apache/lucene/analysis/CharArraySet.java
@@ -32,7 +32,7 @@ import java.util.Iterator;
  * is in the set without the necessity of converting it
  * to a String first.
  * <p>
- * Please note: This class implements {@link Set} but
+ * Please note: This class implements {@link java.util.Set Set} but
  * does not behave like it should in all cases. The generic type is
  * {@code Set<Object>}, because you can add any object to it,
  * that has a string representation. The add methods will use
diff --git a/src/java/org/apache/lucene/analysis/StopAnalyzer.java b/src/java/org/apache/lucene/analysis/StopAnalyzer.java
index 39c2db37919..f6423be4aee 100644
--- a/src/java/org/apache/lucene/analysis/StopAnalyzer.java
+++ b/src/java/org/apache/lucene/analysis/StopAnalyzer.java
@@ -68,9 +68,7 @@ public final class StopAnalyzer extends Analyzer {
 
   /** Builds an analyzer with the stop words from the given set.
    * @param matchVersion See above
-   * @param stopWords Set of stop words
-   * @param enablePositionIncrements See {@link
-   *  StopFilter#setEnablePositionIncrements} */
+   * @param stopWords Set of stop words */
   public StopAnalyzer(Version matchVersion, Set stopWords) {
     this.stopWords = stopWords;
     enablePositionIncrements = StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion);
diff --git a/src/java/org/apache/lucene/analysis/StopFilter.java b/src/java/org/apache/lucene/analysis/StopFilter.java
index a2959352557..c30bb70f7d7 100644
--- a/src/java/org/apache/lucene/analysis/StopFilter.java
+++ b/src/java/org/apache/lucene/analysis/StopFilter.java
@@ -54,7 +54,6 @@ public final class StopFilter extends TokenFilter {
    * @param input Input TokenStream
    * @param stopWords A Set of Strings or char[] or any other toString()-able set representing the stopwords
    * @param ignoreCase if true, all words are lower cased first
-   * @param ignoreCase -Ignore case when stopping.
    */
   public StopFilter(boolean enablePositionIncrements, TokenStream input, Set stopWords, boolean ignoreCase)
   {
diff --git a/src/java/org/apache/lucene/analysis/TeeSinkTokenFilter.java b/src/java/org/apache/lucene/analysis/TeeSinkTokenFilter.java
index 736ad180d6b..6abaed66dd4 100644
--- a/src/java/org/apache/lucene/analysis/TeeSinkTokenFilter.java
+++ b/src/java/org/apache/lucene/analysis/TeeSinkTokenFilter.java
@@ -53,7 +53,7 @@ d.add(new Field("f2", final2));
 d.add(new Field("f3", final3));
 d.add(new Field("f4", final4));
 * </pre>
- * In this example, sink1 and sink2 will both get tokens from both
+ * In this example, <code>sink1</code> and <code>sink2</code> will both get tokens from both
 * reader1 and reader2 after whitespace tokenizer
 * and now we can further wrap any of these in extra analysis, and more "sources" can be inserted if desired.
 * It is important, that tees are consumed before sinks (in the above example, the field names must be
diff --git a/src/java/org/apache/lucene/analysis/Token.java b/src/java/org/apache/lucene/analysis/Token.java
index fdd0fee3520..bb2e78a5aeb 100644
--- a/src/java/org/apache/lucene/analysis/Token.java
+++ b/src/java/org/apache/lucene/analysis/Token.java
@@ -37,7 +37,7 @@ import org.apache.lucene.util.AttributeImpl;

  The start and end offsets permit applications to re-associate a token with
  its source text, e.g., to display highlighted query terms in a document
- browser, or to show matching text fragments in a KWIC (KeyWord In Context)
+ browser, or to show matching text fragments in a <abbr title="KeyWord In Context">KWIC</abbr>
  display, etc.
 
  The type is a string, assigned by a lexical analyzer
@@ -59,9 +59,9 @@ import org.apache.lucene.util.AttributeImpl;

- <p>Tokenizers and filters should try to re-use a Token
+ <p>Tokenizers and TokenFilters should try to re-use a Token
  instance when possible for best performance, by
- implementing the {@link TokenStream#next(Token)} API.
+ implementing the {@link TokenStream#incrementToken()} API.
  Failing that, to create a new Token you should first use
  one of the constructors that starts with null text. To load
  the token from a char[] use {@link #setTermBuffer(char[], int, int)}.
@@ -75,30 +75,30 @@ import org.apache.lucene.util.AttributeImpl;
  set the length of the term text. See LUCENE-969 for details.

- <p>Typical reuse patterns:
+ <p>Typical Token reuse patterns:

  <ul>
- <li> Copying text from a string (type is reset to #DEFAULT_TYPE if not specified):
+ <li> Copying text from a string (type is reset to {@link #DEFAULT_TYPE} if not specified):
  <pre>
    return reusableToken.reinit(string, startOffset, endOffset[, type]);
  </pre>
- <li> Copying some text from a string (type is reset to #DEFAULT_TYPE if not specified):
+ <li> Copying some text from a string (type is reset to {@link #DEFAULT_TYPE} if not specified):
  <pre>
    return reusableToken.reinit(string, 0, string.length(), startOffset, endOffset[, type]);
  </pre>
- <li> Copying text from char[] buffer (type is reset to #DEFAULT_TYPE if not specified):
+ <li> Copying text from char[] buffer (type is reset to {@link #DEFAULT_TYPE} if not specified):
  <pre>
    return reusableToken.reinit(buffer, 0, buffer.length, startOffset, endOffset[, type]);
  </pre>
- <li> Copying some text from a char[] buffer (type is reset to #DEFAULT_TYPE if not specified):
+ <li> Copying some text from a char[] buffer (type is reset to {@link #DEFAULT_TYPE} if not specified):
  <pre>
    return reusableToken.reinit(buffer, start, end - start, startOffset, endOffset[, type]);
  </pre>
- <li> Copying from one one Token to another (type is reset to #DEFAULT_TYPE if not specified):
+ <li> Copying from one Token to another (type is reset to {@link #DEFAULT_TYPE} if not specified):
  <pre>
    return reusableToken.reinit(source.termBuffer(), 0, source.termLength(), source.startOffset(), source.endOffset()[, source.type()]);
  </pre>
  </ul>
@@ -108,7 +108,7 @@ import org.apache.lucene.util.AttributeImpl;
  <li>clear() initializes all of the fields to default values. This was changed in contrast to Lucene 2.4, but should affect no one.
  <li>Because TokenStreams can be chained, one cannot assume that the Token's current type is correct.
- <li>The startOffset and endOffset represent the start and offset in the source text. So be careful in adjusting them.
+ <li>The startOffset and endOffset represent the start and end offset in the source text, so be careful in adjusting them.
  <li>When caching a reusable token, clone it. When injecting a cached token into a stream that can be reset, clone it again.
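For illustration (not part of the patch), the reuse patterns above in context: a minimal sketch of a producer that refills a caller-supplied Token instead of allocating a new one. The class and its single-token logic are invented for the example; Token.reinit and DEFAULT_TYPE are the APIs documented in the hunk above.

import org.apache.lucene.analysis.Token;

// Hypothetical producer that reuses the caller's Token.
public class SingleTokenSource {
  private boolean exhausted = false;

  public Token next(Token reusableToken) {
    if (exhausted) {
      return null;                  // no more tokens
    }
    exhausted = true;
    // Copy text from a String; the type is reset to Token.DEFAULT_TYPE.
    return reusableToken.reinit("lucene", 0, 6);
  }
}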

diff --git a/src/java/org/apache/lucene/analysis/TokenFilter.java b/src/java/org/apache/lucene/analysis/TokenFilter.java
index ec4e75355a2..63dbb2dd817 100644
--- a/src/java/org/apache/lucene/analysis/TokenFilter.java
+++ b/src/java/org/apache/lucene/analysis/TokenFilter.java
@@ -19,15 +19,10 @@ package org.apache.lucene.analysis;
 
 import java.io.IOException;
 
-/** A TokenFilter is a TokenStream whose input is another token stream.
+/** A TokenFilter is a TokenStream whose input is another TokenStream.
   <p>
-  This is an abstract class.
-  NOTE: subclasses must override
-  {@link #incrementToken()} if the new TokenStream API is used
-  and {@link #next(Token)} or {@link #next()} if the old
-  TokenStream API is used.
-  <p>
-  See {@link TokenStream}
+  This is an abstract class; subclasses must override {@link #incrementToken()}.
+  @see TokenStream
  */
 public abstract class TokenFilter extends TokenStream {
   /** The source of tokens for this filter. */
   protected TokenStream input;
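A sketch of the contract just described (hypothetical filter, not part of the patch; it assumes the 2.9-era TermAttribute API and the non-generic addAttribute, hence the cast):

import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

// Hypothetical filter that drops tokens shorter than a minimum length.
public final class MinLengthFilter extends TokenFilter {
  private final TermAttribute termAtt;
  private final int minLength;

  public MinLengthFilter(TokenStream input, int minLength) {
    super(input);                   // sets the protected 'input' field
    this.minLength = minLength;
    // Attributes must be added at instantiation time.
    this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
  }

  public boolean incrementToken() throws IOException {
    // Advance the wrapped stream until a token passes the length test.
    while (input.incrementToken()) {
      if (termAtt.termLength() >= minLength) {
        return true;
      }
    }
    return false;                   // wrapped stream is exhausted
  }
}

Pulling on input.incrementToken() in a loop is the usual way a filter skips tokens without breaking the chain.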
diff --git a/src/java/org/apache/lucene/analysis/TokenStream.java b/src/java/org/apache/lucene/analysis/TokenStream.java
index 96b0ed99cdd..c3b4e7c5980 100644
--- a/src/java/org/apache/lucene/analysis/TokenStream.java
+++ b/src/java/org/apache/lucene/analysis/TokenStream.java
@@ -31,14 +31,14 @@ import org.apache.lucene.util.AttributeSource;
 * A TokenStream enumerates the sequence of tokens, either from
 * {@link Field}s of a {@link Document} or from query text.
 * <p>
- * This is an abstract class. Concrete subclasses are:
+ * This is an abstract class; concrete subclasses are:
 * <ul>
 * <li>{@link Tokenizer}, a TokenStream whose input is a Reader; and
 * <li>{@link TokenFilter}, a TokenStream whose input is another
 * TokenStream.
 * </ul>
 * A new TokenStream API has been introduced with Lucene 2.9. This API
- * has moved from being {@link Token} based to {@link Attribute} based. While
+ * has moved from being {@link Token}-based to {@link Attribute}-based. While
 * {@link Token} still exists in 2.9 as a convenience class, the preferred way
 * to store the information of a {@link Token} is to use {@link AttributeImpl}s.
 * <p>

@@ -54,14 +54,14 @@ import org.apache.lucene.util.AttributeSource;
 * <ol>
 * <li>Instantiation of TokenStream/{@link TokenFilter}s which add/get
 * attributes to/from the {@link AttributeSource}.
 * <li>The consumer calls {@link TokenStream#reset()}.
- * <li>the consumer retrieves attributes from the stream and stores local
- * references to all attributes it wants to access
- * <li>The consumer calls {@link #incrementToken()} until it returns false and
- * consumes the attributes after each call.
+ * <li>The consumer retrieves attributes from the stream and stores local
+ * references to all attributes it wants to access.
+ * <li>The consumer calls {@link #incrementToken()} until it returns false,
+ * consuming the attributes after each call.
 * <li>The consumer calls {@link #end()} so that any end-of-stream operations
 * can be performed.
 * <li>The consumer calls {@link #close()} to release any resource when finished
- * using the TokenStream
+ * using the TokenStream.
 * </ol>
 * To make sure that filters and consumers know which attributes are available,
 * the attributes must be added during instantiation. Filters and consumers are
@@ -72,7 +72,7 @@ import org.apache.lucene.util.AttributeSource;
 * Javadoc.
 * <p>

 * Sometimes it is desirable to capture a current state of a TokenStream,
- * e.g. for buffering purposes (see {@link CachingTokenFilter},
+ * e.g., for buffering purposes (see {@link CachingTokenFilter},
 * {@link TeeSinkTokenFilter}). For this usecase
 * {@link AttributeSource#captureState} and {@link AttributeSource#restoreState}
 * can be used.
@@ -101,7 +101,7 @@ public abstract class TokenStream extends AttributeSource implements Closeable {
   }
 
   /**
-   * Consumers (ie {@link IndexWriter}) use this method to advance the stream to
+   * Consumers (i.e., {@link IndexWriter}) use this method to advance the stream to
    * the next token. Implementing classes must implement this method and update
    * the appropriate {@link AttributeImpl}s with the attributes of the next
    * token.
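The consumer workflow enumerated above, written out end to end (a sketch, not part of the patch; WhitespaceTokenizer, TermAttribute, and OffsetAttribute are existing Lucene classes, and the casts assume the 2.9-era non-generic addAttribute):

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class ConsumeTokenStream {
  public static void main(String[] args) throws IOException {
    TokenStream stream = new WhitespaceTokenizer(new StringReader("the quick brown fox"));

    // Retrieve the attributes once and keep local references to them.
    TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
    OffsetAttribute offsetAtt = (OffsetAttribute) stream.addAttribute(OffsetAttribute.class);

    stream.reset();                      // reset the stream
    while (stream.incrementToken()) {    // advance until false...
      // ...consuming the attributes after each call.
      System.out.println(termAtt.term()
          + " [" + offsetAtt.startOffset() + "," + offsetAtt.endOffset() + "]");
    }
    stream.end();                        // end-of-stream operations
    stream.close();                      // release resources
  }
}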
diff --git a/src/java/org/apache/lucene/analysis/Tokenizer.java b/src/java/org/apache/lucene/analysis/Tokenizer.java
index 962d9f69c81..62bdc0ff173 100644
--- a/src/java/org/apache/lucene/analysis/Tokenizer.java
+++ b/src/java/org/apache/lucene/analysis/Tokenizer.java
@@ -24,20 +24,14 @@ import java.io.IOException;
 
 /** A Tokenizer is a TokenStream whose input is a Reader.
   <p>
-  This is an abstract class.
-  <p>
-  NOTE: subclasses must override
-  {@link #incrementToken()} if the new TokenStream API is used
-  and {@link #next(Token)} or {@link #next()} if the old
-  TokenStream API is used.
+  This is an abstract class; subclasses must override {@link #incrementToken()}.
   <p>
   NOTE: Subclasses overriding {@link #incrementToken()} must call
   {@link AttributeSource#clearAttributes()} before setting attributes.
-  Subclasses overriding {@link #next(Token)} must call
-  {@link Token#clear()} before setting Token attributes.
  */
-
 public abstract class Tokenizer extends TokenStream {
   /** The text source for this Tokenizer. */
   protected Reader input;
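Finally, a sketch of what the NOTE above requires of subclasses (hypothetical, not part of the patch): a Tokenizer that emits its entire input as one token, calling clearAttributes() before setting anything. It assumes the 2.9-era TermAttribute API.

import java.io.IOException;
import java.io.Reader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

// Hypothetical tokenizer that emits the whole input as a single token.
public final class WholeInputTokenizer extends Tokenizer {
  private final TermAttribute termAtt;
  private boolean done = false;

  public WholeInputTokenizer(Reader input) {
    super(input);                        // sets the protected 'input' field
    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
  }

  public boolean incrementToken() throws IOException {
    if (done) {
      return false;
    }
    done = true;
    clearAttributes();                   // required before setting attributes

    // Read the entire Reader into a growing buffer.
    char[] buffer = new char[256];
    int length = 0;
    int read;
    while ((read = input.read(buffer, length, buffer.length - length)) != -1) {
      length += read;
      if (length == buffer.length) {     // grow if the input is longer
        char[] bigger = new char[buffer.length * 2];
        System.arraycopy(buffer, 0, bigger, 0, length);
        buffer = bigger;
      }
    }
    if (length == 0) {
      return false;                      // empty input produces no token
    }
    termAtt.setTermBuffer(buffer, 0, length);
    return true;
  }
}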