small documentation improvement

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@162070 13f79535-47bb-0310-9956-ffa450edef68
Daniel Naber 2005-04-20 20:53:03 +00:00
parent 0dcd089588
commit e9f22b5bc5
3 changed files with 19 additions and 3 deletions

@@ -22,7 +22,7 @@ import java.util.Set;
 /**
  * Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link
- * LowerCaseFilter} and {@link StopFilter}.
+ * LowerCaseFilter} and {@link StopFilter}, using a list of English stop words.
  *
  * @version $Id$
  */
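
Not part of this commit, but a minimal sketch of the chain the revised javadoc describes: the snippet below runs StandardAnalyzer over a short string, assuming the Lucene API of this era (`Analyzer.tokenStream(String, Reader)`, with `TokenStream.next()` returning a `Token`). The class name and field name are made up for the demo.

```java
import java.io.StringReader;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

// Illustrative only: StandardTokenizer + StandardFilter + LowerCaseFilter
// + StopFilter over a sample sentence. Tokens are lower-cased and the
// English stop word "The" is dropped.
public class StandardAnalyzerDemo {
  public static void main(String[] args) throws Exception {
    StandardAnalyzer analyzer = new StandardAnalyzer();
    TokenStream tokens = analyzer.tokenStream("contents",  // hypothetical field name
        new StringReader("The Quick BROWN fox"));
    for (Token t = tokens.next(); t != null; t = tokens.next()) {
      System.out.println(t.termText());  // prints: quick, brown, fox
    }
  }
}
```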

@@ -5,7 +5,15 @@ import java.io.*;
 /** A grammar-based tokenizer constructed with JavaCC.
  *
- * <p> This should be a good tokenizer for most European-language documents.
+ * <p> This should be a good tokenizer for most European-language documents:
+ *
+ * <ul>
+ * <li>Splits words at punctuation characters, removing punctuation. However, a
+ * dot that's not followed by whitespace is considered part of a token.
+ * <li>Splits words at hyphens, unless there's a number in the token, in which case
+ * the whole token is interpreted as a product number and is not split.
+ * <li>Recognizes email addresses and internet hostnames as one token.
+ * </ul>
  *
  * <p>Many applications have specific tokenizer needs. If this tokenizer does
  * not suit your application, please consider copying this source code
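
Likewise not part of this commit: a sketch that drives StandardTokenizer directly to exercise the three rules listed in the new javadoc, again assuming the Lucene API of this era. The expected tokens in the comments are inferred from the documented rules, not captured from an actual run.

```java
import java.io.StringReader;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.standard.StandardTokenizer;

// Illustrative only: prints each token and its type for a sample string.
public class StandardTokenizerDemo {
  public static void main(String[] args) throws Exception {
    String text = "wi-fi XL-2000 dev@lucene.apache.org lucene.apache.org";
    StandardTokenizer tokenizer = new StandardTokenizer(new StringReader(text));
    for (Token t = tokenizer.next(); t != null; t = tokenizer.next()) {
      System.out.println(t.termText() + "\t" + t.type());
    }
    // Expected per the documented rules:
    //   "wi-fi"                 -> "wi", "fi"  (hyphen, no digit: split)
    //   "XL-2000"               -> one token   (digit present: product number, kept whole)
    //   "dev@lucene.apache.org" -> one token   (email address)
    //   "lucene.apache.org"     -> one token   (internet hostname)
  }
}
```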

@@ -31,7 +31,15 @@ import java.io.*;
 /** A grammar-based tokenizer constructed with JavaCC.
  *
- * <p> This should be a good tokenizer for most European-language documents.
+ * <p> This should be a good tokenizer for most European-language documents:
+ *
+ * <ul>
+ * <li>Splits words at punctuation characters, removing punctuation. However, a
+ * dot that's not followed by whitespace is considered part of a token.
+ * <li>Splits words at hyphens, unless there's a number in the token, in which case
+ * the whole token is interpreted as a product number and is not split.
+ * <li>Recognizes email addresses and internet hostnames as one token.
+ * </ul>
  *
  * <p>Many applications have specific tokenizer needs. If this tokenizer does
  * not suit your application, please consider copying this source code