From 08f690ef16119abb10dfec0cf0a89284c5b748f9 Mon Sep 17 00:00:00 2001 From: Shai Erera Date: Mon, 23 Sep 2013 05:06:51 +0000 Subject: [PATCH] javadocs did not produce good looking HTML git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1525506 13f79535-47bb-0310-9956-ffa450edef68 --- .../miscellaneous/WordDelimiterFilter.java | 76 ++++++++++++------- 1 file changed, 48 insertions(+), 28 deletions(-) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java index 38dba6e4326..e04043b4cdb 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/miscellaneous/WordDelimiterFilter.java @@ -19,6 +19,8 @@ package org.apache.lucene.analysis.miscellaneous; import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.core.WhitespaceTokenizer; +import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; @@ -30,36 +32,54 @@ import org.apache.lucene.util.RamUsageEstimator; import java.io.IOException; /** - * Splits words into subwords and performs optional transformations on subword groups. - * Words are split into subwords with the following rules: - * - split on intra-word delimiters (by default, all non alpha-numeric characters).
- * "Wi-Fi" -> "Wi", "Fi" - * - split on case transitions - * - "PowerShot" -> "Power", "Shot" - * - split on letter-number transitions - * - "SD500" -> "SD", "500" - * - leading and trailing intra-word delimiters on each subword are ignored - * - "//hello---there, 'dude'" -> "hello", "there", "dude" - * - trailing "'s" are removed for each subword - * - "O'Neil's" -> "O", "Neil" - * - Note: this step isn't performed in a separate filter because of possible subword combinations. - * + * Splits words into subwords and performs optional transformations on subword + * groups. Words are split into subwords with the following rules: + * + * * The combinations parameter affects how subwords are combined: - * - combinations="0" causes no subword combinations. - * - "PowerShot" -> 0:"Power", 1:"Shot" (0 and 1 are the token positions) - * - combinations="1" means that in addition to the subwords, maximum runs of non-numeric subwords are catenated and produced at the same position of the last subword in the run. - * - "PowerShot" -> 0:"Power", 1:"Shot" 1:"PowerShot" - * - "A's+B's&C's" -> 0:"A", 1:"B", 2:"C", 2:"ABC" - * - "Super-Duper-XL500-42-AutoCoder!" -> 0:"Super", 1:"Duper", 2:"XL", 2:"SuperDuperXL", 3:"500" 4:"42", 5:"Auto", 6:"Coder", 6:"AutoCoder" - * - * One use for WordDelimiterFilter is to help match words with different subword delimiters. - * For example, if the source text contained "wi-fi" one may want "wifi" "WiFi" "wi-fi" "wi+fi" queries to all match. - * One way of doing so is to specify combinations="1" in the analyzer used for indexing, and combinations="0" (the default) - * in the analyzer used for querying. Given that the current StandardTokenizer immediately removes many intra-word - * delimiters, it is recommended that this filter be used after a tokenizer that does not do this (such as WhitespaceTokenizer). - * + * + * One use for {@link WordDelimiterFilter} is to help match words with different + * subword delimiters.
For example, if the source text contained "wi-fi" one may + * want "wifi" "WiFi" "wi-fi" "wi+fi" queries to all match. One way of doing so + * is to specify combinations="1" in the analyzer used for indexing, and + * combinations="0" (the default) in the analyzer used for querying. Given that + * the current {@link StandardTokenizer} immediately removes many intra-word + * delimiters, it is recommended that this filter be used after a tokenizer that + * does not do this (such as {@link WhitespaceTokenizer}). */ - public final class WordDelimiterFilter extends TokenFilter { public static final int LOWER = 0x01;