From 43113fe2174fc3f01cb09c692167382dc1f20cc4 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Mon, 21 Feb 2011 19:05:13 +0000 Subject: [PATCH 1/8] fixed wrong @Override on method inherited from Interface git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1073114 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/src/java/org/apache/lucene/index/FieldInfos.java | 1 - 1 file changed, 1 deletion(-) diff --git a/lucene/src/java/org/apache/lucene/index/FieldInfos.java b/lucene/src/java/org/apache/lucene/index/FieldInfos.java index c5f6d769ae2..d5834bc5cbf 100644 --- a/lucene/src/java/org/apache/lucene/index/FieldInfos.java +++ b/lucene/src/java/org/apache/lucene/index/FieldInfos.java @@ -102,7 +102,6 @@ public final class FieldInfos implements Iterable { return byNumber.size(); } - @Override public Iterator iterator() { return byNumber.values().iterator(); } From e5309e652b68d8585ec45ab0d46e5ebdd3318fea Mon Sep 17 00:00:00 2001 From: Koji Sekiguchi Date: Tue, 22 Feb 2011 14:17:10 +0000 Subject: [PATCH 2/8] LUCENE-2894: add schema.xml samples for solr analysis factories (A to H) git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1073336 13f79535-47bb-0310-9956-ffa450edef68 --- .../solr/analysis/ASCIIFoldingFilterFactory.java | 12 +++++++++++- .../analysis/ArabicNormalizationFilterFactory.java | 12 ++++++++++-- .../solr/analysis/ArabicStemFilterFactory.java | 12 ++++++++++-- .../solr/analysis/BrazilianStemFilterFactory.java | 12 +++++++++++- .../solr/analysis/BulgarianStemFilterFactory.java | 12 +++++++++++- .../apache/solr/analysis/CJKTokenizerFactory.java | 11 ++++++++++- .../solr/analysis/CapitalizationFilterFactory.java | 12 +++++++++++- .../apache/solr/analysis/ClassicFilterFactory.java | 9 +++++++++ .../solr/analysis/ClassicTokenizerFactory.java | 8 ++++++++ .../solr/analysis/CollationKeyFilterFactory.java | 12 ++++++++++-- .../solr/analysis/CommonGramsFilterFactory.java | 10 +++++++++- .../analysis/CommonGramsQueryFilterFactory.java | 12 
++++++++++-- .../solr/analysis/CzechStemFilterFactory.java | 11 ++++++++++- .../DelimitedPayloadTokenFilterFactory.java | 13 +++++++++++-- .../DictionaryCompoundWordTokenFilterFactory.java | 13 ++++++++++++- .../solr/analysis/DoubleMetaphoneFilterFactory.java | 11 +++++++++++ .../solr/analysis/EdgeNGramFilterFactory.java | 8 ++++++++ .../solr/analysis/EdgeNGramTokenizerFactory.java | 7 +++++++ .../apache/solr/analysis/ElisionFilterFactory.java | 12 +++++++++++- .../analysis/EnglishMinimalStemFilterFactory.java | 12 +++++++++++- .../analysis/EnglishPossessiveFilterFactory.java | 12 +++++++++++- .../analysis/FinnishLightStemFilterFactory.java | 12 +++++++++++- .../solr/analysis/FrenchLightStemFilterFactory.java | 12 +++++++++++- .../analysis/FrenchMinimalStemFilterFactory.java | 12 +++++++++++- .../solr/analysis/GalicianStemFilterFactory.java | 12 +++++++++++- .../solr/analysis/GermanLightStemFilterFactory.java | 12 +++++++++++- .../analysis/GermanMinimalStemFilterFactory.java | 12 +++++++++++- .../solr/analysis/GermanStemFilterFactory.java | 12 +++++++++++- .../solr/analysis/GreekLowerCaseFilterFactory.java | 12 +++++++++++- .../solr/analysis/GreekStemFilterFactory.java | 12 +++++++++++- .../solr/analysis/HTMLStripCharFilterFactory.java | 13 ++++++++++++- .../analysis/HindiNormalizationFilterFactory.java | 12 +++++++++++- .../solr/analysis/HindiStemFilterFactory.java | 12 +++++++++++- .../analysis/HungarianLightStemFilterFactory.java | 12 +++++++++++- .../solr/analysis/HyphenatedWordsFilterFactory.java | 10 +++++++++- .../HyphenationCompoundWordTokenFilterFactory.java | 11 ++++++++++- 36 files changed, 375 insertions(+), 36 deletions(-) diff --git a/solr/src/java/org/apache/solr/analysis/ASCIIFoldingFilterFactory.java b/solr/src/java/org/apache/solr/analysis/ASCIIFoldingFilterFactory.java index 8513d3453f6..5926713b02d 100644 --- a/solr/src/java/org/apache/solr/analysis/ASCIIFoldingFilterFactory.java +++ 
b/solr/src/java/org/apache/solr/analysis/ASCIIFoldingFilterFactory.java @@ -21,7 +21,17 @@ package org.apache.solr.analysis; import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter; import org.apache.lucene.analysis.TokenStream; -/** Factory for {@link ASCIIFoldingFilter} */ +/** + * Factory for {@link ASCIIFoldingFilter}. + *
+ * <fieldType name="text_ascii" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ *     <filter class="solr.ASCIIFoldingFilterFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ * @version $Id$ + */ public class ASCIIFoldingFilterFactory extends BaseTokenFilterFactory { public ASCIIFoldingFilter create(TokenStream input) { return new ASCIIFoldingFilter(input); diff --git a/solr/src/java/org/apache/solr/analysis/ArabicNormalizationFilterFactory.java b/solr/src/java/org/apache/solr/analysis/ArabicNormalizationFilterFactory.java index 3c6ac90db9a..7a3e9830237 100644 --- a/solr/src/java/org/apache/solr/analysis/ArabicNormalizationFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/ArabicNormalizationFilterFactory.java @@ -21,8 +21,16 @@ import org.apache.lucene.analysis.ar.ArabicNormalizationFilter; /** - * Factory for {@link ArabicNormalizationFilter} - **/ + * Factory for {@link ArabicNormalizationFilter}. + *
+ * <fieldType name="text_arnormal" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ *     <filter class="solr.ArabicNormalizationFilterFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ * @version $Id$ + */ public class ArabicNormalizationFilterFactory extends BaseTokenFilterFactory{ public ArabicNormalizationFilter create(TokenStream input) { diff --git a/solr/src/java/org/apache/solr/analysis/ArabicStemFilterFactory.java b/solr/src/java/org/apache/solr/analysis/ArabicStemFilterFactory.java index 4042bf2b348..b8773019a4c 100644 --- a/solr/src/java/org/apache/solr/analysis/ArabicStemFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/ArabicStemFilterFactory.java @@ -21,8 +21,16 @@ import org.apache.lucene.analysis.ar.ArabicStemFilter; /** - * Factory for {@link ArabicStemFilter} - **/ + * Factory for {@link ArabicStemFilter}. + *
+ * <fieldType name="text_arstem" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ *     <filter class="solr.ArabicStemFilterFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ * @version $Id$ + */ public class ArabicStemFilterFactory extends BaseTokenFilterFactory{ diff --git a/solr/src/java/org/apache/solr/analysis/BrazilianStemFilterFactory.java b/solr/src/java/org/apache/solr/analysis/BrazilianStemFilterFactory.java index 26ca406af26..6d96441d312 100644 --- a/solr/src/java/org/apache/solr/analysis/BrazilianStemFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/BrazilianStemFilterFactory.java @@ -21,7 +21,17 @@ package org.apache.solr.analysis; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.br.BrazilianStemFilter; -/** Factory for {@link BrazilianStemFilter} */ +/** + * Factory for {@link BrazilianStemFilter}. + *
+ * <fieldType name="text_brstem" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ *     <filter class="solr.BrazilianStemFilterFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ * @version $Id$ + */ public class BrazilianStemFilterFactory extends BaseTokenFilterFactory { public BrazilianStemFilter create(TokenStream in) { return new BrazilianStemFilter(in); diff --git a/solr/src/java/org/apache/solr/analysis/BulgarianStemFilterFactory.java b/solr/src/java/org/apache/solr/analysis/BulgarianStemFilterFactory.java index 53acdf93c2e..44563df043b 100644 --- a/solr/src/java/org/apache/solr/analysis/BulgarianStemFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/BulgarianStemFilterFactory.java @@ -20,7 +20,17 @@ package org.apache.solr.analysis; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.bg.BulgarianStemFilter; -/** Factory for {@link BulgarianStemFilter} */ +/** + * Factory for {@link BulgarianStemFilter}. + *
+ * <fieldType name="text_bgstem" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ *     <filter class="solr.BulgarianStemFilterFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ * @version $Id$ + */ public class BulgarianStemFilterFactory extends BaseTokenFilterFactory { public TokenStream create(TokenStream input) { return new BulgarianStemFilter(input); diff --git a/solr/src/java/org/apache/solr/analysis/CJKTokenizerFactory.java b/solr/src/java/org/apache/solr/analysis/CJKTokenizerFactory.java index 2e0e7f8933a..c3ba41b6be7 100644 --- a/solr/src/java/org/apache/solr/analysis/CJKTokenizerFactory.java +++ b/solr/src/java/org/apache/solr/analysis/CJKTokenizerFactory.java @@ -22,7 +22,16 @@ package org.apache.solr.analysis; import org.apache.lucene.analysis.cjk.CJKTokenizer; import java.io.Reader; -/** Factory for {@link CJKTokenizer} */ +/** + * Factory for {@link CJKTokenizer}. + *
+ * <fieldType name="text_cjk" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.CJKTokenizerFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ * @version $Id$ + */ public class CJKTokenizerFactory extends BaseTokenizerFactory { public CJKTokenizer create(Reader in) { return new CJKTokenizer(in); diff --git a/solr/src/java/org/apache/solr/analysis/CapitalizationFilterFactory.java b/solr/src/java/org/apache/solr/analysis/CapitalizationFilterFactory.java index 07ab89ba6a6..81747847333 100644 --- a/solr/src/java/org/apache/solr/analysis/CapitalizationFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/CapitalizationFilterFactory.java @@ -33,7 +33,7 @@ import java.util.StringTokenizer; * The factory takes parameters:
* "onlyFirstWord" - should each word be capitalized or all of the words?
* "keep" - a keep word list. Each word that should be kept separated by whitespace.
- * "keepIgnoreCase - true or false. If true, the keep list will be considered case-insensitive. + * "keepIgnoreCase" - true or false. If true, the keep list will be considered case-insensitive.
* "forceFirstLetter" - Force the first letter to be capitalized even if it is in the keep list
* "okPrefix" - do not change word capitalization if a word begins with something in this list. * for example if "McK" is on the okPrefix list, the word "McKinley" should not be changed to @@ -43,6 +43,16 @@ import java.util.StringTokenizer; * "maxWordCount" - if the token contains more then maxWordCount words, the capitalization is * assumed to be correct.
* + *
+ * <fieldType name="text_cptlztn" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ *     <filter class="solr.CapitalizationFilterFactory" onlyFirstWord="true"
+ *     	     keep="java solr lucene" keepIgnoreCase="false"
+ *     	     okPrefix="McK McD McA"/>   
+ *   </analyzer>
+ * </fieldType>
+ * * @version $Id$ * @since solr 1.3 */ diff --git a/solr/src/java/org/apache/solr/analysis/ClassicFilterFactory.java b/solr/src/java/org/apache/solr/analysis/ClassicFilterFactory.java index a7ac168a7f9..ff97f64753d 100644 --- a/solr/src/java/org/apache/solr/analysis/ClassicFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/ClassicFilterFactory.java @@ -22,6 +22,15 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.standard.ClassicFilter; /** + * Factory for {@link ClassicFilter}. + *
+ * <fieldType name="text_clssc" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+ *     <filter class="solr.ClassicFilterFactory"/>
+ *   </analyzer>
+ * </fieldType>
+ * * @version $Id$ */ public class ClassicFilterFactory extends BaseTokenFilterFactory { diff --git a/solr/src/java/org/apache/solr/analysis/ClassicTokenizerFactory.java b/solr/src/java/org/apache/solr/analysis/ClassicTokenizerFactory.java index 8acc832d8fc..edaf6e2f996 100644 --- a/solr/src/java/org/apache/solr/analysis/ClassicTokenizerFactory.java +++ b/solr/src/java/org/apache/solr/analysis/ClassicTokenizerFactory.java @@ -24,6 +24,14 @@ import java.io.Reader; import java.util.Map; /** + * Factory for {@link ClassicTokenizer}. + *
+ * <fieldType name="text_clssc" class="solr.TextField" positionIncrementGap="100">
+ *   <analyzer>
+ *     <tokenizer class="solr.ClassicTokenizerFactory" maxTokenLength="120"/>
+ *   </analyzer>
+ * </fieldType>
+ * * @version $Id$ */ diff --git a/solr/src/java/org/apache/solr/analysis/CollationKeyFilterFactory.java b/solr/src/java/org/apache/solr/analysis/CollationKeyFilterFactory.java index c0f0867a386..67b84126511 100644 --- a/solr/src/java/org/apache/solr/analysis/CollationKeyFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/CollationKeyFilterFactory.java @@ -56,11 +56,19 @@ import org.apache.solr.util.plugin.ResourceLoaderAware; *
  • strength: 'primary','secondary','tertiary', or 'identical' (optional) *
  • decomposition: 'no','canonical', or 'full' (optional) * - * + * + *
    + * <fieldType name="text_clltnky" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    + *     <filter class="solr.CollationKeyFilterFactory" language="ja" country="JP"/>
    + *   </analyzer>
    + * </fieldType>
    + * * @see Collator * @see Locale * @see RuleBasedCollator - * @since solr 1.5 + * @since solr 3.1 */ public class CollationKeyFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware { private Collator collator; diff --git a/solr/src/java/org/apache/solr/analysis/CommonGramsFilterFactory.java b/solr/src/java/org/apache/solr/analysis/CommonGramsFilterFactory.java index 386b3de887b..dede00a5e4b 100644 --- a/solr/src/java/org/apache/solr/analysis/CommonGramsFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/CommonGramsFilterFactory.java @@ -27,7 +27,15 @@ import org.apache.solr.common.ResourceLoader; import org.apache.solr.util.plugin.ResourceLoaderAware; /** - * Constructs a CommonGramsFilter + * Constructs a {@link CommonGramsFilter}. + *
    + * <fieldType name="text_cmmngrms" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    + *     <filter class="solr.CommonGramsFilterFactory" words="commongramsstopwords.txt" ignoreCase="false"/>
    + *   </analyzer>
    + * </fieldType>
    + * @version $Id$ */ /* diff --git a/solr/src/java/org/apache/solr/analysis/CommonGramsQueryFilterFactory.java b/solr/src/java/org/apache/solr/analysis/CommonGramsQueryFilterFactory.java index 8ec50663774..8a27adc2df9 100644 --- a/solr/src/java/org/apache/solr/analysis/CommonGramsQueryFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/CommonGramsQueryFilterFactory.java @@ -29,10 +29,18 @@ import org.apache.solr.common.ResourceLoader; import org.apache.solr.util.plugin.ResourceLoaderAware; /** - * Construct CommonGramsQueryFilter + * Construct {@link CommonGramsQueryFilter}. * - * This is pretty close to a straight copy from StopFilterFactory + * This is pretty close to a straight copy from {@link StopFilterFactory}. * + *
    + * <fieldType name="text_cmmngrmsqry" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    + *     <filter class="solr.CommonGramsQueryFilterFactory" words="commongramsquerystopwords.txt" ignoreCase="false"/>
    + *   </analyzer>
    + * </fieldType>
    + * @version $Id$ */ public class CommonGramsQueryFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware { diff --git a/solr/src/java/org/apache/solr/analysis/CzechStemFilterFactory.java b/solr/src/java/org/apache/solr/analysis/CzechStemFilterFactory.java index f3f889f146d..18d3ea77b1b 100644 --- a/solr/src/java/org/apache/solr/analysis/CzechStemFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/CzechStemFilterFactory.java @@ -20,7 +20,16 @@ package org.apache.solr.analysis; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.cz.CzechStemFilter; -/** Factory for {@link CzechStemFilter} */ +/** + * Factory for {@link CzechStemFilter}. + *
    + * <fieldType name="text_czstem" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    + *     <filter class="solr.CzechStemFilterFactory"/>
    + *   </analyzer>
    + * </fieldType>
    + */ public class CzechStemFilterFactory extends BaseTokenFilterFactory { public TokenStream create(TokenStream input) { return new CzechStemFilter(input); diff --git a/solr/src/java/org/apache/solr/analysis/DelimitedPayloadTokenFilterFactory.java b/solr/src/java/org/apache/solr/analysis/DelimitedPayloadTokenFilterFactory.java index 69127a5712c..de1ab11ab15 100644 --- a/solr/src/java/org/apache/solr/analysis/DelimitedPayloadTokenFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/DelimitedPayloadTokenFilterFactory.java @@ -31,8 +31,17 @@ import java.util.Map; /** * - * Factory for {@link DelimitedPayloadTokenFilter} - **/ + * Factory for {@link DelimitedPayloadTokenFilter}. + *
    + * <fieldType name="text_dlmtd" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    + *     <filter class="solr.DelimitedPayloadTokenFilterFactory" encoder="float" delimiter="|"/>
    + *   </analyzer>
    + * </fieldType>
    + * @version $Id$ + * + */ public class DelimitedPayloadTokenFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware { public static final String ENCODER_ATTR = "encoder"; public static final String DELIMITER_ATTR = "delimiter"; diff --git a/solr/src/java/org/apache/solr/analysis/DictionaryCompoundWordTokenFilterFactory.java b/solr/src/java/org/apache/solr/analysis/DictionaryCompoundWordTokenFilterFactory.java index 63e650e9d7d..e3010ead884 100644 --- a/solr/src/java/org/apache/solr/analysis/DictionaryCompoundWordTokenFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/DictionaryCompoundWordTokenFilterFactory.java @@ -28,7 +28,18 @@ import org.apache.lucene.analysis.TokenStream; import java.util.Map; import java.io.IOException; -/** Factory for {@link DictionaryCompoundWordTokenFilter} */ +/** + * Factory for {@link DictionaryCompoundWordTokenFilter}. + *
    + * <fieldType name="text_dictcomp" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    + *     <filter class="solr.DictionaryCompoundWordTokenFilterFactory" dictionary="dictionary.txt"
    + *     	     minWordSize="5" minSubwordSize="2" maxSubwordSize="15" onlyLongestMatch="true"/>
    + *   </analyzer>
    + * </fieldType>
    + * @version $Id$ + */ public class DictionaryCompoundWordTokenFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware { private CharArraySet dictionary; private String dictFile; diff --git a/solr/src/java/org/apache/solr/analysis/DoubleMetaphoneFilterFactory.java b/solr/src/java/org/apache/solr/analysis/DoubleMetaphoneFilterFactory.java index bb72143c56c..2089f4835c1 100644 --- a/solr/src/java/org/apache/solr/analysis/DoubleMetaphoneFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/DoubleMetaphoneFilterFactory.java @@ -21,6 +21,17 @@ import java.util.Map; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.phonetic.DoubleMetaphoneFilter; +/** + * Factory for {@link DoubleMetaphoneFilter}. + *
    + * <fieldType name="text_dblmtphn" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    + *     <filter class="solr.DoubleMetaphoneFilterFactory" inject="true" maxCodeLength="4"/>
    + *   </analyzer>
    + * </fieldType>
    + * @version $Id$ + */ public class DoubleMetaphoneFilterFactory extends BaseTokenFilterFactory { public static final String INJECT = "inject"; diff --git a/solr/src/java/org/apache/solr/analysis/EdgeNGramFilterFactory.java b/solr/src/java/org/apache/solr/analysis/EdgeNGramFilterFactory.java index a05b6257086..c7f07796176 100644 --- a/solr/src/java/org/apache/solr/analysis/EdgeNGramFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/EdgeNGramFilterFactory.java @@ -23,6 +23,14 @@ import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter; /** * Creates new instances of {@link EdgeNGramTokenFilter}. + *
    + * <fieldType name="text_edgngrm" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    + *     <filter class="solr.EdgeNGramFilterFactory" side="front" minGramSize="1" maxGramSize="1"/>
    + *   </analyzer>
    + * </fieldType>
    + * @version $Id$ */ public class EdgeNGramFilterFactory extends BaseTokenFilterFactory { private int maxGramSize = 0; diff --git a/solr/src/java/org/apache/solr/analysis/EdgeNGramTokenizerFactory.java b/solr/src/java/org/apache/solr/analysis/EdgeNGramTokenizerFactory.java index 7d97699681e..9945d5f12a4 100755 --- a/solr/src/java/org/apache/solr/analysis/EdgeNGramTokenizerFactory.java +++ b/solr/src/java/org/apache/solr/analysis/EdgeNGramTokenizerFactory.java @@ -24,6 +24,13 @@ import java.util.Map; /** * Creates new instances of {@link EdgeNGramTokenizer}. + *
    + * <fieldType name="text_edgngrm" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.EdgeNGramTokenizerFactory" side="front" minGramSize="1" maxGramSize="1"/>
    + *   </analyzer>
    + * </fieldType>
    + * @version $Id$ */ public class EdgeNGramTokenizerFactory extends BaseTokenizerFactory { private int maxGramSize = 0; diff --git a/solr/src/java/org/apache/solr/analysis/ElisionFilterFactory.java b/solr/src/java/org/apache/solr/analysis/ElisionFilterFactory.java index 3ced2246a1f..894ec436e19 100644 --- a/solr/src/java/org/apache/solr/analysis/ElisionFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/ElisionFilterFactory.java @@ -27,7 +27,17 @@ import org.apache.lucene.analysis.util.CharArraySet; import java.io.IOException; import org.apache.lucene.analysis.TokenStream; -/** Factory for {@link ElisionFilter} */ +/** + * Factory for {@link ElisionFilter}. + *
    + * <fieldType name="text_elsn" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    + *     <filter class="solr.ElisionFilterFactory" articles="stopwordarticles.txt"/>
    + *   </analyzer>
    + * </fieldType>
    + * @version $Id$ + */ public class ElisionFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware { private CharArraySet articles; diff --git a/solr/src/java/org/apache/solr/analysis/EnglishMinimalStemFilterFactory.java b/solr/src/java/org/apache/solr/analysis/EnglishMinimalStemFilterFactory.java index a7f1f6dab5c..f64ce24d862 100644 --- a/solr/src/java/org/apache/solr/analysis/EnglishMinimalStemFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/EnglishMinimalStemFilterFactory.java @@ -20,7 +20,17 @@ package org.apache.solr.analysis; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.en.EnglishMinimalStemFilter; -/** Factory for {@link EnglishMinimalStemFilter} */ +/** + * Factory for {@link EnglishMinimalStemFilter}. + *
    + * <fieldType name="text_enminstem" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    + *     <filter class="solr.EnglishMinimalStemFilterFactory"/>
    + *   </analyzer>
    + * </fieldType>
    + * @version $Id$ + */ public class EnglishMinimalStemFilterFactory extends BaseTokenFilterFactory { public TokenStream create(TokenStream input) { return new EnglishMinimalStemFilter(input); diff --git a/solr/src/java/org/apache/solr/analysis/EnglishPossessiveFilterFactory.java b/solr/src/java/org/apache/solr/analysis/EnglishPossessiveFilterFactory.java index 3134dcc18f6..488c822ff43 100644 --- a/solr/src/java/org/apache/solr/analysis/EnglishPossessiveFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/EnglishPossessiveFilterFactory.java @@ -20,7 +20,17 @@ package org.apache.solr.analysis; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.en.EnglishPossessiveFilter; -/** Factory for {@link EnglishPossessiveFilter} */ +/** + * Factory for {@link EnglishPossessiveFilter}. + *
    + * <fieldType name="text_enpossessive" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    + *     <filter class="solr.EnglishPossessiveFilterFactory"/>
    + *   </analyzer>
    + * </fieldType>
    + * @version $Id$ + */ public class EnglishPossessiveFilterFactory extends BaseTokenFilterFactory { public TokenStream create(TokenStream input) { return new EnglishPossessiveFilter(input); diff --git a/solr/src/java/org/apache/solr/analysis/FinnishLightStemFilterFactory.java b/solr/src/java/org/apache/solr/analysis/FinnishLightStemFilterFactory.java index 7a284ed6f8a..ebb077bda1b 100644 --- a/solr/src/java/org/apache/solr/analysis/FinnishLightStemFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/FinnishLightStemFilterFactory.java @@ -20,7 +20,17 @@ package org.apache.solr.analysis; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.fi.FinnishLightStemFilter; -/** Factory for {@link FinnishLightStemFilter} */ +/** + * Factory for {@link FinnishLightStemFilter}. + *
    + * <fieldType name="text_filgtstem" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    + *     <filter class="solr.FinnishLightStemFilterFactory"/>
    + *   </analyzer>
    + * </fieldType>
    + * @version $Id$ + */ public class FinnishLightStemFilterFactory extends BaseTokenFilterFactory { public TokenStream create(TokenStream input) { return new FinnishLightStemFilter(input); diff --git a/solr/src/java/org/apache/solr/analysis/FrenchLightStemFilterFactory.java b/solr/src/java/org/apache/solr/analysis/FrenchLightStemFilterFactory.java index a2f15896174..2e1c16fb6c6 100644 --- a/solr/src/java/org/apache/solr/analysis/FrenchLightStemFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/FrenchLightStemFilterFactory.java @@ -20,7 +20,17 @@ package org.apache.solr.analysis; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.fr.FrenchLightStemFilter; -/** Factory for {@link FrenchLightStemFilter} */ +/** + * Factory for {@link FrenchLightStemFilter}. + *
    + * <fieldType name="text_frlgtstem" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    + *     <filter class="solr.FrenchLightStemFilterFactory"/>
    + *   </analyzer>
    + * </fieldType>
    + * @version $Id$ + */ public class FrenchLightStemFilterFactory extends BaseTokenFilterFactory { public TokenStream create(TokenStream input) { return new FrenchLightStemFilter(input); diff --git a/solr/src/java/org/apache/solr/analysis/FrenchMinimalStemFilterFactory.java b/solr/src/java/org/apache/solr/analysis/FrenchMinimalStemFilterFactory.java index 3af344648fb..d2381da89bf 100644 --- a/solr/src/java/org/apache/solr/analysis/FrenchMinimalStemFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/FrenchMinimalStemFilterFactory.java @@ -20,7 +20,17 @@ package org.apache.solr.analysis; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.fr.FrenchMinimalStemFilter; -/** Factory for {@link FrenchMinimalStemFilter} */ +/** + * Factory for {@link FrenchMinimalStemFilter}. + *
    + * <fieldType name="text_frminstem" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    + *     <filter class="solr.FrenchMinimalStemFilterFactory"/>
    + *   </analyzer>
    + * </fieldType>
    + * @version $Id$ + */ public class FrenchMinimalStemFilterFactory extends BaseTokenFilterFactory { public TokenStream create(TokenStream input) { return new FrenchMinimalStemFilter(input); diff --git a/solr/src/java/org/apache/solr/analysis/GalicianStemFilterFactory.java b/solr/src/java/org/apache/solr/analysis/GalicianStemFilterFactory.java index 4cd0ec58c07..844c3f25c76 100644 --- a/solr/src/java/org/apache/solr/analysis/GalicianStemFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/GalicianStemFilterFactory.java @@ -20,7 +20,17 @@ package org.apache.solr.analysis; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.gl.GalicianStemFilter; -/** Factory for {@link GalicianStemFilter} */ +/** + * Factory for {@link GalicianStemFilter}. + *
    + * <fieldType name="text_glstem" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    + *     <filter class="solr.GalicianStemFilterFactory"/>
    + *   </analyzer>
    + * </fieldType>
    + * @version $Id$ + */ public class GalicianStemFilterFactory extends BaseTokenFilterFactory { public TokenStream create(TokenStream input) { return new GalicianStemFilter(input); diff --git a/solr/src/java/org/apache/solr/analysis/GermanLightStemFilterFactory.java b/solr/src/java/org/apache/solr/analysis/GermanLightStemFilterFactory.java index b790d5af297..08cb732e9cc 100644 --- a/solr/src/java/org/apache/solr/analysis/GermanLightStemFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/GermanLightStemFilterFactory.java @@ -20,7 +20,17 @@ package org.apache.solr.analysis; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.de.GermanLightStemFilter; -/** Factory for {@link GermanLightStemFilter} */ +/** + * Factory for {@link GermanLightStemFilter}. + *
    + * <fieldType name="text_delgtstem" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    + *     <filter class="solr.GermanLightStemFilterFactory"/>
    + *   </analyzer>
    + * </fieldType>
    + * @version $Id$ + */ public class GermanLightStemFilterFactory extends BaseTokenFilterFactory { public TokenStream create(TokenStream input) { return new GermanLightStemFilter(input); diff --git a/solr/src/java/org/apache/solr/analysis/GermanMinimalStemFilterFactory.java b/solr/src/java/org/apache/solr/analysis/GermanMinimalStemFilterFactory.java index e41329093eb..5c2f65f69c1 100644 --- a/solr/src/java/org/apache/solr/analysis/GermanMinimalStemFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/GermanMinimalStemFilterFactory.java @@ -20,7 +20,17 @@ package org.apache.solr.analysis; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.de.GermanMinimalStemFilter; -/** Factory for {@link GermanMinimalStemFilter} */ +/** + * Factory for {@link GermanMinimalStemFilter}. + *
    + * <fieldType name="text_deminstem" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    + *     <filter class="solr.GermanMinimalStemFilterFactory"/>
    + *   </analyzer>
    + * </fieldType>
    + * @version $Id$ + */ public class GermanMinimalStemFilterFactory extends BaseTokenFilterFactory { public TokenStream create(TokenStream input) { return new GermanMinimalStemFilter(input); diff --git a/solr/src/java/org/apache/solr/analysis/GermanStemFilterFactory.java b/solr/src/java/org/apache/solr/analysis/GermanStemFilterFactory.java index 2c85c44108d..34d8aaf5651 100644 --- a/solr/src/java/org/apache/solr/analysis/GermanStemFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/GermanStemFilterFactory.java @@ -22,7 +22,17 @@ package org.apache.solr.analysis; import org.apache.lucene.analysis.de.GermanStemFilter; import org.apache.lucene.analysis.TokenStream; -/** Factory for {@link GermanStemFilter} */ +/** + * Factory for {@link GermanStemFilter}. + *
    + * <fieldType name="text_destem" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    + *     <filter class="solr.GermanStemFilterFactory"/>
    + *   </analyzer>
    + * </fieldType>
    + * @version $Id$ + */ public class GermanStemFilterFactory extends BaseTokenFilterFactory { public GermanStemFilter create(TokenStream in) { return new GermanStemFilter(in); diff --git a/solr/src/java/org/apache/solr/analysis/GreekLowerCaseFilterFactory.java b/solr/src/java/org/apache/solr/analysis/GreekLowerCaseFilterFactory.java index 4e87773f18a..2c3a043b781 100644 --- a/solr/src/java/org/apache/solr/analysis/GreekLowerCaseFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/GreekLowerCaseFilterFactory.java @@ -26,7 +26,17 @@ import org.apache.lucene.analysis.el.GreekLowerCaseFilter; import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrException.ErrorCode; -/** Factory for {@link GreekLowerCaseFilter} */ +/** + * Factory for {@link GreekLowerCaseFilter}. + *
    + * <fieldType name="text_glc" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    + *     <filter class="solr.GreekLowerCaseFilterFactory"/>
    + *   </analyzer>
    + * </fieldType>
    + * @version $Id$ + */ public class GreekLowerCaseFilterFactory extends BaseTokenFilterFactory { diff --git a/solr/src/java/org/apache/solr/analysis/GreekStemFilterFactory.java b/solr/src/java/org/apache/solr/analysis/GreekStemFilterFactory.java index 2c6f005222f..2783a7e1995 100644 --- a/solr/src/java/org/apache/solr/analysis/GreekStemFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/GreekStemFilterFactory.java @@ -20,7 +20,17 @@ package org.apache.solr.analysis; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.el.GreekStemFilter; -/** Factory for {@link GreekStemFilter} */ +/** + * Factory for {@link GreekStemFilter}. + *
    + * <fieldType name="text_gstem" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    + *     <filter class="solr.GreekStemFilterFactory"/>
    + *   </analyzer>
    + * </fieldType>
    + * @version $Id$ + */ public class GreekStemFilterFactory extends BaseTokenFilterFactory { public TokenStream create(TokenStream input) { diff --git a/solr/src/java/org/apache/solr/analysis/HTMLStripCharFilterFactory.java b/solr/src/java/org/apache/solr/analysis/HTMLStripCharFilterFactory.java index 70130d9b5f9..096a07b896c 100644 --- a/solr/src/java/org/apache/solr/analysis/HTMLStripCharFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/HTMLStripCharFilterFactory.java @@ -21,7 +21,18 @@ package org.apache.solr.analysis; import org.apache.lucene.analysis.CharStream; import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter; -public class HTMLStripCharFilterFactory extends BaseCharFilterFactory { +/** + * Factory for {@link HTMLStripCharFilter}. + *
    + * <fieldType name="text_html" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <charFilter class="solr.HTMLStripCharFilterFactory"/>
    + *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    + *   </analyzer>
    + * </fieldType>
    + * <fieldType name="text_hinormal" class="solr.TextField" positionIncrementGap="100"> + * <analyzer> + * <tokenizer class="solr.WhitespaceTokenizerFactory"/> + * <filter class="solr.HindiNormalizationFilterFactory"/> + * </analyzer> + * </fieldType> + * @version $Id$ + */ public class HindiNormalizationFilterFactory extends BaseTokenFilterFactory { public TokenStream create(TokenStream input) { return new HindiNormalizationFilter(input); diff --git a/solr/src/java/org/apache/solr/analysis/HindiStemFilterFactory.java b/solr/src/java/org/apache/solr/analysis/HindiStemFilterFactory.java index 406343e021e..e54e8c0ce3c 100644 --- a/solr/src/java/org/apache/solr/analysis/HindiStemFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/HindiStemFilterFactory.java @@ -20,7 +20,17 @@ package org.apache.solr.analysis; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.hi.HindiStemFilter; -/** Factory for {@link HindiStemFilter} */ +/** + * Factory for {@link HindiStemFilter}. + *
    + * <fieldType name="text_histem" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    + *     <filter class="solr.HindiStemFilterFactory"/>
    + *   </analyzer>
    + * </fieldType>
    + * @version $Id$ + */ public class HindiStemFilterFactory extends BaseTokenFilterFactory { public TokenStream create(TokenStream input) { return new HindiStemFilter(input); diff --git a/solr/src/java/org/apache/solr/analysis/HungarianLightStemFilterFactory.java b/solr/src/java/org/apache/solr/analysis/HungarianLightStemFilterFactory.java index b4f6dce5acc..60a46fbb3ce 100644 --- a/solr/src/java/org/apache/solr/analysis/HungarianLightStemFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/HungarianLightStemFilterFactory.java @@ -20,7 +20,17 @@ package org.apache.solr.analysis; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.hu.HungarianLightStemFilter; -/** Factory for {@link HungarianLightStemFilter} */ +/** + * Factory for {@link HungarianLightStemFilter}. + *
    + * <fieldType name="text_hulgtstem" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    + *     <filter class="solr.HungarianLightStemFilterFactory"/>
    + *   </analyzer>
    + * </fieldType>
    + * @version $Id$ + */ public class HungarianLightStemFilterFactory extends BaseTokenFilterFactory { public TokenStream create(TokenStream input) { return new HungarianLightStemFilter(input); diff --git a/solr/src/java/org/apache/solr/analysis/HyphenatedWordsFilterFactory.java b/solr/src/java/org/apache/solr/analysis/HyphenatedWordsFilterFactory.java index 8e1170d6388..7e68d4e727f 100755 --- a/solr/src/java/org/apache/solr/analysis/HyphenatedWordsFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/HyphenatedWordsFilterFactory.java @@ -22,7 +22,15 @@ import org.apache.lucene.analysis.miscellaneous.HyphenatedWordsFilter; import org.apache.solr.analysis.BaseTokenFilterFactory; /** - * Factory for {@link HyphenatedWordsFilter} + * Factory for {@link HyphenatedWordsFilter}. + *
    + * <fieldType name="text_hyphn" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    + *     <filter class="solr.HyphenatedWordsFilterFactory"/>
    + *   </analyzer>
    + * </fieldType>
    + * @version $Id$ */ public class HyphenatedWordsFilterFactory extends BaseTokenFilterFactory { public HyphenatedWordsFilter create(TokenStream input) { diff --git a/solr/src/java/org/apache/solr/analysis/HyphenationCompoundWordTokenFilterFactory.java b/solr/src/java/org/apache/solr/analysis/HyphenationCompoundWordTokenFilterFactory.java index a90cef5d5da..7fca6f2349b 100644 --- a/solr/src/java/org/apache/solr/analysis/HyphenationCompoundWordTokenFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/HyphenationCompoundWordTokenFilterFactory.java @@ -33,7 +33,7 @@ import java.io.InputStream; import org.xml.sax.InputSource; /** - * Factory for {@link HyphenationCompoundWordTokenFilter} + * Factory for {@link HyphenationCompoundWordTokenFilter}. *

    * This factory accepts the following parameters: *

      @@ -48,6 +48,15 @@ import org.xml.sax.InputSource; * to the stream. defaults to false. *
    *

    + *

    + * <fieldType name="text_hyphncomp" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    + *     <filter class="solr.HyphenationCompoundWordTokenFilterFactory" hyphenator="hyphenator.xml" encoding="UTF-8"
    + *             dictionary="dictionary.txt" minWordSize="5" minSubwordSize="2" maxSubwordSize="15" onlyLongestMatch="false"/>
    + *   </analyzer>
    + * </fieldType>
    + * @version $Id$ * @see HyphenationCompoundWordTokenFilter */ public class HyphenationCompoundWordTokenFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware { From 62f728df42fd270dba14e7c3a357b3b12af4185c Mon Sep 17 00:00:00 2001 From: Dawid Weiss Date: Wed, 23 Feb 2011 09:35:24 +0000 Subject: [PATCH 3/8] LUCENE-2933: Two-stage state expansion criterion for the FST (distance from root and child count). git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1073653 13f79535-47bb-0310-9956-ffa450edef68 --- .../lucene/util/automaton/fst/Builder.java | 21 ++++- .../apache/lucene/util/automaton/fst/FST.java | 60 ++++++++++++-- .../lucene/util/automaton/fst/Util.java | 14 +++- .../lucene/util/automaton/fst/TestFSTs.java | 82 +++++++++++++++++++ 4 files changed, 163 insertions(+), 14 deletions(-) diff --git a/lucene/src/java/org/apache/lucene/util/automaton/fst/Builder.java b/lucene/src/java/org/apache/lucene/util/automaton/fst/Builder.java index 3225fdf0bd8..51278629003 100644 --- a/lucene/src/java/org/apache/lucene/util/automaton/fst/Builder.java +++ b/lucene/src/java/org/apache/lucene/util/automaton/fst/Builder.java @@ -83,7 +83,7 @@ public class Builder { @SuppressWarnings("unchecked") final UnCompiledNode[] f = (UnCompiledNode[]) new UnCompiledNode[10]; frontier = f; for(int idx=0;idx(this); + frontier[idx] = new UnCompiledNode(this, idx); } } @@ -201,7 +201,7 @@ public class Builder { // undecided on whether to prune it. 
later, it // will be either compiled or pruned, so we must // allocate a new node: - frontier[idx] = new UnCompiledNode(this); + frontier[idx] = new UnCompiledNode(this, idx); } } } @@ -292,7 +292,7 @@ public class Builder { new UnCompiledNode[ArrayUtil.oversize(input.length+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; System.arraycopy(frontier, 0, next, 0, frontier.length); for(int idx=frontier.length;idx(this); + next[idx] = new UnCompiledNode(this, idx); } frontier = next; } @@ -424,12 +424,22 @@ public class Builder { boolean isFinal; int inputCount; + /** This node's depth, starting from the automaton root. */ + final int depth; + + /** + * @param depth + * The node's depth starting from the automaton root. Needed for + * LUCENE-2934 (node expansion based on conditions other than the + * fanout size). + */ @SuppressWarnings("unchecked") - public UnCompiledNode(Builder owner) { + public UnCompiledNode(Builder owner, int depth) { this.owner = owner; arcs = (Arc[]) new Arc[1]; arcs[0] = new Arc(); output = owner.NO_OUTPUT; + this.depth = depth; } public boolean isCompiled() { @@ -441,6 +451,9 @@ public class Builder { isFinal = false; output = owner.NO_OUTPUT; inputCount = 0; + + // We don't clear the depth here because it never changes + // for nodes on the frontier (even when reused). } public T getLastOutput(int labelToMatch) { diff --git a/lucene/src/java/org/apache/lucene/util/automaton/fst/FST.java b/lucene/src/java/org/apache/lucene/util/automaton/fst/FST.java index 0b366b471e7..60dc55c137c 100644 --- a/lucene/src/java/org/apache/lucene/util/automaton/fst/FST.java +++ b/lucene/src/java/org/apache/lucene/util/automaton/fst/FST.java @@ -25,6 +25,7 @@ import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.CodecUtil; +import org.apache.lucene.util.automaton.fst.Builder.UnCompiledNode; /** Represents an FST using a compact byte[] format. *

    The format is similar to what's used by Morfologik @@ -47,11 +48,21 @@ public class FST { // this when number of arcs is > NUM_ARCS_ARRAY: private final static int BIT_ARCS_AS_FIXED_ARRAY = 1 << 6; - // If the node has >= this number of arcs, the arcs are - // stored as a fixed array. Fixed array consumes more RAM - // but enables binary search on the arcs (instead of - // linear scan) on lookup by arc label: - private final static int NUM_ARCS_FIXED_ARRAY = 10; + /** + * @see #shouldExpand(UnCompiledNode) + */ + final static int FIXED_ARRAY_SHALLOW_DISTANCE = 3; // 0 => only root node. + + /** + * @see #shouldExpand(UnCompiledNode) + */ + final static int FIXED_ARRAY_NUM_ARCS_SHALLOW = 5; + + /** + * @see #shouldExpand(UnCompiledNode) + */ + final static int FIXED_ARRAY_NUM_ARCS_DEEP = 10; + private int[] bytesPerArc = new int[0]; // Increment version to change it @@ -315,7 +326,7 @@ public class FST { int startAddress = writer.posWrite; //System.out.println(" startAddr=" + startAddress); - final boolean doFixedArray = node.numArcs >= NUM_ARCS_FIXED_ARRAY; + final boolean doFixedArray = shouldExpand(node); final int fixedArrayStart; if (doFixedArray) { if (bytesPerArc.length < node.numArcs) { @@ -518,6 +529,23 @@ public class FST { return readNextArc(arc); } + /** + * Checks if arc's target state is in expanded (or vector) format. + * + * @return Returns true if arc points to a state in an + * expanded array format. + */ + boolean isExpandedTarget(Arc follow) throws IOException { + if (follow.isFinal()) { + return false; + } else { + final BytesReader in = getBytesReader(follow.target); + final byte b = in.readByte(); + + return (b & BIT_ARCS_AS_FIXED_ARRAY) != 0; + } + } + /** In-place read; returns the arc. 
 */ public Arc readNextArc(Arc arc) throws IOException { if (arc.label == -1) { @@ -712,6 +740,26 @@ public class FST { public int getArcWithOutputCount() { return arcWithOutputCount; } + + /** + * Nodes will be expanded if their depth (distance from the root node) is + * <= {@link #FIXED_ARRAY_SHALLOW_DISTANCE} and their number of arcs is >= + * {@link #FIXED_ARRAY_NUM_ARCS_SHALLOW}, or if their number of arcs is >= {@link #FIXED_ARRAY_NUM_ARCS_DEEP} at any depth. + * + *

    + * Fixed array consumes more RAM but enables binary search on the arcs + * (instead of a linear scan) on lookup by arc label. + * + * @return true if node should be stored in an + * expanded (array) form. + * + * @see #FIXED_ARRAY_NUM_ARCS_DEEP + * @see Builder.UnCompiledNode#depth + */ + private boolean shouldExpand(UnCompiledNode node) { + return (node.depth <= FIXED_ARRAY_SHALLOW_DISTANCE && node.numArcs >= FIXED_ARRAY_NUM_ARCS_SHALLOW) || + node.numArcs >= FIXED_ARRAY_NUM_ARCS_DEEP; + } // Non-static: writes to FST's byte[] class BytesWriter extends DataOutput { diff --git a/lucene/src/java/org/apache/lucene/util/automaton/fst/Util.java b/lucene/src/java/org/apache/lucene/util/automaton/fst/Util.java index a10c4bcf753..6699ac6b342 100644 --- a/lucene/src/java/org/apache/lucene/util/automaton/fst/Util.java +++ b/lucene/src/java/org/apache/lucene/util/automaton/fst/Util.java @@ -189,6 +189,8 @@ public final class Util { */ public static void toDot(FST fst, Writer out, boolean sameRank, boolean labelStates) throws IOException { + final String expandedNodeColor = "blue"; + // This is the start arc in the automaton (from the epsilon state to the first state // with outgoing transitions. final FST.Arc startArc = fst.getFirstArc(new FST.Arc()); @@ -219,7 +221,9 @@ public final class Util { } emitDotState(out, "initial", "point", "white", ""); - emitDotState(out, Integer.toString(startArc.target), stateShape, null, ""); + emitDotState(out, Integer.toString(startArc.target), stateShape, + fst.isExpandedTarget(startArc) ? expandedNodeColor : null, + ""); out.write(" initial -> " + startArc.target + "\n"); final T NO_OUTPUT = fst.outputs.getNoOutput(); @@ -243,7 +247,9 @@ public final class Util { while (true) { // Emit the unseen state and add it to the queue for the next level. 
if (arc.target >= 0 && !seen.get(arc.target)) { - emitDotState(out, Integer.toString(arc.target), stateShape, null, + final boolean isExpanded = fst.isExpandedTarget(arc); + emitDotState(out, Integer.toString(arc.target), stateShape, + isExpanded ? expandedNodeColor : null, labelStates ? Integer.toString(arc.target) : ""); seen.set(arc.target); nextLevelQueue.add(new FST.Arc().copyFrom(arc)); @@ -285,10 +291,10 @@ public final class Util { } sameLevelStates.clear(); } - + // Emit terminating state (always there anyway). out.write(" -1 [style=filled, color=black, shape=circle, label=\"\"]\n\n"); - out.write(" {rank=sink; -1 } "); + out.write(" {rank=sink; -1 }\n"); out.write("}\n"); out.flush(); diff --git a/lucene/src/test/org/apache/lucene/util/automaton/fst/TestFSTs.java b/lucene/src/test/org/apache/lucene/util/automaton/fst/TestFSTs.java index f979481306e..5f6c589a764 100644 --- a/lucene/src/test/org/apache/lucene/util/automaton/fst/TestFSTs.java +++ b/lucene/src/test/org/apache/lucene/util/automaton/fst/TestFSTs.java @@ -56,6 +56,7 @@ import org.apache.lucene.util.LineFileDocs; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.UnicodeUtil; import org.apache.lucene.util._TestUtil; +import org.apache.lucene.util.automaton.fst.FST.Arc; public class TestFSTs extends LuceneTestCase { @@ -1322,4 +1323,85 @@ public class TestFSTs extends LuceneTestCase { assertEquals(b, seekResult.input); assertEquals(42, (long) seekResult.output); } + + /** + * Test state expansion (array format) on close-to-root states. Creates + * synthetic input that has one expanded state on each level. 
+ * + * @see "https://issues.apache.org/jira/browse/LUCENE-2933" + */ + public void testExpandedCloseToRoot() throws Exception { + class SyntheticData { + FST compile(String[] lines) throws IOException { + final NoOutputs outputs = NoOutputs.getSingleton(); + final Object nothing = outputs.getNoOutput(); + final Builder b = new Builder(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs); + + int line = 0; + final BytesRef term = new BytesRef(); + while (line < lines.length) { + String w = lines[line++]; + if (w == null) { + break; + } + term.copy(w); + b.add(term, nothing); + } + + return b.finish(); + } + + void generate(ArrayList out, StringBuilder b, char from, char to, + int depth) { + if (depth == 0 || from == to) { + String seq = b.toString() + "_" + out.size() + "_end"; + out.add(seq); + } else { + for (char c = from; c <= to; c++) { + b.append(c); + generate(out, b, from, c == to ? to : from, depth - 1); + b.deleteCharAt(b.length() - 1); + } + } + } + + public int verifyStateAndBelow(FST fst, Arc arc, int depth) + throws IOException { + if (fst.targetHasArcs(arc)) { + int childCount = 0; + for (arc = fst.readFirstTargetArc(arc, arc);; + arc = fst.readNextArc(arc), childCount++) + { + boolean expanded = fst.isExpandedTarget(arc); + int children = verifyStateAndBelow(fst, new FST.Arc().copyFrom(arc), depth + 1); + + assertEquals( + expanded, + (depth <= FST.FIXED_ARRAY_SHALLOW_DISTANCE && + children >= FST.FIXED_ARRAY_NUM_ARCS_SHALLOW) || + children >= FST.FIXED_ARRAY_NUM_ARCS_DEEP); + if (arc.isLast()) break; + } + + return childCount; + } + return 0; + } + } + + // Sanity check. 
+ assertTrue(FST.FIXED_ARRAY_NUM_ARCS_SHALLOW < FST.FIXED_ARRAY_NUM_ARCS_DEEP); + assertTrue(FST.FIXED_ARRAY_SHALLOW_DISTANCE >= 0); + + SyntheticData s = new SyntheticData(); + + ArrayList out = new ArrayList(); + StringBuilder b = new StringBuilder(); + s.generate(out, b, 'a', 'i', 10); + String[] input = out.toArray(new String[out.size()]); + Arrays.sort(input); + FST fst = s.compile(input); + FST.Arc arc = fst.getFirstArc(new FST.Arc()); + s.verifyStateAndBelow(fst, arc, 1); + } } From d0a8a1fcb86cb7aa8c01f81db608d0558177d8c7 Mon Sep 17 00:00:00 2001 From: Koji Sekiguchi Date: Wed, 23 Feb 2011 16:23:57 +0000 Subject: [PATCH 4/8] LUCENE-2894: add more contribution git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1073806 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 2 +- .../analysis/IndicNormalizationFilterFactory.java | 12 +++++++++++- .../solr/analysis/IndonesianStemFilterFactory.java | 12 +++++++++++- .../solr/analysis/ItalianLightStemFilterFactory.java | 12 +++++++++++- .../apache/solr/analysis/KeepWordFilterFactory.java | 8 ++++++++ .../solr/analysis/KeywordMarkerFilterFactory.java | 10 +++++++++- .../solr/analysis/KeywordTokenizerFactory.java | 7 +++++++ .../apache/solr/analysis/LengthFilterFactory.java | 8 ++++++++ .../apache/solr/analysis/LetterTokenizerFactory.java | 7 +++++++ .../solr/analysis/LimitTokenCountFilterFactory.java | 11 +++++++++++ .../apache/solr/analysis/LowerCaseFilterFactory.java | 8 ++++++++ .../solr/analysis/LowerCaseTokenizerFactory.java | 7 +++++++ .../solr/analysis/MappingCharFilterFactory.java | 8 ++++++++ .../org/apache/solr/analysis/NGramFilterFactory.java | 10 +++++++++- .../apache/solr/analysis/NGramTokenizerFactory.java | 9 ++++++++- .../analysis/NumericPayloadTokenFilterFactory.java | 12 +++++++++++- 16 files changed, 135 insertions(+), 8 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 66083212f3b..a58930aeb55 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ 
-1055,7 +1055,7 @@ Documentation (Adriano Crestani via Robert Muir) * LUCENE-2894: Use google-code-prettify for syntax highlighting in javadoc. - (Koji Sekiguchi) + (Shinichiro Abe, Koji Sekiguchi) ================== Release 2.9.4 / 3.0.3 2010-12-03 ==================== diff --git a/solr/src/java/org/apache/solr/analysis/IndicNormalizationFilterFactory.java b/solr/src/java/org/apache/solr/analysis/IndicNormalizationFilterFactory.java index 84264dd8ed8..7811a0190eb 100644 --- a/solr/src/java/org/apache/solr/analysis/IndicNormalizationFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/IndicNormalizationFilterFactory.java @@ -20,7 +20,17 @@ package org.apache.solr.analysis; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.in.IndicNormalizationFilter; -/** Factory for {@link IndicNormalizationFilter} */ +/** + * Factory for {@link IndicNormalizationFilter}. + *
    + * <fieldType name="text_innormal" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    + *     <filter class="solr.IndicNormalizationFilterFactory"/>
    + *   </analyzer>
    + * </fieldType>
    + * @version $Id$ + */ public class IndicNormalizationFilterFactory extends BaseTokenFilterFactory { public TokenStream create(TokenStream input) { return new IndicNormalizationFilter(input); diff --git a/solr/src/java/org/apache/solr/analysis/IndonesianStemFilterFactory.java b/solr/src/java/org/apache/solr/analysis/IndonesianStemFilterFactory.java index d99af63a32e..1e27f73e8ff 100644 --- a/solr/src/java/org/apache/solr/analysis/IndonesianStemFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/IndonesianStemFilterFactory.java @@ -22,7 +22,17 @@ import java.util.Map; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.id.IndonesianStemFilter; -/** Factory for {@link IndonesianStemFilter} */ +/** + * Factory for {@link IndonesianStemFilter}. + *
    + * <fieldType name="text_idstem" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    + *     <filter class="solr.IndonesianStemFilterFactory" stemDerivational="true"/>
    + *   </analyzer>
    + * </fieldType>
    + * @version $Id$ + */ public class IndonesianStemFilterFactory extends BaseTokenFilterFactory { private boolean stemDerivational = true; diff --git a/solr/src/java/org/apache/solr/analysis/ItalianLightStemFilterFactory.java b/solr/src/java/org/apache/solr/analysis/ItalianLightStemFilterFactory.java index 3281736b876..a93412fe05c 100644 --- a/solr/src/java/org/apache/solr/analysis/ItalianLightStemFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/ItalianLightStemFilterFactory.java @@ -20,7 +20,17 @@ package org.apache.solr.analysis; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.it.ItalianLightStemFilter; -/** Factory for {@link ItalianLightStemFilter} */ +/** + * Factory for {@link ItalianLightStemFilter}. + *
    + * <fieldType name="text_itlgtstem" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    + *     <filter class="solr.ItalianLightStemFilterFactory"/>
    + *   </analyzer>
    + * </fieldType>
    + * @version $Id$ + */ public class ItalianLightStemFilterFactory extends BaseTokenFilterFactory { public TokenStream create(TokenStream input) { return new ItalianLightStemFilter(input); diff --git a/solr/src/java/org/apache/solr/analysis/KeepWordFilterFactory.java b/solr/src/java/org/apache/solr/analysis/KeepWordFilterFactory.java index d9b8ee90a88..ca38a761fb5 100644 --- a/solr/src/java/org/apache/solr/analysis/KeepWordFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/KeepWordFilterFactory.java @@ -28,6 +28,14 @@ import java.util.Set; import java.io.IOException; /** + * Factory for {@link KeepWordFilter}. + *
    + * <fieldType name="text_keepword" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    + *     <filter class="solr.KeepWordFilterFactory" words="keepwords.txt" ignoreCase="false" enablePositionIncrements="false"/>
    + *   </analyzer>
    + * </fieldType>
    * @version $Id$ */ public class KeepWordFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware { diff --git a/solr/src/java/org/apache/solr/analysis/KeywordMarkerFilterFactory.java b/solr/src/java/org/apache/solr/analysis/KeywordMarkerFilterFactory.java index 08c38ab12a0..0c47b6314ba 100644 --- a/solr/src/java/org/apache/solr/analysis/KeywordMarkerFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/KeywordMarkerFilterFactory.java @@ -26,7 +26,15 @@ import org.apache.solr.util.plugin.ResourceLoaderAware; */ /** - * Factory for {@link KeywordMarkerFilter} + * Factory for {@link KeywordMarkerFilter}. + *
    + * <fieldType name="text_keyword" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    + *     <filter class="solr.KeywordMarkerFilterFactory" protected="protectedkeyword.txt" ignoreCase="false"/>
    + *   </analyzer>
    + * </fieldType>
    + * @version $Id$ */ public class KeywordMarkerFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware { public static final String PROTECTED_TOKENS = "protected"; diff --git a/solr/src/java/org/apache/solr/analysis/KeywordTokenizerFactory.java b/solr/src/java/org/apache/solr/analysis/KeywordTokenizerFactory.java index be418732554..7a6d324a717 100644 --- a/solr/src/java/org/apache/solr/analysis/KeywordTokenizerFactory.java +++ b/solr/src/java/org/apache/solr/analysis/KeywordTokenizerFactory.java @@ -22,6 +22,13 @@ import org.apache.lucene.analysis.core.KeywordTokenizer; import java.io.Reader; /** + * Factory for {@link KeywordTokenizer}. + *
    + * <fieldType name="text_keyword" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.KeywordTokenizerFactory"/>
    + *   </analyzer>
    + * </fieldType>
    * @version $Id$ */ public class KeywordTokenizerFactory extends BaseTokenizerFactory { diff --git a/solr/src/java/org/apache/solr/analysis/LengthFilterFactory.java b/solr/src/java/org/apache/solr/analysis/LengthFilterFactory.java index 74d67422269..97d7703a72f 100644 --- a/solr/src/java/org/apache/solr/analysis/LengthFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/LengthFilterFactory.java @@ -23,6 +23,14 @@ import org.apache.lucene.analysis.miscellaneous.LengthFilter; import java.util.Map; /** + * Factory for {@link LengthFilter}. + *
    + * <fieldType name="text_lngth" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    + *     <filter class="solr.LengthFilterFactory" min="0" max="1" enablePositionIncrements="false"/>
    + *   </analyzer>
    + * </fieldType>
    * @version $Id$ */ public class LengthFilterFactory extends BaseTokenFilterFactory { diff --git a/solr/src/java/org/apache/solr/analysis/LetterTokenizerFactory.java b/solr/src/java/org/apache/solr/analysis/LetterTokenizerFactory.java index 4362bbd751d..0c04c0519f9 100644 --- a/solr/src/java/org/apache/solr/analysis/LetterTokenizerFactory.java +++ b/solr/src/java/org/apache/solr/analysis/LetterTokenizerFactory.java @@ -23,6 +23,13 @@ import java.io.Reader; import java.util.Map; /** + * Factory for {@link LetterTokenizer}. + *
    + * <fieldType name="text_letter" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.LetterTokenizerFactory"/>
    + *   </analyzer>
    + * </fieldType>
    * @version $Id$ */ public class LetterTokenizerFactory extends BaseTokenizerFactory { diff --git a/solr/src/java/org/apache/solr/analysis/LimitTokenCountFilterFactory.java b/solr/src/java/org/apache/solr/analysis/LimitTokenCountFilterFactory.java index 5a08142df78..12da7f321d0 100644 --- a/solr/src/java/org/apache/solr/analysis/LimitTokenCountFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/LimitTokenCountFilterFactory.java @@ -22,6 +22,17 @@ import java.util.Map; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter; +/** + * Factory for {@link LimitTokenCountFilter}. + *
    + * <fieldType name="text_lngthcnt" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    + *     <filter class="solr.LimitTokenCountFilterFactory" maxTokenCount="10"/>
    + *   </analyzer>
    + * </fieldType>
    + * @version $Id$ + */ public class LimitTokenCountFilterFactory extends BaseTokenFilterFactory { int maxTokenCount; diff --git a/solr/src/java/org/apache/solr/analysis/LowerCaseFilterFactory.java b/solr/src/java/org/apache/solr/analysis/LowerCaseFilterFactory.java index 7aa82f01f3b..b094eccc2f1 100644 --- a/solr/src/java/org/apache/solr/analysis/LowerCaseFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/LowerCaseFilterFactory.java @@ -23,6 +23,14 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.core.LowerCaseFilter; /** + * Factory for {@link LowerCaseFilter}. + *
    + * <fieldType name="text_lwrcase" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    + *     <filter class="solr.LowerCaseFilterFactory"/>
    + *   </analyzer>
    + * </fieldType>
    * @version $Id$ */ public class LowerCaseFilterFactory extends BaseTokenFilterFactory { diff --git a/solr/src/java/org/apache/solr/analysis/LowerCaseTokenizerFactory.java b/solr/src/java/org/apache/solr/analysis/LowerCaseTokenizerFactory.java index 7f4c2553813..5bd59db62a4 100644 --- a/solr/src/java/org/apache/solr/analysis/LowerCaseTokenizerFactory.java +++ b/solr/src/java/org/apache/solr/analysis/LowerCaseTokenizerFactory.java @@ -23,6 +23,13 @@ import java.io.Reader; import java.util.Map; /** + * Factory for {@link LowerCaseTokenizer}. + *
    + * <fieldType name="text_lwrcase" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.LowerCaseTokenizerFactory"/>
    + *   </analyzer>
    + * </fieldType>
    * @version $Id$ */ public class LowerCaseTokenizerFactory extends BaseTokenizerFactory { diff --git a/solr/src/java/org/apache/solr/analysis/MappingCharFilterFactory.java b/solr/src/java/org/apache/solr/analysis/MappingCharFilterFactory.java index 570090ef645..a401c31aa91 100644 --- a/solr/src/java/org/apache/solr/analysis/MappingCharFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/MappingCharFilterFactory.java @@ -32,6 +32,14 @@ import org.apache.solr.common.util.StrUtils; import org.apache.solr.util.plugin.ResourceLoaderAware; /** + * Factory for {@link MappingCharFilter}. + *
    + * <fieldType name="text_map" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <charFilter class="solr.MappingCharFilterFactory" mapping="mapping.txt"/>
    + *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    + *   </analyzer>
    + * </fieldType>
    * * @version $Id$ * @since Solr 1.4 diff --git a/solr/src/java/org/apache/solr/analysis/NGramFilterFactory.java b/solr/src/java/org/apache/solr/analysis/NGramFilterFactory.java index 0cb850b6665..20849c9551a 100644 --- a/solr/src/java/org/apache/solr/analysis/NGramFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/NGramFilterFactory.java @@ -22,7 +22,15 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.ngram.NGramTokenFilter; /** - * Creates new instances of {@link NGramTokenFilter}. + * Factory for {@link NGramTokenFilter}. + *
    + * <fieldType name="text_ngrm" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    + *     <filter class="solr.NGramFilterFactory" minGramSize="1" maxGramSize="2"/>
    + *   </analyzer>
    + * </fieldType>
    + * @version $Id$ */ public class NGramFilterFactory extends BaseTokenFilterFactory { private int maxGramSize = 0; diff --git a/solr/src/java/org/apache/solr/analysis/NGramTokenizerFactory.java b/solr/src/java/org/apache/solr/analysis/NGramTokenizerFactory.java index 95beaedc8b8..a9525b2d1a3 100755 --- a/solr/src/java/org/apache/solr/analysis/NGramTokenizerFactory.java +++ b/solr/src/java/org/apache/solr/analysis/NGramTokenizerFactory.java @@ -24,7 +24,14 @@ import java.io.Reader; import java.util.Map; /** - * Creates new instances of {@link NGramTokenizer}. + * Factory for {@link NGramTokenizer}. + *
    + * <fieldType name="text_ngrm" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.NGramTokenizerFactory" minGramSize="1" maxGramSize="2"/>
    + *   </analyzer>
    + * </fieldType>
    + * @version $Id$ */ public class NGramTokenizerFactory extends BaseTokenizerFactory { private int maxGramSize = 0; diff --git a/solr/src/java/org/apache/solr/analysis/NumericPayloadTokenFilterFactory.java b/solr/src/java/org/apache/solr/analysis/NumericPayloadTokenFilterFactory.java index 0181b96e636..2406da54231 100644 --- a/solr/src/java/org/apache/solr/analysis/NumericPayloadTokenFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/NumericPayloadTokenFilterFactory.java @@ -23,7 +23,17 @@ import org.apache.lucene.analysis.payloads.NumericPayloadTokenFilter; import org.apache.lucene.analysis.TokenStream; import java.util.Map; -/** Factory for {@link NumericPayloadTokenFilter} */ +/** + * Factory for {@link NumericPayloadTokenFilter}. + *
    + * <fieldType name="text_numpayload" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    + *     <filter class="solr.NumericPayloadTokenFilterFactory" payload="24" typeMatch="word"/>
    + *   </analyzer>
    + * </fieldType>
    + * @version $Id$ + */ public class NumericPayloadTokenFilterFactory extends BaseTokenFilterFactory { private float payload; private String typeMatch; From 0045e0f4ea1bad9cbcbbfc508af0790c882dda14 Mon Sep 17 00:00:00 2001 From: Yonik Seeley Date: Wed, 23 Feb 2011 17:55:54 +0000 Subject: [PATCH 5/8] LUCENE-2937: fix floatToByte underflow detection git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1073850 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 7 ++++ .../org/apache/lucene/util/SmallFloat.java | 6 +-- .../apache/lucene/util/TestSmallFloat.java | 41 +++++++++++++++++-- 3 files changed, 47 insertions(+), 7 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index a58930aeb55..2835d89937c 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -739,6 +739,13 @@ Bug fixes * LUCENE-2891: IndexWriterConfig did not accept -1 in setReaderTermIndexDivisor, which can be used to prevent loading the terms index into memory. (Shai Erera) +* LUCENE-2937: Encoding a float into a byte (e.g. encoding field norms during + indexing) had an underflow detection bug that caused floatToByte(f)==0 where + f was greater than 0, but slightly less than byteToFloat(1). This meant that + certain very small field norms (index_boost * length_norm) could have + been rounded down to 0 instead of being rounded up to the smallest + positive number. (yonik) + New features * LUCENE-2128: Parallelized fetching document frequencies during weight diff --git a/lucene/src/java/org/apache/lucene/util/SmallFloat.java b/lucene/src/java/org/apache/lucene/util/SmallFloat.java index 986a62128af..175b5331909 100644 --- a/lucene/src/java/org/apache/lucene/util/SmallFloat.java +++ b/lucene/src/java/org/apache/lucene/util/SmallFloat.java @@ -39,7 +39,7 @@ public class SmallFloat { int fzero = (63-zeroExp)<> (24-numMantissaBits); - if (smallfloat < fzero) { + if (smallfloat <= fzero) { return (bits<=0) ? 
(byte)0 // negative numbers and zero both map to 0 byte :(byte)1; // underflow is mapped to smallest non-zero number. @@ -75,7 +75,7 @@ public class SmallFloat { public static byte floatToByte315(float f) { int bits = Float.floatToRawIntBits(f); int smallfloat = bits >> (24-3); - if (smallfloat < (63-15)<<3) { + if (smallfloat <= ((63-15)<<3)) { return (bits<=0) ? (byte)0 : (byte)1; } if (smallfloat >= ((63-15)<<3) + 0x100) { @@ -103,7 +103,7 @@ public class SmallFloat { public static byte floatToByte52(float f) { int bits = Float.floatToRawIntBits(f); int smallfloat = bits >> (24-5); - if (smallfloat < (63-2)<<5) { + if (smallfloat <= (63-2)<<5) { return (bits<=0) ? (byte)0 : (byte)1; } if (smallfloat >= ((63-2)<<5) + 0x100) { diff --git a/lucene/src/test/org/apache/lucene/util/TestSmallFloat.java b/lucene/src/test/org/apache/lucene/util/TestSmallFloat.java index e4308c87e35..55701e64be3 100644 --- a/lucene/src/test/org/apache/lucene/util/TestSmallFloat.java +++ b/lucene/src/test/org/apache/lucene/util/TestSmallFloat.java @@ -28,8 +28,8 @@ public class TestSmallFloat extends LuceneTestCase { return Float.intBitsToFloat(bits); } - // original lucene floatToByte - static byte orig_floatToByte(float f) { + // original lucene floatToByte (since lucene 1.3) + static byte orig_floatToByte_v13(float f) { if (f < 0.0f) // round negatives up to zero f = 0.0f; @@ -53,7 +53,40 @@ public class TestSmallFloat extends LuceneTestCase { return (byte)((exponent << 3) | mantissa); // pack into a byte } + // This is the original lucene floatToBytes (from v1.3) + // except with the underflow detection bug fixed for values like 5.8123817E-10f + static byte orig_floatToByte(float f) { + if (f < 0.0f) // round negatives up to zero + f = 0.0f; + + if (f == 0.0f) // zero is a special case + return 0; + + int bits = Float.floatToIntBits(f); // parse float into parts + int mantissa = (bits & 0xffffff) >> 21; + int exponent = (((bits >> 24) & 0x7f) - 63) + 15; + + if (exponent > 31) { // 
overflow: use max value + exponent = 31; + mantissa = 7; + } + + if (exponent < 0 || exponent == 0 && mantissa == 0) { // underflow: use min value + exponent = 0; + mantissa = 1; + } + + return (byte)((exponent << 3) | mantissa); // pack into a byte + } + + public void testByteToFloat() { + assertEquals(0, orig_floatToByte_v13(5.8123817E-10f)); // verify the old bug (see LUCENE-2937) + assertEquals(1, orig_floatToByte(5.8123817E-10f)); // verify it's fixed in this test code + assertEquals(1, SmallFloat.floatToByte315(5.8123817E-10f)); // verify it's fixed + + assertEquals(1, orig_floatToByte(Float.MIN_VALUE)); + for (int i=0; i<256; i++) { float f1 = orig_byteToFloat((byte)i); float f2 = SmallFloat.byteToFloat((byte)i, 3,15); @@ -95,8 +128,8 @@ public class TestSmallFloat extends LuceneTestCase { if (f==f) { // skip non-numbers byte b1 = orig_floatToByte(f); byte b2 = SmallFloat.floatToByte315(f); - if (b1!=b2) { - TestCase.fail("Failed floatToByte315 for float " + f); + if (b1!=b2 || b2==0 && f>0) { + fail("Failed floatToByte315 for float " + f + " source bits="+Integer.toHexString(i) + " float raw bits=" + Integer.toHexString(Float.floatToRawIntBits(i))); } } if (i==Integer.MAX_VALUE) break; From 7fc46ffab1483240ed59be204b26165e44ccb9a8 Mon Sep 17 00:00:00 2001 From: Michael Busch Date: Wed, 23 Feb 2011 21:02:39 +0000 Subject: [PATCH 6/8] LUCENE-2881: fix intermittent failing test by clearing term vectors bits in FieldInfos for segments that had only docs with non-aborting exceptions git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1073947 13f79535-47bb-0310-9956-ffa450edef68 --- .../java/org/apache/lucene/index/DocFieldProcessor.java | 7 +++++++ lucene/src/java/org/apache/lucene/index/FieldInfos.java | 8 ++++++++ 2 files changed, 15 insertions(+) diff --git a/lucene/src/java/org/apache/lucene/index/DocFieldProcessor.java b/lucene/src/java/org/apache/lucene/index/DocFieldProcessor.java index 6416f518983..3a85224462b 100644 --- 
a/lucene/src/java/org/apache/lucene/index/DocFieldProcessor.java +++ b/lucene/src/java/org/apache/lucene/index/DocFieldProcessor.java @@ -59,6 +59,13 @@ final class DocFieldProcessor extends DocConsumer { // FreqProxTermsWriter does this with // FieldInfo.storePayload. final String fileName = IndexFileNames.segmentFileName(state.segmentName, "", IndexFileNames.FIELD_INFOS_EXTENSION); + + // If this segment only has docs that hit non-aborting exceptions, + // then no term vectors files will have been written; therefore we + // need to update the fieldInfos and clear the term vectors bits + if (!state.hasVectors) { + state.fieldInfos.clearVectors(); + } state.fieldInfos.write(state.directory, fileName); } diff --git a/lucene/src/java/org/apache/lucene/index/FieldInfos.java b/lucene/src/java/org/apache/lucene/index/FieldInfos.java index d5834bc5cbf..a3417798b20 100644 --- a/lucene/src/java/org/apache/lucene/index/FieldInfos.java +++ b/lucene/src/java/org/apache/lucene/index/FieldInfos.java @@ -403,6 +403,14 @@ public final class FieldInfos implements Iterable { return false; } + void clearVectors() { + for (FieldInfo fi : this) { + fi.storeTermVector = false; + fi.storeOffsetWithTermVector = false; + fi.storePositionWithTermVector = false; + } + } + public boolean hasNorms() { for (FieldInfo fi : this) { if (!fi.omitNorms) { From d43ca1d2fdae58a929c004d42ca938dcfb96bbe5 Mon Sep 17 00:00:00 2001 From: Yonik Seeley Date: Wed, 23 Feb 2011 21:42:42 +0000 Subject: [PATCH 7/8] tests: add a few more floatToByte constant tests git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1073957 13f79535-47bb-0310-9956-ffa450edef68 --- .../apache/lucene/util/TestSmallFloat.java | 22 ++++++++++++++----- 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/lucene/src/test/org/apache/lucene/util/TestSmallFloat.java b/lucene/src/test/org/apache/lucene/util/TestSmallFloat.java index 55701e64be3..54972027374 100644 --- 
a/lucene/src/test/org/apache/lucene/util/TestSmallFloat.java +++ b/lucene/src/test/org/apache/lucene/util/TestSmallFloat.java @@ -81,12 +81,6 @@ public class TestSmallFloat extends LuceneTestCase { public void testByteToFloat() { - assertEquals(0, orig_floatToByte_v13(5.8123817E-10f)); // verify the old bug (see LUCENE-2937) - assertEquals(1, orig_floatToByte(5.8123817E-10f)); // verify it's fixed in this test code - assertEquals(1, SmallFloat.floatToByte315(5.8123817E-10f)); // verify it's fixed - - assertEquals(1, orig_floatToByte(Float.MIN_VALUE)); - for (int i=0; i<256; i++) { float f1 = orig_byteToFloat((byte)i); float f2 = SmallFloat.byteToFloat((byte)i, 3,15); @@ -101,6 +95,22 @@ public class TestSmallFloat extends LuceneTestCase { } public void testFloatToByte() { + assertEquals(0, orig_floatToByte_v13(5.8123817E-10f)); // verify the old bug (see LUCENE-2937) + assertEquals(1, orig_floatToByte(5.8123817E-10f)); // verify it's fixed in this test code + assertEquals(1, SmallFloat.floatToByte315(5.8123817E-10f)); // verify it's fixed + + // test some constants + assertEquals(0, SmallFloat.floatToByte315(0)); + assertEquals(1, SmallFloat.floatToByte315(Float.MIN_VALUE)); // underflow rounds up to smallest positive + assertEquals(255, SmallFloat.floatToByte315(Float.MAX_VALUE) & 0xff); // overflow rounds down to largest positive + assertEquals(255, SmallFloat.floatToByte315(Float.POSITIVE_INFINITY) & 0xff); + + // all negatives map to 0 + assertEquals(0, SmallFloat.floatToByte315(-Float.MIN_VALUE)); + assertEquals(0, SmallFloat.floatToByte315(-Float.MAX_VALUE)); + assertEquals(0, SmallFloat.floatToByte315(Float.NEGATIVE_INFINITY)); + + // up iterations for more exhaustive test after changing something int num = 100000 * RANDOM_MULTIPLIER; for (int i = 0; i < num; i++) { From 521fb53da5e1f36b69af051bdbf7000e13d7a569 Mon Sep 17 00:00:00 2001 From: Koji Sekiguchi Date: Thu, 24 Feb 2011 01:40:58 +0000 Subject: [PATCH 8/8] LUCENE-2894: add more contribution 
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1074009 13f79535-47bb-0310-9956-ffa450edef68 --- .../apache/solr/analysis/ShingleFilterFactory.java | 13 ++++++++++++- .../solr/analysis/SnowballPorterFilterFactory.java | 7 +++++++ .../analysis/SpanishLightStemFilterFactory.java | 12 +++++++++++- .../apache/solr/analysis/StandardFilterFactory.java | 8 ++++++++ .../solr/analysis/StandardTokenizerFactory.java | 7 +++++++ .../solr/analysis/StemmerOverrideFilterFactory.java | 10 +++++++++- .../analysis/SwedishLightStemFilterFactory.java | 12 +++++++++++- .../apache/solr/analysis/SynonymFilterFactory.java | 9 +++++++++ .../apache/solr/analysis/ThaiWordFilterFactory.java | 12 +++++++++++- .../TokenOffsetPayloadTokenFilterFactory.java | 12 +++++++++++- .../org/apache/solr/analysis/TrimFilterFactory.java | 8 ++++++++ .../analysis/TurkishLowerCaseFilterFactory.java | 12 +++++++++++- .../analysis/TypeAsPayloadTokenFilterFactory.java | 12 +++++++++++- .../analysis/UAX29URLEmailTokenizerFactory.java | 7 +++++++ .../solr/analysis/WhitespaceTokenizerFactory.java | 7 +++++++ .../solr/analysis/WikipediaTokenizerFactory.java | 11 ++++++++++- .../solr/analysis/WordDelimiterFilterFactory.java | 11 +++++++++++ 17 files changed, 161 insertions(+), 9 deletions(-) diff --git a/solr/src/java/org/apache/solr/analysis/ShingleFilterFactory.java b/solr/src/java/org/apache/solr/analysis/ShingleFilterFactory.java index 58f18ad99de..913a4882338 100644 --- a/solr/src/java/org/apache/solr/analysis/ShingleFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/ShingleFilterFactory.java @@ -26,7 +26,18 @@ import org.apache.solr.common.SolrException.ErrorCode; import java.util.Map; -/** Factory for {@link ShingleFilter} */ +/** + * Factory for {@link ShingleFilter}. + *
    + * <fieldType name="text_shingle" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    + *     <filter class="solr.ShingleFilterFactory" minShingleSize="2" maxShingleSize="2"
    + *             outputUnigrams="true" outputUnigramsIfNoShingles="false" tokenSeparator=" "/>
    + *   </analyzer>
    + * </fieldType>
    + * @version $Id$ + */ public class ShingleFilterFactory extends BaseTokenFilterFactory { private int minShingleSize; private int maxShingleSize; diff --git a/solr/src/java/org/apache/solr/analysis/SnowballPorterFilterFactory.java b/solr/src/java/org/apache/solr/analysis/SnowballPorterFilterFactory.java index a73e7bf8c33..e3297e45d8c 100644 --- a/solr/src/java/org/apache/solr/analysis/SnowballPorterFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/SnowballPorterFilterFactory.java @@ -32,6 +32,13 @@ import org.tartarus.snowball.SnowballProgram; * Factory for {@link SnowballFilter}, with configurable language *

    * Note: Use of the "Lovins" stemmer is not recommended, as it is implemented with reflection. + *

    + * <fieldType name="text_snowballstem" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    + *     <filter class="solr.SnowballPorterFilterFactory" protected="protectedkeyword.txt" language="English"/>
    + *   </analyzer>
    + * </fieldType>
    * * @version $Id$ */ diff --git a/solr/src/java/org/apache/solr/analysis/SpanishLightStemFilterFactory.java b/solr/src/java/org/apache/solr/analysis/SpanishLightStemFilterFactory.java index 148810d04d9..9f3c7a1a657 100644 --- a/solr/src/java/org/apache/solr/analysis/SpanishLightStemFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/SpanishLightStemFilterFactory.java @@ -20,7 +20,17 @@ package org.apache.solr.analysis; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.es.SpanishLightStemFilter; -/** Factory for {@link SpanishLightStemFilter} */ +/** + * Factory for {@link SpanishLightStemFilter}. + *
    + * <fieldType name="text_eslgtstem" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    + *     <filter class="solr.SpanishLightStemFilterFactory"/>
    + *   </analyzer>
    + * </fieldType>
    + * @version $Id$ + */ public class SpanishLightStemFilterFactory extends BaseTokenFilterFactory { public TokenStream create(TokenStream input) { return new SpanishLightStemFilter(input); diff --git a/solr/src/java/org/apache/solr/analysis/StandardFilterFactory.java b/solr/src/java/org/apache/solr/analysis/StandardFilterFactory.java index 8574f41f3fe..e4a7d93a7b2 100644 --- a/solr/src/java/org/apache/solr/analysis/StandardFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/StandardFilterFactory.java @@ -23,6 +23,14 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.standard.StandardFilter; /** + * Factory for {@link StandardFilter}. + *
    + * <fieldType name="text_stndrd" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    + *     <filter class="solr.StandardFilterFactory"/>
    + *   </analyzer>
    + * </fieldType>
    * @version $Id$ */ public class StandardFilterFactory extends BaseTokenFilterFactory { diff --git a/solr/src/java/org/apache/solr/analysis/StandardTokenizerFactory.java b/solr/src/java/org/apache/solr/analysis/StandardTokenizerFactory.java index f1d09cac5d0..32087dfb465 100644 --- a/solr/src/java/org/apache/solr/analysis/StandardTokenizerFactory.java +++ b/solr/src/java/org/apache/solr/analysis/StandardTokenizerFactory.java @@ -24,6 +24,13 @@ import java.io.Reader; import java.util.Map; /** + * Factory for {@link StandardTokenizer}. + *
    + * <fieldType name="text_stndrd" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.StandardTokenizerFactory" maxTokenLength="255"/>
    + *   </analyzer>
    + * </fieldType>
    * @version $Id$ */ diff --git a/solr/src/java/org/apache/solr/analysis/StemmerOverrideFilterFactory.java b/solr/src/java/org/apache/solr/analysis/StemmerOverrideFilterFactory.java index e51b8430e10..0ac0db28380 100644 --- a/solr/src/java/org/apache/solr/analysis/StemmerOverrideFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/StemmerOverrideFilterFactory.java @@ -28,7 +28,15 @@ import org.apache.solr.common.util.StrUtils; import org.apache.solr.util.plugin.ResourceLoaderAware; /** - * Factory for {@link StemmerOverrideFilter} + * Factory for {@link StemmerOverrideFilter}. + *
    + * <fieldType name="text_dicstem" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    + *     <filter class="solr.StemmerOverrideFilterFactory" dictionary="dictionary.txt" ignoreCase="false"/>
    + *   </analyzer>
    + * </fieldType>
    + * @version $Id$ */ public class StemmerOverrideFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware { private CharArrayMap dictionary = null; diff --git a/solr/src/java/org/apache/solr/analysis/SwedishLightStemFilterFactory.java b/solr/src/java/org/apache/solr/analysis/SwedishLightStemFilterFactory.java index 2c7aff2e93e..6e0478d161b 100644 --- a/solr/src/java/org/apache/solr/analysis/SwedishLightStemFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/SwedishLightStemFilterFactory.java @@ -20,7 +20,17 @@ package org.apache.solr.analysis; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.sv.SwedishLightStemFilter; -/** Factory for {@link SwedishLightStemFilter} */ +/** + * Factory for {@link SwedishLightStemFilter}. + *
    + * <fieldType name="text_svlgtstem" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    + *     <filter class="solr.SwedishLightStemFilterFactory"/>
    + *   </analyzer>
    + * </fieldType>
    + * @version $Id$ + */ public class SwedishLightStemFilterFactory extends BaseTokenFilterFactory { public TokenStream create(TokenStream input) { return new SwedishLightStemFilter(input); diff --git a/solr/src/java/org/apache/solr/analysis/SynonymFilterFactory.java b/solr/src/java/org/apache/solr/analysis/SynonymFilterFactory.java index cb5fd07c001..571bdcf7f8d 100644 --- a/solr/src/java/org/apache/solr/analysis/SynonymFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/SynonymFilterFactory.java @@ -35,6 +35,15 @@ import java.util.List; import java.util.Map; /** + * Factory for {@link SynonymFilter}. + *
    + * <fieldType name="text_synonym" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    + *     <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="false"
    + *             expand="true" tokenizerFactory="solr.WhitespaceTokenizerFactory"/>
    + *   </analyzer>
    + * </fieldType>
    * @version $Id$ */ public class SynonymFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware { diff --git a/solr/src/java/org/apache/solr/analysis/ThaiWordFilterFactory.java b/solr/src/java/org/apache/solr/analysis/ThaiWordFilterFactory.java index 8d4c60fcd67..d35385ae3c4 100644 --- a/solr/src/java/org/apache/solr/analysis/ThaiWordFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/ThaiWordFilterFactory.java @@ -22,7 +22,17 @@ import org.apache.lucene.analysis.th.ThaiWordFilter; import org.apache.lucene.analysis.TokenStream; -/** Factory for {@link ThaiWordFilter} */ +/** + * Factory for {@link ThaiWordFilter}. + *
    + * <fieldType name="text_thai" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.NGramTokenizerFactory"/>
    + *     <filter class="solr.ThaiWordFilterFactory"/>
    + *   </analyzer>
    + * </fieldType>
    + * @version $Id$ + */ public class ThaiWordFilterFactory extends BaseTokenFilterFactory { public ThaiWordFilter create(TokenStream input) { assureMatchVersion(); diff --git a/solr/src/java/org/apache/solr/analysis/TokenOffsetPayloadTokenFilterFactory.java b/solr/src/java/org/apache/solr/analysis/TokenOffsetPayloadTokenFilterFactory.java index 729e33424b8..981aab415a9 100644 --- a/solr/src/java/org/apache/solr/analysis/TokenOffsetPayloadTokenFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/TokenOffsetPayloadTokenFilterFactory.java @@ -22,7 +22,17 @@ package org.apache.solr.analysis; import org.apache.lucene.analysis.payloads.TokenOffsetPayloadTokenFilter; import org.apache.lucene.analysis.TokenStream; -/** Factory for {@link TokenOffsetPayloadTokenFilter} */ +/** + * Factory for {@link TokenOffsetPayloadTokenFilter}. + *
    + * <fieldType name="text_tokenoffset" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    + *     <filter class="solr.TokenOffsetPayloadTokenFilterFactory"/>
    + *   </analyzer>
    + * </fieldType>
    + * @version $Id$ + */ public class TokenOffsetPayloadTokenFilterFactory extends BaseTokenFilterFactory { public TokenOffsetPayloadTokenFilter create(TokenStream input) { return new TokenOffsetPayloadTokenFilter(input); diff --git a/solr/src/java/org/apache/solr/analysis/TrimFilterFactory.java b/solr/src/java/org/apache/solr/analysis/TrimFilterFactory.java index 825675ba259..31f85c32ba0 100644 --- a/solr/src/java/org/apache/solr/analysis/TrimFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/TrimFilterFactory.java @@ -24,6 +24,14 @@ import org.apache.lucene.analysis.miscellaneous.TrimFilter; import org.apache.solr.common.SolrException; /** + * Factory for {@link TrimFilter}. + *
    + * <fieldType name="text_trm" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.NGramTokenizerFactory"/>
    + *     <filter class="solr.TrimFilterFactory" updateOffsets="false"/>
    + *   </analyzer>
    + * </fieldType>
    * @version $Id$ * @see TrimFilter */ diff --git a/solr/src/java/org/apache/solr/analysis/TurkishLowerCaseFilterFactory.java b/solr/src/java/org/apache/solr/analysis/TurkishLowerCaseFilterFactory.java index 44b70eaabcb..9efa18e3b19 100644 --- a/solr/src/java/org/apache/solr/analysis/TurkishLowerCaseFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/TurkishLowerCaseFilterFactory.java @@ -20,7 +20,17 @@ package org.apache.solr.analysis; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tr.TurkishLowerCaseFilter; -/** Factory for {@link TurkishLowerCaseFilter} */ +/** + * Factory for {@link TurkishLowerCaseFilter}. + *
    + * <fieldType name="text_trlwr" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    + *     <filter class="solr.TurkishLowerCaseFilterFactory"/>
    + *   </analyzer>
    + * </fieldType>
    + * @version $Id$ + */ public class TurkishLowerCaseFilterFactory extends BaseTokenFilterFactory { public TokenStream create(TokenStream input) { return new TurkishLowerCaseFilter(input); diff --git a/solr/src/java/org/apache/solr/analysis/TypeAsPayloadTokenFilterFactory.java b/solr/src/java/org/apache/solr/analysis/TypeAsPayloadTokenFilterFactory.java index 8fa00caba6e..c34c40f0358 100644 --- a/solr/src/java/org/apache/solr/analysis/TypeAsPayloadTokenFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/TypeAsPayloadTokenFilterFactory.java @@ -22,7 +22,17 @@ package org.apache.solr.analysis; import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilter; import org.apache.lucene.analysis.TokenStream; -/** Factory for {@link TypeAsPayloadTokenFilter} */ +/** + * Factory for {@link TypeAsPayloadTokenFilter}. + *
    + * <fieldType name="text_typeaspayload" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    + *     <filter class="solr.TypeAsPayloadTokenFilterFactory"/>
    + *   </analyzer>
    + * </fieldType>
    + * @version $Id$ + */ public class TypeAsPayloadTokenFilterFactory extends BaseTokenFilterFactory { public TypeAsPayloadTokenFilter create(TokenStream input) { return new TypeAsPayloadTokenFilter(input); diff --git a/solr/src/java/org/apache/solr/analysis/UAX29URLEmailTokenizerFactory.java b/solr/src/java/org/apache/solr/analysis/UAX29URLEmailTokenizerFactory.java index bdffd9442ea..3b071c7c816 100644 --- a/solr/src/java/org/apache/solr/analysis/UAX29URLEmailTokenizerFactory.java +++ b/solr/src/java/org/apache/solr/analysis/UAX29URLEmailTokenizerFactory.java @@ -27,6 +27,13 @@ import java.io.Reader; import java.util.Map; /** + * Factory for {@link UAX29URLEmailTokenizer}. + *
    + * <fieldType name="text_urlemail" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.UAX29URLEmailTokenizerFactory" maxTokenLength="255"/>
    + *   </analyzer>
    + * </fieldType>
    * @version $Id$ * */ diff --git a/solr/src/java/org/apache/solr/analysis/WhitespaceTokenizerFactory.java b/solr/src/java/org/apache/solr/analysis/WhitespaceTokenizerFactory.java index bb8e7cf1341..c94b12a03ea 100644 --- a/solr/src/java/org/apache/solr/analysis/WhitespaceTokenizerFactory.java +++ b/solr/src/java/org/apache/solr/analysis/WhitespaceTokenizerFactory.java @@ -23,6 +23,13 @@ import java.io.Reader; import java.util.Map; /** + * Factory for {@link WhitespaceTokenizer}. + *
    + * <fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    + *   </analyzer>
    + * </fieldType>
    * @version $Id$ */ public class WhitespaceTokenizerFactory extends BaseTokenizerFactory { diff --git a/solr/src/java/org/apache/solr/analysis/WikipediaTokenizerFactory.java b/solr/src/java/org/apache/solr/analysis/WikipediaTokenizerFactory.java index 57b09e7d4c6..54a944fa7ac 100644 --- a/solr/src/java/org/apache/solr/analysis/WikipediaTokenizerFactory.java +++ b/solr/src/java/org/apache/solr/analysis/WikipediaTokenizerFactory.java @@ -22,7 +22,16 @@ import java.io.Reader; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer; -/** Factory for {@link WikipediaTokenizer}*/ +/** + * Factory for {@link WikipediaTokenizer}. + *
    + * <fieldType name="text_wiki" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.WikipediaTokenizerFactory"/>
    + *   </analyzer>
    + * </fieldType>
    + * @version $Id$ + */ public class WikipediaTokenizerFactory extends BaseTokenizerFactory { // TODO: add support for WikipediaTokenizer's advanced options. public Tokenizer create(Reader input) { diff --git a/solr/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java b/solr/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java index 712da5edb65..093a5df9ce5 100644 --- a/solr/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java +++ b/solr/src/java/org/apache/solr/analysis/WordDelimiterFilterFactory.java @@ -37,6 +37,17 @@ import java.io.IOException; /** + * Factory for {@link WordDelimiterFilter}. + *
    + * <fieldType name="text_wd" class="solr.TextField" positionIncrementGap="100">
    + *   <analyzer>
    + *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    + *     <filter class="solr.WordDelimiterFilterFactory" protected="protectedword.txt"
    + *             preserveOriginal="0" splitOnNumerics="1" splitOnCaseChange="1"
    + *             catenateWords="0" catenateNumbers="0" catenateAll="0"
    + *             generateWordParts="1" generateNumberParts="1" stemEnglishPossessive="1"/>
    + *   </analyzer>
    + * </fieldType>
    * @version $Id$ */ public class WordDelimiterFilterFactory extends BaseTokenFilterFactory implements ResourceLoaderAware {