add ngram factories: SOLR-199

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@540219 13f79535-47bb-0310-9956-ffa450edef68
2007-05-21 18:17:40 +00:00 · 2007-05-21 18:17:40 +00:00 · 756b88e8f8
parent de2eebc594
commit 756b88e8f8
3 changed files with 108 additions and 1 deletions
--- a/CHANGES.txt
+++ b/CHANGES.txt
@ -190,7 +190,10 @@ New Features
 31. SOLR-224: Adding a PhoneticFilterFactory that uses apache commons codec
    language encoders to build phonetically similar tokens.  This currently
    supports: DoubleMetaphone, Metaphone, Soundex, and RefinedSoundex (ryan)
-    
+
 32. SOLR-199: new n-gram tokenizers available via NGramTokenizerFactory 
    and EdgeNGramTokenizerFactory. (Adam Hiatt via yonik)
 Changes in runtime behavior
 1. Highlighting using DisMax will only pick up terms from the main 
    user query, not boost or filter queries (klaas).
--- a/src/java/org/apache/solr/analysis/EdgeNGramTokenizerFactory.java
+++ b/src/java/org/apache/solr/analysis/EdgeNGramTokenizerFactory.java
@ -0,0 +1,55 @@
 package org.apache.solr.analysis;
 /**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
 import java.io.Reader;
 import java.util.Map;
 /**
 * Creates new instances of {@link EdgeNGramTokenizer}.
 * @author Otis Gospodnetic
 * @author Adam Hiatt
 */
 public class EdgeNGramTokenizerFactory extends BaseTokenizerFactory {
    private int maxGramSize = 0;
    private int minGramSize = 0;
    private String side;
    public void init(Map<String, String> args) {
        super.init(args);
        String maxArg = args.get("maxGramSize");
        maxGramSize = (maxArg != null ? Integer.parseInt(maxArg) : EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE);
        String minArg = args.get("minGramSize");
        minGramSize = (minArg != null ? Integer.parseInt(minArg) : EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE);
        side = args.get("side");
        if (side == null) {
            side = EdgeNGramTokenizer.Side.FRONT.getLabel();
        }
    }
    public TokenStream create(Reader input) {
        return new EdgeNGramTokenizer(input, side, minGramSize, maxGramSize);
    }
 }
--- a/src/java/org/apache/solr/analysis/NGramTokenizerFactory.java
+++ b/src/java/org/apache/solr/analysis/NGramTokenizerFactory.java
@ -0,0 +1,49 @@
 package org.apache.solr.analysis;
 /**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.ngram.NGramTokenizer;
 import java.io.Reader;
 import java.util.Map;
 /**
 * Creates new instances of {@link NGramTokenizer}.
 * @author Otis Gospodnetic
 * @author Adam Hiatt
 */
 public class NGramTokenizerFactory extends BaseTokenizerFactory {
    private int maxGramSize = 0;
    private int minGramSize = 0;
    /** Initializes the n-gram min and max sizes and the side from which one should start tokenizing. */
    public void init(Map<String, String> args) {
        super.init(args);
        String maxArg = args.get("maxGramSize");
        maxGramSize = (maxArg != null ? Integer.parseInt(maxArg) : NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
        String minArg = args.get("minGramSize");
        minGramSize = (minArg != null ? Integer.parseInt(minArg) : NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
    }
    /** Creates the {@link TokenStream} of n-grams from the given {@link Reader}. */
    public TokenStream create(Reader input) {
        return new NGramTokenizer(input, minGramSize, maxGramSize);
    }
 }