add ngram factories: SOLR-199

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@540219 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Yonik Seeley 2007-05-21 18:17:40 +00:00
parent de2eebc594
commit 756b88e8f8
3 changed files with 108 additions and 1 deletions

View File

@ -190,7 +190,10 @@ New Features
31. SOLR-224: Adding a PhoneticFilterFactory that uses apache commons codec
language encoders to build phonetically similar tokens. This currently
supports: DoubleMetaphone, Metaphone, Soundex, and RefinedSoundex (ryan)
32. SOLR-199: new n-gram tokenizers available via NGramTokenizerFactory
and EdgeNGramTokenizerFactory. (Adam Hiatt via yonik)
Changes in runtime behavior
1. Highlighting using DisMax will only pick up terms from the main
user query, not boost or filter queries (klaas).

View File

@ -0,0 +1,55 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
import java.io.Reader;
import java.util.Map;
/**
 * Factory that builds {@link EdgeNGramTokenizer} instances from string-valued
 * configuration arguments.
 * <p>
 * Arguments read by {@link #init(Map)}:
 * <ul>
 *   <li><code>maxGramSize</code> - integer; when absent,
 *       {@link EdgeNGramTokenizer#DEFAULT_MAX_GRAM_SIZE} is used</li>
 *   <li><code>minGramSize</code> - integer; when absent,
 *       {@link EdgeNGramTokenizer#DEFAULT_MIN_GRAM_SIZE} is used</li>
 *   <li><code>side</code> - side label handed to the tokenizer; when absent,
 *       the label of <code>EdgeNGramTokenizer.Side.FRONT</code> is used</li>
 * </ul>
 *
 * @author Otis Gospodnetic
 * @author Adam Hiatt
 */
public class EdgeNGramTokenizerFactory extends BaseTokenizerFactory {
  private int maxGramSize = 0;
  private int minGramSize = 0;
  private String side;

  /**
   * Reads the gram-size bounds and the side label from the supplied arguments,
   * substituting the tokenizer's defaults for any argument that is missing.
   *
   * @param args configuration arguments keyed by name
   * @throws NumberFormatException if a supplied size is not a parsable integer
   */
  public void init(Map<String, String> args) {
    super.init(args);

    // The maximum is parsed before the minimum so that, when both values are
    // malformed, the failure reported is the one for maxGramSize.
    maxGramSize = intOrDefault(args.get("maxGramSize"), EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE);
    minGramSize = intOrDefault(args.get("minGramSize"), EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE);

    String requestedSide = args.get("side");
    if (requestedSide != null) {
      side = requestedSide;
    } else {
      side = EdgeNGramTokenizer.Side.FRONT.getLabel();
    }
  }

  /**
   * Creates an {@link EdgeNGramTokenizer} over the given input using the side
   * label and gram sizes captured by {@link #init(Map)}.
   *
   * @param input the character source to tokenize
   * @return a new tokenizer reading from <code>input</code>
   */
  public TokenStream create(Reader input) {
    return new EdgeNGramTokenizer(input, side, minGramSize, maxGramSize);
  }

  /** Parses <code>raw</code> as an int, or returns <code>fallback</code> when <code>raw</code> is null. */
  private static int intOrDefault(String raw, int fallback) {
    if (raw == null) {
      return fallback;
    }
    return Integer.parseInt(raw);
  }
}

View File

@ -0,0 +1,49 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
import java.io.Reader;
import java.util.Map;
/**
 * Factory that builds {@link NGramTokenizer} instances from string-valued
 * configuration arguments.
 * <p>
 * Arguments read by {@link #init(Map)}:
 * <ul>
 *   <li><code>maxGramSize</code> - integer; when absent,
 *       {@link NGramTokenizer#DEFAULT_MAX_NGRAM_SIZE} is used</li>
 *   <li><code>minGramSize</code> - integer; when absent,
 *       {@link NGramTokenizer#DEFAULT_MIN_NGRAM_SIZE} is used</li>
 * </ul>
 *
 * @author Otis Gospodnetic
 * @author Adam Hiatt
 */
public class NGramTokenizerFactory extends BaseTokenizerFactory {
  private int maxGramSize = 0;
  private int minGramSize = 0;

  /**
   * Reads the minimum and maximum n-gram sizes from the supplied arguments,
   * substituting the tokenizer's defaults for any argument that is missing.
   *
   * @param args configuration arguments keyed by name
   * @throws NumberFormatException if a supplied size is not a parsable integer
   */
  public void init(Map<String, String> args) {
    super.init(args);

    // The maximum is parsed before the minimum so that, when both values are
    // malformed, the failure reported is the one for maxGramSize.
    maxGramSize = intOrDefault(args.get("maxGramSize"), NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
    minGramSize = intOrDefault(args.get("minGramSize"), NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
  }

  /**
   * Creates an {@link NGramTokenizer} over the given input using the gram
   * sizes captured by {@link #init(Map)}.
   *
   * @param input the character source to tokenize
   * @return a new tokenizer reading from <code>input</code>
   */
  public TokenStream create(Reader input) {
    return new NGramTokenizer(input, minGramSize, maxGramSize);
  }

  /** Parses <code>raw</code> as an int, or returns <code>fallback</code> when <code>raw</code> is null. */
  private static int intOrDefault(String raw, int fallback) {
    if (raw == null) {
      return fallback;
    }
    return Integer.parseInt(raw);
  }
}