SOLR-572: Added SpellCheckComponent functionality.

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@669485 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Grant Ingersoll 2008-06-19 13:46:54 +00:00
parent 200f1ddd7c
commit 1cd74a0531
26 changed files with 2181 additions and 26 deletions

View File

@ -291,6 +291,11 @@ New Features
from SolrDocuments. (Noble Paul via ryan)
52. SOLR-595: Add support for Field level boosting in the MoreLikeThis Handler. (Tom Morton, gsingers)
53. SOLR-572: Added SpellCheckComponent and org.apache.solr.spelling package to support more spell checking functionality.
Also includes ability to add your own SolrSpellChecker implementation that plugs in.
See http://wiki.apache.org/solr/SpellCheckComponent for more details
(Shalin Shekhar Mangar, Bojan Smid, gsingers)
Changes in runtime behavior
1. SOLR-559: use Lucene updateDocument, deleteDocuments methods. This

View File

@ -204,6 +204,18 @@
</analyzer>
</fieldType>
<!--
Setup simple analysis for spell checking
-->
<fieldType name="textSpell" class="solr.TextField" positionIncrementGap="100" >
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
</analyzer>
</fieldType>
<!-- This is an example of using the KeywordTokenizer along
With various TokenFilterFactories to produce a sortable field
that does not include some properties of the source text
@ -303,7 +315,7 @@
-->
<field name="timestamp" type="date" indexed="true" stored="true" default="NOW" multiValued="false"/>
<field name="spell" type="textSpell" indexed="true" stored="true" multiValued="true"/>
<!-- Dynamic field definitions. If a field name is not found, dynamicFields
will be used if the name matches any of the patterns.
RESTRICTION: the glob-like pattern in the name attribute must have
@ -357,6 +369,8 @@
<copyField source="manu" dest="manu_exact"/>
<copyField source="name" dest="spell"/>
<!-- Similarity is the scoring routine for each document vs. a query.
A custom similarity may be specified here, but the default is fine
for most applications. -->

View File

@ -489,6 +489,50 @@
</requestHandler>
<searchComponent name="spellcheck" class="org.apache.solr.handler.component.SpellCheckComponent">
<lst name="defaults">
<!-- omp = Only More Popular -->
<str name="spellcheck.onlyMorePopular">false</str>
<!-- exr = Extended Results -->
<str name="spellcheck.extendedResults">false</str>
<!-- The number of suggestions to return -->
<str name="spellcheck.count">1</str>
</lst>
<str name="queryAnalyzerFieldType">textSpell</str>
<lst name="spellchecker">
<str name="name">default</str>
<str name="field">spell</str>
<str name="spellcheckIndexDir">./spellchecker</str>
</lst>
<lst name="spellchecker">
<str name="name">jarowinkler</str>
<str name="field">spell</str>
<!-- Use a different Distance Measure -->
<str name="distanceMeasure">org.apache.lucene.search.spell.JaroWinklerDistance</str>
<str name="spellcheckIndexDir">./spellchecker</str>
</lst>
<!--<lst name="spellchecker">
<str name="classname">solr.FileBasedSpellChecker</str>
<str name="name">external</str>
<str name="sourceLocation">spellings.txt</str>
<str name="characterEncoding">UTF-8</str>
<str name="spellcheckIndexDir">./spellchecker</str>
</lst>-->
</searchComponent>
<queryConverter name="queryConverter" class="org.apache.solr.spelling.SpellingQueryConverter"/>
<requestHandler name="/spellCheckCompRH" class="org.apache.solr.handler.component.SearchHandler">
<arr name="last-components">
<str>spellcheck</str>
</arr>
</requestHandler>
<requestHandler name="/mlt" class="solr.MoreLikeThisHandler">
<lst name="defaults">
<str name="mlt.fl">manu,cat</str>

View File

@ -1,2 +1,2 @@
AnyObjectId[44e2dcdcc7d1e8c24b7941a45763e3f20310dbd6] was removed in git history.
AnyObjectId[630ded90301495ffdb2d5a69d656c697b54eae4a] was removed in git history.
Apache SVN contains full history.

View File

@ -1,2 +1,2 @@
AnyObjectId[2c421c5bf65f2838b0ba387f95a55dc0b3d81936] was removed in git history.
AnyObjectId[b7de7debdb89fb00ddcd969e6459059de38a0066] was removed in git history.
Apache SVN contains full history.

View File

@ -1,2 +1,2 @@
AnyObjectId[f9600dd6bdf7be48acd3a47bfb4142349d63dc88] was removed in git history.
AnyObjectId[7ac77ded12c4e71ebb2dd8c7d8b5d49372823b59] was removed in git history.
Apache SVN contains full history.

View File

@ -1,2 +1,2 @@
AnyObjectId[c9e70e326acaf4a0633800a52a4c4950ec43b6e7] was removed in git history.
AnyObjectId[5b27a2cc32d635fdd8477a878f4c04eacd6df812] was removed in git history.
Apache SVN contains full history.

View File

@ -1,2 +1,2 @@
AnyObjectId[db82b130fbe7ea944104ae3f9888c6561ce2914d] was removed in git history.
AnyObjectId[c5b004d8a86cd3d702634fe02e75ab95939ef4a6] was removed in git history.
Apache SVN contains full history.

View File

@ -1,2 +1,2 @@
AnyObjectId[fea1bb71eacb6cb69c030d67334a2013de53b3ce] was removed in git history.
AnyObjectId[eaf9f26f79727c84b0c28cdc7a3b52534e543eaf] was removed in git history.
Apache SVN contains full history.

View File

@ -32,6 +32,7 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.logging.Logger;
import java.nio.charset.Charset;
import javax.naming.Context;
import javax.naming.InitialContext;
@ -48,6 +49,7 @@ import org.apache.solr.request.SolrRequestHandler;
import org.apache.solr.schema.FieldType;
import org.apache.solr.util.plugin.ResourceLoaderAware;
import org.apache.solr.util.plugin.SolrCoreAware;
import org.apache.solr.spelling.SpellingQueryConverter;
/**
* @since solr 1.3
@ -58,13 +60,14 @@ public class SolrResourceLoader implements ResourceLoader
static final String project = "solr";
static final String base = "org.apache" + "." + project;
static final String[] packages = {"","analysis.","schema.","handler.","search.","update.","core.","request.","update.processor.","util."};
static final String[] packages = {"","analysis.","schema.","handler.","search.","update.","core.","request.","update.processor.","util.", "spelling."};
private final ClassLoader classLoader;
private final String instanceDir;
private final List<SolrCoreAware> waitingForCore = new ArrayList<SolrCoreAware>();
private final List<ResourceLoaderAware> waitingForResources = new ArrayList<ResourceLoaderAware>();
private static final Charset UTF_8 = Charset.forName("UTF-8");
/**
* <p>
@ -184,13 +187,33 @@ public class SolrResourceLoader implements ResourceLoader
* @throws IOException
*/
/**
 * Accesses a resource by name and returns the (non comment) lines in it,
 * decoded as UTF-8.
 * <p>
 * A comment line is any line that starts with the character "#"
 * </p>
 *
 * @param resource the file to be read
 * @return a list of non-blank non-comment lines with whitespace trimmed
 * @throws IOException
 */
public List<String> getLines(String resource) throws IOException {
// Delegates to the charset-aware overload using the cached UTF-8 Charset.
return getLines(resource, UTF_8);
}
/**
 * Accesses a resource by name and returns the (non comment) lines containing
 * data using the given character encoding.
 *
 * <p>
 * A comment line is any line that starts with the character "#"
 * </p>
 *
 * @param resource the file to be read
 * @param encoding the name of the charset used to decode the resource;
 *                 must be a name accepted by {@link Charset#forName(String)}
 * @return a list of non-blank non-comment lines with whitespace trimmed
 * @throws IOException
 */
public List<String> getLines(String resource,
String encoding) throws IOException {
// Charset.forName throws UnsupportedCharsetException for unknown names.
return getLines(resource, Charset.forName(encoding));
}
public List<String> getLines(String resource, Charset charset) throws IOException{
BufferedReader input = null;
try {
// TODO - allow configurable charset?
input = new BufferedReader(new InputStreamReader(openResource(resource), "UTF-8"));
} catch (UnsupportedEncodingException e) {
throw new RuntimeException(e);
}
input = new BufferedReader(new InputStreamReader(openResource(resource),
charset));
ArrayList<String> lines = new ArrayList<String>();
for (String word=null; (word=input.readLine())!=null;) {
// skip comments

View File

@ -0,0 +1,371 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.component;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.IndexReader;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.core.SolrConfig;
import org.apache.solr.core.SolrCore;
import org.apache.solr.core.SolrResourceLoader;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.spelling.IndexBasedSpellChecker;
import org.apache.solr.spelling.SolrSpellChecker;
import org.apache.solr.spelling.SpellingResult;
import org.apache.solr.spelling.QueryConverter;
import org.apache.solr.util.plugin.NamedListPluginLoader;
import org.apache.solr.util.plugin.SolrCoreAware;
import org.w3c.dom.NodeList;
import javax.xml.xpath.XPathConstants;
import java.io.IOException;
import java.io.StringReader;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.Iterator;
import java.util.concurrent.ConcurrentHashMap;
import java.util.logging.Logger;
/**
 * A SearchComponent implementation which provides support for spell checking
 * and suggestions using the Lucene contributed SpellChecker.
 *
 * <p>
 * Refer to http://wiki.apache.org/solr/SpellCheckComponent for more details
 * </p>
 *
 * @since solr 1.3
 */
public class SpellCheckComponent extends SearchComponent implements SolrCoreAware {
  private static final Logger LOG = Logger.getLogger(SpellCheckComponent.class.getName());

  // NOTE(review): this analyzer appears unused within this class -- confirm
  // before removing.
  private static WhitespaceAnalyzer whitespace = new WhitespaceAnalyzer();

  public static final boolean DEFAULT_ONLY_MORE_POPULAR = false;

  /**
   * Base name for all spell checker query parameters. This name is also used to
   * register this component with SearchHandler.
   */
  public static final String COMPONENT_NAME = "spellcheck";

  public static final String SPELLCHECK_PREFIX = "spellcheck.";

  /**
   * The name of the dictionary to be used for giving the suggestion for a
   * request. The value for this parameter is configured in solrconfig.xml
   */
  public static final String SPELLCHECK_DICT = SPELLCHECK_PREFIX + "dictionary";

  /**
   * The count of suggestions needed for a given query.
   * <p/>
   * If this parameter is absent in the request then only one suggestion is
   * returned. If it is more than one then a maximum of given suggestions are
   * returned for each token in the query.
   */
  public static final String SPELLCHECK_COUNT = SPELLCHECK_PREFIX + "count";

  /**
   * When this parameter is set to true and the misspelled word exists in the
   * user field, only words that occur more frequently in the Solr field than
   * the one given will be returned. The default value is false.
   * <p/>
   * <b>This is applicable only for dictionaries built from Solr fields.</b>
   */
  public static final String SPELLCHECK_ONLY_MORE_POPULAR = SPELLCHECK_PREFIX + "onlyMorePopular";

  /**
   * Whether to use the extended response format, which is more complicated but
   * richer. Returns the document frequency for each suggestion and returns one
   * suggestion block for each term in the query string. Default is false.
   * <p/>
   * <b>This is applicable only for dictionaries built from Solr fields.</b>
   */
  public static final String SPELLCHECK_EXTENDED_RESULTS = SPELLCHECK_PREFIX + "extendedResults";

  /**
   * Use the value for this parameter as the query to spell check.
   * <p/>
   * This parameter is <b>optional</b>. If absent, then the q parameter is
   * used.
   */
  public static final String SPELLCHECK_Q = SPELLCHECK_PREFIX + "q";

  /**
   * Whether to build the index or not. Optional and false by default.
   */
  public static final String SPELLCHECK_BUILD = SPELLCHECK_PREFIX + "build";

  /**
   * Whether to reload the index. Optional and false by default.
   */
  public static final String SPELLCHECK_RELOAD = SPELLCHECK_PREFIX + "reload";

  /**
   * Take the top suggestion for each token and create a new query from it
   */
  public static final String SPELLCHECK_COLLATE = SPELLCHECK_PREFIX + "collate";

  // Raw init args from solrconfig.xml; consumed in inform(SolrCore) because
  // spell checkers cannot be constructed until the core is available.
  @SuppressWarnings("unchecked")
  protected NamedList initParams;

  /**
   * Key is the dictionary, value is the SpellChecker for that dictionary name
   */
  protected Map<String, SolrSpellChecker> spellCheckers = new ConcurrentHashMap<String, SolrSpellChecker>();

  // Converts the raw CommonParams.Q string into tokens; wired up in inform().
  protected QueryConverter queryConverter;

  @Override
  @SuppressWarnings("unchecked")
  public void init(NamedList args) {
    super.init(args);
    // Just stash the args; the real initialization happens in inform().
    this.initParams = args;
  }

  /**
   * Handles the optional spellcheck.build / spellcheck.reload commands before
   * the main query runs. No-op unless the component is switched on.
   */
  @Override
  @SuppressWarnings("unchecked")
  public void prepare(ResponseBuilder rb) throws IOException {
    SolrParams params = rb.req.getParams();
    if (!params.getBool(COMPONENT_NAME, false)) {
      return;
    }
    // NOTE(review): getSpellChecker returns null for an unknown dictionary
    // name; unlike process(), there is no null check here, so build/reload
    // with a bad dictionary would NPE -- confirm whether that is intended.
    SolrSpellChecker spellChecker = getSpellChecker(params);
    if (params.getBool(SPELLCHECK_BUILD, false)) {
      spellChecker.build(rb.req.getCore());
      rb.rsp.add("command", "build");
    } else if (params.getBool(SPELLCHECK_RELOAD, false)) {
      spellChecker.reload();
      rb.rsp.add("command", "reload");
    }
  }

  /**
   * Produces the "spellcheck" section of the response: tokenizes the query
   * (spellcheck.q via the checker's analyzer, otherwise q via the configured
   * QueryConverter), asks the selected dictionary for suggestions, and adds
   * them to the response.
   */
  @Override
  @SuppressWarnings("unchecked")
  public void process(ResponseBuilder rb) throws IOException {
    SolrParams params = rb.req.getParams();
    if (!params.getBool(COMPONENT_NAME, false) || spellCheckers.isEmpty()) {
      return;
    }
    String q = params.get(SPELLCHECK_Q);
    SolrSpellChecker spellChecker = getSpellChecker(params);
    Collection<Token> tokens = null;
    if (q != null) {
      //we have a spell check param, tokenize it with the query analyzer applicable for this spellchecker
      tokens = getTokens(q, spellChecker.getQueryAnalyzer());
    } else {
      q = params.get(CommonParams.Q);
      tokens = queryConverter.convert(q);
    }
    if (tokens != null && tokens.isEmpty() == false) {
      if (spellChecker != null) {
        int count = params.getInt(SPELLCHECK_COUNT, 1);
        boolean onlyMorePopular = params.getBool(SPELLCHECK_ONLY_MORE_POPULAR,
            DEFAULT_ONLY_MORE_POPULAR);
        boolean extendedResults = params.getBool(SPELLCHECK_EXTENDED_RESULTS,
            false);
        NamedList response = new SimpleOrderedMap();
        IndexReader reader = rb.req.getSearcher().getReader();
        boolean collate = params.getBool(SPELLCHECK_COLLATE, false);
        SpellingResult spellingResult = spellChecker.getSuggestions(tokens, reader, count, onlyMorePopular,
            extendedResults);
        if (spellingResult != null) {
          response.add("suggestions", toNamedList(spellingResult, q, extendedResults, collate));
          rb.rsp.add("spellcheck", response);
        }
      } else {
        // Dictionary name was given but not registered in inform().
        throw new SolrException(SolrException.ErrorCode.NOT_FOUND,
            "Specified dictionary does not exist.");
      }
    }
  }

  /**
   * Tokenizes q with the given analyzer and collects all tokens.
   * Uses the (pre-2.9) TokenStream.next() API; the stream is not closed here.
   */
  private Collection<Token> getTokens(String q, Analyzer analyzer) throws IOException {
    Collection<Token> result = new ArrayList<Token>();
    Token token = null;
    TokenStream ts = analyzer.tokenStream("", new StringReader(q));
    while ((token = ts.next()) != null){
      result.add(token);
    }
    return result;
  }

  /**
   * Looks up the checker for spellcheck.dictionary, falling back to the
   * default dictionary name. May return null for an unregistered name.
   */
  protected SolrSpellChecker getSpellChecker(SolrParams params) {
    String dictName = params.get(SPELLCHECK_DICT);
    if (dictName == null) {
      dictName = SolrSpellChecker.DEFAULT_DICTIONARY_NAME;
    }
    return spellCheckers.get(dictName);
  }

  /**
   * Converts a SpellingResult into the response NamedList format, optionally
   * with extended (frequency) info and an overall "collation" query built by
   * splicing each token's best suggestion into the original query string.
   */
  protected NamedList toNamedList(SpellingResult spellingResult, String origQuery, boolean extendedResults, boolean collate) {
    NamedList result = new NamedList();
    Map<Token, LinkedHashMap<String, Integer>> suggestions = spellingResult.getSuggestions();
    boolean hasFreqInfo = spellingResult.hasTokenFrequencyInfo();
    boolean isCorrectlySpelled = true;
    Map<Token, String> best = null;
    if (collate == true){
      best = new HashMap<Token, String>(suggestions.size());
    }
    for (Map.Entry<Token, LinkedHashMap<String, Integer>> entry : suggestions.entrySet()) {
      Token inputToken = entry.getKey();
      Map<String, Integer> theSuggestions = entry.getValue();
      // Tokens with no suggestions are skipped entirely (also excluded from collation).
      if (theSuggestions != null && theSuggestions.size() > 0) {
        NamedList suggestionList = new NamedList();
        suggestionList.add("numFound", theSuggestions.size());
        suggestionList.add("startOffset", inputToken.startOffset());
        suggestionList.add("endOffset", inputToken.endOffset());
        if (extendedResults && hasFreqInfo) {
          suggestionList.add("origFreq", spellingResult.getTokenFrequency(inputToken));
          for (Map.Entry<String, Integer> suggEntry : theSuggestions.entrySet()) {
            SimpleOrderedMap<Object> suggestionItem = new SimpleOrderedMap<Object>();
            suggestionItem.add("frequency", suggEntry.getValue());
            suggestionItem.add("word", suggEntry.getKey());
            suggestionList.add("suggestion", suggestionItem);
          }
        } else {
          suggestionList.add("suggestion", theSuggestions.keySet());
        }
        if (collate == true ){//set aside the best suggestion for this token
          // The LinkedHashMap preserves ranking, so the first key is the best.
          best.put(inputToken, theSuggestions.keySet().iterator().next());
        }
        if (hasFreqInfo) {
          // A token with zero frequency in the index is considered misspelled.
          isCorrectlySpelled = isCorrectlySpelled && spellingResult.getTokenFrequency(inputToken) > 0;
        }
        result.add(new String(inputToken.termBuffer(), 0, inputToken.termLength()), suggestionList);
      }
    }
    if (hasFreqInfo) {
      result.add("correctlySpelled", isCorrectlySpelled);
    }
    if (collate == true){
      // Replace each token's span in the original query with its best suggestion.
      // NOTE(review): replacements use the original token offsets; earlier
      // replacements of different length shift later offsets -- confirm.
      StringBuilder collation = new StringBuilder(origQuery);
      for (Iterator<Map.Entry<Token, String>> bestIter = best.entrySet().iterator(); bestIter.hasNext();) {
        Map.Entry<Token, String> entry = bestIter.next();
        Token tok = entry.getKey();
        collation.replace(tok.startOffset(), tok.endOffset(), entry.getValue());
      }
      String collVal = collation.toString();
      if (collVal.equals(origQuery) == false) {
        LOG.fine("Collation:" + collation);
        result.add("collation", collVal);
      }
    }
    return result;
  }

  /**
   * SolrCoreAware callback: instantiates every configured &lt;lst
   * name="spellchecker"&gt; dictionary and exactly one queryConverter from
   * solrconfig.xml. Throws RuntimeException on duplicate unnamed dictionaries,
   * unloadable checker classes, or a queryConverter count other than one.
   */
  public void inform(SolrCore core) {
    if (initParams != null) {
      LOG.info("Initializing spell checkers");
      boolean hasDefault = false;
      for (int i = 0; i < initParams.size(); i++) {
        if (initParams.getName(i).equals("spellchecker")) {
          NamedList spellchecker = (NamedList) initParams.getVal(i);
          String className = (String) spellchecker.get("classname");
          // IndexBasedSpellChecker is the default implementation.
          if (className == null)
            className = IndexBasedSpellChecker.class.getName();
          SolrResourceLoader loader = core.getResourceLoader();
          SolrSpellChecker checker = (SolrSpellChecker) loader.newInstance(className);
          if (checker != null) {
            String dictionary = checker.init(spellchecker, loader);
            if (dictionary != null) {
              boolean isDefault = dictionary.equals(SolrSpellChecker.DEFAULT_DICTIONARY_NAME);
              if (isDefault == true && hasDefault == false){
                hasDefault = true;
              } else if (isDefault == true && hasDefault == true){
                throw new RuntimeException("More than one dictionary is missing name.");
              }
              spellCheckers.put(dictionary, checker);
            } else {
              // Unnamed checker becomes the default; only one is allowed.
              if (hasDefault == false){
                spellCheckers.put(SolrSpellChecker.DEFAULT_DICTIONARY_NAME, checker);
                hasDefault = true;
              } else {
                throw new RuntimeException("More than one dictionary is missing name.");
              }
            }
          } else {
            throw new RuntimeException("Can't load spell checker: " + className);
          }
        }
      }
      // Load the single configured queryConverter via XPath over solrconfig.xml.
      String xpath = "queryConverter";
      SolrConfig solrConfig = core.getSolrConfig();
      NodeList nodes = (NodeList) solrConfig.evaluate(xpath, XPathConstants.NODESET);
      Map<String, QueryConverter> queryConverters = new HashMap<String, QueryConverter>();
      NamedListPluginLoader<QueryConverter> loader =
          new NamedListPluginLoader<QueryConverter>("[solrconfig.xml] " + xpath, queryConverters);
      loader.load(solrConfig.getResourceLoader(), nodes);
      //there should only be one
      if (queryConverters.size() == 1) {
        queryConverter = queryConverters.values().iterator().next();
        IndexSchema schema = core.getSchema();
        String fieldTypeName = (String) initParams.get("queryAnalyzerFieldType");
        FieldType fieldType = schema.getFieldTypes().get(fieldTypeName);
        // Fall back to whitespace analysis if no fieldType was configured.
        Analyzer analyzer = fieldType == null ? new WhitespaceAnalyzer()
            : fieldType.getQueryAnalyzer();
        //TODO: There's got to be a better way! Where's Spring when you need it?
        queryConverter.setAnalyzer(analyzer);
      } else {
        //TODO: Is there a better way?
        throw new RuntimeException("One and only one queryConverter may be defined");
      }
    }
  }

  // ///////////////////////////////////////////
  // / SolrInfoMBean
  // //////////////////////////////////////////
  @Override
  public String getDescription() {
    return "A Spell Checker component";
  }

  @Override
  public String getVersion() {
    return "$Revision:$";
  }

  @Override
  public String getSourceId() {
    return "$Id:$";
  }

  @Override
  public String getSource() {
    return "$URL:$";
  }
}

View File

@ -0,0 +1,137 @@
package org.apache.solr.spelling;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.spell.Dictionary;
import org.apache.lucene.search.spell.SpellChecker;
import org.apache.lucene.search.spell.StringDistance;
import org.apache.lucene.search.spell.LevensteinDistance;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrResourceLoader;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
/**
 * Abstract base class for all Lucene based spell checking implementations.
 *
 * <p>
 * Refer to http://wiki.apache.org/solr/SpellCheckComponent for more details
 * </p>
 *
 * @since solr 1.3
 */
public abstract class AbstractLuceneSpellChecker extends SolrSpellChecker {
  public static final String SPELLCHECKER_ARG_NAME = "spellchecker";
  /** Config key: location of the source data (word file or Lucene index). */
  public static final String LOCATION = "sourceLocation";
  /** Config key: directory for the spelling index; a RAMDirectory is used when absent. */
  public static final String INDEX_DIR = "spellcheckIndexDir";
  /** Config key: minimum accuracy for suggestions (parsed by subclasses). */
  public static final String ACCURACY = "accuracy";
  /** Config key: fully qualified StringDistance implementation class name. */
  public static final String STRING_DISTANCE = "distanceMeasure";

  // Name of the Solr field backing the dictionary; may be null.
  protected String field;
  protected org.apache.lucene.search.spell.SpellChecker spellChecker;
  protected String sourceLocation;
  /*
   * The Directory containing the Spell checking index
   * */
  protected Directory index;
  protected Dictionary dictionary;

  public static final int DEFAULT_SUGGESTION_COUNT = 5;
  protected String indexDir;
  public static final String FIELD = "field";

  /**
   * Reads common config (index dir, source location, field, distance measure),
   * sets up the backing Directory and the Lucene SpellChecker.
   *
   * @return the dictionary name
   *         (the {@code name} field is inherited from SolrSpellChecker --
   *         presumably populated by {@code super.init}; confirm there)
   */
  public String init(NamedList config, SolrResourceLoader loader) {
    super.init(config, loader);
    indexDir = (String) config.get(INDEX_DIR);
    sourceLocation = (String) config.get(LOCATION);
    field = (String) config.get(FIELD);
    String strDistanceName = (String)config.get(STRING_DISTANCE);
    StringDistance sd = null;
    if (strDistanceName != null) {
      sd = (StringDistance) loader.newInstance(strDistanceName);
      //TODO: Figure out how to configure options. Where's Spring when you need it? Or at least BeanUtils...
    } else {
      // Levenstein edit distance is the default measure.
      sd = new LevensteinDistance();
    }
    try {
      initIndex();
      spellChecker = new SpellChecker(index, sd);
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
    return name;
  }

  /**
   * Gets suggestions for each input token from the Lucene SpellChecker,
   * optionally attaching document frequencies when extendedResults is set and
   * a reader/field are available.
   */
  @SuppressWarnings("unchecked")
  public SpellingResult getSuggestions(Collection<Token> tokens,
      IndexReader reader, int count, boolean onlyMorePopular,
      boolean extendedResults)
      throws IOException {
    SpellingResult result = new SpellingResult(tokens);
    // Subclasses may substitute their own reader (or null for file-based checkers).
    reader = determineReader(reader);
    Term term = field != null ? new Term(field, "") : null;
    for (Token token : tokens) {
      String tokenText = new String(token.termBuffer(), 0, token.termLength());
      // Always request at least DEFAULT_SUGGESTION_COUNT so ranking has room to work.
      String[] suggestions = spellChecker.suggestSimilar(tokenText, (int) Math.max(count, AbstractLuceneSpellChecker.DEFAULT_SUGGESTION_COUNT),
          field != null ? reader : null, //workaround LUCENE-1295
          field,
          onlyMorePopular);
      if (suggestions.length == 1 && suggestions[0].equals(tokenText)) {
        //These are spelled the same, continue on
        continue;
      }
      if (extendedResults == true && reader != null && field != null) {
        // Extended results: include docFreq for the original token and each suggestion.
        term = term.createTerm(tokenText);
        result.add(token, reader.docFreq(term));
        int countLimit = Math.min(count, suggestions.length);
        for (int i = 0; i < countLimit; i++) {
          term = term.createTerm(suggestions[i]);
          result.add(token, suggestions[i], reader.docFreq(term));
        }
      } else {
        if (suggestions.length > 0) {
          // Trim the over-fetched suggestion list back down to the requested count.
          List<String> suggList = Arrays.asList(suggestions);
          if (suggestions.length > count) {
            suggList = suggList.subList(0, count);
          }
          result.add(token, suggList);
        }
      }
    }
    return result;
  }

  /**
   * Hook for subclasses to substitute the IndexReader used for frequency
   * lookups; the default returns the supplied reader unchanged.
   */
  protected IndexReader determineReader(IndexReader reader) {
    return reader;
  }

  /** Re-points the Lucene SpellChecker at the (possibly rebuilt) index. */
  public void reload() throws IOException {
    spellChecker.setSpellIndex(index);
  }

  /**
   * Initialize the {@link #index} variable based on the {@link #indexDir}. Does not actually create the spelling index.
   *
   * @throws IOException
   */
  protected void initIndex() throws IOException {
    if (indexDir != null) {
      index = FSDirectory.getDirectory(indexDir);
    } else {
      // No directory configured: keep the spelling index in memory.
      index = new RAMDirectory();
    }
  }
}

View File

@ -0,0 +1,142 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.spelling;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.spell.PlainTextDictionary;
import org.apache.lucene.store.RAMDirectory;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.core.SolrResourceLoader;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.util.HighFrequencyDictionary;
/**
 * <p>
 * A spell checker implementation which can load words from a text
 * file (one word per line).
 * </p>
 *
 * @since solr 1.3
 **/
public class FileBasedSpellChecker extends AbstractLuceneSpellChecker {

  private static final Logger log = Logger.getLogger(FileBasedSpellChecker.class.getName());

  /** Config key: name of the schema fieldType whose analyzer is applied at build time. */
  public static final String FIELD_TYPE = "fieldType";

  /** Config key: character encoding of the source word file. */
  public static final String SOURCE_FILE_CHAR_ENCODING = "characterEncoding";

  private String fieldTypeName;
  private String characterEncoding;

  /** Name of the field that the words are indexed into in the temporary RAM index. */
  public static final String WORD_FIELD_NAME = "word";

  /**
   * Reads the fieldType and characterEncoding options on top of the common
   * config handled by the superclass.
   *
   * @return the dictionary name (inherited {@code name} field)
   */
  public String init(NamedList config, SolrResourceLoader loader) {
    super.init(config, loader);
    fieldTypeName = (String) config.get(FIELD_TYPE);
    characterEncoding = (String) config.get(SOURCE_FILE_CHAR_ENCODING);
    return name;
  }

  /**
   * (Re)builds the spelling index from the configured source file.
   * IOExceptions during indexing are wrapped in a RuntimeException.
   */
  public void build(SolrCore core) {
    try {
      loadExternalFileDictionary(core.getSchema(), core.getResourceLoader());
      spellChecker.clearIndex();
      spellChecker.indexDictionary(dictionary);
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  /**
   * Override to return null, since there is no reader associated with a file based index
   */
  @Override
  protected IndexReader determineReader(IndexReader reader) {
    return null;
  }

  /**
   * Loads the source word file into {@link #dictionary}. When a fieldType is
   * configured, the words are first analyzed into a temporary RAMDirectory
   * index and wrapped in a HighFrequencyDictionary; otherwise the file is used
   * directly as a PlainTextDictionary. Load failures are logged, not thrown.
   */
  @SuppressWarnings("unchecked")
  private void loadExternalFileDictionary(IndexSchema schema, SolrResourceLoader loader) {
    // Fix: removed a dead local IndexSearcher that was never assigned
    // (always null) together with the finally block that "closed" it.
    try {
      // Get the field's analyzer
      if (fieldTypeName != null
          && schema.getFieldTypeNoEx(fieldTypeName) != null) {
        FieldType fieldType = schema.getFieldTypes()
            .get(fieldTypeName);
        // Do index-time analysis using the given fieldType's analyzer
        RAMDirectory ramDir = new RAMDirectory();
        IndexWriter writer = new IndexWriter(ramDir, fieldType.getAnalyzer(),
            true, IndexWriter.MaxFieldLength.UNLIMITED);
        writer.setMergeFactor(300);
        writer.setMaxBufferedDocs(150);
        List<String> lines = loader.getLines(sourceLocation, characterEncoding);
        for (String s : lines) {
          Document d = new Document();
          d.add(new Field(WORD_FIELD_NAME, s, Field.Store.NO, Field.Index.TOKENIZED));
          writer.addDocument(d);
        }
        writer.optimize();
        writer.close();
        // NOTE(review): the IndexReader opened here stays open for the life of
        // the dictionary -- confirm it is released elsewhere.
        dictionary = new HighFrequencyDictionary(IndexReader.open(ramDir),
            WORD_FIELD_NAME, 0.0f);
        analyzer = fieldType.getQueryAnalyzer();
      } else {
        log.warning("No fieldType: " + fieldTypeName
            + " found for dictionary: " + name);
        analyzer = new WhitespaceAnalyzer();
        // check if character encoding is defined
        if (characterEncoding == null) {
          dictionary = new PlainTextDictionary(loader.openResource(sourceLocation));
        } else {
          dictionary = new PlainTextDictionary(new InputStreamReader(loader.openResource(sourceLocation), characterEncoding));
        }
      }
    } catch (IOException e) {
      log.log(Level.SEVERE, "Unable to load spellings", e);
    }
  }
}

View File

@ -0,0 +1,134 @@
package org.apache.solr.spelling;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.core.SolrResourceLoader;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.HighFrequencyDictionary;
import java.io.IOException;
import java.util.logging.Logger;
/**
 * <p>
 * A spell checker implementation which can load words from Solr as well as arbitrary Lucene indices.
 * </p>
 *
 * <p>
 * Refer to http://wiki.apache.org/solr/SpellCheckComponent for more details
 * </p>
 *
 * @since solr 1.3
 **/
public class IndexBasedSpellChecker extends AbstractLuceneSpellChecker {
  private static final Logger log = Logger.getLogger(IndexBasedSpellChecker.class.getName());

  /** Config key: minimum relative frequency for a term to enter the dictionary. */
  public static final String THRESHOLD_TOKEN_FREQUENCY = "thresholdTokenFrequency";

  protected float threshold;
  protected float accuracy = 0.5f;
  // Reader over the external source index; null when sourcing from Solr's own index.
  protected IndexReader reader;

  /**
   * Reads accuracy and threshold options on top of the common config, and
   * opens the external source index if one was configured.
   *
   * @return the dictionary name (inherited {@code name} field)
   */
  public String init(NamedList config, SolrResourceLoader loader) {
    super.init(config, loader);
    // Local deliberately shadows the accuracy field; parsed below.
    String accuracy = (String) config.get(ACCURACY);
    // NOTE(review): this unboxes a Float from the config; a String-typed value
    // would throw ClassCastException -- confirm the config parser's types.
    threshold = config.get(THRESHOLD_TOKEN_FREQUENCY) == null ? 0.0f
        : (Float) config.get(THRESHOLD_TOKEN_FREQUENCY);
    if (accuracy != null) {
      try {
        this.accuracy = Float.parseFloat(accuracy);
        spellChecker.setAccuracy(this.accuracy);
      } catch (NumberFormatException e) {
        throw new RuntimeException(
            "Unparseable accuracy given for dictionary: " + name, e);
      }
    }
    initSourceReader();
    return name;
  }

  /** Opens a reader over the external Lucene index at sourceLocation, if configured. */
  private void initSourceReader() {
    if (sourceLocation != null) {
      try {
        FSDirectory luceneIndexDir = FSDirectory.getDirectory(sourceLocation);
        this.reader = IndexReader.open(luceneIndexDir);
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }
  }

  /**
   * (Re)builds the spelling index from either Solr's own index or the
   * configured external Lucene index.
   */
  public void build(SolrCore core) {
    IndexReader reader = null;
    try {
      if (sourceLocation == null) {
        // Load from Solr's index
        // NOTE(review): the reference obtained from getSearcher() does not
        // appear to be released here -- confirm against SolrCore's contract.
        SolrIndexSearcher searcher = core.getSearcher().get();
        reader = searcher.getReader();
      } else {
        // Load from Lucene index at given sourceLocation
        reader = this.reader;
      }
      loadLuceneDictionary(core.getSchema(), reader);
      spellChecker.clearIndex();
      spellChecker.indexDictionary(dictionary);
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  /**
   * For an external source index, frequency lookups must go against that
   * index's reader rather than the one from the current request.
   */
  @Override
  protected IndexReader determineReader(IndexReader reader) {
    IndexReader result = null;
    if (sourceLocation != null) {
      result = this.reader;
    } else {
      result = reader;
    }
    return result;
  }

  /** Builds the HighFrequencyDictionary and picks the analyzer for the field. */
  @SuppressWarnings("unchecked")
  private void loadLuceneDictionary(IndexSchema schema, IndexReader reader) {
    // Create the dictionary
    dictionary = new HighFrequencyDictionary(reader, field,
        threshold);
    // Get the field's analyzer
    FieldType fieldType = schema.getFieldTypeNoEx(field);
    analyzer = fieldType == null ? new WhitespaceAnalyzer()
        : fieldType.getQueryAnalyzer();
  }

  @Override
  public void reload() throws IOException {
    super.reload();
    //reload the source
    initSourceReader();
  }
}

View File

@ -0,0 +1,61 @@
package org.apache.solr.spelling;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.util.plugin.NamedListInitializedPlugin;
import java.util.Collection;
/**
 * <p>
 * Abstract base class for converting a "raw" input query into the tokens that
 * will be spell checked. It is used to "parse" CommonParams.Q (the input
 * query) into tokens.
 * </p>
 *
 * <p>
 * It is only invoked for the CommonParams.Q parameter, and <b>not</b> the
 * "spellcheck.q" parameter. Systems that use their own query parser, or that
 * find issue with the basic implementation, should implement their own
 * QueryConverter instead of the provided SpellingQueryConverter — override the
 * appropriate methods and register the class in solrconfig.xml.
 * </p>
 *
 * <p>
 * Refer to http://wiki.apache.org/solr/SpellCheckComponent for more details
 * </p>
 *
 * @since solr 1.3
 */
public abstract class QueryConverter implements NamedListInitializedPlugin {
  /** Raw configuration handed to {@link #init(NamedList)}; retained for subclass reference. */
  private NamedList config;

  /** Analyzer run over extracted words; must be assigned via {@link #setAnalyzer} before {@link #convert}. */
  protected Analyzer analyzer;

  public void init(NamedList args) {
    this.config = args;
  }

  /**
   * Converts the original query into spell-checkable tokens.
   *
   * @param original the raw query string
   * @return The Collection of {@link org.apache.lucene.analysis.Token}s for
   *         the query. Offsets on each Token should correspond to the correct
   *         offset in the original query.
   */
  public abstract Collection<Token> convert(String original);

  /**
   * Set the analyzer to use. Must be set before any calls to convert.
   *
   * @param analyzer the analyzer to apply to extracted words
   */
  public void setAnalyzer(Analyzer analyzer) {
    this.analyzer = analyzer;
  }

  public Analyzer getAnalyzer() {
    return analyzer;
  }
}

View File

@ -0,0 +1,118 @@
package org.apache.solr.spelling;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.index.IndexReader;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.core.SolrResourceLoader;
import java.io.IOException;
import java.util.Collection;
/**
 * <p>
 * Base class for Solr spell-checking implementations. Concrete subclasses
 * supply the suggestion logic; this class handles dictionary naming and
 * provides convenience overloads of {@link #getSuggestions}.
 * </p>
 *
 * <p>
 * Refer to http://wiki.apache.org/solr/SpellCheckComponent for more details
 * </p>
 *
 * @since solr 1.3
 */
public abstract class SolrSpellChecker {
  public static final String DICTIONARY_NAME = "name";
  public static final String DEFAULT_DICTIONARY_NAME = "default";

  /** The dictionary name this checker was registered under. */
  protected String name;
  /** Analyzer used to tokenize the query prior to spell checking. */
  protected Analyzer analyzer;

  public String init(NamedList config, SolrResourceLoader loader) {
    String configuredName = (String) config.get(DICTIONARY_NAME);
    name = (configuredName == null) ? DEFAULT_DICTIONARY_NAME : configuredName;
    return name;
  }

  public Analyzer getQueryAnalyzer() {
    return analyzer;
  }

  public String getDictionaryName() {
    return name;
  }

  /**
   * Reload the index. Useful if an external process is responsible for building
   * the spell checker.
   *
   * @throws java.io.IOException
   */
  public abstract void reload() throws IOException;

  /**
   * (Re)build the spelling index. May be a NOOP if the implementation doesn't
   * require building, or can't be rebuilt.
   *
   * @param core The SolrCore
   */
  public abstract void build(SolrCore core);

  /**
   * Equivalent to the full method with count = 1, onlyMorePopular = false,
   * extendedResults = false.
   *
   * @see #getSuggestions(Collection, org.apache.lucene.index.IndexReader, int, boolean, boolean)
   */
  public SpellingResult getSuggestions(Collection<Token> tokens, IndexReader reader) throws IOException {
    return getSuggestions(tokens, reader, 1, false, false);
  }

  /**
   * Equivalent to the full method with onlyMorePopular = false,
   * extendedResults = false.
   *
   * @see #getSuggestions(Collection, org.apache.lucene.index.IndexReader, int, boolean, boolean)
   */
  public SpellingResult getSuggestions(Collection<Token> tokens, IndexReader reader, int count) throws IOException {
    return getSuggestions(tokens, reader, count, false, false);
  }

  /**
   * Equivalent to the full method with count = 1.
   *
   * @see #getSuggestions(Collection, org.apache.lucene.index.IndexReader, int, boolean, boolean)
   */
  public SpellingResult getSuggestions(Collection<Token> tokens, IndexReader reader, boolean onlyMorePopular, boolean extendedResults) throws IOException {
    return getSuggestions(tokens, reader, 1, onlyMorePopular, extendedResults);
  }

  /**
   * Get suggestions for the given tokens. The {@link SpellingResult#getSuggestions()}
   * suggestions must be ordered best suggestion first.
   *
   * @param tokens The Tokens to be spell checked.
   * @param reader The (optional) IndexReader. If there is no IndexReader, then extendedResults are not possible.
   * @param count The maximum number of suggestions to return
   * @param onlyMorePopular TODO
   * @param extendedResults TODO
   * @return the spelling result, ordered best suggestion first
   * @throws IOException
   */
  public abstract SpellingResult getSuggestions(Collection<Token> tokens, IndexReader reader, int count,
                                                boolean onlyMorePopular, boolean extendedResults)
          throws IOException;
}

View File

@ -0,0 +1,64 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.spelling;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
/**
 * Default {@link QueryConverter} implementation: extracts plain words from a
 * raw query string with a regular expression (skipping {@code field:} prefixes
 * and bare numbers), analyzes each word, and emits the resulting tokens with
 * offsets pointing back into the original query.
 *
 * @since solr 1.3
 **/
public class SpellingQueryConverter extends QueryConverter {
  // Matches a bare word, but not "field:" prefixes or pure numbers.
  protected Pattern QUERY_REGEX = Pattern.compile("(?:(?!(\\w+:|\\d+)))\\w+");

  public Collection<Token> convert(String original) {
    Collection<Token> result = new ArrayList<Token>();
    //TODO: Extract the words using a simple regex, but not query stuff, and then analyze them to produce the token stream
    Matcher matcher = QUERY_REGEX.matcher(original);
    TokenStream stream;
    while (matcher.find()) {
      String word = matcher.group(0);
      // Skip boolean operators: they are query syntax, not content to correct.
      if (!word.equals("AND") && !word.equals("OR")) {
        try {
          stream = analyzer.reusableTokenStream("", new StringReader(word));
          Token token;
          while ((token = stream.next()) != null) {
            // Point the token's offsets at the word's position in the raw
            // query, not its position inside the single-word analysis buffer.
            token.setStartOffset(matcher.start());
            token.setEndOffset(matcher.end());
            result.add(token);
          }
        } catch (IOException e) {
          // Analysis of an in-memory StringReader should not fail; if it does,
          // drop this word and keep converting the rest (best-effort behavior).
        }
      }
    }
    return result;
  }
}

View File

@ -0,0 +1,125 @@
package org.apache.solr.spelling;
import org.apache.lucene.analysis.Token;
import java.util.Collection;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.List;
/**
 * Implementations of SolrSpellChecker must return suggestions as a SpellingResult
 * instance. This is converted into the required NamedList format by
 * SpellCheckComponent.
 *
 * @since solr 1.3
 */
public class SpellingResult {
  /** The original query tokens that were spell checked. */
  private Collection<Token> tokens;

  /**
   * Key == token
   * Value = Map -> key is the suggestion, value is the frequency of the token in the collection
   */
  private Map<Token, LinkedHashMap<String, Integer>> suggestions = new LinkedHashMap<Token, LinkedHashMap<String, Integer>>();

  // Frequency of each input token in the index; null until add(Token, int) is called.
  private Map<Token, Integer> tokenFrequency;

  /** Sentinel frequency value meaning "no frequency information available". */
  public static final int NO_FREQUENCY_INFO = -1;

  public SpellingResult() {
  }

  public SpellingResult(Collection<Token> tokens) {
    this.tokens = tokens;
  }

  /**
   * Add a whole bunch of suggestions, with no frequency information.
   *
   * @param token The token to associate the suggestions with
   * @param suggestions The suggestions
   */
  public void add(Token token, List<String> suggestions) {
    LinkedHashMap<String, Integer> map = this.suggestions.get(token);
    if (map == null ) {
      map = new LinkedHashMap<String, Integer>();
      this.suggestions.put(token, map);
    }
    for (String suggestion : suggestions) {
      map.put(suggestion, NO_FREQUENCY_INFO);
    }
  }

  /**
   * Record the document frequency of an input token.
   *
   * @param token The token
   * @param docFreq The document frequency of the token in the index
   */
  public void add(Token token, int docFreq) {
    if (tokenFrequency == null) {
      tokenFrequency = new LinkedHashMap<Token, Integer>();
    }
    tokenFrequency.put(token, docFreq);
  }

  /**
   * Suggestions must be added with the best suggestion first. ORDER is important.
   *
   * @param token The {@link org.apache.lucene.analysis.Token}
   * @param suggestion The suggestion for the Token
   * @param docFreq The document frequency
   */
  public void add(Token token, String suggestion, int docFreq) {
    LinkedHashMap<String, Integer> map = this.suggestions.get(token);
    // Create the per-token map lazily; an existing map is appended to (and a
    // repeated suggestion simply has its frequency overwritten).
    if (map == null) {
      map = new LinkedHashMap<String, Integer>();
      this.suggestions.put(token, map);
    }
    map.put(suggestion, docFreq);
  }

  /**
   * Get the suggestions for the given token
   *
   * @param token The {@link org.apache.lucene.analysis.Token} to look up
   * @return A LinkedHashMap of the suggestions. Key is the suggestion, value is the token frequency in the index, else {@link #NO_FREQUENCY_INFO}.
   *
   * The suggestions are added in sorted order (i.e. best suggestion first), so the iterator returns the suggestions in order
   */
  public LinkedHashMap<String, Integer> get(Token token) {
    return suggestions.get(token);
  }

  /**
   * The token frequency of the input token in the collection
   *
   * @param token The token
   * @return The frequency or null
   */
  public Integer getTokenFrequency(Token token) {
    // Contract is "frequency or null"; guard against lookups made before any
    // frequency info has been recorded instead of throwing NPE.
    return tokenFrequency == null ? null : tokenFrequency.get(token);
  }

  public boolean hasTokenFrequencyInfo() {
    return tokenFrequency != null && !tokenFrequency.isEmpty();
  }

  /**
   * All the suggestions. The ordering of the inner LinkedHashMap is by best suggestion first.
   * @return The Map of suggestions for each Token. Key is the token, value is a LinkedHashMap whose key is the Suggestion and the value is the frequency or {@link #NO_FREQUENCY_INFO} if frequency info is not available.
   *
   */
  public Map<Token, LinkedHashMap<String, Integer>> getSuggestions() {
    return suggestions;
  }

  public Map<Token, Integer> getTokenFrequency() {
    return tokenFrequency;
  }

  /**
   * @return The original tokens
   */
  public Collection<Token> getTokens() {
    return tokens;
  }

  public void setTokens(Collection<Token> tokens) {
    this.tokens = tokens;
  }
}

View File

@ -22,10 +22,6 @@ import java.util.List;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.handler.component.FacetComponent;
import org.apache.solr.handler.component.MoreLikeThisComponent;
import org.apache.solr.handler.component.SearchComponent;
import org.apache.solr.handler.component.SearchHandler;
import org.apache.solr.util.AbstractSolrTestCase;

View File

@ -0,0 +1,287 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.handler.component;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.MapSolrParams;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.SimpleOrderedMap;
import org.apache.solr.core.SolrCore;
import org.apache.solr.request.LocalSolrQueryRequest;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.request.SolrQueryResponse;
import org.apache.solr.request.SolrRequestHandler;
import org.apache.solr.util.AbstractSolrTestCase;
import org.apache.solr.spelling.IndexBasedSpellChecker;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
/**
 * Tests for {@link SpellCheckComponent}: suggestion counts, extended results,
 * collation, correctly-spelled input, and component initialization.
 *
 * @since solr 1.3
 */
public class SpellCheckComponentTest extends AbstractSolrTestCase {
  @Override
  public String getSchemaFile() {
    return "schema.xml";
  }

  @Override
  public String getSolrConfigFile() {
    return "solrconfig.xml";
  }

  @Override
  public void setUp() throws Exception {
    super.setUp();
    assertU(adoc("id", "0", "lowerfilt", "This is a title"));
    assertU(adoc("id", "1", "lowerfilt",
        "The quick reb fox jumped over the lazy brown dogs."));
    assertU(adoc("id", "2", "lowerfilt", "This is a document"));
    assertU(adoc("id", "3", "lowerfilt", "another document"));
    //bunch of docs that are variants on blue
    assertU(adoc("id", "4", "lowerfilt", "blue"));
    assertU(adoc("id", "5", "lowerfilt", "blud"));
    assertU(adoc("id", "6", "lowerfilt", "boue"));
    assertU(adoc("id", "7", "lowerfilt", "glue"));
    assertU(adoc("id", "8", "lowerfilt", "blee"));
    assertU("commit", commit());
  }

  @SuppressWarnings("unchecked")
  public void testExtendedResultsCount() throws Exception {
    SolrCore core = h.getCore();
    SearchComponent speller = core.getSearchComponent("spellcheck");
    assertTrue("speller is null and it shouldn't be", speller != null);

    // First request: build the spelling index, ask for 5 plain suggestions.
    ModifiableSolrParams params = new ModifiableSolrParams();
    params.add(CommonParams.QT, "spellCheckCompRH");
    params.add(SpellCheckComponent.SPELLCHECK_BUILD, "true");
    params.add(CommonParams.Q, "bluo");
    params.add(SpellCheckComponent.COMPONENT_NAME, "true");
    params.add(SpellCheckComponent.SPELLCHECK_COUNT, String.valueOf(5));
    params.add(SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, String.valueOf(false));
    SolrRequestHandler handler = core.getRequestHandler("spellCheckCompRH");
    SolrQueryResponse rsp;
    rsp = new SolrQueryResponse();
    handler.handleRequest(new LocalSolrQueryRequest(core, params), rsp);
    NamedList values = rsp.getValues();
    String cmdExec = (String) values.get("command");
    assertTrue("command is null and it shouldn't be", cmdExec != null);
    assertTrue(cmdExec + " is not equal to " + "build",
        cmdExec.equals("build") == true);
    NamedList spellCheck = (NamedList) values.get("spellcheck");
    assertTrue("spellCheck is null and it shouldn't be", spellCheck != null);
    NamedList suggestions = (NamedList) spellCheck.get("suggestions");
    assertTrue("suggestions is null and it shouldn't be", suggestions != null);
    NamedList blue = (NamedList) suggestions.get("bluo");
    assertTrue(blue.get("numFound") + " is not equal to " + "5", blue
        .get("numFound").toString().equals("5") == true);
    // Without extended results the suggestions come back as a plain Collection.
    Collection<String> theSuggestion = (Collection<String>) blue.get("suggestion");
    assertTrue("theSuggestion is null and it shouldn't be: " + blue,
        theSuggestion != null);
    assertTrue("theSuggestion Size: " + theSuggestion.size() + " is not: " + 5,
        theSuggestion.size() == 5);

    // Second request: we know there are at least 5 suggestions, now only get 3,
    // this time with extended results (word + frequency pairs).
    params.remove(SpellCheckComponent.SPELLCHECK_COUNT);
    params.remove(SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS);
    params.remove(SpellCheckComponent.SPELLCHECK_BUILD);
    params.add(SpellCheckComponent.SPELLCHECK_COUNT, String.valueOf(3));
    params.add(SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, String.valueOf(true));
    params.add(SpellCheckComponent.SPELLCHECK_BUILD, "false");
    rsp = new SolrQueryResponse();
    handler.handleRequest(new LocalSolrQueryRequest(core, params), rsp);
    values = rsp.getValues();
    spellCheck = (NamedList) values.get("spellcheck");
    assertTrue("spellCheck is null and it shouldn't be", spellCheck != null);
    suggestions = (NamedList) spellCheck.get("suggestions");
    assertTrue("suggestions is null and it shouldn't be", suggestions != null);
    blue = (NamedList) suggestions.get("bluo");
    assertTrue(blue.get("numFound") + " is not equal to " + "3", blue
        .get("numFound").toString().equals("3") == true);
    // With extended results each "suggestion" entry is a SimpleOrderedMap
    // holding the word and its frequency; walk all three by index.
    SimpleOrderedMap theSuggestions;
    int idx = blue.indexOf("suggestion", 0);
    theSuggestions = (SimpleOrderedMap) blue.get("suggestion", idx);
    assertTrue("theSuggestion is null and it shouldn't be: " + blue,
        theSuggestions != null);
    assertTrue("theSuggestions Size: " + theSuggestions.size() + " is not: " + 2,
        theSuggestions.size() == 2);//the word and the frequency
    idx = blue.indexOf("suggestion", idx + 1);
    theSuggestions = (SimpleOrderedMap) blue.get("suggestion", idx);
    assertTrue("theSuggestion is null and it shouldn't be: " + blue,
        theSuggestions != null);
    assertTrue("theSuggestions Size: " + theSuggestions.size() + " is not: " + 2,
        theSuggestions.size() == 2);//the word and the frequency
    idx = blue.indexOf("suggestion", idx + 1);
    theSuggestions = (SimpleOrderedMap) blue.get("suggestion", idx);
    assertTrue("theSuggestion is null and it shouldn't be: " + blue,
        theSuggestions != null);
    assertTrue("theSuggestions Size: " + theSuggestions.size() + " is not: " + 2,
        theSuggestions.size() == 2);//the word and the frequency
    // Exactly three suggestion entries: no fourth index should exist.
    idx = blue.indexOf("suggestion", idx + 1);
    assertTrue(idx + " does not equal: " + -1, idx == -1);
  }

  @SuppressWarnings("unchecked")
  public void test() throws Exception {
    SolrCore core = h.getCore();
    SearchComponent speller = core.getSearchComponent("spellcheck");
    assertTrue("speller is null and it shouldn't be", speller != null);

    ModifiableSolrParams params = new ModifiableSolrParams();
    params.add(CommonParams.QT, "spellCheckCompRH");
    params.add(SpellCheckComponent.SPELLCHECK_BUILD, "true");
    params.add(CommonParams.Q, "documemt");
    params.add(SpellCheckComponent.COMPONENT_NAME, "true");
    SolrRequestHandler handler = core.getRequestHandler("spellCheckCompRH");
    SolrQueryResponse rsp = new SolrQueryResponse();
    handler.handleRequest(new LocalSolrQueryRequest(core, params), rsp);
    NamedList values = rsp.getValues();
    String cmdExec = (String) values.get("command");
    assertTrue("command is null and it shouldn't be", cmdExec != null);
    assertTrue(cmdExec + " is not equal to " + "build",
        cmdExec.equals("build") == true);
    NamedList spellCheck = (NamedList) values.get("spellcheck");
    assertTrue("spellCheck is null and it shouldn't be", spellCheck != null);
    NamedList suggestions = (NamedList) spellCheck.get("suggestions");
    assertTrue("suggestions is null and it shouldn't be", suggestions != null);
    NamedList document = (NamedList) suggestions.get("documemt");
    assertTrue(document.get("numFound") + " is not equal to " + "1", document
        .get("numFound").toString().equals("1") == true);
    // Offsets must point back into the original query string.
    assertTrue(document.get("startOffset") + " is not equal to " + "0", document
        .get("startOffset").toString().equals("0") == true);
    assertTrue(document.get("endOffset") + " is not equal to " + "documemt".length(), document
        .get("endOffset").toString().equals(String.valueOf("documemt".length())) == true);
    Collection<String> theSuggestion = (Collection<String>) document.get("suggestion");
    assertTrue("theSuggestion is null and it shouldn't be: " + document,
        theSuggestion != null);
    assertTrue("theSuggestion Size: " + theSuggestion.size() + " is not: " + 1,
        theSuggestion.size() == 1);
    assertTrue(theSuggestion.iterator().next() + " is not equal to " + "document", theSuggestion.iterator().next().equals("document") == true);
  }

  public void testCollate() throws Exception {
    SolrCore core = h.getCore();
    SearchComponent speller = core.getSearchComponent("spellcheck");
    assertTrue("speller is null and it shouldn't be", speller != null);

    // Single misspelled word: collation is just the corrected word.
    ModifiableSolrParams params = new ModifiableSolrParams();
    params.add(CommonParams.QT, "spellCheckCompRH");
    params.add(SpellCheckComponent.SPELLCHECK_BUILD, "true");
    params.add(CommonParams.Q, "documemt");
    params.add(SpellCheckComponent.COMPONENT_NAME, "true");
    params.add(SpellCheckComponent.SPELLCHECK_COLLATE, "true");
    SolrRequestHandler handler = core.getRequestHandler("spellCheckCompRH");
    SolrQueryResponse rsp = new SolrQueryResponse();
    handler.handleRequest(new LocalSolrQueryRequest(core, params), rsp);
    NamedList values = rsp.getValues();
    NamedList spellCheck = (NamedList) values.get("spellcheck");
    assertTrue("spellCheck is null and it shouldn't be", spellCheck != null);
    NamedList suggestions = (NamedList) spellCheck.get("suggestions");
    assertTrue("suggestions is null and it shouldn't be", suggestions != null);
    String collation = (String) suggestions.get("collation");
    assertTrue("collation is null and it shouldn't be", collation != null);
    assertTrue(collation + " is not equal to " + "document", collation.equals("document") == true);

    // Query with field prefix and boost: the collation must preserve the
    // surrounding query syntax and only replace the misspelled words.
    params.remove(CommonParams.Q);
    params.add(CommonParams.Q, "documemt lowerfilt:broen^4");
    handler = core.getRequestHandler("spellCheckCompRH");
    rsp = new SolrQueryResponse();
    handler.handleRequest(new LocalSolrQueryRequest(core, params), rsp);
    values = rsp.getValues();
    spellCheck = (NamedList) values.get("spellcheck");
    assertTrue("spellCheck is null and it shouldn't be", spellCheck != null);
    suggestions = (NamedList) spellCheck.get("suggestions");
    assertTrue("suggestions is null and it shouldn't be", suggestions != null);
    collation = (String) suggestions.get("collation");
    assertTrue("collation is null and it shouldn't be", collation != null);
    assertTrue(collation + " is not equal to " + "document lowerfilt:brown^4", collation.equals("document lowerfilt:brown^4") == true);
  }

  public void testCorrectSpelling() throws Exception {
    SolrCore core = h.getCore();
    Map<String, String> args = new HashMap<String, String>();

    args.put(CommonParams.Q, "lowerfilt:lazy lowerfilt:brown");
    args.put(CommonParams.QT, "spellCheckCompRH");
    args.put(SpellCheckComponent.SPELLCHECK_BUILD, "true");
    args.put(SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, "true");
    args.put(SpellCheckComponent.COMPONENT_NAME, "true");
    SolrQueryRequest req = new LocalSolrQueryRequest(core, new MapSolrParams(
        args));

    assertQ("Make sure correct spellings are signalled in the response", req,
        "//*[@numFound='1']", "//result/doc[1]/int[@name='id'][.='1']",
        "//*/lst[@name='suggestions']");
  }

  public void testInit() throws Exception {
    SolrCore core = h.getCore();
    SpellCheckComponent scc = new SpellCheckComponent();
    NamedList args = new NamedList();
    NamedList spellchecker = new NamedList();
    spellchecker.add("classname", IndexBasedSpellChecker.class.getName());
    spellchecker.add("name", "default");
    spellchecker.add("field", "lowerfilt");
    spellchecker.add("spellcheckIndexDir", "./spellchecker");
    args.add("spellchecker", spellchecker);
    NamedList altSC = new NamedList();
    altSC.add("classname", IndexBasedSpellChecker.class.getName());
    altSC.add("name", "alternate");
    altSC.add("field", "lowerfilt");
    altSC.add("spellcheckIndexDir", "./spellchecker");
    args.add("spellchecker", altSC);
    args.add("queryAnalyzerFieldType", "lowerfilt");
    NamedList defaults = new NamedList();
    defaults.add(SpellCheckComponent.SPELLCHECK_COLLATE, true);
    defaults.add(SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, false);
    defaults.add(SpellCheckComponent.SPELLCHECK_COUNT, 2);
    args.add("defaults", defaults);
    scc.init(args);
    scc.inform(core);
    //hmm, not sure what to assert here...

    //add the sc again and then init again, we should get an exception
    args.add("spellchecker", spellchecker);
    scc = new SpellCheckComponent();
    scc.init(args);
    try {
      scc.inform(core);
      fail("SpellCheckComponent should reject two spellcheckers registered under the same name");
    } catch (Exception e) {
      // expected: duplicate dictionary name "default"
    }
  }

  // TODO: add more tests for various spelling options
}

View File

@ -0,0 +1,174 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.spelling;
import org.apache.solr.util.AbstractSolrTestCase;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.analysis.Token;
import java.io.File;
import java.util.Date;
import java.util.Map;
import java.util.Collection;
/**
 * Tests for FileBasedSpellChecker, which builds its dictionary from a flat
 * word file ("spellings.txt") rather than an index — with and without an
 * analyzing field type, and with an on-disk vs. in-memory spelling index.
 *
 * @since solr 1.3
 **/
public class FileBasedSpellCheckerTest extends AbstractSolrTestCase{
  public String getSchemaFile() { return "schema.xml"; }

  public String getSolrConfigFile() { return "solrconfig.xml"; }

  // Converts raw query strings into Tokens to feed getSuggestions().
  private SpellingQueryConverter queryConverter;

  @Override
  public void setUp() throws Exception {
    super.setUp();
    //Index something with a title
    assertU(adoc("id", "0", "teststop", "This is a title"));
    assertU(adoc("id", "1", "teststop", "The quick reb fox jumped over the lazy brown dogs."));
    assertU(adoc("id", "2", "teststop", "This is a Solr"));
    assertU(adoc("id", "3", "teststop", "solr foo"));
    assertU("commit",
            commit());
    String allq = "id:[0 TO 3]";
    assertQ("docs not added", req(allq));
    queryConverter = new SimpleQueryConverter();
    queryConverter.init(new NamedList());
  }

  /**
   * Build a dictionary from the external word file with an on-disk spelling
   * index. Suggestions should carry NO_FREQUENCY_INFO since there is no
   * source index to count term frequencies in.
   */
  public void test() throws Exception {
    FileBasedSpellChecker checker = new FileBasedSpellChecker();
    NamedList spellchecker = new NamedList();
    spellchecker.add("classname", FileBasedSpellChecker.class.getName());
    spellchecker.add(SolrSpellChecker.DICTIONARY_NAME, "external");
    // NOTE(review): resolved relative to the working directory — assumes the
    // test harness runs with spellings.txt on that path; confirm.
    File spelling = new File("spellings.txt");
    spellchecker.add(AbstractLuceneSpellChecker.LOCATION, spelling.getAbsolutePath());
    spellchecker.add(IndexBasedSpellChecker.FIELD, "teststop");
    spellchecker.add(FileBasedSpellChecker.SOURCE_FILE_CHAR_ENCODING, "UTF-8");
    // Unique temp dir per run so stale spelling indexes don't interfere.
    File tmpDir = new File(System.getProperty("java.io.tmpdir"));
    File indexDir = new File(tmpDir, "spellingIdx" + new Date().getTime());
    indexDir.mkdirs(); // NOTE(review): return value ignored; a failure would surface later in build()
    spellchecker.add(FileBasedSpellChecker.INDEX_DIR, indexDir.getAbsolutePath());
    SolrCore core = h.getCore();
    String dictName = checker.init(spellchecker, core.getResourceLoader());
    assertTrue(dictName + " is not equal to " + "external", dictName.equals("external") == true);
    checker.build(core);

    IndexReader reader = core.getSearcher().get().getReader();
    Collection<Token> tokens = queryConverter.convert("fob");
    SpellingResult result = checker.getSuggestions(tokens, reader);
    assertTrue("result is null and it shouldn't be", result != null);
    Map<String, Integer> suggestions = result.get(tokens.iterator().next());
    Map.Entry<String, Integer> entry = suggestions.entrySet().iterator().next();
    assertTrue(entry.getKey() + " is not equal to " + "foo", entry.getKey().equals("foo") == true);
    // File-based dictionaries have no term statistics.
    assertTrue(entry.getValue() + " does not equal: " + SpellingResult.NO_FREQUENCY_INFO, entry.getValue() == SpellingResult.NO_FREQUENCY_INFO);

    // A correctly spelled word yields no suggestion entry at all.
    tokens = queryConverter.convert("super");
    result = checker.getSuggestions(tokens, reader);
    assertTrue("result is null and it shouldn't be", result != null);
    suggestions = result.get(tokens.iterator().next());
    assertTrue("suggestions is not null and it should be", suggestions == null);
  }

  /**
   * Same as test(), but configures a field type so the dictionary words are
   * run through that field's (lowercasing) analyzer before indexing.
   */
  public void testFieldType() throws Exception {
    FileBasedSpellChecker checker = new FileBasedSpellChecker();
    NamedList spellchecker = new NamedList();
    spellchecker.add("classname", FileBasedSpellChecker.class.getName());
    spellchecker.add(SolrSpellChecker.DICTIONARY_NAME, "external");
    File spelling = new File("spellings.txt");
    spellchecker.add(AbstractLuceneSpellChecker.LOCATION, spelling.getAbsolutePath());
    spellchecker.add(IndexBasedSpellChecker.FIELD, "teststop");
    spellchecker.add(FileBasedSpellChecker.SOURCE_FILE_CHAR_ENCODING, "UTF-8");
    File tmpDir = new File(System.getProperty("java.io.tmpdir"));
    File indexDir = new File(tmpDir, "spellingIdx" + new Date().getTime());
    indexDir.mkdirs(); // NOTE(review): return value ignored
    spellchecker.add(FileBasedSpellChecker.INDEX_DIR, indexDir.getAbsolutePath());
    spellchecker.add(FileBasedSpellChecker.FIELD_TYPE, "teststop");
    // The config NamedList is registered under SPELLCHECKER_ARG_NAME inside
    // itself — apparently the layout init() expects; confirm against
    // AbstractLuceneSpellChecker before restructuring.
    spellchecker.add(AbstractLuceneSpellChecker.SPELLCHECKER_ARG_NAME, spellchecker);
    SolrCore core = h.getCore();
    String dictName = checker.init(spellchecker, core.getResourceLoader());
    assertTrue(dictName + " is not equal to " + "external", dictName.equals("external") == true);
    checker.build(core);

    IndexReader reader = core.getSearcher().get().getReader();
    Collection<Token> tokens = queryConverter.convert("Solar");
    SpellingResult result = checker.getSuggestions(tokens, reader);
    assertTrue("result is null and it shouldn't be", result != null);
    //should be lowercased, b/c we are using a lowercasing analyzer
    Map<String, Integer> suggestions = result.get(tokens.iterator().next());
    assertTrue("suggestions Size: " + suggestions.size() + " is not: " + 1, suggestions.size() == 1);
    Map.Entry<String, Integer> entry = suggestions.entrySet().iterator().next();
    assertTrue(entry.getKey() + " is not equal to " + "solr", entry.getKey().equals("solr") == true);
    assertTrue(entry.getValue() + " does not equal: " + SpellingResult.NO_FREQUENCY_INFO, entry.getValue() == SpellingResult.NO_FREQUENCY_INFO);

    //test something not in the spell checker
    tokens = queryConverter.convert("super");
    result = checker.getSuggestions(tokens, reader);
    assertTrue("result is null and it shouldn't be", result != null);
    suggestions = result.get(tokens.iterator().next());
    assertTrue("suggestions is not null and it should be", suggestions == null);
  }

  /**
   * No indexDir location set, so the spelling index is held in a RAMDirectory.
   * @throws Exception
   */
  public void testRAMDirectory() throws Exception {
    FileBasedSpellChecker checker = new FileBasedSpellChecker();
    NamedList spellchecker = new NamedList();
    spellchecker.add("classname", FileBasedSpellChecker.class.getName());
    spellchecker.add(SolrSpellChecker.DICTIONARY_NAME, "external");
    File spelling = new File("spellings.txt");
    spellchecker.add(AbstractLuceneSpellChecker.LOCATION, spelling.getAbsolutePath());
    spellchecker.add(FileBasedSpellChecker.SOURCE_FILE_CHAR_ENCODING, "UTF-8");
    spellchecker.add(IndexBasedSpellChecker.FIELD, "teststop");
    spellchecker.add(FileBasedSpellChecker.FIELD_TYPE, "teststop");
    // Same self-referential registration as in testFieldType(); see note there.
    spellchecker.add(AbstractLuceneSpellChecker.SPELLCHECKER_ARG_NAME, spellchecker);
    SolrCore core = h.getCore();
    String dictName = checker.init(spellchecker, core.getResourceLoader());
    assertTrue(dictName + " is not equal to " + "external", dictName.equals("external") == true);
    checker.build(core);

    IndexReader reader = core.getSearcher().get().getReader();
    Collection<Token> tokens = queryConverter.convert("solar");
    SpellingResult result = checker.getSuggestions(tokens, reader);
    assertTrue("result is null and it shouldn't be", result != null);
    //should be lowercased, b/c we are using a lowercasing analyzer
    Map<String, Integer> suggestions = result.get(tokens.iterator().next());
    assertTrue("suggestions Size: " + suggestions.size() + " is not: " + 1, suggestions.size() == 1);
    Map.Entry<String, Integer> entry = suggestions.entrySet().iterator().next();
    assertTrue(entry.getKey() + " is not equal to " + "solr", entry.getKey().equals("solr") == true);
    assertTrue(entry.getValue() + " does not equal: " + SpellingResult.NO_FREQUENCY_INFO, entry.getValue() == SpellingResult.NO_FREQUENCY_INFO);

    tokens = queryConverter.convert("super");
    result = checker.getSuggestions(tokens, reader);
    assertTrue("result is null and it shouldn't be", result != null);
    suggestions = result.get(tokens.iterator().next());
    assertTrue("suggestions is not null and it should be", suggestions == null);
  }
}

View File

@ -0,0 +1,298 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.spelling;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.search.spell.JaroWinklerDistance;
import org.apache.lucene.search.spell.SpellChecker;
import org.apache.lucene.search.spell.StringDistance;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.util.AbstractSolrTestCase;
import java.io.File;
import java.util.Collection;
import java.util.Date;
import java.util.Map;
/**
* @since solr 1.3
*/
/**
 * Tests {@link IndexBasedSpellChecker}: building a spelling dictionary from an
 * indexed Solr field, extended (frequency) results, pluggable string-distance
 * measures, and building from an alternate standalone Lucene index.
 *
 * @since solr 1.3
 */
public class IndexBasedSpellCheckerTest extends AbstractSolrTestCase {
  protected SpellingQueryConverter queryConverter;

  // Fixture titles; "reb" is an intentional misspelling and "bun"/"bud" give
  // the word "bug" two nearby suggestions.
  protected static String[] DOCS = new String[]{
      "This is a title",
      "The quick reb fox jumped over the lazy brown dogs.",
      "This is a document",
      "another document",
      "red fox",
      "green bun",
      "green bud"
  };

  public String getSchemaFile() {
    return "schema.xml";
  }

  public String getSolrConfigFile() {
    return "solrconfig.xml";
  }

  @Override
  public void setUp() throws Exception {
    super.setUp();
    //Index something with a title
    for (int i = 0; i < DOCS.length; i++) {
      assertU(adoc("id", String.valueOf(i), "title", DOCS[i]));
    }
    assertU("commit",
        commit());
    String allq = "id:[0 TO 3]";
    assertQ("docs not added", req(allq));
    queryConverter = new SimpleQueryConverter();
  }

  /**
   * Basic suggestion behavior: misspelled, unknown, correctly spelled, and
   * multi-suggestion inputs against a dictionary built from the "title" field.
   */
  public void testSpelling() throws Exception {
    IndexBasedSpellChecker checker = new IndexBasedSpellChecker();
    NamedList spellchecker = new NamedList();
    spellchecker.add("classname", IndexBasedSpellChecker.class.getName());
    File tmpDir = new File(System.getProperty("java.io.tmpdir"));
    File indexDir = new File(tmpDir, "spellingIdx" + new Date().getTime());
    indexDir.mkdirs();
    spellchecker.add(AbstractLuceneSpellChecker.INDEX_DIR, indexDir.getAbsolutePath());
    spellchecker.add(IndexBasedSpellChecker.FIELD, "title");
    spellchecker.add(AbstractLuceneSpellChecker.SPELLCHECKER_ARG_NAME, spellchecker);
    SolrCore core = h.getCore();
    String dictName = checker.init(spellchecker, core.getResourceLoader());
    assertEquals(dictName + " is not equal to " + SolrSpellChecker.DEFAULT_DICTIONARY_NAME,
        SolrSpellChecker.DEFAULT_DICTIONARY_NAME, dictName);
    checker.build(core);
    IndexReader reader = core.getSearcher().get().getReader();
    Collection<Token> tokens = queryConverter.convert("documemt");
    SpellingResult result = checker.getSuggestions(tokens, reader);
    assertNotNull("result is null and it shouldn't be", result);
    //should be lowercased, b/c we are using a lowercasing analyzer
    Map<String, Integer> suggestions = result.get(tokens.iterator().next());
    assertNotNull("documemt is null and it shouldn't be", suggestions);
    assertEquals("documemt Size: " + suggestions.size() + " is not: " + 1, 1, suggestions.size());
    Map.Entry<String, Integer> entry = suggestions.entrySet().iterator().next();
    assertTrue(entry.getKey() + " is not equal to " + "document", entry.getKey().equals("document"));
    assertTrue(entry.getValue() + " does not equal: " + SpellingResult.NO_FREQUENCY_INFO,
        entry.getValue() == SpellingResult.NO_FREQUENCY_INFO);
    //test something not in the spell checker
    tokens = queryConverter.convert("super");
    result = checker.getSuggestions(tokens, reader);
    assertNotNull("result is null and it shouldn't be", result);
    suggestions = result.get(tokens.iterator().next());
    assertNull("suggestions is not null and it should be", suggestions);
    //test something that is spelled correctly
    tokens = queryConverter.convert("document");
    result = checker.getSuggestions(tokens, reader);
    assertNotNull("result is null and it shouldn't be", result);
    suggestions = result.get(tokens.iterator().next());
    assertNull("suggestions is null and it shouldn't be", suggestions);
    //Has multiple possibilities, but the exact exists, so that should be returned
    tokens = queryConverter.convert("red");
    result = checker.getSuggestions(tokens, reader, 2);
    assertNotNull("result is null and it shouldn't be", result);
    suggestions = result.get(tokens.iterator().next());
    assertNull("suggestions is not null and it should be", suggestions);
    //Try out something which should have multiple suggestions
    tokens = queryConverter.convert("bug");
    result = checker.getSuggestions(tokens, reader, 2);
    assertNotNull("result is null and it shouldn't be", result);
    suggestions = result.get(tokens.iterator().next());
    assertNotNull("suggestions is null and it shouldn't be", suggestions);
    assertEquals("suggestions Size: " + suggestions.size() + " is not: " + 2, 2, suggestions.size());
    // BUG FIX: the original asked entrySet() for a *fresh* iterator twice, so
    // it inspected the first suggestion two times and never checked the
    // second. Iterate the entry set so every suggestion is verified.
    for (Map.Entry<String, Integer> suggestion : suggestions.entrySet()) {
      assertFalse(suggestion.getKey() + " is equal to " + "bug and it shouldn't be",
          suggestion.getKey().equals("bug"));
      assertTrue(suggestion.getValue() + " does not equal: " + SpellingResult.NO_FREQUENCY_INFO,
          suggestion.getValue() == SpellingResult.NO_FREQUENCY_INFO);
    }
  }

  /**
   * With extendedResults=true the suggestion map carries real index
   * frequencies instead of {@link SpellingResult#NO_FREQUENCY_INFO}.
   */
  public void testExtendedResults() throws Exception {
    IndexBasedSpellChecker checker = new IndexBasedSpellChecker();
    NamedList spellchecker = new NamedList();
    spellchecker.add("classname", IndexBasedSpellChecker.class.getName());
    File tmpDir = new File(System.getProperty("java.io.tmpdir"));
    File indexDir = new File(tmpDir, "spellingIdx" + new Date().getTime());
    indexDir.mkdirs();
    spellchecker.add(AbstractLuceneSpellChecker.INDEX_DIR, indexDir.getAbsolutePath());
    spellchecker.add(IndexBasedSpellChecker.FIELD, "title");
    spellchecker.add(AbstractLuceneSpellChecker.SPELLCHECKER_ARG_NAME, spellchecker);
    SolrCore core = h.getCore();
    String dictName = checker.init(spellchecker, core.getResourceLoader());
    assertEquals(dictName + " is not equal to " + SolrSpellChecker.DEFAULT_DICTIONARY_NAME,
        SolrSpellChecker.DEFAULT_DICTIONARY_NAME, dictName);
    checker.build(core);
    IndexReader reader = core.getSearcher().get().getReader();
    Collection<Token> tokens = queryConverter.convert("documemt");
    SpellingResult result = checker.getSuggestions(tokens, reader, 1, false, true);
    assertNotNull("result is null and it shouldn't be", result);
    //should be lowercased, b/c we are using a lowercasing analyzer
    Map<String, Integer> suggestions = result.get(tokens.iterator().next());
    assertNotNull("documemt is null and it shouldn't be", suggestions);
    assertEquals("documemt Size: " + suggestions.size() + " is not: " + 1, 1, suggestions.size());
    Map.Entry<String, Integer> entry = suggestions.entrySet().iterator().next();
    assertTrue(entry.getKey() + " is not equal to " + "document", entry.getKey().equals("document"));
    // "document" occurs in two fixture docs, so the frequency is 2.
    assertTrue(entry.getValue() + " does not equal: " + 2, entry.getValue() == 2);
    //test something not in the spell checker
    tokens = queryConverter.convert("super");
    result = checker.getSuggestions(tokens, reader, 1, false, true);
    assertNotNull("result is null and it shouldn't be", result);
    suggestions = result.get(tokens.iterator().next());
    assertNull("suggestions is not null and it should be", suggestions);
    tokens = queryConverter.convert("document");
    result = checker.getSuggestions(tokens, reader, 1, false, true);
    assertNotNull("result is null and it shouldn't be", result);
    suggestions = result.get(tokens.iterator().next());
    assertNull("suggestions is not null and it should be", suggestions);
  }

  // Exposes the protected Lucene SpellChecker so the distance-measure test can
  // inspect it.
  private class TestSpellChecker extends IndexBasedSpellChecker{
    public SpellChecker getSpellChecker(){
      return spellChecker;
    }
  }

  /**
   * Configuring {@link AbstractLuceneSpellChecker#STRING_DISTANCE} plugs an
   * alternate {@link StringDistance} (JaroWinkler) into the spell checker.
   */
  public void testAlternateDistance() throws Exception {
    TestSpellChecker checker = new TestSpellChecker();
    NamedList spellchecker = new NamedList();
    spellchecker.add("classname", IndexBasedSpellChecker.class.getName());
    File tmpDir = new File(System.getProperty("java.io.tmpdir"));
    File indexDir = new File(tmpDir, "spellingIdx" + new Date().getTime());
    indexDir.mkdirs();
    spellchecker.add(AbstractLuceneSpellChecker.INDEX_DIR, indexDir.getAbsolutePath());
    spellchecker.add(IndexBasedSpellChecker.FIELD, "title");
    spellchecker.add(AbstractLuceneSpellChecker.SPELLCHECKER_ARG_NAME, spellchecker);
    spellchecker.add(AbstractLuceneSpellChecker.STRING_DISTANCE, JaroWinklerDistance.class.getName());
    SolrCore core = h.getCore();
    String dictName = checker.init(spellchecker, core.getResourceLoader());
    assertEquals(dictName + " is not equal to " + SolrSpellChecker.DEFAULT_DICTIONARY_NAME,
        SolrSpellChecker.DEFAULT_DICTIONARY_NAME, dictName);
    checker.build(core);
    SpellChecker sc = checker.getSpellChecker();
    assertNotNull("sc is null and it shouldn't be", sc);
    StringDistance sd = sc.getStringDistance();
    assertNotNull("sd is null and it shouldn't be", sd);
    assertTrue("sd is not an instance of " + JaroWinklerDistance.class.getName(),
        sd instanceof JaroWinklerDistance);
  }

  /**
   * The dictionary can be sourced from a standalone Lucene index (the
   * {@link AbstractLuceneSpellChecker#LOCATION} option) rather than the Solr
   * index; suggestions then come only from that alternate index.
   */
  public void testAlternateLocation() throws Exception {
    String[] ALT_DOCS = new String[]{
        "jumpin jack flash",
        "Sargent Peppers Lonely Hearts Club Band",
        "Born to Run",
        "Thunder Road",
        "Londons Burning",
        "A Horse with No Name",
        "Sweet Caroline"
    };
    IndexBasedSpellChecker checker = new IndexBasedSpellChecker();
    NamedList spellchecker = new NamedList();
    spellchecker.add("classname", IndexBasedSpellChecker.class.getName());
    File tmpDir = new File(System.getProperty("java.io.tmpdir"));
    File indexDir = new File(tmpDir, "spellingIdx" + new Date().getTime());
    //create a standalone index
    File altIndexDir = new File(tmpDir, "alternateIdx" + new Date().getTime());
    IndexWriter iw = new IndexWriter(altIndexDir, new WhitespaceAnalyzer(), IndexWriter.MaxFieldLength.LIMITED);
    for (int i = 0; i < ALT_DOCS.length; i++) {
      Document doc = new Document();
      doc.add(new Field("title", ALT_DOCS[i], Field.Store.YES, Field.Index.TOKENIZED));
      iw.addDocument(doc);
    }
    iw.optimize();
    iw.close();
    indexDir.mkdirs();
    spellchecker.add(AbstractLuceneSpellChecker.INDEX_DIR, indexDir.getAbsolutePath());
    spellchecker.add(AbstractLuceneSpellChecker.LOCATION, altIndexDir.getAbsolutePath());
    spellchecker.add(IndexBasedSpellChecker.FIELD, "title");
    spellchecker.add(AbstractLuceneSpellChecker.SPELLCHECKER_ARG_NAME, spellchecker);
    SolrCore core = h.getCore();
    String dictName = checker.init(spellchecker, core.getResourceLoader());
    assertEquals(dictName + " is not equal to " + SolrSpellChecker.DEFAULT_DICTIONARY_NAME,
        SolrSpellChecker.DEFAULT_DICTIONARY_NAME, dictName);
    checker.build(core);
    IndexReader reader = core.getSearcher().get().getReader();
    Collection<Token> tokens = queryConverter.convert("flesh");
    SpellingResult result = checker.getSuggestions(tokens, reader, 1, false, true);
    assertNotNull("result is null and it shouldn't be", result);
    //should be lowercased, b/c we are using a lowercasing analyzer
    Map<String, Integer> suggestions = result.get(tokens.iterator().next());
    assertNotNull("flesh is null and it shouldn't be", suggestions);
    assertEquals("flesh Size: " + suggestions.size() + " is not: " + 1, 1, suggestions.size());
    Map.Entry<String, Integer> entry = suggestions.entrySet().iterator().next();
    assertTrue(entry.getKey() + " is not equal to " + "flash", entry.getKey().equals("flash"));
    assertTrue(entry.getValue() + " does not equal: " + 1, entry.getValue() == 1);
    //test something not in the spell checker
    tokens = queryConverter.convert("super");
    result = checker.getSuggestions(tokens, reader, 1, false, true);
    assertNotNull("result is null and it shouldn't be", result);
    suggestions = result.get(tokens.iterator().next());
    assertNull("suggestions is not null and it should be", suggestions);
    // "Caroline" exists in the alternate index, so no correction is offered.
    tokens = queryConverter.convert("Caroline");
    result = checker.getSuggestions(tokens, reader, 1, false, true);
    assertNotNull("result is null and it shouldn't be", result);
    suggestions = result.get(tokens.iterator().next());
    assertNull("suggestions is not null and it should be", suggestions);
  }
}

View File

@ -0,0 +1,49 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.spelling;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import java.util.Collection;
import java.util.HashSet;
import java.io.StringReader;
import java.io.IOException;
/**
*
* @since solr 1.3
**/
/**
 * Test-only {@link SpellingQueryConverter} that simply whitespace-tokenizes
 * the raw query string and returns the resulting tokens, performing none of
 * the query-syntax stripping of the parent class.
 *
 * @since solr 1.3
 **/
class SimpleQueryConverter extends SpellingQueryConverter{
  @Override
  public Collection<Token> convert(String origQuery) {
    Collection<Token> tokens = new HashSet<Token>();
    TokenStream stream =
        new WhitespaceAnalyzer().tokenStream("", new StringReader(origQuery));
    try {
      // Drain the stream; next() returns null once the input is exhausted.
      for (Token token = stream.next(); token != null; token = stream.next()) {
        tokens.add(token);
      }
    } catch (IOException ioe) {
      throw new RuntimeException(ioe);
    }
    return tokens;
  }
}

View File

@ -0,0 +1,53 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.spelling;
import java.util.Collection;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.util.AbstractSolrTestCase;
/**
*
* @since solr 1.3
**/
/**
 * Verifies that {@link SpellingQueryConverter} strips query markup (here the
 * "field:" prefix) and produces exactly one token for a single-term query.
 *
 * @since solr 1.3
 **/
public class SpellingQueryConverterTest extends AbstractSolrTestCase {
  public String getSchemaFile() {
    return "schema.xml";
  }

  public String getSolrConfigFile() {
    return "solrconfig.xml";
  }

  public void test() throws Exception {
    SpellingQueryConverter converter = new SpellingQueryConverter();
    converter.init(new NamedList());
    converter.setAnalyzer(new WhitespaceAnalyzer());
    Collection<Token> tokens = converter.convert("field:foo");
    assertNotNull("tokens is null and it shouldn't be", tokens);
    assertEquals("tokens Size: " + tokens.size() + " is not: " + 1, 1, tokens.size());
  }
}

View File

@ -136,7 +136,7 @@
-->
<maxBooleanClauses>1024</maxBooleanClauses>
<!-- Cache specification for Filters or DocSets - unordered set of *all* documents
that match a particular query.
-->
@ -281,7 +281,7 @@
<requestHandler name="test" class="solr.tst.TestRequestHandler" />
<!-- test query parameter defaults -->
<!-- test query parameter defaults -->
<requestHandler name="defaults" class="solr.StandardRequestHandler">
<lst name="defaults">
<int name="rows">4</int>
@ -289,8 +289,8 @@
<str name="hl.fl">text,name,subject,title,whitetok</str>
</lst>
</requestHandler>
<!-- test query parameter defaults -->
<!-- test query parameter defaults -->
<requestHandler name="lazy" class="solr.StandardRequestHandler" startup="lazy">
<lst name="defaults">
<int name="rows">4</int>
@ -307,7 +307,7 @@
<str name="queryFieldType">string</str>
<str name="config-file">elevate.xml</str>
</searchComponent>
<requestHandler name="/elevate" class="org.apache.solr.handler.component.SearchHandler">
<lst name="defaults">
<str name="echoParams">explicit</str>
@ -316,7 +316,51 @@
<str>elevate</str>
</arr>
</requestHandler>
<searchComponent name="spellcheck" class="org.apache.solr.handler.component.SpellCheckComponent">
<lst name="defaults">
<!-- omp = Only More Popular -->
<str name="spellcheck.onlyMorePopular">false</str>
<!-- exr = Extended Results -->
<str name="spellcheck.extendedResults">false</str>
<!-- The number of suggestions to return -->
<str name="spellcheck.count">1</str>
</lst>
<str name="queryAnalyzerFieldType">lowerfilt</str>
<lst name="spellchecker">
<str name="name">default</str>
<str name="field">lowerfilt</str>
<str name="spellcheckIndexDir">./spellchecker</str>
</lst>
<lst name="spellchecker">
<str name="name">jarowinkler</str>
<str name="field">lowerfilt</str>
<!-- Use a different Distance Measure -->
<str name="distanceMeasure">org.apache.lucene.search.spell.JaroWinklerDistance</str>
<str name="spellcheckIndexDir">./spellchecker</str>
</lst>
<lst name="spellchecker">
<str name="classname">solr.FileBasedSpellChecker</str>
<str name="name">external</str>
<str name="sourceLocation">spellings.txt</str>
<str name="characterEncoding">UTF-8</str>
<str name="spellcheckIndexDir">./spellchecker</str>
</lst>
</searchComponent>
<!--
The SpellingQueryConverter to convert raw (CommonParams.Q) queries into tokens. Uses a simple regular expression
to strip off field markup, boosts, ranges, etc. but it is not guaranteed to match an exact parse from the query parser.
-->
<queryConverter name="queryConverter" class="org.apache.solr.spelling.SpellingQueryConverter"/>
<requestHandler name="spellCheckCompRH" class="org.apache.solr.handler.component.SearchHandler">
<arr name="last-components">
<str>spellcheck</str>
</arr>
</requestHandler>
<highlighting>
<!-- Configure the standard fragmenter -->
@ -325,13 +369,13 @@
<int name="hl.fragsize">100</int>
</lst>
</fragmenter>
<fragmenter name="regex" class="org.apache.solr.highlight.RegexFragmenter">
<lst name="defaults">
<int name="hl.fragsize">70</int>
</lst>
</fragmenter>
<!-- Configure the standard formatter -->
<formatter name="html" class="org.apache.solr.highlight.HtmlFormatter" default="true">
<lst name="defaults">

View File

@ -0,0 +1,16 @@
foo
bar
Solr
junk
foo
bar
Solr
junk
foo
bar
Solr
junk
foo
bar
Solr
junk