mirror of https://github.com/apache/lucene.git
SOLR-572: Added SpellCheckComponent functionality.
git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@669485 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
200f1ddd7c
commit
1cd74a0531
|
@ -291,6 +291,11 @@ New Features
|
|||
from SolrDocuments. (Noble Paul via ryan)
|
||||
|
||||
52. SOLR-595: Add support for Field level boosting in the MoreLikeThis Handler. (Tom Morton, gsingers)
|
||||
|
||||
53. SOLR-572: Added SpellCheckComponent and org.apache.solr.spelling package to support more spell checking functionality.
|
||||
Also includes ability to add your own SolrSpellChecker implementation that plugs in.
|
||||
See http://wiki.apache.org/solr/SpellCheckComponent for more details
|
||||
(Shalin Shekhar Mangar, Bojan Smid, gsingers)
|
||||
|
||||
Changes in runtime behavior
|
||||
1. SOLR-559: use Lucene updateDocument, deleteDocuments methods. This
|
||||
|
|
|
@ -204,6 +204,18 @@
|
|||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
|
||||
<!--
|
||||
Setup simple analysis for spell checking
|
||||
-->
|
||||
<fieldType name="textSpell" class="solr.TextField" positionIncrementGap="100" >
|
||||
<analyzer>
|
||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
<filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
<!-- This is an example of using the KeywordTokenizer along
|
||||
With various TokenFilterFactories to produce a sortable field
|
||||
that does not include some properties of the source text
|
||||
|
@ -303,7 +315,7 @@
|
|||
-->
|
||||
<field name="timestamp" type="date" indexed="true" stored="true" default="NOW" multiValued="false"/>
|
||||
|
||||
|
||||
<field name="spell" type="textSpell" indexed="true" stored="true" multiValued="true"/>
|
||||
<!-- Dynamic field definitions. If a field name is not found, dynamicFields
|
||||
will be used if the name matches any of the patterns.
|
||||
RESTRICTION: the glob-like pattern in the name attribute must have
|
||||
|
@ -357,6 +369,8 @@
|
|||
|
||||
<copyField source="manu" dest="manu_exact"/>
|
||||
|
||||
<copyField source="name" dest="spell"/>
|
||||
|
||||
<!-- Similarity is the scoring routine for each document vs. a query.
|
||||
A custom similarity may be specified here, but the default is fine
|
||||
for most applications. -->
|
||||
|
|
|
@ -489,6 +489,50 @@
|
|||
|
||||
</requestHandler>
|
||||
|
||||
|
||||
<searchComponent name="spellcheck" class="org.apache.solr.handler.component.SpellCheckComponent">
|
||||
<lst name="defaults">
|
||||
<!-- omp = Only More Popular -->
|
||||
<str name="spellcheck.onlyMorePopular">false</str>
|
||||
<!-- exr = Extended Results -->
|
||||
<str name="spellcheck.extendedResults">false</str>
|
||||
<!-- The number of suggestions to return -->
|
||||
<str name="spellcheck.count">1</str>
|
||||
</lst>
|
||||
<str name="queryAnalyzerFieldType">textSpell</str>
|
||||
|
||||
<lst name="spellchecker">
|
||||
<str name="name">default</str>
|
||||
<str name="field">spell</str>
|
||||
<str name="spellcheckIndexDir">./spellchecker</str>
|
||||
|
||||
</lst>
|
||||
<lst name="spellchecker">
|
||||
<str name="name">jarowinkler</str>
|
||||
<str name="field">spell</str>
|
||||
<!-- Use a different Distance Measure -->
|
||||
<str name="distanceMeasure">org.apache.lucene.search.spell.JaroWinklerDistance</str>
|
||||
<str name="spellcheckIndexDir">./spellchecker</str>
|
||||
|
||||
</lst>
|
||||
|
||||
<!--<lst name="spellchecker">
|
||||
<str name="classname">solr.FileBasedSpellChecker</str>
|
||||
<str name="name">external</str>
|
||||
<str name="sourceLocation">spellings.txt</str>
|
||||
<str name="characterEncoding">UTF-8</str>
|
||||
<str name="indexDir">./spellchecker</str>
|
||||
</lst>-->
|
||||
</searchComponent>
|
||||
|
||||
<queryConverter name="queryConverter" class="org.apache.solr.spelling.SpellingQueryConverter"/>
|
||||
|
||||
<requestHandler name="/spellCheckCompRH" class="org.apache.solr.handler.component.SearchHandler">
|
||||
<arr name="last-components">
|
||||
<str>spellcheck</str>
|
||||
</arr>
|
||||
</requestHandler>
|
||||
|
||||
<requestHandler name="/mlt" class="solr.MoreLikeThisHandler">
|
||||
<lst name="defaults">
|
||||
<str name="mlt.fl">manu,cat</str>
|
||||
|
|
|
@ -1,2 +1,2 @@
|
|||
AnyObjectId[44e2dcdcc7d1e8c24b7941a45763e3f20310dbd6] was removed in git history.
|
||||
AnyObjectId[630ded90301495ffdb2d5a69d656c697b54eae4a] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -1,2 +1,2 @@
|
|||
AnyObjectId[2c421c5bf65f2838b0ba387f95a55dc0b3d81936] was removed in git history.
|
||||
AnyObjectId[b7de7debdb89fb00ddcd969e6459059de38a0066] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -1,2 +1,2 @@
|
|||
AnyObjectId[f9600dd6bdf7be48acd3a47bfb4142349d63dc88] was removed in git history.
|
||||
AnyObjectId[7ac77ded12c4e71ebb2dd8c7d8b5d49372823b59] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -1,2 +1,2 @@
|
|||
AnyObjectId[c9e70e326acaf4a0633800a52a4c4950ec43b6e7] was removed in git history.
|
||||
AnyObjectId[5b27a2cc32d635fdd8477a878f4c04eacd6df812] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -1,2 +1,2 @@
|
|||
AnyObjectId[db82b130fbe7ea944104ae3f9888c6561ce2914d] was removed in git history.
|
||||
AnyObjectId[c5b004d8a86cd3d702634fe02e75ab95939ef4a6] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -1,2 +1,2 @@
|
|||
AnyObjectId[fea1bb71eacb6cb69c030d67334a2013de53b3ce] was removed in git history.
|
||||
AnyObjectId[eaf9f26f79727c84b0c28cdc7a3b52534e543eaf] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -32,6 +32,7 @@ import java.util.HashMap;
|
|||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.logging.Logger;
|
||||
import java.nio.charset.Charset;
|
||||
|
||||
import javax.naming.Context;
|
||||
import javax.naming.InitialContext;
|
||||
|
@ -48,6 +49,7 @@ import org.apache.solr.request.SolrRequestHandler;
|
|||
import org.apache.solr.schema.FieldType;
|
||||
import org.apache.solr.util.plugin.ResourceLoaderAware;
|
||||
import org.apache.solr.util.plugin.SolrCoreAware;
|
||||
import org.apache.solr.spelling.SpellingQueryConverter;
|
||||
|
||||
/**
|
||||
* @since solr 1.3
|
||||
|
@ -58,13 +60,14 @@ public class SolrResourceLoader implements ResourceLoader
|
|||
|
||||
static final String project = "solr";
|
||||
static final String base = "org.apache" + "." + project;
|
||||
static final String[] packages = {"","analysis.","schema.","handler.","search.","update.","core.","request.","update.processor.","util."};
|
||||
static final String[] packages = {"","analysis.","schema.","handler.","search.","update.","core.","request.","update.processor.","util.", "spelling."};
|
||||
|
||||
private final ClassLoader classLoader;
|
||||
private final String instanceDir;
|
||||
|
||||
private final List<SolrCoreAware> waitingForCore = new ArrayList<SolrCoreAware>();
|
||||
private final List<ResourceLoaderAware> waitingForResources = new ArrayList<ResourceLoaderAware>();
|
||||
private static final Charset UTF_8 = Charset.forName("UTF-8");
|
||||
|
||||
/**
|
||||
* <p>
|
||||
|
@ -184,13 +187,33 @@ public class SolrResourceLoader implements ResourceLoader
|
|||
* @throws IOException
|
||||
*/
|
||||
public List<String> getLines(String resource) throws IOException {
|
||||
return getLines(resource, UTF_8);
|
||||
}
|
||||
|
||||
/**
|
||||
* Accesses a resource by name and returns the (non comment) lines containing
|
||||
* data using the given character encoding.
|
||||
*
|
||||
* <p>
|
||||
* A comment line is any line that starts with the character "#"
|
||||
* </p>
|
||||
*
|
||||
* @param resource the file to be read
|
||||
* @param encoding
|
||||
* @return a list of non-blank non-comment lines with whitespace trimmed
|
||||
* @throws IOException
|
||||
*/
|
||||
public List<String> getLines(String resource,
|
||||
String encoding) throws IOException {
|
||||
return getLines(resource, Charset.forName(encoding));
|
||||
}
|
||||
|
||||
|
||||
public List<String> getLines(String resource, Charset charset) throws IOException{
|
||||
BufferedReader input = null;
|
||||
try {
|
||||
// TODO - allow configurable charset?
|
||||
input = new BufferedReader(new InputStreamReader(openResource(resource), "UTF-8"));
|
||||
} catch (UnsupportedEncodingException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
input = new BufferedReader(new InputStreamReader(openResource(resource),
|
||||
charset));
|
||||
|
||||
ArrayList<String> lines = new ArrayList<String>();
|
||||
for (String word=null; (word=input.readLine())!=null;) {
|
||||
// skip comments
|
||||
|
|
|
@ -0,0 +1,371 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.handler.component;
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.WhitespaceAnalyzer;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.solr.common.SolrException;
|
||||
import org.apache.solr.common.params.CommonParams;
|
||||
import org.apache.solr.common.params.SolrParams;
|
||||
import org.apache.solr.common.util.NamedList;
|
||||
import org.apache.solr.common.util.SimpleOrderedMap;
|
||||
import org.apache.solr.core.SolrConfig;
|
||||
import org.apache.solr.core.SolrCore;
|
||||
import org.apache.solr.core.SolrResourceLoader;
|
||||
import org.apache.solr.schema.FieldType;
|
||||
import org.apache.solr.schema.IndexSchema;
|
||||
import org.apache.solr.spelling.IndexBasedSpellChecker;
|
||||
import org.apache.solr.spelling.SolrSpellChecker;
|
||||
import org.apache.solr.spelling.SpellingResult;
|
||||
import org.apache.solr.spelling.QueryConverter;
|
||||
import org.apache.solr.util.plugin.NamedListPluginLoader;
|
||||
import org.apache.solr.util.plugin.SolrCoreAware;
|
||||
import org.w3c.dom.NodeList;
|
||||
|
||||
import javax.xml.xpath.XPathConstants;
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.logging.Logger;
|
||||
|
||||
/**
|
||||
* A SearchComponent implementation which provides support for spell checking
|
||||
* and suggestions using the Lucene contributed SpellChecker.
|
||||
*
|
||||
* <p>
|
||||
* Refer to http://wiki.apache.org/solr/SpellCheckComponent for more details
|
||||
* </p>
|
||||
*
|
||||
* @since solr 1.3
|
||||
*/
|
||||
public class SpellCheckComponent extends SearchComponent implements SolrCoreAware {
|
||||
private static final Logger LOG = Logger.getLogger(SpellCheckComponent.class.getName());
|
||||
|
||||
private static WhitespaceAnalyzer whitespace = new WhitespaceAnalyzer();
|
||||
|
||||
public static final boolean DEFAULT_ONLY_MORE_POPULAR = false;
|
||||
|
||||
/**
|
||||
* Base name for all spell checker query parameters. This name is also used to
|
||||
* register this component with SearchHandler.
|
||||
*/
|
||||
public static final String COMPONENT_NAME = "spellcheck";
|
||||
|
||||
public static final String SPELLCHECK_PREFIX = "spellcheck.";
|
||||
|
||||
/**
|
||||
* The name of the dictionary to be used for giving the suggestion for a
|
||||
* request. The value for this parameter is configured in solrconfig.xml
|
||||
*/
|
||||
public static final String SPELLCHECK_DICT = SPELLCHECK_PREFIX + "dictionary";
|
||||
|
||||
/**
|
||||
* The count of suggestions needed for a given query.
|
||||
* <p/>
|
||||
* If this parameter is absent in the request then only one suggestion is
|
||||
* returned. If it is more than one then a maximum of given suggestions are
|
||||
* returned for each token in the query.
|
||||
*/
|
||||
public static final String SPELLCHECK_COUNT = SPELLCHECK_PREFIX + "count";
|
||||
|
||||
/**
|
||||
* When this parameter is set to true and the misspelled word exists in the
|
||||
* user field, only words that occur more frequently in the Solr field than
|
||||
* the one given will be returned. The default value is false.
|
||||
* <p/>
|
||||
* <b>This is applicable only for dictionaries built from Solr fields.</b>
|
||||
*/
|
||||
public static final String SPELLCHECK_ONLY_MORE_POPULAR = SPELLCHECK_PREFIX + "onlyMorePopular";
|
||||
|
||||
/**
|
||||
* Whether to use the extended response format, which is more complicated but
|
||||
* richer. Returns the document frequency for each suggestion and returns one
|
||||
* suggestion block for each term in the query string. Default is false.
|
||||
* <p/>
|
||||
* <b>This is applicable only for dictionaries built from Solr fields.</b>
|
||||
*/
|
||||
public static final String SPELLCHECK_EXTENDED_RESULTS = SPELLCHECK_PREFIX + "extendedResults";
|
||||
|
||||
/**
|
||||
* Use the value for this parameter as the query to spell check.
|
||||
* <p/>
|
||||
* This parameter is <b>optional</b>. If absent, then the q parameter is
|
||||
* used.
|
||||
*/
|
||||
public static final String SPELLCHECK_Q = SPELLCHECK_PREFIX + "q";
|
||||
|
||||
/**
|
||||
* Whether to build the index or not. Optional and false by default.
|
||||
*/
|
||||
public static final String SPELLCHECK_BUILD = SPELLCHECK_PREFIX + "build";
|
||||
|
||||
/**
|
||||
* Whether to reload the index. Optional and false by default.
|
||||
*/
|
||||
public static final String SPELLCHECK_RELOAD = SPELLCHECK_PREFIX + "reload";
|
||||
|
||||
/**
|
||||
* Take the top suggestion for each token and create a new query from it
|
||||
*/
|
||||
public static final String SPELLCHECK_COLLATE = SPELLCHECK_PREFIX + "collate";
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
protected NamedList initParams;
|
||||
|
||||
/**
|
||||
* Key is the dictionary, value is the SpellChecker for that dictionary name
|
||||
*/
|
||||
protected Map<String, SolrSpellChecker> spellCheckers = new ConcurrentHashMap<String, SolrSpellChecker>();
|
||||
|
||||
protected QueryConverter queryConverter;
|
||||
|
||||
@Override
|
||||
@SuppressWarnings("unchecked")
|
||||
public void init(NamedList args) {
|
||||
super.init(args);
|
||||
this.initParams = args;
|
||||
}
|
||||
|
||||
@Override
|
||||
@SuppressWarnings("unchecked")
|
||||
public void prepare(ResponseBuilder rb) throws IOException {
|
||||
|
||||
SolrParams params = rb.req.getParams();
|
||||
if (!params.getBool(COMPONENT_NAME, false)) {
|
||||
return;
|
||||
}
|
||||
SolrSpellChecker spellChecker = getSpellChecker(params);
|
||||
if (params.getBool(SPELLCHECK_BUILD, false)) {
|
||||
spellChecker.build(rb.req.getCore());
|
||||
rb.rsp.add("command", "build");
|
||||
} else if (params.getBool(SPELLCHECK_RELOAD, false)) {
|
||||
spellChecker.reload();
|
||||
rb.rsp.add("command", "reload");
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
@SuppressWarnings("unchecked")
|
||||
public void process(ResponseBuilder rb) throws IOException {
|
||||
SolrParams params = rb.req.getParams();
|
||||
if (!params.getBool(COMPONENT_NAME, false) || spellCheckers.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
String q = params.get(SPELLCHECK_Q);
|
||||
SolrSpellChecker spellChecker = getSpellChecker(params);
|
||||
Collection<Token> tokens = null;
|
||||
if (q != null) {
|
||||
//we have a spell check param, tokenize it with the query analyzer applicable for this spellchecker
|
||||
tokens = getTokens(q, spellChecker.getQueryAnalyzer());
|
||||
} else {
|
||||
q = params.get(CommonParams.Q);
|
||||
tokens = queryConverter.convert(q);
|
||||
}
|
||||
if (tokens != null && tokens.isEmpty() == false) {
|
||||
if (spellChecker != null) {
|
||||
int count = params.getInt(SPELLCHECK_COUNT, 1);
|
||||
boolean onlyMorePopular = params.getBool(SPELLCHECK_ONLY_MORE_POPULAR,
|
||||
DEFAULT_ONLY_MORE_POPULAR);
|
||||
boolean extendedResults = params.getBool(SPELLCHECK_EXTENDED_RESULTS,
|
||||
false);
|
||||
NamedList response = new SimpleOrderedMap();
|
||||
IndexReader reader = rb.req.getSearcher().getReader();
|
||||
boolean collate = params.getBool(SPELLCHECK_COLLATE, false);
|
||||
SpellingResult spellingResult = spellChecker.getSuggestions(tokens, reader, count, onlyMorePopular,
|
||||
extendedResults);
|
||||
if (spellingResult != null) {
|
||||
response.add("suggestions", toNamedList(spellingResult, q, extendedResults, collate));
|
||||
rb.rsp.add("spellcheck", response);
|
||||
}
|
||||
|
||||
} else {
|
||||
throw new SolrException(SolrException.ErrorCode.NOT_FOUND,
|
||||
"Specified dictionary does not exist.");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private Collection<Token> getTokens(String q, Analyzer analyzer) throws IOException {
|
||||
Collection<Token> result = new ArrayList<Token>();
|
||||
Token token = null;
|
||||
TokenStream ts = analyzer.tokenStream("", new StringReader(q));
|
||||
while ((token = ts.next()) != null){
|
||||
result.add(token);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
protected SolrSpellChecker getSpellChecker(SolrParams params) {
|
||||
String dictName = params.get(SPELLCHECK_DICT);
|
||||
if (dictName == null) {
|
||||
dictName = SolrSpellChecker.DEFAULT_DICTIONARY_NAME;
|
||||
}
|
||||
return spellCheckers.get(dictName);
|
||||
}
|
||||
|
||||
protected NamedList toNamedList(SpellingResult spellingResult, String origQuery, boolean extendedResults, boolean collate) {
|
||||
NamedList result = new NamedList();
|
||||
Map<Token, LinkedHashMap<String, Integer>> suggestions = spellingResult.getSuggestions();
|
||||
boolean hasFreqInfo = spellingResult.hasTokenFrequencyInfo();
|
||||
boolean isCorrectlySpelled = true;
|
||||
Map<Token, String> best = null;
|
||||
if (collate == true){
|
||||
best = new HashMap<Token, String>(suggestions.size());
|
||||
}
|
||||
for (Map.Entry<Token, LinkedHashMap<String, Integer>> entry : suggestions.entrySet()) {
|
||||
Token inputToken = entry.getKey();
|
||||
Map<String, Integer> theSuggestions = entry.getValue();
|
||||
if (theSuggestions != null && theSuggestions.size() > 0) {
|
||||
NamedList suggestionList = new NamedList();
|
||||
suggestionList.add("numFound", theSuggestions.size());
|
||||
suggestionList.add("startOffset", inputToken.startOffset());
|
||||
suggestionList.add("endOffset", inputToken.endOffset());
|
||||
if (extendedResults && hasFreqInfo) {
|
||||
suggestionList.add("origFreq", spellingResult.getTokenFrequency(inputToken));
|
||||
for (Map.Entry<String, Integer> suggEntry : theSuggestions.entrySet()) {
|
||||
SimpleOrderedMap<Object> suggestionItem = new SimpleOrderedMap<Object>();
|
||||
suggestionItem.add("frequency", suggEntry.getValue());
|
||||
suggestionItem.add("word", suggEntry.getKey());
|
||||
suggestionList.add("suggestion", suggestionItem);
|
||||
}
|
||||
} else {
|
||||
suggestionList.add("suggestion", theSuggestions.keySet());
|
||||
}
|
||||
if (collate == true ){//set aside the best suggestion for this token
|
||||
best.put(inputToken, theSuggestions.keySet().iterator().next());
|
||||
}
|
||||
if (hasFreqInfo) {
|
||||
isCorrectlySpelled = isCorrectlySpelled && spellingResult.getTokenFrequency(inputToken) > 0;
|
||||
}
|
||||
result.add(new String(inputToken.termBuffer(), 0, inputToken.termLength()), suggestionList);
|
||||
}
|
||||
}
|
||||
if (hasFreqInfo) {
|
||||
result.add("correctlySpelled", isCorrectlySpelled);
|
||||
}
|
||||
if (collate == true){
|
||||
StringBuilder collation = new StringBuilder(origQuery);
|
||||
for (Iterator<Map.Entry<Token, String>> bestIter = best.entrySet().iterator(); bestIter.hasNext();) {
|
||||
Map.Entry<Token, String> entry = bestIter.next();
|
||||
Token tok = entry.getKey();
|
||||
collation.replace(tok.startOffset(), tok.endOffset(), entry.getValue());
|
||||
}
|
||||
String collVal = collation.toString();
|
||||
if (collVal.equals(origQuery) == false) {
|
||||
LOG.fine("Collation:" + collation);
|
||||
result.add("collation", collVal);
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
public void inform(SolrCore core) {
|
||||
if (initParams != null) {
|
||||
LOG.info("Initializing spell checkers");
|
||||
boolean hasDefault = false;
|
||||
for (int i = 0; i < initParams.size(); i++) {
|
||||
if (initParams.getName(i).equals("spellchecker")) {
|
||||
NamedList spellchecker = (NamedList) initParams.getVal(i);
|
||||
String className = (String) spellchecker.get("classname");
|
||||
if (className == null)
|
||||
className = IndexBasedSpellChecker.class.getName();
|
||||
SolrResourceLoader loader = core.getResourceLoader();
|
||||
SolrSpellChecker checker = (SolrSpellChecker) loader.newInstance(className);
|
||||
if (checker != null) {
|
||||
String dictionary = checker.init(spellchecker, loader);
|
||||
if (dictionary != null) {
|
||||
boolean isDefault = dictionary.equals(SolrSpellChecker.DEFAULT_DICTIONARY_NAME);
|
||||
if (isDefault == true && hasDefault == false){
|
||||
hasDefault = true;
|
||||
} else if (isDefault == true && hasDefault == true){
|
||||
throw new RuntimeException("More than one dictionary is missing name.");
|
||||
}
|
||||
spellCheckers.put(dictionary, checker);
|
||||
} else {
|
||||
if (hasDefault == false){
|
||||
spellCheckers.put(SolrSpellChecker.DEFAULT_DICTIONARY_NAME, checker);
|
||||
hasDefault = true;
|
||||
} else {
|
||||
throw new RuntimeException("More than one dictionary is missing name.");
|
||||
}
|
||||
}
|
||||
} else {
|
||||
throw new RuntimeException("Can't load spell checker: " + className);
|
||||
}
|
||||
}
|
||||
}
|
||||
String xpath = "queryConverter";
|
||||
SolrConfig solrConfig = core.getSolrConfig();
|
||||
NodeList nodes = (NodeList) solrConfig.evaluate(xpath, XPathConstants.NODESET);
|
||||
|
||||
Map<String, QueryConverter> queryConverters = new HashMap<String, QueryConverter>();
|
||||
NamedListPluginLoader<QueryConverter> loader =
|
||||
new NamedListPluginLoader<QueryConverter>("[solrconfig.xml] " + xpath, queryConverters);
|
||||
|
||||
loader.load(solrConfig.getResourceLoader(), nodes);
|
||||
//there should only be one
|
||||
if (queryConverters.size() == 1) {
|
||||
queryConverter = queryConverters.values().iterator().next();
|
||||
IndexSchema schema = core.getSchema();
|
||||
String fieldTypeName = (String) initParams.get("queryAnalyzerFieldType");
|
||||
FieldType fieldType = schema.getFieldTypes().get(fieldTypeName);
|
||||
Analyzer analyzer = fieldType == null ? new WhitespaceAnalyzer()
|
||||
: fieldType.getQueryAnalyzer();
|
||||
//TODO: There's got to be a better way! Where's Spring when you need it?
|
||||
queryConverter.setAnalyzer(analyzer);
|
||||
} else {
|
||||
//TODO: Is there a better way?
|
||||
throw new RuntimeException("One and only one queryConverter may be defined");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ///////////////////////////////////////////
|
||||
// / SolrInfoMBean
|
||||
// //////////////////////////////////////////
|
||||
|
||||
@Override
|
||||
public String getDescription() {
|
||||
return "A Spell Checker component";
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getVersion() {
|
||||
return "$Revision:$";
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getSourceId() {
|
||||
return "$Id:$";
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getSource() {
|
||||
return "$URL:$";
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,137 @@
|
|||
package org.apache.solr.spelling;
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.spell.Dictionary;
|
||||
import org.apache.lucene.search.spell.SpellChecker;
|
||||
import org.apache.lucene.search.spell.StringDistance;
|
||||
import org.apache.lucene.search.spell.LevensteinDistance;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.FSDirectory;
|
||||
import org.apache.lucene.store.RAMDirectory;
|
||||
import org.apache.solr.common.util.NamedList;
|
||||
import org.apache.solr.core.SolrResourceLoader;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
|
||||
|
||||
/**
|
||||
* Abstract base class for all Lucene based spell checking implementations.
|
||||
*
|
||||
* <p>
|
||||
* Refer to http://wiki.apache.org/solr/SpellCheckComponent for more details
|
||||
* </p>
|
||||
*
|
||||
* @since solr 1.3
|
||||
*/
|
||||
public abstract class AbstractLuceneSpellChecker extends SolrSpellChecker {
|
||||
public static final String SPELLCHECKER_ARG_NAME = "spellchecker";
|
||||
public static final String LOCATION = "sourceLocation";
|
||||
public static final String INDEX_DIR = "spellcheckIndexDir";
|
||||
public static final String ACCURACY = "accuracy";
|
||||
public static final String STRING_DISTANCE = "distanceMeasure";
|
||||
protected String field;
|
||||
protected org.apache.lucene.search.spell.SpellChecker spellChecker;
|
||||
|
||||
protected String sourceLocation;
|
||||
/*
|
||||
* The Directory containing the Spell checking index
|
||||
* */
|
||||
protected Directory index;
|
||||
protected Dictionary dictionary;
|
||||
|
||||
public static final int DEFAULT_SUGGESTION_COUNT = 5;
|
||||
protected String indexDir;
|
||||
public static final String FIELD = "field";
|
||||
|
||||
public String init(NamedList config, SolrResourceLoader loader) {
|
||||
super.init(config, loader);
|
||||
indexDir = (String) config.get(INDEX_DIR);
|
||||
sourceLocation = (String) config.get(LOCATION);
|
||||
field = (String) config.get(FIELD);
|
||||
String strDistanceName = (String)config.get(STRING_DISTANCE);
|
||||
StringDistance sd = null;
|
||||
if (strDistanceName != null) {
|
||||
sd = (StringDistance) loader.newInstance(strDistanceName);
|
||||
//TODO: Figure out how to configure options. Where's Spring when you need it? Or at least BeanUtils...
|
||||
} else {
|
||||
sd = new LevensteinDistance();
|
||||
}
|
||||
try {
|
||||
initIndex();
|
||||
spellChecker = new SpellChecker(index, sd);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
return name;
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
public SpellingResult getSuggestions(Collection<Token> tokens,
|
||||
IndexReader reader, int count, boolean onlyMorePopular,
|
||||
boolean extendedResults)
|
||||
throws IOException {
|
||||
SpellingResult result = new SpellingResult(tokens);
|
||||
reader = determineReader(reader);
|
||||
Term term = field != null ? new Term(field, "") : null;
|
||||
for (Token token : tokens) {
|
||||
String tokenText = new String(token.termBuffer(), 0, token.termLength());
|
||||
String[] suggestions = spellChecker.suggestSimilar(tokenText, (int) Math.max(count, AbstractLuceneSpellChecker.DEFAULT_SUGGESTION_COUNT),
|
||||
field != null ? reader : null, //workaround LUCENE-1295
|
||||
field,
|
||||
onlyMorePopular);
|
||||
if (suggestions.length == 1 && suggestions[0].equals(tokenText)) {
|
||||
//These are spelled the same, continue on
|
||||
continue;
|
||||
}
|
||||
|
||||
if (extendedResults == true && reader != null && field != null) {
|
||||
term = term.createTerm(tokenText);
|
||||
result.add(token, reader.docFreq(term));
|
||||
int countLimit = Math.min(count, suggestions.length);
|
||||
for (int i = 0; i < countLimit; i++) {
|
||||
term = term.createTerm(suggestions[i]);
|
||||
result.add(token, suggestions[i], reader.docFreq(term));
|
||||
}
|
||||
} else {
|
||||
if (suggestions.length > 0) {
|
||||
List<String> suggList = Arrays.asList(suggestions);
|
||||
if (suggestions.length > count) {
|
||||
suggList = suggList.subList(0, count);
|
||||
}
|
||||
result.add(token, suggList);
|
||||
}
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
protected IndexReader determineReader(IndexReader reader) {
|
||||
return reader;
|
||||
}
|
||||
|
||||
|
||||
public void reload() throws IOException {
|
||||
spellChecker.setSpellIndex(index);
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Initialize the {@link #index} variable based on the {@link #indexDir}. Does not actually create the spelling index.
|
||||
*
|
||||
* @throws IOException
|
||||
*/
|
||||
protected void initIndex() throws IOException {
|
||||
if (indexDir != null) {
|
||||
index = FSDirectory.getDirectory(indexDir);
|
||||
} else {
|
||||
index = new RAMDirectory();
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,142 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.solr.spelling;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.util.List;
|
||||
import java.util.logging.Level;
|
||||
import java.util.logging.Logger;
|
||||
|
||||
import org.apache.lucene.analysis.WhitespaceAnalyzer;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.spell.PlainTextDictionary;
|
||||
import org.apache.lucene.store.RAMDirectory;
|
||||
import org.apache.solr.common.util.NamedList;
|
||||
import org.apache.solr.core.SolrCore;
|
||||
import org.apache.solr.core.SolrResourceLoader;
|
||||
import org.apache.solr.schema.FieldType;
|
||||
import org.apache.solr.schema.IndexSchema;
|
||||
import org.apache.solr.util.HighFrequencyDictionary;
|
||||
|
||||
|
||||
/**
|
||||
* <p>
|
||||
* A spell checker implementation which can load words from a text
|
||||
* file (one word per line).
|
||||
* </p>
|
||||
*
|
||||
* @since solr 1.3
|
||||
**/
|
||||
public class FileBasedSpellChecker extends AbstractLuceneSpellChecker {
|
||||
|
||||
private static final Logger log = Logger.getLogger(FileBasedSpellChecker.class.getName());
|
||||
|
||||
public static final String FIELD_TYPE = "fieldType";
|
||||
|
||||
public static final String SOURCE_FILE_CHAR_ENCODING = "characterEncoding";
|
||||
|
||||
private String fieldTypeName;
|
||||
private String characterEncoding;
|
||||
public static final String WORD_FIELD_NAME = "word";
|
||||
|
||||
public String init(NamedList config, SolrResourceLoader loader) {
|
||||
super.init(config, loader);
|
||||
fieldTypeName = (String) config.get(FIELD_TYPE);
|
||||
characterEncoding = (String) config.get(SOURCE_FILE_CHAR_ENCODING);
|
||||
return name;
|
||||
}
|
||||
|
||||
public void build(SolrCore core) {
|
||||
try {
|
||||
loadExternalFileDictionary(core.getSchema(), core.getResourceLoader());
|
||||
spellChecker.clearIndex();
|
||||
spellChecker.indexDictionary(dictionary);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Override to return null, since there is no reader associated with a file based index
|
||||
*/
|
||||
@Override
|
||||
protected IndexReader determineReader(IndexReader reader) {
|
||||
return null;
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
private void loadExternalFileDictionary(IndexSchema schema, SolrResourceLoader loader) {
|
||||
IndexSearcher searcher = null;
|
||||
try {
|
||||
|
||||
// Get the field's analyzer
|
||||
if (fieldTypeName != null
|
||||
&& schema.getFieldTypeNoEx(fieldTypeName) != null) {
|
||||
FieldType fieldType = schema.getFieldTypes()
|
||||
.get(fieldTypeName);
|
||||
// Do index-time analysis using the given fieldType's analyzer
|
||||
RAMDirectory ramDir = new RAMDirectory();
|
||||
IndexWriter writer = new IndexWriter(ramDir, fieldType.getAnalyzer(),
|
||||
true, IndexWriter.MaxFieldLength.UNLIMITED);
|
||||
writer.setMergeFactor(300);
|
||||
writer.setMaxBufferedDocs(150);
|
||||
|
||||
List<String> lines = loader.getLines(sourceLocation, characterEncoding);
|
||||
|
||||
for (String s : lines) {
|
||||
Document d = new Document();
|
||||
d.add(new Field(WORD_FIELD_NAME, s, Field.Store.NO, Field.Index.TOKENIZED));
|
||||
writer.addDocument(d);
|
||||
}
|
||||
writer.optimize();
|
||||
writer.close();
|
||||
|
||||
dictionary = new HighFrequencyDictionary(IndexReader.open(ramDir),
|
||||
WORD_FIELD_NAME, 0.0f);
|
||||
analyzer = fieldType.getQueryAnalyzer();
|
||||
} else {
|
||||
log.warning("No fieldType: " + fieldTypeName
|
||||
+ " found for dictionary: " + name);
|
||||
analyzer = new WhitespaceAnalyzer();
|
||||
|
||||
// check if character encoding is defined
|
||||
if (characterEncoding == null) {
|
||||
dictionary = new PlainTextDictionary(loader.openResource(sourceLocation));
|
||||
} else {
|
||||
dictionary = new PlainTextDictionary(new InputStreamReader(loader.openResource(sourceLocation), characterEncoding));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
} catch (IOException e) {
|
||||
log.log(Level.SEVERE, "Unable to load spellings", e);
|
||||
} finally {
|
||||
try {
|
||||
if (searcher != null)
|
||||
searcher.close();
|
||||
} catch (IOException e) {
|
||||
// Ignore
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,134 @@
|
|||
package org.apache.solr.spelling;
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.WhitespaceAnalyzer;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.store.FSDirectory;
|
||||
import org.apache.solr.common.util.NamedList;
|
||||
import org.apache.solr.core.SolrCore;
|
||||
import org.apache.solr.core.SolrResourceLoader;
|
||||
import org.apache.solr.schema.FieldType;
|
||||
import org.apache.solr.schema.IndexSchema;
|
||||
import org.apache.solr.search.SolrIndexSearcher;
|
||||
import org.apache.solr.util.HighFrequencyDictionary;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.logging.Logger;
|
||||
|
||||
|
||||
/**
|
||||
* <p>
|
||||
* A spell checker implementation which can load words from Solr as well as arbitary Lucene indices.
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* Refer to http://wiki.apache.org/solr/SpellCheckComponent for more details
|
||||
* </p>
|
||||
*
|
||||
* @since solr 1.3
|
||||
**/
|
||||
public class IndexBasedSpellChecker extends AbstractLuceneSpellChecker {
|
||||
private static final Logger log = Logger.getLogger(IndexBasedSpellChecker.class.getName());
|
||||
|
||||
public static final String THRESHOLD_TOKEN_FREQUENCY = "thresholdTokenFrequency";
|
||||
|
||||
protected float threshold;
|
||||
protected float accuracy = 0.5f;
|
||||
protected IndexReader reader;
|
||||
|
||||
public String init(NamedList config, SolrResourceLoader loader) {
|
||||
super.init(config, loader);
|
||||
String accuracy = (String) config.get(ACCURACY);
|
||||
threshold = config.get(THRESHOLD_TOKEN_FREQUENCY) == null ? 0.0f
|
||||
: (Float) config.get(THRESHOLD_TOKEN_FREQUENCY);
|
||||
if (accuracy != null) {
|
||||
try {
|
||||
this.accuracy = Float.parseFloat(accuracy);
|
||||
spellChecker.setAccuracy(this.accuracy);
|
||||
|
||||
} catch (NumberFormatException e) {
|
||||
throw new RuntimeException(
|
||||
"Unparseable accuracy given for dictionary: " + name, e);
|
||||
}
|
||||
}
|
||||
initSourceReader();
|
||||
return name;
|
||||
}
|
||||
|
||||
private void initSourceReader() {
|
||||
if (sourceLocation != null) {
|
||||
try {
|
||||
FSDirectory luceneIndexDir = FSDirectory.getDirectory(sourceLocation);
|
||||
this.reader = IndexReader.open(luceneIndexDir);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void build(SolrCore core) {
|
||||
IndexReader reader = null;
|
||||
try {
|
||||
if (sourceLocation == null) {
|
||||
// Load from Solr's index
|
||||
SolrIndexSearcher searcher = core.getSearcher().get();
|
||||
reader = searcher.getReader();
|
||||
} else {
|
||||
// Load from Lucene index at given sourceLocation
|
||||
reader = this.reader;
|
||||
}
|
||||
|
||||
|
||||
loadLuceneDictionary(core.getSchema(), reader);
|
||||
spellChecker.clearIndex();
|
||||
spellChecker.indexDictionary(dictionary);
|
||||
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
protected IndexReader determineReader(IndexReader reader) {
|
||||
IndexReader result = null;
|
||||
if (sourceLocation != null) {
|
||||
result = this.reader;
|
||||
} else {
|
||||
result = reader;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
private void loadLuceneDictionary(IndexSchema schema, IndexReader reader) {
|
||||
// Create the dictionary
|
||||
dictionary = new HighFrequencyDictionary(reader, field,
|
||||
threshold);
|
||||
// Get the field's analyzer
|
||||
FieldType fieldType = schema.getFieldTypeNoEx(field);
|
||||
analyzer = fieldType == null ? new WhitespaceAnalyzer()
|
||||
: fieldType.getQueryAnalyzer();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reload() throws IOException {
|
||||
super.reload();
|
||||
//reload the source
|
||||
initSourceReader();
|
||||
}
|
||||
}
|
|
@ -0,0 +1,61 @@
|
|||
package org.apache.solr.spelling;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.solr.common.util.NamedList;
|
||||
import org.apache.solr.util.plugin.NamedListInitializedPlugin;
|
||||
|
||||
import java.util.Collection;
|
||||
|
||||
/**
|
||||
* <p>
|
||||
* The QueryConverter is an abstract base class defining a method for converting
|
||||
* input "raw" queries into a set of tokens for spell checking. It is used to
|
||||
* "parse" the CommonParams.Q (the input query) and converts it to tokens.
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* It is only invoked for the CommonParams.Q parameter, and <b>not</b> the
|
||||
* "spellcheck.q" parameter. Systems that use their own query parser or those
|
||||
* that find issue with the basic implementation will want to implement their
|
||||
* own QueryConverter instead of using the provided implementation
|
||||
* (SpellingQueryConverter) by overriding the appropriate methods on the
|
||||
* SpellingQueryConverter and registering it in the solrconfig.xml
|
||||
* </p>
|
||||
*
|
||||
* <p>
|
||||
* Refer to http://wiki.apache.org/solr/SpellCheckComponent for more details
|
||||
* </p>
|
||||
*
|
||||
* @since solr 1.3
|
||||
*/
|
||||
public abstract class QueryConverter implements NamedListInitializedPlugin {
|
||||
private NamedList args;
|
||||
|
||||
protected Analyzer analyzer;
|
||||
|
||||
public void init(NamedList args) {
|
||||
this.args = args;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param original
|
||||
* @return The Collection of {@link org.apache.lucene.analysis.Token}s for
|
||||
* the query. Offsets on the Token should correspond to the correct
|
||||
* offset in the origQuery
|
||||
*/
|
||||
public abstract Collection<Token> convert(String original);
|
||||
|
||||
/**
|
||||
* Set the analyzer to use. Must be set before any calls to convert.
|
||||
*
|
||||
* @param analyzer
|
||||
*/
|
||||
public void setAnalyzer(Analyzer analyzer) {
|
||||
this.analyzer = analyzer;
|
||||
}
|
||||
|
||||
public Analyzer getAnalyzer() {
|
||||
return analyzer;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,118 @@
|
|||
package org.apache.solr.spelling;
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.solr.common.util.NamedList;
|
||||
import org.apache.solr.core.SolrCore;
|
||||
import org.apache.solr.core.SolrResourceLoader;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collection;
|
||||
|
||||
|
||||
/**
|
||||
* <p>
|
||||
* Refer to http://wiki.apache.org/solr/SpellCheckComponent for more details
|
||||
* </p>
|
||||
*
|
||||
* @since solr 1.3
|
||||
*/
|
||||
public abstract class SolrSpellChecker {
|
||||
public static final String DICTIONARY_NAME = "name";
|
||||
public static final String DEFAULT_DICTIONARY_NAME = "default";
|
||||
protected String name;
|
||||
protected Analyzer analyzer;
|
||||
|
||||
public String init(NamedList config, SolrResourceLoader loader){
|
||||
name = (String) config.get(DICTIONARY_NAME);
|
||||
if (name == null) {
|
||||
name = DEFAULT_DICTIONARY_NAME;
|
||||
}
|
||||
return name;
|
||||
}
|
||||
|
||||
public Analyzer getQueryAnalyzer() {
|
||||
return analyzer;
|
||||
}
|
||||
|
||||
|
||||
public String getDictionaryName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Reload the index. Useful if an external process is responsible for building the spell checker.
|
||||
*
|
||||
* @throws java.io.IOException
|
||||
*/
|
||||
public abstract void reload() throws IOException;
|
||||
|
||||
/**
|
||||
* (re)Build The Spelling index. May be a NOOP if the ipmlementation doesn't require building, or can't be rebuilt
|
||||
*
|
||||
* @param core The SolrCore
|
||||
*/
|
||||
public abstract void build(SolrCore core);
|
||||
|
||||
/**
|
||||
* Assumes count = 1, onlyMorePopular = false, extendedResults = false
|
||||
*
|
||||
* @see #getSuggestions(Collection, org.apache.lucene.index.IndexReader, int, boolean, boolean)
|
||||
*/
|
||||
public SpellingResult getSuggestions(Collection<Token> tokens, IndexReader reader) throws IOException {
|
||||
return getSuggestions(tokens, reader, 1, false, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Assumes onlyMorePopular = false, extendedResults = false
|
||||
*
|
||||
* @see #getSuggestions(Collection, org.apache.lucene.index.IndexReader, int, boolean, boolean)
|
||||
*/
|
||||
public SpellingResult getSuggestions(Collection<Token> tokens, IndexReader reader, int count) throws IOException {
|
||||
return getSuggestions(tokens, reader, count, false, false);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Assumes count = 1.
|
||||
*
|
||||
* @see #getSuggestions(Collection, org.apache.lucene.index.IndexReader, int, boolean, boolean)
|
||||
*/
|
||||
public SpellingResult getSuggestions(Collection<Token> tokens, IndexReader reader, boolean onlyMorePopular, boolean extendedResults) throws IOException {
|
||||
return getSuggestions(tokens, reader, 1, onlyMorePopular, extendedResults);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get suggestions for the given query. Tokenizes the query using a field appropriate Analyzer. The {@link SpellingResult#getSuggestions()} suggestions must be ordered by
|
||||
* best suggestion first
|
||||
*
|
||||
* @param tokens The Tokens to be spell checked.
|
||||
* @param reader The (optional) IndexReader. If there is not IndexReader, than extendedResults are not possible
|
||||
* @param count The maximum number of suggestions to return
|
||||
* @param onlyMorePopular TODO
|
||||
* @param extendedResults TODO
|
||||
* @return
|
||||
* @throws IOException
|
||||
*/
|
||||
public abstract SpellingResult getSuggestions(Collection<Token> tokens, IndexReader reader, int count,
|
||||
boolean onlyMorePopular, boolean extendedResults)
|
||||
throws IOException;
|
||||
}
|
|
@ -0,0 +1,64 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.spelling;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
* @since solr 1.3
|
||||
**/
|
||||
public class SpellingQueryConverter extends QueryConverter {
|
||||
|
||||
protected Pattern QUERY_REGEX = Pattern.compile("(?:(?!(\\w+:|\\d+)))\\w+");
|
||||
|
||||
|
||||
public Collection<Token> convert(String original) {
|
||||
Collection<Token> result = new ArrayList<Token>();
|
||||
//TODO: Extract the words using a simple regex, but not query stuff, and then analyze them to produce the token stream
|
||||
Matcher matcher = QUERY_REGEX.matcher(original);
|
||||
TokenStream stream;
|
||||
while (matcher.find()) {
|
||||
String word = matcher.group(0);
|
||||
if (word.equals("AND") == false && word.equals("OR") == false) {
|
||||
try {
|
||||
stream = analyzer.reusableTokenStream("", new StringReader(word));
|
||||
Token token;
|
||||
while ((token = stream.next()) != null) {
|
||||
token.setStartOffset(matcher.start());
|
||||
token.setEndOffset(matcher.end());
|
||||
result.add(token);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
}
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
@ -0,0 +1,125 @@
|
|||
package org.apache.solr.spelling;
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.Map;
|
||||
import java.util.List;
|
||||
|
||||
|
||||
/**
|
||||
* Implementations of SolrSpellChecker must return suggestions as SpellResult instance.
|
||||
* This is converted into the required NamedList format in SpellCheckComponent
|
||||
*
|
||||
* @since solr 1.3
|
||||
*/
|
||||
public class SpellingResult {
|
||||
private Collection<Token> tokens;
|
||||
|
||||
/**
|
||||
* Key == token
|
||||
* Value = Map -> key is the suggestion, value is the frequency of the token in the collection
|
||||
*/
|
||||
private Map<Token, LinkedHashMap<String, Integer>> suggestions = new LinkedHashMap<Token, LinkedHashMap<String, Integer>>();
|
||||
private Map<Token, Integer> tokenFrequency;
|
||||
public static final int NO_FREQUENCY_INFO = -1;
|
||||
|
||||
|
||||
public SpellingResult() {
|
||||
}
|
||||
|
||||
public SpellingResult(Collection<Token> tokens) {
|
||||
this.tokens = tokens;
|
||||
}
|
||||
|
||||
/**
|
||||
* Add a whole bunch of suggestions, and don't worry about frequency
|
||||
*
|
||||
* @param token The token to associate the suggestions with
|
||||
* @param suggestions The suggestions
|
||||
*/
|
||||
public void add(Token token, List<String> suggestions) {
|
||||
LinkedHashMap<String, Integer> map = this.suggestions.get(token);
|
||||
if (map == null ) {
|
||||
map = new LinkedHashMap<String, Integer>();
|
||||
this.suggestions.put(token, map);
|
||||
}
|
||||
for (String suggestion : suggestions) {
|
||||
map.put(suggestion, NO_FREQUENCY_INFO);
|
||||
}
|
||||
}
|
||||
|
||||
public void add(Token token, int docFreq) {
|
||||
if (tokenFrequency == null) {
|
||||
tokenFrequency = new LinkedHashMap<Token, Integer>();
|
||||
}
|
||||
tokenFrequency.put(token, docFreq);
|
||||
}
|
||||
|
||||
/**
|
||||
* Suggestions must be added with the best suggestion first. ORDER is important
|
||||
* @param token The {@link org.apache.lucene.analysis.Token}
|
||||
* @param suggestion The suggestion for the Token
|
||||
* @param docFreq The document frequency
|
||||
*/
|
||||
public void add(Token token, String suggestion, int docFreq) {
|
||||
LinkedHashMap<String, Integer> map = this.suggestions.get(token);
|
||||
//Don't bother adding if we already have this token
|
||||
if (map == null) {
|
||||
map = new LinkedHashMap<String, Integer>();
|
||||
this.suggestions.put(token, map);
|
||||
}
|
||||
map.put(suggestion, docFreq);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the suggestions for the given token
|
||||
*
|
||||
* @param token The {@link org.apache.lucene.analysis.Token} to look up
|
||||
* @return A LinkedHashMap of the suggestions. Key is the suggestion, value is the token frequency in the index, else {@link #NO_FREQUENCY_INFO}.
|
||||
*
|
||||
* The suggestions are added in sorted order (i.e. best suggestion first) then the iterator will return the suggestions in order
|
||||
*/
|
||||
public LinkedHashMap<String, Integer> get(Token token) {
|
||||
return suggestions.get(token);
|
||||
}
|
||||
|
||||
/**
|
||||
* The token frequency of the input token in the collection
|
||||
*
|
||||
* @param token The token
|
||||
* @return The frequency or null
|
||||
*/
|
||||
public Integer getTokenFrequency(Token token) {
|
||||
return tokenFrequency.get(token);
|
||||
}
|
||||
|
||||
public boolean hasTokenFrequencyInfo() {
|
||||
return tokenFrequency != null && !tokenFrequency.isEmpty();
|
||||
}
|
||||
|
||||
/**
|
||||
* All the suggestions. The ordering of the inner LinkedHashMap is by best suggestion first.
|
||||
* @return The Map of suggestions for each Token. Key is the token, value is a LinkedHashMap whose key is the Suggestion and the value is the frequency or {@link #NO_FREQUENCY_INFO} if frequency info is not available.
|
||||
*
|
||||
*/
|
||||
public Map<Token, LinkedHashMap<String, Integer>> getSuggestions() {
|
||||
return suggestions;
|
||||
}
|
||||
|
||||
public Map<Token, Integer> getTokenFrequency() {
|
||||
return tokenFrequency;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return The original tokens
|
||||
*/
|
||||
public Collection<Token> getTokens() {
|
||||
return tokens;
|
||||
}
|
||||
|
||||
public void setTokens(Collection<Token> tokens) {
|
||||
this.tokens = tokens;
|
||||
}
|
||||
}
|
|
@ -22,10 +22,6 @@ import java.util.List;
|
|||
|
||||
import org.apache.solr.common.util.NamedList;
|
||||
import org.apache.solr.core.SolrCore;
|
||||
import org.apache.solr.handler.component.FacetComponent;
|
||||
import org.apache.solr.handler.component.MoreLikeThisComponent;
|
||||
import org.apache.solr.handler.component.SearchComponent;
|
||||
import org.apache.solr.handler.component.SearchHandler;
|
||||
import org.apache.solr.util.AbstractSolrTestCase;
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,287 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.handler.component;
|
||||
|
||||
import org.apache.solr.common.params.CommonParams;
|
||||
import org.apache.solr.common.params.MapSolrParams;
|
||||
import org.apache.solr.common.params.ModifiableSolrParams;
|
||||
import org.apache.solr.common.util.NamedList;
|
||||
import org.apache.solr.common.util.SimpleOrderedMap;
|
||||
import org.apache.solr.core.SolrCore;
|
||||
import org.apache.solr.request.LocalSolrQueryRequest;
|
||||
import org.apache.solr.request.SolrQueryRequest;
|
||||
import org.apache.solr.request.SolrQueryResponse;
|
||||
import org.apache.solr.request.SolrRequestHandler;
|
||||
import org.apache.solr.util.AbstractSolrTestCase;
|
||||
import org.apache.solr.spelling.IndexBasedSpellChecker;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* @since solr 1.3
|
||||
*/
|
||||
public class SpellCheckComponentTest extends AbstractSolrTestCase {
|
||||
@Override
public String getSchemaFile() {
  // Schema fixture for these tests; defines the fields (e.g. "lowerfilt")
  // that setUp() indexes into — TODO confirm against the test resources.
  return "schema.xml";
}
|
||||
|
||||
@Override
public String getSolrConfigFile() {
  // Solr config fixture; presumably registers the "spellcheck" component and
  // the "spellCheckCompRH" handler used by the tests — verify in resources.
  return "solrconfig.xml";
}
|
||||
|
||||
@Override
public void setUp() throws Exception {
  super.setUp();
  // Index a small corpus into the "lowerfilt" field for the spell checker to draw from.
  assertU(adoc("id", "0", "lowerfilt", "This is a title"));
  // "reb" is presumably a deliberate misspelling of "red" — kept as test data.
  assertU(adoc("id", "1", "lowerfilt",
      "The quick reb fox jumped over the lazy brown dogs."));
  assertU(adoc("id", "2", "lowerfilt", "This is a document"));
  assertU(adoc("id", "3", "lowerfilt", "another document"));
  //bunch of docs that are variants on blue
  assertU(adoc("id", "4", "lowerfilt", "blue"));
  assertU(adoc("id", "5", "lowerfilt", "blud"));
  assertU(adoc("id", "6", "lowerfilt", "boue"));
  assertU(adoc("id", "7", "lowerfilt", "glue"));
  assertU(adoc("id", "8", "lowerfilt", "blee"));
  // Commit so the documents are visible to the searcher used at build time.
  assertU("commit", commit());
}
|
||||
|
||||
/**
 * Checks the interaction of spellcheck.count with extended results for the
 * misspelled query "bluo": first 5 plain suggestions, then 3 extended
 * (word + frequency) suggestions.
 */
public void testExtendedResultsCount() throws Exception {
  SolrCore core = h.getCore();
  SearchComponent speller = core.getSearchComponent("spellcheck");
  assertTrue("speller is null and it shouldn't be", speller != null);

  // Phase 1: build the spelling index and ask for 5 non-extended suggestions.
  ModifiableSolrParams params = new ModifiableSolrParams();
  params.add(CommonParams.QT, "spellCheckCompRH");
  params.add(SpellCheckComponent.SPELLCHECK_BUILD, "true");
  params.add(CommonParams.Q, "bluo");
  params.add(SpellCheckComponent.COMPONENT_NAME, "true");
  params.add(SpellCheckComponent.SPELLCHECK_COUNT, String.valueOf(5));
  params.add(SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, String.valueOf(false));
  SolrRequestHandler handler = core.getRequestHandler("spellCheckCompRH");
  SolrQueryResponse rsp;
  rsp = new SolrQueryResponse();
  handler.handleRequest(new LocalSolrQueryRequest(core, params), rsp);
  NamedList values = rsp.getValues();
  // The response echoes which command ran; "build" confirms the index was built.
  String cmdExec = (String) values.get("command");
  assertTrue("command is null and it shouldn't be", cmdExec != null);
  assertTrue(cmdExec + " is not equal to " + "build",
      cmdExec.equals("build") == true);
  NamedList spellCheck = (NamedList) values.get("spellcheck");
  assertTrue("spellCheck is null and it shouldn't be", spellCheck != null);
  NamedList suggestions = (NamedList) spellCheck.get("suggestions");
  assertTrue("suggestions is null and it shouldn't be", suggestions != null);
  NamedList blue = (NamedList) suggestions.get("bluo");
  // setUp indexed 5 one-edit variants of "blue", so 5 suggestions are expected.
  assertTrue(blue.get("numFound") + " is not equal to " + "5", blue
      .get("numFound").toString().equals("5") == true);
  Collection<String> theSuggestion = (Collection<String>) blue.get("suggestion");
  assertTrue("theSuggestion is null and it shouldn't be: " + blue,
      theSuggestion != null);
  assertTrue("theSuggestion Size: " + theSuggestion.size() + " is not: " + 5,
      theSuggestion.size() == 5);
  //we know there are at least 5, but now only get 3
  // Phase 2: re-query (no rebuild) with count=3 and extended results on.
  params.remove(SpellCheckComponent.SPELLCHECK_COUNT);
  params.remove(SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS);
  params.remove(SpellCheckComponent.SPELLCHECK_BUILD);
  params.add(SpellCheckComponent.SPELLCHECK_COUNT, String.valueOf(3));
  params.add(SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, String.valueOf(true));
  params.add(SpellCheckComponent.SPELLCHECK_BUILD, "false");
  rsp = new SolrQueryResponse();
  handler.handleRequest(new LocalSolrQueryRequest(core, params), rsp);
  values = rsp.getValues();

  spellCheck = (NamedList) values.get("spellcheck");
  assertTrue("spellCheck is null and it shouldn't be", spellCheck != null);
  suggestions = (NamedList) spellCheck.get("suggestions");
  assertTrue("suggestions is null and it shouldn't be", suggestions != null);
  blue = (NamedList) suggestions.get("bluo");
  assertTrue(blue.get("numFound") + " is not equal to " + "3", blue
      .get("numFound").toString().equals("3") == true);
  // In extended form each suggestion is a small map ("suggestion" entries);
  // walk exactly three of them via indexOf, each holding word + frequency.
  SimpleOrderedMap theSuggestions;
  int idx = blue.indexOf("suggestion", 0);
  theSuggestions = (SimpleOrderedMap) blue.get("suggestion", idx);
  assertTrue("theSuggestion is null and it shouldn't be: " + blue,
      theSuggestions != null);
  assertTrue("theSuggestions Size: " + theSuggestions.size() + " is not: " + 2,
      theSuggestions.size() == 2);//the word and the frequency

  idx = blue.indexOf("suggestion", idx + 1);
  theSuggestions = (SimpleOrderedMap) blue.get("suggestion", idx);
  assertTrue("theSuggestion is null and it shouldn't be: " + blue,
      theSuggestions != null);
  assertTrue("theSuggestions Size: " + theSuggestions.size() + " is not: " + 2,
      theSuggestions.size() == 2);//the word and the frequency

  idx = blue.indexOf("suggestion", idx + 1);
  theSuggestions = (SimpleOrderedMap) blue.get("suggestion", idx);
  assertTrue("theSuggestion is null and it shouldn't be: " + blue,
      theSuggestions != null);
  assertTrue("theSuggestions Size: " + theSuggestions.size() + " is not: " + 2,
      theSuggestions.size() == 2);//the word and the frequency

  // No fourth suggestion entry: count=3 was honored.
  idx = blue.indexOf("suggestion", idx + 1);
  assertTrue(idx + " does not equal: " + -1, idx == -1);


}
|
||||
|
||||
/**
 * Basic end-to-end check: the misspelled query "documemt" yields exactly one
 * suggestion, "document", with offsets spanning the whole input word.
 */
public void test() throws Exception {
  SolrCore core = h.getCore();
  SearchComponent speller = core.getSearchComponent("spellcheck");
  assertTrue("speller is null and it shouldn't be", speller != null);

  // Build the spelling index and query a single misspelled word.
  ModifiableSolrParams params = new ModifiableSolrParams();
  params.add(CommonParams.QT, "spellCheckCompRH");
  params.add(SpellCheckComponent.SPELLCHECK_BUILD, "true");
  params.add(CommonParams.Q, "documemt");
  params.add(SpellCheckComponent.COMPONENT_NAME, "true");

  SolrRequestHandler handler = core.getRequestHandler("spellCheckCompRH");
  SolrQueryResponse rsp = new SolrQueryResponse();
  handler.handleRequest(new LocalSolrQueryRequest(core, params), rsp);
  NamedList values = rsp.getValues();
  // Confirm the build command actually executed.
  String cmdExec = (String) values.get("command");
  assertTrue("command is null and it shouldn't be", cmdExec != null);
  assertTrue(cmdExec + " is not equal to " + "build",
      cmdExec.equals("build") == true);
  NamedList spellCheck = (NamedList) values.get("spellcheck");
  assertTrue("spellCheck is null and it shouldn't be", spellCheck != null);
  NamedList suggestions = (NamedList) spellCheck.get("suggestions");
  assertTrue("suggestions is null and it shouldn't be", suggestions != null);
  NamedList document = (NamedList) suggestions.get("documemt");
  assertTrue(document.get("numFound") + " is not equal to " + "1", document
      .get("numFound").toString().equals("1") == true);
  // Offsets must cover the whole misspelled word in the original query.
  assertTrue(document.get("startOffset") + " is not equal to " + "0", document
      .get("startOffset").toString().equals("0") == true);
  assertTrue(document.get("endOffset") + " is not equal to " + "documemt".length(), document
      .get("endOffset").toString().equals(String.valueOf("documemt".length())) == true);
  Collection<String> theSuggestion = (Collection<String>) document.get("suggestion");
  assertTrue("theSuggestion is null and it shouldn't be: " + document,
      theSuggestion != null);
  assertTrue("theSuggestion Size: " + theSuggestion.size() + " is not: " + 1,
      theSuggestion.size() == 1);
  // And the single suggestion is the correctly spelled word.
  assertTrue(theSuggestion.iterator().next() + " is not equal to " + "document", theSuggestion.iterator().next().equals("document") == true);

}
|
||||
|
||||
|
||||
public void testCollate() throws Exception {
|
||||
SolrCore core = h.getCore();
|
||||
SearchComponent speller = core.getSearchComponent("spellcheck");
|
||||
assertTrue("speller is null and it shouldn't be", speller != null);
|
||||
|
||||
ModifiableSolrParams params = new ModifiableSolrParams();
|
||||
params.add(CommonParams.QT, "spellCheckCompRH");
|
||||
params.add(SpellCheckComponent.SPELLCHECK_BUILD, "true");
|
||||
params.add(CommonParams.Q, "documemt");
|
||||
params.add(SpellCheckComponent.COMPONENT_NAME, "true");
|
||||
params.add(SpellCheckComponent.SPELLCHECK_COLLATE, "true");
|
||||
|
||||
SolrRequestHandler handler = core.getRequestHandler("spellCheckCompRH");
|
||||
SolrQueryResponse rsp = new SolrQueryResponse();
|
||||
handler.handleRequest(new LocalSolrQueryRequest(core, params), rsp);
|
||||
NamedList values = rsp.getValues();
|
||||
NamedList spellCheck = (NamedList) values.get("spellcheck");
|
||||
assertTrue("spellCheck is null and it shouldn't be", spellCheck != null);
|
||||
NamedList suggestions = (NamedList) spellCheck.get("suggestions");
|
||||
assertTrue("suggestions is null and it shouldn't be", suggestions != null);
|
||||
String collation = (String) suggestions.get("collation");
|
||||
assertTrue("collation is null and it shouldn't be", collation != null);
|
||||
assertTrue(collation + " is not equal to " + "document", collation.equals("document") == true);
|
||||
params.remove(CommonParams.Q);
|
||||
params.add(CommonParams.Q, "documemt lowerfilt:broen^4");
|
||||
handler = core.getRequestHandler("spellCheckCompRH");
|
||||
rsp = new SolrQueryResponse();
|
||||
handler.handleRequest(new LocalSolrQueryRequest(core, params), rsp);
|
||||
values = rsp.getValues();
|
||||
spellCheck = (NamedList) values.get("spellcheck");
|
||||
assertTrue("spellCheck is null and it shouldn't be", spellCheck != null);
|
||||
suggestions = (NamedList) spellCheck.get("suggestions");
|
||||
assertTrue("suggestions is null and it shouldn't be", suggestions != null);
|
||||
collation = (String) suggestions.get("collation");
|
||||
assertTrue("collation is null and it shouldn't be", collation != null);
|
||||
assertTrue(collation + " is not equal to " + "document lowerfilt:brown^4", collation.equals("document lowerfilt:brown^4") == true);
|
||||
|
||||
}
|
||||
|
||||
public void testCorrectSpelling() throws Exception {
|
||||
SolrCore core = h.getCore();
|
||||
Map<String, String> args = new HashMap<String, String>();
|
||||
|
||||
args.put(CommonParams.Q, "lowerfilt:lazy lowerfilt:brown");
|
||||
args.put(CommonParams.QT, "spellCheckCompRH");
|
||||
args.put(SpellCheckComponent.SPELLCHECK_BUILD, "true");
|
||||
args.put(SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, "true");
|
||||
args.put(SpellCheckComponent.COMPONENT_NAME, "true");
|
||||
SolrQueryRequest req = new LocalSolrQueryRequest(core, new MapSolrParams(
|
||||
args));
|
||||
|
||||
assertQ("Make sure correct spellings are signalled in the response", req,
|
||||
"//*[@numFound='1']", "//result/doc[1]/int[@name='id'][.='1']",
|
||||
"//*/lst[@name='suggestions']");
|
||||
}
|
||||
|
||||
public void testInit() throws Exception {
|
||||
SolrCore core = h.getCore();
|
||||
SpellCheckComponent scc = new SpellCheckComponent();
|
||||
NamedList args = new NamedList();
|
||||
NamedList spellchecker = new NamedList();
|
||||
spellchecker.add("classname", IndexBasedSpellChecker.class.getName());
|
||||
spellchecker.add("name", "default");
|
||||
spellchecker.add("field", "lowerfilt");
|
||||
spellchecker.add("spellcheckIndexDir", "./spellchecker");
|
||||
|
||||
args.add("spellchecker", spellchecker);
|
||||
NamedList altSC = new NamedList();
|
||||
altSC.add("classname", IndexBasedSpellChecker.class.getName());
|
||||
altSC.add("name", "alternate");
|
||||
altSC.add("field", "lowerfilt");
|
||||
altSC.add("spellcheckIndexDir", "./spellchecker");
|
||||
|
||||
args.add("spellchecker", altSC);
|
||||
args.add("queryAnalyzerFieldType", "lowerfilt");
|
||||
NamedList defaults = new NamedList();
|
||||
defaults.add(SpellCheckComponent.SPELLCHECK_COLLATE, true);
|
||||
defaults.add(SpellCheckComponent.SPELLCHECK_EXTENDED_RESULTS, false);
|
||||
defaults.add(SpellCheckComponent.SPELLCHECK_COUNT, 2);
|
||||
args.add("defaults", defaults);
|
||||
scc.init(args);
|
||||
scc.inform(core);
|
||||
//hmm, not sure what to assert here...
|
||||
|
||||
//add the sc again and then init again, we should get an exception
|
||||
args.add("spellchecker", spellchecker);
|
||||
scc = new SpellCheckComponent();
|
||||
scc.init(args);
|
||||
try {
|
||||
scc.inform(core);
|
||||
assertTrue(false);
|
||||
} catch (Exception e) {
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
// TODO: add more tests for various spelling options
|
||||
|
||||
}
|
|
@ -0,0 +1,174 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.spelling;
|
||||
|
||||
import org.apache.solr.util.AbstractSolrTestCase;
|
||||
import org.apache.solr.common.util.NamedList;
|
||||
import org.apache.solr.core.SolrCore;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Date;
|
||||
import java.util.Map;
|
||||
import java.util.Collection;
|
||||
|
||||
/**
|
||||
*
|
||||
* @since solr 1.3
|
||||
**/
|
||||
public class FileBasedSpellCheckerTest extends AbstractSolrTestCase{
|
||||
|
||||
public String getSchemaFile() { return "schema.xml"; }
|
||||
public String getSolrConfigFile() { return "solrconfig.xml"; }
|
||||
|
||||
private SpellingQueryConverter queryConverter;
|
||||
|
||||
@Override
|
||||
public void setUp() throws Exception {
|
||||
super.setUp();
|
||||
//Index something with a title
|
||||
assertU(adoc("id", "0", "teststop", "This is a title"));
|
||||
assertU(adoc("id", "1", "teststop", "The quick reb fox jumped over the lazy brown dogs."));
|
||||
assertU(adoc("id", "2", "teststop", "This is a Solr"));
|
||||
assertU(adoc("id", "3", "teststop", "solr foo"));
|
||||
assertU("commit",
|
||||
commit());
|
||||
String allq = "id:[0 TO 3]";
|
||||
assertQ("docs not added", req(allq));
|
||||
queryConverter = new SimpleQueryConverter();
|
||||
queryConverter.init(new NamedList());
|
||||
}
|
||||
|
||||
public void test() throws Exception {
|
||||
FileBasedSpellChecker checker = new FileBasedSpellChecker();
|
||||
NamedList spellchecker = new NamedList();
|
||||
spellchecker.add("classname", FileBasedSpellChecker.class.getName());
|
||||
|
||||
spellchecker.add(SolrSpellChecker.DICTIONARY_NAME, "external");
|
||||
File spelling = new File("spellings.txt");
|
||||
spellchecker.add(AbstractLuceneSpellChecker.LOCATION, spelling.getAbsolutePath());
|
||||
spellchecker.add(IndexBasedSpellChecker.FIELD, "teststop");
|
||||
spellchecker.add(FileBasedSpellChecker.SOURCE_FILE_CHAR_ENCODING, "UTF-8");
|
||||
File tmpDir = new File(System.getProperty("java.io.tmpdir"));
|
||||
File indexDir = new File(tmpDir, "spellingIdx" + new Date().getTime());
|
||||
indexDir.mkdirs();
|
||||
spellchecker.add(FileBasedSpellChecker.INDEX_DIR, indexDir.getAbsolutePath());
|
||||
SolrCore core = h.getCore();
|
||||
String dictName = checker.init(spellchecker, core.getResourceLoader());
|
||||
assertTrue(dictName + " is not equal to " + "external", dictName.equals("external") == true);
|
||||
checker.build(core);
|
||||
|
||||
IndexReader reader = core.getSearcher().get().getReader();
|
||||
Collection<Token> tokens = queryConverter.convert("fob");
|
||||
SpellingResult result = checker.getSuggestions(tokens, reader);
|
||||
assertTrue("result is null and it shouldn't be", result != null);
|
||||
Map<String, Integer> suggestions = result.get(tokens.iterator().next());
|
||||
Map.Entry<String, Integer> entry = suggestions.entrySet().iterator().next();
|
||||
assertTrue(entry.getKey() + " is not equal to " + "foo", entry.getKey().equals("foo") == true);
|
||||
assertTrue(entry.getValue() + " does not equal: " + SpellingResult.NO_FREQUENCY_INFO, entry.getValue() == SpellingResult.NO_FREQUENCY_INFO);
|
||||
|
||||
tokens = queryConverter.convert("super");
|
||||
result = checker.getSuggestions(tokens, reader);
|
||||
assertTrue("result is null and it shouldn't be", result != null);
|
||||
suggestions = result.get(tokens.iterator().next());
|
||||
assertTrue("suggestions is not null and it should be", suggestions == null);
|
||||
|
||||
|
||||
}
|
||||
|
||||
public void testFieldType() throws Exception {
|
||||
FileBasedSpellChecker checker = new FileBasedSpellChecker();
|
||||
NamedList spellchecker = new NamedList();
|
||||
spellchecker.add("classname", FileBasedSpellChecker.class.getName());
|
||||
spellchecker.add(SolrSpellChecker.DICTIONARY_NAME, "external");
|
||||
File spelling = new File("spellings.txt");
|
||||
spellchecker.add(AbstractLuceneSpellChecker.LOCATION, spelling.getAbsolutePath());
|
||||
spellchecker.add(IndexBasedSpellChecker.FIELD, "teststop");
|
||||
spellchecker.add(FileBasedSpellChecker.SOURCE_FILE_CHAR_ENCODING, "UTF-8");
|
||||
File tmpDir = new File(System.getProperty("java.io.tmpdir"));
|
||||
File indexDir = new File(tmpDir, "spellingIdx" + new Date().getTime());
|
||||
indexDir.mkdirs();
|
||||
spellchecker.add(FileBasedSpellChecker.INDEX_DIR, indexDir.getAbsolutePath());
|
||||
spellchecker.add(FileBasedSpellChecker.FIELD_TYPE, "teststop");
|
||||
spellchecker.add(AbstractLuceneSpellChecker.SPELLCHECKER_ARG_NAME, spellchecker);
|
||||
SolrCore core = h.getCore();
|
||||
String dictName = checker.init(spellchecker, core.getResourceLoader());
|
||||
assertTrue(dictName + " is not equal to " + "external", dictName.equals("external") == true);
|
||||
checker.build(core);
|
||||
|
||||
IndexReader reader = core.getSearcher().get().getReader();
|
||||
Collection<Token> tokens = queryConverter.convert("Solar");
|
||||
SpellingResult result = checker.getSuggestions(tokens, reader);
|
||||
assertTrue("result is null and it shouldn't be", result != null);
|
||||
//should be lowercased, b/c we are using a lowercasing analyzer
|
||||
Map<String, Integer> suggestions = result.get(tokens.iterator().next());
|
||||
assertTrue("suggestions Size: " + suggestions.size() + " is not: " + 1, suggestions.size() == 1);
|
||||
Map.Entry<String, Integer> entry = suggestions.entrySet().iterator().next();
|
||||
assertTrue(entry.getKey() + " is not equal to " + "solr", entry.getKey().equals("solr") == true);
|
||||
assertTrue(entry.getValue() + " does not equal: " + SpellingResult.NO_FREQUENCY_INFO, entry.getValue() == SpellingResult.NO_FREQUENCY_INFO);
|
||||
|
||||
//test something not in the spell checker
|
||||
tokens = queryConverter.convert("super");
|
||||
result = checker.getSuggestions(tokens, reader);
|
||||
assertTrue("result is null and it shouldn't be", result != null);
|
||||
suggestions = result.get(tokens.iterator().next());
|
||||
assertTrue("suggestions is not null and it should be", suggestions == null);
|
||||
}
|
||||
|
||||
/**
|
||||
* No indexDir location set
|
||||
* @throws Exception
|
||||
*/
|
||||
public void testRAMDirectory() throws Exception {
|
||||
FileBasedSpellChecker checker = new FileBasedSpellChecker();
|
||||
NamedList spellchecker = new NamedList();
|
||||
spellchecker.add("classname", FileBasedSpellChecker.class.getName());
|
||||
|
||||
spellchecker.add(SolrSpellChecker.DICTIONARY_NAME, "external");
|
||||
File spelling = new File("spellings.txt");
|
||||
spellchecker.add(AbstractLuceneSpellChecker.LOCATION, spelling.getAbsolutePath());
|
||||
spellchecker.add(FileBasedSpellChecker.SOURCE_FILE_CHAR_ENCODING, "UTF-8");
|
||||
spellchecker.add(IndexBasedSpellChecker.FIELD, "teststop");
|
||||
spellchecker.add(FileBasedSpellChecker.FIELD_TYPE, "teststop");
|
||||
spellchecker.add(AbstractLuceneSpellChecker.SPELLCHECKER_ARG_NAME, spellchecker);
|
||||
|
||||
SolrCore core = h.getCore();
|
||||
String dictName = checker.init(spellchecker, core.getResourceLoader());
|
||||
assertTrue(dictName + " is not equal to " + "external", dictName.equals("external") == true);
|
||||
checker.build(core);
|
||||
|
||||
IndexReader reader = core.getSearcher().get().getReader();
|
||||
Collection<Token> tokens = queryConverter.convert("solar");
|
||||
SpellingResult result = checker.getSuggestions(tokens, reader);
|
||||
assertTrue("result is null and it shouldn't be", result != null);
|
||||
//should be lowercased, b/c we are using a lowercasing analyzer
|
||||
Map<String, Integer> suggestions = result.get(tokens.iterator().next());
|
||||
assertTrue("suggestions Size: " + suggestions.size() + " is not: " + 1, suggestions.size() == 1);
|
||||
Map.Entry<String, Integer> entry = suggestions.entrySet().iterator().next();
|
||||
assertTrue(entry.getKey() + " is not equal to " + "solr", entry.getKey().equals("solr") == true);
|
||||
assertTrue(entry.getValue() + " does not equal: " + SpellingResult.NO_FREQUENCY_INFO, entry.getValue() == SpellingResult.NO_FREQUENCY_INFO);
|
||||
|
||||
|
||||
tokens = queryConverter.convert("super");
|
||||
result = checker.getSuggestions(tokens, reader);
|
||||
assertTrue("result is null and it shouldn't be", result != null);
|
||||
suggestions = result.get(tokens.iterator().next());
|
||||
assertTrue("suggestions is not null and it should be", suggestions == null);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,298 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.solr.spelling;
|
||||
/**
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.WhitespaceAnalyzer;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.search.spell.JaroWinklerDistance;
|
||||
import org.apache.lucene.search.spell.SpellChecker;
|
||||
import org.apache.lucene.search.spell.StringDistance;
|
||||
import org.apache.solr.common.util.NamedList;
|
||||
import org.apache.solr.core.SolrCore;
|
||||
import org.apache.solr.util.AbstractSolrTestCase;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Collection;
|
||||
import java.util.Date;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* @since solr 1.3
|
||||
*/
|
||||
public class IndexBasedSpellCheckerTest extends AbstractSolrTestCase {
|
||||
protected SpellingQueryConverter queryConverter;
|
||||
|
||||
protected static String[] DOCS = new String[]{
|
||||
"This is a title",
|
||||
"The quick reb fox jumped over the lazy brown dogs.",
|
||||
"This is a document",
|
||||
"another document",
|
||||
"red fox",
|
||||
"green bun",
|
||||
"green bud"
|
||||
};
|
||||
|
||||
|
||||
public String getSchemaFile() {
|
||||
return "schema.xml";
|
||||
}
|
||||
|
||||
public String getSolrConfigFile() {
|
||||
return "solrconfig.xml";
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setUp() throws Exception {
|
||||
super.setUp();
|
||||
//Index something with a title
|
||||
for (int i = 0; i < DOCS.length; i++) {
|
||||
assertU(adoc("id", String.valueOf(i), "title", DOCS[i]));
|
||||
}
|
||||
assertU("commit",
|
||||
commit());
|
||||
String allq = "id:[0 TO 3]";
|
||||
assertQ("docs not added", req(allq));
|
||||
queryConverter = new SimpleQueryConverter();
|
||||
}
|
||||
|
||||
public void testSpelling() throws Exception {
|
||||
IndexBasedSpellChecker checker = new IndexBasedSpellChecker();
|
||||
|
||||
NamedList spellchecker = new NamedList();
|
||||
spellchecker.add("classname", IndexBasedSpellChecker.class.getName());
|
||||
|
||||
File tmpDir = new File(System.getProperty("java.io.tmpdir"));
|
||||
File indexDir = new File(tmpDir, "spellingIdx" + new Date().getTime());
|
||||
indexDir.mkdirs();
|
||||
spellchecker.add(AbstractLuceneSpellChecker.INDEX_DIR, indexDir.getAbsolutePath());
|
||||
spellchecker.add(IndexBasedSpellChecker.FIELD, "title");
|
||||
spellchecker.add(AbstractLuceneSpellChecker.SPELLCHECKER_ARG_NAME, spellchecker);
|
||||
SolrCore core = h.getCore();
|
||||
|
||||
String dictName = checker.init(spellchecker, core.getResourceLoader());
|
||||
assertTrue(dictName + " is not equal to " + SolrSpellChecker.DEFAULT_DICTIONARY_NAME,
|
||||
dictName.equals(SolrSpellChecker.DEFAULT_DICTIONARY_NAME) == true);
|
||||
checker.build(core);
|
||||
|
||||
IndexReader reader = core.getSearcher().get().getReader();
|
||||
Collection<Token> tokens = queryConverter.convert("documemt");
|
||||
SpellingResult result = checker.getSuggestions(tokens, reader);
|
||||
assertTrue("result is null and it shouldn't be", result != null);
|
||||
//should be lowercased, b/c we are using a lowercasing analyzer
|
||||
Map<String, Integer> suggestions = result.get(tokens.iterator().next());
|
||||
assertTrue("documemt is null and it shouldn't be", suggestions != null);
|
||||
assertTrue("documemt Size: " + suggestions.size() + " is not: " + 1, suggestions.size() == 1);
|
||||
Map.Entry<String, Integer> entry = suggestions.entrySet().iterator().next();
|
||||
assertTrue(entry.getKey() + " is not equal to " + "document", entry.getKey().equals("document") == true);
|
||||
assertTrue(entry.getValue() + " does not equal: " + SpellingResult.NO_FREQUENCY_INFO, entry.getValue() == SpellingResult.NO_FREQUENCY_INFO);
|
||||
|
||||
//test something not in the spell checker
|
||||
tokens = queryConverter.convert("super");
|
||||
result = checker.getSuggestions(tokens, reader);
|
||||
assertTrue("result is null and it shouldn't be", result != null);
|
||||
suggestions = result.get(tokens.iterator().next());
|
||||
assertTrue("suggestions is not null and it should be", suggestions == null);
|
||||
|
||||
//test something that is spelled correctly
|
||||
tokens = queryConverter.convert("document");
|
||||
result = checker.getSuggestions(tokens, reader);
|
||||
assertTrue("result is null and it shouldn't be", result != null);
|
||||
suggestions = result.get(tokens.iterator().next());
|
||||
assertTrue("suggestions is null and it shouldn't be", suggestions == null);
|
||||
|
||||
//Has multiple possibilities, but the exact exists, so that should be returned
|
||||
tokens = queryConverter.convert("red");
|
||||
result = checker.getSuggestions(tokens, reader, 2);
|
||||
assertTrue("result is null and it shouldn't be", result != null);
|
||||
suggestions = result.get(tokens.iterator().next());
|
||||
assertTrue("suggestions is not null and it should be", suggestions == null);
|
||||
|
||||
//Try out something which should have multiple suggestions
|
||||
tokens = queryConverter.convert("bug");
|
||||
result = checker.getSuggestions(tokens, reader, 2);
|
||||
assertTrue("result is null and it shouldn't be", result != null);
|
||||
suggestions = result.get(tokens.iterator().next());
|
||||
assertTrue("suggestions is null and it shouldn't be", suggestions != null);
|
||||
assertTrue("suggestions Size: " + suggestions.size() + " is not: " + 2, suggestions.size() == 2);
|
||||
|
||||
entry = suggestions.entrySet().iterator().next();
|
||||
assertTrue(entry.getKey() + " is equal to " + "bug and it shouldn't be", entry.getKey().equals("bug") == false);
|
||||
assertTrue(entry.getValue() + " does not equal: " + SpellingResult.NO_FREQUENCY_INFO, entry.getValue() == SpellingResult.NO_FREQUENCY_INFO);
|
||||
|
||||
entry = suggestions.entrySet().iterator().next();
|
||||
assertTrue(entry.getKey() + " is equal to " + "bug and it shouldn't be", entry.getKey().equals("bug") == false);
|
||||
assertTrue(entry.getValue() + " does not equal: " + SpellingResult.NO_FREQUENCY_INFO, entry.getValue() == SpellingResult.NO_FREQUENCY_INFO);
|
||||
}
|
||||
|
||||
public void testExtendedResults() throws Exception {
|
||||
IndexBasedSpellChecker checker = new IndexBasedSpellChecker();
|
||||
NamedList spellchecker = new NamedList();
|
||||
spellchecker.add("classname", IndexBasedSpellChecker.class.getName());
|
||||
|
||||
File tmpDir = new File(System.getProperty("java.io.tmpdir"));
|
||||
File indexDir = new File(tmpDir, "spellingIdx" + new Date().getTime());
|
||||
indexDir.mkdirs();
|
||||
spellchecker.add(AbstractLuceneSpellChecker.INDEX_DIR, indexDir.getAbsolutePath());
|
||||
spellchecker.add(IndexBasedSpellChecker.FIELD, "title");
|
||||
spellchecker.add(AbstractLuceneSpellChecker.SPELLCHECKER_ARG_NAME, spellchecker);
|
||||
SolrCore core = h.getCore();
|
||||
String dictName = checker.init(spellchecker, core.getResourceLoader());
|
||||
assertTrue(dictName + " is not equal to " + SolrSpellChecker.DEFAULT_DICTIONARY_NAME,
|
||||
dictName.equals(SolrSpellChecker.DEFAULT_DICTIONARY_NAME) == true);
|
||||
checker.build(core);
|
||||
|
||||
IndexReader reader = core.getSearcher().get().getReader();
|
||||
Collection<Token> tokens = queryConverter.convert("documemt");
|
||||
SpellingResult result = checker.getSuggestions(tokens, reader, 1, false, true);
|
||||
assertTrue("result is null and it shouldn't be", result != null);
|
||||
//should be lowercased, b/c we are using a lowercasing analyzer
|
||||
Map<String, Integer> suggestions = result.get(tokens.iterator().next());
|
||||
assertTrue("documemt is null and it shouldn't be", suggestions != null);
|
||||
assertTrue("documemt Size: " + suggestions.size() + " is not: " + 1, suggestions.size() == 1);
|
||||
Map.Entry<String, Integer> entry = suggestions.entrySet().iterator().next();
|
||||
assertTrue(entry.getKey() + " is not equal to " + "document", entry.getKey().equals("document") == true);
|
||||
assertTrue(entry.getValue() + " does not equal: " + 2, entry.getValue() == 2);
|
||||
|
||||
//test something not in the spell checker
|
||||
tokens = queryConverter.convert("super");
|
||||
result = checker.getSuggestions(tokens, reader, 1, false, true);
|
||||
assertTrue("result is null and it shouldn't be", result != null);
|
||||
suggestions = result.get(tokens.iterator().next());
|
||||
assertTrue("suggestions is not null and it should be", suggestions == null);
|
||||
|
||||
tokens = queryConverter.convert("document");
|
||||
result = checker.getSuggestions(tokens, reader, 1, false, true);
|
||||
assertTrue("result is null and it shouldn't be", result != null);
|
||||
suggestions = result.get(tokens.iterator().next());
|
||||
assertTrue("suggestions is not null and it should be", suggestions == null);
|
||||
}
|
||||
|
||||
private class TestSpellChecker extends IndexBasedSpellChecker{
|
||||
public SpellChecker getSpellChecker(){
|
||||
return spellChecker;
|
||||
}
|
||||
}
|
||||
|
||||
public void testAlternateDistance() throws Exception {
|
||||
TestSpellChecker checker = new TestSpellChecker();
|
||||
NamedList spellchecker = new NamedList();
|
||||
spellchecker.add("classname", IndexBasedSpellChecker.class.getName());
|
||||
|
||||
File tmpDir = new File(System.getProperty("java.io.tmpdir"));
|
||||
File indexDir = new File(tmpDir, "spellingIdx" + new Date().getTime());
|
||||
indexDir.mkdirs();
|
||||
spellchecker.add(AbstractLuceneSpellChecker.INDEX_DIR, indexDir.getAbsolutePath());
|
||||
spellchecker.add(IndexBasedSpellChecker.FIELD, "title");
|
||||
spellchecker.add(AbstractLuceneSpellChecker.SPELLCHECKER_ARG_NAME, spellchecker);
|
||||
spellchecker.add(AbstractLuceneSpellChecker.STRING_DISTANCE, JaroWinklerDistance.class.getName());
|
||||
SolrCore core = h.getCore();
|
||||
String dictName = checker.init(spellchecker, core.getResourceLoader());
|
||||
assertTrue(dictName + " is not equal to " + SolrSpellChecker.DEFAULT_DICTIONARY_NAME,
|
||||
dictName.equals(SolrSpellChecker.DEFAULT_DICTIONARY_NAME) == true);
|
||||
checker.build(core);
|
||||
SpellChecker sc = checker.getSpellChecker();
|
||||
assertTrue("sc is null and it shouldn't be", sc != null);
|
||||
StringDistance sd = sc.getStringDistance();
|
||||
assertTrue("sd is null and it shouldn't be", sd != null);
|
||||
assertTrue("sd is not an instance of " + JaroWinklerDistance.class.getName(), sd instanceof JaroWinklerDistance);
|
||||
}
|
||||
|
||||
public void testAlternateLocation() throws Exception {
|
||||
String[] ALT_DOCS = new String[]{
|
||||
"jumpin jack flash",
|
||||
"Sargent Peppers Lonely Hearts Club Band",
|
||||
"Born to Run",
|
||||
"Thunder Road",
|
||||
"Londons Burning",
|
||||
"A Horse with No Name",
|
||||
"Sweet Caroline"
|
||||
};
|
||||
|
||||
IndexBasedSpellChecker checker = new IndexBasedSpellChecker();
|
||||
NamedList spellchecker = new NamedList();
|
||||
spellchecker.add("classname", IndexBasedSpellChecker.class.getName());
|
||||
|
||||
File tmpDir = new File(System.getProperty("java.io.tmpdir"));
|
||||
File indexDir = new File(tmpDir, "spellingIdx" + new Date().getTime());
|
||||
//create a standalone index
|
||||
File altIndexDir = new File(tmpDir, "alternateIdx" + new Date().getTime());
|
||||
IndexWriter iw = new IndexWriter(altIndexDir, new WhitespaceAnalyzer(), IndexWriter.MaxFieldLength.LIMITED);
|
||||
for (int i = 0; i < ALT_DOCS.length; i++) {
|
||||
Document doc = new Document();
|
||||
doc.add(new Field("title", ALT_DOCS[i], Field.Store.YES, Field.Index.TOKENIZED));
|
||||
iw.addDocument(doc);
|
||||
}
|
||||
iw.optimize();
|
||||
iw.close();
|
||||
indexDir.mkdirs();
|
||||
spellchecker.add(AbstractLuceneSpellChecker.INDEX_DIR, indexDir.getAbsolutePath());
|
||||
spellchecker.add(AbstractLuceneSpellChecker.LOCATION, altIndexDir.getAbsolutePath());
|
||||
spellchecker.add(IndexBasedSpellChecker.FIELD, "title");
|
||||
spellchecker.add(AbstractLuceneSpellChecker.SPELLCHECKER_ARG_NAME, spellchecker);
|
||||
SolrCore core = h.getCore();
|
||||
String dictName = checker.init(spellchecker, core.getResourceLoader());
|
||||
assertTrue(dictName + " is not equal to " + SolrSpellChecker.DEFAULT_DICTIONARY_NAME,
|
||||
dictName.equals(SolrSpellChecker.DEFAULT_DICTIONARY_NAME) == true);
|
||||
checker.build(core);
|
||||
|
||||
IndexReader reader = core.getSearcher().get().getReader();
|
||||
Collection<Token> tokens = queryConverter.convert("flesh");
|
||||
SpellingResult result = checker.getSuggestions(tokens, reader, 1, false, true);
|
||||
assertTrue("result is null and it shouldn't be", result != null);
|
||||
//should be lowercased, b/c we are using a lowercasing analyzer
|
||||
Map<String, Integer> suggestions = result.get(tokens.iterator().next());
|
||||
assertTrue("flesh is null and it shouldn't be", suggestions != null);
|
||||
assertTrue("flesh Size: " + suggestions.size() + " is not: " + 1, suggestions.size() == 1);
|
||||
Map.Entry<String, Integer> entry = suggestions.entrySet().iterator().next();
|
||||
assertTrue(entry.getKey() + " is not equal to " + "flash", entry.getKey().equals("flash") == true);
|
||||
assertTrue(entry.getValue() + " does not equal: " + 1, entry.getValue() == 1);
|
||||
|
||||
//test something not in the spell checker
|
||||
tokens = queryConverter.convert("super");
|
||||
result = checker.getSuggestions(tokens, reader, 1, false, true);
|
||||
assertTrue("result is null and it shouldn't be", result != null);
|
||||
suggestions = result.get(tokens.iterator().next());
|
||||
assertTrue("suggestions is not null and it should be", suggestions == null);
|
||||
|
||||
tokens = queryConverter.convert("Caroline");
|
||||
result = checker.getSuggestions(tokens, reader, 1, false, true);
|
||||
assertTrue("result is null and it shouldn't be", result != null);
|
||||
suggestions = result.get(tokens.iterator().next());
|
||||
assertTrue("suggestions is not null and it should be", suggestions == null);
|
||||
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,49 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.solr.spelling;
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.WhitespaceAnalyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
import java.util.Collection;
|
||||
import java.util.HashSet;
|
||||
import java.io.StringReader;
|
||||
import java.io.IOException;
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
* @since solr 1.3
|
||||
**/
|
||||
class SimpleQueryConverter extends SpellingQueryConverter{
|
||||
@Override
|
||||
public Collection<Token> convert(String origQuery) {
|
||||
Collection<Token> result = new HashSet<Token>();
|
||||
WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer();
|
||||
TokenStream ts = analyzer.tokenStream("", new StringReader(origQuery));
|
||||
Token tok = null;
|
||||
try {
|
||||
while ((tok = ts.next()) != null){
|
||||
result.add(tok);
|
||||
}
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,53 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.solr.spelling;
|
||||
|
||||
import java.util.Collection;
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.WhitespaceAnalyzer;
|
||||
import org.apache.solr.common.util.NamedList;
|
||||
import org.apache.solr.util.AbstractSolrTestCase;
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
* @since solr 1.3
|
||||
**/
|
||||
public class SpellingQueryConverterTest extends AbstractSolrTestCase {
|
||||
|
||||
public String getSchemaFile() {
|
||||
return "schema.xml";
|
||||
}
|
||||
|
||||
public String getSolrConfigFile() {
|
||||
return "solrconfig.xml";
|
||||
}
|
||||
|
||||
|
||||
public void test() throws Exception {
|
||||
SpellingQueryConverter converter = new SpellingQueryConverter();
|
||||
converter.init(new NamedList());
|
||||
converter.setAnalyzer(new WhitespaceAnalyzer());
|
||||
Collection<Token> tokens = converter.convert("field:foo");
|
||||
assertTrue("tokens is null and it shouldn't be", tokens != null);
|
||||
assertTrue("tokens Size: " + tokens.size() + " is not: " + 1, tokens.size() == 1);
|
||||
}
|
||||
|
||||
|
||||
}
|
|
@ -136,7 +136,7 @@
|
|||
-->
|
||||
<maxBooleanClauses>1024</maxBooleanClauses>
|
||||
|
||||
|
||||
|
||||
<!-- Cache specification for Filters or DocSets - unordered set of *all* documents
|
||||
that match a particular query.
|
||||
-->
|
||||
|
@ -281,7 +281,7 @@
|
|||
|
||||
<requestHandler name="test" class="solr.tst.TestRequestHandler" />
|
||||
|
||||
<!-- test query parameter defaults -->
|
||||
<!-- test query parameter defaults -->
|
||||
<requestHandler name="defaults" class="solr.StandardRequestHandler">
|
||||
<lst name="defaults">
|
||||
<int name="rows">4</int>
|
||||
|
@ -289,8 +289,8 @@
|
|||
<str name="hl.fl">text,name,subject,title,whitetok</str>
|
||||
</lst>
|
||||
</requestHandler>
|
||||
|
||||
<!-- test query parameter defaults -->
|
||||
|
||||
<!-- test query parameter defaults -->
|
||||
<requestHandler name="lazy" class="solr.StandardRequestHandler" startup="lazy">
|
||||
<lst name="defaults">
|
||||
<int name="rows">4</int>
|
||||
|
@ -307,7 +307,7 @@
|
|||
<str name="queryFieldType">string</str>
|
||||
<str name="config-file">elevate.xml</str>
|
||||
</searchComponent>
|
||||
|
||||
|
||||
<requestHandler name="/elevate" class="org.apache.solr.handler.component.SearchHandler">
|
||||
<lst name="defaults">
|
||||
<str name="echoParams">explicit</str>
|
||||
|
@ -316,7 +316,51 @@
|
|||
<str>elevate</str>
|
||||
</arr>
|
||||
</requestHandler>
|
||||
|
||||
|
||||
<searchComponent name="spellcheck" class="org.apache.solr.handler.component.SpellCheckComponent">
|
||||
<lst name="defaults">
|
||||
<!-- omp = Only More Popular -->
|
||||
<str name="spellcheck.onlyMorePopular">false</str>
|
||||
<!-- exr = Extended Results -->
|
||||
<str name="spellcheck.extendedResults">false</str>
|
||||
<!-- The number of suggestions to return -->
|
||||
<str name="spellcheck.count">1</str>
|
||||
</lst>
|
||||
<str name="queryAnalyzerFieldType">lowerfilt</str>
|
||||
|
||||
<lst name="spellchecker">
|
||||
<str name="name">default</str>
|
||||
<str name="field">lowerfilt</str>
|
||||
<str name="spellcheckIndexDir">./spellchecker</str>
|
||||
|
||||
</lst>
|
||||
<lst name="spellchecker">
|
||||
<str name="name">jarowinkler</str>
|
||||
<str name="field">lowerfilt</str>
|
||||
<!-- Use a different Distance Measure -->
|
||||
<str name="distanceMeasure">org.apache.lucene.search.spell.JaroWinklerDistance</str>
|
||||
<str name="spellcheckIndexDir">./spellchecker</str>
|
||||
|
||||
</lst>
|
||||
<lst name="spellchecker">
|
||||
<str name="classname">solr.FileBasedSpellChecker</str>
|
||||
<str name="name">external</str>
|
||||
<str name="sourceLocation">spellings.txt</str>
|
||||
<str name="characterEncoding">UTF-8</str>
|
||||
<str name="spellcheckIndexDir">./spellchecker</str>
|
||||
</lst>
|
||||
</searchComponent>
|
||||
<!--
|
||||
The SpellingQueryConverter to convert raw (CommonParams.Q) queries into tokens. Uses a simple regular expression
|
||||
to strip off field markup, boosts, ranges, etc. but it is not guaranteed to match an exact parse from the query parser.
|
||||
-->
|
||||
<queryConverter name="queryConverter" class="org.apache.solr.spelling.SpellingQueryConverter"/>
|
||||
|
||||
<requestHandler name="spellCheckCompRH" class="org.apache.solr.handler.component.SearchHandler">
|
||||
<arr name="last-components">
|
||||
<str>spellcheck</str>
|
||||
</arr>
|
||||
</requestHandler>
|
||||
|
||||
<highlighting>
|
||||
<!-- Configure the standard fragmenter -->
|
||||
|
@ -325,13 +369,13 @@
|
|||
<int name="hl.fragsize">100</int>
|
||||
</lst>
|
||||
</fragmenter>
|
||||
|
||||
|
||||
<fragmenter name="regex" class="org.apache.solr.highlight.RegexFragmenter">
|
||||
<lst name="defaults">
|
||||
<int name="hl.fragsize">70</int>
|
||||
</lst>
|
||||
</fragmenter>
|
||||
|
||||
|
||||
<!-- Configure the standard formatter -->
|
||||
<formatter name="html" class="org.apache.solr.highlight.HtmlFormatter" default="true">
|
||||
<lst name="defaults">
|
||||
|
|
|
@ -0,0 +1,16 @@
|
|||
foo
|
||||
bar
|
||||
Solr
|
||||
junk
|
||||
foo
|
||||
bar
|
||||
Solr
|
||||
junk
|
||||
foo
|
||||
bar
|
||||
Solr
|
||||
junk
|
||||
foo
|
||||
bar
|
||||
Solr
|
||||
junk
|
Loading…
Reference in New Issue