mirror of https://github.com/apache/lucene.git
SOLR-3143: add SuggestQueryConverter for autosuggesters
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1291322 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
9fd084ea1c
commit
5adeacf2aa
|
@ -506,6 +506,9 @@ New Features
|
|||
* LUCENE-3714: Add WFSTLookupFactory, a suggester that uses a weighted FST
|
||||
for more fine-grained suggestions. (Mike McCandless, Dawid Weiss, Robert Muir)
|
||||
|
||||
* SOLR-3143: Add SuggestQueryConverter, a QueryConverter intended for
|
||||
auto-suggesters. (Robert Muir)
|
||||
|
||||
Optimizations
|
||||
----------------------
|
||||
* SOLR-1931: Speedup for LukeRequestHandler and admin/schema browser. New parameter
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
package org.apache.solr.spelling;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
|
@ -100,39 +101,42 @@ public class SpellingQueryConverter extends QueryConverter {
|
|||
Collection<Token> result = new ArrayList<Token>();
|
||||
//TODO: Extract the words using a simple regex, but not query stuff, and then analyze them to produce the token stream
|
||||
Matcher matcher = QUERY_REGEX.matcher(original);
|
||||
TokenStream stream;
|
||||
while (matcher.find()) {
|
||||
String word = matcher.group(0);
|
||||
if (word.equals("AND") == false && word.equals("OR") == false) {
|
||||
try {
|
||||
stream = analyzer.tokenStream("", new StringReader(word));
|
||||
// TODO: support custom attributes
|
||||
CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
|
||||
FlagsAttribute flagsAtt = stream.addAttribute(FlagsAttribute.class);
|
||||
TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class);
|
||||
PayloadAttribute payloadAtt = stream.addAttribute(PayloadAttribute.class);
|
||||
PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class);
|
||||
OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
|
||||
stream.reset();
|
||||
while (stream.incrementToken()) {
|
||||
Token token = new Token();
|
||||
token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
|
||||
token.setStartOffset(matcher.start() + offsetAtt.startOffset());
|
||||
token.setEndOffset(matcher.start() + offsetAtt.endOffset());
|
||||
token.setFlags(flagsAtt.getFlags());
|
||||
token.setType(typeAtt.type());
|
||||
token.setPayload(payloadAtt.getPayload());
|
||||
token.setPositionIncrement(posIncAtt.getPositionIncrement());
|
||||
result.add(token);
|
||||
}
|
||||
stream.end();
|
||||
stream.close();
|
||||
analyze(result, new StringReader(word), matcher.start());
|
||||
} catch (IOException e) {
|
||||
// TODO: shouldn't we log something?
|
||||
}
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
protected void analyze(Collection<Token> result, Reader text, int offset) throws IOException {
|
||||
TokenStream stream = analyzer.tokenStream("", text);
|
||||
// TODO: support custom attributes
|
||||
CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
|
||||
FlagsAttribute flagsAtt = stream.addAttribute(FlagsAttribute.class);
|
||||
TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class);
|
||||
PayloadAttribute payloadAtt = stream.addAttribute(PayloadAttribute.class);
|
||||
PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class);
|
||||
OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
|
||||
stream.reset();
|
||||
while (stream.incrementToken()) {
|
||||
Token token = new Token();
|
||||
token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
|
||||
token.setStartOffset(offset + offsetAtt.startOffset());
|
||||
token.setEndOffset(offset + offsetAtt.endOffset());
|
||||
token.setFlags(flagsAtt.getFlags());
|
||||
token.setType(typeAtt.type());
|
||||
token.setPayload(payloadAtt.getPayload());
|
||||
token.setPositionIncrement(posIncAtt.getPositionIncrement());
|
||||
result.add(token);
|
||||
}
|
||||
stream.end();
|
||||
stream.close();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,47 @@
|
|||
package org.apache.solr.spelling;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
|
||||
/**
|
||||
* Passes the entire query string to the configured analyzer as-is.
|
||||
**/
|
||||
public class SuggestQueryConverter extends SpellingQueryConverter {
|
||||
|
||||
@Override
|
||||
public Collection<Token> convert(String original) {
|
||||
if (original == null) { // this can happen with q.alt = and no query
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
Collection<Token> result = new ArrayList<Token>();
|
||||
try {
|
||||
analyze(result, new StringReader(original), 0);
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,8 @@
|
|||
# simple auto-suggest phrase dictionary for testing
|
||||
# note this uses tabs as separator!
|
||||
the first phrase 1.0
|
||||
the second phrase 2.0
|
||||
testing 1234 3.0
|
||||
foo 5.0
|
||||
the fifth phrase 2.0
|
||||
the final phrase 4.0
|
|
@ -0,0 +1,52 @@
|
|||
<?xml version="1.0" ?>
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<!-- Test schema file for phrase suggestions -->
|
||||
|
||||
<schema name="test" version="1.0">
|
||||
<types>
|
||||
<fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
|
||||
|
||||
<!-- basic text field -->
|
||||
<fieldtype name="text" class="solr.TextField">
|
||||
<analyzer>
|
||||
<tokenizer class="solr.StandardTokenizerFactory"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
</analyzer>
|
||||
</fieldtype>
|
||||
|
||||
<fieldtype name="phrase_suggest" class="solr.TextField">
|
||||
<analyzer>
|
||||
<tokenizer class="solr.KeywordTokenizerFactory"/>
|
||||
<filter class="solr.PatternReplaceFilterFactory"
|
||||
pattern="([^\p{L}\p{M}\p{N}\p{Cs}]*[\p{L}\p{M}\p{N}\p{Cs}\_]+:)|([^\p{L}\p{M}\p{N}\p{Cs}])+"
|
||||
replacement=" " replace="all"/>
|
||||
<filter class="solr.LowerCaseFilterFactory"/>
|
||||
<filter class="solr.TrimFilterFactory"/>
|
||||
</analyzer>
|
||||
</fieldtype>
|
||||
</types>
|
||||
|
||||
<fields>
|
||||
<field name="id" type="int" indexed="true" stored="true" multiValued="false" required="false"/>
|
||||
<field name="text" type="text" indexed="true" stored="false"/>
|
||||
</fields>
|
||||
|
||||
<defaultSearchField>text</defaultSearchField>
|
||||
<uniqueKey>id</uniqueKey>
|
||||
</schema>
|
|
@ -0,0 +1,63 @@
|
|||
<?xml version="1.0" ?>
|
||||
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<!-- solrconfig.xml for a WFST phrase suggester -->
|
||||
<config>
|
||||
<luceneMatchVersion>${tests.luceneMatchVersion:LUCENE_CURRENT}</luceneMatchVersion>
|
||||
<dataDir>${solr.data.dir:}</dataDir>
|
||||
<directoryFactory name="DirectoryFactory" class="${solr.directoryFactory:solr.RAMDirectoryFactory}"/>
|
||||
<requestHandler name="standard" class="solr.StandardRequestHandler"></requestHandler>
|
||||
|
||||
<!-- WFSTLookup suggest component -->
|
||||
<searchComponent class="solr.SpellCheckComponent" name="suggest_wfst">
|
||||
<lst name="spellchecker">
|
||||
<str name="name">suggest_wfst</str>
|
||||
<str name="classname">org.apache.solr.spelling.suggest.Suggester</str>
|
||||
<str name="lookupImpl">org.apache.solr.spelling.suggest.fst.WFSTLookupFactory</str>
|
||||
<str name="storeDir">suggest_wfst</str>
|
||||
<str name="buildOnCommit">false</str>
|
||||
|
||||
<!-- Suggester properties -->
|
||||
<bool name="exactMatchFirst">true</bool>
|
||||
|
||||
<str name="sourceLocation">phrasesuggest.txt</str>
|
||||
</lst>
|
||||
|
||||
<!-- specify a fieldtype using keywordtokenizer + lowercase + cleanup -->
|
||||
<str name="queryAnalyzerFieldType">phrase_suggest</str>
|
||||
</searchComponent>
|
||||
|
||||
<!-- is this thing just configured globally or wtf is going on here?! -->
|
||||
<queryConverter name="queryConverter" class="org.apache.solr.spelling.SuggestQueryConverter"/>
|
||||
|
||||
<!-- wfst (finite state automaton based) -->
|
||||
<requestHandler class="org.apache.solr.handler.component.SearchHandler" name="/suggest_wfst">
|
||||
<lst name="defaults">
|
||||
<str name="spellcheck">true</str>
|
||||
<str name="spellcheck.dictionary">suggest_wfst</str>
|
||||
<str name="spellcheck.collate">false</str>
|
||||
<!-- NOTE: if this is false, results are alpha-ordered, not by weight! -->
|
||||
<str name="spellcheck.onlyMorePopular">true</str>
|
||||
</lst>
|
||||
<arr name="components">
|
||||
<str>suggest_wfst</str>
|
||||
</arr>
|
||||
</requestHandler>
|
||||
|
||||
</config>
|
|
@ -0,0 +1,73 @@
|
|||
package org.apache.solr.spelling;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Collection;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||
import org.apache.lucene.analysis.CannedTokenStream;
|
||||
import org.apache.lucene.analysis.MockAnalyzer;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
||||
import org.apache.lucene.analysis.core.LowerCaseFilter;
|
||||
import org.apache.lucene.analysis.miscellaneous.TrimFilter;
|
||||
import org.apache.lucene.analysis.pattern.PatternReplaceFilter;
|
||||
|
||||
public class TestSuggestSpellingConverter extends BaseTokenStreamTestCase {
|
||||
SuggestQueryConverter converter = new SuggestQueryConverter();
|
||||
|
||||
public void testSimple() throws Exception {
|
||||
// lowercases only!
|
||||
converter.setAnalyzer(new MockAnalyzer(random, MockTokenizer.KEYWORD, true));
|
||||
assertConvertsTo("This is a test", new String[] { "this is a test" });
|
||||
}
|
||||
|
||||
public void testComplicated() throws Exception {
|
||||
// lowercases, removes field names, other syntax, collapses runs of whitespace, etc.
|
||||
converter.setAnalyzer(new Analyzer() {
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
Tokenizer tokenizer = new KeywordTokenizer(reader);
|
||||
TokenStream filter = new PatternReplaceFilter(tokenizer,
|
||||
Pattern.compile("([^\\p{L}\\p{M}\\p{N}\\p{Cs}]*[\\p{L}\\p{M}\\p{N}\\p{Cs}\\_]+:)|([^\\p{L}\\p{M}\\p{N}\\p{Cs}])+"), " ", true);
|
||||
filter = new LowerCaseFilter(TEST_VERSION_CURRENT, filter);
|
||||
filter = new TrimFilter(filter, false);
|
||||
return new TokenStreamComponents(tokenizer, filter);
|
||||
}
|
||||
});
|
||||
assertConvertsTo("test1 +test2", new String[] { "test1 test2" });
|
||||
assertConvertsTo("test~", new String[] { "test" });
|
||||
assertConvertsTo("field:test", new String[] { "test" });
|
||||
assertConvertsTo("This is a test", new String[] { "this is a test" });
|
||||
assertConvertsTo(" This is a test", new String[] { "this is a test" });
|
||||
assertConvertsTo("Foo (field:bar) text_hi:हिन्दी ", new String[] { "foo bar हिन्दी" });
|
||||
}
|
||||
|
||||
public void assertConvertsTo(String text, String expected[]) throws IOException {
|
||||
Collection<Token> tokens = converter.convert(text);
|
||||
TokenStream ts = new CannedTokenStream(tokens.toArray(new Token[0]));
|
||||
assertTokenStreamContents(ts, expected);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,46 @@
|
|||
package org.apache.solr.spelling.suggest;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.solr.SolrTestCaseJ4;
|
||||
import org.apache.solr.common.params.SpellingParams;
|
||||
import org.junit.BeforeClass;
|
||||
|
||||
public class TestPhraseSuggestions extends SolrTestCaseJ4 {
|
||||
static final String URI = "/suggest_wfst";
|
||||
|
||||
@BeforeClass
|
||||
public static void beforeClass() throws Exception {
|
||||
initCore("solrconfig-phrasesuggest.xml","schema-phrasesuggest.xml");
|
||||
assertQ(req("qt", URI, "q", "", SpellingParams.SPELLCHECK_BUILD, "true"));
|
||||
}
|
||||
|
||||
public void test() {
|
||||
assertQ(req("qt", URI, "q", "the f", SpellingParams.SPELLCHECK_COUNT, "4"),
|
||||
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='the f']/int[@name='numFound'][.='3']",
|
||||
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='the f']/arr[@name='suggestion']/str[1][.='the final phrase']",
|
||||
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='the f']/arr[@name='suggestion']/str[2][.='the fifth phrase']",
|
||||
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='the f']/arr[@name='suggestion']/str[3][.='the first phrase']"
|
||||
);
|
||||
|
||||
assertQ(req("qt", URI, "q", "Testing +12", SpellingParams.SPELLCHECK_COUNT, "4"),
|
||||
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='testing 12']/int[@name='numFound'][.='1']",
|
||||
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='testing 12']/arr[@name='suggestion']/str[1][.='testing 1234']"
|
||||
);
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue