SOLR-3143: add SuggestQueryConverter for autosuggesters

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1291322 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2012-02-20 15:48:38 +00:00
parent 9fd084ea1c
commit 5adeacf2aa
8 changed files with 320 additions and 24 deletions

View File

@ -506,6 +506,9 @@ New Features
* LUCENE-3714: Add WFSTLookupFactory, a suggester that uses a weighted FST
for more fine-grained suggestions. (Mike McCandless, Dawid Weiss, Robert Muir)
* SOLR-3143: Add SuggestQueryConverter, a QueryConverter intended for
auto-suggesters. (Robert Muir)
Optimizations
----------------------
* SOLR-1931: Speedup for LukeRequestHandler and admin/schema browser. New parameter

View File

@ -18,6 +18,7 @@
package org.apache.solr.spelling;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collection;
@ -100,39 +101,42 @@ public class SpellingQueryConverter extends QueryConverter {
Collection<Token> result = new ArrayList<Token>();
//TODO: Extract the words using a simple regex, but not query stuff, and then analyze them to produce the token stream
Matcher matcher = QUERY_REGEX.matcher(original);
TokenStream stream;
while (matcher.find()) {
String word = matcher.group(0);
if (word.equals("AND") == false && word.equals("OR") == false) {
try {
stream = analyzer.tokenStream("", new StringReader(word));
// TODO: support custom attributes
CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
FlagsAttribute flagsAtt = stream.addAttribute(FlagsAttribute.class);
TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class);
PayloadAttribute payloadAtt = stream.addAttribute(PayloadAttribute.class);
PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class);
OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
stream.reset();
while (stream.incrementToken()) {
Token token = new Token();
token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
token.setStartOffset(matcher.start() + offsetAtt.startOffset());
token.setEndOffset(matcher.start() + offsetAtt.endOffset());
token.setFlags(flagsAtt.getFlags());
token.setType(typeAtt.type());
token.setPayload(payloadAtt.getPayload());
token.setPositionIncrement(posIncAtt.getPositionIncrement());
result.add(token);
}
stream.end();
stream.close();
analyze(result, new StringReader(word), matcher.start());
} catch (IOException e) {
// TODO: shouldn't we log something?
}
}
}
return result;
}
/**
 * Analyzes {@code text} with the configured {@link #analyzer} and converts each
 * produced token into a {@link Token}, appending it to {@code result}.
 *
 * @param result collection that receives the converted tokens
 * @param text   the text to analyze; consumed (but not closed) via the analyzer's TokenStream
 * @param offset added to each token's start/end offsets so they point into the
 *               original query string rather than the analyzed fragment
 * @throws IOException if the underlying TokenStream fails
 */
protected void analyze(Collection<Token> result, Reader text, int offset) throws IOException {
  TokenStream stream = analyzer.tokenStream("", text);
  // TODO: support custom attributes
  CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
  FlagsAttribute flagsAtt = stream.addAttribute(FlagsAttribute.class);
  TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class);
  PayloadAttribute payloadAtt = stream.addAttribute(PayloadAttribute.class);
  PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class);
  OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
  // ensure the stream is closed even if reset()/incrementToken() throws;
  // the original code leaked the stream on exception
  try {
    stream.reset();
    while (stream.incrementToken()) {
      Token token = new Token();
      token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
      token.setStartOffset(offset + offsetAtt.startOffset());
      token.setEndOffset(offset + offsetAtt.endOffset());
      token.setFlags(flagsAtt.getFlags());
      token.setType(typeAtt.type());
      token.setPayload(payloadAtt.getPayload());
      token.setPositionIncrement(posIncAtt.getPositionIncrement());
      result.add(token);
    }
    stream.end();
  } finally {
    stream.close();
  }
}
}

View File

@ -0,0 +1,47 @@
package org.apache.solr.spelling;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import org.apache.lucene.analysis.Token;
/**
* Passes the entire query string to the configured analyzer as-is.
**/
/**
 * A {@link SpellingQueryConverter} variant for auto-suggesters: the raw query
 * string is handed to the configured analyzer as one undivided chunk, with no
 * query-syntax parsing or word splitting beforehand.
 */
public class SuggestQueryConverter extends SpellingQueryConverter {

  @Override
  public Collection<Token> convert(String original) {
    // q.alt with an empty query can hand us null; suggest nothing in that case
    if (original == null) {
      return Collections.emptyList();
    }

    Collection<Token> tokens = new ArrayList<Token>();
    try {
      analyze(tokens, new StringReader(original), 0);
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
    return tokens;
  }
}

View File

@ -0,0 +1,8 @@
# simple auto-suggest phrase dictionary for testing
# note this uses tabs as separator!
the first phrase 1.0
the second phrase 2.0
testing 1234 3.0
foo 5.0
the fifth phrase 2.0
the final phrase 4.0

View File

@ -0,0 +1,52 @@
<?xml version="1.0" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!-- Test schema file for phrase suggestions -->
<schema name="test" version="1.0">
<types>
<fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
<!-- basic text field -->
<fieldtype name="text" class="solr.TextField">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldtype>
<fieldtype name="phrase_suggest" class="solr.TextField">
<analyzer>
<tokenizer class="solr.KeywordTokenizerFactory"/>
<filter class="solr.PatternReplaceFilterFactory"
pattern="([^\p{L}\p{M}\p{N}\p{Cs}]*[\p{L}\p{M}\p{N}\p{Cs}\_]+:)|([^\p{L}\p{M}\p{N}\p{Cs}])+"
replacement=" " replace="all"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.TrimFilterFactory"/>
</analyzer>
</fieldtype>
</types>
<fields>
<field name="id" type="int" indexed="true" stored="true" multiValued="false" required="false"/>
<field name="text" type="text" indexed="true" stored="false"/>
</fields>
<defaultSearchField>text</defaultSearchField>
<uniqueKey>id</uniqueKey>
</schema>

View File

@ -0,0 +1,63 @@
<?xml version="1.0" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!-- solrconfig.xml for a WFST phrase suggester -->
<config>
<luceneMatchVersion>${tests.luceneMatchVersion:LUCENE_CURRENT}</luceneMatchVersion>
<dataDir>${solr.data.dir:}</dataDir>
<directoryFactory name="DirectoryFactory" class="${solr.directoryFactory:solr.RAMDirectoryFactory}"/>
<requestHandler name="standard" class="solr.StandardRequestHandler"></requestHandler>
<!-- WFSTLookup suggest component -->
<searchComponent class="solr.SpellCheckComponent" name="suggest_wfst">
<lst name="spellchecker">
<str name="name">suggest_wfst</str>
<str name="classname">org.apache.solr.spelling.suggest.Suggester</str>
<str name="lookupImpl">org.apache.solr.spelling.suggest.fst.WFSTLookupFactory</str>
<str name="storeDir">suggest_wfst</str>
<str name="buildOnCommit">false</str>
<!-- Suggester properties -->
<bool name="exactMatchFirst">true</bool>
<str name="sourceLocation">phrasesuggest.txt</str>
</lst>
<!-- specify a fieldtype using keywordtokenizer + lowercase + cleanup -->
<str name="queryAnalyzerFieldType">phrase_suggest</str>
</searchComponent>
<!-- NOTE(review): the queryConverter appears to be registered globally for the
     core rather than per search component — confirm this is intended -->
<queryConverter name="queryConverter" class="org.apache.solr.spelling.SuggestQueryConverter"/>
<!-- wfst (finite state automaton based) -->
<requestHandler class="org.apache.solr.handler.component.SearchHandler" name="/suggest_wfst">
<lst name="defaults">
<str name="spellcheck">true</str>
<str name="spellcheck.dictionary">suggest_wfst</str>
<str name="spellcheck.collate">false</str>
<!-- NOTE: if this is false, results are alpha-ordered, not by weight! -->
<str name="spellcheck.onlyMorePopular">true</str>
</lst>
<arr name="components">
<str>suggest_wfst</str>
</arr>
</requestHandler>
</config>

View File

@ -0,0 +1,73 @@
package org.apache.solr.spelling;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.util.Collection;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.miscellaneous.TrimFilter;
import org.apache.lucene.analysis.pattern.PatternReplaceFilter;
/**
 * Tests {@link SuggestQueryConverter}: the entire query string should pass
 * through the configured analyzer as a single chunk.
 */
public class TestSuggestSpellingConverter extends BaseTokenStreamTestCase {
  SuggestQueryConverter converter = new SuggestQueryConverter();

  public void testSimple() throws Exception {
    // keyword tokenizer + lowercasing: output is just the lowercased input
    converter.setAnalyzer(new MockAnalyzer(random, MockTokenizer.KEYWORD, true));
    assertConvertsTo("This is a test", new String[] { "this is a test" });
  }

  public void testComplicated() throws Exception {
    // a pipeline that strips field names and other query syntax, lowercases,
    // collapses runs of whitespace, and trims the result
    Analyzer pipeline = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new KeywordTokenizer(reader);
        TokenStream stream = new PatternReplaceFilter(tokenizer,
            Pattern.compile("([^\\p{L}\\p{M}\\p{N}\\p{Cs}]*[\\p{L}\\p{M}\\p{N}\\p{Cs}\\_]+:)|([^\\p{L}\\p{M}\\p{N}\\p{Cs}])+"), " ", true);
        stream = new LowerCaseFilter(TEST_VERSION_CURRENT, stream);
        stream = new TrimFilter(stream, false);
        return new TokenStreamComponents(tokenizer, stream);
      }
    };
    converter.setAnalyzer(pipeline);
    assertConvertsTo("test1 +test2", new String[] { "test1 test2" });
    assertConvertsTo("test~", new String[] { "test" });
    assertConvertsTo("field:test", new String[] { "test" });
    assertConvertsTo("This is a test", new String[] { "this is a test" });
    assertConvertsTo(" This is a test", new String[] { "this is a test" });
    assertConvertsTo("Foo (field:bar) text_hi:हिन्दी ", new String[] { "foo bar हिन्दी" });
  }

  /** Runs {@code text} through the converter and checks the resulting token terms. */
  public void assertConvertsTo(String text, String expected[]) throws IOException {
    Collection<Token> tokens = converter.convert(text);
    TokenStream stream = new CannedTokenStream(tokens.toArray(new Token[0]));
    assertTokenStreamContents(stream, expected);
  }
}

View File

@ -0,0 +1,46 @@
package org.apache.solr.spelling.suggest;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.common.params.SpellingParams;
import org.junit.BeforeClass;
/**
 * End-to-end test of the WFST phrase suggester configured in
 * solrconfig-phrasesuggest.xml against the phrasesuggest.txt dictionary.
 */
public class TestPhraseSuggestions extends SolrTestCaseJ4 {
  static final String URI = "/suggest_wfst";

  // common XPath prefix for all suggestion assertions below;
  // concatenations with it are folded at compile time, so the
  // runtime assertion strings are unchanged
  static final String SUGGESTIONS = "//lst[@name='spellcheck']/lst[@name='suggestions']";

  @BeforeClass
  public static void beforeClass() throws Exception {
    initCore("solrconfig-phrasesuggest.xml","schema-phrasesuggest.xml");
    // build the suggester index up front so the queries below can hit it
    assertQ(req("qt", URI, "q", "", SpellingParams.SPELLCHECK_BUILD, "true"));
  }

  public void test() {
    // "the f" matches three phrases; highest-weighted first (onlyMorePopular=true)
    assertQ(req("qt", URI, "q", "the f", SpellingParams.SPELLCHECK_COUNT, "4"),
        SUGGESTIONS + "/lst[@name='the f']/int[@name='numFound'][.='3']",
        SUGGESTIONS + "/lst[@name='the f']/arr[@name='suggestion']/str[1][.='the final phrase']",
        SUGGESTIONS + "/lst[@name='the f']/arr[@name='suggestion']/str[2][.='the fifth phrase']",
        SUGGESTIONS + "/lst[@name='the f']/arr[@name='suggestion']/str[3][.='the first phrase']"
    );
    // query syntax (+) and case are normalized away before lookup
    assertQ(req("qt", URI, "q", "Testing +12", SpellingParams.SPELLCHECK_COUNT, "4"),
        SUGGESTIONS + "/lst[@name='testing 12']/int[@name='numFound'][.='1']",
        SUGGESTIONS + "/lst[@name='testing 12']/arr[@name='suggestion']/str[1][.='testing 1234']"
    );
  }
}