mirror of https://github.com/apache/lucene.git
SOLR-3906: add factory for AnalyzingSuggester
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1400565 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
1e47b31e1d
commit
49a4f8b07c
|
@ -79,6 +79,10 @@ Bug Fixes
|
||||||
* LUCENE-4479: Highlighter works correctly for fields with term vector
|
* LUCENE-4479: Highlighter works correctly for fields with term vector
|
||||||
positions, but no offsets. (Alan Woodward)
|
positions, but no offsets. (Alan Woodward)
|
||||||
|
|
||||||
|
* SOLR-3906: JapaneseReadingFormFilter in romaji mode will return
|
||||||
|
romaji even for out-of-vocabulary kana cases (e.g. half-width forms).
|
||||||
|
(Robert Muir)
|
||||||
|
|
||||||
Optimizations
|
Optimizations
|
||||||
|
|
||||||
* LUCENE-4443: BlockPostingsFormat no longer writes unnecessary offsets
|
* LUCENE-4443: BlockPostingsFormat no longer writes unnecessary offsets
|
||||||
|
|
|
@ -35,6 +35,7 @@ public final class JapaneseReadingFormFilter extends TokenFilter {
|
||||||
private final CharTermAttribute termAttr = addAttribute(CharTermAttribute.class);
|
private final CharTermAttribute termAttr = addAttribute(CharTermAttribute.class);
|
||||||
private final ReadingAttribute readingAttr = addAttribute(ReadingAttribute.class);
|
private final ReadingAttribute readingAttr = addAttribute(ReadingAttribute.class);
|
||||||
|
|
||||||
|
private StringBuilder buffer = new StringBuilder();
|
||||||
private boolean useRomaji;
|
private boolean useRomaji;
|
||||||
|
|
||||||
public JapaneseReadingFormFilter(TokenStream input, boolean useRomaji) {
|
public JapaneseReadingFormFilter(TokenStream input, boolean useRomaji) {
|
||||||
|
@ -50,10 +51,19 @@ public final class JapaneseReadingFormFilter extends TokenFilter {
|
||||||
public boolean incrementToken() throws IOException {
|
public boolean incrementToken() throws IOException {
|
||||||
if (input.incrementToken()) {
|
if (input.incrementToken()) {
|
||||||
String reading = readingAttr.getReading();
|
String reading = readingAttr.getReading();
|
||||||
if (reading != null) {
|
|
||||||
if (useRomaji) {
|
if (useRomaji) {
|
||||||
ToStringUtil.getRomanization(termAttr.setEmpty(), reading);
|
if (reading == null) {
|
||||||
|
// if its an OOV term, just try the term text
|
||||||
|
buffer.setLength(0);
|
||||||
|
ToStringUtil.getRomanization(buffer, termAttr);
|
||||||
|
termAttr.setEmpty().append(buffer);
|
||||||
} else {
|
} else {
|
||||||
|
ToStringUtil.getRomanization(termAttr.setEmpty(), reading);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// just replace the term text with the reading, if it exists
|
||||||
|
if (reading != null) {
|
||||||
termAttr.setEmpty().append(reading);
|
termAttr.setEmpty().append(reading);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -19,7 +19,9 @@ package org.apache.lucene.analysis.ja;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.cjk.CJKWidthFilter;
|
||||||
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
import org.apache.lucene.analysis.core.KeywordTokenizer;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
@ -53,12 +55,40 @@ public class TestJapaneseReadingFormFilter extends BaseTokenStreamTestCase {
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testKatakanaReadingsHalfWidth() throws IOException {
|
||||||
|
Analyzer a = new Analyzer() {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||||
|
Tokenizer tokenizer = new JapaneseTokenizer(reader, null, true, JapaneseTokenizer.Mode.SEARCH);
|
||||||
|
TokenStream stream = new CJKWidthFilter(tokenizer);
|
||||||
|
return new TokenStreamComponents(tokenizer, new JapaneseReadingFormFilter(stream, false));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
assertAnalyzesTo(a, "今夜はロバート先生と話した",
|
||||||
|
new String[] { "コンヤ", "ハ", "ロバート", "センセイ", "ト", "ハナシ", "タ" }
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
public void testRomajiReadings() throws IOException {
|
public void testRomajiReadings() throws IOException {
|
||||||
assertAnalyzesTo(romajiAnalyzer, "今夜はロバート先生と話した",
|
assertAnalyzesTo(romajiAnalyzer, "今夜はロバート先生と話した",
|
||||||
new String[] { "kon'ya", "ha", "robato", "sensei", "to", "hanashi", "ta" }
|
new String[] { "kon'ya", "ha", "robato", "sensei", "to", "hanashi", "ta" }
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testRomajiReadingsHalfWidth() throws IOException {
|
||||||
|
Analyzer a = new Analyzer() {
|
||||||
|
@Override
|
||||||
|
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||||
|
Tokenizer tokenizer = new JapaneseTokenizer(reader, null, true, JapaneseTokenizer.Mode.SEARCH);
|
||||||
|
TokenStream stream = new CJKWidthFilter(tokenizer);
|
||||||
|
return new TokenStreamComponents(tokenizer, new JapaneseReadingFormFilter(stream, true));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
assertAnalyzesTo(a, "今夜はロバート先生と話した",
|
||||||
|
new String[] { "kon'ya", "ha", "robato", "sensei", "to", "hanashi", "ta" }
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
public void testRandomData() throws IOException {
|
public void testRandomData() throws IOException {
|
||||||
Random random = random();
|
Random random = random();
|
||||||
checkRandomData(random, katakanaAnalyzer, 1000*RANDOM_MULTIPLIER);
|
checkRandomData(random, katakanaAnalyzer, 1000*RANDOM_MULTIPLIER);
|
||||||
|
|
|
@ -127,7 +127,7 @@ public class AnalyzingSuggester extends Lookup {
|
||||||
private final boolean exactFirst;
|
private final boolean exactFirst;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* True if separator between tokens should be preservered.
|
* True if separator between tokens should be preserved.
|
||||||
*/
|
*/
|
||||||
private final boolean preserveSep;
|
private final boolean preserveSep;
|
||||||
|
|
||||||
|
|
|
@ -45,6 +45,10 @@ New Features
|
||||||
* SOLR-3929: Support configuring IndexWriter max thread count in solrconfig.
|
* SOLR-3929: Support configuring IndexWriter max thread count in solrconfig.
|
||||||
(phunt via Mark Miller)
|
(phunt via Mark Miller)
|
||||||
|
|
||||||
|
* SOLR-3906: Add support for AnalyzingSuggester (LUCENE-3842), where the
|
||||||
|
underlying analyzed form used for suggestions is separate from the returned
|
||||||
|
text. (Robert Muir)
|
||||||
|
|
||||||
Optimizations
|
Optimizations
|
||||||
----------------------
|
----------------------
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,118 @@
|
||||||
|
package org.apache.solr.spelling.suggest.fst;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
|
import org.apache.lucene.search.suggest.Lookup;
|
||||||
|
import org.apache.lucene.search.suggest.analyzing.AnalyzingSuggester;
|
||||||
|
import org.apache.solr.common.util.NamedList;
|
||||||
|
import org.apache.solr.core.SolrCore;
|
||||||
|
import org.apache.solr.schema.FieldType;
|
||||||
|
import org.apache.solr.spelling.suggest.LookupFactory;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Factory for {@link AnalyzingSuggester}
|
||||||
|
* @lucene.experimental
|
||||||
|
*/
|
||||||
|
public class AnalyzingLookupFactory extends LookupFactory {
|
||||||
|
/**
|
||||||
|
* If <code>true</code>, exact suggestions are returned first, even if they are prefixes
|
||||||
|
* of other strings in the automaton (possibly with larger weights).
|
||||||
|
*/
|
||||||
|
public static final String EXACT_MATCH_FIRST = "exactMatchFirst";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* If <code>true</code>, then a separator between tokens is preserved. This means that
|
||||||
|
* suggestions are sensitive to tokenization (e.g. baseball is different from base ball).
|
||||||
|
*/
|
||||||
|
public static final String PRESERVE_SEP = "preserveSep";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* When multiple suggestions collide to the same analyzed form, this is the limit of
|
||||||
|
* how many unique surface forms we keep.
|
||||||
|
*/
|
||||||
|
public static final String MAX_SURFACE_FORMS = "maxSurfaceFormsPerAnalyzedForm";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* When building the FST ("index-time"), we add each path through the tokenstream graph
|
||||||
|
* as an individual entry. This places an upper-bound on how many expansions will be added
|
||||||
|
* for a single suggestion.
|
||||||
|
*/
|
||||||
|
public static final String MAX_EXPANSIONS = "maxGraphExpansions";
|
||||||
|
|
||||||
|
// confusingly: the queryAnalyzerFieldType parameter is something totally different, this
|
||||||
|
// is solr's "analysis" of the queries before they even reach the suggester (really makes
|
||||||
|
// little sense for suggest at all, only for spellcheck). So we pick different names.
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The analyzer used at "query-time" and "build-time" to analyze suggestions.
|
||||||
|
*/
|
||||||
|
public static final String QUERY_ANALYZER = "suggestAnalyzerFieldType";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* File name for the automaton.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
private static final String FILENAME = "wfsta.bin";
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Lookup create(NamedList params, SolrCore core) {
|
||||||
|
// mandatory parameter
|
||||||
|
Object fieldTypeName = params.get(QUERY_ANALYZER);
|
||||||
|
if (fieldTypeName == null) {
|
||||||
|
throw new IllegalArgumentException("Error in configuration: " + QUERY_ANALYZER + " parameter is mandatory");
|
||||||
|
}
|
||||||
|
FieldType ft = core.getSchema().getFieldTypeByName(fieldTypeName.toString());
|
||||||
|
Analyzer indexAnalyzer = ft.getAnalyzer();
|
||||||
|
Analyzer queryAnalyzer = ft.getQueryAnalyzer();
|
||||||
|
|
||||||
|
// optional parameters
|
||||||
|
|
||||||
|
boolean exactMatchFirst = params.get(EXACT_MATCH_FIRST) != null
|
||||||
|
? Boolean.valueOf(params.get(EXACT_MATCH_FIRST).toString())
|
||||||
|
: true;
|
||||||
|
|
||||||
|
boolean preserveSep = params.get(PRESERVE_SEP) != null
|
||||||
|
? Boolean.valueOf(params.get(PRESERVE_SEP).toString())
|
||||||
|
: true;
|
||||||
|
|
||||||
|
int flags = 0;
|
||||||
|
if (exactMatchFirst) {
|
||||||
|
flags |= AnalyzingSuggester.EXACT_FIRST;
|
||||||
|
}
|
||||||
|
if (preserveSep) {
|
||||||
|
flags |= AnalyzingSuggester.PRESERVE_SEP;
|
||||||
|
}
|
||||||
|
|
||||||
|
int maxSurfaceFormsPerAnalyzedForm = params.get(MAX_SURFACE_FORMS) != null
|
||||||
|
? Integer.parseInt(params.get(MAX_SURFACE_FORMS).toString())
|
||||||
|
: 256;
|
||||||
|
|
||||||
|
int maxGraphExpansions = params.get(MAX_EXPANSIONS) != null
|
||||||
|
? Integer.parseInt(params.get(MAX_EXPANSIONS).toString())
|
||||||
|
: -1;
|
||||||
|
|
||||||
|
|
||||||
|
return new AnalyzingSuggester(indexAnalyzer, queryAnalyzer, flags, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String storeFileName() {
|
||||||
|
return FILENAME;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,5 @@
|
||||||
|
# simple auto-suggest phrase dictionary for testing
|
||||||
|
# note this uses tabs as separator!
|
||||||
|
北海道 1.0
|
||||||
|
今夜 3.0
|
||||||
|
話した 6.0
|
|
@ -40,6 +40,14 @@
|
||||||
<filter class="solr.TrimFilterFactory"/>
|
<filter class="solr.TrimFilterFactory"/>
|
||||||
</analyzer>
|
</analyzer>
|
||||||
</fieldtype>
|
</fieldtype>
|
||||||
|
|
||||||
|
<fieldtype name="ja_suggest" class="solr.TextField">
|
||||||
|
<analyzer>
|
||||||
|
<tokenizer class="solr.JapaneseTokenizerFactory" mode="normal"/>
|
||||||
|
<filter class="solr.CJKWidthFilterFactory"/>
|
||||||
|
<filter class="solr.JapaneseReadingFormFilterFactory" useRomaji="true"/>
|
||||||
|
</analyzer>
|
||||||
|
</fieldtype>
|
||||||
</types>
|
</types>
|
||||||
|
|
||||||
<fields>
|
<fields>
|
||||||
|
|
|
@ -43,6 +43,27 @@
|
||||||
<str name="queryAnalyzerFieldType">phrase_suggest</str>
|
<str name="queryAnalyzerFieldType">phrase_suggest</str>
|
||||||
</searchComponent>
|
</searchComponent>
|
||||||
|
|
||||||
|
<!-- AnalyzingLookup suggest component -->
|
||||||
|
<searchComponent class="solr.SpellCheckComponent" name="suggest_analyzing">
|
||||||
|
<lst name="spellchecker">
|
||||||
|
<str name="name">suggest_analyzing</str>
|
||||||
|
<str name="classname">org.apache.solr.spelling.suggest.Suggester</str>
|
||||||
|
<str name="lookupImpl">org.apache.solr.spelling.suggest.fst.AnalyzingLookupFactory</str>
|
||||||
|
<str name="storeDir">suggest_analyzing</str>
|
||||||
|
<str name="buildOnCommit">false</str>
|
||||||
|
|
||||||
|
<!-- Suggester properties -->
|
||||||
|
<bool name="exactMatchFirst">true</bool>
|
||||||
|
<str name="suggestAnalyzerFieldType">ja_suggest</str>
|
||||||
|
<bool name="preserveSep">false</bool>
|
||||||
|
|
||||||
|
<str name="sourceLocation">jasuggest.txt</str>
|
||||||
|
</lst>
|
||||||
|
|
||||||
|
<!-- specify a fieldtype using keywordtokenizer + lowercase + cleanup -->
|
||||||
|
<str name="queryAnalyzerFieldType">phrase_suggest</str>
|
||||||
|
</searchComponent>
|
||||||
|
|
||||||
<!-- is this thing just configured globally or wtf is going on here?! -->
|
<!-- is this thing just configured globally or wtf is going on here?! -->
|
||||||
<queryConverter name="queryConverter" class="org.apache.solr.spelling.SuggestQueryConverter"/>
|
<queryConverter name="queryConverter" class="org.apache.solr.spelling.SuggestQueryConverter"/>
|
||||||
|
|
||||||
|
@ -60,4 +81,18 @@
|
||||||
</arr>
|
</arr>
|
||||||
</requestHandler>
|
</requestHandler>
|
||||||
|
|
||||||
|
<!-- analyzing (finite state automaton based) -->
|
||||||
|
<requestHandler class="org.apache.solr.handler.component.SearchHandler" name="/suggest_analyzing">
|
||||||
|
<lst name="defaults">
|
||||||
|
<str name="spellcheck">true</str>
|
||||||
|
<str name="spellcheck.dictionary">suggest_analyzing</str>
|
||||||
|
<str name="spellcheck.collate">false</str>
|
||||||
|
<!-- NOTE: if this is false, results are alpha-ordered, not by weight! -->
|
||||||
|
<str name="spellcheck.onlyMorePopular">true</str>
|
||||||
|
</lst>
|
||||||
|
<arr name="components">
|
||||||
|
<str>suggest_analyzing</str>
|
||||||
|
</arr>
|
||||||
|
</requestHandler>
|
||||||
|
|
||||||
</config>
|
</config>
|
||||||
|
|
|
@ -0,0 +1,59 @@
|
||||||
|
package org.apache.solr.spelling.suggest;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import org.apache.solr.SolrTestCaseJ4;
|
||||||
|
import org.apache.solr.common.params.SpellingParams;
|
||||||
|
import org.junit.BeforeClass;
|
||||||
|
|
||||||
|
public class TestAnalyzedSuggestions extends SolrTestCaseJ4 {
|
||||||
|
static final String URI = "/suggest_analyzing";
|
||||||
|
|
||||||
|
@BeforeClass
|
||||||
|
public static void beforeClass() throws Exception {
|
||||||
|
initCore("solrconfig-phrasesuggest.xml","schema-phrasesuggest.xml");
|
||||||
|
assertQ(req("qt", URI, "q", "", SpellingParams.SPELLCHECK_BUILD, "true"));
|
||||||
|
}
|
||||||
|
|
||||||
|
public void test() {
|
||||||
|
assertQ(req("qt", URI, "q", "hokk", SpellingParams.SPELLCHECK_COUNT, "1"),
|
||||||
|
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='hokk']/int[@name='numFound'][.='1']",
|
||||||
|
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='hokk']/arr[@name='suggestion']/str[1][.='北海道']"
|
||||||
|
);
|
||||||
|
assertQ(req("qt", URI, "q", "ほっk", SpellingParams.SPELLCHECK_COUNT, "1"),
|
||||||
|
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ほっk']/int[@name='numFound'][.='1']",
|
||||||
|
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ほっk']/arr[@name='suggestion']/str[1][.='北海道']"
|
||||||
|
);
|
||||||
|
assertQ(req("qt", URI, "q", "ホッk", SpellingParams.SPELLCHECK_COUNT, "1"),
|
||||||
|
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ホッk']/int[@name='numFound'][.='1']",
|
||||||
|
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ホッk']/arr[@name='suggestion']/str[1][.='北海道']"
|
||||||
|
);
|
||||||
|
assertQ(req("qt", URI, "q", "ホッk", SpellingParams.SPELLCHECK_COUNT, "1"),
|
||||||
|
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ホッk']/int[@name='numFound'][.='1']",
|
||||||
|
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ホッk']/arr[@name='suggestion']/str[1][.='北海道']"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void testMultiple() {
|
||||||
|
assertQ(req("qt", URI, "q", "h", SpellingParams.SPELLCHECK_COUNT, "2"),
|
||||||
|
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='h']/int[@name='numFound'][.='2']",
|
||||||
|
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='h']/arr[@name='suggestion']/str[1][.='話した']",
|
||||||
|
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='h']/arr[@name='suggestion']/str[2][.='北海道']"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue