SOLR-3906: add factory for AnalyzingSuggester

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1400565 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2012-10-21 03:31:29 +00:00
parent 1e47b31e1d
commit 49a4f8b07c
10 changed files with 277 additions and 4 deletions

View File: lucene/CHANGES.txt

@@ -79,6 +79,10 @@ Bug Fixes
* LUCENE-4479: Highlighter works correctly for fields with term vector
positions, but no offsets. (Alan Woodward)
* SOLR-3906: JapaneseReadingFormFilter in romaji mode will return
romaji even for out-of-vocabulary kana cases (e.g. half-width forms).
(Robert Muir)
Optimizations
* LUCENE-4443: BlockPostingsFormat no longer writes unnecessary offsets

View File: JapaneseReadingFormFilter.java

@@ -35,6 +35,7 @@ public final class JapaneseReadingFormFilter extends TokenFilter {
private final CharTermAttribute termAttr = addAttribute(CharTermAttribute.class);
private final ReadingAttribute readingAttr = addAttribute(ReadingAttribute.class);
private StringBuilder buffer = new StringBuilder();
private boolean useRomaji;
public JapaneseReadingFormFilter(TokenStream input, boolean useRomaji) {
@@ -50,10 +51,19 @@ public final class JapaneseReadingFormFilter extends TokenFilter {
   public boolean incrementToken() throws IOException {
     if (input.incrementToken()) {
       String reading = readingAttr.getReading();
-      if (reading != null) {
-        if (useRomaji) {
-          ToStringUtil.getRomanization(termAttr.setEmpty(), reading);
+      if (useRomaji) {
+        if (reading == null) {
+          // if it's an OOV term, just try the term text
+          buffer.setLength(0);
+          ToStringUtil.getRomanization(buffer, termAttr);
+          termAttr.setEmpty().append(buffer);
+        } else {
+          ToStringUtil.getRomanization(termAttr.setEmpty(), reading);
+        }
       } else {
+        // just replace the term text with the reading, if it exists
+        if (reading != null) {
           termAttr.setEmpty().append(reading);
+        }
       }
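To make the new OOV branch concrete, here is a minimal standalone sketch of the fallback, using the same ToStringUtil call as the filter above. The half-width example and the macron-romaji output are illustrative assumptions, not taken from this commit:

import org.apache.lucene.analysis.ja.util.ToStringUtil;

public class OovRomajiSketch {
  public static void main(String[] args) throws java.io.IOException {
    // an OOV token: the tokenizer assigned no reading; CJKWidthFilter has
    // already folded half-width "ﾛﾊﾞｰﾄ" to full-width "ロバート"
    CharSequence termText = "ロバート";
    String reading = null; // what readingAttr.getReading() returns for OOV terms

    StringBuilder buffer = new StringBuilder();
    if (reading == null) {
      // the new behavior: romanize the surface form itself
      buffer.setLength(0);
      ToStringUtil.getRomanization(buffer, termText);
    } else {
      // the old (and still default) behavior: romanize the reading
      ToStringUtil.getRomanization(buffer, reading);
    }
    System.out.println(buffer); // expected: robāto
  }
}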

View File: TestJapaneseReadingFormFilter.java

@@ -19,7 +19,9 @@ package org.apache.lucene.analysis.ja;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cjk.CJKWidthFilter;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import java.io.IOException;
@@ -52,12 +54,40 @@ public class TestJapaneseReadingFormFilter extends BaseTokenStreamTestCase {
new String[] { "コンヤ", "ハ", "ロバート", "センセイ", "ト", "ハナシ", "タ" }
);
}
public void testKatakanaReadingsHalfWidth() throws IOException {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new JapaneseTokenizer(reader, null, true, JapaneseTokenizer.Mode.SEARCH);
TokenStream stream = new CJKWidthFilter(tokenizer);
return new TokenStreamComponents(tokenizer, new JapaneseReadingFormFilter(stream, false));
}
};
assertAnalyzesTo(a, "今夜はﾛﾊﾞｰﾄ先生と話した",
new String[] { "コンヤ", "ハ", "ロバート", "センセイ", "ト", "ハナシ", "タ" }
);
}
public void testRomajiReadings() throws IOException {
assertAnalyzesTo(romajiAnalyzer, "今夜はロバート先生と話した",
new String[] { "kon'ya", "ha", "robāto", "sensei", "to", "hanashi", "ta" }
);
}
public void testRomajiReadingsHalfWidth() throws IOException {
Analyzer a = new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new JapaneseTokenizer(reader, null, true, JapaneseTokenizer.Mode.SEARCH);
TokenStream stream = new CJKWidthFilter(tokenizer);
return new TokenStreamComponents(tokenizer, new JapaneseReadingFormFilter(stream, true));
}
};
assertAnalyzesTo(a, "今夜はﾛﾊﾞｰﾄ先生と話した",
new String[] { "kon'ya", "ha", "robāto", "sensei", "to", "hanashi", "ta" }
);
}
public void testRandomData() throws IOException {
Random random = random();

View File: AnalyzingSuggester.java

@@ -127,7 +127,7 @@ public class AnalyzingSuggester extends Lookup {
   private final boolean exactFirst;
   /**
-   * True if separator between tokens should be preservered.
+   * True if separator between tokens should be preserved.
    */
   private final boolean preserveSep;

View File: solr/CHANGES.txt

@@ -45,6 +45,10 @@ New Features
* SOLR-3929: Support configuring IndexWriter max thread count in solrconfig.
(phunt via Mark Miller)
* SOLR-3906: Add support for AnalyzingSuggester (LUCENE-3842), where the
underlying analyzed form used for suggestions is separate from the returned
text. (Robert Muir)
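(As an illustration of the analyzed/surface split, not part of the CHANGES entry: with a reading-form analyzer like the ja_suggest field type added below, 北海道 is analyzed to romaji, so a user typing "hokk" can be suggested the original surface form 北海道.)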
Optimizations
----------------------

View File: AnalyzingLookupFactory.java (new file)

@@ -0,0 +1,118 @@
package org.apache.solr.spelling.suggest.fst;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.analyzing.AnalyzingSuggester;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.core.SolrCore;
import org.apache.solr.schema.FieldType;
import org.apache.solr.spelling.suggest.LookupFactory;
/**
* Factory for {@link AnalyzingSuggester}
* @lucene.experimental
*/
public class AnalyzingLookupFactory extends LookupFactory {
/**
* If <code>true</code>, exact suggestions are returned first, even if they are prefixes
* of other strings in the automaton (possibly with larger weights).
*/
public static final String EXACT_MATCH_FIRST = "exactMatchFirst";
/**
* If <code>true</code>, then a separator between tokens is preserved. This means that
* suggestions are sensitive to tokenization (e.g. baseball is different from base ball).
*/
public static final String PRESERVE_SEP = "preserveSep";
/**
* When multiple suggestions collide to the same analyzed form, this is the limit of
* how many unique surface forms we keep.
*/
public static final String MAX_SURFACE_FORMS = "maxSurfaceFormsPerAnalyzedForm";
/**
* When building the FST ("index-time"), we add each path through the tokenstream graph
* as an individual entry. This places an upper-bound on how many expansions will be added
* for a single suggestion.
*/
public static final String MAX_EXPANSIONS = "maxGraphExpansions";
// Confusingly, the queryAnalyzerFieldType parameter is something totally different: it is
// Solr's "analysis" of queries before they even reach the suggester (which makes little
// sense for suggest at all, only for spellcheck). So we pick a different name.
/**
* The analyzer used at "query-time" and "build-time" to analyze suggestions.
*/
public static final String QUERY_ANALYZER = "suggestAnalyzerFieldType";
/**
 * File name for the automaton.
 */
private static final String FILENAME = "wfsta.bin";
@Override
public Lookup create(NamedList params, SolrCore core) {
// mandatory parameter
Object fieldTypeName = params.get(QUERY_ANALYZER);
if (fieldTypeName == null) {
throw new IllegalArgumentException("Error in configuration: " + QUERY_ANALYZER + " parameter is mandatory");
}
FieldType ft = core.getSchema().getFieldTypeByName(fieldTypeName.toString());
Analyzer indexAnalyzer = ft.getAnalyzer();
Analyzer queryAnalyzer = ft.getQueryAnalyzer();
// optional parameters
boolean exactMatchFirst = params.get(EXACT_MATCH_FIRST) != null
? Boolean.valueOf(params.get(EXACT_MATCH_FIRST).toString())
: true;
boolean preserveSep = params.get(PRESERVE_SEP) != null
? Boolean.valueOf(params.get(PRESERVE_SEP).toString())
: true;
int flags = 0;
if (exactMatchFirst) {
flags |= AnalyzingSuggester.EXACT_FIRST;
}
if (preserveSep) {
flags |= AnalyzingSuggester.PRESERVE_SEP;
}
int maxSurfaceFormsPerAnalyzedForm = params.get(MAX_SURFACE_FORMS) != null
? Integer.parseInt(params.get(MAX_SURFACE_FORMS).toString())
: 256;
int maxGraphExpansions = params.get(MAX_EXPANSIONS) != null
? Integer.parseInt(params.get(MAX_EXPANSIONS).toString())
: -1;
return new AnalyzingSuggester(indexAnalyzer, queryAnalyzer, flags, maxSurfaceFormsPerAnalyzedForm, maxGraphExpansions);
}
@Override
public String storeFileName() {
return FILENAME;
}
}
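For context, here is a minimal standalone sketch of what this factory wires up, written against the trunk suggest API of this era. TermFreq, TermFreqArrayIterator, and the lookup(CharSequence, boolean, int) signature are assumptions about that API rather than part of this commit; the analyzer mirrors the ja_suggest chain and the jasuggest.txt entries below:

import java.io.Reader;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.cjk.CJKWidthFilter;
import org.apache.lucene.analysis.ja.JapaneseReadingFormFilter;
import org.apache.lucene.analysis.ja.JapaneseTokenizer;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.TermFreq;
import org.apache.lucene.search.suggest.TermFreqArrayIterator;
import org.apache.lucene.search.suggest.analyzing.AnalyzingSuggester;

public class AnalyzingSuggesterSketch {
  public static void main(String[] args) throws Exception {
    // roughly the ja_suggest chain from the schema below
    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tok = new JapaneseTokenizer(reader, null, true, JapaneseTokenizer.Mode.NORMAL);
        TokenStream stream = new CJKWidthFilter(tok);
        return new TokenStreamComponents(tok, new JapaneseReadingFormFilter(stream, true));
      }
    };
    // same flags/defaults the factory computes: exactMatchFirst=true, preserveSep=false
    int flags = AnalyzingSuggester.EXACT_FIRST;
    AnalyzingSuggester suggester =
        new AnalyzingSuggester(analyzer, analyzer, flags, 256, -1);
    // the jasuggest.txt entries below, fed directly instead of via sourceLocation
    suggester.build(new TermFreqArrayIterator(new TermFreq[] {
        new TermFreq("北海道", 1),
        new TermFreq("今夜", 3),
        new TermFreq("話した", 6),
    }));
    List<Lookup.LookupResult> results = suggester.lookup("hokk", true, 1);
    System.out.println(results.get(0).key); // 北海道: "hokk" matched the analyzed (romaji) form
  }
}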

View File: jasuggest.txt (new file)

@@ -0,0 +1,5 @@
# simple auto-suggest phrase dictionary for testing
# note this uses tabs as separator!
北海道 1.0
今夜 3.0
話した 6.0

View File: schema-phrasesuggest.xml

@@ -40,6 +40,14 @@
<filter class="solr.TrimFilterFactory"/>
</analyzer>
</fieldtype>
<fieldtype name="ja_suggest" class="solr.TextField">
<analyzer>
<tokenizer class="solr.JapaneseTokenizerFactory" mode="normal"/>
<filter class="solr.CJKWidthFilterFactory"/>
<filter class="solr.JapaneseReadingFormFilterFactory" useRomaji="true"/>
</analyzer>
</fieldtype>
</types>
<fields>

View File: solrconfig-phrasesuggest.xml

@@ -43,6 +43,27 @@
<str name="queryAnalyzerFieldType">phrase_suggest</str>
</searchComponent>
<!-- AnalyzingLookup suggest component -->
<searchComponent class="solr.SpellCheckComponent" name="suggest_analyzing">
<lst name="spellchecker">
<str name="name">suggest_analyzing</str>
<str name="classname">org.apache.solr.spelling.suggest.Suggester</str>
<str name="lookupImpl">org.apache.solr.spelling.suggest.fst.AnalyzingLookupFactory</str>
<str name="storeDir">suggest_analyzing</str>
<str name="buildOnCommit">false</str>
<!-- Suggester properties -->
<bool name="exactMatchFirst">true</bool>
<str name="suggestAnalyzerFieldType">ja_suggest</str>
<bool name="preserveSep">false</bool>
<str name="sourceLocation">jasuggest.txt</str>
</lst>
<!-- specify a fieldtype using keywordtokenizer + lowercase + cleanup -->
<str name="queryAnalyzerFieldType">phrase_suggest</str>
</searchComponent>
<!-- is this thing just configured globally, or what exactly is going on here?! -->
<queryConverter name="queryConverter" class="org.apache.solr.spelling.SuggestQueryConverter"/>
@@ -60,4 +81,18 @@
</arr>
</requestHandler>
<!-- analyzing (finite state automaton based) -->
<requestHandler class="org.apache.solr.handler.component.SearchHandler" name="/suggest_analyzing">
<lst name="defaults">
<str name="spellcheck">true</str>
<str name="spellcheck.dictionary">suggest_analyzing</str>
<str name="spellcheck.collate">false</str>
<!-- NOTE: if this is false, results are alpha-ordered, not by weight! -->
<str name="spellcheck.onlyMorePopular">true</str>
</lst>
<arr name="components">
<str>suggest_analyzing</str>
</arr>
</requestHandler>
</config>
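With the handler above in place, a request such as

    /suggest_analyzing?q=hokk&spellcheck.count=1

(a hypothetical URL; the parameters come from the defaults above) returns 北海道 through the standard spellcheck response format, which is exactly what TestAnalyzedSuggestions asserts below.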

View File: TestAnalyzedSuggestions.java (new file)

@@ -0,0 +1,59 @@
package org.apache.solr.spelling.suggest;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.common.params.SpellingParams;
import org.junit.BeforeClass;
public class TestAnalyzedSuggestions extends SolrTestCaseJ4 {
static final String URI = "/suggest_analyzing";
@BeforeClass
public static void beforeClass() throws Exception {
initCore("solrconfig-phrasesuggest.xml","schema-phrasesuggest.xml");
assertQ(req("qt", URI, "q", "", SpellingParams.SPELLCHECK_BUILD, "true"));
}
public void test() {
assertQ(req("qt", URI, "q", "hokk", SpellingParams.SPELLCHECK_COUNT, "1"),
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='hokk']/int[@name='numFound'][.='1']",
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='hokk']/arr[@name='suggestion']/str[1][.='北海道']"
);
assertQ(req("qt", URI, "q", "ほっk", SpellingParams.SPELLCHECK_COUNT, "1"),
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ほっk']/int[@name='numFound'][.='1']",
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ほっk']/arr[@name='suggestion']/str[1][.='北海道']"
);
assertQ(req("qt", URI, "q", "ホッk", SpellingParams.SPELLCHECK_COUNT, "1"),
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ホッk']/int[@name='numFound'][.='1']",
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ホッk']/arr[@name='suggestion']/str[1][.='北海道']"
);
assertQ(req("qt", URI, "q", "ﾎｯk", SpellingParams.SPELLCHECK_COUNT, "1"),
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ﾎｯk']/int[@name='numFound'][.='1']",
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='ﾎｯk']/arr[@name='suggestion']/str[1][.='北海道']"
);
}
public void testMultiple() {
assertQ(req("qt", URI, "q", "h", SpellingParams.SPELLCHECK_COUNT, "2"),
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='h']/int[@name='numFound'][.='2']",
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='h']/arr[@name='suggestion']/str[1][.='話した']",
"//lst[@name='spellcheck']/lst[@name='suggestions']/lst[@name='h']/arr[@name='suggestion']/str[2][.='北海道']"
);
}
}