SOLR-3143: add SuggestQueryConverter for autosuggesters

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1291322 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2012-02-20 15:48:38 +00:00
parent 9fd084ea1c
commit 5adeacf2aa
8 changed files with 320 additions and 24 deletions

View File

@ -506,6 +506,9 @@ New Features
* LUCENE-3714: Add WFSTLookupFactory, a suggester that uses a weighted FST
for more fine-grained suggestions. (Mike McCandless, Dawid Weiss, Robert Muir)
* SOLR-3143: Add SuggestQueryConverter, a QueryConverter intended for
auto-suggesters. (Robert Muir)
Optimizations
----------------------
* SOLR-1931: Speedup for LukeRequestHandler and admin/schema browser. New parameter

View File

@ -18,6 +18,7 @@
package org.apache.solr.spelling;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collection;
@ -100,39 +101,42 @@ public class SpellingQueryConverter extends QueryConverter {
Collection<Token> result = new ArrayList<Token>();
//TODO: Extract the words using a simple regex, but not query stuff, and then analyze them to produce the token stream
Matcher matcher = QUERY_REGEX.matcher(original);
TokenStream stream;
while (matcher.find()) {
String word = matcher.group(0);
if (word.equals("AND") == false && word.equals("OR") == false) {
try {
stream = analyzer.tokenStream("", new StringReader(word));
// TODO: support custom attributes
CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
FlagsAttribute flagsAtt = stream.addAttribute(FlagsAttribute.class);
TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class);
PayloadAttribute payloadAtt = stream.addAttribute(PayloadAttribute.class);
PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class);
OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
stream.reset();
while (stream.incrementToken()) {
Token token = new Token();
token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
token.setStartOffset(matcher.start() + offsetAtt.startOffset());
token.setEndOffset(matcher.start() + offsetAtt.endOffset());
token.setFlags(flagsAtt.getFlags());
token.setType(typeAtt.type());
token.setPayload(payloadAtt.getPayload());
token.setPositionIncrement(posIncAtt.getPositionIncrement());
result.add(token);
}
stream.end();
stream.close();
analyze(result, new StringReader(word), matcher.start());
} catch (IOException e) {
// TODO: shouldn't we log something?
}
}
}
return result;
}
/**
 * Analyzes {@code text} with the configured {@link #analyzer} and converts each
 * produced token into a {@link Token}, appending it to {@code result}.
 *
 * @param result collection that receives the converted tokens
 * @param text   the text to analyze; consumed (but not closed) via the analyzer's TokenStream
 * @param offset added to each token's start/end offsets so they point into the
 *               original query string rather than the analyzed fragment
 * @throws IOException if the underlying TokenStream fails
 */
protected void analyze(Collection<Token> result, Reader text, int offset) throws IOException {
  TokenStream stream = analyzer.tokenStream("", text);
  // TODO: support custom attributes
  CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
  FlagsAttribute flagsAtt = stream.addAttribute(FlagsAttribute.class);
  TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class);
  PayloadAttribute payloadAtt = stream.addAttribute(PayloadAttribute.class);
  PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class);
  OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
  // ensure the stream is closed even if reset()/incrementToken() throws;
  // the original code leaked the stream on exception
  try {
    stream.reset();
    while (stream.incrementToken()) {
      Token token = new Token();
      token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
      token.setStartOffset(offset + offsetAtt.startOffset());
      token.setEndOffset(offset + offsetAtt.endOffset());
      token.setFlags(flagsAtt.getFlags());
      token.setType(typeAtt.type());
      token.setPayload(payloadAtt.getPayload());
      token.setPositionIncrement(posIncAtt.getPositionIncrement());
      result.add(token);
    }
    stream.end();
  } finally {
    stream.close();
  }
}
}

View File

@ -0,0 +1,47 @@
package org.apache.solr.spelling;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import org.apache.lucene.analysis.Token;
/**
* Passes the entire query string to the configured analyzer as-is.
**/
/**
 * A {@link SpellingQueryConverter} variant for auto-suggesters: the raw query
 * string is handed to the configured analyzer as one undivided chunk, with no
 * query-syntax parsing or word splitting beforehand.
 */
public class SuggestQueryConverter extends SpellingQueryConverter {

  @Override
  public Collection<Token> convert(String original) {
    // q.alt with an empty query can hand us null; suggest nothing in that case
    if (original == null) {
      return Collections.emptyList();
    }

    Collection<Token> tokens = new ArrayList<Token>();
    try {
      analyze(tokens, new StringReader(original), 0);
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
    return tokens;
  }
}

View File

@ -0,0 +1,8 @@
# simple auto-suggest phrase dictionary for testing
# note this uses tabs as separator!
the first phrase 1.0
the second phrase 2.0
testing 1234 3.0
foo 5.0
the fifth phrase 2.0
the final phrase 4.0

View File

@ -0,0 +1,52 @@
<?xml version="1.0" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!-- Test schema file for phrase suggestions -->
<schema name="test" version="1.0">
<types>
<fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/>
<!-- basic text field -->
<fieldtype name="text" class="solr.TextField">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.LowerCaseFilterFactory"/>
</analyzer>
</fieldtype>
<fieldtype name="phrase_suggest" class="solr.TextField">
<analyzer>
<tokenizer class="solr.KeywordTokenizerFactory"/>
<filter class="solr.PatternReplaceFilterFactory"
pattern="([^\p{L}\p{M}\p{N}\p{Cs}]*[\p{L}\p{M}\p{N}\p{Cs}\_]+:)|([^\p{L}\p{M}\p{N}\p{Cs}])+"
replacement=" " replace="all"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.TrimFilterFactory"/>
</analyzer>
</fieldtype>
</types>
<fields>
<field name="id" type="int" indexed="true" stored="true" multiValued="false" required="false"/>
<field name="text" type="text" indexed="true" stored="false"/>
</fields>
<defaultSearchField>text</defaultSearchField>
<uniqueKey>id</uniqueKey>
</schema>

View File

@ -0,0 +1,63 @@
<?xml version="1.0" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!-- solrconfig.xml for a WFST phrase suggester -->
<config>
<luceneMatchVersion>${tests.luceneMatchVersion:LUCENE_CURRENT}</luceneMatchVersion>
<dataDir>${solr.data.dir:}</dataDir>
<directoryFactory name="DirectoryFactory" class="${solr.directoryFactory:solr.RAMDirectoryFactory}"/>
<requestHandler name="standard" class="solr.StandardRequestHandler"></requestHandler>
<!-- WFSTLookup suggest component -->
<searchComponent class="solr.SpellCheckComponent" name="suggest_wfst">
<lst name="spellchecker">
<str name="name">suggest_wfst</str>
<str name="classname">org.apache.solr.spelling.suggest.Suggester</str>
<str name="lookupImpl">org.apache.solr.spelling.suggest.fst.WFSTLookupFactory</str>
<str name="storeDir">suggest_wfst</str>
<str name="buildOnCommit">false</str>
<!-- Suggester properties -->
<bool name="exactMatchFirst">true</bool>
<str name="sourceLocation">phrasesuggest.txt</str>
</lst>
<!-- specify a fieldtype using keywordtokenizer + lowercase + cleanup -->
<str name="queryAnalyzerFieldType">phrase_suggest</str>
</searchComponent>
<!-- NOTE(review): the queryConverter appears to be registered globally for the
     core rather than per search component — confirm this is intended -->
<queryConverter name="queryConverter" class="org.apache.solr.spelling.SuggestQueryConverter"/>
<!-- wfst (finite state automaton based) -->
<requestHandler class="org.apache.solr.handler.component.SearchHandler" name="/suggest_wfst">
<lst name="defaults">
<str name="spellcheck">true</str>
<str name="spellcheck.dictionary">suggest_wfst</str>
<str name="spellcheck.collate">false</str>
<!-- NOTE: if this is false, results are alpha-ordered, not by weight! -->
<str name="spellcheck.onlyMorePopular">true</str>
</lst>
<arr name="components">
<str>suggest_wfst</str>
</arr>
</requestHandler>
</config>

View File

@ -0,0 +1,73 @@
package org.apache.solr.spelling;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.Reader;
import java.util.Collection;
import java.util.regex.Pattern;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CannedTokenStream;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.miscellaneous.TrimFilter;
import org.apache.lucene.analysis.pattern.PatternReplaceFilter;
/**
 * Tests {@link SuggestQueryConverter}: the entire query string should pass
 * through the configured analyzer as a single chunk.
 */
public class TestSuggestSpellingConverter extends BaseTokenStreamTestCase {
  SuggestQueryConverter converter = new SuggestQueryConverter();

  public void testSimple() throws Exception {
    // keyword tokenizer + lowercasing: output is just the lowercased input
    converter.setAnalyzer(new MockAnalyzer(random, MockTokenizer.KEYWORD, true));
    assertConvertsTo("This is a test", new String[] { "this is a test" });
  }

  public void testComplicated() throws Exception {
    // a pipeline that strips field names and other query syntax, lowercases,
    // collapses runs of whitespace, and trims the result
    Analyzer pipeline = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new KeywordTokenizer(reader);
        TokenStream stream = new PatternReplaceFilter(tokenizer,
            Pattern.compile("([^\\p{L}\\p{M}\\p{N}\\p{Cs}]*[\\p{L}\\p{M}\\p{N}\\p{Cs}\\_]+:)|([^\\p{L}\\p{M}\\p{N}\\p{Cs}])+"), " ", true);
        stream = new LowerCaseFilter(TEST_VERSION_CURRENT, stream);
        stream = new TrimFilter(stream, false);
        return new TokenStreamComponents(tokenizer, stream);
      }
    };
    converter.setAnalyzer(pipeline);
    assertConvertsTo("test1 +test2", new String[] { "test1 test2" });
    assertConvertsTo("test~", new String[] { "test" });
    assertConvertsTo("field:test", new String[] { "test" });
    assertConvertsTo("This is a test", new String[] { "this is a test" });
    assertConvertsTo(" This is a test", new String[] { "this is a test" });
    assertConvertsTo("Foo (field:bar) text_hi:हिन्दी ", new String[] { "foo bar हिन्दी" });
  }

  /** Runs {@code text} through the converter and checks the resulting token terms. */
  public void assertConvertsTo(String text, String expected[]) throws IOException {
    Collection<Token> tokens = converter.convert(text);
    TokenStream stream = new CannedTokenStream(tokens.toArray(new Token[0]));
    assertTokenStreamContents(stream, expected);
  }
}

View File

@ -0,0 +1,46 @@
package org.apache.solr.spelling.suggest;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.common.params.SpellingParams;
import org.junit.BeforeClass;
/**
 * End-to-end test of the WFST phrase suggester configured in
 * solrconfig-phrasesuggest.xml against the phrasesuggest.txt dictionary.
 */
public class TestPhraseSuggestions extends SolrTestCaseJ4 {
  static final String URI = "/suggest_wfst";

  // common XPath prefix for all suggestion assertions below;
  // concatenations with it are folded at compile time, so the
  // runtime assertion strings are unchanged
  static final String SUGGESTIONS = "//lst[@name='spellcheck']/lst[@name='suggestions']";

  @BeforeClass
  public static void beforeClass() throws Exception {
    initCore("solrconfig-phrasesuggest.xml","schema-phrasesuggest.xml");
    // build the suggester index up front so the queries below can hit it
    assertQ(req("qt", URI, "q", "", SpellingParams.SPELLCHECK_BUILD, "true"));
  }

  public void test() {
    // "the f" matches three phrases; highest-weighted first (onlyMorePopular=true)
    assertQ(req("qt", URI, "q", "the f", SpellingParams.SPELLCHECK_COUNT, "4"),
        SUGGESTIONS + "/lst[@name='the f']/int[@name='numFound'][.='3']",
        SUGGESTIONS + "/lst[@name='the f']/arr[@name='suggestion']/str[1][.='the final phrase']",
        SUGGESTIONS + "/lst[@name='the f']/arr[@name='suggestion']/str[2][.='the fifth phrase']",
        SUGGESTIONS + "/lst[@name='the f']/arr[@name='suggestion']/str[3][.='the first phrase']"
    );
    // query syntax (+) and case are normalized away before lookup
    assertQ(req("qt", URI, "q", "Testing +12", SpellingParams.SPELLCHECK_COUNT, "4"),
        SUGGESTIONS + "/lst[@name='testing 12']/int[@name='numFound'][.='1']",
        SUGGESTIONS + "/lst[@name='testing 12']/arr[@name='suggestion']/str[1][.='testing 1234']"
    );
  }
}