SOLR-1321: Added better support for efficient leading wildcards

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@813830 13f79535-47bb-0310-9956-ffa450edef68
2009-09-11 13:50:16 +00:00 · 2009-09-11 13:50:16 +00:00 · 70fe600134
parent e723f7fbad
commit 70fe600134
6 changed files with 490 additions and 1 deletions
--- a/CHANGES.txt
+++ b/CHANGES.txt
@ -309,6 +309,8 @@ New Features
    to allow more efficient bulk queries (those that retrieve many or all
    documents).  (Brian Whitman via yonik)

+78. SOLR-1321: Add better support for efficient wildcard handling (Andrzej Bialecki, Robert Muir, gsingers)    
+
 Optimizations
 ----------------------
 1. SOLR-374: Use IndexReader.reopen to save resources by re-using parts of the
--- a/src/java/org/apache/solr/analysis/ReversedWildcardFilter.java
+++ b/src/java/org/apache/solr/analysis/ReversedWildcardFilter.java
@ -0,0 +1,85 @@
+package org.apache.solr.analysis;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.reverse.ReverseStringFilter;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
+/**
+ * This class produces a special form of reversed tokens, suitable for
+ * better handling of leading wildcards. Tokens from the input TokenStream
+ * are reversed and prepended with a special "reversed" marker character.
+ * If <code>withOriginal<code> argument is <code>true</code> then first the
+ * original token is returned, and then the reversed token (with
+ * <code>positionIncrement == 0</code>) is returned. Otherwise only reversed
+ * tokens are returned.
+ * <p>Note: this filter doubles the number of tokens in the input stream when
+ * <code>withOriginal == true</code>, which proportionally increases the size
+ * of postings and term dictionary in the index.
+ */
+public class ReversedWildcardFilter extends TokenFilter {
+  
+  private boolean withOriginal;
+  private char markerChar;
+  private State save;
+  private TermAttribute termAtt;
+  private PositionIncrementAttribute posAtt;
+
+  protected ReversedWildcardFilter(TokenStream input, boolean withOriginal, char markerChar) {
+    super(input);
+    this.termAtt = (TermAttribute)addAttribute(TermAttribute.class);
+    this.posAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+    this.withOriginal = withOriginal;
+    this.markerChar = markerChar;
+  }
+
+  @Override
+  public boolean incrementToken() throws IOException {
+    if( save != null ) {
+      // clearAttributes();  // not currently necessary
+      restoreState(save);
+      save = null;
+      return true;
+    }
+
+    if (!input.incrementToken()) return false;
+
+    // pass through zero-length terms
+    int oldLen = termAtt.termLength();
+    if (oldLen ==0) return true;
+    int origOffset = posAtt.getPositionIncrement();
+    if (withOriginal == true){
+      posAtt.setPositionIncrement(0);
+      save = captureState();
+    }
+    char [] buffer = termAtt.resizeTermBuffer(oldLen + 1);
+    buffer[oldLen] = markerChar;
+    //String reversed = reverseAndMark(value, markerChar);
+    ReverseStringFilter.reverse(buffer, oldLen + 1);
+
+    posAtt.setPositionIncrement(origOffset);
+    termAtt.setTermBuffer(buffer, 0, oldLen +1);
+    return true;
+  }
+  
+   
+}
--- a/src/java/org/apache/solr/analysis/ReversedWildcardFilterFactory.java
+++ b/src/java/org/apache/solr/analysis/ReversedWildcardFilterFactory.java
@ -0,0 +1,131 @@
+package org.apache.solr.analysis;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Map;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.reverse.ReverseStringFilter;
+
+/**
+ * Factory for {@link ReversedWildcardFilter}-s. When this factory is
+ * added to an analysis chain, it will be used both for filtering the
+ * tokens during indexing, and to determine the query processing of
+ * this field during search.
+ * <p>This class supports the following init arguments:
+ * <ul>
+ * <li><code>withOriginal</code> - if true, then produce both original and reversed tokens at
+ * the same positions. If false, then produce only reversed tokens.</li>
+ * <li><code>maxPosAsterisk</code> - maximum position (1-based) of the asterisk wildcard
+ * ('*') that triggers the reversal of query term. Asterisk that occurs at
+ * positions higher than this value will not cause the reversal of query term.
+ * Defaults to 2, meaning that asterisks on positions 1 and 2 will cause
+ * a reversal.</li>
+ * <li><code>maxPosQuestion</code> - maximum position (1-based) of the question
+ * mark wildcard ('?') that triggers the reversal of query term. Defaults to 1.
+ * Set this to 0, and <code>maxPosAsterisk</code> to 1 to reverse only
+ * pure suffix queries (i.e. ones with a single leading asterisk).</li>
+ * <li><code>maxFractionAsterisk</code> - additional parameter that
+ * triggers the reversal if asterisk ('*') position is less than this
+ * fraction of the query token length. Defaults to 0.0f (disabled).</li>
+ * <li><code>minTrailing</code> - minimum number of trailing characters in query
+ * token after the last wildcard character. For good performance this should be
+ * set to a value larger than 1. Defaults to 2.
+ * </ul>
+ * Note 1: This filter always reverses input tokens during indexing.
+ * Note 2: Query tokens without wildcard characters will never be reversed.
+ */
+public class ReversedWildcardFilterFactory extends BaseTokenFilterFactory {
+  
+  private char markerChar = ReverseStringFilter.START_OF_HEADING_MARKER;
+  private boolean withOriginal;
+  private int maxPosAsterisk;
+  private int maxPosQuestion;
+  private int minTrailing;
+  private float maxFractionAsterisk;
+
+  @Override
+  public void init(Map<String, String> args) {
+    super.init(args);
+    withOriginal = getBoolean("withOriginal", true);
+    maxPosAsterisk = getInt("maxPosAsterisk", 2);
+    maxPosQuestion = getInt("maxPosQuestion", 1);
+    minTrailing = getInt("minTrailing", 2);
+    maxFractionAsterisk = getFloat("maxFractionAsterisk", 0.0f);
+  }
+
+  @Override
+  public TokenStream create(TokenStream input) {
+    return new ReversedWildcardFilter(input, withOriginal, markerChar);
+  }
+  
+  /**
+   * This method encapsulates the logic that determines whether
+   * a query token should be reversed in order to use the
+   * reversed terms in the index.
+   * @param token input token.
+   * @return true if input token should be reversed, false otherwise.
+   */
+  public boolean shouldReverse(String token) {
+    int posQ = token.indexOf('?');
+    int posA = token.indexOf('*');
+    if (posQ == -1 && posA == -1) { // not a wildcard query
+      return false;
+    }
+    int pos;
+    int lastPos;
+    int len = token.length();
+    lastPos = token.lastIndexOf('?');
+    pos = token.lastIndexOf('*');
+    if (pos > lastPos) lastPos = pos;
+    if (posQ != -1) {
+      pos = posQ;
+      if (posA != -1) {
+        pos = Math.min(posQ, posA);
+      }
+    } else {
+      pos = posA;
+    }
+    if (len - lastPos < minTrailing)  { // too few trailing chars
+      return false;
+    }
+    if (posQ != -1 && posQ < maxPosQuestion) {  // leading '?'
+      return true;
+    }
+    if (posA != -1 && posA < maxPosAsterisk) { // leading '*'
+      return true;
+    }
+    // '*' in the leading part
+    if (maxFractionAsterisk > 0.0f && pos < (float)token.length() * maxFractionAsterisk) {
+      return true;
+    }
+    return false;
+  }
+  
+  public char getMarkerChar() {
+    return markerChar;
+  }
+  
+  protected float getFloat(String name, float defValue) {
+    String val = args.get(name);
+    if (val == null) {
+      return defValue;
+    } else {
+      return Float.parseFloat(val);
+    }
+  }
+}
--- a/src/java/org/apache/solr/search/SolrQueryParser.java
+++ b/src/java/org/apache/solr/search/SolrQueryParser.java
@ -17,14 +17,24 @@

 package org.apache.solr.search;

+import java.util.HashMap;
+import java.util.Map;
+import java.util.Map.Entry;
+
 import org.apache.lucene.index.Term;
 import org.apache.lucene.queryParser.ParseException;
 import org.apache.lucene.queryParser.QueryParser;
 import org.apache.lucene.search.*;
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.reverse.ReverseStringFilter;
+import org.apache.solr.analysis.ReversedWildcardFilter;
+import org.apache.solr.analysis.ReversedWildcardFilterFactory;
+import org.apache.solr.analysis.TokenFilterFactory;
+import org.apache.solr.analysis.TokenizerChain;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.schema.FieldType;
 import org.apache.solr.schema.IndexSchema;
+import org.apache.solr.schema.SchemaField;
 import org.apache.solr.schema.TrieField;
 import org.apache.solr.schema.SchemaField;

@ -55,6 +65,8 @@ public class SolrQueryParser extends QueryParser {
  protected final IndexSchema schema;
  protected final QParser parser;
  protected final String defaultField;
+  protected final Map<String, ReversedWildcardFilterFactory> leadingWildcards =
+    new HashMap<String, ReversedWildcardFilterFactory>();

  /**
   * Constructs a SolrQueryParser using the schema to understand the
@ -72,7 +84,8 @@ public class SolrQueryParser extends QueryParser {
    this.parser  = null;
    this.defaultField = defaultField;
    setLowercaseExpandedTerms(false);
-    setEnablePositionIncrements(true);    
+    setEnablePositionIncrements(true);
+    checkAllowLeadingWildcards();
  }

  public SolrQueryParser(QParser parser, String defaultField) {
@ -86,8 +99,31 @@ public class SolrQueryParser extends QueryParser {
    this.defaultField = defaultField;
    setLowercaseExpandedTerms(false);
    setEnablePositionIncrements(true);
+    checkAllowLeadingWildcards();
  }

+  protected void checkAllowLeadingWildcards() {
+    boolean allow = false;
+    for (Entry<String, FieldType> e : schema.getFieldTypes().entrySet()) {
+      Analyzer a = e.getValue().getAnalyzer();
+      if (a instanceof TokenizerChain) {
+        // examine the indexing analysis chain if it supports leading wildcards
+        TokenizerChain tc = (TokenizerChain)a;
+        TokenFilterFactory[] factories = tc.getTokenFilterFactories();
+        for (TokenFilterFactory factory : factories) {
+          if (factory instanceof ReversedWildcardFilterFactory) {
+            allow = true;
+            leadingWildcards.put(e.getKey(), (ReversedWildcardFilterFactory)factory);
+          }
+        }
+      }
+    }
+    // XXX should be enabled on a per-field basis
+    if (allow) {
+      setAllowLeadingWildcard(true);
+    }
+  }
+  
  private void checkNullField(String field) throws SolrException {
    if (field == null && defaultField == null) {
      throw new SolrException
@ -149,6 +185,17 @@ public class SolrQueryParser extends QueryParser {
  }

  protected Query getWildcardQuery(String field, String termStr) throws ParseException {
+    // *:* -> MatchAllDocsQuery
+    if ("*".equals(field) && "*".equals(termStr)) {
+      return newMatchAllDocsQuery();
+    }
+    
+    // can we use reversed wildcards in this field?
+    String type = schema.getFieldType(field).getTypeName();
+    ReversedWildcardFilterFactory factory = leadingWildcards.get(type);
+    if (factory != null && factory.shouldReverse(termStr)) {
+      termStr = ReverseStringFilter.reverse(termStr + factory.getMarkerChar());
+    }
    Query q = super.getWildcardQuery(field, termStr);
    if (q instanceof WildcardQuery) {
      // use a constant score query to avoid overflowing clauses
--- a/src/test/org/apache/solr/analysis/TestReversedWildcardFilterFactory.java
+++ b/src/test/org/apache/solr/analysis/TestReversedWildcardFilterFactory.java
@ -0,0 +1,143 @@
+package org.apache.solr.analysis;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
+import org.apache.lucene.queryParser.ParseException;
+import org.apache.lucene.search.Query;
+import org.apache.solr.schema.IndexSchema;
+import org.apache.solr.search.SolrQueryParser;
+
+public class TestReversedWildcardFilterFactory extends BaseTokenTestCase {
+  Map<String,String> args = new HashMap<String, String>();
+  ReversedWildcardFilterFactory factory = new ReversedWildcardFilterFactory();
+  IndexSchema schema;
+
+  public String getSchemaFile() {
+    return "schema-reversed.xml";
+  }
+
+  public String getSolrConfigFile() {
+    return "solrconfig.xml";
+  }
+  
+  public void setUp() throws Exception {
+    super.setUp();
+    schema = new IndexSchema(solrConfig, getSchemaFile(), null);
+  }
+
+  public void testReversedTokens() throws IOException {
+    String text = "simple text";
+    String expected1 = "simple \u0001elpmis text \u0001txet";
+    String expected2 = "\u0001elpmis \u0001txet";
+    args.put("withOriginal", "true");
+    factory.init(args);
+    TokenStream input = factory.create(new WhitespaceTokenizer(new StringReader(text)));
+    List<Token> realTokens = getTokens(input);
+    List<Token> expectedTokens = tokens(expected1);
+    // set positionIncrements in expected tokens
+    for (int i = 1; i < expectedTokens.size(); i += 2) {
+      expectedTokens.get(i).setPositionIncrement(0);
+    }
+    assertTokEqual(realTokens, expectedTokens);
+    
+    // now without original tokens
+    args.put("withOriginal", "false");
+    factory.init(args);
+    input = factory.create(new WhitespaceTokenizer(new StringReader(text)));
+    realTokens = getTokens(input);
+    expectedTokens = tokens(expected2);
+    assertTokEqual(realTokens, expectedTokens);
+  }
+  
+  public void testIndexingAnalysis() throws Exception {
+    Analyzer a = schema.getAnalyzer();
+    String text = "one two three";
+    String expected1 = "one \u0001eno two \u0001owt three \u0001eerht";
+    List<Token> expectedTokens1 = getTokens(
+            new WhitespaceTokenizer(new StringReader(expected1)));
+    // set positionIncrements and offsets in expected tokens
+    for (int i = 1; i < expectedTokens1.size(); i += 2) {
+      Token t = expectedTokens1.get(i);
+      t.setPositionIncrement(0);
+    }
+    String expected2 = "\u0001eno \u0001owt \u0001eerht";
+    List<Token> expectedTokens2 = getTokens(
+            new WhitespaceTokenizer(new StringReader(expected2)));
+    String expected3 = "one two three";
+    List<Token> expectedTokens3 = getTokens(
+            new WhitespaceTokenizer(new StringReader(expected3)));
+    // field one
+    TokenStream input = a.tokenStream("one", new StringReader(text));
+    List<Token> realTokens = getTokens(input);
+    assertTokEqual(realTokens, expectedTokens1);
+    // field two
+    input = a.tokenStream("two", new StringReader(text));
+    realTokens = getTokens(input);
+    assertTokEqual(realTokens, expectedTokens2);
+    // field three
+    input = a.tokenStream("three", new StringReader(text));
+    realTokens = getTokens(input);
+    assertTokEqual(realTokens, expectedTokens3);
+  }
+  
+  public void testQueryParsing() throws IOException, ParseException {
+
+    SolrQueryParser parserOne = new SolrQueryParser(schema, "one");
+    assertTrue(parserOne.getAllowLeadingWildcard());
+    SolrQueryParser parserTwo = new SolrQueryParser(schema, "two");
+    assertTrue(parserTwo.getAllowLeadingWildcard());
+    SolrQueryParser parserThree = new SolrQueryParser(schema, "three");
+    // XXX note: this should be false, but for now we return true for any field,
+    // XXX if at least one field uses the reversing
+    assertTrue(parserThree.getAllowLeadingWildcard());
+    String text = "one +two *hree f*ur fiv*";
+    String expectedOne = "one:one +one:two one:\u0001eerh* one:\u0001ru*f one:fiv*";
+    String expectedTwo = "two:one +two:two two:\u0001eerh* two:\u0001ru*f two:fiv*";
+    String expectedThree = "three:one +three:two three:*hree three:f*ur three:fiv*";
+    Query q = parserOne.parse(text);
+    assertEquals(expectedOne, q.toString());
+    q = parserTwo.parse(text);
+    assertEquals(expectedTwo, q.toString());
+    q = parserThree.parse(text);
+    assertEquals(expectedThree, q.toString());
+    // test conditional reversal
+    String condText = "*hree t*ree th*ee thr*e ?hree t?ree th?ee th?*ee " + 
+        "short*token ver*longtoken";
+    String expected = "two:\u0001eerh* two:\u0001eer*t two:\u0001ee*ht " +
+        "two:thr*e " +
+        "two:\u0001eerh? two:\u0001eer?t " +
+        "two:th?ee " +
+        "two:th?*ee " +
+        "two:short*token " +
+        "two:\u0001nekotgnol*rev";
+    q = parserTwo.parse(condText);
+    assertEquals(expected, q.toString());
+  }
+
+}
--- a/src/test/test-files/solr/conf/schema-reversed.xml
+++ b/src/test/test-files/solr/conf/schema-reversed.xml
@ -0,0 +1,81 @@
+<?xml version="1.0" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<!--
+
+  For testing reversed wildcards.
+  -->
+
+<schema name="test" version="1.0">
+  <types>
+
+
+    <fieldtype name="integer" class="solr.IntField" />
+
+    <fieldtype name="text" class="solr.TextField">
+      <analyzer>
+        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+      </analyzer>
+    </fieldtype>
+    
+    <fieldtype name="srev" class="solr.TextField">
+      <analyzer type="index">
+        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+        <filter class="solr.ReversedWildcardFilterFactory" withOriginal="true"
+            maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"/>
+      </analyzer>
+
+      <analyzer type="query">
+        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+
+      </analyzer>
+    </fieldtype>
+    <fieldtype name="rev" class="solr.TextField">
+      <analyzer type="index">
+        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+        <filter class="solr.ReversedWildcardFilterFactory" withOriginal="false"
+            maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"
+            minTrailing="1"/>
+      </analyzer>
+
+      <analyzer type="query">
+        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+
+      </analyzer>
+    </fieldtype>
+ </types>
+
+
+ <fields>
+   <field name="id" type="integer" indexed="true" stored="true" multiValued="false" required="false"/>
+   <field name="one" type="srev" indexed="true" stored="false"/>
+   <field name="two" type="rev" indexed="true" stored="false"/>
+   <field name="three" type="text" indexed="true" stored="false"/>
+
+ </fields>
+
+ <defaultSearchField>one</defaultSearchField>
+ <uniqueKey>id</uniqueKey>
+
+
+</schema>