SOLR-1321: Added better support for efficient leading wildcards

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@813830 13f79535-47bb-0310-9956-ffa450edef68
2009-09-11 13:50:16 +00:00 · 2009-09-11 13:50:16 +00:00 · 70fe600134
parent e723f7fbad
commit 70fe600134
6 changed files with 490 additions and 1 deletions
--- a/CHANGES.txt
+++ b/CHANGES.txt
@ -309,6 +309,8 @@ New Features
    to allow more efficient bulk queries (those that retrieve many or all
    documents).  (Brian Whitman via yonik)
 78. SOLR-1321: Add better support for efficient wildcard handling (Andrzej Bialecki, Robert Muir, gsingers)    
 Optimizations
 ----------------------
 1. SOLR-374: Use IndexReader.reopen to save resources by re-using parts of the
--- a/src/java/org/apache/solr/analysis/ReversedWildcardFilter.java
+++ b/src/java/org/apache/solr/analysis/ReversedWildcardFilter.java
@ -0,0 +1,85 @@
 package org.apache.solr.analysis;
 /**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 import java.io.IOException;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.reverse.ReverseStringFilter;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 /**
 * This class produces a special form of reversed tokens, suitable for
 * better handling of leading wildcards. Tokens from the input TokenStream
 * are reversed and prepended with a special "reversed" marker character.
 * If <code>withOriginal<code> argument is <code>true</code> then first the
 * original token is returned, and then the reversed token (with
 * <code>positionIncrement == 0</code>) is returned. Otherwise only reversed
 * tokens are returned.
 * <p>Note: this filter doubles the number of tokens in the input stream when
 * <code>withOriginal == true</code>, which proportionally increases the size
 * of postings and term dictionary in the index.
 */
 public class ReversedWildcardFilter extends TokenFilter {
  private boolean withOriginal;
  private char markerChar;
  private State save;
  private TermAttribute termAtt;
  private PositionIncrementAttribute posAtt;
  protected ReversedWildcardFilter(TokenStream input, boolean withOriginal, char markerChar) {
    super(input);
    this.termAtt = (TermAttribute)addAttribute(TermAttribute.class);
    this.posAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
    this.withOriginal = withOriginal;
    this.markerChar = markerChar;
  }
  @Override
  public boolean incrementToken() throws IOException {
    if( save != null ) {
      // clearAttributes();  // not currently necessary
      restoreState(save);
      save = null;
      return true;
    }
    if (!input.incrementToken()) return false;
    // pass through zero-length terms
    int oldLen = termAtt.termLength();
    if (oldLen ==0) return true;
    int origOffset = posAtt.getPositionIncrement();
    if (withOriginal == true){
      posAtt.setPositionIncrement(0);
      save = captureState();
    }
    char [] buffer = termAtt.resizeTermBuffer(oldLen + 1);
    buffer[oldLen] = markerChar;
    //String reversed = reverseAndMark(value, markerChar);
    ReverseStringFilter.reverse(buffer, oldLen + 1);
    posAtt.setPositionIncrement(origOffset);
    termAtt.setTermBuffer(buffer, 0, oldLen +1);
    return true;
  }
 }
--- a/src/java/org/apache/solr/analysis/ReversedWildcardFilterFactory.java
+++ b/src/java/org/apache/solr/analysis/ReversedWildcardFilterFactory.java
@ -0,0 +1,131 @@
 package org.apache.solr.analysis;
 /**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 import java.util.Map;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.reverse.ReverseStringFilter;
 /**
 * Factory for {@link ReversedWildcardFilter}-s. When this factory is
 * added to an analysis chain, it will be used both for filtering the
 * tokens during indexing, and to determine the query processing of
 * this field during search.
 * <p>This class supports the following init arguments:
 * <ul>
 * <li><code>withOriginal</code> - if true, then produce both original and reversed tokens at
 * the same positions. If false, then produce only reversed tokens.</li>
 * <li><code>maxPosAsterisk</code> - maximum position (1-based) of the asterisk wildcard
 * ('*') that triggers the reversal of query term. Asterisk that occurs at
 * positions higher than this value will not cause the reversal of query term.
 * Defaults to 2, meaning that asterisks on positions 1 and 2 will cause
 * a reversal.</li>
 * <li><code>maxPosQuestion</code> - maximum position (1-based) of the question
 * mark wildcard ('?') that triggers the reversal of query term. Defaults to 1.
 * Set this to 0, and <code>maxPosAsterisk</code> to 1 to reverse only
 * pure suffix queries (i.e. ones with a single leading asterisk).</li>
 * <li><code>maxFractionAsterisk</code> - additional parameter that
 * triggers the reversal if asterisk ('*') position is less than this
 * fraction of the query token length. Defaults to 0.0f (disabled).</li>
 * <li><code>minTrailing</code> - minimum number of trailing characters in query
 * token after the last wildcard character. For good performance this should be
 * set to a value larger than 1. Defaults to 2.
 * </ul>
 * Note 1: This filter always reverses input tokens during indexing.
 * Note 2: Query tokens without wildcard characters will never be reversed.
 */
 public class ReversedWildcardFilterFactory extends BaseTokenFilterFactory {
  private char markerChar = ReverseStringFilter.START_OF_HEADING_MARKER;
  private boolean withOriginal;
  private int maxPosAsterisk;
  private int maxPosQuestion;
  private int minTrailing;
  private float maxFractionAsterisk;
  @Override
  public void init(Map<String, String> args) {
    super.init(args);
    withOriginal = getBoolean("withOriginal", true);
    maxPosAsterisk = getInt("maxPosAsterisk", 2);
    maxPosQuestion = getInt("maxPosQuestion", 1);
    minTrailing = getInt("minTrailing", 2);
    maxFractionAsterisk = getFloat("maxFractionAsterisk", 0.0f);
  }
  @Override
  public TokenStream create(TokenStream input) {
    return new ReversedWildcardFilter(input, withOriginal, markerChar);
  }
  /**
   * This method encapsulates the logic that determines whether
   * a query token should be reversed in order to use the
   * reversed terms in the index.
   * @param token input token.
   * @return true if input token should be reversed, false otherwise.
   */
  public boolean shouldReverse(String token) {
    int posQ = token.indexOf('?');
    int posA = token.indexOf('*');
    if (posQ == -1 && posA == -1) { // not a wildcard query
      return false;
    }
    int pos;
    int lastPos;
    int len = token.length();
    lastPos = token.lastIndexOf('?');
    pos = token.lastIndexOf('*');
    if (pos > lastPos) lastPos = pos;
    if (posQ != -1) {
      pos = posQ;
      if (posA != -1) {
        pos = Math.min(posQ, posA);
      }
    } else {
      pos = posA;
    }
    if (len - lastPos < minTrailing)  { // too few trailing chars
      return false;
    }
    if (posQ != -1 && posQ < maxPosQuestion) {  // leading '?'
      return true;
    }
    if (posA != -1 && posA < maxPosAsterisk) { // leading '*'
      return true;
    }
    // '*' in the leading part
    if (maxFractionAsterisk > 0.0f && pos < (float)token.length() * maxFractionAsterisk) {
      return true;
    }
    return false;
  }
  public char getMarkerChar() {
    return markerChar;
  }
  protected float getFloat(String name, float defValue) {
    String val = args.get(name);
    if (val == null) {
      return defValue;
    } else {
      return Float.parseFloat(val);
    }
  }
 }
--- a/src/java/org/apache/solr/search/SolrQueryParser.java
+++ b/src/java/org/apache/solr/search/SolrQueryParser.java
@ -17,14 +17,24 @@
 package org.apache.solr.search;
 import java.util.HashMap;
 import java.util.Map;
 import java.util.Map.Entry;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.queryParser.ParseException;
 import org.apache.lucene.queryParser.QueryParser;
 import org.apache.lucene.search.*;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.reverse.ReverseStringFilter;
 import org.apache.solr.analysis.ReversedWildcardFilter;
 import org.apache.solr.analysis.ReversedWildcardFilterFactory;
 import org.apache.solr.analysis.TokenFilterFactory;
 import org.apache.solr.analysis.TokenizerChain;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.schema.FieldType;
 import org.apache.solr.schema.IndexSchema;
 import org.apache.solr.schema.SchemaField;
 import org.apache.solr.schema.TrieField;
 import org.apache.solr.schema.SchemaField;
@ -55,6 +65,8 @@ public class SolrQueryParser extends QueryParser {
  protected final IndexSchema schema;
  protected final QParser parser;
  protected final String defaultField;
  protected final Map<String, ReversedWildcardFilterFactory> leadingWildcards =
    new HashMap<String, ReversedWildcardFilterFactory>();
  /**
   * Constructs a SolrQueryParser using the schema to understand the
@ -73,6 +85,7 @@ public class SolrQueryParser extends QueryParser {
    this.defaultField = defaultField;
    setLowercaseExpandedTerms(false);
    setEnablePositionIncrements(true);
    checkAllowLeadingWildcards();
  }
  public SolrQueryParser(QParser parser, String defaultField) {
@ -86,6 +99,29 @@ public class SolrQueryParser extends QueryParser {
    this.defaultField = defaultField;
    setLowercaseExpandedTerms(false);
    setEnablePositionIncrements(true);
    checkAllowLeadingWildcards();
  }
  protected void checkAllowLeadingWildcards() {
    boolean allow = false;
    for (Entry<String, FieldType> e : schema.getFieldTypes().entrySet()) {
      Analyzer a = e.getValue().getAnalyzer();
      if (a instanceof TokenizerChain) {
        // examine the indexing analysis chain if it supports leading wildcards
        TokenizerChain tc = (TokenizerChain)a;
        TokenFilterFactory[] factories = tc.getTokenFilterFactories();
        for (TokenFilterFactory factory : factories) {
          if (factory instanceof ReversedWildcardFilterFactory) {
            allow = true;
            leadingWildcards.put(e.getKey(), (ReversedWildcardFilterFactory)factory);
          }
        }
      }
    }
    // XXX should be enabled on a per-field basis
    if (allow) {
      setAllowLeadingWildcard(true);
    }
  }
  private void checkNullField(String field) throws SolrException {
@ -149,6 +185,17 @@ public class SolrQueryParser extends QueryParser {
  }
  protected Query getWildcardQuery(String field, String termStr) throws ParseException {
    // *:* -> MatchAllDocsQuery
    if ("*".equals(field) && "*".equals(termStr)) {
      return newMatchAllDocsQuery();
    }
    // can we use reversed wildcards in this field?
    String type = schema.getFieldType(field).getTypeName();
    ReversedWildcardFilterFactory factory = leadingWildcards.get(type);
    if (factory != null && factory.shouldReverse(termStr)) {
      termStr = ReverseStringFilter.reverse(termStr + factory.getMarkerChar());
    }
    Query q = super.getWildcardQuery(field, termStr);
    if (q instanceof WildcardQuery) {
      // use a constant score query to avoid overflowing clauses
--- a/src/test/org/apache/solr/analysis/TestReversedWildcardFilterFactory.java
+++ b/src/test/org/apache/solr/analysis/TestReversedWildcardFilterFactory.java
@ -0,0 +1,143 @@
 package org.apache.solr.analysis;
 /**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 import java.io.IOException;
 import java.io.StringReader;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
 import org.apache.lucene.queryParser.ParseException;
 import org.apache.lucene.search.Query;
 import org.apache.solr.schema.IndexSchema;
 import org.apache.solr.search.SolrQueryParser;
 public class TestReversedWildcardFilterFactory extends BaseTokenTestCase {
  Map<String,String> args = new HashMap<String, String>();
  ReversedWildcardFilterFactory factory = new ReversedWildcardFilterFactory();
  IndexSchema schema;
  public String getSchemaFile() {
    return "schema-reversed.xml";
  }
  public String getSolrConfigFile() {
    return "solrconfig.xml";
  }
  public void setUp() throws Exception {
    super.setUp();
    schema = new IndexSchema(solrConfig, getSchemaFile(), null);
  }
  public void testReversedTokens() throws IOException {
    String text = "simple text";
    String expected1 = "simple \u0001elpmis text \u0001txet";
    String expected2 = "\u0001elpmis \u0001txet";
    args.put("withOriginal", "true");
    factory.init(args);
    TokenStream input = factory.create(new WhitespaceTokenizer(new StringReader(text)));
    List<Token> realTokens = getTokens(input);
    List<Token> expectedTokens = tokens(expected1);
    // set positionIncrements in expected tokens
    for (int i = 1; i < expectedTokens.size(); i += 2) {
      expectedTokens.get(i).setPositionIncrement(0);
    }
    assertTokEqual(realTokens, expectedTokens);
    // now without original tokens
    args.put("withOriginal", "false");
    factory.init(args);
    input = factory.create(new WhitespaceTokenizer(new StringReader(text)));
    realTokens = getTokens(input);
    expectedTokens = tokens(expected2);
    assertTokEqual(realTokens, expectedTokens);
  }
  public void testIndexingAnalysis() throws Exception {
    Analyzer a = schema.getAnalyzer();
    String text = "one two three";
    String expected1 = "one \u0001eno two \u0001owt three \u0001eerht";
    List<Token> expectedTokens1 = getTokens(
            new WhitespaceTokenizer(new StringReader(expected1)));
    // set positionIncrements and offsets in expected tokens
    for (int i = 1; i < expectedTokens1.size(); i += 2) {
      Token t = expectedTokens1.get(i);
      t.setPositionIncrement(0);
    }
    String expected2 = "\u0001eno \u0001owt \u0001eerht";
    List<Token> expectedTokens2 = getTokens(
            new WhitespaceTokenizer(new StringReader(expected2)));
    String expected3 = "one two three";
    List<Token> expectedTokens3 = getTokens(
            new WhitespaceTokenizer(new StringReader(expected3)));
    // field one
    TokenStream input = a.tokenStream("one", new StringReader(text));
    List<Token> realTokens = getTokens(input);
    assertTokEqual(realTokens, expectedTokens1);
    // field two
    input = a.tokenStream("two", new StringReader(text));
    realTokens = getTokens(input);
    assertTokEqual(realTokens, expectedTokens2);
    // field three
    input = a.tokenStream("three", new StringReader(text));
    realTokens = getTokens(input);
    assertTokEqual(realTokens, expectedTokens3);
  }
  public void testQueryParsing() throws IOException, ParseException {
    SolrQueryParser parserOne = new SolrQueryParser(schema, "one");
    assertTrue(parserOne.getAllowLeadingWildcard());
    SolrQueryParser parserTwo = new SolrQueryParser(schema, "two");
    assertTrue(parserTwo.getAllowLeadingWildcard());
    SolrQueryParser parserThree = new SolrQueryParser(schema, "three");
    // XXX note: this should be false, but for now we return true for any field,
    // XXX if at least one field uses the reversing
    assertTrue(parserThree.getAllowLeadingWildcard());
    String text = "one +two *hree f*ur fiv*";
    String expectedOne = "one:one +one:two one:\u0001eerh* one:\u0001ru*f one:fiv*";
    String expectedTwo = "two:one +two:two two:\u0001eerh* two:\u0001ru*f two:fiv*";
    String expectedThree = "three:one +three:two three:*hree three:f*ur three:fiv*";
    Query q = parserOne.parse(text);
    assertEquals(expectedOne, q.toString());
    q = parserTwo.parse(text);
    assertEquals(expectedTwo, q.toString());
    q = parserThree.parse(text);
    assertEquals(expectedThree, q.toString());
    // test conditional reversal
    String condText = "*hree t*ree th*ee thr*e ?hree t?ree th?ee th?*ee " + 
        "short*token ver*longtoken";
    String expected = "two:\u0001eerh* two:\u0001eer*t two:\u0001ee*ht " +
        "two:thr*e " +
        "two:\u0001eerh? two:\u0001eer?t " +
        "two:th?ee " +
        "two:th?*ee " +
        "two:short*token " +
        "two:\u0001nekotgnol*rev";
    q = parserTwo.parse(condText);
    assertEquals(expected, q.toString());
  }
 }
--- a/src/test/test-files/solr/conf/schema-reversed.xml
+++ b/src/test/test-files/solr/conf/schema-reversed.xml
@ -0,0 +1,81 @@
 <?xml version="1.0" ?>
 <!--
 Licensed to the Apache Software Foundation (ASF) under one or more
 contributor license agreements.  See the NOTICE file distributed with
 this work for additional information regarding copyright ownership.
 The ASF licenses this file to You under the Apache License, Version 2.0
 (the "License"); you may not use this file except in compliance with
 the License.  You may obtain a copy of the License at
     http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 -->
 <!--
  For testing reversed wildcards.
  -->
 <schema name="test" version="1.0">
  <types>
    <fieldtype name="integer" class="solr.IntField" />
    <fieldtype name="text" class="solr.TextField">
      <analyzer>
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.LowerCaseFilterFactory"/>
      </analyzer>
    </fieldtype>
    <fieldtype name="srev" class="solr.TextField">
      <analyzer type="index">
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.ReversedWildcardFilterFactory" withOriginal="true"
            maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"/>
      </analyzer>
      <analyzer type="query">
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.LowerCaseFilterFactory"/>
      </analyzer>
    </fieldtype>
    <fieldtype name="rev" class="solr.TextField">
      <analyzer type="index">
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.ReversedWildcardFilterFactory" withOriginal="false"
            maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"
            minTrailing="1"/>
      </analyzer>
      <analyzer type="query">
        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
        <filter class="solr.LowerCaseFilterFactory"/>
      </analyzer>
    </fieldtype>
 </types>
 <fields>
   <field name="id" type="integer" indexed="true" stored="true" multiValued="false" required="false"/>
   <field name="one" type="srev" indexed="true" stored="false"/>
   <field name="two" type="rev" indexed="true" stored="false"/>
   <field name="three" type="text" indexed="true" stored="false"/>
 </fields>
 <defaultSearchField>one</defaultSearchField>
 <uniqueKey>id</uniqueKey>
 </schema>