diff --git a/CHANGES.txt b/CHANGES.txt index 700fe66b3ac..0486929cade 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -309,6 +309,8 @@ New Features to allow more efficient bulk queries (those that retrieve many or all documents). (Brian Whitman via yonik) +78. SOLR-1321: Add better support for efficient wildcard handling (Andrzej Bialecki, Robert Muir, gsingers) + Optimizations ---------------------- 1. SOLR-374: Use IndexReader.reopen to save resources by re-using parts of the diff --git a/src/java/org/apache/solr/analysis/ReversedWildcardFilter.java b/src/java/org/apache/solr/analysis/ReversedWildcardFilter.java new file mode 100644 index 00000000000..262bc90edd1 --- /dev/null +++ b/src/java/org/apache/solr/analysis/ReversedWildcardFilter.java @@ -0,0 +1,85 @@ +package org.apache.solr.analysis; +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.reverse.ReverseStringFilter; +import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; +import org.apache.lucene.analysis.tokenattributes.TermAttribute; + +/** + * This class produces a special form of reversed tokens, suitable for + * better handling of leading wildcards. Tokens from the input TokenStream + * are reversed and prepended with a special "reversed" marker character. + * If withOriginal argument is true then first the + * original token is returned, and then the reversed token (with + * positionIncrement == 0) is returned. Otherwise only reversed + * tokens are returned. + *

Note: this filter doubles the number of tokens in the input stream when + * withOriginal == true, which proportionally increases the size + * of postings and term dictionary in the index. + */ +public class ReversedWildcardFilter extends TokenFilter { + + private boolean withOriginal; + private char markerChar; + private State save; + private TermAttribute termAtt; + private PositionIncrementAttribute posAtt; + + protected ReversedWildcardFilter(TokenStream input, boolean withOriginal, char markerChar) { + super(input); + this.termAtt = (TermAttribute)addAttribute(TermAttribute.class); + this.posAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class); + this.withOriginal = withOriginal; + this.markerChar = markerChar; + } + + @Override + public boolean incrementToken() throws IOException { + if( save != null ) { + // clearAttributes(); // not currently necessary + restoreState(save); + save = null; + return true; + } + + if (!input.incrementToken()) return false; + + // pass through zero-length terms + int oldLen = termAtt.termLength(); + if (oldLen ==0) return true; + int origOffset = posAtt.getPositionIncrement(); + if (withOriginal == true){ + posAtt.setPositionIncrement(0); + save = captureState(); + } + char [] buffer = termAtt.resizeTermBuffer(oldLen + 1); + buffer[oldLen] = markerChar; + //String reversed = reverseAndMark(value, markerChar); + ReverseStringFilter.reverse(buffer, oldLen + 1); + + posAtt.setPositionIncrement(origOffset); + termAtt.setTermBuffer(buffer, 0, oldLen +1); + return true; + } + + +} diff --git a/src/java/org/apache/solr/analysis/ReversedWildcardFilterFactory.java b/src/java/org/apache/solr/analysis/ReversedWildcardFilterFactory.java new file mode 100644 index 00000000000..8cd822eb1ee --- /dev/null +++ b/src/java/org/apache/solr/analysis/ReversedWildcardFilterFactory.java @@ -0,0 +1,131 @@ +package org.apache.solr.analysis; +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Map; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.reverse.ReverseStringFilter; + +/** + * Factory for {@link ReversedWildcardFilter}-s. When this factory is + * added to an analysis chain, it will be used both for filtering the + * tokens during indexing, and to determine the query processing of + * this field during search. + *

This class supports the following init arguments: + *

+ * Note 1: This filter always reverses input tokens during indexing. + * Note 2: Query tokens without wildcard characters will never be reversed. + */ +public class ReversedWildcardFilterFactory extends BaseTokenFilterFactory { + + private char markerChar = ReverseStringFilter.START_OF_HEADING_MARKER; + private boolean withOriginal; + private int maxPosAsterisk; + private int maxPosQuestion; + private int minTrailing; + private float maxFractionAsterisk; + + @Override + public void init(Map args) { + super.init(args); + withOriginal = getBoolean("withOriginal", true); + maxPosAsterisk = getInt("maxPosAsterisk", 2); + maxPosQuestion = getInt("maxPosQuestion", 1); + minTrailing = getInt("minTrailing", 2); + maxFractionAsterisk = getFloat("maxFractionAsterisk", 0.0f); + } + + @Override + public TokenStream create(TokenStream input) { + return new ReversedWildcardFilter(input, withOriginal, markerChar); + } + + /** + * This method encapsulates the logic that determines whether + * a query token should be reversed in order to use the + * reversed terms in the index. + * @param token input token. + * @return true if input token should be reversed, false otherwise. + */ + public boolean shouldReverse(String token) { + int posQ = token.indexOf('?'); + int posA = token.indexOf('*'); + if (posQ == -1 && posA == -1) { // not a wildcard query + return false; + } + int pos; + int lastPos; + int len = token.length(); + lastPos = token.lastIndexOf('?'); + pos = token.lastIndexOf('*'); + if (pos > lastPos) lastPos = pos; + if (posQ != -1) { + pos = posQ; + if (posA != -1) { + pos = Math.min(posQ, posA); + } + } else { + pos = posA; + } + if (len - lastPos < minTrailing) { // too few trailing chars + return false; + } + if (posQ != -1 && posQ < maxPosQuestion) { // leading '?' + return true; + } + if (posA != -1 && posA < maxPosAsterisk) { // leading '*' + return true; + } + // '*' in the leading part + if (maxFractionAsterisk > 0.0f && pos < (float)token.length() * maxFractionAsterisk) { + return true; + } + return false; + } + + public char getMarkerChar() { + return markerChar; + } + + protected float getFloat(String name, float defValue) { + String val = args.get(name); + if (val == null) { + return defValue; + } else { + return Float.parseFloat(val); + } + } +} diff --git a/src/java/org/apache/solr/search/SolrQueryParser.java b/src/java/org/apache/solr/search/SolrQueryParser.java index 2283b5f72bd..a1cac8c7141 100644 --- a/src/java/org/apache/solr/search/SolrQueryParser.java +++ b/src/java/org/apache/solr/search/SolrQueryParser.java @@ -17,14 +17,24 @@ package org.apache.solr.search; +import java.util.HashMap; +import java.util.Map; +import java.util.Map.Entry; + import org.apache.lucene.index.Term; import org.apache.lucene.queryParser.ParseException; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.*; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.reverse.ReverseStringFilter; +import org.apache.solr.analysis.ReversedWildcardFilter; +import org.apache.solr.analysis.ReversedWildcardFilterFactory; +import org.apache.solr.analysis.TokenFilterFactory; +import org.apache.solr.analysis.TokenizerChain; import org.apache.solr.common.SolrException; import org.apache.solr.schema.FieldType; import org.apache.solr.schema.IndexSchema; +import org.apache.solr.schema.SchemaField; import org.apache.solr.schema.TrieField; import org.apache.solr.schema.SchemaField; @@ -55,6 +65,8 @@ public class SolrQueryParser extends QueryParser { protected final IndexSchema schema; protected final QParser parser; protected final String defaultField; + protected final Map leadingWildcards = + new HashMap(); /** * Constructs a SolrQueryParser using the schema to understand the @@ -72,7 +84,8 @@ public class SolrQueryParser extends QueryParser { this.parser = null; this.defaultField = defaultField; setLowercaseExpandedTerms(false); - setEnablePositionIncrements(true); + setEnablePositionIncrements(true); + checkAllowLeadingWildcards(); } public SolrQueryParser(QParser parser, String defaultField) { @@ -86,8 +99,31 @@ public class SolrQueryParser extends QueryParser { this.defaultField = defaultField; setLowercaseExpandedTerms(false); setEnablePositionIncrements(true); + checkAllowLeadingWildcards(); } + protected void checkAllowLeadingWildcards() { + boolean allow = false; + for (Entry e : schema.getFieldTypes().entrySet()) { + Analyzer a = e.getValue().getAnalyzer(); + if (a instanceof TokenizerChain) { + // examine the indexing analysis chain if it supports leading wildcards + TokenizerChain tc = (TokenizerChain)a; + TokenFilterFactory[] factories = tc.getTokenFilterFactories(); + for (TokenFilterFactory factory : factories) { + if (factory instanceof ReversedWildcardFilterFactory) { + allow = true; + leadingWildcards.put(e.getKey(), (ReversedWildcardFilterFactory)factory); + } + } + } + } + // XXX should be enabled on a per-field basis + if (allow) { + setAllowLeadingWildcard(true); + } + } + private void checkNullField(String field) throws SolrException { if (field == null && defaultField == null) { throw new SolrException @@ -149,6 +185,17 @@ public class SolrQueryParser extends QueryParser { } protected Query getWildcardQuery(String field, String termStr) throws ParseException { + // *:* -> MatchAllDocsQuery + if ("*".equals(field) && "*".equals(termStr)) { + return newMatchAllDocsQuery(); + } + + // can we use reversed wildcards in this field? + String type = schema.getFieldType(field).getTypeName(); + ReversedWildcardFilterFactory factory = leadingWildcards.get(type); + if (factory != null && factory.shouldReverse(termStr)) { + termStr = ReverseStringFilter.reverse(termStr + factory.getMarkerChar()); + } Query q = super.getWildcardQuery(field, termStr); if (q instanceof WildcardQuery) { // use a constant score query to avoid overflowing clauses diff --git a/src/test/org/apache/solr/analysis/TestReversedWildcardFilterFactory.java b/src/test/org/apache/solr/analysis/TestReversedWildcardFilterFactory.java new file mode 100644 index 00000000000..04fe1c64544 --- /dev/null +++ b/src/test/org/apache/solr/analysis/TestReversedWildcardFilterFactory.java @@ -0,0 +1,143 @@ +package org.apache.solr.analysis; +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +import java.io.IOException; +import java.io.StringReader; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.WhitespaceTokenizer; +import org.apache.lucene.queryParser.ParseException; +import org.apache.lucene.search.Query; +import org.apache.solr.schema.IndexSchema; +import org.apache.solr.search.SolrQueryParser; + +public class TestReversedWildcardFilterFactory extends BaseTokenTestCase { + Map args = new HashMap(); + ReversedWildcardFilterFactory factory = new ReversedWildcardFilterFactory(); + IndexSchema schema; + + public String getSchemaFile() { + return "schema-reversed.xml"; + } + + public String getSolrConfigFile() { + return "solrconfig.xml"; + } + + public void setUp() throws Exception { + super.setUp(); + schema = new IndexSchema(solrConfig, getSchemaFile(), null); + } + + public void testReversedTokens() throws IOException { + String text = "simple text"; + String expected1 = "simple \u0001elpmis text \u0001txet"; + String expected2 = "\u0001elpmis \u0001txet"; + args.put("withOriginal", "true"); + factory.init(args); + TokenStream input = factory.create(new WhitespaceTokenizer(new StringReader(text))); + List realTokens = getTokens(input); + List expectedTokens = tokens(expected1); + // set positionIncrements in expected tokens + for (int i = 1; i < expectedTokens.size(); i += 2) { + expectedTokens.get(i).setPositionIncrement(0); + } + assertTokEqual(realTokens, expectedTokens); + + // now without original tokens + args.put("withOriginal", "false"); + factory.init(args); + input = factory.create(new WhitespaceTokenizer(new StringReader(text))); + realTokens = getTokens(input); + expectedTokens = tokens(expected2); + assertTokEqual(realTokens, expectedTokens); + } + + public void testIndexingAnalysis() throws Exception { + Analyzer a = schema.getAnalyzer(); + String text = "one two three"; + String expected1 = "one \u0001eno two \u0001owt three \u0001eerht"; + List expectedTokens1 = getTokens( + new WhitespaceTokenizer(new StringReader(expected1))); + // set positionIncrements and offsets in expected tokens + for (int i = 1; i < expectedTokens1.size(); i += 2) { + Token t = expectedTokens1.get(i); + t.setPositionIncrement(0); + } + String expected2 = "\u0001eno \u0001owt \u0001eerht"; + List expectedTokens2 = getTokens( + new WhitespaceTokenizer(new StringReader(expected2))); + String expected3 = "one two three"; + List expectedTokens3 = getTokens( + new WhitespaceTokenizer(new StringReader(expected3))); + // field one + TokenStream input = a.tokenStream("one", new StringReader(text)); + List realTokens = getTokens(input); + assertTokEqual(realTokens, expectedTokens1); + // field two + input = a.tokenStream("two", new StringReader(text)); + realTokens = getTokens(input); + assertTokEqual(realTokens, expectedTokens2); + // field three + input = a.tokenStream("three", new StringReader(text)); + realTokens = getTokens(input); + assertTokEqual(realTokens, expectedTokens3); + } + + public void testQueryParsing() throws IOException, ParseException { + + SolrQueryParser parserOne = new SolrQueryParser(schema, "one"); + assertTrue(parserOne.getAllowLeadingWildcard()); + SolrQueryParser parserTwo = new SolrQueryParser(schema, "two"); + assertTrue(parserTwo.getAllowLeadingWildcard()); + SolrQueryParser parserThree = new SolrQueryParser(schema, "three"); + // XXX note: this should be false, but for now we return true for any field, + // XXX if at least one field uses the reversing + assertTrue(parserThree.getAllowLeadingWildcard()); + String text = "one +two *hree f*ur fiv*"; + String expectedOne = "one:one +one:two one:\u0001eerh* one:\u0001ru*f one:fiv*"; + String expectedTwo = "two:one +two:two two:\u0001eerh* two:\u0001ru*f two:fiv*"; + String expectedThree = "three:one +three:two three:*hree three:f*ur three:fiv*"; + Query q = parserOne.parse(text); + assertEquals(expectedOne, q.toString()); + q = parserTwo.parse(text); + assertEquals(expectedTwo, q.toString()); + q = parserThree.parse(text); + assertEquals(expectedThree, q.toString()); + // test conditional reversal + String condText = "*hree t*ree th*ee thr*e ?hree t?ree th?ee th?*ee " + + "short*token ver*longtoken"; + String expected = "two:\u0001eerh* two:\u0001eer*t two:\u0001ee*ht " + + "two:thr*e " + + "two:\u0001eerh? two:\u0001eer?t " + + "two:th?ee " + + "two:th?*ee " + + "two:short*token " + + "two:\u0001nekotgnol*rev"; + q = parserTwo.parse(condText); + assertEquals(expected, q.toString()); + } + +} diff --git a/src/test/test-files/solr/conf/schema-reversed.xml b/src/test/test-files/solr/conf/schema-reversed.xml new file mode 100644 index 00000000000..f4cea9e2d02 --- /dev/null +++ b/src/test/test-files/solr/conf/schema-reversed.xml @@ -0,0 +1,81 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + one + id + + +