diff --git a/lucene/core/src/java/org/apache/lucene/util/LSBRadixSorter.java b/lucene/core/src/java/org/apache/lucene/util/LSBRadixSorter.java
index 7526548daeb..5d8208ac1f4 100644
--- a/lucene/core/src/java/org/apache/lucene/util/LSBRadixSorter.java
+++ b/lucene/core/src/java/org/apache/lucene/util/LSBRadixSorter.java
@@ -21,8 +21,9 @@ import java.util.Arrays;
 
 /**
  * A LSB Radix sorter for unsigned int values.
+ * @lucene.internal
  */
-final class LSBRadixSorter {
+public final class LSBRadixSorter {
 
   private static final int INSERTION_SORT_THRESHOLD = 30;
   private static final int HISTOGRAM_SIZE = 256;
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 9b8980bb71d..4f0efe0b042 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -72,6 +72,11 @@ Optimizations
 * SOLR-7876: Speed up queries and operations that use many terms when timeAllowed
   has not been specified.  Speedups of up to 8% were observed. (yonik)
 
+* SOLR-8037: Speed up creation of filters from term range queries (i.e. non-numeric range queries)
+  and use the filter cache for term range queries that are part of larger queries. Some observed
+  speedups were up to 2.5x for production of filters, and up to 10x for query evaluation with
+  embedded term range queries that resulted in filter cache hits. (yonik)
+
 Other Changes
 ----------------------
 
diff --git a/solr/core/src/java/org/apache/solr/query/SolrRangeQuery.java b/solr/core/src/java/org/apache/solr/query/SolrRangeQuery.java
new file mode 100644
index 00000000000..b9493b70e6d
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/query/SolrRangeQuery.java
@@ -0,0 +1,502 @@
+package org.apache.solr.query;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.PostingsEnum;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermContext;
+import org.apache.lucene.index.TermState;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.BulkScorer;
+import org.apache.lucene.search.ConstantScoreQuery;
+import org.apache.lucene.search.ConstantScoreScorer;
+import org.apache.lucene.search.ConstantScoreWeight;
+import org.apache.lucene.search.DocIdSet;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.Filter;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.Scorer;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.Weight;
+import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.DocIdSetBuilder;
+import org.apache.lucene.util.FixedBitSet;
+import org.apache.solr.search.BitDocSet;
+import org.apache.solr.search.DocSet;
+import org.apache.solr.search.DocSetBuilder;
+import org.apache.solr.search.DocSetProducer;
+import org.apache.solr.search.ExtendedQueryBase;
+import org.apache.solr.search.SolrConstantScoreQuery;
+import org.apache.solr.search.SolrIndexSearcher;
+
+/** @lucene.experimental */
+public final class SolrRangeQuery extends ExtendedQueryBase implements DocSetProducer {
+  private final String field;
+  private final BytesRef lower;
+  private final BytesRef upper;
+  private byte flags;
+  private static final byte FLAG_INC_LOWER = 0x01;
+  private static final byte FLAG_INC_UPPER = 0x02;
+
+  public SolrRangeQuery(String field, BytesRef lower, BytesRef upper, boolean includeLower, boolean includeUpper) {
+    this.field = field;
+    this.lower = lower;
+    this.upper = upper;
+    this.flags = (byte)((this.lower != null && includeLower ? FLAG_INC_LOWER : 0) | (this.upper != null && includeUpper ? FLAG_INC_UPPER : 0));
+  }
+
+  public String getField() {
+    return field;
+  }
+
+  public boolean includeLower() {
+    return (flags & FLAG_INC_LOWER) != 0;
+  }
+
+  public boolean includeUpper() {
+    return (flags & FLAG_INC_UPPER) != 0;
+  }
+
+  @Override
+  public int hashCode() {
+    int hash = 0x8f2c9ba7 * (flags+1);  // avoid multiplying by 0
+    hash = hash * 29 + ((lower == null) ? 0 : lower.hashCode());  // TODO: simpler hash code here?
+    hash = hash * 29 + ((upper == null) ? 0 : upper.hashCode());
+    return hash;
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj) {
+      return true;
+    }
+    if (!(obj instanceof SolrRangeQuery)) {
+      return false;
+    }
+    SolrRangeQuery other = (SolrRangeQuery)obj;
+
+    return (this.flags == other.flags)
+        && (this.field.equals(other.field))
+        && (this.lower == other.lower || (this.lower != null && other.lower != null && this.lower.equals(other.lower)))
+        && (this.upper == other.upper || (this.upper != null && other.upper != null && this.upper.equals(other.upper)))
+        ;
+  }
+
+  @Override
+  public String toString(String field) {
+    StringBuilder buffer = new StringBuilder();
+    if (!getField().equals(field)) {
+      buffer.append(getField());
+      buffer.append(":");
+    }
+    // TODO: use our schema?
+    buffer.append(includeLower() ? '[' : '{');
+    buffer.append(endpoint(lower));
+    buffer.append(" TO ");
+    buffer.append(endpoint(upper));
+    buffer.append(includeUpper() ? ']' : '}');
+    return buffer.toString();
+  }
+
+  private String endpoint(BytesRef ref) {
+    if (ref == null) return "*";
+    String toStr = Term.toString(ref);
+    if ("*".equals(toStr)) {
+      toStr = "\\*";
+    }
+    // TODO: other escaping
+    return toStr;
+  }
+
+  @Override
+  public Query rewrite(IndexReader reader) throws IOException {
+    return this;
+  }
+
+  @Override
+  public Weight createWeight(IndexSearcher searcher, boolean needScores) throws IOException {
+    return new ConstWeight(searcher, needScores);
+    /*
+    DocSet docs = createDocSet(searcher.getIndexReader().leaves(), searcher.getIndexReader().maxDoc());
+    SolrConstantScoreQuery csq = new SolrConstantScoreQuery( docs.getTopFilter() );
+    return csq.createWeight(searcher, needScores);
+    */
+  }
+
+  @Override
+  public DocSet createDocSet(SolrIndexSearcher searcher) throws IOException {
+    return createDocSet( searcher, Math.min(64,(searcher.maxDoc()>>>10)+4) );
+  }
+
+  private DocSet createDocSet(SolrIndexSearcher searcher, long cost) throws IOException {
+    int maxDoc = searcher.maxDoc();
+    BitDocSet liveDocs = searcher.getLiveDocs();
+    FixedBitSet liveBits = liveDocs.size() == maxDoc ? null : liveDocs.getBits();
+
+    DocSetBuilder builder = new DocSetBuilder(maxDoc, cost);
+
+    List<LeafReaderContext> leaves = searcher.getTopReaderContext().leaves();
+
+    int maxTermsPerSegment = 0;
+    for (LeafReaderContext ctx : leaves) {
+      TermsEnum te = getTermsEnum(ctx);
+      int termsVisited = builder.add(te, ctx.docBase);
+      maxTermsPerSegment = Math.max(maxTermsPerSegment, termsVisited);
+    }
+
+    return maxTermsPerSegment <= 1 ? builder.buildUniqueInOrder(liveBits) : builder.build(liveBits);
+  }
+
+  private class RangeTermsEnum extends TermsEnum {
+
+    TermsEnum te;
+    BytesRef curr;
+    boolean positioned;
+
+    public RangeTermsEnum(Terms terms) throws IOException {
+      if (terms == null) {
+        positioned = true;
+      } else {
+        te = terms.iterator();
+        if (lower != null) {
+          SeekStatus status = te.seekCeil(lower);
+          if (status == SeekStatus.END) {
+            positioned = true;
+            curr = null;
+          } else if (status == SeekStatus.FOUND) {
+            positioned = includeLower();
+            curr = te.term();
+          } else {
+            // lower bound not found, so includeLower is irrelevant
+            positioned = true;
+            curr = te.term();
+          }
+        }
+      }
+    }
+
+    @Override
+    public SeekStatus seekCeil(BytesRef text) throws IOException {
+      return te.seekCeil(text);
+    }
+
+    @Override
+    public void seekExact(long ord) throws IOException {
+      te.seekExact(ord);
+    }
+
+    @Override
+    public BytesRef term() throws IOException {
+      return te.term();  // should be equal to curr, except if we went past the end
+    }
+
+    @Override
+    public long ord() throws IOException {
+      return te.ord();
+    }
+
+    @Override
+    public int docFreq() throws IOException {
+      return te.docFreq();
+    }
+
+    @Override
+    public long totalTermFreq() throws IOException {
+      return te.totalTermFreq();
+    }
+
+    @Override
+    public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException {
+      return te.postings(reuse, flags);
+    }
+
+    @Override
+    public BytesRef next() throws IOException {
+      if (positioned) {
+        positioned = false;
+      } else {
+        curr = te.next();
+      }
+
+      if (curr == null) return null;
+
+      if (upper != null) {
+        int cmp = curr.compareTo(upper);
+        if (cmp < 0 || (cmp == 0 && includeUpper())) {
+          return curr;
+        } else {
+          curr = null;
+        }
+      }
+      return curr;
+    }
+
+    @Override
+    public AttributeSource attributes() {
+      return te.attributes();
+    }
+
+    @Override
+    public boolean seekExact(BytesRef text) throws IOException {
+      return te.seekExact(text);
+    }
+
+    @Override
+    public void seekExact(BytesRef term, TermState state) throws IOException {
+      te.seekExact(term, state);
+    }
+
+    @Override
+    public TermState termState() throws IOException {
+      return te.termState();
+    }
+  }
+
+  public TermsEnum getTermsEnum(LeafReaderContext ctx) throws IOException {
+    return new RangeTermsEnum( ctx.reader().terms(getField()) );
+  }
+
+  private static class TermAndState {
+    final BytesRef term;
+    final TermState state;
+    final int docFreq;
+    final long totalTermFreq;
+
+    TermAndState(BytesRef term, TermState state, int docFreq, long totalTermFreq) {
+      this.term = term;
+      this.state = state;
+      this.docFreq = docFreq;
+      this.totalTermFreq = totalTermFreq;
+    }
+  }
+
+  private static class SegState {
+    final Weight weight;
+    final DocIdSet set;
+
+    SegState(Weight weight) {
+      this.weight = weight;
+      this.set = null;
+    }
+
+    SegState(DocIdSet set) {
+      this.set = set;
+      this.weight = null;
+    }
+  }
+
+  // adapted from MultiTermQueryConstantScoreWrapper
+  class ConstWeight extends ConstantScoreWeight {
+
+    private static final int BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD = 16;
+
+    final IndexSearcher searcher;
+    final boolean needScores;
+    boolean checkedFilterCache;
+    Filter filter;
+    final SegState[] segStates;
+
+    protected ConstWeight(IndexSearcher searcher, boolean needScores) {
+      super( SolrRangeQuery.this );
+      this.searcher = searcher;
+      this.segStates = new SegState[ searcher.getIndexReader().leaves().size() ];
+      this.needScores = needScores;
+    }
+
+    /**
+     * Try to collect terms from the given terms enum and return count=sum(df) for terms
+     * visited so far, or (-count - 1) if this should be rewritten into a boolean query.
+     * The termsEnum will already be positioned on the next term if not exhausted.
+     */
+    private long collectTerms(LeafReaderContext context, TermsEnum termsEnum, List<TermAndState> terms) throws IOException {
+      long count = 0;
+      final int threshold = Math.min(BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD, BooleanQuery.getMaxClauseCount());
+      for (int i = 0; i < threshold; ++i) {
+        final BytesRef term = termsEnum.next();
+        if (term == null) {
+          return -count - 1;
+        }
+        TermState state = termsEnum.termState();
+        if (state.isRealTerm() == false) {
+          // TermQuery does not accept fake terms for now
+          return count;
+        }
+        int df = termsEnum.docFreq();
+        count += df;
+        terms.add(new TermAndState(BytesRef.deepCopyOf(term), state, df, termsEnum.totalTermFreq()));
+      }
+      return termsEnum.next() == null ? (-count - 1) : count;
+    }
+
+    private SegState getSegState(LeafReaderContext context) throws IOException {
+      SegState segState = segStates[context.ord];
+      if (segState != null) return segState;
+
+      // first time, check our filter cache
+      boolean doCheck = !checkedFilterCache && context.ord == 0;
+      checkedFilterCache = true;
+      SolrIndexSearcher solrSearcher = null;
+      if (doCheck && searcher instanceof SolrIndexSearcher) {
+        solrSearcher = (SolrIndexSearcher)searcher;
+        if (solrSearcher.getFilterCache() == null) {
+          doCheck = false;
+        } else {
+          DocSet answer = solrSearcher.getFilterCache().get(SolrRangeQuery.this);
+          if (answer != null) {
+            filter = answer.getTopFilter();
+          }
+        }
+      }
+
+      if (filter != null) {
+        return segStates[context.ord] = new SegState(filter.getDocIdSet(context, null));
+      }
+
+      final Terms terms = context.reader().terms(SolrRangeQuery.this.getField());
+      if (terms == null) {
+        return segStates[context.ord] = new SegState((DocIdSet) null);
+      }
+
+      final TermsEnum termsEnum = SolrRangeQuery.this.getTermsEnum(context);
+
+      PostingsEnum docs = null;
+
+      final List<TermAndState> collectedTerms = new ArrayList<>();
+      long count = collectTerms(context, termsEnum, collectedTerms);
+      if (count < 0) {
+        BooleanQuery.Builder bq = new BooleanQuery.Builder();
+        for (TermAndState t : collectedTerms) {
+          final TermContext termContext = new TermContext(searcher.getTopReaderContext());
+          termContext.register(t.state, context.ord, t.docFreq, t.totalTermFreq);
+          bq.add(new TermQuery(new Term( SolrRangeQuery.this.getField(), t.term), termContext), BooleanClause.Occur.SHOULD);
+        }
+        Query q = new ConstantScoreQuery(bq.build());
+        final Weight weight = searcher.rewrite(q).createWeight(searcher, needScores);
+        weight.normalize(1f, score());
+        return segStates[context.ord] = new SegState(weight);
+      }
+
+      // Too many terms for boolean query...
+
+      if (doCheck) {
+        DocSet answer = createDocSet(solrSearcher, count);
+        solrSearcher.getFilterCache().put(SolrRangeQuery.this, answer);
+        filter = answer.getTopFilter();
+        return segStates[context.ord] = new SegState(filter.getDocIdSet(context, null));
+      }
+
+      /* FUTURE: reuse term states in the future to help build DocSet, use collected count so far...
+      Bits liveDocs = context.reader().getLiveDocs();
+      int base = context.docBase;
+      int termsVisited = collectedTerms.size();
+
+      DocSetBuilder builder = new DocSetBuilder(searcher.getIndexReader().maxDoc());
+      if (!collectedTerms.isEmpty()) {
+        TermsEnum termsEnum2 = terms.iterator();
+        for (TermAndState t : collectedTerms) {
+          termsEnum2.seekExact(t.term, t.state);
+          docs = termsEnum2.postings(docs, PostingsEnum.NONE);
+          builder.add(docs, context.docBase, liveDocs);
+        }
+      }
+
+      termsVisited += builder.add(termsEnum, base, liveDocs);
+      */
+
+      DocIdSetBuilder builder = new DocIdSetBuilder(context.reader().maxDoc());
+      builder.grow((int)Math.min(Integer.MAX_VALUE,count));
+      if (collectedTerms.isEmpty() == false) {
+        TermsEnum termsEnum2 = terms.iterator();
+        for (TermAndState t : collectedTerms) {
+          termsEnum2.seekExact(t.term, t.state);
+          docs = termsEnum2.postings(docs, PostingsEnum.NONE);
+          builder.add(docs);
+        }
+      }
+
+      do {
+        // already positioned on the next term, so don't call next() here...
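+        // (collectTerms() leaves the enum on the first term that didn't fit the
+        // boolean rewrite, so consume its postings before advancing the enum)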
+        docs = termsEnum.postings(docs, PostingsEnum.NONE);
+        builder.add(docs);
+      } while (termsEnum.next() != null);
+
+      DocIdSet segSet = builder.build();
+      return segStates[context.ord] = new SegState(segSet);
+    }
+
+    private Scorer scorer(DocIdSet set) throws IOException {
+      if (set == null) {
+        return null;
+      }
+      final DocIdSetIterator disi = set.iterator();
+      if (disi == null) {
+        return null;
+      }
+      return new ConstantScoreScorer(this, score(), disi);
+    }
+
+    @Override
+    public BulkScorer bulkScorer(LeafReaderContext context) throws IOException {
+      final SegState weightOrBitSet = getSegState(context);
+      if (weightOrBitSet.weight != null) {
+        return weightOrBitSet.weight.bulkScorer(context);
+      } else {
+        final Scorer scorer = scorer(weightOrBitSet.set);
+        if (scorer == null) {
+          return null;
+        }
+        return new DefaultBulkScorer(scorer);
+      }
+    }
+
+    @Override
+    public Scorer scorer(LeafReaderContext context) throws IOException {
+      final SegState weightOrBitSet = getSegState(context);
+      if (weightOrBitSet.weight != null) {
+        return weightOrBitSet.weight.scorer(context);
+      } else {
+        return scorer(weightOrBitSet.set);
+      }
+    }
+  }
+}
+
+
+
+
+
+
diff --git a/solr/core/src/java/org/apache/solr/schema/FieldType.java b/solr/core/src/java/org/apache/solr/schema/FieldType.java
index 3bc4ea40ed8..d398af1863b 100644
--- a/solr/core/src/java/org/apache/solr/schema/FieldType.java
+++ b/solr/core/src/java/org/apache/solr/schema/FieldType.java
@@ -63,6 +63,7 @@ import org.apache.solr.common.SolrException.ErrorCode;
 import org.apache.solr.common.util.Base64;
 import org.apache.solr.common.util.SimpleOrderedMap;
 import org.apache.solr.common.util.StrUtils;
+import org.apache.solr.query.SolrRangeQuery;
 import org.apache.solr.response.TextResponseWriter;
 import org.apache.solr.search.QParser;
 import org.apache.solr.search.Sorting;
@@ -720,12 +721,11 @@
           part2 == null ? null : new BytesRef(toInternal(part2)),
           minInclusive, maxInclusive);
     } else {
-      MultiTermQuery rangeQuery = TermRangeQuery.newStringRange(
+      SolrRangeQuery rangeQuery = new SolrRangeQuery(
             field.getName(),
-            part1 == null ? null : toInternal(part1),
-            part2 == null ? null : toInternal(part2),
+            part1 == null ? null : new BytesRef(toInternal(part1)),
+            part2 == null ? null : new BytesRef(toInternal(part2)),
             minInclusive, maxInclusive);
-      rangeQuery.setRewriteMethod(getRewriteMethod(parser, field));
       return rangeQuery;
     }
   }
diff --git a/solr/core/src/java/org/apache/solr/schema/TextField.java b/solr/core/src/java/org/apache/solr/schema/TextField.java
index 9fbc4d31a9d..6362f23e8ce 100644
--- a/solr/core/src/java/org/apache/solr/schema/TextField.java
+++ b/solr/core/src/java/org/apache/solr/schema/TextField.java
@@ -28,6 +28,7 @@ import org.apache.lucene.uninverting.UninvertingReader.Type;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.QueryBuilder;
 import org.apache.solr.common.SolrException;
+import org.apache.solr.query.SolrRangeQuery;
 import org.apache.solr.response.TextResponseWriter;
 import org.apache.solr.search.QParser;
 import org.apache.solr.search.Sorting;
@@ -136,7 +137,7 @@ public class TextField extends FieldType {
     Analyzer multiAnalyzer = getMultiTermAnalyzer();
     BytesRef lower = analyzeMultiTerm(field.getName(), part1, multiAnalyzer);
     BytesRef upper = analyzeMultiTerm(field.getName(), part2, multiAnalyzer);
-    return new TermRangeQuery(field.getName(), lower, upper, minInclusive, maxInclusive);
+    return new SolrRangeQuery(field.getName(), lower, upper, minInclusive, maxInclusive);
   }
 
   public static BytesRef analyzeMultiTerm(String field, String part, Analyzer analyzerIn) {
diff --git a/solr/core/src/java/org/apache/solr/search/BitDocSet.java b/solr/core/src/java/org/apache/solr/search/BitDocSet.java
index df9ca8f168a..9f4ca21135a 100644
--- a/solr/core/src/java/org/apache/solr/search/BitDocSet.java
+++ b/solr/core/src/java/org/apache/solr/search/BitDocSet.java
@@ -285,8 +285,7 @@ public class BitDocSet extends DocSetBase {
     }
 
     final int base = context.docBase;
-    final int maxDoc = reader.maxDoc();
-    final int max = base + maxDoc;   // one past the max doc in this segment.
+    final int max = base + reader.maxDoc();   // one past the max doc in this segment.
 
     return BitsFilteredDocIdSet.wrap(new DocIdSet() {
       @Override
@@ -302,10 +301,11 @@
 
           @Override
           public int nextDoc() {
-            if (pos >= bs.length() - 1) {
+            int next = pos+1;
+            if (next >= max) {
               return adjustedDoc = NO_MORE_DOCS;
             } else {
-              pos = bs.nextSetBit(pos + 1);
+              pos = bs.nextSetBit(next);
               return adjustedDoc = pos < max ? pos - base : NO_MORE_DOCS;
             }
           }
@@ -314,7 +314,7 @@
           public int advance(int target) {
             if (target == NO_MORE_DOCS) return adjustedDoc = NO_MORE_DOCS;
             int adjusted = target + base;
-            if (adjusted >= bs.length()) {
+            if (adjusted >= max) {
               return adjustedDoc = NO_MORE_DOCS;
             } else {
               pos = bs.nextSetBit(adjusted);
@@ -326,6 +326,7 @@
           public long cost() {
             // we don't want to actually compute cardinality, but
             // if it's already been computed, we use it (pro-rated for the segment)
+            int maxDoc = max-base;
             if (size != -1) {
               return (long)(size * ((FixedBitSet.bits2words(maxDoc)<<6) / (float)bs.length()));
             } else {
@@ -350,7 +351,7 @@
 
         @Override
         public int length() {
-          return maxDoc;
+          return max-base;
         }
       };
     }
diff --git a/solr/core/src/java/org/apache/solr/search/DocSetBuilder.java b/solr/core/src/java/org/apache/solr/search/DocSetBuilder.java
new file mode 100644
index 00000000000..ca8969c48d1
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/search/DocSetBuilder.java
@@ -0,0 +1,222 @@
+package org.apache.solr.search;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.index.PostingsEnum;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.LSBRadixSorter;
+import org.apache.lucene.util.RamUsageEstimator;
+
+/**
+ * Adapted from DocIdSetBuilder to build DocSets
+ *
+ * @lucene.internal
+ */
+public final class DocSetBuilder {
+
+  private final int maxDoc;
+  private final int threshold;
+
+  private int[] buffer;
+  private int pos;
+
+  private FixedBitSet bitSet;
+
+  public DocSetBuilder(int maxDoc, long costEst) {
+    this.maxDoc = maxDoc;
+    // For ridiculously small sets, we'll just use a sorted int[]
+    // maxDoc >>> 7 is a good value if you want to save memory, lower values
+    // such as maxDoc >>> 11 should provide faster building but at the expense
+    // of using a full bitset even for quite sparse data
+    this.threshold = (maxDoc >>> 7) + 4;  // the +4 is for better testing on small indexes
+
+    if (costEst > threshold) {
+      bitSet = new FixedBitSet(maxDoc);
+    } else {
+      this.buffer = new int[Math.max((int)costEst,1)];
+    }
+  }
+
+  private void upgradeToBitSet() {
+    assert bitSet == null;
+    bitSet = new FixedBitSet(maxDoc);
+    for (int i = 0; i < pos; ++i) {
+      bitSet.set(buffer[i]);
+    }
+    this.buffer = null;
+    this.pos = 0;
+  }
+
+  private void growBuffer(int minSize) {
+    if (minSize < buffer.length) return;
+
+    int newSize = buffer.length;
+    while (newSize < minSize) {
+      newSize = newSize << 1;
+    }
+    newSize = Math.min(newSize, threshold);
+
+    int[] newBuffer = new int[newSize];
+    System.arraycopy(buffer, 0, newBuffer, 0, pos);
+    buffer = newBuffer;
+  }
+
+  public void add(DocIdSetIterator iter, int base) throws IOException {
+    grow((int) Math.min(Integer.MAX_VALUE, iter.cost()));
+
+    if (bitSet != null) {
+      add(bitSet, iter, base);
+    } else {
+      while (true) {
+        for (int i = pos; i < buffer.length; ++i) {
+          final int doc = iter.nextDoc();
+          if (doc == DocIdSetIterator.NO_MORE_DOCS) {
+            pos = i;  // update pos
+            return;
+          }
+          buffer[i] = doc + base;  // using the loop counter may help with removal of bounds checking
+        }
+
+        pos = buffer.length;  // update pos
+        if (pos + 1 >= threshold) {
+          break;
+        }
+
+        growBuffer(pos + 1);
+      }
+
+      upgradeToBitSet();
+      add(bitSet, iter, base);
+    }
+  }
+
+  public static void add(FixedBitSet bitSet, DocIdSetIterator iter, int base) throws IOException {
+    for (int doc = iter.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = iter.nextDoc()) {
+      bitSet.set(doc + base);
+    }
+  }
+
+  /** Returns the number of terms visited */
+  public int add(TermsEnum te, int base) throws IOException {
+    PostingsEnum postings = null;
+
+    int termCount = 0;
+    for(;;) {
+      BytesRef term = te.next();
+      if (term == null) break;
+      termCount++;
+      postings = te.postings(postings, PostingsEnum.NONE);
+      add(postings, base);
+    }
+
+    return termCount;
+  }
+
+  public void grow(int numDocs) {
+    if (bitSet == null) {
+      final long newLength = pos + numDocs;
+      if (newLength < threshold) {
+        growBuffer((int) newLength);
+      } else {
+        upgradeToBitSet();
+      }
+    }
+  }
+
+  public void add(int doc) {
+    if (bitSet != null) {
+      bitSet.set(doc);
+    } else {
+      if (pos >= buffer.length) {
+        if (pos + 1 >= threshold) {
+          upgradeToBitSet();
+          bitSet.set(doc);
+          return;
+        }
+        growBuffer(pos + 1);
+      }
+      buffer[pos++] = doc;
+    }
+  }
+
+  private static int dedup(int[] arr, int length, FixedBitSet acceptDocs) {
+    int l = 0;
+    int previous = -1;  // no valid docid is negative, so this never matches arr[0]
+    for (int i = 0; i < length; ++i) {
+      final int value = arr[i];
+      assert value >= previous;
+      if (value != previous) {
+        previous = value;
+        if (acceptDocs == null || acceptDocs.get(value)) {
+          arr[l++] = value;
+        }
+      }
+    }
+    return l;
+  }
+
+  public DocSet build(FixedBitSet filter) {
+    if (bitSet != null) {
+      if (filter != null) {
+        bitSet.and(filter);
+      }
+      return new BitDocSet(bitSet);
+      // TODO - if this set will be cached, should we make it smaller if it's below DocSetUtil.smallSetSize?
+    } else {
+      LSBRadixSorter sorter = new LSBRadixSorter();
+      sorter.sort(buffer, 0, pos);
+      final int l = dedup(buffer, pos, filter);
+      assert l <= pos;
+      return new SortedIntDocSet(buffer, l);  // TODO: have option to not shrink in the future if it will be a temporary set
+    }
+  }
+
+  /** Only use this if you know there were no duplicates and that docs were collected in-order! */
+  public DocSet buildUniqueInOrder(FixedBitSet filter) {
+    if (bitSet != null) {
+      if (filter != null) {
+        bitSet.and(filter);
+      }
+      return new BitDocSet(bitSet);
+    } else {
+      // don't need to sort, but still need to remove non accepted docs
+      int l = pos;
+      if (filter != null) {
+        l = dedup(buffer, pos, filter);
+      }
+      return new SortedIntDocSet(buffer, l);  // TODO: have option to not shrink in the future if it will be a temporary set
+    }
+  }
+
+}
diff --git a/solr/core/src/java/org/apache/solr/search/SolrIndexSearcher.java b/solr/core/src/java/org/apache/solr/search/SolrIndexSearcher.java
index 654c1a68001..787bfffe957 100644
--- a/solr/core/src/java/org/apache/solr/search/SolrIndexSearcher.java
+++ b/solr/core/src/java/org/apache/solr/search/SolrIndexSearcher.java
@@ -462,6 +462,10 @@ public class SolrIndexSearcher extends IndexSearcher implements Closeable,SolrIn
     return fieldNames;
   }
 
+  public SolrCache<Query,DocSet> getFilterCache() {
+    return filterCache;
+  }
+
   /**
    * Returns a collection of the names of all stored fields which can be
    * highlighted the index reader knows about.
@@ -919,12 +923,12 @@
       DocSet absAnswer = filterCache.get(absQ);
       if (absAnswer!=null) {
         if (positive) return absAnswer;
-        else return getPositiveDocSet(matchAllDocsQuery).andNot(absAnswer);
+        else return getLiveDocs().andNot(absAnswer);
       }
     }
 
     DocSet absAnswer = getDocSetNC(absQ, null);
-    DocSet answer = positive ? absAnswer : getPositiveDocSet(matchAllDocsQuery).andNot(absAnswer);
+    DocSet answer = positive ? absAnswer : getLiveDocs().andNot(absAnswer);
 
     if (filterCache != null) {
       // cache negative queries as positive
@@ -948,7 +952,15 @@
   }
 
   private static Query matchAllDocsQuery = new MatchAllDocsQuery();
+  private BitDocSet liveDocs;
 
+  public BitDocSet getLiveDocs() throws IOException {
+    // going through the filter cache will provide thread safety here
+    if (liveDocs == null) {
+      liveDocs = getDocSetBits(matchAllDocsQuery);
+    }
+    return liveDocs;
+  }
 
   public static class ProcessedFilter {
     public DocSet answer;  // the answer, if non-null
@@ -1129,7 +1141,7 @@
 
       // Are all of our normal cached filters negative?
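+      // if so, seed the answer with the cached set of all live docs
+      // (getLiveDocs() above) and let the negative filters subtract from it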
       if (end > 0 && answer==null) {
-        answer = getPositiveDocSet(matchAllDocsQuery);
+        answer = getLiveDocs();
       }
 
       // do negative queries first to shrink set size
@@ -1152,7 +1164,7 @@
     } else {
       if (postFilters == null) {
         if (answer == null) {
-          answer = getPositiveDocSet(matchAllDocsQuery);
+          answer = getLiveDocs();
         }
         // "answer" is the only part of the filter, so set it.
         pf.answer = answer;
@@ -2150,7 +2162,7 @@
 
     // if both negative, we need to create a temp DocSet since we
     // don't have a counting method that takes three.
-    DocSet all = getPositiveDocSet(matchAllDocsQuery);
+    DocSet all = getLiveDocs();
 
     // -a -b == *:*.andNot(a).andNotSize(b) == *.*.andNotSize(a.union(b))
     // we use the last form since the intermediate DocSet should normally be smaller.
diff --git a/solr/core/src/test/org/apache/solr/search/TestFiltering.java b/solr/core/src/test/org/apache/solr/search/TestFiltering.java
index 105f5ee3791..9587b029c42 100644
--- a/solr/core/src/test/org/apache/solr/search/TestFiltering.java
+++ b/solr/core/src/test/org/apache/solr/search/TestFiltering.java
@@ -21,6 +21,7 @@ package org.apache.solr.search;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.util.FixedBitSet;
 import org.apache.solr.SolrTestCaseJ4;
+import org.apache.solr.common.SolrInputDocument;
 import org.apache.solr.common.params.SolrParams;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.request.SolrQueryRequest;
@@ -34,7 +35,7 @@ public class TestFiltering extends SolrTestCaseJ4 {
   @BeforeClass
   public static void beforeTests() throws Exception {
     System.setProperty("enable.update.log", "false"); // schema12 doesn't support _version_
-    initCore("solrconfig.xml","schema12.xml");
+    initCore("solrconfig.xml","schema_latest.xml");
   }
 
@@ -132,8 +133,79 @@
   }
 
   static String f = "val_i";
+  static String f_s = "val_s";
+  static String f_s(int i) {
+    return String.format(Locale.ROOT, "%05d", i);
+  }
 
-  String frangeStr(boolean negative, int l, int u, boolean cache, int cost, boolean exclude) {
+  String rangeStr(String field, boolean negative, int l, int u, boolean cache, int cost, boolean exclude) {
+    String topLev="";
+    if (!cache || exclude) {
+      topLev = "{!" + (cache || random().nextBoolean() ? " cache="+cache : "")
+          + (cost!=0 ? " cost="+cost : "")
+          + ((exclude) ? " tag=t" : "") + "}";
+    }
+
+    String q = field + ":";
+    String q2 = q;
+
+    String lower1 = "[" + f_s(l);
+    String lower2 = l<=0 ? lower1 : ("{" + f_s(l-1));
+    String upper1 = f_s(u) + "]";
+    String upper2 = f_s(u+1) + "}";
+
+    if (random().nextBoolean()) {
+      q += lower1;
+      q2 += lower2;
+    } else {
+      q += lower2;
+      q2 += lower1;
+    }
+
+    q += " TO ";
+    q2 += " TO ";
+
+    if (random().nextBoolean()) {
+      q += upper1;
+      q2 += upper2;
+    } else {
+      q += upper2;
+      q2 += upper1;
+    }
+
+    // String q = field + ":[" + f_s(l) + " TO " + f_s(u) + "]";
+
+    if (negative) {
+      q = "-_query_:\"" + q + "\"";
+      // q = "-" + q;   // TODO: need to be encapsulated for some reason?
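+      // presumably a bare top-level negative clause parses differently,
+      // hence the _query_ wrapper above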
+    } else {
+      if (random().nextBoolean()) {
+        // try some different query structures - important for testing different code paths
+        switch (random().nextInt(5)) {
+          case 0:
+            q = q + " OR id:RAND"+random().nextInt();
+            break;
+          case 1:
+            q = "id:RAND"+random().nextInt() + " OR " + q;
+            break;
+          case 2:
+            q = "*:* AND " + q;
+            break;
+          case 3:
+            q = q + " AND " + q2;
+            break;
+          case 4:
+            q = q + " OR " + q2;
+            break;
+        }
+      }
+    }
+    return topLev + q;
+  }
+
+  String frangeStr(String field, boolean negative, int l, int u, boolean cache, int cost, boolean exclude) {
 
     String topLev="";
     if (!cache || exclude) {
@@ -142,7 +214,7 @@
           + ((exclude) ? " tag=t" : "");
     }
 
-    String ret = "{!frange v="+f+" l="+l+" u="+u;
+    String ret = "{!frange v="+field+" l="+l+" u="+u;
     if (negative) {
       ret = "-_query_:\"" + ret + "}\"";
       if (topLev.length()>0) {
@@ -165,7 +237,7 @@
     FixedBitSet[] sets = facetQuery ? new FixedBitSet[]{model.facetQuery}
        : (exclude ? new FixedBitSet[]{model.answer, model.facetQuery} : new FixedBitSet[]{model.answer, model.multiSelect, model.facetQuery});
 
-    if (random().nextInt(100) < 50) {
+    if (random().nextInt(100) < 60) {
       // frange
       int l=0;
       int u=0;
@@ -201,7 +273,10 @@
         }
       }
 
-      return frangeStr(!positive, l, u, cache, cost, exclude);
+      String whichField = random().nextBoolean() ? f : f_s;
+      return random().nextBoolean() ?
+          frangeStr(f, !positive, l, u, cache, cost, exclude)  // todo: frange doesn't work on the string field?
+          : rangeStr(whichField, !positive, l, u, cache, cost, exclude);
     } else {
       // term or boolean query
       int numWords = FixedBitSet.bits2words(model.indexSize);
@@ -256,16 +331,17 @@
     Model model = new Model();
 
     for (int iiter = 0; iiter