LUCENE-3533: nuke spanfilters

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1201787 13f79535-47bb-0310-9956-ffa450edef68
Robert Muir 2011-11-14 17:02:30 +00:00
parent f389654f9c
commit 5682889026
25 changed files with 155 additions and 706 deletions

View File: lucene/CHANGES.txt

@ -199,6 +199,9 @@ Changes in backwards compatibility policy
as these are no longer used by the scoring system. See MIGRATE.txt for more
details. (Robert Muir)
+* LUCENE-3533: Removed SpanFilters, as they created large lists of objects and
+  did not scale. (Robert Muir)
Changes in Runtime Behavior
* LUCENE-2846: omitNorms now behaves like omitTermFrequencyAndPositions, if you
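Code that used the removed SpanQueryFilter or CachingSpanFilter needs a replacement. A minimal migration sketch, assuming the surviving QueryWrapperFilter and CachingWrapperFilter as stand-ins (this commit names no replacement, and position information is simply no longer materialized by filters):

```java
import org.apache.lucene.index.Term;
import org.apache.lucene.search.CachingWrapperFilter;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.QueryWrapperFilter;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;

public class SpanFilterMigration {
  // before: Filter f = new CachingSpanFilter(new SpanQueryFilter(spanQuery));
  // after: cache only the doc-id set; no per-document position lists are kept
  public static Filter filterFor(SpanQuery spanQuery) {
    return new CachingWrapperFilter(new QueryWrapperFilter(spanQuery));
  }

  public static void main(String[] args) {
    System.out.println(filterFor(new SpanTermQuery(new Term("id", "1"))));
  }
}
```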

View File: org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java

@ -25,6 +25,7 @@ import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenStream;
@ -42,6 +43,7 @@ import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.TermContext;
/**
* Class used to extract {@link WeightedSpanTerm}s from a {@link Query} based on whether
@ -247,16 +249,21 @@ public class WeightedSpanTermExtractor {
List<PositionSpan> spanPositions = new ArrayList<PositionSpan>();
for (final String field : fieldNames) {
-      AtomicReaderContext context = getLeafContextForField(field);
-      Bits acceptDocs = context.reader.getLiveDocs();
-      final Spans spans;
+      final SpanQuery q;
       if (mustRewriteQuery) {
-        spans = queries.get(field).getSpans(context, acceptDocs);
+        q = queries.get(field);
       } else {
-        spans = spanQuery.getSpans(context, acceptDocs);
+        q = spanQuery;
       }
+      AtomicReaderContext context = getLeafContextForField(field);
+      Map<Term,TermContext> termContexts = new HashMap<Term,TermContext>();
+      TreeSet<Term> extractedTerms = new TreeSet<Term>();
+      q.extractTerms(extractedTerms);
+      for (Term term : extractedTerms) {
+        termContexts.put(term, TermContext.build(context, term, true));
+      }
+      Bits acceptDocs = context.reader.getLiveDocs();
+      final Spans spans = q.getSpans(context, acceptDocs, termContexts);
// collect span positions
while (spans.next()) {

View File: org/apache/lucene/search/CachingSpanFilter.java

@ -1,136 +0,0 @@
package org.apache.lucene.search;
/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.FixedBitSet;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
* Wraps another SpanFilter's result and caches it. The purpose is to allow
* filters to simply filter, and then wrap with this class to add caching.
*/
public class CachingSpanFilter extends SpanFilter {
private SpanFilter filter;
/**
* A transient Filter cache (package private because of test)
*/
private final CachingWrapperFilter.FilterCache<SpanFilterResult> cache;
/** Wraps another SpanFilter's result and caches it.
* @param filter Filter to cache results of
*/
public CachingSpanFilter(SpanFilter filter) {
this.filter = filter;
this.cache = new CachingWrapperFilter.FilterCache<SpanFilterResult>();
}
@Override
public DocIdSet getDocIdSet(AtomicReaderContext context, final Bits acceptDocs) throws IOException {
final SpanFilterResult result = getCachedResult(context);
return BitsFilteredDocIdSet.wrap(result.getDocIdSet(), acceptDocs);
}
@Override
public SpanFilterResult bitSpans(AtomicReaderContext context, final Bits acceptDocs) throws IOException {
final SpanFilterResult result = getCachedResult(context);
if (acceptDocs == null) {
return result;
} else {
// TODO: filter positions more efficiently
List<SpanFilterResult.PositionInfo> allPositions = result.getPositions();
List<SpanFilterResult.PositionInfo> positions = new ArrayList<SpanFilterResult.PositionInfo>(allPositions.size() / 2 + 1);
for (SpanFilterResult.PositionInfo p : allPositions) {
if (acceptDocs.get(p.getDoc())) {
positions.add(p);
}
}
return new SpanFilterResult(BitsFilteredDocIdSet.wrap(result.getDocIdSet(), acceptDocs), positions);
}
}
/** Provide the SpanFilterResult to be cached, using the result provided
 * by the wrapped Filter.
 * <p>This implementation returns the given result if its {@link DocIdSet#isCacheable}
 * returns <code>true</code>, else it copies the {@link DocIdSetIterator} into
 * a {@link FixedBitSet}.
 */
protected SpanFilterResult spanFilterResultToCache(SpanFilterResult result, IndexReader reader) throws IOException {
if (result == null || result.getDocIdSet() == null) {
// this is better than returning null, as the nonnull result can be cached
return SpanFilterResult.EMPTY_SPAN_FILTER_RESULT;
} else if (result.getDocIdSet().isCacheable()) {
return result;
} else {
final DocIdSetIterator it = result.getDocIdSet().iterator();
// null is allowed to be returned by iterator(),
// in this case we wrap with the empty set,
// which is cacheable.
if (it == null) {
return SpanFilterResult.EMPTY_SPAN_FILTER_RESULT;
} else {
final FixedBitSet bits = new FixedBitSet(reader.maxDoc());
bits.or(it);
return new SpanFilterResult(bits, result.getPositions());
}
}
}
// for testing
int hitCount, missCount;
private SpanFilterResult getCachedResult(AtomicReaderContext context) throws IOException {
final IndexReader reader = context.reader;
final Object coreKey = reader.getCoreCacheKey();
SpanFilterResult result = cache.get(reader, coreKey);
if (result != null) {
hitCount++;
return result;
} else {
missCount++;
// cache miss: we use no acceptDocs here
// (this saves time on building SpanFilterResult, the acceptDocs will be applied on the cached set)
result = spanFilterResultToCache(filter.bitSpans(context, null/**!!!*/), reader);
cache.put(coreKey, result);
}
return result;
}
@Override
public String toString() {
return "CachingSpanFilter("+filter+")";
}
@Override
public boolean equals(Object o) {
if (!(o instanceof CachingSpanFilter)) return false;
return this.filter.equals(((CachingSpanFilter)o).filter);
}
@Override
public int hashCode() {
return filter.hashCode() ^ 0x1117BF25;
}
}
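The hitCount/missCount bookkeeping above follows the same per-core caching pattern as CachingWrapperFilter, which survives this commit. A compact sketch of that pattern with assumed names (CoreKeyedCache is hypothetical, not a Lucene class):

```java
import java.io.IOException;
import java.util.Collections;
import java.util.Map;
import java.util.WeakHashMap;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Filter;
import org.apache.lucene.util.FixedBitSet;

class CoreKeyedCache {
  // weak keys: entries disappear when the segment core is closed
  private final Map<Object,DocIdSet> cache =
      Collections.synchronizedMap(new WeakHashMap<Object,DocIdSet>());

  DocIdSet getOrCompute(AtomicReaderContext context, Filter filter) throws IOException {
    final Object coreKey = context.reader.getCoreCacheKey();
    DocIdSet cached = cache.get(coreKey); // hit: reused across reopened readers
    if (cached != null) {
      return cached;
    }
    // miss: materialize into a cacheable FixedBitSet; acceptDocs are not
    // baked in, deletions are applied later against the cached set
    final FixedBitSet bits = new FixedBitSet(context.reader.maxDoc());
    final DocIdSet set = filter.getDocIdSet(context, null);
    if (set != null) {
      final DocIdSetIterator it = set.iterator();
      if (it != null) {
        bits.or(it);
      }
    }
    cache.put(coreKey, bits);
    return bits;
  }
}
```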

View File: org/apache/lucene/search/SpanFilter.java

@ -1,39 +0,0 @@
package org.apache.lucene.search;
/**
* Copyright 2007 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.util.Bits;
import java.io.IOException;
/** Abstract base class providing a mechanism to restrict searches to a subset
 of an index while also maintaining and returning position information.
 This is useful if you want to compare the positions from a SpanQuery with the positions of items in
 a filter. For instance, if you had a SpanFilter that marked all the occurrences of the word "foo" in documents,
 and you then ran a new SpanQuery containing "bar", you could not only filter by the word "foo" but also
 compare position information afterwards for post-processing.
 */
public abstract class SpanFilter extends Filter {
/** Returns a SpanFilterResult which is true for documents that should be permitted in
 * search results and false for those that should not, along with the Spans at which the
 * permitted documents match.
 * @param context The {@link AtomicReaderContext} to load position and DocIdSet information from
 * @return A {@link SpanFilterResult}
 * @throws java.io.IOException if there was an issue accessing the necessary information
 * */
public abstract SpanFilterResult bitSpans(AtomicReaderContext context, Bits acceptDocs) throws IOException;
}

View File: org/apache/lucene/search/SpanFilterResult.java

@ -1,119 +0,0 @@
package org.apache.lucene.search;
/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
/**
* The results of a SpanQueryFilter. Wraps the DocIdSet and the position information from the SpanQuery
*
* @lucene.experimental
*
**/
public class SpanFilterResult {
private DocIdSet docIdSet;
private List<PositionInfo> positions;//Spans spans;
public static final SpanFilterResult EMPTY_SPAN_FILTER_RESULT =
new SpanFilterResult(DocIdSet.EMPTY_DOCIDSET, Collections.<PositionInfo>emptyList());
/**
*
* @param docIdSet The DocIdSet for the Filter
* @param positions A List of {@link org.apache.lucene.search.SpanFilterResult.PositionInfo} objects
*/
public SpanFilterResult(DocIdSet docIdSet, List<PositionInfo> positions) {
this.docIdSet = docIdSet;
this.positions = positions;
}
/**
* The first entry in the list corresponds to the first "on" bit.
* Entries are in increasing document order.
* @return A List of PositionInfo objects
*/
public List<PositionInfo> getPositions() {
return positions;
}
/** Returns the docIdSet */
public DocIdSet getDocIdSet() {
return docIdSet;
}
public static class PositionInfo {
private int doc;
private List<StartEnd> positions;
public PositionInfo(int doc) {
this.doc = doc;
positions = new ArrayList<StartEnd>();
}
public void addPosition(int start, int end)
{
positions.add(new StartEnd(start, end));
}
public int getDoc() {
return doc;
}
/**
*
* @return Positions
*/
public List<StartEnd> getPositions() {
return positions;
}
}
public static class StartEnd
{
private int start;
private int end;
public StartEnd(int start, int end) {
this.start = start;
this.end = end;
}
/**
*
* @return The end position of this match
*/
public int getEnd() {
return end;
}
/**
* The Start position
* @return The start position of this match
*/
public int getStart() {
return start;
}
}
}
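This object graph is the scaling problem the CHANGES entry refers to: one PositionInfo plus its own ArrayList per matching document, and one StartEnd object per position. As a rough illustration with assumed numbers, a cached filter matching 1 million documents with 3 positions each holds 3 million StartEnd objects at about 24 bytes apiece (object header plus two ints), roughly 72 MB, plus about 1 million PositionInfo/ArrayList pairs at around 100 bytes each, adding up to well over 150 MB per filter per reader before the DocIdSet itself is counted.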

View File: org/apache/lucene/search/SpanQueryFilter.java

@ -1,103 +0,0 @@
package org.apache.lucene.search;
/**
* Copyright 2007 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.FixedBitSet;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
* Constrains search results to only match those which also match a provided
* query, and also provides position information about where each document
* matches. The added cost relative to a {@link QueryWrapperFilter} is that
* the position information for each matching document is stored.
* <p/>
* This filter does not cache. See the {@link org.apache.lucene.search.CachingSpanFilter} for a wrapper that
* caches.
*/
public class SpanQueryFilter extends SpanFilter {
protected SpanQuery query;
protected SpanQueryFilter()
{
}
/** Constructs a filter which only matches documents matching
* <code>query</code>.
* @param query The {@link org.apache.lucene.search.spans.SpanQuery} to use as the basis for the Filter.
*/
public SpanQueryFilter(SpanQuery query) {
this.query = query;
}
@Override
public final DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs) throws IOException {
SpanFilterResult result = bitSpans(context, acceptDocs);
return result.getDocIdSet();
}
@Override
public SpanFilterResult bitSpans(AtomicReaderContext context, Bits acceptDocs) throws IOException {
final FixedBitSet bits = new FixedBitSet(context.reader.maxDoc());
Spans spans = query.getSpans(context, acceptDocs);
List<SpanFilterResult.PositionInfo> tmp = new ArrayList<SpanFilterResult.PositionInfo>(20);
int currentDoc = -1;
SpanFilterResult.PositionInfo currentInfo = null;
while (spans.next())
{
int doc = spans.doc();
bits.set(doc);
if (currentDoc != doc)
{
currentInfo = new SpanFilterResult.PositionInfo(doc);
tmp.add(currentInfo);
currentDoc = doc;
}
currentInfo.addPosition(spans.start(), spans.end());
}
return new SpanFilterResult(bits, tmp);
}
public SpanQuery getQuery() {
return query;
}
@Override
public String toString() {
return "SpanQueryFilter(" + query + ")";
}
@Override
public boolean equals(Object o) {
return o instanceof SpanQueryFilter && this.query.equals(((SpanQueryFilter) o).query);
}
@Override
public int hashCode() {
return query.hashCode() ^ 0x923F64B9;
}
}
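With bitSpans() removed, the loop above is effectively inlined at each call site: consumers enumerate the Spans directly instead of buffering a SpanFilterResult. A fragment sketch (imports as in the files below; context, acceptDocs, and termContexts are built per segment as elsewhere in this commit):

```java
static void collectPositions(SpanQuery query, AtomicReaderContext context,
    Bits acceptDocs, Map<Term,TermContext> termContexts) throws IOException {
  Spans spans = query.getSpans(context, acceptDocs, termContexts);
  while (spans.next()) {
    int doc = spans.doc();     // segment-local document id
    int start = spans.start(); // first matched position
    int end = spans.end();     // one past the last matched position
    // consume (doc, start, end) on the fly; nothing is buffered
  }
}
```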

View File: org/apache/lucene/search/payloads/PayloadNearQuery.java

@ -150,7 +150,7 @@ public class PayloadNearQuery extends SpanNearQuery {
@Override
public Scorer scorer(AtomicReaderContext context, boolean scoreDocsInOrder,
boolean topScorer, Bits acceptDocs) throws IOException {
-    return new PayloadNearSpanScorer(query.getSpans(context, acceptDocs), this,
+    return new PayloadNearSpanScorer(query.getSpans(context, acceptDocs, termContexts), this,
similarity, similarity.sloppyDocScorer(stats, query.getField(), context));
}

View File: org/apache/lucene/search/payloads/PayloadSpanUtil.java

@ -20,8 +20,11 @@ package org.apache.lucene.search.payloads;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.TreeSet;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
@ -41,6 +44,7 @@ import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.util.ReaderUtil;
import org.apache.lucene.util.TermContext;
/**
* Experimental class to get set of payloads for most standard Lucene queries.
@ -174,9 +178,15 @@ public class PayloadSpanUtil {
private void getPayloads(Collection<byte []> payloads, SpanQuery query)
throws IOException {
+    Map<Term,TermContext> termContexts = new HashMap<Term,TermContext>();
+    TreeSet<Term> terms = new TreeSet<Term>();
+    query.extractTerms(terms);
+    for (Term term : terms) {
+      termContexts.put(term, TermContext.build(context, term, true));
+    }
final AtomicReaderContext[] leaves = ReaderUtil.leaves(context);
for (AtomicReaderContext atomicReaderContext : leaves) {
-      final Spans spans = query.getSpans(atomicReaderContext, atomicReaderContext.reader.getLiveDocs());
+      final Spans spans = query.getSpans(atomicReaderContext, atomicReaderContext.reader.getLiveDocs(), termContexts);
while (spans.next() == true) {
if (spans.isPayloadAvailable()) {
Collection<byte[]> payload = spans.getPayload();

View File: org/apache/lucene/search/payloads/PayloadTermQuery.java

@ -81,7 +81,7 @@ public class PayloadTermQuery extends SpanTermQuery {
@Override
public Scorer scorer(AtomicReaderContext context, boolean scoreDocsInOrder,
boolean topScorer, Bits acceptDocs) throws IOException {
-    return new PayloadTermSpanScorer((TermSpans) query.getSpans(context, acceptDocs),
+    return new PayloadTermSpanScorer((TermSpans) query.getSpans(context, acceptDocs, termContexts),
this, similarity.sloppyDocScorer(stats, query.getField(), context));
}

View File: org/apache/lucene/search/spans/FieldMaskingSpanQuery.java

@ -18,6 +18,7 @@ package org.apache.lucene.search.spans;
*/
import java.io.IOException;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.index.IndexReader;
@ -27,6 +28,7 @@ import org.apache.lucene.search.Query;
import org.apache.lucene.search.Weight;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.TermContext;
import org.apache.lucene.util.ToStringUtils;
/**
@ -93,8 +95,8 @@ public class FieldMaskingSpanQuery extends SpanQuery {
// ...this is done to be more consistent with things like SpanFirstQuery
@Override
-  public Spans getSpans(AtomicReaderContext context, Bits acceptDocs) throws IOException {
-    return maskedQuery.getSpans(context, acceptDocs);
+  public Spans getSpans(AtomicReaderContext context, Bits acceptDocs, Map<Term,TermContext> termContexts) throws IOException {
+    return maskedQuery.getSpans(context, acceptDocs, termContexts);
}
@Override

View File: org/apache/lucene/search/spans/NearSpansOrdered.java

@ -17,9 +17,11 @@ package org.apache.lucene.search.spans;
* limitations under the License.
*/
import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.TermContext;
import java.io.IOException;
import java.util.ArrayList;
@ -28,6 +30,7 @@ import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Collection;
import java.util.Map;
import java.util.Set;
/** A Spans that is formed from the ordered subspans of a SpanNearQuery
@ -78,11 +81,11 @@ public class NearSpansOrdered extends Spans {
private SpanNearQuery query;
private boolean collectPayloads = true;
-  public NearSpansOrdered(SpanNearQuery spanNearQuery, AtomicReaderContext context, Bits acceptDocs) throws IOException {
-    this(spanNearQuery, context, acceptDocs, true);
+  public NearSpansOrdered(SpanNearQuery spanNearQuery, AtomicReaderContext context, Bits acceptDocs, Map<Term,TermContext> termContexts) throws IOException {
+    this(spanNearQuery, context, acceptDocs, termContexts, true);
}
-  public NearSpansOrdered(SpanNearQuery spanNearQuery, AtomicReaderContext context, Bits acceptDocs, boolean collectPayloads)
+  public NearSpansOrdered(SpanNearQuery spanNearQuery, AtomicReaderContext context, Bits acceptDocs, Map<Term,TermContext> termContexts, boolean collectPayloads)
throws IOException {
if (spanNearQuery.getClauses().length < 2) {
throw new IllegalArgumentException("Less than 2 clauses: "
@ -95,7 +98,7 @@ public class NearSpansOrdered extends Spans {
matchPayload = new LinkedList<byte[]>();
subSpansByDoc = new Spans[clauses.length];
for (int i = 0; i < clauses.length; i++) {
-      subSpans[i] = clauses[i].getSpans(context, acceptDocs);
+      subSpans[i] = clauses[i].getSpans(context, acceptDocs, termContexts);
subSpansByDoc[i] = subSpans[i]; // used in toSameDoc()
}
query = spanNearQuery; // kept for toString() only.

View File: org/apache/lucene/search/spans/NearSpansUnordered.java

@ -17,14 +17,17 @@ package org.apache.lucene.search.spans;
* limitations under the License.
*/
import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.TermContext;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.HashSet;
@ -132,7 +135,7 @@ public class NearSpansUnordered extends Spans {
}
-  public NearSpansUnordered(SpanNearQuery query, AtomicReaderContext context, Bits acceptDocs)
+  public NearSpansUnordered(SpanNearQuery query, AtomicReaderContext context, Bits acceptDocs, Map<Term,TermContext> termContexts)
throws IOException {
this.query = query;
this.slop = query.getSlop();
@ -142,7 +145,7 @@ public class NearSpansUnordered extends Spans {
subSpans = new Spans[clauses.length];
for (int i = 0; i < clauses.length; i++) {
SpansCell cell =
-        new SpansCell(clauses[i].getSpans(context, acceptDocs), i);
+        new SpansCell(clauses[i].getSpans(context, acceptDocs, termContexts), i);
ordered.add(cell);
subSpans[i] = cell.spans;
}

View File: org/apache/lucene/search/spans/SpanMultiTermQueryWrapper.java

@ -18,6 +18,7 @@ package org.apache.lucene.search.spans;
*/
import java.io.IOException;
import java.util.Map;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
@ -90,7 +91,7 @@ public class SpanMultiTermQueryWrapper<Q extends MultiTermQuery> extends SpanQue
}
@Override
-  public Spans getSpans(AtomicReaderContext context, Bits acceptDocs) throws IOException {
+  public Spans getSpans(AtomicReaderContext context, Bits acceptDocs, Map<Term,TermContext> termContexts) throws IOException {
throw new UnsupportedOperationException("Query should have been rewritten");
}
@ -157,6 +158,9 @@ public class SpanMultiTermQueryWrapper<Q extends MultiTermQuery> extends SpanQue
@Override
protected void addClause(SpanOrQuery topLevel, Term term, int docCount, float boost, TermContext states) {
+    // TODO: would be nice to not lose term-state here.
+    // we could add a hack option to SpanOrQuery, but the hack would only work if this is the top-level Span
+    // (if you put this thing in another span query, it would extractTerms/double-seek anyway)
final SpanTermQuery q = new SpanTermQuery(term);
q.setBoost(boost);
topLevel.addClause(q);

View File: org/apache/lucene/search/spans/SpanNearQuery.java

@ -23,6 +23,7 @@ import java.io.IOException;
import java.util.List;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
@ -31,6 +32,7 @@ import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.TermContext;
import org.apache.lucene.util.ToStringUtils;
/** Matches spans which are near one another. One can specify <i>slop</i>, the
@ -118,16 +120,16 @@ public class SpanNearQuery extends SpanQuery implements Cloneable {
}
@Override
-  public Spans getSpans(final AtomicReaderContext context, Bits acceptDocs) throws IOException {
+  public Spans getSpans(final AtomicReaderContext context, Bits acceptDocs, Map<Term,TermContext> termContexts) throws IOException {
     if (clauses.size() == 0) // optimize 0-clause case
-      return new SpanOrQuery(getClauses()).getSpans(context, acceptDocs);
+      return new SpanOrQuery(getClauses()).getSpans(context, acceptDocs, termContexts);
     if (clauses.size() == 1) // optimize 1-clause case
-      return clauses.get(0).getSpans(context, acceptDocs);
+      return clauses.get(0).getSpans(context, acceptDocs, termContexts);
     return inOrder
-            ? (Spans) new NearSpansOrdered(this, context, acceptDocs, collectPayloads)
-            : (Spans) new NearSpansUnordered(this, context, acceptDocs);
+            ? (Spans) new NearSpansOrdered(this, context, acceptDocs, termContexts, collectPayloads)
+            : (Spans) new NearSpansUnordered(this, context, acceptDocs, termContexts);
}
@Override
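The construction side of SpanNearQuery is unchanged; only the getSpans plumbing gains the termContexts map. For example (field and terms assumed):

```java
// "quick" before "fox", at most 2 positions apart, in order;
// a 1-clause query short-circuits to clauses.get(0).getSpans(...) as above
SpanQuery near = new SpanNearQuery(new SpanQuery[] {
    new SpanTermQuery(new Term("body", "quick")),
    new SpanTermQuery(new Term("body", "fox"))
  }, 2, true);
```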

View File: org/apache/lucene/search/spans/SpanNotQuery.java

@ -22,11 +22,13 @@ import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.TermContext;
import org.apache.lucene.util.ToStringUtils;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Map;
import java.util.Set;
/** Removes matches which overlap with another SpanQuery. */
@ -76,12 +78,12 @@ public class SpanNotQuery extends SpanQuery implements Cloneable {
}
@Override
-  public Spans getSpans(final AtomicReaderContext context, final Bits acceptDocs) throws IOException {
+  public Spans getSpans(final AtomicReaderContext context, final Bits acceptDocs, final Map<Term,TermContext> termContexts) throws IOException {
     return new Spans() {
-      private Spans includeSpans = include.getSpans(context, acceptDocs);
+      private Spans includeSpans = include.getSpans(context, acceptDocs, termContexts);
       private boolean moreInclude = true;
-      private Spans excludeSpans = exclude.getSpans(context, acceptDocs);
+      private Spans excludeSpans = exclude.getSpans(context, acceptDocs, termContexts);
private boolean moreExclude = excludeSpans.next();
@Override

View File: org/apache/lucene/search/spans/SpanOrQuery.java

@ -23,6 +23,7 @@ import java.util.List;
import java.util.Collection;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.index.IndexReader;
@ -30,6 +31,7 @@ import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.util.TermContext;
import org.apache.lucene.util.ToStringUtils;
import org.apache.lucene.search.Query;
@ -164,9 +166,9 @@ public class SpanOrQuery extends SpanQuery implements Cloneable {
}
@Override
-  public Spans getSpans(final AtomicReaderContext context, final Bits acceptDocs) throws IOException {
+  public Spans getSpans(final AtomicReaderContext context, final Bits acceptDocs, final Map<Term,TermContext> termContexts) throws IOException {
     if (clauses.size() == 1) // optimize 1-clause case
-      return (clauses.get(0)).getSpans(context, acceptDocs);
+      return (clauses.get(0)).getSpans(context, acceptDocs, termContexts);
return new Spans() {
private SpanQueue queue = null;
@ -175,7 +177,7 @@ public class SpanOrQuery extends SpanQuery implements Cloneable {
queue = new SpanQueue(clauses.size());
Iterator<SpanQuery> i = clauses.iterator();
while (i.hasNext()) {
-          Spans spans = i.next().getSpans(context, acceptDocs);
+          Spans spans = i.next().getSpans(context, acceptDocs, termContexts);
if ( ((target == -1) && spans.next())
|| ((target != -1) && spans.skipTo(target))) {
queue.add(spans);

View File: org/apache/lucene/search/spans/SpanPositionCheckQuery.java

@ -22,10 +22,12 @@ import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.TermContext;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Map;
import java.util.Set;
@ -82,8 +84,8 @@ public abstract class SpanPositionCheckQuery extends SpanQuery implements Clonea
protected abstract AcceptStatus acceptPosition(Spans spans) throws IOException;
@Override
-  public Spans getSpans(final AtomicReaderContext context, Bits acceptDocs) throws IOException {
-    return new PositionCheckSpan(context, acceptDocs);
+  public Spans getSpans(final AtomicReaderContext context, Bits acceptDocs, Map<Term,TermContext> termContexts) throws IOException {
+    return new PositionCheckSpan(context, acceptDocs, termContexts);
}
@ -107,8 +109,8 @@ public abstract class SpanPositionCheckQuery extends SpanQuery implements Clonea
protected class PositionCheckSpan extends Spans {
private Spans spans;
-    public PositionCheckSpan(AtomicReaderContext context, Bits acceptDocs) throws IOException {
-      spans = match.getSpans(context, acceptDocs);
+    public PositionCheckSpan(AtomicReaderContext context, Bits acceptDocs, Map<Term,TermContext> termContexts) throws IOException {
+      spans = match.getSpans(context, acceptDocs, termContexts);
}
@Override

View File: org/apache/lucene/search/spans/SpanQuery.java

@ -18,18 +18,21 @@ package org.apache.lucene.search.spans;
*/
import java.io.IOException;
import java.util.Map;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Weight;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.TermContext;
/** Base class for span-based queries. */
public abstract class SpanQuery extends Query {
/** Expert: Returns the matches for this query in an index. Used internally
* to search for spans. */
-  public abstract Spans getSpans(AtomicReaderContext context, Bits acceptDocs) throws IOException;
+  public abstract Spans getSpans(AtomicReaderContext context, Bits acceptDocs, Map<Term,TermContext> termContexts) throws IOException;
/** Returns the name of the field matched by this query.*/
public abstract String getField();
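The new contract is that callers resolve each term once against the top-level reader context and pass the resulting map to getSpans for every segment, instead of each Spans re-seeking its terms per segment. A caller sketch mirroring the pattern this commit applies in PayloadSpanUtil, WeightedSpanTermExtractor, and MultiSpansWrapper:

```java
static void runSpans(SpanQuery query, ReaderContext topContext) throws IOException {
  // resolve each term once, against the top-level context...
  Map<Term,TermContext> termContexts = new HashMap<Term,TermContext>();
  TreeSet<Term> terms = new TreeSet<Term>();
  query.extractTerms(terms);
  for (Term term : terms) {
    termContexts.put(term, TermContext.build(topContext, term, true));
  }
  // ...then reuse the map for every segment
  for (AtomicReaderContext leaf : ReaderUtil.leaves(topContext)) {
    Spans spans = query.getSpans(leaf, leaf.reader.getLiveDocs(), termContexts);
    while (spans.next()) {
      // consume spans.doc() / spans.start() / spans.end()
    }
  }
}
```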

View File: org/apache/lucene/search/spans/SpanTermQuery.java

@ -19,12 +19,19 @@ package org.apache.lucene.search.spans;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.ReaderUtil;
import org.apache.lucene.util.TermContext;
import org.apache.lucene.util.ToStringUtils;
import java.io.IOException;
import java.util.Map;
import java.util.Set;
/** Matches spans containing a term. */
@ -82,22 +89,46 @@ public class SpanTermQuery extends SpanQuery {
}
@Override
-  public Spans getSpans(final AtomicReaderContext context, Bits acceptDocs) throws IOException {
-    final IndexReader reader = context.reader;
-    final DocsAndPositionsEnum postings = reader.termPositionsEnum(acceptDocs,
-                                                                   term.field(),
-                                                                   term.bytes());
+  public Spans getSpans(final AtomicReaderContext context, Bits acceptDocs, Map<Term,TermContext> termContexts) throws IOException {
+    TermContext termContext = termContexts.get(term);
+    final TermState state;
+    if (termContext == null) {
+      // this happens with span-not query, as it doesn't include the NOT side in extractTerms()
+      // so we seek to the term now in this segment..., this sucks because its ugly mostly!
+      final Fields fields = context.reader.fields();
+      if (fields != null) {
+        final Terms terms = fields.terms(term.field());
+        if (terms != null) {
+          final TermsEnum termsEnum = terms.getThreadTermsEnum(); // thread-private don't share!
+          if (termsEnum.seekExact(term.bytes(), true)) {
+            state = termsEnum.termState();
+          } else {
+            state = null;
+          }
+        } else {
+          state = null;
+        }
+      } else {
+        state = null;
+      }
+    } else {
+      state = termContext.get(context.ord);
+    }
+    if (state == null) { // term is not present in that reader
+      return TermSpans.EMPTY_TERM_SPANS;
+    }
+    final TermsEnum termsEnum = context.reader.terms(term.field()).getThreadTermsEnum();
+    termsEnum.seekExact(term.bytes(), state);
+    final DocsAndPositionsEnum postings = termsEnum.docsAndPositions(acceptDocs, null);
     if (postings != null) {
       return new TermSpans(postings, term);
     } else {
-      if (reader.termDocsEnum(reader.getLiveDocs(), term.field(), term.bytes()) != null) {
       // term does exist, but has no positions
       throw new IllegalStateException("field \"" + term.field() + "\" was indexed without position data; cannot run SpanTermQuery (term=" + term.text() + ")");
-      } else {
-        // term does not exist
-        return TermSpans.EMPTY_TERM_SPANS;
-      }
}
}
}
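The termContext == null fallback exists because, per the comment above, extractTerms() does not visit every term getSpans() can be asked about. A sketch of the SpanNotQuery case it guards against (field and terms assumed):

```java
// extractTerms() only visits the include side of a SpanNotQuery, so the
// exclude term ("bar") reaches SpanTermQuery.getSpans with no prebuilt
// TermContext and takes the seek-in-this-segment fallback above
SpanQuery notQ = new SpanNotQuery(
    new SpanTermQuery(new Term("f", "foo")),  // include
    new SpanTermQuery(new Term("f", "bar"))); // exclude
TreeSet<Term> terms = new TreeSet<Term>();
notQ.extractTerms(terms); // contains only f:foo
```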

View File: org/apache/lucene/search/spans/SpanWeight.java

@ -27,7 +27,8 @@ import org.apache.lucene.util.Bits;
import org.apache.lucene.util.TermContext;
import java.io.IOException;
import java.util.Set;
import java.util.HashMap;
import java.util.Map;
import java.util.TreeSet;
/**
@ -35,7 +36,7 @@ import java.util.TreeSet;
*/
public class SpanWeight extends Weight {
protected Similarity similarity;
-  protected Set<Term> terms;
+  protected Map<Term,TermContext> termContexts;
protected SpanQuery query;
protected Similarity.Stats stats;
@ -44,15 +45,16 @@ public class SpanWeight extends Weight {
this.similarity = searcher.getSimilarityProvider().get(query.getField());
this.query = query;
-    terms=new TreeSet<Term>();
+    termContexts = new HashMap<Term,TermContext>();
+    TreeSet<Term> terms = new TreeSet<Term>();
     query.extractTerms(terms);
     final ReaderContext context = searcher.getTopReaderContext();
-    final TermContext states[] = new TermContext[terms.size()];
     final TermStatistics termStats[] = new TermStatistics[terms.size()];
     int i = 0;
     for (Term term : terms) {
-      states[i] = TermContext.build(context, term, true);
-      termStats[i] = searcher.termStatistics(term, states[i]);
+      TermContext state = TermContext.build(context, term, true);
+      termStats[i] = searcher.termStatistics(term, state);
+      termContexts.put(term, state);
i++;
}
stats = similarity.computeStats(
@ -77,7 +79,7 @@ public class SpanWeight extends Weight {
@Override
public Scorer scorer(AtomicReaderContext context, boolean scoreDocsInOrder,
boolean topScorer, Bits acceptDocs) throws IOException {
-    return new SpanScorer(query.getSpans(context, acceptDocs), this, similarity.sloppyDocScorer(stats, query.getField(), context));
+    return new SpanScorer(query.getSpans(context, acceptDocs, termContexts), this, similarity.sloppyDocScorer(stats, query.getField(), context));
}
@Override

View File: org/apache/lucene/search/JustCompileSearch.java

@ -281,19 +281,6 @@ final class JustCompileSearch {
}
}
-  static final class JustCompileSpanFilter extends SpanFilter {
-    @Override
-    public SpanFilterResult bitSpans(AtomicReaderContext context, Bits acceptDocs) throws IOException {
-      throw new UnsupportedOperationException(UNSUPPORTED_MSG);
-    }
-    @Override
-    public DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs) throws IOException {
-      return null;
-    }
-  }
static final class JustCompileTopDocsCollector extends TopDocsCollector<ScoreDoc> {
protected JustCompileTopDocsCollector(PriorityQueue<ScoreDoc> pq) {

View File: org/apache/lucene/search/TestCachingSpanFilter.java

@ -1,147 +0,0 @@
package org.apache.lucene.search;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.SerialMergeScheduler;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;
public class TestCachingSpanFilter extends LuceneTestCase {
public void testEnforceDeletions() throws Exception {
Directory dir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(
random,
dir,
newIndexWriterConfig(random, TEST_VERSION_CURRENT, new MockAnalyzer(random)).
setMergeScheduler(new SerialMergeScheduler()).
// asserts below requires no unexpected merges:
setMergePolicy(newLogMergePolicy(10))
);
// NOTE: cannot use writer.getReader because RIW (on
// flipping a coin) may give us a newly opened reader,
// but we use .reopen on this reader below and expect to
// (must) get an NRT reader:
IndexReader reader = IndexReader.open(writer.w, true);
// same reason we don't wrap?
IndexSearcher searcher = newSearcher(reader, false);
// add a doc, refresh the reader, and check that its there
Document doc = new Document();
FieldType customType = new FieldType(TextField.TYPE_STORED);
customType.setTokenized(false);
doc.add(newField("id", "1", customType));
writer.addDocument(doc);
reader = refreshReader(reader);
searcher.close();
searcher = newSearcher(reader, false);
TopDocs docs = searcher.search(new MatchAllDocsQuery(), 1);
assertEquals("Should find a hit...", 1, docs.totalHits);
final SpanFilter startFilter = new SpanQueryFilter(new SpanTermQuery(new Term("id", "1")));
CachingSpanFilter filter = new CachingSpanFilter(startFilter);
docs = searcher.search(new MatchAllDocsQuery(), filter, 1);
assertEquals("[query + filter] Should find a hit...", 1, docs.totalHits);
int missCount = filter.missCount;
assertTrue(missCount > 0);
Query constantScore = new ConstantScoreQuery(filter);
docs = searcher.search(constantScore, 1);
assertEquals("[just filter] Should find a hit...", 1, docs.totalHits);
assertEquals(missCount, filter.missCount);
// NOTE: important to hold ref here so GC doesn't clear
// the cache entry! Else the assert below may sometimes
// fail:
IndexReader oldReader = reader;
writer.addDocument(doc);
reader = refreshReader(reader);
searcher.close();
searcher = newSearcher(reader, false);
docs = searcher.search(new MatchAllDocsQuery(), filter, 1);
assertEquals("[query + filter] Should find 2 hits...", 2, docs.totalHits);
assertTrue(filter.missCount > missCount);
missCount = filter.missCount;
constantScore = new ConstantScoreQuery(filter);
docs = searcher.search(constantScore, 1);
assertEquals("[just filter] Should find a hit...", 2, docs.totalHits);
assertEquals(missCount, filter.missCount);
// NOTE: important to hold ref here so GC doesn't clear
// the cache entry! Else the assert below may sometimes
// fail:
IndexReader oldReader2 = reader;
// now delete the doc, refresh the reader, and see that it's not there
writer.deleteDocuments(new Term("id", "1"));
reader = refreshReader(reader);
searcher.close();
searcher = newSearcher(reader, false);
docs = searcher.search(new MatchAllDocsQuery(), filter, 1);
assertEquals("[query + filter] Should *not* find a hit...", 0, docs.totalHits);
assertEquals(missCount, filter.missCount);
docs = searcher.search(constantScore, 1);
assertEquals("[just filter] Should *not* find a hit...", 0, docs.totalHits);
assertEquals(missCount, filter.missCount);
// NOTE: silliness to make sure JRE does not optimize
// away our holding onto oldReader to prevent
// CachingWrapperFilter's WeakHashMap from dropping the
// entry:
assertTrue(oldReader != null);
assertTrue(oldReader2 != null);
searcher.close();
writer.close();
reader.close();
dir.close();
}
private static IndexReader refreshReader(IndexReader reader) throws IOException {
IndexReader oldReader = reader;
reader = IndexReader.openIfChanged(reader);
if (reader != null) {
oldReader.close();
return reader;
} else {
return oldReader;
}
}
}

View File: org/apache/lucene/search/TestSpanQueryFilter.java

@ -1,86 +0,0 @@
package org.apache.lucene.search;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.List;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.English;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.ReaderUtil;
public class TestSpanQueryFilter extends LuceneTestCase {
public void testFilterWorks() throws Exception {
Directory dir = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random, dir, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()));
for (int i = 0; i < 500; i++) {
Document document = new Document();
document.add(newField("field", English.intToEnglish(i) + " equals " + English.intToEnglish(i),
TextField.TYPE_UNSTORED));
writer.addDocument(document);
}
final int number = 10;
IndexReader reader = writer.getReader();
writer.close();
AtomicReaderContext[] leaves = ReaderUtil.leaves(reader.getTopReaderContext());
int subIndex = ReaderUtil.subIndex(number, leaves); // find the reader with this document in it
SpanTermQuery query = new SpanTermQuery(new Term("field", English.intToEnglish(number).trim()));
SpanQueryFilter filter = new SpanQueryFilter(query);
SpanFilterResult result = filter.bitSpans(leaves[subIndex], leaves[subIndex].reader.getLiveDocs());
DocIdSet docIdSet = result.getDocIdSet();
assertTrue("docIdSet is null and it shouldn't be", docIdSet != null);
assertContainsDocId("docIdSet doesn't contain docId 10", docIdSet, number - leaves[subIndex].docBase);
List<SpanFilterResult.PositionInfo> spans = result.getPositions();
assertTrue("spans is null and it shouldn't be", spans != null);
int size = getDocIdSetSize(docIdSet);
assertTrue("spans Size: " + spans.size() + " is not: " + size, spans.size() == size);
for (final SpanFilterResult.PositionInfo info: spans) {
assertTrue("info is null and it shouldn't be", info != null);
//The doc should indicate the bit is on
assertContainsDocId("docIdSet doesn't contain docId " + info.getDoc(), docIdSet, info.getDoc());
//There should be two positions in each
assertTrue("info.getPositions() Size: " + info.getPositions().size() + " is not: " + 2, info.getPositions().size() == 2);
}
reader.close();
dir.close();
}
int getDocIdSetSize(DocIdSet docIdSet) throws Exception {
int size = 0;
DocIdSetIterator it = docIdSet.iterator();
while (it.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
size++;
}
return size;
}
public void assertContainsDocId(String msg, DocIdSet docIdSet, int docId) throws Exception {
DocIdSetIterator it = docIdSet.iterator();
assertTrue(msg, it.advance(docId) != DocIdSetIterator.NO_MORE_DOCS);
assertTrue(msg, it.docID() == docId);
}
}

View File: org/apache/lucene/search/spans/JustCompileSearchSpans.java

@ -19,11 +19,14 @@ package org.apache.lucene.search.spans;
import java.io.IOException;
import java.util.Collection;
import java.util.Map;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.search.Weight;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.TermContext;
/**
* Holds all implementations of classes in the o.a.l.s.spans package as a
@ -83,7 +86,7 @@ final class JustCompileSearchSpans {
}
@Override
-    public Spans getSpans(AtomicReaderContext context, Bits acceptDocs) throws IOException {
+    public Spans getSpans(AtomicReaderContext context, Bits acceptDocs, Map<Term,TermContext> termContexts) throws IOException {
throw new UnsupportedOperationException(UNSUPPORTED_MSG);
}

View File: org/apache/lucene/search/spans/MultiSpansWrapper.java

@ -20,11 +20,16 @@ package org.apache.lucene.search.spans;
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.TreeSet;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.IndexReader.ReaderContext;
import org.apache.lucene.util.ReaderUtil;
import org.apache.lucene.util.TermContext;
/**
*
@ -39,19 +44,27 @@ public class MultiSpansWrapper extends Spans { // can't be package private due t
private AtomicReaderContext[] leaves;
private int leafOrd = 0;
private Spans current;
+  private Map<Term,TermContext> termContexts;
-  private MultiSpansWrapper(AtomicReaderContext[] leaves, SpanQuery query) {
+  private MultiSpansWrapper(AtomicReaderContext[] leaves, SpanQuery query, Map<Term,TermContext> termContexts) {
     this.query = query;
     this.leaves = leaves;
+    this.termContexts = termContexts;
}
public static Spans wrap(ReaderContext topLevelReaderContext, SpanQuery query) throws IOException {
+    Map<Term,TermContext> termContexts = new HashMap<Term,TermContext>();
+    TreeSet<Term> terms = new TreeSet<Term>();
+    query.extractTerms(terms);
+    for (Term term : terms) {
+      termContexts.put(term, TermContext.build(topLevelReaderContext, term, true));
+    }
AtomicReaderContext[] leaves = ReaderUtil.leaves(topLevelReaderContext);
if(leaves.length == 1) {
-      return query.getSpans(leaves[0], leaves[0].reader.getLiveDocs());
+      return query.getSpans(leaves[0], leaves[0].reader.getLiveDocs(), termContexts);
     }
-    return new MultiSpansWrapper(leaves, query);
+    return new MultiSpansWrapper(leaves, query, termContexts);
}
@Override
@ -60,14 +73,14 @@ public class MultiSpansWrapper extends Spans { // can't be package private due t
return false;
}
if (current == null) {
-      current = query.getSpans(leaves[leafOrd], leaves[leafOrd].reader.getLiveDocs());
+      current = query.getSpans(leaves[leafOrd], leaves[leafOrd].reader.getLiveDocs(), termContexts);
}
while(true) {
if (current.next()) {
return true;
}
if (++leafOrd < leaves.length) {
-        current = query.getSpans(leaves[leafOrd], leaves[leafOrd].reader.getLiveDocs());
+        current = query.getSpans(leaves[leafOrd], leaves[leafOrd].reader.getLiveDocs(), termContexts);
} else {
current = null;
break;
@ -85,17 +98,17 @@ public class MultiSpansWrapper extends Spans { // can't be package private due t
int subIndex = ReaderUtil.subIndex(target, leaves);
assert subIndex >= leafOrd;
if (subIndex != leafOrd) {
-      current = query.getSpans(leaves[subIndex], leaves[subIndex].reader.getLiveDocs());
+      current = query.getSpans(leaves[subIndex], leaves[subIndex].reader.getLiveDocs(), termContexts);
leafOrd = subIndex;
} else if (current == null) {
-      current = query.getSpans(leaves[leafOrd], leaves[leafOrd].reader.getLiveDocs());
+      current = query.getSpans(leaves[leafOrd], leaves[leafOrd].reader.getLiveDocs(), termContexts);
}
while (true) {
if (current.skipTo(target - leaves[leafOrd].docBase)) {
return true;
}
if (++leafOrd < leaves.length) {
-        current = query.getSpans(leaves[leafOrd], leaves[leafOrd].reader.getLiveDocs());
+        current = query.getSpans(leaves[leafOrd], leaves[leafOrd].reader.getLiveDocs(), termContexts);
} else {
current = null;
break;
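Test-side usage of the wrapper stays a one-liner; only the context bookkeeping above changed. A sketch, assuming (as the skipTo arithmetic above suggests) that the wrapper rebases doc ids by the leaf docBase:

```java
Spans spans = MultiSpansWrapper.wrap(reader.getTopReaderContext(), spanQuery);
while (spans.next()) {
  // spans.doc() here is a top-level (rebased) document id
}
```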