mirror of https://github.com/apache/lucene.git
LUCENE-960: Added SpanFilter mechanism that provides BitSet information and Span information in a filter. This can be used to get position information on where in a Document that is "on" in the filter matched.
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@557105 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
aae68e0ba6
commit
4293019522
|
@ -45,6 +45,8 @@ New features
|
|||
1. LUCENE-906: Elision filter for French.
|
||||
(Mathieu Lecarme via Otis Gospodnetic)
|
||||
|
||||
2. LUCENE-960: Added a SpanQueryFilter and related classes to allow for not only filtering, but knowing where in a Document a Filter matches (Grant Ingersoll)
|
||||
|
||||
Optimizations
|
||||
|
||||
1. LUCENE-937: CachingTokenFilter now uses an iterator to access the
|
||||
|
|
|
@ -0,0 +1,84 @@
|
|||
package org.apache.lucene.search;
|
||||
/**
|
||||
* Copyright 2005 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.BitSet;
|
||||
import java.util.Map;
|
||||
import java.util.WeakHashMap;
|
||||
|
||||
/**
|
||||
* Wraps another SpanFilter's result and caches it. The purpose is to allow
|
||||
* filters to simply filter, and then wrap with this class to add caching.
|
||||
*/
|
||||
public class CachingSpanFilter extends SpanFilter {
|
||||
protected SpanFilter filter;
|
||||
|
||||
/**
|
||||
* A transient Filter cache. To cache Filters even when using {@link org.apache.lucene.search.RemoteSearchable} use
|
||||
* {@link org.apache.lucene.search.RemoteCachingWrapperFilter} instead.
|
||||
*/
|
||||
protected transient Map cache;
|
||||
|
||||
/**
|
||||
* @param filter Filter to cache results of
|
||||
*/
|
||||
public CachingSpanFilter(SpanFilter filter) {
|
||||
this.filter = filter;
|
||||
}
|
||||
|
||||
public BitSet bits(IndexReader reader) throws IOException {
|
||||
SpanFilterResult result = getCachedResult(reader);
|
||||
return result != null ? result.getBits() : null;
|
||||
}
|
||||
|
||||
private SpanFilterResult getCachedResult(IndexReader reader) throws IOException {
|
||||
SpanFilterResult result = null;
|
||||
if (cache == null) {
|
||||
cache = new WeakHashMap();
|
||||
}
|
||||
|
||||
synchronized (cache) { // check cache
|
||||
result = (SpanFilterResult) cache.get(reader);
|
||||
if (result == null) {
|
||||
result = filter.bitSpans(reader);
|
||||
cache.put(reader, result);
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
public SpanFilterResult bitSpans(IndexReader reader) throws IOException {
|
||||
return getCachedResult(reader);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "CachingSpanFilter("+filter+")";
|
||||
}
|
||||
|
||||
public boolean equals(Object o) {
|
||||
if (!(o instanceof CachingSpanFilter)) return false;
|
||||
return this.filter.equals(((CachingSpanFilter)o).filter);
|
||||
}
|
||||
|
||||
public int hashCode() {
|
||||
return filter.hashCode() ^ 0x1117BF25;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,38 @@
|
|||
package org.apache.lucene.search;
|
||||
/**
|
||||
* Copyright 2007 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/** Abstract base class providing a mechanism to restrict searches to a subset
|
||||
of an index and also maintains and returns position information.
|
||||
|
||||
This is useful if you want to compare the positions from a SpanQuery with the positions of items in
|
||||
a filter. For instance, if you had a SpanFilter that marked all the occurrences of the word "foo" in documents,
|
||||
and then you entered a new SpanQuery containing bar, you could not only filter by the word foo, but you could
|
||||
then compare position information for post processing.
|
||||
*/
|
||||
public abstract class SpanFilter extends Filter{
|
||||
/** Returns a SpanFilterResult with true for documents which should be permitted in
|
||||
search results, and false for those that should not and Spans for where the true docs match.
|
||||
* @param reader The {@link org.apache.lucene.index.IndexReader} to load position and bitset information from
|
||||
* @return A {@link SpanFilterResult}
|
||||
* @throws java.io.IOException if there was an issue accessing the necessary information
|
||||
* */
|
||||
public abstract SpanFilterResult bitSpans(IndexReader reader) throws IOException;
|
||||
}
|
|
@ -0,0 +1,117 @@
|
|||
package org.apache.lucene.search;
|
||||
/**
|
||||
* Copyright 2005 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.BitSet;
|
||||
import java.util.List;
|
||||
|
||||
|
||||
/**
|
||||
* The results of a SpanQueryFilter. Wraps the BitSet and the position infomration from the SpanQuery
|
||||
*
|
||||
*<p/>
|
||||
* NOTE: This API is still experimental and subject to change.
|
||||
*
|
||||
**/
|
||||
public class SpanFilterResult {
|
||||
private BitSet bits;
|
||||
private List positions;//Spans spans;
|
||||
|
||||
/**
|
||||
*
|
||||
* @param bits The bits for the Filter
|
||||
* @param positions A List of {@link org.apache.lucene.search.SpanFilterResult.PositionInfo} objects
|
||||
*/
|
||||
public SpanFilterResult(BitSet bits, List positions) {
|
||||
this.bits = bits;
|
||||
this.positions = positions;
|
||||
}
|
||||
|
||||
/**
|
||||
* The first entry in the array corresponds to the first "on" bit.
|
||||
* Entries are increasing by document order
|
||||
* @return A List of PositionInfo objects
|
||||
*/
|
||||
public List getPositions() {
|
||||
return positions;
|
||||
}
|
||||
|
||||
public BitSet getBits() {
|
||||
return bits;
|
||||
}
|
||||
|
||||
|
||||
|
||||
public static class PositionInfo {
|
||||
private int doc;
|
||||
private List positions;
|
||||
|
||||
|
||||
public PositionInfo(int doc) {
|
||||
this.doc = doc;
|
||||
positions = new ArrayList();
|
||||
}
|
||||
|
||||
public void addPosition(int start, int end)
|
||||
{
|
||||
positions.add(new StartEnd(start, end));
|
||||
}
|
||||
|
||||
public int getDoc() {
|
||||
return doc;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @return A List of {@link org.apache.lucene.search.SpanFilterResult.StartEnd} objects
|
||||
*/
|
||||
public List getPositions() {
|
||||
return positions;
|
||||
}
|
||||
}
|
||||
|
||||
public static class StartEnd
|
||||
{
|
||||
private int start;
|
||||
private int end;
|
||||
|
||||
|
||||
public StartEnd(int start, int end) {
|
||||
this.start = start;
|
||||
this.end = end;
|
||||
}
|
||||
|
||||
/**
|
||||
*
|
||||
* @return The end position of this match
|
||||
*/
|
||||
public int getEnd() {
|
||||
return end;
|
||||
}
|
||||
|
||||
/**
|
||||
* The Start position
|
||||
* @return The start position of this match
|
||||
*/
|
||||
public int getStart() {
|
||||
return start;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -0,0 +1,101 @@
|
|||
package org.apache.lucene.search;
|
||||
/**
|
||||
* Copyright 2007 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.search.spans.SpanQuery;
|
||||
import org.apache.lucene.search.spans.Spans;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.BitSet;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Constrains search results to only match those which also match a provided
|
||||
* query. Also provides position information about where each document matches
|
||||
* at the cost of extra space compared with the QueryWrapperFilter.
|
||||
* There is an added cost to this above what is stored in a {@link QueryWrapperFilter}. Namely,
|
||||
* the position information for each matching document is stored.
|
||||
* <p/>
|
||||
* This filter does not cache. See the {@link org.apache.lucene.search.CachingSpanFilter} for a wrapper that
|
||||
* caches.
|
||||
*
|
||||
*
|
||||
* @version $Id:$
|
||||
*/
|
||||
public class SpanQueryFilter extends SpanFilter {
|
||||
protected SpanQuery query;
|
||||
|
||||
protected SpanQueryFilter()
|
||||
{
|
||||
|
||||
}
|
||||
|
||||
/** Constructs a filter which only matches documents matching
|
||||
* <code>query</code>.
|
||||
* @param query The {@link org.apache.lucene.search.spans.SpanQuery} to use as the basis for the Filter.
|
||||
*/
|
||||
public SpanQueryFilter(SpanQuery query) {
|
||||
this.query = query;
|
||||
}
|
||||
|
||||
public BitSet bits(IndexReader reader) throws IOException {
|
||||
SpanFilterResult result = bitSpans(reader);
|
||||
return result.getBits();
|
||||
}
|
||||
|
||||
|
||||
public SpanFilterResult bitSpans(IndexReader reader) throws IOException {
|
||||
|
||||
final BitSet bits = new BitSet(reader.maxDoc());
|
||||
Spans spans = query.getSpans(reader);
|
||||
List tmp = new ArrayList(20);
|
||||
int currentDoc = -1;
|
||||
SpanFilterResult.PositionInfo currentInfo = null;
|
||||
while (spans.next())
|
||||
{
|
||||
int doc = spans.doc();
|
||||
bits.set(doc);
|
||||
if (currentDoc != doc)
|
||||
{
|
||||
currentInfo = new SpanFilterResult.PositionInfo(doc);
|
||||
tmp.add(currentInfo);
|
||||
currentDoc = doc;
|
||||
}
|
||||
currentInfo.addPosition(spans.start(), spans.end());
|
||||
}
|
||||
return new SpanFilterResult(bits, tmp);
|
||||
}
|
||||
|
||||
|
||||
public SpanQuery getQuery() {
|
||||
return query;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "QueryWrapperFilter(" + query + ")";
|
||||
}
|
||||
|
||||
public boolean equals(Object o) {
|
||||
return o instanceof SpanQueryFilter && this.query.equals(((SpanQueryFilter) o).query);
|
||||
}
|
||||
|
||||
public int hashCode() {
|
||||
return query.hashCode() ^ 0x923F64B9;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,81 @@
|
|||
package org.apache.lucene.search;
|
||||
|
||||
/**
|
||||
* Copyright 2004 The Apache Software Foundation
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import junit.framework.TestCase;
|
||||
import org.apache.lucene.analysis.SimpleAnalyzer;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.spans.SpanTermQuery;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.RAMDirectory;
|
||||
import org.apache.lucene.util.English;
|
||||
|
||||
import java.util.BitSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
public class TestSpanQueryFilter extends TestCase {
|
||||
|
||||
|
||||
public TestSpanQueryFilter(String s) {
|
||||
super(s);
|
||||
}
|
||||
|
||||
protected void setUp() {
|
||||
}
|
||||
|
||||
protected void tearDown() {
|
||||
|
||||
}
|
||||
|
||||
public void testFilterWorks() throws Exception {
|
||||
Directory dir = new RAMDirectory();
|
||||
IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true);
|
||||
for (int i = 0; i < 500; i++) {
|
||||
Document document = new Document();
|
||||
document.add(new Field("field", English.intToEnglish(i) + " equals " + English.intToEnglish(i),
|
||||
Field.Store.NO, Field.Index.TOKENIZED));
|
||||
writer.addDocument(document);
|
||||
}
|
||||
writer.close();
|
||||
|
||||
IndexReader reader = IndexReader.open(dir);
|
||||
|
||||
SpanTermQuery query = new SpanTermQuery(new Term("field", English.intToEnglish(10).trim()));
|
||||
SpanQueryFilter filter = new SpanQueryFilter(query);
|
||||
SpanFilterResult result = filter.bitSpans(reader);
|
||||
BitSet bits = result.getBits();
|
||||
assertTrue("bits is null and it shouldn't be", bits != null);
|
||||
assertTrue("tenth bit is not on", bits.get(10));
|
||||
List spans = result.getPositions();
|
||||
assertTrue("spans is null and it shouldn't be", spans != null);
|
||||
assertTrue("spans Size: " + spans.size() + " is not: " + bits.cardinality(), spans.size() == bits.cardinality());
|
||||
for (Iterator iterator = spans.iterator(); iterator.hasNext();) {
|
||||
SpanFilterResult.PositionInfo info = (SpanFilterResult.PositionInfo) iterator.next();
|
||||
assertTrue("info is null and it shouldn't be", info != null);
|
||||
//The doc should indicate the bit is on
|
||||
assertTrue("Bit is not on and it should be", bits.get(info.getDoc()));
|
||||
//There should be two positions in each
|
||||
assertTrue("info.getPositions() Size: " + info.getPositions().size() + " is not: " + 2, info.getPositions().size() == 2);
|
||||
}
|
||||
reader.close();
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue