LUCENE-960: Added SpanFilter mechanism that provides BitSet information and Span information in a filter. This can be used to get position information on where in a Document that is "on" in the filter matched.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@557105 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Grant Ingersoll 2007-07-18 00:52:07 +00:00
parent aae68e0ba6
commit 4293019522
6 changed files with 423 additions and 0 deletions

View File

@ -45,6 +45,8 @@ New features
1. LUCENE-906: Elision filter for French.
(Mathieu Lecarme via Otis Gospodnetic)
2. LUCENE-960: Added a SpanQueryFilter and related classes to allow for not only filtering, but knowing where in a Document a Filter matches (Grant Ingersoll)
Optimizations
1. LUCENE-937: CachingTokenFilter now uses an iterator to access the

View File

@ -0,0 +1,84 @@
package org.apache.lucene.search;
/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.index.IndexReader;
import java.io.IOException;
import java.util.BitSet;
import java.util.Map;
import java.util.WeakHashMap;
/**
* Wraps another SpanFilter's result and caches it. The purpose is to allow
* filters to simply filter, and then wrap with this class to add caching.
*/
public class CachingSpanFilter extends SpanFilter {
protected SpanFilter filter;
/**
* A transient Filter cache. To cache Filters even when using {@link org.apache.lucene.search.RemoteSearchable} use
* {@link org.apache.lucene.search.RemoteCachingWrapperFilter} instead.
*/
protected transient Map cache;
/**
* @param filter Filter to cache results of
*/
public CachingSpanFilter(SpanFilter filter) {
this.filter = filter;
}
public BitSet bits(IndexReader reader) throws IOException {
SpanFilterResult result = getCachedResult(reader);
return result != null ? result.getBits() : null;
}
private SpanFilterResult getCachedResult(IndexReader reader) throws IOException {
SpanFilterResult result = null;
if (cache == null) {
cache = new WeakHashMap();
}
synchronized (cache) { // check cache
result = (SpanFilterResult) cache.get(reader);
if (result == null) {
result = filter.bitSpans(reader);
cache.put(reader, result);
}
}
return result;
}
public SpanFilterResult bitSpans(IndexReader reader) throws IOException {
return getCachedResult(reader);
}
public String toString() {
return "CachingSpanFilter("+filter+")";
}
public boolean equals(Object o) {
if (!(o instanceof CachingSpanFilter)) return false;
return this.filter.equals(((CachingSpanFilter)o).filter);
}
public int hashCode() {
return filter.hashCode() ^ 0x1117BF25;
}
}

View File

@ -0,0 +1,38 @@
package org.apache.lucene.search;
/**
* Copyright 2007 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.index.IndexReader;
import java.io.IOException;
/** Abstract base class providing a mechanism to restrict searches to a subset
of an index and also maintains and returns position information.
This is useful if you want to compare the positions from a SpanQuery with the positions of items in
a filter. For instance, if you had a SpanFilter that marked all the occurrences of the word "foo" in documents,
and then you entered a new SpanQuery containing bar, you could not only filter by the word foo, but you could
then compare position information for post processing.
*/
public abstract class SpanFilter extends Filter{
/** Returns a SpanFilterResult with true for documents which should be permitted in
search results, and false for those that should not and Spans for where the true docs match.
* @param reader The {@link org.apache.lucene.index.IndexReader} to load position and bitset information from
* @return A {@link SpanFilterResult}
* @throws java.io.IOException if there was an issue accessing the necessary information
* */
public abstract SpanFilterResult bitSpans(IndexReader reader) throws IOException;
}

View File

@ -0,0 +1,117 @@
package org.apache.lucene.search;
/**
* Copyright 2005 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.ArrayList;
import java.util.BitSet;
import java.util.List;
/**
* The results of a SpanQueryFilter. Wraps the BitSet and the position infomration from the SpanQuery
*
*<p/>
* NOTE: This API is still experimental and subject to change.
*
**/
public class SpanFilterResult {
private BitSet bits;
private List positions;//Spans spans;
/**
*
* @param bits The bits for the Filter
* @param positions A List of {@link org.apache.lucene.search.SpanFilterResult.PositionInfo} objects
*/
public SpanFilterResult(BitSet bits, List positions) {
this.bits = bits;
this.positions = positions;
}
/**
* The first entry in the array corresponds to the first "on" bit.
* Entries are increasing by document order
* @return A List of PositionInfo objects
*/
public List getPositions() {
return positions;
}
public BitSet getBits() {
return bits;
}
public static class PositionInfo {
private int doc;
private List positions;
public PositionInfo(int doc) {
this.doc = doc;
positions = new ArrayList();
}
public void addPosition(int start, int end)
{
positions.add(new StartEnd(start, end));
}
public int getDoc() {
return doc;
}
/**
*
* @return A List of {@link org.apache.lucene.search.SpanFilterResult.StartEnd} objects
*/
public List getPositions() {
return positions;
}
}
public static class StartEnd
{
private int start;
private int end;
public StartEnd(int start, int end) {
this.start = start;
this.end = end;
}
/**
*
* @return The end position of this match
*/
public int getEnd() {
return end;
}
/**
* The Start position
* @return The start position of this match
*/
public int getStart() {
return start;
}
}
}

View File

@ -0,0 +1,101 @@
package org.apache.lucene.search;
/**
* Copyright 2007 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.Spans;
import java.io.IOException;
import java.util.ArrayList;
import java.util.BitSet;
import java.util.List;
/**
* Constrains search results to only match those which also match a provided
* query. Also provides position information about where each document matches
* at the cost of extra space compared with the QueryWrapperFilter.
* There is an added cost to this above what is stored in a {@link QueryWrapperFilter}. Namely,
* the position information for each matching document is stored.
* <p/>
* This filter does not cache. See the {@link org.apache.lucene.search.CachingSpanFilter} for a wrapper that
* caches.
*
*
* @version $Id:$
*/
public class SpanQueryFilter extends SpanFilter {
protected SpanQuery query;
protected SpanQueryFilter()
{
}
/** Constructs a filter which only matches documents matching
* <code>query</code>.
* @param query The {@link org.apache.lucene.search.spans.SpanQuery} to use as the basis for the Filter.
*/
public SpanQueryFilter(SpanQuery query) {
this.query = query;
}
public BitSet bits(IndexReader reader) throws IOException {
SpanFilterResult result = bitSpans(reader);
return result.getBits();
}
public SpanFilterResult bitSpans(IndexReader reader) throws IOException {
final BitSet bits = new BitSet(reader.maxDoc());
Spans spans = query.getSpans(reader);
List tmp = new ArrayList(20);
int currentDoc = -1;
SpanFilterResult.PositionInfo currentInfo = null;
while (spans.next())
{
int doc = spans.doc();
bits.set(doc);
if (currentDoc != doc)
{
currentInfo = new SpanFilterResult.PositionInfo(doc);
tmp.add(currentInfo);
currentDoc = doc;
}
currentInfo.addPosition(spans.start(), spans.end());
}
return new SpanFilterResult(bits, tmp);
}
public SpanQuery getQuery() {
return query;
}
public String toString() {
return "QueryWrapperFilter(" + query + ")";
}
public boolean equals(Object o) {
return o instanceof SpanQueryFilter && this.query.equals(((SpanQueryFilter) o).query);
}
public int hashCode() {
return query.hashCode() ^ 0x923F64B9;
}
}

View File

@ -0,0 +1,81 @@
package org.apache.lucene.search;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import junit.framework.TestCase;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.English;
import java.util.BitSet;
import java.util.Iterator;
import java.util.List;
public class TestSpanQueryFilter extends TestCase {
public TestSpanQueryFilter(String s) {
super(s);
}
protected void setUp() {
}
protected void tearDown() {
}
public void testFilterWorks() throws Exception {
Directory dir = new RAMDirectory();
IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true);
for (int i = 0; i < 500; i++) {
Document document = new Document();
document.add(new Field("field", English.intToEnglish(i) + " equals " + English.intToEnglish(i),
Field.Store.NO, Field.Index.TOKENIZED));
writer.addDocument(document);
}
writer.close();
IndexReader reader = IndexReader.open(dir);
SpanTermQuery query = new SpanTermQuery(new Term("field", English.intToEnglish(10).trim()));
SpanQueryFilter filter = new SpanQueryFilter(query);
SpanFilterResult result = filter.bitSpans(reader);
BitSet bits = result.getBits();
assertTrue("bits is null and it shouldn't be", bits != null);
assertTrue("tenth bit is not on", bits.get(10));
List spans = result.getPositions();
assertTrue("spans is null and it shouldn't be", spans != null);
assertTrue("spans Size: " + spans.size() + " is not: " + bits.cardinality(), spans.size() == bits.cardinality());
for (Iterator iterator = spans.iterator(); iterator.hasNext();) {
SpanFilterResult.PositionInfo info = (SpanFilterResult.PositionInfo) iterator.next();
assertTrue("info is null and it shouldn't be", info != null);
//The doc should indicate the bit is on
assertTrue("Bit is not on and it should be", bits.get(info.getDoc()));
//There should be two positions in each
assertTrue("info.getPositions() Size: " + info.getPositions().size() + " is not: " + 2, info.getPositions().size() == 2);
}
reader.close();
}
}