LUCENE-1494: Added FieldMaskingSpanQuery which can be used to cross-correlate Spans from different fields

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@770794 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Chris M. Hostetter 2009-05-01 19:14:31 +00:00
parent ab1254b4e6
commit 582356dc2e
3 changed files with 508 additions and 0 deletions

View File

@ -283,6 +283,10 @@ Bug fixes
22. LUCENE-1605: Added BitVector.subset(). (Jeremy Volkman via Mike
McCandless)
23. LUCENE-1494: Added FieldMaskingSpanQuery which can be used to
cross-correlate Spans from different fields.
(Paul Cowan and Chris Hostetter)
Optimizations
1. LUCENE-1427: Fixed QueryWrapperFilter to not waste time computing

View File

@ -0,0 +1,158 @@
package org.apache.lucene.search.spans;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Collection;
import java.util.Set;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.Weight;
import org.apache.lucene.util.ToStringUtils;
/**
* <p>Wrapper to allow {@link SpanQuery} objects participate in composite
* single-field SpanQueries by 'lying' about their search field. That is,
* the masked SpanQuery will function as normal,
* but {@link SpanQuery#getField()} simply hands back the value supplied
* in this class's constructor.</p>
*
* <p>This can be used to support Queries like {@link SpanNearQuery} or
* {@link SpanOrQuery} across different fields, which is not ordinarily
* permitted.</p>
*
* <p>This can be useful for denormalized relational data: for example, when
* indexing a document with conceptually many 'children': </p>
*
* <pre>
* teacherid: 1
* studentfirstname: james
* studentsurname: jones
*
* teacherid: 2
* studenfirstname: james
* studentsurname: smith
* studentfirstname: sally
* studentsurname: jones
* </pre>
*
* <p>a SpanNearQuery with a slop of 0 can be applied across two
* {@link SpanTermQuery} objects as follows:
* <pre>
* SpanQuery q1 = new SpanTermQuery(new Term("studentfirstname", "james"));
* SpanQuery q2 = new SpanTermQuery(new Term("studentsurname", "jones"));
* SpanQuery q2m new FieldMaskingSpanQuery(q2, "studentfirstname");
* Query q = new SpanNearQuery(new SpanQuery[]{q1, q2m}, -1, false);
* </pre>
* to search for 'studentfirstname:james studentsurname:jones' and find
* teacherid 1 without matching teacherid 2 (which has a 'james' in position 0
* and 'jones' in position 1). </p>
*
* <p>Note: as {@link #getField()} returns the masked field, scoring will be
* done using the norms of the field name supplied. This may lead to unexpected
* scoring behaviour.</p>
*/
public class FieldMaskingSpanQuery extends SpanQuery {
private SpanQuery maskedQuery;
private String field;
public FieldMaskingSpanQuery(SpanQuery maskedQuery, String maskedField) {
this.maskedQuery = maskedQuery;
this.field = maskedField;
}
public String getField() {
return field;
}
public SpanQuery getMaskedQuery() {
return maskedQuery;
}
// :NOTE: getBoost and setBoost are not proxied to the maskedQuery
// ...this is done to be more consistent with thigns like SpanFirstQuery
public Spans getSpans(IndexReader reader) throws IOException {
return maskedQuery.getSpans(reader);
}
public PayloadSpans getPayloadSpans(IndexReader reader) throws IOException {
return maskedQuery.getPayloadSpans(reader);
}
public Collection getTerms() {
return maskedQuery.getTerms();
}
public void extractTerms(Set terms) {
maskedQuery.extractTerms(terms);
}
protected Weight createWeight(Searcher searcher) throws IOException {
return maskedQuery.createWeight(searcher);
}
public Similarity getSimilarity(Searcher searcher) {
return maskedQuery.getSimilarity(searcher);
}
public Query rewrite(IndexReader reader) throws IOException {
FieldMaskingSpanQuery clone = null;
SpanQuery rewritten = (SpanQuery) maskedQuery.rewrite(reader);
if (rewritten != maskedQuery) {
clone = (FieldMaskingSpanQuery) this.clone();
clone.maskedQuery = rewritten;
}
if (clone != null) {
return clone;
} else {
return this;
}
}
public String toString(String field) {
StringBuffer buffer = new StringBuffer();
buffer.append("mask(");
buffer.append(maskedQuery.toString(field));
buffer.append(")");
buffer.append(ToStringUtils.boost(getBoost()));
buffer.append(" as ");
buffer.append(this.field);
return buffer.toString();
}
public boolean equals(Object o) {
if (!(o instanceof FieldMaskingSpanQuery))
return false;
FieldMaskingSpanQuery other = (FieldMaskingSpanQuery) o;
return (this.getField().equals(other.getField())
&& (this.getBoost() == other.getBoost())
&& this.getMaskedQuery().equals(other.getMaskedQuery()));
}
public int hashCode() {
return getMaskedQuery().hashCode()
^ getField().hashCode()
^ Float.floatToRawIntBits(getBoost());
}
}

View File

@ -0,0 +1,346 @@
package org.apache.lucene.search.spans;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Weight;
import org.apache.lucene.search.CheckHits;
import org.apache.lucene.search.QueryUtils;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.util.LuceneTestCase;
import java.util.HashSet;
public class TestFieldMaskingSpanQuery extends LuceneTestCase {
protected static Document doc(Field[] fields) {
Document doc = new Document();
for (int i = 0; i < fields.length; i++) {
doc.add(fields[i]);
}
return doc;
}
protected static Field field(String name, String value) {
return new Field(name, value, Field.Store.NO, Field.Index.ANALYZED);
}
protected IndexSearcher searcher;
public void setUp() throws Exception {
super.setUp();
RAMDirectory directory = new RAMDirectory();
IndexWriter writer= new IndexWriter(directory,
new WhitespaceAnalyzer(), true,
IndexWriter.MaxFieldLength.LIMITED);
writer.addDocument(doc(new Field[] { field("id", "0")
,
field("gender", "male"),
field("first", "james"),
field("last", "jones") }));
writer.addDocument(doc(new Field[] { field("id", "1")
,
field("gender", "male"),
field("first", "james"),
field("last", "smith")
,
field("gender", "female"),
field("first", "sally"),
field("last", "jones") }));
writer.addDocument(doc(new Field[] { field("id", "2")
,
field("gender", "female"),
field("first", "greta"),
field("last", "jones")
,
field("gender", "female"),
field("first", "sally"),
field("last", "smith")
,
field("gender", "male"),
field("first", "james"),
field("last", "jones") }));
writer.addDocument(doc(new Field[] { field("id", "3")
,
field("gender", "female"),
field("first", "lisa"),
field("last", "jones")
,
field("gender", "male"),
field("first", "bob"),
field("last", "costas") }));
writer.addDocument(doc(new Field[] { field("id", "4")
,
field("gender", "female"),
field("first", "sally"),
field("last", "smith")
,
field("gender", "female"),
field("first", "linda"),
field("last", "dixit")
,
field("gender", "male"),
field("first", "bubba"),
field("last", "jones") }));
writer.close();
searcher = new IndexSearcher(directory);
}
public void tearDown() throws Exception {
super.tearDown();
searcher.close();
}
protected void check(SpanQuery q, int[] docs) throws Exception {
CheckHits.checkHitCollector(q, null, searcher, docs);
}
public void testRewrite0() throws Exception {
SpanQuery q = new FieldMaskingSpanQuery
(new SpanTermQuery(new Term("last", "sally")) , "first");
q.setBoost(8.7654321f);
SpanQuery qr = (SpanQuery) searcher.rewrite(q);
QueryUtils.checkEqual(q, qr);
assertEquals(1, qr.getTerms().size());
}
public void testRewrite1() throws Exception {
// mask an anon SpanQuery class that rewrites to something else.
SpanQuery q = new FieldMaskingSpanQuery
(new SpanTermQuery(new Term("last", "sally")) {
public Query rewrite(IndexReader reader) {
return new SpanOrQuery(new SpanQuery[] {
new SpanTermQuery(new Term("first", "sally")),
new SpanTermQuery(new Term("first", "james")) });
}
}, "first");
SpanQuery qr = (SpanQuery) searcher.rewrite(q);
QueryUtils.checkUnequal(q, qr);
assertEquals(2, qr.getTerms().size());
}
public void testRewrite2() throws Exception {
SpanQuery q1 = new SpanTermQuery(new Term("last", "smith"));
SpanQuery q2 = new SpanTermQuery(new Term("last", "jones"));
SpanQuery q = new SpanNearQuery(new SpanQuery[]
{ q1, new FieldMaskingSpanQuery(q2, "last")}, 1, true );
Query qr = searcher.rewrite(q);
QueryUtils.checkEqual(q, qr);
HashSet set = new HashSet();
qr.extractTerms(set);
assertEquals(2, set.size());
}
public void testEquality1() {
SpanQuery q1 = new FieldMaskingSpanQuery
(new SpanTermQuery(new Term("last", "sally")) , "first");
SpanQuery q2 = new FieldMaskingSpanQuery
(new SpanTermQuery(new Term("last", "sally")) , "first");
SpanQuery q3 = new FieldMaskingSpanQuery
(new SpanTermQuery(new Term("last", "sally")) , "XXXXX");
SpanQuery q4 = new FieldMaskingSpanQuery
(new SpanTermQuery(new Term("last", "XXXXX")) , "first");
SpanQuery q5 = new FieldMaskingSpanQuery
(new SpanTermQuery(new Term("xXXX", "sally")) , "first");
QueryUtils.checkEqual(q1, q2);
QueryUtils.checkUnequal(q1, q3);
QueryUtils.checkUnequal(q1, q4);
QueryUtils.checkUnequal(q1, q5);
SpanQuery qA = new FieldMaskingSpanQuery
(new SpanTermQuery(new Term("last", "sally")) , "first");
qA.setBoost(9f);
SpanQuery qB = new FieldMaskingSpanQuery
(new SpanTermQuery(new Term("last", "sally")) , "first");
QueryUtils.checkUnequal(qA, qB);
qB.setBoost(9f);
QueryUtils.checkEqual(qA, qB);
}
public void testNoop0() throws Exception {
SpanQuery q1 = new SpanTermQuery(new Term("last", "sally"));
SpanQuery q = new FieldMaskingSpanQuery(q1, "first");
check(q, new int[] { /* :EMPTY: */ });
}
public void testNoop1() throws Exception {
SpanQuery q1 = new SpanTermQuery(new Term("last", "smith"));
SpanQuery q2 = new SpanTermQuery(new Term("last", "jones"));
SpanQuery q = new SpanNearQuery(new SpanQuery[]
{ q1, new FieldMaskingSpanQuery(q2, "last")}, 0, true );
check(q, new int[] { 1, 2 });
q = new SpanNearQuery(new SpanQuery[]
{ new FieldMaskingSpanQuery(q1, "last"),
new FieldMaskingSpanQuery(q2, "last")}, 0, true );
check(q, new int[] { 1, 2 });
}
public void testSimple1() throws Exception {
SpanQuery q1 = new SpanTermQuery(new Term("first", "james"));
SpanQuery q2 = new SpanTermQuery(new Term("last", "jones"));
SpanQuery q = new SpanNearQuery(new SpanQuery[]
{ q1, new FieldMaskingSpanQuery(q2, "first")}, -1, false );
check(q, new int[] { 0, 2 });
q = new SpanNearQuery(new SpanQuery[]
{ new FieldMaskingSpanQuery(q2, "first"), q1}, -1, false );
check(q, new int[] { 0, 2 });
q = new SpanNearQuery(new SpanQuery[]
{ q2, new FieldMaskingSpanQuery(q1, "last")}, -1, false );
check(q, new int[] { 0, 2 });
q = new SpanNearQuery(new SpanQuery[]
{ new FieldMaskingSpanQuery(q1, "last"), q2}, -1, false );
check(q, new int[] { 0, 2 });
}
public void testSimple2() throws Exception {
SpanQuery q1 = new SpanTermQuery(new Term("gender", "female"));
SpanQuery q2 = new SpanTermQuery(new Term("last", "smith"));
SpanQuery q = new SpanNearQuery(new SpanQuery[]
{ q1, new FieldMaskingSpanQuery(q2, "gender")}, -1, false );
check(q, new int[] { 2, 4 });
q = new SpanNearQuery(new SpanQuery[]
{ new FieldMaskingSpanQuery(q1, "id"),
new FieldMaskingSpanQuery(q2, "id") }, -1, false );
check(q, new int[] { 2, 4 });
}
public void testSpans0() throws Exception {
SpanQuery q1 = new SpanTermQuery(new Term("gender", "female"));
SpanQuery q2 = new SpanTermQuery(new Term("first", "james"));
SpanQuery q = new SpanOrQuery(new SpanQuery[]
{ q1, new FieldMaskingSpanQuery(q2, "gender")});
check(q, new int[] { 0, 1, 2, 3, 4 });
Spans span = q.getSpans(searcher.getIndexReader());
assertEquals(true, span.next());
assertEquals(s(0,0,1), s(span));
assertEquals(true, span.next());
assertEquals(s(1,0,1), s(span));
assertEquals(true, span.next());
assertEquals(s(1,1,2), s(span));
assertEquals(true, span.next());
assertEquals(s(2,0,1), s(span));
assertEquals(true, span.next());
assertEquals(s(2,1,2), s(span));
assertEquals(true, span.next());
assertEquals(s(2,2,3), s(span));
assertEquals(true, span.next());
assertEquals(s(3,0,1), s(span));
assertEquals(true, span.next());
assertEquals(s(4,0,1), s(span));
assertEquals(true, span.next());
assertEquals(s(4,1,2), s(span));
assertEquals(false, span.next());
}
public void testSpans1() throws Exception {
SpanQuery q1 = new SpanTermQuery(new Term("first", "sally"));
SpanQuery q2 = new SpanTermQuery(new Term("first", "james"));
SpanQuery qA = new SpanOrQuery(new SpanQuery[] { q1, q2 });
SpanQuery qB = new FieldMaskingSpanQuery(qA, "id");
check(qA, new int[] { 0, 1, 2, 4 });
check(qB, new int[] { 0, 1, 2, 4 });
Spans spanA = qA.getSpans(searcher.getIndexReader());
Spans spanB = qB.getSpans(searcher.getIndexReader());
while (spanA.next()) {
assertTrue("spanB not still going", spanB.next());
assertEquals("spanA not equal spanB", s(spanA), s(spanB));
}
assertTrue("spanB still going even tough spanA is done", !(spanB.next()));
}
public void testSpans2() throws Exception {
SpanQuery qA1 = new SpanTermQuery(new Term("gender", "female"));
SpanQuery qA2 = new SpanTermQuery(new Term("first", "james"));
SpanQuery qA = new SpanOrQuery(new SpanQuery[]
{ qA1, new FieldMaskingSpanQuery(qA2, "gender")});
SpanQuery qB = new SpanTermQuery(new Term("last", "jones"));
SpanQuery q = new SpanNearQuery(new SpanQuery[]
{ new FieldMaskingSpanQuery(qA, "id"),
new FieldMaskingSpanQuery(qB, "id") }, -1, false );
check(q, new int[] { 0, 1, 2, 3 });
Spans span = q.getSpans(searcher.getIndexReader());
assertEquals(true, span.next());
assertEquals(s(0,0,1), s(span));
assertEquals(true, span.next());
assertEquals(s(1,1,2), s(span));
assertEquals(true, span.next());
assertEquals(s(2,0,1), s(span));
assertEquals(true, span.next());
assertEquals(s(2,2,3), s(span));
assertEquals(true, span.next());
assertEquals(s(3,0,1), s(span));
assertEquals(false, span.next());
}
public String s(Spans span) {
return s(span.doc(), span.start(), span.end());
}
public String s(int doc, int start, int end) {
return "s(" + doc + "," + start + "," + end +")";
}
}