From 582356dc2ec71927d5d770b725f154585ed2cc05 Mon Sep 17 00:00:00 2001 From: "Chris M. Hostetter" Date: Fri, 1 May 2009 19:14:31 +0000 Subject: [PATCH] LUCENE-1494: Added FieldMaskingSpanQuery which can be used to cross-correlate Spans from different fields git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@770794 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 4 + .../search/spans/FieldMaskingSpanQuery.java | 158 ++++++++ .../spans/TestFieldMaskingSpanQuery.java | 346 ++++++++++++++++++ 3 files changed, 508 insertions(+) create mode 100644 src/java/org/apache/lucene/search/spans/FieldMaskingSpanQuery.java create mode 100644 src/test/org/apache/lucene/search/spans/TestFieldMaskingSpanQuery.java diff --git a/CHANGES.txt b/CHANGES.txt index dcca9f09d1f..01e879f4a0f 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -283,6 +283,10 @@ Bug fixes 22. LUCENE-1605: Added BitVector.subset(). (Jeremy Volkman via Mike McCandless) +23. LUCENE-1494: Added FieldMaskingSpanQuery which can be used to + cross-correlate Spans from different fields. + (Paul Cowan and Chris Hostetter) + Optimizations 1. LUCENE-1427: Fixed QueryWrapperFilter to not waste time computing diff --git a/src/java/org/apache/lucene/search/spans/FieldMaskingSpanQuery.java b/src/java/org/apache/lucene/search/spans/FieldMaskingSpanQuery.java new file mode 100644 index 00000000000..6bf8b906203 --- /dev/null +++ b/src/java/org/apache/lucene/search/spans/FieldMaskingSpanQuery.java @@ -0,0 +1,158 @@ +package org.apache.lucene.search.spans; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; +import java.util.Collection; +import java.util.Set; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.Searcher; +import org.apache.lucene.search.Similarity; +import org.apache.lucene.search.Weight; +import org.apache.lucene.util.ToStringUtils; + +/** + *

Wrapper to allow {@link SpanQuery} objects participate in composite + * single-field SpanQueries by 'lying' about their search field. That is, + * the masked SpanQuery will function as normal, + * but {@link SpanQuery#getField()} simply hands back the value supplied + * in this class's constructor.

+ * + *

This can be used to support Queries like {@link SpanNearQuery} or + * {@link SpanOrQuery} across different fields, which is not ordinarily + * permitted.

+ * + *

This can be useful for denormalized relational data: for example, when + * indexing a document with conceptually many 'children':

+ * + *
+ *  teacherid: 1
+ *  studentfirstname: james
+ *  studentsurname: jones
+ *  
+ *  teacherid: 2
+ *  studenfirstname: james
+ *  studentsurname: smith
+ *  studentfirstname: sally
+ *  studentsurname: jones
+ * 
+ * + *

a SpanNearQuery with a slop of 0 can be applied across two + * {@link SpanTermQuery} objects as follows: + *

+ *    SpanQuery q1  = new SpanTermQuery(new Term("studentfirstname", "james"));
+ *    SpanQuery q2  = new SpanTermQuery(new Term("studentsurname", "jones"));
+ *    SpanQuery q2m new FieldMaskingSpanQuery(q2, "studentfirstname");
+ *    Query q = new SpanNearQuery(new SpanQuery[]{q1, q2m}, -1, false);
+ * 
+ * to search for 'studentfirstname:james studentsurname:jones' and find + * teacherid 1 without matching teacherid 2 (which has a 'james' in position 0 + * and 'jones' in position 1).

+ * + *

Note: as {@link #getField()} returns the masked field, scoring will be + * done using the norms of the field name supplied. This may lead to unexpected + * scoring behaviour.

+ */ +public class FieldMaskingSpanQuery extends SpanQuery { + private SpanQuery maskedQuery; + private String field; + + public FieldMaskingSpanQuery(SpanQuery maskedQuery, String maskedField) { + this.maskedQuery = maskedQuery; + this.field = maskedField; + } + + public String getField() { + return field; + } + + public SpanQuery getMaskedQuery() { + return maskedQuery; + } + + // :NOTE: getBoost and setBoost are not proxied to the maskedQuery + // ...this is done to be more consistent with thigns like SpanFirstQuery + + public Spans getSpans(IndexReader reader) throws IOException { + return maskedQuery.getSpans(reader); + } + public PayloadSpans getPayloadSpans(IndexReader reader) throws IOException { + return maskedQuery.getPayloadSpans(reader); + } + + public Collection getTerms() { + return maskedQuery.getTerms(); + } + + public void extractTerms(Set terms) { + maskedQuery.extractTerms(terms); + } + + protected Weight createWeight(Searcher searcher) throws IOException { + return maskedQuery.createWeight(searcher); + } + + public Similarity getSimilarity(Searcher searcher) { + return maskedQuery.getSimilarity(searcher); + } + + public Query rewrite(IndexReader reader) throws IOException { + FieldMaskingSpanQuery clone = null; + + SpanQuery rewritten = (SpanQuery) maskedQuery.rewrite(reader); + if (rewritten != maskedQuery) { + clone = (FieldMaskingSpanQuery) this.clone(); + clone.maskedQuery = rewritten; + } + + if (clone != null) { + return clone; + } else { + return this; + } + } + + public String toString(String field) { + StringBuffer buffer = new StringBuffer(); + buffer.append("mask("); + buffer.append(maskedQuery.toString(field)); + buffer.append(")"); + buffer.append(ToStringUtils.boost(getBoost())); + buffer.append(" as "); + buffer.append(this.field); + return buffer.toString(); + } + + public boolean equals(Object o) { + if (!(o instanceof FieldMaskingSpanQuery)) + return false; + FieldMaskingSpanQuery other = (FieldMaskingSpanQuery) o; + return (this.getField().equals(other.getField()) + && (this.getBoost() == other.getBoost()) + && this.getMaskedQuery().equals(other.getMaskedQuery())); + + } + + public int hashCode() { + return getMaskedQuery().hashCode() + ^ getField().hashCode() + ^ Float.floatToRawIntBits(getBoost()); + } +} diff --git a/src/test/org/apache/lucene/search/spans/TestFieldMaskingSpanQuery.java b/src/test/org/apache/lucene/search/spans/TestFieldMaskingSpanQuery.java new file mode 100644 index 00000000000..467b69c1c63 --- /dev/null +++ b/src/test/org/apache/lucene/search/spans/TestFieldMaskingSpanQuery.java @@ -0,0 +1,346 @@ +package org.apache.lucene.search.spans; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.Scorer; +import org.apache.lucene.search.Weight; +import org.apache.lucene.search.CheckHits; +import org.apache.lucene.search.QueryUtils; + +import org.apache.lucene.store.RAMDirectory; + +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; + +import org.apache.lucene.analysis.WhitespaceAnalyzer; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; + +import org.apache.lucene.util.LuceneTestCase; + +import java.util.HashSet; + +public class TestFieldMaskingSpanQuery extends LuceneTestCase { + + protected static Document doc(Field[] fields) { + Document doc = new Document(); + for (int i = 0; i < fields.length; i++) { + doc.add(fields[i]); + } + return doc; + } + + protected static Field field(String name, String value) { + return new Field(name, value, Field.Store.NO, Field.Index.ANALYZED); + } + + protected IndexSearcher searcher; + + public void setUp() throws Exception { + super.setUp(); + RAMDirectory directory = new RAMDirectory(); + IndexWriter writer= new IndexWriter(directory, + new WhitespaceAnalyzer(), true, + IndexWriter.MaxFieldLength.LIMITED); + + writer.addDocument(doc(new Field[] { field("id", "0") + , + field("gender", "male"), + field("first", "james"), + field("last", "jones") })); + + writer.addDocument(doc(new Field[] { field("id", "1") + , + field("gender", "male"), + field("first", "james"), + field("last", "smith") + , + field("gender", "female"), + field("first", "sally"), + field("last", "jones") })); + + writer.addDocument(doc(new Field[] { field("id", "2") + , + field("gender", "female"), + field("first", "greta"), + field("last", "jones") + , + field("gender", "female"), + field("first", "sally"), + field("last", "smith") + , + field("gender", "male"), + field("first", "james"), + field("last", "jones") })); + + writer.addDocument(doc(new Field[] { field("id", "3") + , + field("gender", "female"), + field("first", "lisa"), + field("last", "jones") + , + field("gender", "male"), + field("first", "bob"), + field("last", "costas") })); + + writer.addDocument(doc(new Field[] { field("id", "4") + , + field("gender", "female"), + field("first", "sally"), + field("last", "smith") + , + field("gender", "female"), + field("first", "linda"), + field("last", "dixit") + , + field("gender", "male"), + field("first", "bubba"), + field("last", "jones") })); + + writer.close(); + searcher = new IndexSearcher(directory); + } + + public void tearDown() throws Exception { + super.tearDown(); + searcher.close(); + } + + protected void check(SpanQuery q, int[] docs) throws Exception { + CheckHits.checkHitCollector(q, null, searcher, docs); + } + + public void testRewrite0() throws Exception { + SpanQuery q = new FieldMaskingSpanQuery + (new SpanTermQuery(new Term("last", "sally")) , "first"); + q.setBoost(8.7654321f); + SpanQuery qr = (SpanQuery) searcher.rewrite(q); + + QueryUtils.checkEqual(q, qr); + + assertEquals(1, qr.getTerms().size()); + } + + public void testRewrite1() throws Exception { + // mask an anon SpanQuery class that rewrites to something else. + SpanQuery q = new FieldMaskingSpanQuery + (new SpanTermQuery(new Term("last", "sally")) { + public Query rewrite(IndexReader reader) { + return new SpanOrQuery(new SpanQuery[] { + new SpanTermQuery(new Term("first", "sally")), + new SpanTermQuery(new Term("first", "james")) }); + } + }, "first"); + + SpanQuery qr = (SpanQuery) searcher.rewrite(q); + + QueryUtils.checkUnequal(q, qr); + + assertEquals(2, qr.getTerms().size()); + } + + public void testRewrite2() throws Exception { + SpanQuery q1 = new SpanTermQuery(new Term("last", "smith")); + SpanQuery q2 = new SpanTermQuery(new Term("last", "jones")); + SpanQuery q = new SpanNearQuery(new SpanQuery[] + { q1, new FieldMaskingSpanQuery(q2, "last")}, 1, true ); + Query qr = searcher.rewrite(q); + + QueryUtils.checkEqual(q, qr); + + HashSet set = new HashSet(); + qr.extractTerms(set); + assertEquals(2, set.size()); + } + + public void testEquality1() { + SpanQuery q1 = new FieldMaskingSpanQuery + (new SpanTermQuery(new Term("last", "sally")) , "first"); + SpanQuery q2 = new FieldMaskingSpanQuery + (new SpanTermQuery(new Term("last", "sally")) , "first"); + SpanQuery q3 = new FieldMaskingSpanQuery + (new SpanTermQuery(new Term("last", "sally")) , "XXXXX"); + SpanQuery q4 = new FieldMaskingSpanQuery + (new SpanTermQuery(new Term("last", "XXXXX")) , "first"); + SpanQuery q5 = new FieldMaskingSpanQuery + (new SpanTermQuery(new Term("xXXX", "sally")) , "first"); + QueryUtils.checkEqual(q1, q2); + QueryUtils.checkUnequal(q1, q3); + QueryUtils.checkUnequal(q1, q4); + QueryUtils.checkUnequal(q1, q5); + + SpanQuery qA = new FieldMaskingSpanQuery + (new SpanTermQuery(new Term("last", "sally")) , "first"); + qA.setBoost(9f); + SpanQuery qB = new FieldMaskingSpanQuery + (new SpanTermQuery(new Term("last", "sally")) , "first"); + QueryUtils.checkUnequal(qA, qB); + qB.setBoost(9f); + QueryUtils.checkEqual(qA, qB); + + } + + public void testNoop0() throws Exception { + SpanQuery q1 = new SpanTermQuery(new Term("last", "sally")); + SpanQuery q = new FieldMaskingSpanQuery(q1, "first"); + check(q, new int[] { /* :EMPTY: */ }); + } + public void testNoop1() throws Exception { + SpanQuery q1 = new SpanTermQuery(new Term("last", "smith")); + SpanQuery q2 = new SpanTermQuery(new Term("last", "jones")); + SpanQuery q = new SpanNearQuery(new SpanQuery[] + { q1, new FieldMaskingSpanQuery(q2, "last")}, 0, true ); + check(q, new int[] { 1, 2 }); + q = new SpanNearQuery(new SpanQuery[] + { new FieldMaskingSpanQuery(q1, "last"), + new FieldMaskingSpanQuery(q2, "last")}, 0, true ); + check(q, new int[] { 1, 2 }); + } + + public void testSimple1() throws Exception { + SpanQuery q1 = new SpanTermQuery(new Term("first", "james")); + SpanQuery q2 = new SpanTermQuery(new Term("last", "jones")); + SpanQuery q = new SpanNearQuery(new SpanQuery[] + { q1, new FieldMaskingSpanQuery(q2, "first")}, -1, false ); + check(q, new int[] { 0, 2 }); + q = new SpanNearQuery(new SpanQuery[] + { new FieldMaskingSpanQuery(q2, "first"), q1}, -1, false ); + check(q, new int[] { 0, 2 }); + q = new SpanNearQuery(new SpanQuery[] + { q2, new FieldMaskingSpanQuery(q1, "last")}, -1, false ); + check(q, new int[] { 0, 2 }); + q = new SpanNearQuery(new SpanQuery[] + { new FieldMaskingSpanQuery(q1, "last"), q2}, -1, false ); + check(q, new int[] { 0, 2 }); + + } + + public void testSimple2() throws Exception { + SpanQuery q1 = new SpanTermQuery(new Term("gender", "female")); + SpanQuery q2 = new SpanTermQuery(new Term("last", "smith")); + SpanQuery q = new SpanNearQuery(new SpanQuery[] + { q1, new FieldMaskingSpanQuery(q2, "gender")}, -1, false ); + check(q, new int[] { 2, 4 }); + q = new SpanNearQuery(new SpanQuery[] + { new FieldMaskingSpanQuery(q1, "id"), + new FieldMaskingSpanQuery(q2, "id") }, -1, false ); + check(q, new int[] { 2, 4 }); + } + + public void testSpans0() throws Exception { + SpanQuery q1 = new SpanTermQuery(new Term("gender", "female")); + SpanQuery q2 = new SpanTermQuery(new Term("first", "james")); + SpanQuery q = new SpanOrQuery(new SpanQuery[] + { q1, new FieldMaskingSpanQuery(q2, "gender")}); + check(q, new int[] { 0, 1, 2, 3, 4 }); + + Spans span = q.getSpans(searcher.getIndexReader()); + + assertEquals(true, span.next()); + assertEquals(s(0,0,1), s(span)); + + assertEquals(true, span.next()); + assertEquals(s(1,0,1), s(span)); + + assertEquals(true, span.next()); + assertEquals(s(1,1,2), s(span)); + + assertEquals(true, span.next()); + assertEquals(s(2,0,1), s(span)); + + assertEquals(true, span.next()); + assertEquals(s(2,1,2), s(span)); + + assertEquals(true, span.next()); + assertEquals(s(2,2,3), s(span)); + + assertEquals(true, span.next()); + assertEquals(s(3,0,1), s(span)); + + assertEquals(true, span.next()); + assertEquals(s(4,0,1), s(span)); + + assertEquals(true, span.next()); + assertEquals(s(4,1,2), s(span)); + + assertEquals(false, span.next()); + } + + public void testSpans1() throws Exception { + SpanQuery q1 = new SpanTermQuery(new Term("first", "sally")); + SpanQuery q2 = new SpanTermQuery(new Term("first", "james")); + SpanQuery qA = new SpanOrQuery(new SpanQuery[] { q1, q2 }); + SpanQuery qB = new FieldMaskingSpanQuery(qA, "id"); + + check(qA, new int[] { 0, 1, 2, 4 }); + check(qB, new int[] { 0, 1, 2, 4 }); + + Spans spanA = qA.getSpans(searcher.getIndexReader()); + Spans spanB = qB.getSpans(searcher.getIndexReader()); + + while (spanA.next()) { + assertTrue("spanB not still going", spanB.next()); + assertEquals("spanA not equal spanB", s(spanA), s(spanB)); + } + assertTrue("spanB still going even tough spanA is done", !(spanB.next())); + + } + + public void testSpans2() throws Exception { + SpanQuery qA1 = new SpanTermQuery(new Term("gender", "female")); + SpanQuery qA2 = new SpanTermQuery(new Term("first", "james")); + SpanQuery qA = new SpanOrQuery(new SpanQuery[] + { qA1, new FieldMaskingSpanQuery(qA2, "gender")}); + SpanQuery qB = new SpanTermQuery(new Term("last", "jones")); + SpanQuery q = new SpanNearQuery(new SpanQuery[] + { new FieldMaskingSpanQuery(qA, "id"), + new FieldMaskingSpanQuery(qB, "id") }, -1, false ); + check(q, new int[] { 0, 1, 2, 3 }); + + Spans span = q.getSpans(searcher.getIndexReader()); + + assertEquals(true, span.next()); + assertEquals(s(0,0,1), s(span)); + + assertEquals(true, span.next()); + assertEquals(s(1,1,2), s(span)); + + assertEquals(true, span.next()); + assertEquals(s(2,0,1), s(span)); + + assertEquals(true, span.next()); + assertEquals(s(2,2,3), s(span)); + + assertEquals(true, span.next()); + assertEquals(s(3,0,1), s(span)); + + assertEquals(false, span.next()); + } + + public String s(Spans span) { + return s(span.doc(), span.start(), span.end()); + } + public String s(int doc, int start, int end) { + return "s(" + doc + "," + start + "," + end +")"; + } + +}