From 2e1b9f5f16523cee3a3918732148b32d45694496 Mon Sep 17 00:00:00 2001 From: David Wayne Smiley Date: Fri, 26 Jul 2013 18:47:37 +0000 Subject: [PATCH] LUCENE-5091: add not-near capability to SpanNotQuery git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1507396 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 3 + .../lucene/search/spans/SpanNotQuery.java | 58 +++++++++++---- .../apache/lucene/search/spans/package.html | 4 +- .../lucene/search/spans/TestBasics.java | 71 +++++++++++++++++++ .../apache/lucene/search/spans/TestSpans.java | 51 ++++++++++++- 5 files changed, 170 insertions(+), 17 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index bfba78bbab0..72d8d278809 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -67,6 +67,9 @@ New features * LUCENE-5118: SpatialStrategy.makeDistanceValueSource() now has an optional multiplier for scaling degrees to another unit. (David Smiley) +* LUCENE-5091: SpanNotQuery can now be configured with pre and post slop to act + as a hypothetical SpanNotNearQuery. (Tim Allison via David Smiley) + Bug Fixes * LUCENE-5116: IndexWriter.addIndexes(IndexReader...) should drop empty (or all diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpanNotQuery.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpanNotQuery.java index 9bba5db67c3..7a7fcf91c80 100644 --- a/lucene/core/src/java/org/apache/lucene/search/spans/SpanNotQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpanNotQuery.java @@ -31,16 +31,36 @@ import java.util.Collection; import java.util.Map; import java.util.Set; -/** Removes matches which overlap with another SpanQuery. */ +/** Removes matches which overlap with another SpanQuery or + * within a x tokens before or y tokens after another SpanQuery. 
*/ public class SpanNotQuery extends SpanQuery implements Cloneable { private SpanQuery include; private SpanQuery exclude; + private final int pre; + private final int post; /** Construct a SpanNotQuery matching spans from include which * have no overlap with spans from exclude.*/ public SpanNotQuery(SpanQuery include, SpanQuery exclude) { + this(include, exclude, 0, 0); + } + + + /** Construct a SpanNotQuery matching spans from include which + * have no overlap with spans from exclude within + * dist tokens of include. */ + public SpanNotQuery(SpanQuery include, SpanQuery exclude, int dist) { + this(include, exclude, dist, dist); + } + + /** Construct a SpanNotQuery matching spans from include which + * have no overlap with spans from exclude within + * pre tokens before or post tokens of include. */ + public SpanNotQuery(SpanQuery include, SpanQuery exclude, int pre, int post) { this.include = include; this.exclude = exclude; + this.pre = (pre >=0) ? pre : 0; + this.post = (post >= 0) ? 
post : 0; if (!include.getField().equals(exclude.getField())) throw new IllegalArgumentException("Clauses must have same field."); @@ -65,6 +85,10 @@ public class SpanNotQuery extends SpanQuery implements Cloneable { buffer.append(include.toString(field)); buffer.append(", "); buffer.append(exclude.toString(field)); + buffer.append(", "); + buffer.append(Integer.toString(pre)); + buffer.append(", "); + buffer.append(Integer.toString(post)); buffer.append(")"); buffer.append(ToStringUtils.boost(getBoost())); return buffer.toString(); @@ -72,7 +96,8 @@ public class SpanNotQuery extends SpanQuery implements Cloneable { @Override public SpanNotQuery clone() { - SpanNotQuery spanNotQuery = new SpanNotQuery((SpanQuery)include.clone(),(SpanQuery) exclude.clone()); + SpanNotQuery spanNotQuery = new SpanNotQuery((SpanQuery)include.clone(), + (SpanQuery) exclude.clone(), pre, post); spanNotQuery.setBoost(getBoost()); return spanNotQuery; } @@ -98,13 +123,13 @@ public class SpanNotQuery extends SpanQuery implements Cloneable { while (moreExclude // while exclude is before && includeSpans.doc() == excludeSpans.doc() - && excludeSpans.end() <= includeSpans.start()) { + && excludeSpans.end() <= includeSpans.start() - pre) { moreExclude = excludeSpans.next(); // increment exclude } if (!moreExclude // if no intersection || includeSpans.doc() != excludeSpans.doc() - || includeSpans.end() <= excludeSpans.start()) + || includeSpans.end()+post <= excludeSpans.start()) break; // we found a match moreInclude = includeSpans.next(); // intersected: keep scanning @@ -126,13 +151,13 @@ public class SpanNotQuery extends SpanQuery implements Cloneable { while (moreExclude // while exclude is before && includeSpans.doc() == excludeSpans.doc() - && excludeSpans.end() <= includeSpans.start()) { + && excludeSpans.end() <= includeSpans.start()-pre) { moreExclude = excludeSpans.next(); // increment exclude } if (!moreExclude // if no intersection || includeSpans.doc() != excludeSpans.doc() - || 
includeSpans.end() <= excludeSpans.start()) + || includeSpans.end()+post <= excludeSpans.start()) return true; // we found a match return next(); // scan to next match @@ -199,23 +224,28 @@ public class SpanNotQuery extends SpanQuery implements Cloneable { /** Returns true iff o is equal to this. */ @Override public boolean equals(Object o) { - if (this == o) return true; - if (!(o instanceof SpanNotQuery)) return false; + if (!super.equals(o)) + return false; SpanNotQuery other = (SpanNotQuery)o; return this.include.equals(other.include) && this.exclude.equals(other.exclude) - && this.getBoost() == other.getBoost(); + && this.pre == other.pre + && this.post == other.post; } @Override public int hashCode() { - int h = include.hashCode(); - h = (h<<1) | (h >>> 31); // rotate left + int h = super.hashCode(); + h = Integer.rotateLeft(h, 1); + h ^= include.hashCode(); + h = Integer.rotateLeft(h, 1); h ^= exclude.hashCode(); - h = (h<<1) | (h >>> 31); // rotate left - h ^= Float.floatToRawIntBits(getBoost()); + h = Integer.rotateLeft(h, 1); + h ^= pre; + h = Integer.rotateLeft(h, 1); + h ^= post; return h; } -} +} \ No newline at end of file diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/package.html b/lucene/core/src/java/org/apache/lucene/search/spans/package.html index 3e43d3bd4f6..054336e34a2 100644 --- a/lucene/core/src/java/org/apache/lucene/search/spans/package.html +++ b/lucene/core/src/java/org/apache/lucene/search/spans/package.html @@ -38,8 +38,8 @@ and inter-phrase proximity (when constructed from other {@link org.apache.lucene number of other {@link org.apache.lucene.search.spans.SpanQuery}s.
  • A {@link org.apache.lucene.search.spans.SpanNotQuery SpanNotQuery} removes spans -matching one {@link org.apache.lucene.search.spans.SpanQuery SpanQuery} which overlap -another. This can be used, e.g., to implement within-paragraph +matching one {@link org.apache.lucene.search.spans.SpanQuery SpanQuery} which overlap (or come +near) another. This can be used, e.g., to implement within-paragraph search.
  • A {@link org.apache.lucene.search.spans.SpanFirstQuery SpanFirstQuery} matches spans diff --git a/lucene/core/src/test/org/apache/lucene/search/spans/TestBasics.java b/lucene/core/src/test/org/apache/lucene/search/spans/TestBasics.java index 82ec34e8d86..1b00341f282 100644 --- a/lucene/core/src/test/org/apache/lucene/search/spans/TestBasics.java +++ b/lucene/core/src/test/org/apache/lucene/search/spans/TestBasics.java @@ -363,6 +363,77 @@ public class TestBasics extends LuceneTestCase { 1847, 1848, 1849, 1940, 1941, 1942, 1943, 1944, 1945, 1946, 1947, 1948, 1949}); } + @Test + public void testSpanNotWindowOne() throws Exception { + SpanTermQuery term1 = new SpanTermQuery(new Term("field", "eight")); + SpanTermQuery term2 = new SpanTermQuery(new Term("field", "forty")); + SpanNearQuery near = new SpanNearQuery(new SpanQuery[] {term1, term2}, + 4, true); + SpanTermQuery term3 = new SpanTermQuery(new Term("field", "one")); + SpanNotQuery query = new SpanNotQuery(near, term3, 1, 1); + + checkHits(query, new int[] + {840, 842, 843, 844, 845, 846, 847, 848, 849, + 1840, 1842, 1843, 1844, 1845, 1846, 1847, 1848, 1849}); + + assertTrue(searcher.explain(query, 840).getValue() > 0.0f); + assertTrue(searcher.explain(query, 1842).getValue() > 0.0f); + } + + @Test + public void testSpanNotWindowTwoBefore() throws Exception { + SpanTermQuery term1 = new SpanTermQuery(new Term("field", "eight")); + SpanTermQuery term2 = new SpanTermQuery(new Term("field", "forty")); + SpanNearQuery near = new SpanNearQuery(new SpanQuery[] {term1, term2}, + 4, true); + SpanTermQuery term3 = new SpanTermQuery(new Term("field", "one")); + SpanNotQuery query = new SpanNotQuery(near, term3, 2, 0); + + checkHits(query, new int[] + {840, 841, 842, 843, 844, 845, 846, 847, 848, 849}); + + assertTrue(searcher.explain(query, 840).getValue() > 0.0f); + assertTrue(searcher.explain(query, 849).getValue() > 0.0f); + } + + @Test + public void testSpanNotWindowNeg() throws Exception { + //test handling of 
invalid window < 0 + SpanTermQuery term1 = new SpanTermQuery(new Term("field", "eight")); + SpanTermQuery term2 = new SpanTermQuery(new Term("field", "one")); + SpanNearQuery near = new SpanNearQuery(new SpanQuery[] {term1, term2}, + 4, true); + SpanTermQuery term3 = new SpanTermQuery(new Term("field", "forty")); + + SpanOrQuery or = new SpanOrQuery(term3); + + SpanNotQuery query = new SpanNotQuery(near, or); + + checkHits(query, new int[] + {801, 821, 831, 851, 861, 871, 881, 891, + 1801, 1821, 1831, 1851, 1861, 1871, 1881, 1891}); + + assertTrue(searcher.explain(query, 801).getValue() > 0.0f); + assertTrue(searcher.explain(query, 891).getValue() > 0.0f); + } + + @Test + public void testSpanNotWindowDoubleExcludesBefore() throws Exception { + //test hitting two excludes before an include + SpanTermQuery term1 = new SpanTermQuery(new Term("field", "forty")); + SpanTermQuery term2 = new SpanTermQuery(new Term("field", "two")); + SpanNearQuery near = new SpanNearQuery(new SpanTermQuery[]{term1, term2}, 2, true); + SpanTermQuery exclude = new SpanTermQuery(new Term("field", "one")); + + SpanNotQuery query = new SpanNotQuery(near, exclude, 4, 1); + + checkHits(query, new int[] + {42, 242, 342, 442, 542, 642, 742, 842, 942}); + + assertTrue(searcher.explain(query, 242).getValue() > 0.0f); + assertTrue(searcher.explain(query, 942).getValue() > 0.0f); + } + @Test public void testSpanFirst() throws Exception { SpanTermQuery term1 = new SpanTermQuery(new Term("field", "five")); diff --git a/lucene/core/src/test/org/apache/lucene/search/spans/TestSpans.java b/lucene/core/src/test/org/apache/lucene/search/spans/TestSpans.java index 61495de661c..30e8a9e3297 100644 --- a/lucene/core/src/test/org/apache/lucene/search/spans/TestSpans.java +++ b/lucene/core/src/test/org/apache/lucene/search/spans/TestSpans.java @@ -84,7 +84,8 @@ public class TestSpans extends LuceneTestCase { "u2 xx u1 u2", "u2 u1 xx u2", "u1 u2 xx u2", - "t1 t2 t1 t3 t2 t3" + "t1 t2 t1 t3 t2 t3", + "s2 s1 s1 xx 
xx s2 xx s2 xx s1 xx xx xx xx xx s2 xx" }; public SpanTermQuery makeSpanTermQuery(String text) { @@ -502,4 +503,52 @@ public class TestSpans extends LuceneTestCase { reader.close(); dir.close(); } + + + public void testSpanNots() throws Throwable{ + assertEquals("SpanNotIncludeExcludeSame1", 0, spanCount("s2", "s2", 0, 0), 0); + assertEquals("SpanNotIncludeExcludeSame2", 0, spanCount("s2", "s2", 10, 10), 0); + + //focus on behind + assertEquals("SpanNotS2NotS1_6_0", 1, spanCount("s2", "s1", 6, 0)); + assertEquals("SpanNotS2NotS1_5_0", 2, spanCount("s2", "s1", 5, 0)); + assertEquals("SpanNotS2NotS1_3_0", 3, spanCount("s2", "s1", 3, 0)); + assertEquals("SpanNotS2NotS1_2_0", 4, spanCount("s2", "s1", 2, 0)); + assertEquals("SpanNotS2NotS1_0_0", 4, spanCount("s2", "s1", 0, 0)); + + //focus on both + assertEquals("SpanNotS2NotS1_3_1", 2, spanCount("s2", "s1", 3, 1)); + assertEquals("SpanNotS2NotS1_2_1", 3, spanCount("s2", "s1", 2, 1)); + assertEquals("SpanNotS2NotS1_1_1", 3, spanCount("s2", "s1", 1, 1)); + assertEquals("SpanNotS2NotS1_10_10", 0, spanCount("s2", "s1", 10, 10)); + + //focus on ahead + assertEquals("SpanNotS1NotS2_10_10", 0, spanCount("s1", "s2", 10, 10)); + assertEquals("SpanNotS1NotS2_0_1", 3, spanCount("s1", "s2", 0, 1)); + assertEquals("SpanNotS1NotS2_0_2", 3, spanCount("s1", "s2", 0, 2)); + assertEquals("SpanNotS1NotS2_0_3", 2, spanCount("s1", "s2", 0, 3)); + assertEquals("SpanNotS1NotS2_0_4", 1, spanCount("s1", "s2", 0, 4)); + assertEquals("SpanNotS1NotS2_0_8", 0, spanCount("s1", "s2", 0, 8)); + + //exclude doesn't exist + assertEquals("SpanNotS1NotS3_8_8", 3, spanCount("s1", "s3", 8, 8)); + + //include doesn't exist + assertEquals("SpanNotS3NotS1_8_8", 0, spanCount("s3", "s1", 8, 8)); + + } + + private int spanCount(String include, String exclude, int pre, int post) throws IOException{ + SpanTermQuery iq = new SpanTermQuery(new Term(field, include)); + SpanTermQuery eq = new SpanTermQuery(new Term(field, exclude)); + SpanNotQuery snq = new 
SpanNotQuery(iq, eq, pre, post); + Spans spans = MultiSpansWrapper.wrap(searcher.getTopReaderContext(), snq); + + int i = 0; + while (spans.next()){ + i++; + } + return i; + } + }