From 750da7c5f7af74f2d9db36265639d7ae9dc9e4eb Mon Sep 17 00:00:00 2001 From: David Smiley Date: Tue, 8 Nov 2016 12:45:23 -0500 Subject: [PATCH] LUCENE-7431: SpanNotQuery should support negative pre/post distance for overlap --- lucene/CHANGES.txt | 4 + .../lucene/search/spans/SpanNotQuery.java | 14 ++-- .../lucene/search/spans/TestBasics.java | 30 ++++++- .../apache/lucene/search/spans/TestSpans.java | 79 +++++++++++-------- 4 files changed, 84 insertions(+), 43 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 805fc7e4ca3..0ccb5ee8c59 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -63,6 +63,10 @@ Improvements PhraseQuery or MultiPhraseQuery when the word automaton is simple (Mike McCandless) +* LUCENE-7431: Allow a certain amount of overlap to be specified between the include + and exclude arguments of SpanNotQuery via negative pre and/or post arguments. + (Marc Morissette via David Smiley) + ======================= Lucene 6.3.0 ======================= API Changes diff --git a/lucene/core/src/java/org/apache/lucene/search/spans/SpanNotQuery.java b/lucene/core/src/java/org/apache/lucene/search/spans/SpanNotQuery.java index 05d3f8ef481..00bcc4c1ac7 100644 --- a/lucene/core/src/java/org/apache/lucene/search/spans/SpanNotQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/spans/SpanNotQuery.java @@ -49,19 +49,23 @@ public final class SpanNotQuery extends SpanQuery { /** Construct a SpanNotQuery matching spans from include which * have no overlap with spans from exclude within - * dist tokens of include. */ + * dist tokens of include. Inversely, a negative + * dist value may be used to specify a certain amount of allowable + * overlap. */ public SpanNotQuery(SpanQuery include, SpanQuery exclude, int dist) { this(include, exclude, dist, dist); } /** Construct a SpanNotQuery matching spans from include which * have no overlap with spans from exclude within - * pre tokens before or post tokens of include. 
*/ + * pre tokens before or post tokens of + * include. Inversely, negative values for pre and/or + * post allow a certain amount of overlap to occur. */ public SpanNotQuery(SpanQuery include, SpanQuery exclude, int pre, int post) { this.include = Objects.requireNonNull(include); this.exclude = Objects.requireNonNull(exclude); - this.pre = (pre >=0) ? pre : 0; - this.post = (post >= 0) ? post : 0; + this.pre = pre; + this.post = post; if (include.getField() != null && exclude.getField() != null && !include.getField().equals(exclude.getField())) throw new IllegalArgumentException("Clauses must have same field."); @@ -226,4 +230,4 @@ public final class SpanNotQuery extends SpanQuery { return h; } -} \ No newline at end of file +} diff --git a/lucene/core/src/test/org/apache/lucene/search/spans/TestBasics.java b/lucene/core/src/test/org/apache/lucene/search/spans/TestBasics.java index b18a38df2d5..d699719e478 100644 --- a/lucene/core/src/test/org/apache/lucene/search/spans/TestBasics.java +++ b/lucene/core/src/test/org/apache/lucene/search/spans/TestBasics.java @@ -274,20 +274,42 @@ public class TestBasics extends LuceneTestCase { assertTrue(searcher.explain(query, 849).getValue() > 0.0f); } - public void testSpanNotWindowNeg() throws Exception { + public void testSpanNotWindowNegPost() throws Exception { //test handling of invalid window < 0 SpanQuery near = spanNearOrderedQuery("field", 4, "eight", "one"); SpanQuery or = spanOrQuery("field", "forty"); - SpanQuery query = spanNotQuery(near, or); - + SpanQuery query = spanNotQuery(near, or, 0, -1); checkHits(query, new int[] {801, 821, 831, 851, 861, 871, 881, 891, 1801, 1821, 1831, 1851, 1861, 1871, 1881, 1891}); + query = spanNotQuery(near, or, 0, -2); + checkHits(query, new int[] + {801, 821, 831, 841, 851, 861, 871, 881, 891, + 1801, 1821, 1831, 1841, 1851, 1861, 1871, 1881, 1891}); + assertTrue(searcher.explain(query, 801).getValue() > 0.0f); assertTrue(searcher.explain(query, 891).getValue() > 0.0f); } - + + 
public void testSpanNotWindowNegPre() throws Exception { + //test negative pre, which is no longer clamped to 0 and allows some overlap between include and exclude + SpanQuery near = spanNearOrderedQuery("field", 4, "eight", "one"); + SpanQuery or = spanOrQuery("field", "forty"); + SpanQuery query = spanNotQuery(near, or, -2, 0); + checkHits(query, new int[] + {801, 821, 831, 851, 861, 871, 881, 891, + 1801, 1821, 1831, 1851, 1861, 1871, 1881, 1891}); + + query = spanNotQuery(near, or, -3, 0); + checkHits(query, new int[] + {801, 821, 831, 841, 851, 861, 871, 881, 891, + 1801, 1821, 1831, 1841, 1851, 1861, 1871, 1881, 1891}); + + assertTrue(searcher.explain(query, 801).getValue() > 0.0f); + assertTrue(searcher.explain(query, 891).getValue() > 0.0f); + } + public void testSpanNotWindowDoubleExcludesBefore() throws Exception { //test hitting two excludes before an include SpanQuery near = spanNearOrderedQuery("field", 2, "forty", "two"); diff --git a/lucene/core/src/test/org/apache/lucene/search/spans/TestSpans.java b/lucene/core/src/test/org/apache/lucene/search/spans/TestSpans.java index 2d5e05cf8e5..2b5b919f385 100644 --- a/lucene/core/src/test/org/apache/lucene/search/spans/TestSpans.java +++ b/lucene/core/src/test/org/apache/lucene/search/spans/TestSpans.java @@ -99,7 +99,6 @@ public class TestSpans extends LuceneTestCase { "s2 s1 s1 xx xx s2 xx s2 xx s1 xx xx xx xx xx s2 xx", "r1 s11", "r1 s21" - }; private void checkHits(Query query, int[] results) throws IOException { @@ -406,42 +405,54 @@ public class TestSpans extends LuceneTestCase { } - - public void testSpanNots() throws Throwable{ - assertEquals("SpanNotIncludeExcludeSame1", 0, spanCount("s2", "s2", 0, 0), 0); - assertEquals("SpanNotIncludeExcludeSame2", 0, spanCount("s2", "s2", 10, 10), 0); - - //focus on behind - assertEquals("SpanNotS2NotS1_6_0", 1, spanCount("s2", "s1", 6, 0)); - assertEquals("SpanNotS2NotS1_5_0", 2, spanCount("s2", "s1", 5, 0)); - assertEquals("SpanNotS2NotS1_3_0", 3, spanCount("s2", "s1", 3, 0)); - assertEquals("SpanNotS2NotS1_2_0", 4, 
spanCount("s2", "s1", 2, 0)); - assertEquals("SpanNotS2NotS1_0_0", 4, spanCount("s2", "s1", 0, 0)); - - //focus on both - assertEquals("SpanNotS2NotS1_3_1", 2, spanCount("s2", "s1", 3, 1)); - assertEquals("SpanNotS2NotS1_2_1", 3, spanCount("s2", "s1", 2, 1)); - assertEquals("SpanNotS2NotS1_1_1", 3, spanCount("s2", "s1", 1, 1)); - assertEquals("SpanNotS2NotS1_10_10", 0, spanCount("s2", "s1", 10, 10)); - - //focus on ahead - assertEquals("SpanNotS1NotS2_10_10", 0, spanCount("s1", "s2", 10, 10)); - assertEquals("SpanNotS1NotS2_0_1", 3, spanCount("s1", "s2", 0, 1)); - assertEquals("SpanNotS1NotS2_0_2", 3, spanCount("s1", "s2", 0, 2)); - assertEquals("SpanNotS1NotS2_0_3", 2, spanCount("s1", "s2", 0, 3)); - assertEquals("SpanNotS1NotS2_0_4", 1, spanCount("s1", "s2", 0, 4)); - assertEquals("SpanNotS1NotS2_0_8", 0, spanCount("s1", "s2", 0, 8)); - - //exclude doesn't exist - assertEquals("SpanNotS1NotS3_8_8", 3, spanCount("s1", "s3", 8, 8)); - //include doesn't exist - assertEquals("SpanNotS3NotS1_8_8", 0, spanCount("s3", "s1", 8, 8)); + public void testSpanNots() throws Throwable { + assertEquals("SpanNotIncludeExcludeSame1", 0, spanCount("s2", 0, "s2", 0, 0), 0); + assertEquals("SpanNotIncludeExcludeSame2", 0, spanCount("s2", 0, "s2", 10, 10), 0); + + //focus on behind + assertEquals("SpanNotS2NotS1_6_0", 1, spanCount("s2", 0, "s1", 6, 0)); + assertEquals("SpanNotS2NotS1_5_0", 2, spanCount("s2", 0, "s1", 5, 0)); + assertEquals("SpanNotS2NotS1_3_0", 3, spanCount("s2", 0, "s1", 3, 0)); + assertEquals("SpanNotS2NotS1_2_0", 4, spanCount("s2", 0, "s1", 2, 0)); + assertEquals("SpanNotS2NotS1_0_0", 4, spanCount("s2", 0, "s1", 0, 0)); + + //focus on both + assertEquals("SpanNotS2NotS1_3_1", 2, spanCount("s2", 0, "s1", 3, 1)); + assertEquals("SpanNotS2NotS1_2_1", 3, spanCount("s2", 0, "s1", 2, 1)); + assertEquals("SpanNotS2NotS1_1_1", 3, spanCount("s2", 0, "s1", 1, 1)); + assertEquals("SpanNotS2NotS1_10_10", 0, spanCount("s2", 0, "s1", 10, 10)); + + //focus on ahead + 
assertEquals("SpanNotS1NotS2_10_10", 0, spanCount("s1", 0, "s2", 10, 10)); + assertEquals("SpanNotS1NotS2_0_1", 3, spanCount("s1", 0, "s2", 0, 1)); + assertEquals("SpanNotS1NotS2_0_2", 3, spanCount("s1", 0, "s2", 0, 2)); + assertEquals("SpanNotS1NotS2_0_3", 2, spanCount("s1", 0, "s2", 0, 3)); + assertEquals("SpanNotS1NotS2_0_4", 1, spanCount("s1", 0, "s2", 0, 4)); + assertEquals("SpanNotS1NotS2_0_8", 0, spanCount("s1", 0, "s2", 0, 8)); + + //exclude doesn't exist + assertEquals("SpanNotS1NotS3_8_8", 3, spanCount("s1", 0, "s3", 8, 8)); + + //include doesn't exist + assertEquals("SpanNotS3NotS1_8_8", 0, spanCount("s3", 0, "s1", 8, 8)); + + // Negative values + assertEquals("SpanNotS2S1NotXXNeg_0_0", 1, spanCount("s2 s1", 10, "xx", 0, 0)); + assertEquals("SpanNotS2S1NotXXNeg_1_1", 1, spanCount("s2 s1", 10, "xx", -1, -1)); + assertEquals("SpanNotS2S1NotXXNeg_0_2", 2, spanCount("s2 s1", 10, "xx", 0, -2)); + assertEquals("SpanNotS2S1NotXXNeg_1_2", 2, spanCount("s2 s1", 10, "xx", -1, -2)); + assertEquals("SpanNotS2S1NotXXNeg_2_1", 2, spanCount("s2 s1", 10, "xx", -2, -1)); + assertEquals("SpanNotS2S1NotXXNeg_3_1", 2, spanCount("s2 s1", 10, "xx", -3, -1)); + assertEquals("SpanNotS2S1NotXXNeg_1_3", 2, spanCount("s2 s1", 10, "xx", -1, -3)); + assertEquals("SpanNotS2S1NotXXNeg_2_2", 3, spanCount("s2 s1", 10, "xx", -2, -2)); } - - private int spanCount(String include, String exclude, int pre, int post) throws IOException{ - SpanQuery iq = spanTermQuery(field, include); + + + private int spanCount(String include, int slop, String exclude, int pre, int post) throws IOException{ + String[] includeTerms = include.split(" +"); + SpanQuery iq = includeTerms.length == 1 ? spanTermQuery(field, include) : spanNearOrderedQuery(field, slop, includeTerms); SpanQuery eq = spanTermQuery(field, exclude); SpanQuery snq = spanNotQuery(iq, eq, pre, post); Spans spans = snq.createWeight(searcher, false, 1f).getSpans(searcher.getIndexReader().leaves().get(0), SpanWeight.Postings.POSITIONS);