From 95ddd51108bcaedee1e9766a3170552bbd65d0b7 Mon Sep 17 00:00:00 2001 From: Michael McCandless Date: Tue, 11 Nov 2014 21:42:33 +0000 Subject: [PATCH] LUCENE-6050: allow for MUST and MUST_NOT (in addition to SHOULD) context tags in Analyzing/BlendedInfixSuggester git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1638441 13f79535-47bb-0310-9956-ffa450edef68 --- lucene/CHANGES.txt | 4 + .../analyzing/AnalyzingInfixSuggester.java | 63 +++++++--- .../analyzing/BlendedInfixSuggester.java | 8 ++ .../AnalyzingInfixSuggesterTest.java | 112 +++++++++++++++++- .../analyzing/BlendedInfixSuggesterTest.java | 8 +- 5 files changed, 171 insertions(+), 24 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index c19fbe91dd3..60a8b8c594c 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -91,6 +91,10 @@ New Features index using it will need to be rebuilt. (Thomas Neidhart via Mike McCandless) +* LUCENE-6050: Accept MUST and MUST_NOT (in addition to SHOULD) for + each context passed to Analyzing/BlendedInfixSuggester (Arcadius + Ahouansou, jane chang via Mike McCandless) + API Changes * LUCENE-5900: Deprecated more constructors taking Version in *InfixSuggester and diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggester.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggester.java index 80b8bf50c86..e0684a208c2 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggester.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggester.java @@ -23,6 +23,8 @@ import java.io.StringReader; import java.nio.file.Path; import java.util.ArrayList; import java.util.HashSet; +import java.util.HashMap; +import java.util.Map; import java.util.List; import java.util.Set; @@ -390,7 +392,22 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable { /** Lookup, without any context. */ public List lookup(CharSequence key, int num, boolean allTermsRequired, boolean doHighlight) throws IOException { - return lookup(key, null, num, allTermsRequired, doHighlight); + return lookup(key, (Map)null, num, allTermsRequired, doHighlight); + } + + /** Lookup, with context but without booleans. Context booleans default to SHOULD, + * so each suggestion must have at least one of the contexts. */ + public List lookup(CharSequence key, Set contexts, int num, boolean allTermsRequired, boolean doHighlight) throws IOException { + + if (contexts == null) { + return lookup(key, num, allTermsRequired, doHighlight); + } + + Map contextInfo = new HashMap<>(); + for (BytesRef context : contexts) { + contextInfo.put(context, BooleanClause.Occur.SHOULD); + } + return lookup(key, contextInfo, num, allTermsRequired, doHighlight); } /** This is called if the last token isn't ended @@ -408,7 +425,7 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable { /** Retrieve suggestions, specifying whether all terms * must match ({@code allTermsRequired}) and whether the hits * should be highlighted ({@code doHighlight}). */ - public List lookup(CharSequence key, Set contexts, int num, boolean allTermsRequired, boolean doHighlight) throws IOException { + public List lookup(CharSequence key, Map contextInfo, int num, boolean allTermsRequired, boolean doHighlight) throws IOException { if (searcherMgr == null) { throw new IllegalStateException("suggester was not built"); @@ -469,21 +486,35 @@ public class AnalyzingInfixSuggester extends Lookup implements Closeable { } } - if (contexts != null) { - BooleanQuery sub = new BooleanQuery(); - query.add(sub, BooleanClause.Occur.MUST); - for(BytesRef context : contexts) { - // NOTE: we "should" wrap this in - // ConstantScoreQuery, or maybe send this as a - // Filter instead to search, but since all of - // these are MUST'd, the change to the score won't - // affect the overall ranking. Since we indexed - // as DOCS_ONLY, the perf should be the same - // either way (no freq int[] blocks to decode): + if (contextInfo != null) { + + boolean allMustNot = true; + for (Map.Entry entry : contextInfo.entrySet()) { + if (entry.getValue() != BooleanClause.Occur.MUST_NOT) { + allMustNot = false; + break; + } + } - // TODO: if we had a BinaryTermField we could fix - // this "must be valid ut8f" limitation: - sub.add(new TermQuery(new Term(CONTEXTS_FIELD_NAME, context.utf8ToString())), BooleanClause.Occur.SHOULD); + // do not make a subquery if all context booleans are must not + if (allMustNot == true) { + for (Map.Entry entry : contextInfo.entrySet()) { + query.add(new TermQuery(new Term(CONTEXTS_FIELD_NAME, entry.getKey().utf8ToString())), BooleanClause.Occur.MUST_NOT); + } + + } else { + BooleanQuery sub = new BooleanQuery(); + query.add(sub, BooleanClause.Occur.MUST); + + for (Map.Entry entry : contextInfo.entrySet()) { + // NOTE: we "should" wrap this in + // ConstantScoreQuery, or maybe send this as a + // Filter instead to search. + + // TODO: if we had a BinaryTermField we could fix + // this "must be valid ut8f" limitation: + sub.add(new TermQuery(new Term(CONTEXTS_FIELD_NAME, entry.getKey().utf8ToString())), entry.getValue()); + } } } } diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/BlendedInfixSuggester.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/BlendedInfixSuggester.java index 4e911da8eeb..e1257515ed5 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/BlendedInfixSuggester.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/analyzing/BlendedInfixSuggester.java @@ -21,6 +21,7 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Comparator; import java.util.List; +import java.util.Map; import java.util.Set; import java.util.TreeSet; @@ -33,6 +34,7 @@ import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.MultiDocValues; import org.apache.lucene.index.Terms; import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.FieldDoc; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.TopFieldDocs; @@ -146,6 +148,12 @@ public class BlendedInfixSuggester extends AnalyzingInfixSuggester { return super.lookup(key, contexts, num * numFactor, allTermsRequired, doHighlight); } + @Override + public List lookup(CharSequence key, Map contextInfo, int num, boolean allTermsRequired, boolean doHighlight) throws IOException { + // here we multiply the number of searched element by the defined factor + return super.lookup(key, contextInfo, num * numFactor, allTermsRequired, doHighlight); + } + @Override protected FieldType getTextFieldType() { FieldType ft = new FieldType(TextField.TYPE_NOT_STORED); diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggesterTest.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggesterTest.java index a7caacf53f6..951f20159e5 100644 --- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggesterTest.java +++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/AnalyzingInfixSuggesterTest.java @@ -23,8 +23,10 @@ import java.nio.file.Path; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; +import java.util.HashMap; import java.util.HashSet; import java.util.List; +import java.util.Map; import java.util.Set; import org.apache.lucene.analysis.Analyzer; @@ -35,6 +37,7 @@ import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.util.CharArraySet; +import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.suggest.Input; import org.apache.lucene.search.suggest.InputArrayIterator; import org.apache.lucene.search.suggest.Lookup.LookupResult; @@ -899,7 +902,7 @@ public class AnalyzingInfixSuggesterTest extends LuceneTestCase { assertTrue(result.contexts.contains(new BytesRef("foo"))); assertTrue(result.contexts.contains(new BytesRef("bar"))); - // Both suggestions have "foo" context: + // Both have "foo" context: results = suggester.lookup(TestUtil.stringToCharSequence("ear", random()), asSet("foo"), 10, true, true); assertEquals(2, results.size()); @@ -934,8 +937,16 @@ public class AnalyzingInfixSuggesterTest extends LuceneTestCase { assertTrue(result.contexts.contains(new BytesRef("foo"))); assertTrue(result.contexts.contains(new BytesRef("bar"))); - // Only one has "baz" context: - results = suggester.lookup(TestUtil.stringToCharSequence("ear", random()), asSet("baz"), 10, true, true); + // None do not have "foo" context: + Map contextInfo = new HashMap<>(); + contextInfo.put(new BytesRef("foo"), BooleanClause.Occur.MUST_NOT); + results = suggester.lookup(TestUtil.stringToCharSequence("ear", random()), contextInfo, 10, true, true); + assertEquals(0, results.size()); + + // Only one does not have "bar" context: + contextInfo.clear(); + contextInfo.put(new BytesRef("bar"), BooleanClause.Occur.MUST_NOT); + results = suggester.lookup(TestUtil.stringToCharSequence("ear", random()), contextInfo, 10, true, true); assertEquals(1, results.size()); result = results.get(0); @@ -947,7 +958,7 @@ public class AnalyzingInfixSuggesterTest extends LuceneTestCase { assertTrue(result.contexts.contains(new BytesRef("foo"))); assertTrue(result.contexts.contains(new BytesRef("baz"))); - // Both have foo or bar: + // Both have "foo" or "bar" context: results = suggester.lookup(TestUtil.stringToCharSequence("ear", random()), asSet("foo", "bar"), 10, true, true); assertEquals(2, results.size()); @@ -969,6 +980,99 @@ public class AnalyzingInfixSuggesterTest extends LuceneTestCase { assertTrue(result.contexts.contains(new BytesRef("foo"))); assertTrue(result.contexts.contains(new BytesRef("bar"))); + // Both have "bar" or "baz" context: + results = suggester.lookup(TestUtil.stringToCharSequence("ear", random()), asSet("bar", "baz"), 10, true, true); + assertEquals(2, results.size()); + + result = results.get(0); + assertEquals("a penny saved is a penny earned", result.key); + assertEquals(10, result.value); + assertEquals(new BytesRef("foobaz"), result.payload); + assertNotNull(result.contexts); + assertEquals(2, result.contexts.size()); + assertTrue(result.contexts.contains(new BytesRef("foo"))); + assertTrue(result.contexts.contains(new BytesRef("baz"))); + + result = results.get(1); + assertEquals("lend me your ear", result.key); + assertEquals(8, result.value); + assertEquals(new BytesRef("foobar"), result.payload); + assertNotNull(result.contexts); + assertEquals(2, result.contexts.size()); + assertTrue(result.contexts.contains(new BytesRef("foo"))); + assertTrue(result.contexts.contains(new BytesRef("bar"))); + + // Only one has "foo" and "bar" context: + contextInfo.clear(); + contextInfo.put(new BytesRef("foo"), BooleanClause.Occur.MUST); + contextInfo.put(new BytesRef("bar"), BooleanClause.Occur.MUST); + results = suggester.lookup(TestUtil.stringToCharSequence("ear", random()), contextInfo, 10, true, true); + assertEquals(1, results.size()); + + result = results.get(0); + assertEquals("lend me your ear", result.key); + assertEquals(8, result.value); + assertEquals(new BytesRef("foobar"), result.payload); + assertNotNull(result.contexts); + assertEquals(2, result.contexts.size()); + assertTrue(result.contexts.contains(new BytesRef("foo"))); + assertTrue(result.contexts.contains(new BytesRef("bar"))); + + // None have "bar" and "baz" context: + contextInfo.clear(); + contextInfo.put(new BytesRef("bar"), BooleanClause.Occur.MUST); + contextInfo.put(new BytesRef("baz"), BooleanClause.Occur.MUST); + results = suggester.lookup(TestUtil.stringToCharSequence("ear", random()), contextInfo, 10, true, true); + assertEquals(0, results.size()); + + // None do not have "foo" and do not have "bar" context: + contextInfo.clear(); + contextInfo.put(new BytesRef("foo"), BooleanClause.Occur.MUST_NOT); + contextInfo.put(new BytesRef("bar"), BooleanClause.Occur.MUST_NOT); + results = suggester.lookup(TestUtil.stringToCharSequence("ear", random()), contextInfo, 10, true, true); + assertEquals(0, results.size()); + + // Both do not have "bar" and do not have "baz" context: + contextInfo.clear(); + contextInfo.put(new BytesRef("bar"), BooleanClause.Occur.MUST_NOT); + contextInfo.put(new BytesRef("baz"), BooleanClause.Occur.MUST_NOT); + results = suggester.lookup(TestUtil.stringToCharSequence("ear", random()), asSet("bar", "baz"), 10, true, true); + assertEquals(2, results.size()); + + result = results.get(0); + assertEquals("a penny saved is a penny earned", result.key); + assertEquals(10, result.value); + assertEquals(new BytesRef("foobaz"), result.payload); + assertNotNull(result.contexts); + assertEquals(2, result.contexts.size()); + assertTrue(result.contexts.contains(new BytesRef("foo"))); + assertTrue(result.contexts.contains(new BytesRef("baz"))); + + result = results.get(1); + assertEquals("lend me your ear", result.key); + assertEquals(8, result.value); + assertEquals(new BytesRef("foobar"), result.payload); + assertNotNull(result.contexts); + assertEquals(2, result.contexts.size()); + assertTrue(result.contexts.contains(new BytesRef("foo"))); + assertTrue(result.contexts.contains(new BytesRef("bar"))); + + // Only one has "foo" and does not have "bar" context: + contextInfo.clear(); + contextInfo.put(new BytesRef("foo"), BooleanClause.Occur.MUST); + contextInfo.put(new BytesRef("bar"), BooleanClause.Occur.MUST_NOT); + results = suggester.lookup(TestUtil.stringToCharSequence("ear", random()), contextInfo, 10, true, true); + assertEquals(1, results.size()); + + result = results.get(0); + assertEquals("a penny saved is a penny earned", result.key); + assertEquals(10, result.value); + assertEquals(new BytesRef("foobaz"), result.payload); + assertNotNull(result.contexts); + assertEquals(2, result.contexts.size()); + assertTrue(result.contexts.contains(new BytesRef("foo"))); + assertTrue(result.contexts.contains(new BytesRef("baz"))); + suggester.close(); } } diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/BlendedInfixSuggesterTest.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/BlendedInfixSuggesterTest.java index bd9f08a9108..9fd393a19aa 100644 --- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/BlendedInfixSuggesterTest.java +++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/analyzing/BlendedInfixSuggesterTest.java @@ -137,12 +137,12 @@ public class BlendedInfixSuggesterTest extends LuceneTestCase { // we don't find it for in the 2 first - assertEquals(2, suggester.lookup("the", null, 2, true, false).size()); + assertEquals(2, suggester.lookup("the", 2, true, false).size()); long w0 = getInResults(suggester, "the", ret, 2); assertTrue(w0 < 0); // but it's there if we search for 3 elements - assertEquals(3, suggester.lookup("the", null, 3, true, false).size()); + assertEquals(3, suggester.lookup("the", 3, true, false).size()); long w1 = getInResults(suggester, "the", ret, 3); assertTrue(w1 > 0); @@ -188,7 +188,7 @@ public class BlendedInfixSuggesterTest extends LuceneTestCase { suggester.build(new InputArrayIterator(keys)); - List responses = suggester.lookup("the", null, 4, true, false); + List responses = suggester.lookup("the", 4, true, false); for (Lookup.LookupResult response : responses) { System.out.println(response); @@ -199,7 +199,7 @@ public class BlendedInfixSuggesterTest extends LuceneTestCase { private static long getInResults(BlendedInfixSuggester suggester, String prefix, BytesRef payload, int num) throws IOException { - List responses = suggester.lookup(prefix, null, num, true, false); + List responses = suggester.lookup(prefix, num, true, false); for (Lookup.LookupResult response : responses) { if (response.payload.equals(payload)) {