[SOLR-12238] Synonym Queries boost (#357)

SOLR-12238: Handle boosts in QueryBuilder

QueryBuilder now detects per-term boosts supplied by a BoostAttribute when
building queries from a TokenStream.  This commit also adds a DelimitedBoostTokenFilter
that parses boosts from tokens using a delimiter character, and exposes this in Solr.
This commit is contained in:
Alessandro Benedetti 2020-02-24 10:29:41 +00:00 committed by GitHub
parent 57c7139ea3
commit 663611c99c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
15 changed files with 839 additions and 41 deletions

View File

@ -0,0 +1,63 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.boost;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.search.BoostAttribute;
import java.io.IOException;
/**
* Characters before the delimiter are the "token", those after are the boost.
* <p>
* For example, if the delimiter is '|', then for the string "foo|0.7", foo is the token
* and 0.7 is the boost.
* <p>
* Note make sure your Tokenizer doesn't split on the delimiter, or this won't work
*/
/**
 * Splits each token at a delimiter character: the text before the delimiter is kept as the
 * term, and the text after it is parsed as a float and recorded in a {@link BoostAttribute}.
 * <p>
 * For example, with delimiter '|' the input token "foo|0.7" yields the term "foo" carrying
 * a boost of 0.7.  Tokens without a delimiter pass through unchanged.
 * <p>
 * NOTE: make sure the upstream Tokenizer does not split on the delimiter character,
 * otherwise the boost suffix will never reach this filter.
 */
public final class DelimitedBoostTokenFilter extends TokenFilter {
  private final char delimiter;
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final BoostAttribute boostAtt = addAttribute(BoostAttribute.class);

  /**
   * @param input     upstream token stream
   * @param delimiter character separating the term text from its boost suffix
   */
  public DelimitedBoostTokenFilter(TokenStream input, char delimiter) {
    super(input);
    this.delimiter = delimiter;
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (input.incrementToken() == false) {
      return false;
    }
    final char[] chars = termAtt.buffer();
    final int len = termAtt.length();
    // locate the first occurrence of the delimiter, if any
    int delimiterPos = -1;
    for (int i = 0; i < len && delimiterPos < 0; i++) {
      if (chars[i] == delimiter) {
        delimiterPos = i;
      }
    }
    if (delimiterPos >= 0) {
      // everything after the delimiter is the boost (may throw NumberFormatException
      // if it is not a valid float); truncate the term to the prefix
      float boost = Float.parseFloat(new String(chars, delimiterPos + 1, len - delimiterPos - 1));
      boostAtt.setBoost(boost);
      termAtt.setLength(delimiterPos);
    }
    return true;
  }
}

View File

@ -0,0 +1,63 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.boost;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import java.util.Map;
/**
* Factory for {@link DelimitedBoostTokenFilter}.
* <pre class="prettyprint">
* &lt;fieldType name="text_dlmtd" class="solr.TextField" positionIncrementGap="100"&gt;
* &lt;analyzer&gt;
* &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
* &lt;filter class="solr.DelimitedBoostTokenFilterFactory" delimiter="|"/&gt;
* &lt;/analyzer&gt;
* &lt;/fieldType&gt;</pre>
*
* @lucene.spi {@value #NAME}
*/
/**
 * Factory for {@link DelimitedBoostTokenFilter}.
 * <pre class="prettyprint">
 * &lt;fieldType name="text_dlmtd" class="solr.TextField" positionIncrementGap="100"&gt;
 * &lt;analyzer&gt;
 * &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
 * &lt;filter class="solr.DelimitedBoostTokenFilterFactory" delimiter="|"/&gt;
 * &lt;/analyzer&gt;
 * &lt;/fieldType&gt;</pre>
 *
 * @lucene.spi {@value #NAME}
 */
public class DelimitedBoostTokenFilterFactory extends TokenFilterFactory {
  /** SPI name */
  public static final String NAME = "delimitedBoost";
  /** Name of the configuration attribute holding the delimiter character. */
  public static final String DELIMITER_ATTR = "delimiter";
  /** Delimiter used when none is configured explicitly. */
  public static final char DEFAULT_DELIMITER = '|';

  private final char delimiter;

  /**
   * Creates a new DelimitedBoostTokenFilterFactory.
   *
   * @param args factory arguments; only {@value #DELIMITER_ATTR} is recognized
   * @throws IllegalArgumentException if unknown parameters remain after consumption
   */
  public DelimitedBoostTokenFilterFactory(Map<String, String> args) {
    super(args);
    delimiter = getChar(args, DELIMITER_ATTR, DEFAULT_DELIMITER);
    if (args.isEmpty() == false) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
  }

  @Override
  public DelimitedBoostTokenFilter create(TokenStream input) {
    return new DelimitedBoostTokenFilter(input, delimiter);
  }
}

View File

@ -0,0 +1,21 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Provides various convenience classes for creating boosts on Tokens.
*/
package org.apache.lucene.analysis.boost;

View File

@ -17,6 +17,7 @@ org.apache.lucene.analysis.tr.ApostropheFilterFactory
org.apache.lucene.analysis.ar.ArabicNormalizationFilterFactory
org.apache.lucene.analysis.ar.ArabicStemFilterFactory
org.apache.lucene.analysis.bg.BulgarianStemFilterFactory
org.apache.lucene.analysis.boost.DelimitedBoostTokenFilterFactory
org.apache.lucene.analysis.bn.BengaliNormalizationFilterFactory
org.apache.lucene.analysis.bn.BengaliStemFilterFactory
org.apache.lucene.analysis.br.BrazilianStemFilterFactory

View File

@ -0,0 +1,85 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.boost;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.search.BoostAttribute;
/**
 * Tests for {@link DelimitedBoostTokenFilter}: boosts are parsed from "term|boost" tokens,
 * and tokens without a delimiter keep the default boost of 1.0f.
 */
public class DelimitedBoostTokenFilterTest extends BaseTokenStreamTestCase {

  public void testBoosts() throws Exception {
    String test = "The quick|0.4 red|0.5 fox|0.2 jumped|0.1 over the lazy|0.8 brown|0.9 dogs|0.9";
    DelimitedBoostTokenFilter tokenFilter =
        new DelimitedBoostTokenFilter(
            whitespaceMockTokenizer(test),
            DelimitedBoostTokenFilterFactory.DEFAULT_DELIMITER);
    CharTermAttribute term = tokenFilter.getAttribute(CharTermAttribute.class);
    BoostAttribute boost = tokenFilter.addAttribute(BoostAttribute.class);
    tokenFilter.reset();
    assertTermEquals("The", tokenFilter, term, boost, 1.0f);
    assertTermEquals("quick", tokenFilter, term, boost, 0.4f);
    assertTermEquals("red", tokenFilter, term, boost, 0.5f);
    assertTermEquals("fox", tokenFilter, term, boost, 0.2f);
    assertTermEquals("jumped", tokenFilter, term, boost, 0.1f);
    assertTermEquals("over", tokenFilter, term, boost, 1.0f);
    assertTermEquals("the", tokenFilter, term, boost, 1.0f);
    assertTermEquals("lazy", tokenFilter, term, boost, 0.8f);
    assertTermEquals("brown", tokenFilter, term, boost, 0.9f);
    assertTermEquals("dogs", tokenFilter, term, boost, 0.9f);
    assertFalse(tokenFilter.incrementToken());
    tokenFilter.end();
    tokenFilter.close();
  }

  public void testNext() throws Exception {
    String test = "The quick|0.1 red|0.2 fox|0.3 jumped|0.4 over the lazy|0.5 brown|0.6 dogs|0.6";
    DelimitedBoostTokenFilter tokenFilter =
        new DelimitedBoostTokenFilter(
            whitespaceMockTokenizer(test),
            DelimitedBoostTokenFilterFactory.DEFAULT_DELIMITER);
    tokenFilter.reset();
    assertTermEquals("The", tokenFilter, 1.0f);
    assertTermEquals("quick", tokenFilter, 0.1f);
    assertTermEquals("red", tokenFilter, 0.2f);
    assertTermEquals("fox", tokenFilter, 0.3f);
    assertTermEquals("jumped", tokenFilter, 0.4f);
    assertTermEquals("over", tokenFilter, 1.0f);
    assertTermEquals("the", tokenFilter, 1.0f);
    assertTermEquals("lazy", tokenFilter, 0.5f);
    assertTermEquals("brown", tokenFilter, 0.6f);
    assertTermEquals("dogs", tokenFilter, 0.6f);
    assertFalse(tokenFilter.incrementToken());
    tokenFilter.end();
    tokenFilter.close();
  }

  /** Advances the stream one token, looking the attributes up on the fly. */
  void assertTermEquals(String expected, TokenStream stream, float expectedBoost) throws Exception {
    CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
    BoostAttribute boostAtt = stream.addAttribute(BoostAttribute.class);
    assertTrue(stream.incrementToken());
    assertEquals(expected, termAtt.toString());
    float actualBoost = boostAtt.getBoost();
    assertTrue(actualBoost + " does not equal: " + expectedBoost, actualBoost == expectedBoost);
  }

  /** Advances the stream one token using caller-supplied attribute references. */
  void assertTermEquals(String expected, TokenStream stream, CharTermAttribute termAtt, BoostAttribute boostAtt, float expectedBoost) throws Exception {
    assertTrue(stream.incrementToken());
    assertEquals(expected, termAtt.toString());
    float actualBoost = boostAtt.getBoost();
    assertTrue(actualBoost + " does not equal: " + expectedBoost, actualBoost == expectedBoost);
  }
}

View File

@ -32,6 +32,7 @@ import org.apache.lucene.index.Terms; // javadocs only
* @lucene.internal
*/
public interface BoostAttribute extends Attribute {
float DEFAULT_BOOST = 1.0f;
/** Sets the boost in this attribute */
public void setBoost(float boost);
/** Retrieves the boost, default is {@code 1.0f}. */

View File

@ -30,17 +30,21 @@ import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostAttribute;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MultiPhraseQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SynonymQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.spans.SpanBoostQuery;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.util.graph.GraphTokenStreamFiniteStrings;
import static org.apache.lucene.search.BoostAttribute.DEFAULT_BOOST;
/**
* Creates queries from the {@link Analyzer} chain.
@ -63,6 +67,24 @@ public class QueryBuilder {
protected boolean enableGraphQueries = true;
protected boolean autoGenerateMultiTermSynonymsPhraseQuery = false;
/**
* Wraps a term and boost
*/
public static class TermAndBoost {
/** the term */
public final Term term;
/** the boost */
public final float boost;
/**
* Creates a new TermAndBoost
*/
public TermAndBoost(Term term, float boost) {
this.term = term;
this.boost = boost;
}
}
/** Creates a new QueryBuilder using the given analyzer. */
public QueryBuilder(Analyzer analyzer) {
this.analyzer = analyzer;
@ -350,22 +372,32 @@ public class QueryBuilder {
*/
protected SpanQuery createSpanQuery(TokenStream in, String field) throws IOException {
TermToBytesRefAttribute termAtt = in.getAttribute(TermToBytesRefAttribute.class);
BoostAttribute boostAtt = in.addAttribute(BoostAttribute.class);
SpanQuery result;
float boost = DEFAULT_BOOST;
if (termAtt == null) {
return null;
}
List<SpanTermQuery> terms = new ArrayList<>();
while (in.incrementToken()) {
boost *= boostAtt.getBoost();
terms.add(new SpanTermQuery(new Term(field, termAtt.getBytesRef())));
}
if (terms.isEmpty()) {
return null;
} else if (terms.size() == 1) {
return terms.get(0);
result = terms.get(0);
} else {
return new SpanNearQuery(terms.toArray(new SpanTermQuery[0]), 0, true);
result = new SpanNearQuery(terms.toArray(new SpanQuery[0]), 0, true);
}
if (boost != DEFAULT_BOOST) {
result = new SpanBoostQuery(result, boost);
}
return result;
}
/**
@ -373,13 +405,14 @@ public class QueryBuilder {
*/
protected Query analyzeTerm(String field, TokenStream stream) throws IOException {
TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
BoostAttribute boostAtt = stream.addAttribute(BoostAttribute.class);
stream.reset();
if (!stream.incrementToken()) {
throw new AssertionError();
}
return newTermQuery(new Term(field, termAtt.getBytesRef()));
return newTermQuery(new Term(field, termAtt.getBytesRef()), boostAtt.getBoost());
}
/**
@ -387,24 +420,25 @@ public class QueryBuilder {
*/
protected Query analyzeBoolean(String field, TokenStream stream) throws IOException {
TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
BoostAttribute boostAtt = stream.addAttribute(BoostAttribute.class);
stream.reset();
List<Term> terms = new ArrayList<>();
List<TermAndBoost> terms = new ArrayList<>();
while (stream.incrementToken()) {
terms.add(new Term(field, termAtt.getBytesRef()));
terms.add(new TermAndBoost(new Term(field, termAtt.getBytesRef()), boostAtt.getBoost()));
}
return newSynonymQuery(terms.toArray(new Term[terms.size()]));
return newSynonymQuery(terms.toArray(new TermAndBoost[0]));
}
protected void add(BooleanQuery.Builder q, List<Term> current, BooleanClause.Occur operator) {
protected void add(BooleanQuery.Builder q, List<TermAndBoost> current, BooleanClause.Occur operator) {
if (current.isEmpty()) {
return;
}
if (current.size() == 1) {
q.add(newTermQuery(current.get(0)), operator);
q.add(newTermQuery(current.get(0).term, current.get(0).boost), operator);
} else {
q.add(newSynonymQuery(current.toArray(new Term[current.size()])), operator);
q.add(newSynonymQuery(current.toArray(new TermAndBoost[0])), operator);
}
}
@ -413,10 +447,11 @@ public class QueryBuilder {
*/
protected Query analyzeMultiBoolean(String field, TokenStream stream, BooleanClause.Occur operator) throws IOException {
BooleanQuery.Builder q = newBooleanQuery();
List<Term> currentQuery = new ArrayList<>();
List<TermAndBoost> currentQuery = new ArrayList<>();
TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
BoostAttribute boostAtt = stream.addAttribute(BoostAttribute.class);
stream.reset();
while (stream.incrementToken()) {
@ -424,7 +459,7 @@ public class QueryBuilder {
add(q, currentQuery, operator);
currentQuery.clear();
}
currentQuery.add(new Term(field, termAtt.getBytesRef()));
currentQuery.add(new TermAndBoost(new Term(field, termAtt.getBytesRef()), boostAtt.getBoost()));
}
add(q, currentQuery, operator);
@ -439,9 +474,10 @@ public class QueryBuilder {
builder.setSlop(slop);
TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
BoostAttribute boostAtt = stream.addAttribute(BoostAttribute.class);
PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
int position = -1;
int position = -1;
float phraseBoost = DEFAULT_BOOST;
stream.reset();
while (stream.incrementToken()) {
if (enablePositionIncrements) {
@ -450,9 +486,13 @@ public class QueryBuilder {
position += 1;
}
builder.add(new Term(field, termAtt.getBytesRef()), position);
phraseBoost *= boostAtt.getBoost();
}
return builder.build();
PhraseQuery query = builder.build();
if (phraseBoost == DEFAULT_BOOST) {
return query;
}
return new BoostQuery(query, phraseBoost);
}
/**
@ -509,33 +549,40 @@ public class QueryBuilder {
end = articulationPoints[i];
}
lastState = end;
final Query queryPos;
final Query positionalQuery;
if (graph.hasSidePath(start)) {
final Iterator<TokenStream> it = graph.getFiniteStrings(start, end);
final Iterator<TokenStream> sidePathsIterator = graph.getFiniteStrings(start, end);
Iterator<Query> queries = new Iterator<Query>() {
@Override
public boolean hasNext() {
return it.hasNext();
return sidePathsIterator.hasNext();
}
@Override
public Query next() {
TokenStream ts = it.next();
return createFieldQuery(ts, BooleanClause.Occur.MUST, field, getAutoGenerateMultiTermSynonymsPhraseQuery(), 0);
TokenStream sidePath = sidePathsIterator.next();
return createFieldQuery(sidePath, BooleanClause.Occur.MUST, field, getAutoGenerateMultiTermSynonymsPhraseQuery(), 0);
}
};
queryPos = newGraphSynonymQuery(queries);
positionalQuery = newGraphSynonymQuery(queries);
} else {
Term[] terms = graph.getTerms(field, start);
List<AttributeSource> attributes = graph.getTerms(start);
TermAndBoost[] terms = attributes.stream()
.map(s -> {
TermToBytesRefAttribute t = s.addAttribute(TermToBytesRefAttribute.class);
BoostAttribute b = s.addAttribute(BoostAttribute.class);
return new TermAndBoost(new Term(field, t.getBytesRef()), b.getBoost());
})
.toArray(TermAndBoost[]::new);
assert terms.length > 0;
if (terms.length == 1) {
queryPos = newTermQuery(terms[0]);
positionalQuery = newTermQuery(terms[0].term, terms[0].boost);
} else {
queryPos = newSynonymQuery(terms);
positionalQuery = newSynonymQuery(terms);
}
}
if (queryPos != null) {
builder.add(queryPos, operator);
if (positionalQuery != null) {
builder.add(positionalQuery, operator);
}
}
return builder.build();
@ -650,10 +697,10 @@ public class QueryBuilder {
* This is intended for subclasses that wish to customize the generated queries.
* @return new Query instance
*/
protected Query newSynonymQuery(Term terms[]) {
SynonymQuery.Builder builder = new SynonymQuery.Builder(terms[0].field());
for (Term term : terms) {
builder.addTerm(term);
protected Query newSynonymQuery(TermAndBoost[] terms) {
SynonymQuery.Builder builder = new SynonymQuery.Builder(terms[0].term.field());
for (TermAndBoost t : terms) {
builder.addTerm(t.term, t.boost);
}
return builder.build();
}
@ -683,10 +730,15 @@ public class QueryBuilder {
* @param term term
* @return new TermQuery instance
*/
protected Query newTermQuery(Term term) {
return new TermQuery(term);
protected Query newTermQuery(Term term, float boost) {
Query q = new TermQuery(term);
if (boost == DEFAULT_BOOST) {
return q;
}
return new BoostQuery(q, boost);
}
/**
* Builds a new MultiPhraseQuery instance.
* <p>

View File

@ -20,6 +20,7 @@ package org.apache.lucene.util;
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.AnalyzerWrapper;
import org.apache.lucene.analysis.CannedBinaryTokenStream;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockSynonymFilter;
@ -32,6 +33,8 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostAttribute;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MultiPhraseQuery;
import org.apache.lucene.search.PhraseQuery;
@ -507,4 +510,51 @@ public class TestQueryBuilder extends LuceneTestCase {
expectThrows(IndexSearcher.TooManyClauses.class, () -> qb.analyzeGraphPhrase(ts, "", 0));
}
}
/** Sets a boost of 0.5 on every token whose term text is exactly three characters long. */
private static final class MockBoostTokenFilter extends TokenFilter {
  final BoostAttribute boostAtt = addAttribute(BoostAttribute.class);
  final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  protected MockBoostTokenFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (input.incrementToken()) {
      if (termAtt.length() == 3) {
        boostAtt.setBoost(0.5f);
      }
      return true;
    }
    return false;
  }
}
/**
 * Verifies that per-token boosts supplied by a BoostAttribute are carried into the built
 * queries: the 3-char term "hot" gets wrapped in a 0.5f BoostQuery, and the 3-char
 * synonym "dog" carries a 0.5f boost inside the SynonymQuery.
 */
public void testTokenStreamBoosts() {
  Analyzer synonymAnalyzer = new MockSynonymAnalyzer();
  // wrap the synonym analyzer so every emitted token runs through MockBoostTokenFilter
  Analyzer boostingAnalyzer = new AnalyzerWrapper(synonymAnalyzer.getReuseStrategy()) {
    @Override
    protected Analyzer getWrappedAnalyzer(String fieldName) {
      return synonymAnalyzer;
    }
    @Override
    protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
      return new TokenStreamComponents(components.getSource(), new MockBoostTokenFilter(components.getTokenStream()));
    }
  };
  QueryBuilder queryBuilder = new QueryBuilder(boostingAnalyzer);
  Query actual = queryBuilder.createBooleanQuery("field", "hot dogs");
  Query expected = new BooleanQuery.Builder()
      .add(new BoostQuery(new TermQuery(new Term("field", "hot")), 0.5f), BooleanClause.Occur.SHOULD)
      .add(new SynonymQuery.Builder("field")
          .addTerm(new Term("field", "dogs"))
          .addTerm(new Term("field", "dog"), 0.5f)
          .build(), BooleanClause.Occur.SHOULD)
      .build();
  assertEquals(expected, actual);
}
}

View File

@ -147,7 +147,7 @@ public class ComplexPhraseQueryParser extends QueryParser {
// to throw a runtime exception here if a term for another field is embedded
// in phrase query
@Override
protected Query newTermQuery(Term term) {
protected Query newTermQuery(Term term, float boost) {
if (isPass2ResolvingPhrases) {
try {
checkPhraseClauseIsForSameField(term.field());
@ -155,7 +155,7 @@ public class ComplexPhraseQueryParser extends QueryParser {
throw new RuntimeException("Error parsing complex phrase", pe);
}
}
return super.newTermQuery(term);
return super.newTermQuery(term, boost);
}
// Helper method used to report on any clauses that appear in query syntax

View File

@ -21,6 +21,8 @@ import java.util.ArrayList;
import java.util.Collections;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
@ -601,19 +603,35 @@ public abstract class SolrQueryParserBase extends QueryBuilder {
}
@Override
protected Query newSynonymQuery(Term terms[]) {
protected Query newGraphSynonymQuery(Iterator<Query> sidePathQueriesIterator) {
switch (synonymQueryStyle) {
case PICK_BEST: {
List<Query> sidePathSynonymQueries = new LinkedList<>();
sidePathQueriesIterator.forEachRemaining(sidePathSynonymQueries::add);
return new DisjunctionMaxQuery(sidePathSynonymQueries, 0.0f);
}
case AS_SAME_TERM:
case AS_DISTINCT_TERMS:{
return super.newGraphSynonymQuery(sidePathQueriesIterator);}
default:
throw new AssertionError("unrecognized synonymQueryStyle passed when creating newSynonymQuery");
}
}
@Override
protected Query newSynonymQuery(TermAndBoost[] terms) {
switch (synonymQueryStyle) {
case PICK_BEST:
List<Query> currPosnClauses = new ArrayList<Query>(terms.length);
for (Term term : terms) {
currPosnClauses.add(newTermQuery(term));
for (TermAndBoost term : terms) {
currPosnClauses.add(newTermQuery(term.term, term.boost));
}
DisjunctionMaxQuery dm = new DisjunctionMaxQuery(currPosnClauses, 0.0f);
return dm;
case AS_DISTINCT_TERMS:
BooleanQuery.Builder builder = new BooleanQuery.Builder();
for (Term term : terms) {
builder.add(newTermQuery(term), BooleanClause.Occur.SHOULD);
for (TermAndBoost term : terms) {
builder.add(newTermQuery(term.term, term.boost), BooleanClause.Occur.SHOULD);
}
return builder.build();
case AS_SAME_TERM:

View File

@ -227,6 +227,41 @@
</analyzer>
</fieldType>
<fieldType name="text_pick_best_boosted" class="solr.TextField" positionIncrementGap="100" synonymQueryStyle="pick_best" autoGeneratePhraseQueries="true">
<analyzer type="index">
<tokenizer class="solr.MockTokenizerFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.MockTokenizerFactory"/>
<filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.DelimitedBoostTokenFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="text_as_distinct_boosted" class="solr.TextField" positionIncrementGap="100" synonymQueryStyle="as_distinct_terms" autoGeneratePhraseQueries="true">
<analyzer type="index">
<tokenizer class="solr.MockTokenizerFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.MockTokenizerFactory"/>
<filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.DelimitedBoostTokenFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="text_as_same_term_boosted" class="solr.TextField" positionIncrementGap="100" synonymQueryStyle="as_same_term" autoGeneratePhraseQueries="true">
<analyzer type="index">
<tokenizer class="solr.MockTokenizerFactory"/>
</analyzer>
<analyzer type="query">
<tokenizer class="solr.MockTokenizerFactory"/>
<filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
<filter class="solr.LowerCaseFilterFactory"/>
<filter class="solr.DelimitedBoostTokenFilterFactory"/>
</analyzer>
</fieldType>
<fieldType name="nametext" class="solr.TextField">
<analyzer class="org.apache.lucene.analysis.core.WhitespaceAnalyzer"/>
@ -656,6 +691,9 @@
<dynamicField name="*_sI" type="string" indexed="true" stored="false"/>
<dynamicField name="*_sS" type="string" indexed="false" stored="true"/>
<dynamicField name="t_pick_best_*" type="text_pick_best" indexed="true" stored="true"/>
<dynamicField name="t_pick_best_boosted_*" type="text_pick_best_boosted" indexed="true" stored="true"/>
<dynamicField name="t_as_distinct_boosted_*" type="text_as_distinct_boosted" indexed="true" stored="true"/>
<dynamicField name="t_as_same_term_boosted_*" type="text_as_same_term_boosted" indexed="true" stored="true"/>
<dynamicField name="t_as_distinct_*" type="text_as_distinct" indexed="true" stored="true"/>

View File

@ -37,4 +37,18 @@ crow blackbird, grackle
tabby => tabby, cat, feline, animal
persian => persian, cat, feline, animal
jeans, denim pants
jeans, denim pants
# Boosted Synonyms
tiger, tigre|0.9
lynx => lince|0.8, lynx_canadensis|0.9
leopard, big cat|0.8, bagheera|0.9, panthera pardus|0.85
lion => panthera leo|0.9, simba leo|0.8, kimba|0.75
panthera pardus, leopard|0.6
panthera tigris => tiger|0.99
snow leopard, panthera uncia|0.9, big cat|0.8, white_leopard|0.6
panthera onca => jaguar|0.95, big cat|0.85, black panther|0.65
panthera blytheae, oldest|0.5 ancient|0.9 panthera

View File

@ -300,4 +300,80 @@ public class TestManagedSynonymGraphFilterFactory extends RestTestBase {
assertJDelete(endpoint+"/fröhlich",
"/error/code==404");
}
/**
 * Can we add a single-term synonym with a weight, read it back, and delete it?
 */
@Test
public void testManagedSynonyms_singleTermWithWeight_shouldHandleSynonym() throws Exception {
String endpoint = "/schema/analysis/synonyms/englishgraph";
// the managed map starts out empty
assertJQ(endpoint,
"/synonymMappings/initArgs/ignoreCase==false",
"/synonymMappings/managedMap=={}");
// the "tiger" mapping does not exist yet
assertJQ(endpoint+"/tiger",
"/error/code==404");
Map<String,List<String>> syns = new HashMap<>();
// now put a weighted synonym ("term|weight" syntax)
syns.put("tiger", Arrays.asList("tiger|1.0"));
assertJPut(endpoint,
toJSONString(syns),
"/responseHeader/status==0");
// and check if it exists, weight suffix intact
assertJQ(endpoint,
"/synonymMappings/managedMap/tiger==['tiger|1.0']");
// verify delete works
assertJDelete(endpoint+"/tiger",
"/responseHeader/status==0");
// was it really deleted?
assertJDelete(endpoint+"/tiger",
"/error/code==404");
}
/**
 * Can we add a mapping with multiple weighted synonyms, read it back, and delete it?
 */
@Test
public void testManagedSynonyms_multiTermWithWeight_shouldHandleSynonym() throws Exception {
  String endpoint = "/schema/analysis/synonyms/englishgraph";
  // the managed map starts out empty
  assertJQ(endpoint,
      "/synonymMappings/initArgs/ignoreCase==false",
      "/synonymMappings/managedMap=={}");
  // the "tiger" mapping does not exist yet
  assertJQ(endpoint+"/tiger",
      "/error/code==404");
  // register "tiger" with three weighted synonyms ("term|weight" syntax)
  Map<String,List<String>> synonyms = new HashMap<>();
  List<String> tigerSynonyms = Arrays.asList("tiger|1.0", "panthera tigris|0.9", "Shere Kan|0.8");
  synonyms.put("tiger", tigerSynonyms);
  String tigerSynonymsJson = toJSONString(synonyms);
  assertJPut(endpoint,
      tigerSynonymsJson,
      "/responseHeader/status==0");
  // read the mapping back, weight suffixes intact
  // NOTE(review): the returned order differs from insertion order — presumably sorted
  assertJQ(endpoint,
      "/synonymMappings/managedMap/tiger==[\"Shere Kan|0.8\",\"panthera tigris|0.9\",\"tiger|1.0\"]");
  // verify delete works
  assertJDelete(endpoint+"/tiger",
      "/responseHeader/status==0");
  // was it really deleted?
  assertJDelete(endpoint+"/tiger",
      "/error/code==404");
}
}

View File

@ -1221,8 +1221,225 @@ public class TestSolrQueryParser extends SolrTestCaseJ4 {
assertEquals("(t_as_distinct_foo:\"denim pant\" t_as_distinct_foo:jean)", q.toString());
q = QParser.getParser("jeans", req(params("df", "t_pick_best_foo", "sow", "false"))).getQuery();
assertEquals("(t_pick_best_foo:\"denim pant\" t_pick_best_foo:jean)", q.toString());
assertEquals("(t_pick_best_foo:\"denim pant\" | t_pick_best_foo:jean)", q.toString());
}
/**
 * A single-term query whose synonyms are all single terms: each synonymQueryStyle
 * (pick_best / as_distinct_terms / as_same_term) must preserve the per-synonym boost.
 */
public void testSynonymsBoost_singleTermQuerySingleTermSynonyms_shouldParseBoostedQuery() throws Exception {
// synonyms.txt: tiger, tigre|0.9
Query q = QParser.getParser("tiger", req(params("df", "t_pick_best_boosted_foo"))).getQuery();
assertEquals("((t_pick_best_boosted_foo:tigre)^0.9 | t_pick_best_boosted_foo:tiger)", q.toString());
q = QParser.getParser("tiger", req(params("df", "t_as_distinct_boosted_foo"))).getQuery();
assertEquals("(t_as_distinct_boosted_foo:tigre)^0.9 t_as_distinct_boosted_foo:tiger", q.toString());
q = QParser.getParser("tiger", req(params("df", "t_as_same_term_boosted_foo"))).getQuery();
assertEquals("Synonym(t_as_same_term_boosted_foo:tiger t_as_same_term_boosted_foo:tigre^0.9)", q.toString());
// synonyms.txt: lynx => lince|0.8, lynx_canadensis|0.9 (one-way mapping)
q = QParser.getParser("lynx", req(params("df", "t_pick_best_boosted_foo"))).getQuery();
assertEquals("((t_pick_best_boosted_foo:lince)^0.8 | (t_pick_best_boosted_foo:lynx_canadensis)^0.9)", q.toString());
q = QParser.getParser("lynx", req(params("df", "t_as_distinct_boosted_foo"))).getQuery();
assertEquals("(t_as_distinct_boosted_foo:lince)^0.8 (t_as_distinct_boosted_foo:lynx_canadensis)^0.9", q.toString());
q = QParser.getParser("lynx", req(params("df", "t_as_same_term_boosted_foo"))).getQuery();
assertEquals("Synonym(t_as_same_term_boosted_foo:lince^0.8 t_as_same_term_boosted_foo:lynx_canadensis^0.9)", q.toString());
}
/**
 * A single-term query with multi-term (phrase) synonyms: the boost must wrap the
 * generated phrase query under each synonymQueryStyle.
 */
public void testSynonymsBoost_singleTermQueryMultiTermSynonyms_shouldParseBoostedQuery() throws Exception {
// synonyms.txt: leopard, big cat|0.8, bagheera|0.9, panthera pardus|0.85
Query q = QParser.getParser("leopard", req(params("df", "t_pick_best_boosted_foo"))).getQuery();
assertEquals("((t_pick_best_boosted_foo:\"big cat\")^0.8 | (t_pick_best_boosted_foo:bagheera)^0.9 | (t_pick_best_boosted_foo:\"panthera pardus\")^0.85 | t_pick_best_boosted_foo:leopard)", q.toString());
q = QParser.getParser("leopard", req(params("df", "t_as_distinct_boosted_foo"))).getQuery();
assertEquals("((t_as_distinct_boosted_foo:\"big cat\")^0.8 (t_as_distinct_boosted_foo:bagheera)^0.9 (t_as_distinct_boosted_foo:\"panthera pardus\")^0.85 t_as_distinct_boosted_foo:leopard)", q.toString());
q = QParser.getParser("leopard", req(params("df", "t_as_same_term_boosted_foo"))).getQuery();
assertEquals("((t_as_same_term_boosted_foo:\"big cat\")^0.8 (t_as_same_term_boosted_foo:bagheera)^0.9 (t_as_same_term_boosted_foo:\"panthera pardus\")^0.85 t_as_same_term_boosted_foo:leopard)", q.toString());
// synonyms.txt: lion => panthera leo|0.9, simba leo|0.8, kimba|0.75 (one-way mapping)
q = QParser.getParser("lion", req(params("df", "t_pick_best_boosted_foo"))).getQuery();
assertEquals("((t_pick_best_boosted_foo:\"panthera leo\")^0.9 | (t_pick_best_boosted_foo:\"simba leo\")^0.8 | (t_pick_best_boosted_foo:kimba)^0.75)", q.toString());
q = QParser.getParser("lion", req(params("df", "t_as_distinct_boosted_foo"))).getQuery();
assertEquals("((t_as_distinct_boosted_foo:\"panthera leo\")^0.9 (t_as_distinct_boosted_foo:\"simba leo\")^0.8 (t_as_distinct_boosted_foo:kimba)^0.75)", q.toString());
q = QParser.getParser("lion", req(params("df", "t_as_same_term_boosted_foo"))).getQuery();
assertEquals("((t_as_same_term_boosted_foo:\"panthera leo\")^0.9 (t_as_same_term_boosted_foo:\"simba leo\")^0.8 (t_as_same_term_boosted_foo:kimba)^0.75)", q.toString());
}
public void testSynonymsBoost_multiTermQuerySingleTermSynonyms_shouldParseBoostedQuery() throws Exception {
  // Synonym rules under test:
  //   tiger, tigre|0.9
  //   lynx => lince|0.8, lynx_canadensis|0.9
  // A two-term query must expand each term independently, keeping the boosts.
  Query query = QParser.getParser("tiger lynx", req(params("df", "t_pick_best_boosted_foo"))).getQuery();
  assertEquals("((t_pick_best_boosted_foo:tigre)^0.9 | t_pick_best_boosted_foo:tiger)" +
      " ((t_pick_best_boosted_foo:lince)^0.8 | (t_pick_best_boosted_foo:lynx_canadensis)^0.9)", query.toString());
  query = QParser.getParser("tiger lynx", req(params("df", "t_as_distinct_boosted_foo"))).getQuery();
  assertEquals("((t_as_distinct_boosted_foo:tigre)^0.9 t_as_distinct_boosted_foo:tiger)" +
      " ((t_as_distinct_boosted_foo:lince)^0.8 (t_as_distinct_boosted_foo:lynx_canadensis)^0.9)", query.toString());
  query = QParser.getParser("tiger lynx", req(params("df", "t_as_same_term_boosted_foo"))).getQuery();
  assertEquals("Synonym(t_as_same_term_boosted_foo:tiger t_as_same_term_boosted_foo:tigre^0.9)" +
      " Synonym(t_as_same_term_boosted_foo:lince^0.8 t_as_same_term_boosted_foo:lynx_canadensis^0.9)", query.toString());
}
public void testSynonymsBoost_multiTermQueryMultiTermSynonyms_shouldParseBoostedQuery() throws Exception {
  // Synonym rules under test:
  //   leopard, big cat|0.8, bagheera|0.9, panthera pardus|0.85
  //   lion => panthera leo|0.9, simba leo|0.8, kimba|0.75
  // Multi-term (phrase) synonyms of each query term must carry their boosts through.
  Query query = QParser.getParser("leopard lion", req(params("df", "t_pick_best_boosted_foo"))).getQuery();
  assertEquals("((t_pick_best_boosted_foo:\"big cat\")^0.8 | (t_pick_best_boosted_foo:bagheera)^0.9 | (t_pick_best_boosted_foo:\"panthera pardus\")^0.85 | t_pick_best_boosted_foo:leopard)" +
      " ((t_pick_best_boosted_foo:\"panthera leo\")^0.9 | (t_pick_best_boosted_foo:\"simba leo\")^0.8 | (t_pick_best_boosted_foo:kimba)^0.75)", query.toString());
  query = QParser.getParser("leopard lion", req(params("df", "t_as_distinct_boosted_foo"))).getQuery();
  assertEquals("((t_as_distinct_boosted_foo:\"big cat\")^0.8 (t_as_distinct_boosted_foo:bagheera)^0.9 (t_as_distinct_boosted_foo:\"panthera pardus\")^0.85 t_as_distinct_boosted_foo:leopard)" +
      " ((t_as_distinct_boosted_foo:\"panthera leo\")^0.9 (t_as_distinct_boosted_foo:\"simba leo\")^0.8 (t_as_distinct_boosted_foo:kimba)^0.75)", query.toString());
  query = QParser.getParser("leopard lion", req(params("df", "t_as_same_term_boosted_foo"))).getQuery();
  assertEquals("((t_as_same_term_boosted_foo:\"big cat\")^0.8 (t_as_same_term_boosted_foo:bagheera)^0.9 (t_as_same_term_boosted_foo:\"panthera pardus\")^0.85 t_as_same_term_boosted_foo:leopard)" +
      " ((t_as_same_term_boosted_foo:\"panthera leo\")^0.9 (t_as_same_term_boosted_foo:\"simba leo\")^0.8 (t_as_same_term_boosted_foo:kimba)^0.75)", query.toString());
}
public void testSynonymsBoost_singleConceptQuerySingleTermSynonym_shouldParseBoostedQuery() throws Exception {
  // Synonym rule under test: panthera pardus, leopard|0.6
  // sow=false lets the multi-term concept "panthera pardus" be analyzed as a unit.
  Query query = QParser.getParser("panthera pardus story", req(params("df", "t_pick_best_boosted_foo", "sow", "false"))).getQuery();
  assertEquals("((t_pick_best_boosted_foo:leopard)^0.6 | t_pick_best_boosted_foo:\"panthera pardus\") t_pick_best_boosted_foo:story", query.toString());
  query = QParser.getParser("panthera pardus story", req(params("df", "t_as_distinct_boosted_foo", "sow", "false"))).getQuery();
  assertEquals("((t_as_distinct_boosted_foo:leopard)^0.6 t_as_distinct_boosted_foo:\"panthera pardus\") t_as_distinct_boosted_foo:story", query.toString());
  query = QParser.getParser("panthera pardus story", req(params("df", "t_as_same_term_boosted_foo", "sow", "false"))).getQuery();
  assertEquals("((t_as_same_term_boosted_foo:leopard)^0.6 t_as_same_term_boosted_foo:\"panthera pardus\") t_as_same_term_boosted_foo:story", query.toString());
  // Synonym rule under test: panthera tigris => tiger|0.99 (replacement rule, boosted)
  query = QParser.getParser("panthera tigris story", req(params("df", "t_pick_best_boosted_foo", "sow", "false"))).getQuery();
  assertEquals("(t_pick_best_boosted_foo:tiger)^0.99 t_pick_best_boosted_foo:story", query.toString());
  query = QParser.getParser("panthera tigris story", req(params("df", "t_as_distinct_boosted_foo", "sow", "false"))).getQuery();
  assertEquals("(t_as_distinct_boosted_foo:tiger)^0.99 t_as_distinct_boosted_foo:story", query.toString());
  query = QParser.getParser("panthera tigris story", req(params("df", "t_as_same_term_boosted_foo", "sow", "false"))).getQuery();
  assertEquals("(t_as_same_term_boosted_foo:tiger)^0.99 t_as_same_term_boosted_foo:story", query.toString());
}
public void testSynonymsBoost_singleConceptQueryMultiTermSynonymWithMultipleBoost_shouldParseMultiplicativeBoostedQuery() throws Exception {
  // Synonym rule under test: panthera blytheae, oldest|0.5 ancient|0.9 panthera
  // A synonym phrase with boosts on several of its tokens gets one combined,
  // multiplicative boost: 0.5 * 0.9 = 0.45.
  Query query = QParser.getParser("panthera blytheae", req(params("df", "t_pick_best_boosted_foo", "sow", "false"))).getQuery();
  assertEquals("((t_pick_best_boosted_foo:\"oldest ancient panthera\")^0.45 | t_pick_best_boosted_foo:\"panthera blytheae\")", query.toString());
  query = QParser.getParser("panthera blytheae", req(params("df", "t_as_distinct_boosted_foo", "sow", "false"))).getQuery();
  assertEquals("((t_as_distinct_boosted_foo:\"oldest ancient panthera\")^0.45 t_as_distinct_boosted_foo:\"panthera blytheae\")", query.toString());
  query = QParser.getParser("panthera blytheae", req(params("df", "t_as_same_term_boosted_foo", "sow", "false"))).getQuery();
  assertEquals("((t_as_same_term_boosted_foo:\"oldest ancient panthera\")^0.45 t_as_same_term_boosted_foo:\"panthera blytheae\")", query.toString());
}
public void testSynonymsBoost_singleConceptQueryMultiTermSynonyms_shouldParseBoostedQuery() throws Exception {
  // Synonym rule under test: snow leopard, panthera uncia|0.9, big cat|0.8, white_leopard|0.6
  Query query = QParser.getParser("snow leopard", req(params("df", "t_pick_best_boosted_foo", "sow", "false"))).getQuery();
  assertEquals("((t_pick_best_boosted_foo:\"panthera uncia\")^0.9 | (t_pick_best_boosted_foo:\"big cat\")^0.8 | (t_pick_best_boosted_foo:white_leopard)^0.6 | t_pick_best_boosted_foo:\"snow leopard\")", query.toString());
  query = QParser.getParser("snow leopard", req(params("df", "t_as_distinct_boosted_foo", "sow", "false"))).getQuery();
  assertEquals("((t_as_distinct_boosted_foo:\"panthera uncia\")^0.9 (t_as_distinct_boosted_foo:\"big cat\")^0.8 (t_as_distinct_boosted_foo:white_leopard)^0.6 t_as_distinct_boosted_foo:\"snow leopard\")", query.toString());
  query = QParser.getParser("snow leopard", req(params("df", "t_as_same_term_boosted_foo", "sow", "false"))).getQuery();
  assertEquals("((t_as_same_term_boosted_foo:\"panthera uncia\")^0.9 (t_as_same_term_boosted_foo:\"big cat\")^0.8 (t_as_same_term_boosted_foo:white_leopard)^0.6 t_as_same_term_boosted_foo:\"snow leopard\")", query.toString());
  // Synonym rule under test: panthera onca => jaguar|0.95, big cat|0.85, black panther|0.65
  query = QParser.getParser("panthera onca", req(params("df", "t_pick_best_boosted_foo", "sow", "false"))).getQuery();
  assertEquals("((t_pick_best_boosted_foo:jaguar)^0.95 | (t_pick_best_boosted_foo:\"big cat\")^0.85 | (t_pick_best_boosted_foo:\"black panther\")^0.65)", query.toString());
  query = QParser.getParser("panthera onca", req(params("df", "t_as_distinct_boosted_foo", "sow", "false"))).getQuery();
  assertEquals("((t_as_distinct_boosted_foo:jaguar)^0.95 (t_as_distinct_boosted_foo:\"big cat\")^0.85 (t_as_distinct_boosted_foo:\"black panther\")^0.65)", query.toString());
  query = QParser.getParser("panthera onca", req(params("df", "t_as_same_term_boosted_foo", "sow", "false"))).getQuery();
  assertEquals("((t_as_same_term_boosted_foo:jaguar)^0.95 (t_as_same_term_boosted_foo:\"big cat\")^0.85 (t_as_same_term_boosted_foo:\"black panther\")^0.65)", query.toString());
}
public void testSynonymsBoost_multiConceptQuerySingleTermSynonym_shouldParseBoostedQuery() throws Exception {
  // Synonym rules under test:
  //   panthera pardus, leopard|0.6
  //   tiger, tigre|0.9
  // Two independent concepts in one query each expand with their own boosts.
  Query query = QParser.getParser("panthera pardus tiger", req(params("df", "t_pick_best_boosted_foo", "sow", "false"))).getQuery();
  assertEquals("((t_pick_best_boosted_foo:leopard)^0.6 | t_pick_best_boosted_foo:\"panthera pardus\") ((t_pick_best_boosted_foo:tigre)^0.9 | t_pick_best_boosted_foo:tiger)", query.toString());
  query = QParser.getParser("panthera pardus tiger", req(params("df", "t_as_distinct_boosted_foo", "sow", "false"))).getQuery();
  assertEquals("((t_as_distinct_boosted_foo:leopard)^0.6 t_as_distinct_boosted_foo:\"panthera pardus\") ((t_as_distinct_boosted_foo:tigre)^0.9 t_as_distinct_boosted_foo:tiger)", query.toString());
  query = QParser.getParser("panthera pardus tiger", req(params("df", "t_as_same_term_boosted_foo", "sow", "false"))).getQuery();
  assertEquals("((t_as_same_term_boosted_foo:leopard)^0.6 t_as_same_term_boosted_foo:\"panthera pardus\") Synonym(t_as_same_term_boosted_foo:tiger t_as_same_term_boosted_foo:tigre^0.9)", query.toString());
}
public void testSynonymsBoost_multiConceptsQueryMultiTermSynonyms_shouldParseBoostedQuery() throws Exception {
  // Synonym rules under test:
  //   snow leopard, panthera uncia|0.9, big cat|0.8, white_leopard|0.6
  //   panthera onca => jaguar|0.95, big cat|0.85, black panther|0.65
  // Two multi-term concepts in one query; each expansion keeps its own boosts.
  Query query = QParser.getParser("snow leopard panthera onca", req(params("df", "t_pick_best_boosted_foo", "sow", "false"))).getQuery();
  assertEquals("((t_pick_best_boosted_foo:\"panthera uncia\")^0.9 | (t_pick_best_boosted_foo:\"big cat\")^0.8 | (t_pick_best_boosted_foo:white_leopard)^0.6 | t_pick_best_boosted_foo:\"snow leopard\")" +
      " ((t_pick_best_boosted_foo:jaguar)^0.95 | (t_pick_best_boosted_foo:\"big cat\")^0.85 | (t_pick_best_boosted_foo:\"black panther\")^0.65)", query.toString());
  query = QParser.getParser("snow leopard panthera onca", req(params("df", "t_as_distinct_boosted_foo", "sow", "false"))).getQuery();
  assertEquals("((t_as_distinct_boosted_foo:\"panthera uncia\")^0.9 (t_as_distinct_boosted_foo:\"big cat\")^0.8 (t_as_distinct_boosted_foo:white_leopard)^0.6 t_as_distinct_boosted_foo:\"snow leopard\")" +
      " ((t_as_distinct_boosted_foo:jaguar)^0.95 (t_as_distinct_boosted_foo:\"big cat\")^0.85 (t_as_distinct_boosted_foo:\"black panther\")^0.65)", query.toString());
  query = QParser.getParser("snow leopard panthera onca", req(params("df", "t_as_same_term_boosted_foo", "sow", "false"))).getQuery();
  assertEquals("((t_as_same_term_boosted_foo:\"panthera uncia\")^0.9 (t_as_same_term_boosted_foo:\"big cat\")^0.8 (t_as_same_term_boosted_foo:white_leopard)^0.6 t_as_same_term_boosted_foo:\"snow leopard\")" +
      " ((t_as_same_term_boosted_foo:jaguar)^0.95 (t_as_same_term_boosted_foo:\"big cat\")^0.85 (t_as_same_term_boosted_foo:\"black panther\")^0.65)", query.toString());
}
public void testSynonymsBoost_edismaxBoost_shouldParseBoostedPhraseQuery() throws Exception {
  // The edismax per-field boost (qf=...^10) must wrap each expanded synonym
  // group, multiplying on top of the per-synonym boosts rather than replacing them.
  Query query = QParser.getParser("snow leopard lion", "edismax", true, req(params("sow", "false", "qf", "t_pick_best_boosted_foo^10"))).getQuery();
  assertEquals("+(" +
      "((((t_pick_best_boosted_foo:\"panthera uncia\")^0.9 | (t_pick_best_boosted_foo:\"big cat\")^0.8 | (t_pick_best_boosted_foo:white_leopard)^0.6 | t_pick_best_boosted_foo:\"snow leopard\"))^10.0)" +
      " ((((t_pick_best_boosted_foo:\"panthera leo\")^0.9 | (t_pick_best_boosted_foo:\"simba leo\")^0.8 | (t_pick_best_boosted_foo:kimba)^0.75))^10.0)" +
      ")", query.toString());
  query = QParser.getParser("snow leopard lion", "edismax", true, req(params("sow", "false", "qf", "t_as_distinct_boosted_foo^10"))).getQuery();
  assertEquals("+(" +
      "(((t_as_distinct_boosted_foo:\"panthera uncia\")^0.9 (t_as_distinct_boosted_foo:\"big cat\")^0.8 (t_as_distinct_boosted_foo:white_leopard)^0.6 t_as_distinct_boosted_foo:\"snow leopard\")^10.0)" +
      " (((t_as_distinct_boosted_foo:\"panthera leo\")^0.9 (t_as_distinct_boosted_foo:\"simba leo\")^0.8 (t_as_distinct_boosted_foo:kimba)^0.75)^10.0))", query.toString());
  query = QParser.getParser("snow leopard lion", "edismax", true, req(params("sow", "false", "qf", "t_as_same_term_boosted_foo^10"))).getQuery();
  assertEquals("+(" +
      "(((t_as_same_term_boosted_foo:\"panthera uncia\")^0.9 (t_as_same_term_boosted_foo:\"big cat\")^0.8 (t_as_same_term_boosted_foo:white_leopard)^0.6 t_as_same_term_boosted_foo:\"snow leopard\")^10.0)" +
      " (((t_as_same_term_boosted_foo:\"panthera leo\")^0.9 (t_as_same_term_boosted_foo:\"simba leo\")^0.8 (t_as_same_term_boosted_foo:kimba)^0.75)^10.0))", query.toString());
}
public void testSynonymsBoost_phraseQueryMultiTermSynonymsBoost_shouldParseBoostedSpanQuery() throws Exception {
  // An explicit phrase query over boosted multi-term synonyms must be built as a
  // spanNear of spanOr alternatives, with each alternative keeping its synonym boost.
  Query query = QParser.getParser("\"snow leopard lion\"", req(params("df", "t_pick_best_boosted_foo", "sow", "false"))).getQuery();
  assertEquals("spanNear([" +
      "spanOr([" +
      "(spanNear([t_pick_best_boosted_foo:panthera, t_pick_best_boosted_foo:uncia], 0, true))^0.9," +
      " (spanNear([t_pick_best_boosted_foo:big, t_pick_best_boosted_foo:cat], 0, true))^0.8," +
      " (t_pick_best_boosted_foo:white_leopard)^0.6," +
      " spanNear([t_pick_best_boosted_foo:snow, t_pick_best_boosted_foo:leopard], 0, true)])," +
      " spanOr([" +
      "(spanNear([t_pick_best_boosted_foo:panthera, t_pick_best_boosted_foo:leo], 0, true))^0.9," +
      " (spanNear([t_pick_best_boosted_foo:simba, t_pick_best_boosted_foo:leo], 0, true))^0.8," +
      " (t_pick_best_boosted_foo:kimba)^0.75])], 0, true)", query.toString());
}
public void testSynonymsBoost_phraseQueryMultiTermSynonymsMultipleBoost_shouldParseMultiplicativeBoostedSpanQuery() throws Exception {
  // A phrase query over a synonym whose tokens carry several boosts
  // ("oldest|0.5 ancient|0.9 panthera") gets one multiplied boost: 0.5 * 0.9 = 0.45.
  Query query = QParser.getParser("\"panthera blytheae lion\"", req(params("df", "t_pick_best_boosted_foo", "sow", "false"))).getQuery();
  assertEquals("spanNear([" +
      "spanOr([" +
      "(spanNear([t_pick_best_boosted_foo:oldest, t_pick_best_boosted_foo:ancient, t_pick_best_boosted_foo:panthera], 0, true))^0.45," +
      " spanNear([t_pick_best_boosted_foo:panthera, t_pick_best_boosted_foo:blytheae], 0, true)])," +
      " spanOr([" +
      "(spanNear([t_pick_best_boosted_foo:panthera, t_pick_best_boosted_foo:leo], 0, true))^0.9," +
      " (spanNear([t_pick_best_boosted_foo:simba, t_pick_best_boosted_foo:leo], 0, true))^0.8," +
      " (t_pick_best_boosted_foo:kimba)^0.75])], 0, true)", query.toString());
}
public void testSynonymsBoost_BoostMissing_shouldAssignDefaultBoost() throws Exception {
  // Synonym rule under test: leopard, big cat|0.8, bagheera|0.9, panthera pardus|0.85
  // "leopard" itself carries no delimiter/boost, so it must appear un-boosted
  // (default 1.0) alongside its boosted synonyms.
  Query query = QParser.getParser("leopard", req(params("df", "t_pick_best_boosted_foo"))).getQuery();
  assertEquals("((t_pick_best_boosted_foo:\"big cat\")^0.8 | (t_pick_best_boosted_foo:bagheera)^0.9 | (t_pick_best_boosted_foo:\"panthera pardus\")^0.85 | t_pick_best_boosted_foo:leopard)", query.toString());
  query = QParser.getParser("leopard", req(params("df", "t_as_distinct_boosted_foo"))).getQuery();
  assertEquals("((t_as_distinct_boosted_foo:\"big cat\")^0.8 (t_as_distinct_boosted_foo:bagheera)^0.9 (t_as_distinct_boosted_foo:\"panthera pardus\")^0.85 t_as_distinct_boosted_foo:leopard)", query.toString());
}
@Test

View File

@ -398,6 +398,72 @@ Discard original token (`inject="false"`).
Note that "Kuczewski" has two encodings, which are added at the same position.
== Delimited Boost Filter
This filter attaches a numeric floating-point boost to each token, splitting the token text from the boost value at a delimiter character.
*Factory class:* `solr.DelimitedBoostTokenFilterFactory`
*Arguments:*
`delimiter`:: The character used to separate the token and the boost. Defaults to '|'.
*Example:*
[.dynamic-tabs]
--
[example.tab-pane#byname-filter-delimitedBoost]
====
[.tab-label]*With name*
[source,xml]
----
<analyzer>
<tokenizer name="standard"/>
<filter name="delimitedBoost"/>
</analyzer>
----
====
[example.tab-pane#byclass-filter-delimitedBoost]
====
[.tab-label]*With class name (legacy)*
[source,xml]
----
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory"/>
<filter class="solr.DelimitedBoostTokenFilterFactory"/>
</analyzer>
----
====
--
*In:* "leopard|0.5 panthera uncia|0.9"
*Tokenizer to Filter:* "leopard|0.5"(1), "panthera"(2), "uncia|0.9"(3)
*Out:* "leopard"(1)[0.5], "panthera"(2), "uncia"(3)[0.9]
The floating-point number shown in square brackets is the boost attribute attached to that token.
*Example:*
Using a different delimiter (`delimiter="/"`).
[source,xml]
----
<analyzer>
<tokenizer name="standard"/>
<filter name="delimitedBoost" delimiter="/"/>
</analyzer>
----
*In:* "leopard/0.5 panthera uncia/0.9"
*Tokenizer to Filter:* "leopard/0.5"(1), "panthera"(2), "uncia/0.9"(3)
*Out:* "leopard"(1)[0.5], "panthera"(2), "uncia"(3)[0.9]
*Note:* Make sure the delimiter is not a character that your tokenizer splits on; if the tokenizer consumes the delimiter, this filter will never see it.
== Edge N-Gram Filter
This filter generates edge n-gram tokens of sizes within the given range.
@ -2292,6 +2358,39 @@ small => tiny,teeny,weeny
*Out:* "the"(1), "large"(2), "large"(3), "couch"(4), "sofa"(4), "divan"(4)
*Weighted Synonyms:*
By combining the Delimited Boost Filter with the Synonym Graph Filter, you can achieve weighted synonyms at query time.
For more information, see:
https://sease.io/2020/02/introducing-weighted-synonyms-in-apache-lucene.html
For the following examples, assume a synonyms file named `boostedSynonyms.txt`:
[source,text]
----
leopard, big cat|0.8, bagheera|0.9, panthera pardus|0.85
lion => panthera leo|0.9, simba|0.8, kimba|0.75
----
*Example:*
====
[.tab-label]*With name*
[source,xml]
----
<analyzer type="query">
<tokenizer name="standard"/>
<filter name="synonymGraph" synonyms="boostedSynonyms.txt"/>
<filter name="delimitedBoost"/>
</analyzer>
----
====
*In:* "lion"
*Tokenizer to Filter:* "lion"(1)
*Out:* "panthera"(1), "leo"(2)[0.9], "simba"(1)[0.8], "kimba"(1)[0.75]
== Token Offset Payload Filter
This filter adds the numeric character offsets of the token as a payload value for that token.