mirror of https://github.com/apache/lucene.git
[SOLR-12238] Synonym Queries boost (#357)
SOLR-12238: Handle boosts in QueryBuilder. QueryBuilder now detects per-term boosts supplied by a BoostAttribute when building queries from a TokenStream. This commit also adds a DelimitedBoostTokenFilter, which parses boosts from tokens using a delimiter character, and exposes it in Solr.
This commit is contained in:
parent 57c7139ea3
commit 663611c99c
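To make the new feature concrete before diving into the diff, here is a minimal sketch (the class name and sample input are illustrative, not part of the commit) that runs the new filter over a whitespace-tokenized string and prints each term together with the boost that QueryBuilder would pick up from the BoostAttribute:

[source,java]
----
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.boost.DelimitedBoostTokenFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.search.BoostAttribute;

public class DelimitedBoostDemo {
  public static void main(String[] args) throws IOException {
    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("tiger tigre|0.9"));
    // '|' is the default delimiter registered by the new factory.
    TokenStream stream = new DelimitedBoostTokenFilter(tokenizer, '|');
    CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
    BoostAttribute boostAtt = stream.addAttribute(BoostAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      // Tokens without a delimiter keep the default boost of 1.0f.
      System.out.println(termAtt + " boost=" + boostAtt.getBoost());
    }
    stream.end();
    stream.close();
  }
}
----

Expected output: "tiger boost=1.0" followed by "tigre boost=0.9".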
@@ -0,0 +1,63 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.boost;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.search.BoostAttribute;

import java.io.IOException;

/**
 * Characters before the delimiter are the "token", those after are the boost.
 * <p>
 * For example, if the delimiter is '|', then for the string "foo|0.7", "foo" is the token
 * and 0.7 is the boost.
 * <p>
 * Note: make sure your Tokenizer doesn't split on the delimiter, or this won't work.
 */
public final class DelimitedBoostTokenFilter extends TokenFilter {
  private final char delimiter;
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final BoostAttribute boostAtt = addAttribute(BoostAttribute.class);

  public DelimitedBoostTokenFilter(TokenStream input, char delimiter) {
    super(input);
    this.delimiter = delimiter;
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (input.incrementToken()) {
      final char[] buffer = termAtt.buffer();
      final int length = termAtt.length();
      for (int i = 0; i < length; i++) {
        if (buffer[i] == delimiter) {
          float boost = Float.parseFloat(new String(buffer, i + 1, (length - (i + 1))));
          boostAtt.setBoost(boost);
          termAtt.setLength(i);
          return true;
        }
      }
      return true;
    } else {
      return false;
    }
  }
}
@@ -0,0 +1,63 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.boost;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.util.TokenFilterFactory;

import java.util.Map;

/**
 * Factory for {@link DelimitedBoostTokenFilter}.
 * <pre class="prettyprint">
 * <fieldType name="text_dlmtd" class="solr.TextField" positionIncrementGap="100">
 *   <analyzer>
 *     <tokenizer class="solr.WhitespaceTokenizerFactory"/>
 *     <filter class="solr.DelimitedBoostTokenFilterFactory" delimiter="|"/>
 *   </analyzer>
 * </fieldType></pre>
 *
 * @lucene.spi {@value #NAME}
 */
public class DelimitedBoostTokenFilterFactory extends TokenFilterFactory {

  /**
   * SPI name
   */
  public static final String NAME = "delimitedBoost";
  public static final String DELIMITER_ATTR = "delimiter";
  public static final char DEFAULT_DELIMITER = '|';

  private final char delimiter;

  /**
   * Creates a new DelimitedBoostTokenFilterFactory
   */
  public DelimitedBoostTokenFilterFactory(Map<String, String> args) {
    super(args);
    delimiter = getChar(args, DELIMITER_ATTR, DEFAULT_DELIMITER);
    if (!args.isEmpty()) {
      throw new IllegalArgumentException("Unknown parameters: " + args);
    }
  }

  @Override
  public DelimitedBoostTokenFilter create(TokenStream input) {
    return new DelimitedBoostTokenFilter(input, delimiter);
  }

}
@@ -0,0 +1,21 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * Provides various convenience classes for creating boosts on Tokens.
 */
package org.apache.lucene.analysis.boost;
@@ -17,6 +17,7 @@ org.apache.lucene.analysis.tr.ApostropheFilterFactory
org.apache.lucene.analysis.ar.ArabicNormalizationFilterFactory
org.apache.lucene.analysis.ar.ArabicStemFilterFactory
org.apache.lucene.analysis.bg.BulgarianStemFilterFactory
+org.apache.lucene.analysis.boost.DelimitedBoostTokenFilterFactory
org.apache.lucene.analysis.bn.BengaliNormalizationFilterFactory
org.apache.lucene.analysis.bn.BengaliStemFilterFactory
org.apache.lucene.analysis.br.BrazilianStemFilterFactory
@@ -0,0 +1,85 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.boost;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.search.BoostAttribute;

public class DelimitedBoostTokenFilterTest extends BaseTokenStreamTestCase {

  public void testBoosts() throws Exception {
    String test = "The quick|0.4 red|0.5 fox|0.2 jumped|0.1 over the lazy|0.8 brown|0.9 dogs|0.9";
    DelimitedBoostTokenFilter filter =
        new DelimitedBoostTokenFilter(whitespaceMockTokenizer(test),
            DelimitedBoostTokenFilterFactory.DEFAULT_DELIMITER);
    CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
    BoostAttribute boostAtt = filter.addAttribute(BoostAttribute.class);
    filter.reset();
    assertTermEquals("The", filter, termAtt, boostAtt, 1.0f);
    assertTermEquals("quick", filter, termAtt, boostAtt, 0.4f);
    assertTermEquals("red", filter, termAtt, boostAtt, 0.5f);
    assertTermEquals("fox", filter, termAtt, boostAtt, 0.2f);
    assertTermEquals("jumped", filter, termAtt, boostAtt, 0.1f);
    assertTermEquals("over", filter, termAtt, boostAtt, 1.0f);
    assertTermEquals("the", filter, termAtt, boostAtt, 1.0f);
    assertTermEquals("lazy", filter, termAtt, boostAtt, 0.8f);
    assertTermEquals("brown", filter, termAtt, boostAtt, 0.9f);
    assertTermEquals("dogs", filter, termAtt, boostAtt, 0.9f);
    assertFalse(filter.incrementToken());
    filter.end();
    filter.close();
  }

  public void testNext() throws Exception {
    String test = "The quick|0.1 red|0.2 fox|0.3 jumped|0.4 over the lazy|0.5 brown|0.6 dogs|0.6";
    DelimitedBoostTokenFilter filter =
        new DelimitedBoostTokenFilter(whitespaceMockTokenizer(test),
            DelimitedBoostTokenFilterFactory.DEFAULT_DELIMITER);
    filter.reset();
    assertTermEquals("The", filter, 1.0f);
    assertTermEquals("quick", filter, 0.1f);
    assertTermEquals("red", filter, 0.2f);
    assertTermEquals("fox", filter, 0.3f);
    assertTermEquals("jumped", filter, 0.4f);
    assertTermEquals("over", filter, 1.0f);
    assertTermEquals("the", filter, 1.0f);
    assertTermEquals("lazy", filter, 0.5f);
    assertTermEquals("brown", filter, 0.6f);
    assertTermEquals("dogs", filter, 0.6f);
    assertFalse(filter.incrementToken());
    filter.end();
    filter.close();
  }

  void assertTermEquals(String expected, TokenStream stream, float expectedBoost) throws Exception {
    CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
    BoostAttribute boostAtt = stream.addAttribute(BoostAttribute.class);
    assertTrue(stream.incrementToken());
    assertEquals(expected, termAtt.toString());
    float actualBoost = boostAtt.getBoost();
    assertTrue(actualBoost + " does not equal: " + expectedBoost, actualBoost == expectedBoost);
  }

  void assertTermEquals(String expected, TokenStream stream, CharTermAttribute termAtt, BoostAttribute boostAtt, float expectedBoost) throws Exception {
    assertTrue(stream.incrementToken());
    assertEquals(expected, termAtt.toString());
    float actualBoost = boostAtt.getBoost();
    assertTrue(actualBoost + " does not equal: " + expectedBoost, actualBoost == expectedBoost);
  }
}
@@ -32,6 +32,7 @@ import org.apache.lucene.index.Terms; // javadocs only
 * @lucene.internal
 */
public interface BoostAttribute extends Attribute {
+  float DEFAULT_BOOST = 1.0f;
  /** Sets the boost in this attribute */
  public void setBoost(float boost);
  /** Retrieves the boost, default is {@code 1.0f}. */
@@ -30,17 +30,21 @@ import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.BoostAttribute;
+import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MultiPhraseQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SynonymQuery;
import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.spans.SpanBoostQuery;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.util.graph.GraphTokenStreamFiniteStrings;

+import static org.apache.lucene.search.BoostAttribute.DEFAULT_BOOST;

/**
 * Creates queries from the {@link Analyzer} chain.
@@ -63,6 +67,24 @@ public class QueryBuilder {
  protected boolean enableGraphQueries = true;
  protected boolean autoGenerateMultiTermSynonymsPhraseQuery = false;

+  /**
+   * Wraps a term and boost
+   */
+  public static class TermAndBoost {
+    /** the term */
+    public final Term term;
+    /** the boost */
+    public final float boost;
+
+    /**
+     * Creates a new TermAndBoost
+     */
+    public TermAndBoost(Term term, float boost) {
+      this.term = term;
+      this.boost = boost;
+    }
+  }
+
  /** Creates a new QueryBuilder using the given analyzer. */
  public QueryBuilder(Analyzer analyzer) {
    this.analyzer = analyzer;
@@ -350,22 +372,32 @@ public class QueryBuilder {
   */
  protected SpanQuery createSpanQuery(TokenStream in, String field) throws IOException {
    TermToBytesRefAttribute termAtt = in.getAttribute(TermToBytesRefAttribute.class);
+    BoostAttribute boostAtt = in.addAttribute(BoostAttribute.class);
+
+    SpanQuery result;
+    float boost = DEFAULT_BOOST;
    if (termAtt == null) {
      return null;
    }

    List<SpanTermQuery> terms = new ArrayList<>();
    while (in.incrementToken()) {
+      boost *= boostAtt.getBoost();
      terms.add(new SpanTermQuery(new Term(field, termAtt.getBytesRef())));
    }

    if (terms.isEmpty()) {
      return null;
    } else if (terms.size() == 1) {
-      return terms.get(0);
+      result = terms.get(0);
    } else {
-      return new SpanNearQuery(terms.toArray(new SpanTermQuery[0]), 0, true);
+      result = new SpanNearQuery(terms.toArray(new SpanQuery[0]), 0, true);
    }
+
+    if (boost != DEFAULT_BOOST) {
+      result = new SpanBoostQuery(result, boost);
+    }
+    return result;
  }

  /**
@@ -373,13 +405,14 @@ public class QueryBuilder {
   */
  protected Query analyzeTerm(String field, TokenStream stream) throws IOException {
    TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
+    BoostAttribute boostAtt = stream.addAttribute(BoostAttribute.class);

    stream.reset();
    if (!stream.incrementToken()) {
      throw new AssertionError();
    }

-    return newTermQuery(new Term(field, termAtt.getBytesRef()));
+    return newTermQuery(new Term(field, termAtt.getBytesRef()), boostAtt.getBoost());
  }

  /**
@@ -387,24 +420,25 @@ public class QueryBuilder {
   */
  protected Query analyzeBoolean(String field, TokenStream stream) throws IOException {
    TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
+    BoostAttribute boostAtt = stream.addAttribute(BoostAttribute.class);

    stream.reset();
-    List<Term> terms = new ArrayList<>();
+    List<TermAndBoost> terms = new ArrayList<>();
    while (stream.incrementToken()) {
-      terms.add(new Term(field, termAtt.getBytesRef()));
+      terms.add(new TermAndBoost(new Term(field, termAtt.getBytesRef()), boostAtt.getBoost()));
    }

-    return newSynonymQuery(terms.toArray(new Term[terms.size()]));
+    return newSynonymQuery(terms.toArray(new TermAndBoost[0]));
  }

-  protected void add(BooleanQuery.Builder q, List<Term> current, BooleanClause.Occur operator) {
+  protected void add(BooleanQuery.Builder q, List<TermAndBoost> current, BooleanClause.Occur operator) {
    if (current.isEmpty()) {
      return;
    }
    if (current.size() == 1) {
-      q.add(newTermQuery(current.get(0)), operator);
+      q.add(newTermQuery(current.get(0).term, current.get(0).boost), operator);
    } else {
-      q.add(newSynonymQuery(current.toArray(new Term[current.size()])), operator);
+      q.add(newSynonymQuery(current.toArray(new TermAndBoost[0])), operator);
    }
  }
@@ -413,10 +447,11 @@ public class QueryBuilder {
   */
  protected Query analyzeMultiBoolean(String field, TokenStream stream, BooleanClause.Occur operator) throws IOException {
    BooleanQuery.Builder q = newBooleanQuery();
-    List<Term> currentQuery = new ArrayList<>();
+    List<TermAndBoost> currentQuery = new ArrayList<>();

    TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
    PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
+    BoostAttribute boostAtt = stream.addAttribute(BoostAttribute.class);

    stream.reset();
    while (stream.incrementToken()) {
@@ -424,7 +459,7 @@ public class QueryBuilder {
        add(q, currentQuery, operator);
        currentQuery.clear();
      }
-      currentQuery.add(new Term(field, termAtt.getBytesRef()));
+      currentQuery.add(new TermAndBoost(new Term(field, termAtt.getBytesRef()), boostAtt.getBoost()));
    }
    add(q, currentQuery, operator);
@@ -439,9 +474,10 @@ public class QueryBuilder {
    builder.setSlop(slop);

    TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
+    BoostAttribute boostAtt = stream.addAttribute(BoostAttribute.class);
    PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
-    int position = -1;

+    int position = -1;
+    float phraseBoost = DEFAULT_BOOST;
    stream.reset();
    while (stream.incrementToken()) {
      if (enablePositionIncrements) {
@@ -450,9 +486,13 @@ public class QueryBuilder {
        position += 1;
      }
      builder.add(new Term(field, termAtt.getBytesRef()), position);
+      phraseBoost *= boostAtt.getBoost();
    }

-    return builder.build();
+    PhraseQuery query = builder.build();
+    if (phraseBoost == DEFAULT_BOOST) {
+      return query;
+    }
+    return new BoostQuery(query, phraseBoost);
  }

  /**
@@ -509,33 +549,40 @@ public class QueryBuilder {
        end = articulationPoints[i];
      }
      lastState = end;
-      final Query queryPos;
+      final Query positionalQuery;
      if (graph.hasSidePath(start)) {
-        final Iterator<TokenStream> it = graph.getFiniteStrings(start, end);
+        final Iterator<TokenStream> sidePathsIterator = graph.getFiniteStrings(start, end);
        Iterator<Query> queries = new Iterator<Query>() {
          @Override
          public boolean hasNext() {
-            return it.hasNext();
+            return sidePathsIterator.hasNext();
          }

          @Override
          public Query next() {
-            TokenStream ts = it.next();
-            return createFieldQuery(ts, BooleanClause.Occur.MUST, field, getAutoGenerateMultiTermSynonymsPhraseQuery(), 0);
+            TokenStream sidePath = sidePathsIterator.next();
+            return createFieldQuery(sidePath, BooleanClause.Occur.MUST, field, getAutoGenerateMultiTermSynonymsPhraseQuery(), 0);
          }
        };
-        queryPos = newGraphSynonymQuery(queries);
+        positionalQuery = newGraphSynonymQuery(queries);
      } else {
-        Term[] terms = graph.getTerms(field, start);
+        List<AttributeSource> attributes = graph.getTerms(start);
+        TermAndBoost[] terms = attributes.stream()
+            .map(s -> {
+              TermToBytesRefAttribute t = s.addAttribute(TermToBytesRefAttribute.class);
+              BoostAttribute b = s.addAttribute(BoostAttribute.class);
+              return new TermAndBoost(new Term(field, t.getBytesRef()), b.getBoost());
+            })
+            .toArray(TermAndBoost[]::new);
        assert terms.length > 0;
        if (terms.length == 1) {
-          queryPos = newTermQuery(terms[0]);
+          positionalQuery = newTermQuery(terms[0].term, terms[0].boost);
        } else {
-          queryPos = newSynonymQuery(terms);
+          positionalQuery = newSynonymQuery(terms);
        }
      }
-      if (queryPos != null) {
-        builder.add(queryPos, operator);
+      if (positionalQuery != null) {
+        builder.add(positionalQuery, operator);
      }
    }
    return builder.build();
@@ -650,10 +697,10 @@ public class QueryBuilder {
   * This is intended for subclasses that wish to customize the generated queries.
   * @return new Query instance
   */
-  protected Query newSynonymQuery(Term terms[]) {
-    SynonymQuery.Builder builder = new SynonymQuery.Builder(terms[0].field());
-    for (Term term : terms) {
-      builder.addTerm(term);
+  protected Query newSynonymQuery(TermAndBoost[] terms) {
+    SynonymQuery.Builder builder = new SynonymQuery.Builder(terms[0].term.field());
+    for (TermAndBoost t : terms) {
+      builder.addTerm(t.term, t.boost);
    }
    return builder.build();
  }
@@ -683,10 +730,15 @@ public class QueryBuilder {
   * @param term term
   * @return new TermQuery instance
   */
-  protected Query newTermQuery(Term term) {
-    return new TermQuery(term);
+  protected Query newTermQuery(Term term, float boost) {
+    Query q = new TermQuery(term);
+    if (boost == DEFAULT_BOOST) {
+      return q;
+    }
+    return new BoostQuery(q, boost);
  }

  /**
   * Builds a new MultiPhraseQuery instance.
   * <p>
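Since newTermQuery and newSynonymQuery are explicitly intended as extension points, here is a minimal sketch of a subclass using the new boost-aware signature (the class name and the 0.1f floor are illustrative assumptions, not part of the commit):

[source,java]
----
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.QueryBuilder;

// Hypothetical subclass: clamps tiny per-term boosts so heavily
// down-weighted synonyms still contribute a little to the score.
public class FlooredBoostQueryBuilder extends QueryBuilder {
  public FlooredBoostQueryBuilder(Analyzer analyzer) {
    super(analyzer);
  }

  @Override
  protected Query newTermQuery(Term term, float boost) {
    return super.newTermQuery(term, Math.max(boost, 0.1f));
  }
}
----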
@@ -20,6 +20,7 @@ package org.apache.lucene.util;
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.AnalyzerWrapper;
import org.apache.lucene.analysis.CannedBinaryTokenStream;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.MockSynonymFilter;
@@ -32,6 +33,8 @@ import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.BoostAttribute;
+import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MultiPhraseQuery;
import org.apache.lucene.search.PhraseQuery;
@@ -507,4 +510,51 @@ public class TestQueryBuilder extends LuceneTestCase {
      expectThrows(IndexSearcher.TooManyClauses.class, () -> qb.analyzeGraphPhrase(ts, "", 0));
    }
  }

+  private static final class MockBoostTokenFilter extends TokenFilter {
+
+    final BoostAttribute boostAtt = addAttribute(BoostAttribute.class);
+    final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+
+    protected MockBoostTokenFilter(TokenStream input) {
+      super(input);
+    }
+
+    @Override
+    public boolean incrementToken() throws IOException {
+      if (input.incrementToken() == false) {
+        return false;
+      }
+      if (termAtt.length() == 3) {
+        boostAtt.setBoost(0.5f);
+      }
+      return true;
+    }
+  }
+
+  public void testTokenStreamBoosts() {
+    Analyzer msa = new MockSynonymAnalyzer();
+    Analyzer a = new AnalyzerWrapper(msa.getReuseStrategy()) {
+      @Override
+      protected Analyzer getWrappedAnalyzer(String fieldName) {
+        return msa;
+      }
+
+      @Override
+      protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
+        return new TokenStreamComponents(components.getSource(), new MockBoostTokenFilter(components.getTokenStream()));
+      }
+    };
+
+    QueryBuilder builder = new QueryBuilder(a);
+    Query q = builder.createBooleanQuery("field", "hot dogs");
+    Query expected = new BooleanQuery.Builder()
+        .add(new BoostQuery(new TermQuery(new Term("field", "hot")), 0.5f), BooleanClause.Occur.SHOULD)
+        .add(new SynonymQuery.Builder("field")
+            .addTerm(new Term("field", "dogs"))
+            .addTerm(new Term("field", "dog"), 0.5f)
+            .build(), BooleanClause.Occur.SHOULD)
+        .build();
+
+    assertEquals(expected, q);
+  }
}
@@ -147,7 +147,7 @@ public class ComplexPhraseQueryParser extends QueryParser {
  // to throw a runtime exception here if a term for another field is embedded
  // in phrase query
  @Override
-  protected Query newTermQuery(Term term) {
+  protected Query newTermQuery(Term term, float boost) {
    if (isPass2ResolvingPhrases) {
      try {
        checkPhraseClauseIsForSameField(term.field());
@@ -155,7 +155,7 @@ public class ComplexPhraseQueryParser extends QueryParser {
        throw new RuntimeException("Error parsing complex phrase", pe);
      }
    }
-    return super.newTermQuery(term);
+    return super.newTermQuery(term, boost);
  }

  // Helper method used to report on any clauses that appear in query syntax
@@ -21,6 +21,8 @@ import java.util.ArrayList;
import java.util.Collections;
import java.util.EnumSet;
import java.util.HashMap;
+import java.util.Iterator;
+import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
@@ -601,19 +603,35 @@ public abstract class SolrQueryParserBase extends QueryBuilder {
  }

  @Override
-  protected Query newSynonymQuery(Term terms[]) {
+  protected Query newGraphSynonymQuery(Iterator<Query> sidePathQueriesIterator) {
+    switch (synonymQueryStyle) {
+      case PICK_BEST: {
+        List<Query> sidePathSynonymQueries = new LinkedList<>();
+        sidePathQueriesIterator.forEachRemaining(sidePathSynonymQueries::add);
+        return new DisjunctionMaxQuery(sidePathSynonymQueries, 0.0f);
+      }
+      case AS_SAME_TERM:
+      case AS_DISTINCT_TERMS: {
+        return super.newGraphSynonymQuery(sidePathQueriesIterator);
+      }
+      default:
+        throw new AssertionError("unrecognized synonymQueryStyle passed when creating newSynonymQuery");
+    }
+  }
+
+  @Override
+  protected Query newSynonymQuery(TermAndBoost[] terms) {
    switch (synonymQueryStyle) {
      case PICK_BEST:
        List<Query> currPosnClauses = new ArrayList<Query>(terms.length);
-        for (Term term : terms) {
-          currPosnClauses.add(newTermQuery(term));
+        for (TermAndBoost term : terms) {
+          currPosnClauses.add(newTermQuery(term.term, term.boost));
        }
        DisjunctionMaxQuery dm = new DisjunctionMaxQuery(currPosnClauses, 0.0f);
        return dm;
      case AS_DISTINCT_TERMS:
        BooleanQuery.Builder builder = new BooleanQuery.Builder();
-        for (Term term : terms) {
-          builder.add(newTermQuery(term), BooleanClause.Occur.SHOULD);
+        for (TermAndBoost term : terms) {
+          builder.add(newTermQuery(term.term, term.boost), BooleanClause.Occur.SHOULD);
        }
        return builder.build();
      case AS_SAME_TERM:
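For context on the PICK_BEST branch above: a DisjunctionMaxQuery with a tie-breaker of 0.0f scores each document by its best-matching clause only. A small sketch of the kind of query this produces (the field name and terms are illustrative, borrowed from the test synonyms further down):

[source,java]
----
import java.util.Arrays;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.DisjunctionMaxQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;

public class PickBestSketch {
  // Roughly what PICK_BEST builds for "tiger" with the synonym tigre|0.9:
  // with tieBreakerMultiplier = 0.0f only the highest-scoring clause counts.
  public static Query tigerPickBest() {
    Query tiger = new TermQuery(new Term("t_pick_best_boosted_foo", "tiger"));
    Query tigre = new BoostQuery(
        new TermQuery(new Term("t_pick_best_boosted_foo", "tigre")), 0.9f);
    return new DisjunctionMaxQuery(Arrays.asList(tiger, tigre), 0.0f);
  }
}
----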
@@ -227,6 +227,41 @@
    </analyzer>
  </fieldType>

+  <fieldType name="text_pick_best_boosted" class="solr.TextField" positionIncrementGap="100" synonymQueryStyle="pick_best" autoGeneratePhraseQueries="true">
+    <analyzer type="index">
+      <tokenizer class="solr.MockTokenizerFactory"/>
+    </analyzer>
+    <analyzer type="query">
+      <tokenizer class="solr.MockTokenizerFactory"/>
+      <filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+      <filter class="solr.LowerCaseFilterFactory"/>
+      <filter class="solr.DelimitedBoostTokenFilterFactory"/>
+    </analyzer>
+  </fieldType>
+
+  <fieldType name="text_as_distinct_boosted" class="solr.TextField" positionIncrementGap="100" synonymQueryStyle="as_distinct_terms" autoGeneratePhraseQueries="true">
+    <analyzer type="index">
+      <tokenizer class="solr.MockTokenizerFactory"/>
+    </analyzer>
+    <analyzer type="query">
+      <tokenizer class="solr.MockTokenizerFactory"/>
+      <filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+      <filter class="solr.LowerCaseFilterFactory"/>
+      <filter class="solr.DelimitedBoostTokenFilterFactory"/>
+    </analyzer>
+  </fieldType>
+
+  <fieldType name="text_as_same_term_boosted" class="solr.TextField" positionIncrementGap="100" synonymQueryStyle="as_same_term" autoGeneratePhraseQueries="true">
+    <analyzer type="index">
+      <tokenizer class="solr.MockTokenizerFactory"/>
+    </analyzer>
+    <analyzer type="query">
+      <tokenizer class="solr.MockTokenizerFactory"/>
+      <filter class="solr.SynonymGraphFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+      <filter class="solr.LowerCaseFilterFactory"/>
+      <filter class="solr.DelimitedBoostTokenFilterFactory"/>
+    </analyzer>
+  </fieldType>
+
  <fieldType name="nametext" class="solr.TextField">
    <analyzer class="org.apache.lucene.analysis.core.WhitespaceAnalyzer"/>
@@ -656,6 +691,9 @@
  <dynamicField name="*_sI" type="string" indexed="true" stored="false"/>
  <dynamicField name="*_sS" type="string" indexed="false" stored="true"/>
  <dynamicField name="t_pick_best_*" type="text_pick_best" indexed="true" stored="true"/>
+  <dynamicField name="t_pick_best_boosted_*" type="text_pick_best_boosted" indexed="true" stored="true"/>
+  <dynamicField name="t_as_distinct_boosted_*" type="text_as_distinct_boosted" indexed="true" stored="true"/>
+  <dynamicField name="t_as_same_term_boosted_*" type="text_as_same_term_boosted" indexed="true" stored="true"/>
  <dynamicField name="t_as_distinct_*" type="text_as_distinct" indexed="true" stored="true"/>
@@ -37,4 +37,18 @@ crow blackbird, grackle
tabby => tabby, cat, feline, animal
persian => persian, cat, feline, animal

jeans, denim pants

# Boosted Synonyms
tiger, tigre|0.9
lynx => lince|0.8, lynx_canadensis|0.9

leopard, big cat|0.8, bagheera|0.9, panthera pardus|0.85
lion => panthera leo|0.9, simba leo|0.8, kimba|0.75

panthera pardus, leopard|0.6
panthera tigris => tiger|0.99

snow leopard, panthera uncia|0.9, big cat|0.8, white_leopard|0.6
panthera onca => jaguar|0.95, big cat|0.85, black panther|0.65
panthera blytheae, oldest|0.5 ancient|0.9 panthera
@@ -300,4 +300,80 @@ public class TestManagedSynonymGraphFilterFactory extends RestTestBase {
    assertJDelete(endpoint + "/fröhlich",
        "/error/code==404");
  }

  /**
   * Can we add a single-term synonym with a weight?
   */
  @Test
  public void testManagedSynonyms_singleTermWithWeight_shouldHandleSynonym() throws Exception {
    String endpoint = "/schema/analysis/synonyms/englishgraph";

    assertJQ(endpoint,
        "/synonymMappings/initArgs/ignoreCase==false",
        "/synonymMappings/managedMap=={}");

    // does not exist
    assertJQ(endpoint + "/tiger",
        "/error/code==404");

    Map<String, List<String>> syns = new HashMap<>();

    // now put a synonym
    syns.put("tiger", Arrays.asList("tiger|1.0"));
    assertJPut(endpoint,
        toJSONString(syns),
        "/responseHeader/status==0");

    // and check if it exists
    assertJQ(endpoint,
        "/synonymMappings/managedMap/tiger==['tiger|1.0']");

    // verify delete works
    assertJDelete(endpoint + "/tiger",
        "/responseHeader/status==0");

    // was it really deleted?
    assertJDelete(endpoint + "/tiger",
        "/error/code==404");
  }

  /**
   * Can we add multi-term synonyms with weights?
   */
  @Test
  public void testManagedSynonyms_multiTermWithWeight_shouldHandleSynonym() throws Exception {
    String endpoint = "/schema/analysis/synonyms/englishgraph";

    assertJQ(endpoint,
        "/synonymMappings/initArgs/ignoreCase==false",
        "/synonymMappings/managedMap=={}");

    // does not exist
    assertJQ(endpoint + "/tiger",
        "/error/code==404");

    Map<String, List<String>> syns = new HashMap<>();

    // now put a synonym
    List<String> tigerSynonyms = Arrays.asList("tiger|1.0", "panthera tigris|0.9", "Shere Kan|0.8");
    syns.put("tiger", tigerSynonyms);
    String jsonTigerSynonyms = toJSONString(syns);
    assertJPut(endpoint,
        jsonTigerSynonyms,
        "/responseHeader/status==0");

    // and check if it exists
    assertJQ(endpoint,
        "/synonymMappings/managedMap/tiger==[\"Shere Kan|0.8\",\"panthera tigris|0.9\",\"tiger|1.0\"]");

    // verify delete works
    assertJDelete(endpoint + "/tiger",
        "/responseHeader/status==0");

    // was it really deleted?
    assertJDelete(endpoint + "/tiger",
        "/error/code==404");
  }
}
@@ -1221,8 +1221,225 @@ public class TestSolrQueryParser extends SolrTestCaseJ4 {
    assertEquals("(t_as_distinct_foo:\"denim pant\" t_as_distinct_foo:jean)", q.toString());

    q = QParser.getParser("jeans", req(params("df", "t_pick_best_foo", "sow", "false"))).getQuery();
-    assertEquals("(t_pick_best_foo:\"denim pant\" t_pick_best_foo:jean)", q.toString());
+    assertEquals("(t_pick_best_foo:\"denim pant\" | t_pick_best_foo:jean)", q.toString());
  }

  public void testSynonymsBoost_singleTermQuerySingleTermSynonyms_shouldParseBoostedQuery() throws Exception {
    // tiger, tigre|0.9
    Query q = QParser.getParser("tiger", req(params("df", "t_pick_best_boosted_foo"))).getQuery();
    assertEquals("((t_pick_best_boosted_foo:tigre)^0.9 | t_pick_best_boosted_foo:tiger)", q.toString());

    q = QParser.getParser("tiger", req(params("df", "t_as_distinct_boosted_foo"))).getQuery();
    assertEquals("(t_as_distinct_boosted_foo:tigre)^0.9 t_as_distinct_boosted_foo:tiger", q.toString());

    q = QParser.getParser("tiger", req(params("df", "t_as_same_term_boosted_foo"))).getQuery();
    assertEquals("Synonym(t_as_same_term_boosted_foo:tiger t_as_same_term_boosted_foo:tigre^0.9)", q.toString());

    // lynx => lince|0.8, lynx_canadensis|0.9
    q = QParser.getParser("lynx", req(params("df", "t_pick_best_boosted_foo"))).getQuery();
    assertEquals("((t_pick_best_boosted_foo:lince)^0.8 | (t_pick_best_boosted_foo:lynx_canadensis)^0.9)", q.toString());

    q = QParser.getParser("lynx", req(params("df", "t_as_distinct_boosted_foo"))).getQuery();
    assertEquals("(t_as_distinct_boosted_foo:lince)^0.8 (t_as_distinct_boosted_foo:lynx_canadensis)^0.9", q.toString());

    q = QParser.getParser("lynx", req(params("df", "t_as_same_term_boosted_foo"))).getQuery();
    assertEquals("Synonym(t_as_same_term_boosted_foo:lince^0.8 t_as_same_term_boosted_foo:lynx_canadensis^0.9)", q.toString());
  }

  public void testSynonymsBoost_singleTermQueryMultiTermSynonyms_shouldParseBoostedQuery() throws Exception {
    // leopard, big cat|0.8, bagheera|0.9, panthera pardus|0.85
    Query q = QParser.getParser("leopard", req(params("df", "t_pick_best_boosted_foo"))).getQuery();
    assertEquals("((t_pick_best_boosted_foo:\"big cat\")^0.8 | (t_pick_best_boosted_foo:bagheera)^0.9 | (t_pick_best_boosted_foo:\"panthera pardus\")^0.85 | t_pick_best_boosted_foo:leopard)", q.toString());

    q = QParser.getParser("leopard", req(params("df", "t_as_distinct_boosted_foo"))).getQuery();
    assertEquals("((t_as_distinct_boosted_foo:\"big cat\")^0.8 (t_as_distinct_boosted_foo:bagheera)^0.9 (t_as_distinct_boosted_foo:\"panthera pardus\")^0.85 t_as_distinct_boosted_foo:leopard)", q.toString());

    q = QParser.getParser("leopard", req(params("df", "t_as_same_term_boosted_foo"))).getQuery();
    assertEquals("((t_as_same_term_boosted_foo:\"big cat\")^0.8 (t_as_same_term_boosted_foo:bagheera)^0.9 (t_as_same_term_boosted_foo:\"panthera pardus\")^0.85 t_as_same_term_boosted_foo:leopard)", q.toString());

    // lion => panthera leo|0.9, simba leo|0.8, kimba|0.75
    q = QParser.getParser("lion", req(params("df", "t_pick_best_boosted_foo"))).getQuery();
    assertEquals("((t_pick_best_boosted_foo:\"panthera leo\")^0.9 | (t_pick_best_boosted_foo:\"simba leo\")^0.8 | (t_pick_best_boosted_foo:kimba)^0.75)", q.toString());

    q = QParser.getParser("lion", req(params("df", "t_as_distinct_boosted_foo"))).getQuery();
    assertEquals("((t_as_distinct_boosted_foo:\"panthera leo\")^0.9 (t_as_distinct_boosted_foo:\"simba leo\")^0.8 (t_as_distinct_boosted_foo:kimba)^0.75)", q.toString());

    q = QParser.getParser("lion", req(params("df", "t_as_same_term_boosted_foo"))).getQuery();
    assertEquals("((t_as_same_term_boosted_foo:\"panthera leo\")^0.9 (t_as_same_term_boosted_foo:\"simba leo\")^0.8 (t_as_same_term_boosted_foo:kimba)^0.75)", q.toString());
  }

  public void testSynonymsBoost_multiTermQuerySingleTermSynonyms_shouldParseBoostedQuery() throws Exception {
    // tiger, tigre|0.9
    // lynx => lince|0.8, lynx_canadensis|0.9
    Query q = QParser.getParser("tiger lynx", req(params("df", "t_pick_best_boosted_foo"))).getQuery();
    assertEquals("((t_pick_best_boosted_foo:tigre)^0.9 | t_pick_best_boosted_foo:tiger)" +
        " ((t_pick_best_boosted_foo:lince)^0.8 | (t_pick_best_boosted_foo:lynx_canadensis)^0.9)", q.toString());

    q = QParser.getParser("tiger lynx", req(params("df", "t_as_distinct_boosted_foo"))).getQuery();
    assertEquals("((t_as_distinct_boosted_foo:tigre)^0.9 t_as_distinct_boosted_foo:tiger)" +
        " ((t_as_distinct_boosted_foo:lince)^0.8 (t_as_distinct_boosted_foo:lynx_canadensis)^0.9)", q.toString());

    q = QParser.getParser("tiger lynx", req(params("df", "t_as_same_term_boosted_foo"))).getQuery();
    assertEquals("Synonym(t_as_same_term_boosted_foo:tiger t_as_same_term_boosted_foo:tigre^0.9)" +
        " Synonym(t_as_same_term_boosted_foo:lince^0.8 t_as_same_term_boosted_foo:lynx_canadensis^0.9)", q.toString());
  }

  public void testSynonymsBoost_multiTermQueryMultiTermSynonyms_shouldParseBoostedQuery() throws Exception {
    // leopard, big cat|0.8, bagheera|0.9, panthera pardus|0.85
    // lion => panthera leo|0.9, simba leo|0.8, kimba|0.75
    Query q = QParser.getParser("leopard lion", req(params("df", "t_pick_best_boosted_foo"))).getQuery();
    assertEquals("((t_pick_best_boosted_foo:\"big cat\")^0.8 | (t_pick_best_boosted_foo:bagheera)^0.9 | (t_pick_best_boosted_foo:\"panthera pardus\")^0.85 | t_pick_best_boosted_foo:leopard)" +
        " ((t_pick_best_boosted_foo:\"panthera leo\")^0.9 | (t_pick_best_boosted_foo:\"simba leo\")^0.8 | (t_pick_best_boosted_foo:kimba)^0.75)", q.toString());

    q = QParser.getParser("leopard lion", req(params("df", "t_as_distinct_boosted_foo"))).getQuery();
    assertEquals("((t_as_distinct_boosted_foo:\"big cat\")^0.8 (t_as_distinct_boosted_foo:bagheera)^0.9 (t_as_distinct_boosted_foo:\"panthera pardus\")^0.85 t_as_distinct_boosted_foo:leopard)" +
        " ((t_as_distinct_boosted_foo:\"panthera leo\")^0.9 (t_as_distinct_boosted_foo:\"simba leo\")^0.8 (t_as_distinct_boosted_foo:kimba)^0.75)", q.toString());

    q = QParser.getParser("leopard lion", req(params("df", "t_as_same_term_boosted_foo"))).getQuery();
    assertEquals("((t_as_same_term_boosted_foo:\"big cat\")^0.8 (t_as_same_term_boosted_foo:bagheera)^0.9 (t_as_same_term_boosted_foo:\"panthera pardus\")^0.85 t_as_same_term_boosted_foo:leopard)" +
        " ((t_as_same_term_boosted_foo:\"panthera leo\")^0.9 (t_as_same_term_boosted_foo:\"simba leo\")^0.8 (t_as_same_term_boosted_foo:kimba)^0.75)", q.toString());
  }

  public void testSynonymsBoost_singleConceptQuerySingleTermSynonym_shouldParseBoostedQuery() throws Exception {
    // panthera pardus, leopard|0.6
    Query q = QParser.getParser("panthera pardus story", req(params("df", "t_pick_best_boosted_foo", "sow", "false"))).getQuery();
    assertEquals("((t_pick_best_boosted_foo:leopard)^0.6 | t_pick_best_boosted_foo:\"panthera pardus\") t_pick_best_boosted_foo:story", q.toString());

    q = QParser.getParser("panthera pardus story", req(params("df", "t_as_distinct_boosted_foo", "sow", "false"))).getQuery();
    assertEquals("((t_as_distinct_boosted_foo:leopard)^0.6 t_as_distinct_boosted_foo:\"panthera pardus\") t_as_distinct_boosted_foo:story", q.toString());

    q = QParser.getParser("panthera pardus story", req(params("df", "t_as_same_term_boosted_foo", "sow", "false"))).getQuery();
    assertEquals("((t_as_same_term_boosted_foo:leopard)^0.6 t_as_same_term_boosted_foo:\"panthera pardus\") t_as_same_term_boosted_foo:story", q.toString());

    // panthera tigris => tiger|0.99
    q = QParser.getParser("panthera tigris story", req(params("df", "t_pick_best_boosted_foo", "sow", "false"))).getQuery();
    assertEquals("(t_pick_best_boosted_foo:tiger)^0.99 t_pick_best_boosted_foo:story", q.toString());

    q = QParser.getParser("panthera tigris story", req(params("df", "t_as_distinct_boosted_foo", "sow", "false"))).getQuery();
    assertEquals("(t_as_distinct_boosted_foo:tiger)^0.99 t_as_distinct_boosted_foo:story", q.toString());

    q = QParser.getParser("panthera tigris story", req(params("df", "t_as_same_term_boosted_foo", "sow", "false"))).getQuery();
    assertEquals("(t_as_same_term_boosted_foo:tiger)^0.99 t_as_same_term_boosted_foo:story", q.toString());
  }

  public void testSynonymsBoost_singleConceptQueryMultiTermSynonymWithMultipleBoost_shouldParseMultiplicativeBoostedQuery() throws Exception {
    // panthera blytheae, oldest|0.5 ancient|0.9 panthera
    Query q = QParser.getParser("panthera blytheae", req(params("df", "t_pick_best_boosted_foo", "sow", "false"))).getQuery();
    assertEquals("((t_pick_best_boosted_foo:\"oldest ancient panthera\")^0.45 | t_pick_best_boosted_foo:\"panthera blytheae\")", q.toString());

    q = QParser.getParser("panthera blytheae", req(params("df", "t_as_distinct_boosted_foo", "sow", "false"))).getQuery();
    assertEquals("((t_as_distinct_boosted_foo:\"oldest ancient panthera\")^0.45 t_as_distinct_boosted_foo:\"panthera blytheae\")", q.toString());

    q = QParser.getParser("panthera blytheae", req(params("df", "t_as_same_term_boosted_foo", "sow", "false"))).getQuery();
    assertEquals("((t_as_same_term_boosted_foo:\"oldest ancient panthera\")^0.45 t_as_same_term_boosted_foo:\"panthera blytheae\")", q.toString());
  }

  public void testSynonymsBoost_singleConceptQueryMultiTermSynonyms_shouldParseBoostedQuery() throws Exception {
    // snow leopard, panthera uncia|0.9, big cat|0.8, white_leopard|0.6
    Query q = QParser.getParser("snow leopard", req(params("df", "t_pick_best_boosted_foo", "sow", "false"))).getQuery();
    assertEquals("((t_pick_best_boosted_foo:\"panthera uncia\")^0.9 | (t_pick_best_boosted_foo:\"big cat\")^0.8 | (t_pick_best_boosted_foo:white_leopard)^0.6 | t_pick_best_boosted_foo:\"snow leopard\")", q.toString());

    q = QParser.getParser("snow leopard", req(params("df", "t_as_distinct_boosted_foo", "sow", "false"))).getQuery();
    assertEquals("((t_as_distinct_boosted_foo:\"panthera uncia\")^0.9 (t_as_distinct_boosted_foo:\"big cat\")^0.8 (t_as_distinct_boosted_foo:white_leopard)^0.6 t_as_distinct_boosted_foo:\"snow leopard\")", q.toString());

    q = QParser.getParser("snow leopard", req(params("df", "t_as_same_term_boosted_foo", "sow", "false"))).getQuery();
    assertEquals("((t_as_same_term_boosted_foo:\"panthera uncia\")^0.9 (t_as_same_term_boosted_foo:\"big cat\")^0.8 (t_as_same_term_boosted_foo:white_leopard)^0.6 t_as_same_term_boosted_foo:\"snow leopard\")", q.toString());

    // panthera onca => jaguar|0.95, big cat|0.85, black panther|0.65
    q = QParser.getParser("panthera onca", req(params("df", "t_pick_best_boosted_foo", "sow", "false"))).getQuery();
    assertEquals("((t_pick_best_boosted_foo:jaguar)^0.95 | (t_pick_best_boosted_foo:\"big cat\")^0.85 | (t_pick_best_boosted_foo:\"black panther\")^0.65)", q.toString());

    q = QParser.getParser("panthera onca", req(params("df", "t_as_distinct_boosted_foo", "sow", "false"))).getQuery();
    assertEquals("((t_as_distinct_boosted_foo:jaguar)^0.95 (t_as_distinct_boosted_foo:\"big cat\")^0.85 (t_as_distinct_boosted_foo:\"black panther\")^0.65)", q.toString());

    q = QParser.getParser("panthera onca", req(params("df", "t_as_same_term_boosted_foo", "sow", "false"))).getQuery();
    assertEquals("((t_as_same_term_boosted_foo:jaguar)^0.95 (t_as_same_term_boosted_foo:\"big cat\")^0.85 (t_as_same_term_boosted_foo:\"black panther\")^0.65)", q.toString());
  }

  public void testSynonymsBoost_multiConceptQuerySingleTermSynonym_shouldParseBoostedQuery() throws Exception {
    // panthera pardus, leopard|0.6
    // tiger, tigre|0.9
    Query q = QParser.getParser("panthera pardus tiger", req(params("df", "t_pick_best_boosted_foo", "sow", "false"))).getQuery();
    assertEquals("((t_pick_best_boosted_foo:leopard)^0.6 | t_pick_best_boosted_foo:\"panthera pardus\") ((t_pick_best_boosted_foo:tigre)^0.9 | t_pick_best_boosted_foo:tiger)", q.toString());

    q = QParser.getParser("panthera pardus tiger", req(params("df", "t_as_distinct_boosted_foo", "sow", "false"))).getQuery();
    assertEquals("((t_as_distinct_boosted_foo:leopard)^0.6 t_as_distinct_boosted_foo:\"panthera pardus\") ((t_as_distinct_boosted_foo:tigre)^0.9 t_as_distinct_boosted_foo:tiger)", q.toString());

    q = QParser.getParser("panthera pardus tiger", req(params("df", "t_as_same_term_boosted_foo", "sow", "false"))).getQuery();
    assertEquals("((t_as_same_term_boosted_foo:leopard)^0.6 t_as_same_term_boosted_foo:\"panthera pardus\") Synonym(t_as_same_term_boosted_foo:tiger t_as_same_term_boosted_foo:tigre^0.9)", q.toString());
  }

  public void testSynonymsBoost_multiConceptsQueryMultiTermSynonyms_shouldParseBoostedQuery() throws Exception {
    // snow leopard, panthera uncia|0.9, big cat|0.8, white_leopard|0.6
    // panthera onca => jaguar|0.95, big cat|0.85, black panther|0.65
    Query q = QParser.getParser("snow leopard panthera onca", req(params("df", "t_pick_best_boosted_foo", "sow", "false"))).getQuery();
    assertEquals("((t_pick_best_boosted_foo:\"panthera uncia\")^0.9 | (t_pick_best_boosted_foo:\"big cat\")^0.8 | (t_pick_best_boosted_foo:white_leopard)^0.6 | t_pick_best_boosted_foo:\"snow leopard\")" +
        " ((t_pick_best_boosted_foo:jaguar)^0.95 | (t_pick_best_boosted_foo:\"big cat\")^0.85 | (t_pick_best_boosted_foo:\"black panther\")^0.65)", q.toString());

    q = QParser.getParser("snow leopard panthera onca", req(params("df", "t_as_distinct_boosted_foo", "sow", "false"))).getQuery();
    assertEquals("((t_as_distinct_boosted_foo:\"panthera uncia\")^0.9 (t_as_distinct_boosted_foo:\"big cat\")^0.8 (t_as_distinct_boosted_foo:white_leopard)^0.6 t_as_distinct_boosted_foo:\"snow leopard\")" +
        " ((t_as_distinct_boosted_foo:jaguar)^0.95 (t_as_distinct_boosted_foo:\"big cat\")^0.85 (t_as_distinct_boosted_foo:\"black panther\")^0.65)", q.toString());

    q = QParser.getParser("snow leopard panthera onca", req(params("df", "t_as_same_term_boosted_foo", "sow", "false"))).getQuery();
    assertEquals("((t_as_same_term_boosted_foo:\"panthera uncia\")^0.9 (t_as_same_term_boosted_foo:\"big cat\")^0.8 (t_as_same_term_boosted_foo:white_leopard)^0.6 t_as_same_term_boosted_foo:\"snow leopard\")" +
        " ((t_as_same_term_boosted_foo:jaguar)^0.95 (t_as_same_term_boosted_foo:\"big cat\")^0.85 (t_as_same_term_boosted_foo:\"black panther\")^0.65)", q.toString());
  }

  public void testSynonymsBoost_edismaxBoost_shouldParseBoostedPhraseQuery() throws Exception {
    Query q = QParser.getParser("snow leopard lion", "edismax", true, req(params("sow", "false", "qf", "t_pick_best_boosted_foo^10"))).getQuery();
    assertEquals("+(" +
        "((((t_pick_best_boosted_foo:\"panthera uncia\")^0.9 | (t_pick_best_boosted_foo:\"big cat\")^0.8 | (t_pick_best_boosted_foo:white_leopard)^0.6 | t_pick_best_boosted_foo:\"snow leopard\"))^10.0)" +
        " ((((t_pick_best_boosted_foo:\"panthera leo\")^0.9 | (t_pick_best_boosted_foo:\"simba leo\")^0.8 | (t_pick_best_boosted_foo:kimba)^0.75))^10.0)" +
        ")", q.toString());

    q = QParser.getParser("snow leopard lion", "edismax", true, req(params("sow", "false", "qf", "t_as_distinct_boosted_foo^10"))).getQuery();
    assertEquals("+(" +
        "(((t_as_distinct_boosted_foo:\"panthera uncia\")^0.9 (t_as_distinct_boosted_foo:\"big cat\")^0.8 (t_as_distinct_boosted_foo:white_leopard)^0.6 t_as_distinct_boosted_foo:\"snow leopard\")^10.0)" +
        " (((t_as_distinct_boosted_foo:\"panthera leo\")^0.9 (t_as_distinct_boosted_foo:\"simba leo\")^0.8 (t_as_distinct_boosted_foo:kimba)^0.75)^10.0))", q.toString());

    q = QParser.getParser("snow leopard lion", "edismax", true, req(params("sow", "false", "qf", "t_as_same_term_boosted_foo^10"))).getQuery();
    assertEquals("+(" +
        "(((t_as_same_term_boosted_foo:\"panthera uncia\")^0.9 (t_as_same_term_boosted_foo:\"big cat\")^0.8 (t_as_same_term_boosted_foo:white_leopard)^0.6 t_as_same_term_boosted_foo:\"snow leopard\")^10.0)" +
        " (((t_as_same_term_boosted_foo:\"panthera leo\")^0.9 (t_as_same_term_boosted_foo:\"simba leo\")^0.8 (t_as_same_term_boosted_foo:kimba)^0.75)^10.0))", q.toString());
  }

  public void testSynonymsBoost_phraseQueryMultiTermSynonymsBoost_shouldParseBoostedSpanQuery() throws Exception {
    Query q = QParser.getParser("\"snow leopard lion\"", req(params("df", "t_pick_best_boosted_foo", "sow", "false"))).getQuery();
    assertEquals("spanNear([" +
        "spanOr([" +
        "(spanNear([t_pick_best_boosted_foo:panthera, t_pick_best_boosted_foo:uncia], 0, true))^0.9," +
        " (spanNear([t_pick_best_boosted_foo:big, t_pick_best_boosted_foo:cat], 0, true))^0.8," +
        " (t_pick_best_boosted_foo:white_leopard)^0.6," +
        " spanNear([t_pick_best_boosted_foo:snow, t_pick_best_boosted_foo:leopard], 0, true)])," +
        " spanOr([" +
        "(spanNear([t_pick_best_boosted_foo:panthera, t_pick_best_boosted_foo:leo], 0, true))^0.9," +
        " (spanNear([t_pick_best_boosted_foo:simba, t_pick_best_boosted_foo:leo], 0, true))^0.8," +
        " (t_pick_best_boosted_foo:kimba)^0.75])], 0, true)", q.toString());
  }

  public void testSynonymsBoost_phraseQueryMultiTermSynonymsMultipleBoost_shouldParseMultiplicativeBoostedSpanQuery() throws Exception {
    Query q = QParser.getParser("\"panthera blytheae lion\"", req(params("df", "t_pick_best_boosted_foo", "sow", "false"))).getQuery();
    assertEquals("spanNear([" +
        "spanOr([" +
        "(spanNear([t_pick_best_boosted_foo:oldest, t_pick_best_boosted_foo:ancient, t_pick_best_boosted_foo:panthera], 0, true))^0.45," +
        " spanNear([t_pick_best_boosted_foo:panthera, t_pick_best_boosted_foo:blytheae], 0, true)])," +
        " spanOr([" +
        "(spanNear([t_pick_best_boosted_foo:panthera, t_pick_best_boosted_foo:leo], 0, true))^0.9," +
        " (spanNear([t_pick_best_boosted_foo:simba, t_pick_best_boosted_foo:leo], 0, true))^0.8," +
        " (t_pick_best_boosted_foo:kimba)^0.75])], 0, true)", q.toString());
  }

  public void testSynonymsBoost_BoostMissing_shouldAssignDefaultBoost() throws Exception {
    // leopard, big cat|0.8, bagheera|0.9, panthera pardus|0.85
    Query q = QParser.getParser("leopard", req(params("df", "t_pick_best_boosted_foo"))).getQuery();
    assertEquals("((t_pick_best_boosted_foo:\"big cat\")^0.8 | (t_pick_best_boosted_foo:bagheera)^0.9 | (t_pick_best_boosted_foo:\"panthera pardus\")^0.85 | t_pick_best_boosted_foo:leopard)", q.toString());

    q = QParser.getParser("leopard", req(params("df", "t_as_distinct_boosted_foo"))).getQuery();
    assertEquals("((t_as_distinct_boosted_foo:\"big cat\")^0.8 (t_as_distinct_boosted_foo:bagheera)^0.9 (t_as_distinct_boosted_foo:\"panthera pardus\")^0.85 t_as_distinct_boosted_foo:leopard)", q.toString());
  }

  @Test
@@ -398,6 +398,72 @@ Discard original token (`inject="false"`).

Note that "Kuczewski" has two encodings, which are added at the same position.

== Delimited Boost Filter

This filter adds a numeric floating-point boost value to tokens, splitting each token from its boost on a delimiter character.

*Factory class:* `solr.DelimitedBoostTokenFilterFactory`

*Arguments:*

`delimiter`:: The character used to separate the token and the boost. Defaults to '|'.

*Example:*

[.dynamic-tabs]
--
[example.tab-pane#byname-filter-delimitedBoost]
====
[.tab-label]*With name*
[source,xml]
----
<analyzer>
  <tokenizer name="standard"/>
  <filter name="delimitedBoost"/>
</analyzer>
----
====
[example.tab-pane#byclass-filter-delimitedBoost]
====
[.tab-label]*With class name (legacy)*
[source,xml]
----
<analyzer>
  <tokenizer class="solr.StandardTokenizerFactory"/>
  <filter class="solr.DelimitedBoostTokenFilterFactory"/>
</analyzer>
----
====
--

*In:* "leopard|0.5 panthera uncia|0.9"

*Tokenizer to Filter:* "leopard|0.5"(1), "panthera"(2), "uncia|0.9"(3)

*Out:* "leopard"(1)[0.5], "panthera"(2), "uncia"(3)[0.9]

The floating-point number in square brackets is the token's boost attribute.

*Example:*

Using a different delimiter (`delimiter="/"`).

[source,xml]
----
<analyzer>
  <tokenizer name="standard"/>
  <filter name="delimitedBoost" delimiter="/"/>
</analyzer>
----

*In:* "leopard/0.5 panthera uncia/0.9"

*Tokenizer to Filter:* "leopard/0.5"(1), "panthera"(2), "uncia/0.9"(3)

*Out:* "leopard"(1)[0.5], "panthera"(2), "uncia"(3)[0.9]

*N.B.* Make sure the delimiter is compatible with the tokenizer you use.

== Edge N-Gram Filter

This filter generates edge n-gram tokens of sizes within the given range.
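As a programmatic counterpart to the XML snippets above, here is a minimal sketch using Lucene's CustomAnalyzer with the SPI names resolved by name ("standard", and the "delimitedBoost" name registered by this commit); the helper class itself is an illustration, not part of the ref guide:

[source,java]
----
import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.custom.CustomAnalyzer;

public class BoostAnalyzerFactoryExample {
  // Builds the same chain as <tokenizer name="standard"/> followed by
  // <filter name="delimitedBoost"/>, resolving factories by SPI name.
  public static Analyzer create() throws IOException {
    return CustomAnalyzer.builder()
        .withTokenizer("standard")
        .addTokenFilter("delimitedBoost")
        .build();
  }
}
----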
@@ -2292,6 +2358,39 @@ small => tiny,teeny,weeny

*Out:* "the"(1), "large"(2), "large"(3), "couch"(4), "sofa"(4), "divan"(4)

*Weighted Synonyms:*

By combining the Delimited Boost Filter with the Synonym Graph Filter, you can achieve weighted synonyms at query time.
For more information, see:
https://sease.io/2020/02/introducing-weighted-synonyms-in-apache-lucene.html
For the following examples, assume a synonyms file named `boostedSynonyms.txt`:

[source,text]
----
leopard, big cat|0.8, bagheera|0.9, panthera pardus|0.85
lion => panthera leo|0.9, simba|0.8, kimba|0.75
----

*Example:*

====
[.tab-label]*With name*
[source,xml]
----
<analyzer type="query">
  <tokenizer name="standard"/>
  <filter name="synonymGraph" synonyms="boostedSynonyms.txt"/>
  <filter name="delimitedBoost"/>
</analyzer>
----
====

*In:* "lion"

*Tokenizer to Filter:* "lion"(1)

*Out:* "panthera"(1), "leo"(2)[0.9], "simba"(1)[0.8], "kimba"(1)[0.75]

== Token Offset Payload Filter

This filter adds the numeric character offsets of the token as a payload value for that token.