LUCENE-5867: Add a BooleanSimilarity.

This commit is contained in:
Adrien Grand 2016-11-10 14:05:13 +01:00
parent c415bc8d1d
commit 3e15233b23
5 changed files with 218 additions and 3 deletions

View File

@ -55,7 +55,10 @@ Other
* LUCENE-7360: Remove Explanation.toHtml() (Alan Woodward)
======================= Lucene 6.4.0 =======================
(No Changes)
New features
* LUCENE-5867: Added BooleanSimilarity. (Robert Muir, Adrien Grand)
Improvements

View File

@ -0,0 +1,95 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.similarities;
import java.io.IOException;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.util.BytesRef;
/**
 * Simple similarity that gives terms a score that is equal to their query
 * boost. This similarity is typically used with disabled norms since neither
 * document statistics nor index statistics are used for scoring. That said,
 * if norms are enabled, they will be computed the same way as
 * {@link SimilarityBase} and {@link BM25Similarity} with
 * {@link SimilarityBase#setDiscountOverlaps(boolean) discounted overlaps}
 * so that the {@link Similarity} can be changed after the index has been
 * created.
 */
public class BooleanSimilarity extends Similarity {

  /** Delegate used only to compute norms; it plays no part in scoring. */
  private static final Similarity BM25_SIM = new BM25Similarity();

  /** Sole constructor */
  public BooleanSimilarity() {}

  /**
   * Computes the norm by delegating to a default-configured {@link BM25Similarity}.
   *
   * @param state the inversion state of the field being indexed
   * @return the norm value produced by the BM25 delegate
   */
  @Override
  public long computeNorm(FieldInvertState state) {
    return BM25_SIM.computeNorm(state);
  }

  /**
   * Builds a weight that only remembers the query boost; the collection and
   * term statistics are ignored.
   *
   * @param boost the query boost, which becomes the score of every match
   * @param collectionStats unused
   * @param termStats unused
   * @return a weight wrapping {@code boost}
   */
  @Override
  public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
    return new BooleanWeight(boost);
  }

  /** Weight that carries nothing but the query boost. */
  private static class BooleanWeight extends SimWeight {
    final float boost;

    BooleanWeight(float boost) {
      this.boost = boost;
    }
  }

  /**
   * Creates a scorer that returns the query boost for every document,
   * regardless of the term frequency.
   *
   * @param weight a weight previously created by {@link #computeWeight}
   * @param context the segment being scored (unused)
   * @return a constant-score {@link SimScorer}
   * @throws IOException declared by the overridden method; never thrown here
   */
  @Override
  public SimScorer simScorer(SimWeight weight, LeafReaderContext context) throws IOException {
    final float boost = ((BooleanWeight) weight).boost;

    return new SimScorer() {

      @Override
      public float score(int doc, float freq) throws IOException {
        return boost;
      }

      @Override
      public Explanation explain(int doc, Explanation freq) throws IOException {
        Explanation queryBoostExpl = Explanation.match(boost, "query boost");
        // An unqualified getClass() would refer to this anonymous SimScorer,
        // whose simple name is the empty string. Qualify it so that the
        // explanation names the similarity (or its subclass) instead.
        String similarityName = BooleanSimilarity.this.getClass().getSimpleName();
        return Explanation.match(
            queryBoostExpl.getValue(),
            "score(" + similarityName + ", doc=" + doc + "), computed from:",
            queryBoostExpl);
      }

      @Override
      public float computeSlopFactor(int distance) {
        // Sloppiness does not affect the score.
        return 1f;
      }

      @Override
      public float computePayloadFactor(int doc, int start, int end, BytesRef payload) {
        // Payloads do not affect the score.
        return 1f;
      }
    };
  }
}

View File

@ -0,0 +1,117 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.search.similarities;
import java.io.IOException;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.TestUtil;
/**
 * Tests for {@link BooleanSimilarity}: scores must equal the query boost and
 * norms must match those computed by {@link BM25Similarity}.
 */
public class TestBooleanSimilarity extends LuceneTestCase {

  /** A term query must score exactly its boost (1 when not boosted). */
  public void testTermScoreIsEqualToBoost() throws IOException {
    Directory dir = newDirectory();
    RandomIndexWriter w = new RandomIndexWriter(random(), dir,
        newIndexWriterConfig());

    // First document: one occurrence of "bar" and one of "baz".
    Document doc = new Document();
    doc.add(new StringField("foo", "bar", Store.NO));
    doc.add(new StringField("foo", "baz", Store.NO));
    w.addDocument(doc);

    // Second document: "bar" twice, so its frequency differs from the first.
    doc = new Document();
    doc.add(new StringField("foo", "bar", Store.NO));
    doc.add(new StringField("foo", "bar", Store.NO));
    w.addDocument(doc);

    DirectoryReader reader = w.getReader();
    w.close();
    IndexSearcher searcher = newSearcher(reader);
    searcher.setSimilarity(new BooleanSimilarity());

    // Both documents match "bar" and must get the same score.
    TopDocs topDocs = searcher.search(new TermQuery(new Term("foo", "bar")), 2);
    assertEquals(2, topDocs.totalHits);
    assertEquals(1f, topDocs.scoreDocs[0].score, 0f);
    assertEquals(1f, topDocs.scoreDocs[1].score, 0f);

    topDocs = searcher.search(new TermQuery(new Term("foo", "baz")), 1);
    assertEquals(1, topDocs.totalHits);
    assertEquals(1f, topDocs.scoreDocs[0].score, 0f);

    // An explicit boost becomes the score.
    topDocs = searcher.search(new BoostQuery(new TermQuery(new Term("foo", "baz")), 3f), 1);
    assertEquals(1, topDocs.totalHits);
    assertEquals(3f, topDocs.scoreDocs[0].score, 0f);

    reader.close();
    dir.close();
  }

  /** A phrase query must score exactly its boost (1 when not boosted). */
  public void testPhraseScoreIsEqualToBoost() throws IOException {
    Directory dir = newDirectory();
    RandomIndexWriter w = new RandomIndexWriter(random(), dir,
        newIndexWriterConfig().setSimilarity(new BooleanSimilarity()));

    Document doc = new Document();
    doc.add(new TextField("foo", "bar baz quux", Store.NO));
    w.addDocument(doc);

    DirectoryReader reader = w.getReader();
    w.close();
    IndexSearcher searcher = newSearcher(reader);
    searcher.setSimilarity(new BooleanSimilarity());

    // "bar" and "quux" are not adjacent in the document ("baz" sits between them).
    PhraseQuery query = new PhraseQuery(2, "foo", "bar", "quux");

    TopDocs topDocs = searcher.search(query, 2);
    assertEquals(1, topDocs.totalHits);
    assertEquals(1f, topDocs.scoreDocs[0].score, 0f);

    topDocs = searcher.search(new BoostQuery(query, 7), 2);
    assertEquals(1, topDocs.totalHits);
    assertEquals(7f, topDocs.scoreDocs[0].score, 0f);

    reader.close();
    dir.close();
  }

  /**
   * Norms computed by {@link BooleanSimilarity} must be identical to those of
   * {@link BM25Similarity} with discounted overlaps.
   */
  public void testSameNormsAsBM25() {
    BooleanSimilarity sim1 = new BooleanSimilarity();
    BM25Similarity sim2 = new BM25Similarity();
    sim2.setDiscountOverlaps(true);

    for (int iter = 0; iter < 100; ++iter) {
      final int length = TestUtil.nextInt(random(), 1, 100);
      final int position = random().nextInt(length);
      final int numOverlaps = random().nextInt(50);
      final float boost = random().nextFloat() * 10;
      FieldInvertState state = new FieldInvertState("foo", position, length, numOverlaps, 100, boost);

      // computeNorm returns a long: compare the values exactly rather than
      // through a float delta, which would first convert both sides lossily.
      assertEquals(
          sim2.computeNorm(state),
          sim1.computeNorm(state));
    }
  }
}

View File

@ -91,6 +91,7 @@ public class RandomSimilarity extends PerFieldSimilarityWrapper {
allSims = new ArrayList<>();
allSims.add(new ClassicSimilarity());
allSims.add(new BM25Similarity());
allSims.add(new BooleanSimilarity());
for (BasicModel basicModel : BASIC_MODELS) {
for (AfterEffect afterEffect : AFTER_EFFECTS) {
for (Normalization normalization : NORMALIZATIONS) {

View File

@ -36,7 +36,6 @@ import org.apache.lucene.codecs.lucene70.Lucene70Codec;
import org.apache.lucene.codecs.mockrandom.MockRandomPostingsFormat;
import org.apache.lucene.codecs.simpletext.SimpleTextCodec;
import org.apache.lucene.index.RandomCodec;
import org.apache.lucene.search.similarities.ClassicSimilarity;
import org.apache.lucene.search.similarities.RandomSimilarity;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.util.LuceneTestCase.SuppressCodecs;
@ -213,7 +212,7 @@ final class TestRuleSetupAndRestoreClassEnv extends AbstractBeforeAfterRule {
TimeZone randomTimeZone = randomTimeZone(random());
timeZone = testTimeZone.equals("random") ? randomTimeZone : TimeZone.getTimeZone(testTimeZone);
TimeZone.setDefault(timeZone);
similarity = random().nextBoolean() ? new ClassicSimilarity() : new RandomSimilarity(random());
similarity = new RandomSimilarity(random());
// Check codec restrictions once at class level.
try {