LUCENE-2959: add state of the art ranking to Lucene

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1169470 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2011-09-11 15:47:21 +00:00
parent 82649a21b4
commit cfaf91c739
141 changed files with 4345 additions and 183 deletions

View File

@ -495,6 +495,34 @@ New features
* LUCENE-3423: add Terms.getDocCount(), which returns the number of documents
that have at least one term for a field. (Yonik Seeley, Robert Muir)
* LUCENE-2959: Added a variety of different relevance ranking systems to Lucene.
- Added Okapi BM25, Language Models, Divergence from Randomness, and
Information-Based Models. The models are pluggable, support all of Lucene's
features (boosts, slops, explanations, etc) and queries (spans, etc).
- All models default to the same index-time norm encoding as DefaultSimilarity:
so you can easily try these out/switch back and forth/run experiments and
comparisons without reindexing. Note: most of the models do rely upon index
statistics that are new in Lucene 4.0, so for existing 3.x indexes it's a good
idea to upgrade your index to the new format with IndexUpgrader first.
- Added a new subclass SimilarityBase which provides a simplified API
for plugging in new ranking algorithms without dealing with all of the
nuances and implementation details of Lucene.
- Added a new helper class BasicSimilarityProvider that just applies one
scoring algorithm to all fields, with queryNorm() and coord() returning 1.
In general, it is recommended to disable coord() when using the new models.
For example, to use BM25 for all fields:
searcher.setSimilarityProvider(new BasicSimilarityProvider(new BM25Similarity()));
If you instead want to apply different similarities (e.g. ones with different
parameter values or different algorithms entirely) to different fields, implement
SimilarityProvider with your per-field logic.
(David Mark Nemeskey via Robert Muir)
Optimizations
* LUCENE-2588: Don't store unnecessary suffixes when writing the terms

View File

@ -43,7 +43,7 @@ import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermVectorOffsetInfo;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.SimilarityProvider;
import org.apache.lucene.search.similarities.SimilarityProvider;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.CollectionUtil;
import org.apache.lucene.util.AttributeImpl;

View File

@ -57,8 +57,8 @@ import org.apache.lucene.search.Collector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.SimilarityProvider;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.SimilarityProvider;
import org.apache.lucene.store.RAMDirectory; // for javadocs
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;

View File

@ -22,9 +22,9 @@ import java.util.Date;
import java.util.List;
import java.util.ArrayList;
import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.SimilarityProvider;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.SimilarityProvider;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Bits;

View File

@ -17,7 +17,7 @@
package org.apache.lucene.misc;
import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.index.FieldInvertState;
/**

View File

@ -26,13 +26,13 @@ import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.search.DefaultSimilarityProvider;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.SimilarityProvider;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.search.similarities.DefaultSimilarityProvider;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.SimilarityProvider;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;

View File

@ -18,11 +18,11 @@
package org.apache.lucene.misc;
import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.search.DefaultSimilarityProvider;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.TFIDFSimilarity;
import org.apache.lucene.search.SimilarityProvider;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.search.similarities.DefaultSimilarityProvider;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.SimilarityProvider;
import org.apache.lucene.search.similarities.TFIDFSimilarity;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.index.FieldInvertState;

View File

@ -31,13 +31,13 @@ import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.MultiNorms;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.search.DefaultSimilarityProvider;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.SimilarityProvider;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.search.similarities.DefaultSimilarityProvider;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.SimilarityProvider;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;

View File

@ -31,6 +31,8 @@ import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.*;
import org.apache.lucene.search.similarities.TFIDFSimilarity;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PriorityQueue;

View File

@ -223,14 +223,14 @@ public class Field implements IndexableField {
* document.
*
* <p>The boost is used to compute the norm factor for the field. By
* default, in the {@link org.apache.lucene.search.Similarity#computeNorm(FieldInvertState)} method,
* default, in the {@link org.apache.lucene.search.similarities.Similarity#computeNorm(FieldInvertState)} method,
* the boost value is multiplied by the length normalization factor and then
* rounded by {@link org.apache.lucene.search.DefaultSimilarity#encodeNormValue(float)} before it is stored in the
* rounded by {@link org.apache.lucene.search.similarities.DefaultSimilarity#encodeNormValue(float)} before it is stored in the
* index. One should attempt to ensure that this product does not overflow
* the range of that encoding.
*
* @see org.apache.lucene.search.Similarity#computeNorm(FieldInvertState)
* @see org.apache.lucene.search.DefaultSimilarity#encodeNormValue(float)
* @see org.apache.lucene.search.similarities.Similarity#computeNorm(FieldInvertState)
* @see org.apache.lucene.search.similarities.DefaultSimilarity#encodeNormValue(float)
*/
public void setBoost(float boost) {
this.boost = boost;

View File

@ -32,7 +32,7 @@ import org.apache.lucene.index.DocumentsWriterPerThread.IndexingChain;
import org.apache.lucene.index.DocumentsWriterPerThreadPool.ThreadState;
import org.apache.lucene.index.FieldInfos.FieldNumberBiMap;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SimilarityProvider;
import org.apache.lucene.search.similarities.SimilarityProvider;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.Directory;

View File

@ -26,7 +26,7 @@ import java.text.NumberFormat;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.DocumentsWriterDeleteQueue.DeleteSlice;
import org.apache.lucene.search.SimilarityProvider;
import org.apache.lucene.search.similarities.SimilarityProvider;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FlushInfo;
import org.apache.lucene.store.IOContext;

View File

@ -32,7 +32,7 @@ import org.apache.lucene.index.codecs.CodecProvider;
import org.apache.lucene.index.codecs.PerDocValues;
import org.apache.lucene.index.values.IndexDocValues;
import org.apache.lucene.search.FieldCache; // javadocs
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.*;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
@ -1012,7 +1012,7 @@ public abstract class IndexReader implements Cloneable,Closeable {
*
* @see #norms(String)
* @see Similarity#computeNorm(FieldInvertState)
* @see org.apache.lucene.search.DefaultSimilarity#decodeNormValue(byte)
* @see org.apache.lucene.search.similarities.DefaultSimilarity#decodeNormValue(byte)
* @throws StaleReaderException if the index has changed
* since this reader was opened
* @throws CorruptIndexException if the index is corrupt

View File

@ -22,7 +22,7 @@ import org.apache.lucene.index.DocumentsWriterPerThread.IndexingChain;
import org.apache.lucene.index.IndexWriter.IndexReaderWarmer;
import org.apache.lucene.index.codecs.CodecProvider;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.SimilarityProvider;
import org.apache.lucene.search.similarities.SimilarityProvider;
import org.apache.lucene.util.Version;
/**

View File

@ -17,7 +17,7 @@ package org.apache.lucene.index;
* limitations under the License.
*/
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.util.ArrayUtil;
/** Taps into DocInverter, as an InvertedDocEndConsumer,

View File

@ -24,7 +24,8 @@ import org.apache.lucene.index.Term;
import org.apache.lucene.util.ToStringUtils;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.ConjunctionTermScorer.DocsAndFreqs;
import org.apache.lucene.search.Similarity.ExactDocScorer;
import org.apache.lucene.search.similarities.SimilarityProvider;
import org.apache.lucene.search.similarities.Similarity.ExactDocScorer;
import org.apache.lucene.search.TermQuery.TermWeight;
import java.io.IOException;

View File

@ -24,6 +24,7 @@ import java.util.List;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery.BooleanWeight;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.Scorer.ChildScorer;
/* See the description in BooleanScorer.java, comparing

View File

@ -18,7 +18,7 @@ package org.apache.lucene.search;
*/
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.search.Similarity.ExactDocScorer;
import org.apache.lucene.search.similarities.Similarity.ExactDocScorer;
import org.apache.lucene.util.ArrayUtil;
import java.io.IOException;
import java.util.Comparator;

View File

@ -21,6 +21,7 @@ import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.index.*;
import org.apache.lucene.search.similarities.Similarity;
final class ExactPhraseScorer extends Scorer {
private final int endMinus1;

View File

@ -38,6 +38,8 @@ import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Weight.ScorerContext;
import org.apache.lucene.search.similarities.DefaultSimilarityProvider;
import org.apache.lucene.search.similarities.SimilarityProvider;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.NIOFSDirectory; // javadoc
import org.apache.lucene.util.ReaderUtil;

View File

@ -26,7 +26,8 @@ import org.apache.lucene.index.IndexReader.ReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.search.Similarity.SloppyDocScorer;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.Similarity.SloppyDocScorer;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.TermContext;
@ -164,8 +165,7 @@ public class MultiPhraseQuery extends Query {
@Override
public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException {
if (termArrays.size() == 0) // optimize zero-term case
return null;
assert !termArrays.isEmpty();
final IndexReader reader = context.reader;
final Bits liveDocs = reader.getLiveDocs();
@ -249,7 +249,11 @@ public class MultiPhraseQuery extends Query {
@Override
public Query rewrite(IndexReader reader) {
if (termArrays.size() == 1) { // optimize one-term case
if (termArrays.isEmpty()) {
BooleanQuery bq = new BooleanQuery();
bq.setBoost(getBoost());
return bq;
} else if (termArrays.size() == 1) { // optimize one-term case
Term[] terms = termArrays.get(0);
BooleanQuery boq = new BooleanQuery(true);
for (int i=0; i<terms.length; i++) {

View File

@ -29,7 +29,8 @@ import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.TermState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.Similarity.SloppyDocScorer;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.Similarity.SloppyDocScorer;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.TermContext;
import org.apache.lucene.util.ToStringUtils;
@ -119,7 +120,11 @@ public class PhraseQuery extends Query {
@Override
public Query rewrite(IndexReader reader) throws IOException {
if (terms.size() == 1) {
if (terms.isEmpty()) {
BooleanQuery bq = new BooleanQuery();
bq.setBoost(getBoost());
return bq;
} else if (terms.size() == 1) {
TermQuery tq = new TermQuery(terms.get(0));
tq.setBoost(getBoost());
return tq;
@ -208,8 +213,7 @@ public class PhraseQuery extends Query {
@Override
public Scorer scorer(AtomicReaderContext context, ScorerContext scorerContext) throws IOException {
if (terms.size() == 0) // optimize zero-term case
return null;
assert !terms.isEmpty();
final IndexReader reader = context.reader;
final Bits liveDocs = reader.getLiveDocs();
PostingsAndFreq[] postingsFreqs = new PostingsAndFreq[terms.size()];
@ -285,12 +289,6 @@ public class PhraseQuery extends Query {
@Override
public Weight createWeight(IndexSearcher searcher) throws IOException {
if (terms.size() == 1) { // optimize one-term case
Term term = terms.get(0);
Query termQuery = new TermQuery(term);
termQuery.setBoost(getBoost());
return termQuery.createWeight(searcher);
}
return new PhraseWeight(searcher);
}

View File

@ -19,6 +19,8 @@ package org.apache.lucene.search;
import java.io.IOException;
import org.apache.lucene.search.similarities.Similarity;
/** Expert: Scoring functionality for phrase queries.
* <br>A document is considered matching if it contains the phrase-query terms
* at "valid" positions. What "valid positions" are

View File

@ -20,6 +20,8 @@ package org.apache.lucene.search;
import java.io.IOException;
import java.util.LinkedHashSet;
import org.apache.lucene.search.similarities.Similarity;
final class SloppyPhraseScorer extends PhraseScorer {
private int slop;
private PhrasePositions repeats[];

View File

@ -28,7 +28,8 @@ import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.IndexReader.ReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Similarity.ExactDocScorer;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.Similarity.ExactDocScorer;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.TermContext;
import org.apache.lucene.util.ReaderUtil;

View File

@ -20,6 +20,7 @@ package org.apache.lucene.search;
import java.io.IOException;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.search.similarities.Similarity;
/** Expert: A <code>Scorer</code> for documents matching a <code>Term</code>.
*/

View File

@ -22,6 +22,7 @@ import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.IndexReader.ReaderContext;
import org.apache.lucene.search.similarities.SimilarityProvider;
/**
* Expert: Calculate query weights and build query scorers.

View File

@ -22,10 +22,10 @@ import org.apache.lucene.search.ComplexExplanation;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.DefaultSimilarity; // javadocs only
import org.apache.lucene.search.Weight;
import org.apache.lucene.search.Similarity.SloppyDocScorer;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.Similarity.SloppyDocScorer;
import org.apache.lucene.search.spans.NearSpansOrdered;
import org.apache.lucene.search.spans.NearSpansUnordered;
import org.apache.lucene.search.spans.SpanNearQuery;
@ -52,7 +52,7 @@ import java.util.Iterator;
* <p/>
* Payload scores are aggregated using a pluggable {@link PayloadFunction}.
*
* @see org.apache.lucene.search.Similarity.SloppyDocScorer#computePayloadFactor(int, int, int, BytesRef)
* @see org.apache.lucene.search.similarities.Similarity.SloppyDocScorer#computePayloadFactor(int, int, int, BytesRef)
*/
public class PayloadNearQuery extends SpanNearQuery {
protected String fieldName;

View File

@ -20,16 +20,16 @@ package org.apache.lucene.search.payloads;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.search.DefaultSimilarity; // javadocs only
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Weight;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.ComplexExplanation;
import org.apache.lucene.search.Similarity.SloppyDocScorer;
import org.apache.lucene.search.Weight.ScorerContext;
import org.apache.lucene.search.payloads.PayloadNearQuery.PayloadNearSpanScorer;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.Similarity.SloppyDocScorer;
import org.apache.lucene.search.spans.TermSpans;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.search.spans.SpanWeight;
@ -49,7 +49,7 @@ import java.io.IOException;
* which returns 1 by default.
* <p/>
* Payload scores are aggregated using a pluggable {@link PayloadFunction}.
* @see org.apache.lucene.search.Similarity.SloppyDocScorer#computePayloadFactor(int, int, int, BytesRef)
* @see org.apache.lucene.search.similarities.Similarity.SloppyDocScorer#computePayloadFactor(int, int, int, BytesRef)
**/
public class PayloadTermQuery extends SpanTermQuery {
protected PayloadFunction function;

View File

@ -0,0 +1,63 @@
package org.apache.lucene.search.similarities;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.search.Explanation;
/**
 * Base class for the <em>first normalization of the informative content</em>
 * in the DFR framework. The component is also known as the <em>after
 * effect</em>; it is defined as
 * <em>Inf<sub>2</sub> = 1 - Prob<sub>2</sub></em>, with
 * <em>Prob<sub>2</sub></em> measuring the <em>information gain</em>.
 *
 * @see DFRSimilarity
 * @lucene.experimental
 */
public abstract class AfterEffect {

  /**
   * Computes the aftereffect score.
   *
   * @param stats collection statistics for the term being scored
   * @param tfn   the normalized term frequency
   * @return the aftereffect factor
   */
  public abstract float score(BasicStats stats, float tfn);

  /**
   * Builds an explanation for the value returned by
   * {@link #score(BasicStats, float)}.
   *
   * @param stats collection statistics for the term being scored
   * @param tfn   the normalized term frequency
   * @return an explanation of the score
   */
  public abstract Explanation explain(BasicStats stats, float tfn);

  /**
   * Subclasses must override this method to return the code of the
   * after effect formula. Refer to the original paper for the list.
   */
  @Override
  public abstract String toString();

  /** Neutral implementation, used when no aftereffect should be applied. */
  public static final class NoAfterEffect extends AfterEffect {

    /** The constant factor applied by this implementation. */
    private static final float NEUTRAL_FACTOR = 1f;

    /** Always returns the neutral factor; both arguments are ignored. */
    @Override
    public final float score(BasicStats stats, float tfn) {
      return NEUTRAL_FACTOR;
    }

    /** Explains the constant score; both arguments are ignored. */
    @Override
    public final Explanation explain(BasicStats stats, float tfn) {
      return new Explanation(NEUTRAL_FACTOR, "no aftereffect");
    }

    /** Returns the empty string: this implementation has no formula code. */
    @Override
    public String toString() {
      return "";
    }
  }
}

View File

@ -0,0 +1,49 @@
package org.apache.lucene.search.similarities;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.search.Explanation;
/**
 * Model of the information gain based on the ratio of two Bernoulli processes.
 * @lucene.experimental
 */
public class AfterEffectB extends AfterEffect {

  /**
   * Computes <code>(F + 1) / (n * (tfn + 1))</code>, where <code>F</code> is
   * the total term frequency and <code>n</code> the document frequency
   * reported by <code>stats</code>.
   */
  @Override
  public final float score(BasicStats stats, float tfn) {
    final long totalTermFreq = stats.getTotalTermFreq();
    final int docFreq = stats.getDocFreq();
    final float numerator = totalTermFreq + 1;
    final float denominator = docFreq * (tfn + 1);
    return numerator / denominator;
  }

  /**
   * Explains the score, listing <code>tfn</code>, the total term frequency
   * and the document frequency as details, in that order.
   */
  @Override
  public final Explanation explain(BasicStats stats, float tfn) {
    final Explanation explanation = new Explanation(
        score(stats, tfn),
        getClass().getSimpleName() + ", computed from: ");
    explanation.addDetail(new Explanation(tfn, "tfn"));
    explanation.addDetail(new Explanation(stats.getTotalTermFreq(), "totalTermFreq"));
    explanation.addDetail(new Explanation(stats.getDocFreq(), "docFreq"));
    return explanation;
  }

  /** Returns the code of this after effect formula, <code>"B"</code>. */
  @Override
  public String toString() {
    return "B";
  }
}

View File

@ -0,0 +1,45 @@
package org.apache.lucene.search.similarities;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.search.Explanation;
/**
 * Model of the information gain based on Laplace's law of succession.
 * @lucene.experimental
 */
public class AfterEffectL extends AfterEffect {

  /**
   * Computes <code>1 / (tfn + 1)</code>; <code>stats</code> is not consulted.
   */
  @Override
  public final float score(BasicStats stats, float tfn) {
    final float denominator = tfn + 1;
    return 1 / denominator;
  }

  /** Explains the score, listing <code>tfn</code> as its only detail. */
  @Override
  public final Explanation explain(BasicStats stats, float tfn) {
    final Explanation explanation = new Explanation(
        score(stats, tfn),
        getClass().getSimpleName() + ", computed from: ");
    explanation.addDetail(new Explanation(tfn, "tfn"));
    return explanation;
  }

  /** Returns the code of this after effect formula, <code>"L"</code>. */
  @Override
  public String toString() {
    return "L";
  }
}

View File

@ -0,0 +1,339 @@
package org.apache.lucene.search.similarities;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.Terms;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.SmallFloat;
import org.apache.lucene.util.TermContext;
/**
* BM25 Similarity. Introduced in Stephen E. Robertson, Steve Walker,
* Susan Jones, Micheline Hancock-Beaulieu, and Mike Gatford. Okapi at TREC-3.
* In Proceedings of the Third Text REtrieval Conference (TREC 1994).
* Gaithersburg, USA, November 1994.
* @lucene.experimental
*/
public class BM25Similarity extends Similarity {
private final float k1;
private final float b;
// TODO: should we add a delta like sifaka.cs.uiuc.edu/~ylv2/pub/sigir11-bm25l.pdf ?
/**
 * Creates a BM25 similarity with explicit parameter values. The arguments
 * are stored as given; no range validation is performed here.
 *
 * @param k1 multiplier of the per-document length term that is added to the
 *           term frequency in the denominator of the score; the numerator is
 *           scaled by <code>k1 + 1</code>
 * @param b  weight of the document-length ratio inside that length term:
 *           the term is <code>k1 * ((1 - b) + b * length / avgLength)</code>
 */
public BM25Similarity(float k1, float b) {
this.k1 = k1;
this.b = b;
}
/**
 * Creates a BM25 similarity with these default values:
 * <ul>
 *   <li>{@code k1 = 1.2},</li>
 *   <li>{@code b = 0.75}.</li>
 * </ul>
 */
public BM25Similarity() {
  this(1.2f, 0.75f);
}
/**
 * Inverse document frequency, implemented as
 * <code>log(1 + (numDocs - docFreq + 0.5)/(docFreq + 0.5))</code>.
 *
 * @param docFreq number of documents containing the term
 * @param numDocs number of documents in the collection
 * @return the idf value
 */
protected float idf(int docFreq, int numDocs) {
  final double ratio = (numDocs - docFreq + 0.5D) / (docFreq + 0.5D);
  return (float) Math.log(1 + ratio);
}
/**
 * Frequency contribution of a sloppy match, implemented as
 * <code>1 / (distance + 1)</code>.
 *
 * @param distance the edit distance of the match
 * @return the frequency increment for the match
 */
protected float sloppyFreq(int distance) {
  final int divisor = distance + 1;
  return 1.0f / divisor;
}
/**
 * Scores a payload. The default implementation ignores all of its
 * arguments and returns <code>1</code>.
 */
protected float scorePayload(int doc, int start, int end, BytesRef payload) {
  return 1f;
}
/**
 * Computes the average length of <code>field</code> as
 * <code>sumTotalTermFreq / maxDoc</code>.
 * <p>
 * Returns <code>1</code> whenever a meaningful average cannot be computed:
 * <ul>
 *   <li>the field does not exist,</li>
 *   <li>the index does not store sumTotalTermFreq (reported as
 *       <code>-1</code>; Lucene 3.x indexes or any field that omits
 *       frequency information),</li>
 *   <li>sumTotalTermFreq or maxDoc is zero.</li>
 * </ul>
 * The fallback matters because {@link #computeStats} divides by the value
 * returned here; an average of zero (or NaN) would poison every cached
 * length-normalization factor.
 *
 * @param searcher searcher providing the index reader and maxDoc
 * @param field    name of the field
 * @return the average field length, or <code>1</code> if unavailable
 * @throws IOException if the term statistics cannot be read
 */
protected float avgFieldLength(IndexSearcher searcher, String field) throws IOException {
  final Terms terms = MultiFields.getTerms(searcher.getIndexReader(), field);
  if (terms == null) {
    // field does not exist
    return 1f;
  }
  final long sumTotalTermFreq = terms.getSumTotalTermFreq();
  final long maxDoc = searcher.maxDoc();
  if (sumTotalTermFreq <= 0 || maxDoc <= 0) {
    // -1 means the statistic is not stored; 0 would give an average of 0
    // (or 0/0), which callers later use as a divisor
    return 1f;
  }
  return (float) (sumTotalTermFreq / (double) maxDoc);
}
/**
 * Encodes <code>boost / sqrt(length)</code> into a single byte with
 * {@link SmallFloat#floatToByte315(float)}. This is compatible with
 * Lucene's default implementation. If you change this, then you should
 * change {@link #decodeNormValue(byte)} to match.
 *
 * @param boost       the field boost
 * @param fieldLength the field length in terms
 * @return the encoded norm byte
 */
protected byte encodeNormValue(float boost, int fieldLength) {
  final float rootOfLength = (float) Math.sqrt(fieldLength);
  return SmallFloat.floatToByte315(boost / rootOfLength);
}
/**
 * Decodes a norm byte. The default implementation returns
 * <code>1 / f<sup>2</sup></code>, where <code>f</code> is
 * {@link SmallFloat#byte315ToFloat(byte)}, looked up from a precomputed
 * table.
 *
 * @param b the encoded norm byte
 * @return the decoded value
 */
protected float decodeNormValue(byte b) {
  final int unsignedIndex = b & 0xFF; // treat the byte as 0..255
  return NORM_TABLE[unsignedIndex];
}
/**
 * Whether the overlap count is subtracted from the field length when
 * {@link #computeNorm} computes the norm. Initialized to true.
 */
protected boolean discountOverlaps = true;
/**
 * Determines whether overlap tokens (Tokens with 0 position increment) are
 * ignored when computing norm. By default this is true, meaning overlap
 * tokens do not count when computing norms.
 *
 * @param v true to leave overlap tokens out of the field length
 */
public void setDiscountOverlaps(boolean v) {
  this.discountOverlaps = v;
}
/**
 * Returns whether overlap tokens are ignored when computing norms.
 *
 * @see #setDiscountOverlaps
 */
public boolean getDiscountOverlaps() {
  return this.discountOverlaps;
}
/**
 * Cache of decoded bytes: entry <code>i</code> holds
 * <code>1 / f<sup>2</sup></code>, where <code>f</code> is the float decoded
 * from byte <code>i</code>.
 */
private static final float[] NORM_TABLE = new float[256];

static {
  for (int code = 0; code < NORM_TABLE.length; code++) {
    final float decoded = SmallFloat.byte315ToFloat((byte) code);
    NORM_TABLE[code] = 1.0f / (decoded * decoded);
  }
}
/**
 * Computes the norm byte for a field: the field length, minus the overlap
 * count when {@link #discountOverlaps} is set, is encoded together with the
 * field boost by {@link #encodeNormValue(float, int)}.
 */
@Override
public final byte computeNorm(FieldInvertState state) {
  final int numTerms;
  if (discountOverlaps) {
    numTerms = state.getLength() - state.getNumOverlap();
  } else {
    numTerms = state.getLength();
  }
  return encodeNormValue(state.getBoost(), numTerms);
}
/**
 * Computes the idf of a single term and wraps it in an explanation.
 *
 * @param stats    term statistics supplying the document frequency
 * @param searcher searcher supplying maxDoc
 * @return an explanation whose value is the idf
 * @throws IOException declared for subclasses; not thrown by this code path itself
 */
public Explanation idfExplain(TermContext stats, final IndexSearcher searcher) throws IOException {
  final int docFreq = stats.docFreq();
  final int maxDocs = searcher.maxDoc();
  final String description = "idf(docFreq=" + docFreq + ", maxDocs=" + maxDocs + ")";
  return new Explanation(idf(docFreq, maxDocs), description);
}
/**
 * Computes the summed idf of several terms and wraps it in an explanation
 * that lists every term's idf as a detail, in input order.
 *
 * @param stats    term statistics, one entry per term
 * @param searcher searcher supplying maxDoc
 * @return an explanation whose value is the sum of the per-term idf values
 * @throws IOException declared for subclasses; not thrown by this code path itself
 */
public Explanation idfExplain(final TermContext stats[], IndexSearcher searcher) throws IOException {
  final int maxDocs = searcher.maxDoc();
  final Explanation summary = new Explanation();
  summary.setDescription("idf(), sum of:");
  float total = 0.0f;
  for (int i = 0; i < stats.length; i++) {
    final int docFreq = stats[i].docFreq();
    final float termIdf = idf(docFreq, maxDocs);
    summary.addDetail(new Explanation(termIdf, "idf(docFreq=" + docFreq + ", maxDocs=" + maxDocs + ")"));
    total += termIdf;
  }
  summary.setValue(total);
  return summary;
}
/**
 * Gathers the query-level statistics: the idf explanation (single-term or
 * summed, depending on the number of terms), the average field length, and
 * a 256-entry table holding the frequency-independent part of the BM25
 * equation for every possible norm byte.
 */
@Override
public final Stats computeStats(IndexSearcher searcher, String fieldName, float queryBoost, TermContext... termStats) throws IOException {
  final Explanation idf;
  if (termStats.length == 1) {
    idf = idfExplain(termStats[0], searcher);
  } else {
    idf = idfExplain(termStats, searcher);
  }

  final float avgdl = avgFieldLength(searcher, fieldName);

  // one entry per norm byte value: k1 * ((1 - b) + b * length / avgdl)
  final float[] cache = new float[256];
  for (int normByte = 0; normByte < cache.length; normByte++) {
    cache[normByte] = k1 * ((1 - b) + b * decodeNormValue((byte) normByte) / avgdl);
  }
  return new BM25Stats(idf, queryBoost, avgdl, cache);
}
/**
 * Creates the scorer for exact (integer-frequency) matches. When the reader
 * has no norms for the field a norm-free scorer is returned; otherwise the
 * scorer reads the per-document norm bytes.
 */
@Override
public final ExactDocScorer exactDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException {
  final byte[] norms = context.reader.norms(fieldName);
  if (norms == null) {
    return new ExactBM25DocScorerNoNorms((BM25Stats) stats);
  }
  return new ExactBM25DocScorer((BM25Stats) stats, norms);
}
/**
 * Creates the scorer for sloppy (float-frequency) matches. The norms
 * returned by the reader are handed over as-is, including <code>null</code>.
 */
@Override
public final SloppyDocScorer sloppyDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException {
  final BM25Stats bm25Stats = (BM25Stats) stats;
  final byte[] norms = context.reader.norms(fieldName);
  return new SloppyBM25DocScorer(bm25Stats, norms);
}
/** Exact-frequency scorer for fields that have norms. */
private class ExactBM25DocScorer extends ExactDocScorer {
  private final BM25Stats stats;
  private final byte[] norms;
  private final float[] cache;
  private final float weightValue; // boost * idf * (k1 + 1)

  /**
   * @param stats query-level statistics, including the per-norm-byte cache
   * @param norms per-document norm bytes; must not be null
   */
  ExactBM25DocScorer(BM25Stats stats, byte norms[]) {
    assert norms != null;
    this.stats = stats;
    this.norms = norms;
    this.cache = stats.cache;
    this.weightValue = stats.weight * (k1 + 1);
  }

  /** Scores a document from its term frequency and its norm byte. */
  @Override
  public float score(int doc, int freq) {
    final float lengthTerm = cache[norms[doc] & 0xFF];
    return weightValue * freq / (freq + lengthTerm);
  }

  /** Delegates to the enclosing similarity's score explanation. */
  @Override
  public Explanation explain(int doc, Explanation freq) {
    return explainScore(doc, freq, stats, norms);
  }
}
/**
 * Exact-frequency BM25 scorer for fields without norms; behaves as if b=0,
 * so the score depends on the frequency only.
 */
private class ExactBM25DocScorerNoNorms extends ExactDocScorer {
  /** number of low frequencies whose score is precomputed */
  private static final int SCORE_CACHE_SIZE = 32;

  private final BM25Stats stats;
  /** stats.weight * (k1 + 1), i.e. boost * idf * (k1 + 1) */
  private final float weightValue;
  /** scores for freq = 0 .. SCORE_CACHE_SIZE - 1 */
  private final float[] scoreCache;

  ExactBM25DocScorerNoNorms(BM25Stats stats) {
    this.stats = stats;
    this.weightValue = stats.weight * (k1 + 1);
    final float[] precomputed = new float[SCORE_CACHE_SIZE];
    for (int freq = 0; freq < precomputed.length; freq++) {
      precomputed[freq] = weightValue * freq / (freq + k1);
    }
    this.scoreCache = precomputed;
  }

  @Override
  public float score(int doc, int freq) {
    // TODO: maybe score cache is more trouble than its worth?
    if (freq < SCORE_CACHE_SIZE) {
      return scoreCache[freq]; // cache hit
    }
    return weightValue * freq / (freq + k1); // cache miss
  }

  @Override
  public Explanation explain(int doc, Explanation freq) {
    return explainScore(doc, freq, stats, null);
  }
}
/**
 * Sloppy-frequency BM25 scorer. Accepts a null norms array, in which case
 * every document is scored as if b=0.
 */
private class SloppyBM25DocScorer extends SloppyDocScorer {
  private final BM25Stats stats;
  private final byte[] norms;
  /** per-norm-byte table taken from the stats object */
  private final float[] cache;
  /** boost * idf * (k1 + 1) */
  private final float weightValue;

  SloppyBM25DocScorer(BM25Stats stats, byte norms[]) {
    this.stats = stats;
    this.norms = norms;
    this.cache = stats.cache;
    this.weightValue = stats.weight * (k1 + 1);
  }

  @Override
  public float score(int doc, float freq) {
    final float norm;
    if (norms != null) {
      norm = cache[norms[doc] & 0xFF];
    } else {
      // if there are no norms, we act as if b=0
      norm = k1;
    }
    return weightValue * freq / (freq + norm);
  }

  @Override
  public Explanation explain(int doc, Explanation freq) {
    return explainScore(doc, freq, stats, norms);
  }

  @Override
  public float computeSlopFactor(int distance) {
    return sloppyFreq(distance);
  }

  @Override
  public float computePayloadFactor(int doc, int start, int end, BytesRef payload) {
    return scorePayload(doc, start, end, payload);
  }
}
/** Per-query collection statistics and boosts used by the BM25 scorers. */
private static class BM25Stats extends Stats {
  /** BM25's idf, kept as an explanation so it can be reported later */
  private final Explanation idf;
  /** query's inner boost */
  private final float queryBoost;
  /** The average document length. */
  private final float avgdl;
  /** precomputed norm[256] with k1 * ((1 - b) + b * dl / avgdl) */
  private final float cache[];
  /** idf * inner boost * top-level boost; assigned by {@link #normalize} */
  private float weight;

  BM25Stats(Explanation idf, float queryBoost, float avgdl, float cache[]) {
    this.idf = idf;
    this.queryBoost = queryBoost;
    this.avgdl = avgdl;
    this.cache = cache;
  }

  @Override
  public float getValueForNormalization() {
    // a TF-IDF like value is reported to be nice; this class never applies a query norm itself
    final float idfTimesBoost = this.idf.getValue() * this.queryBoost;
    return idfTimesBoost * idfTimesBoost;
  }

  @Override
  public void normalize(float queryNorm, float topLevelBoost) {
    // queryNorm is deliberately unused; only the top-level boost is folded in
    this.weight = this.idf.getValue() * this.queryBoost * topLevelBoost;
  }
}
/**
 * Builds the explanation of a BM25 score for one document.
 * The result is the product of the query boost (listed only when it is not 1),
 * the idf explanation held by {@code stats}, and a tfNorm factor.
 *
 * @param doc document id; used in the description and to look up the norm byte
 * @param freq explanation of the term frequency; its value feeds the tfNorm formula
 * @param stats statistics providing boost, idf and average field length
 * @param norms norm bytes of the field, or {@code null} when norms are omitted;
 *              in that case tfNorm is computed as if b=0
 * @return the assembled explanation
 */
private Explanation explainScore(int doc, Explanation freq, BM25Stats stats, byte[] norms) {
  Explanation result = new Explanation();
  // Report the numeric frequency. Concatenating the Explanation object itself
  // would embed its toString() output in the description instead of the value.
  result.setDescription("score(doc="+doc+",freq="+freq.getValue()+"), product of:");
  Explanation boostExpl = new Explanation(stats.queryBoost, "boost");
  // a boost of exactly 1 does not change the product, so it is left out of the details
  if (stats.queryBoost != 1.0f)
    result.addDetail(boostExpl);
  result.addDetail(stats.idf);
  Explanation tfNormExpl = new Explanation();
  tfNormExpl.setDescription("tfNorm, computed from:");
  tfNormExpl.addDetail(freq);
  tfNormExpl.addDetail(new Explanation(k1, "parameter k1"));
  if (norms == null) {
    // no norms: document length cannot be taken into account
    tfNormExpl.addDetail(new Explanation(0, "parameter b (norms omitted for field)"));
    tfNormExpl.setValue((freq.getValue() * (k1 + 1)) / (freq.getValue() + k1));
  } else {
    float doclen = decodeNormValue(norms[doc]);
    tfNormExpl.addDetail(new Explanation(b, "parameter b"));
    tfNormExpl.addDetail(new Explanation(stats.avgdl, "avgFieldLength"));
    tfNormExpl.addDetail(new Explanation(doclen, "fieldLength"));
    tfNormExpl.setValue((freq.getValue() * (k1 + 1)) / (freq.getValue() + k1 * (1 - b + b * doclen/stats.avgdl)));
  }
  result.addDetail(tfNormExpl);
  result.setValue(boostExpl.getValue() * stats.idf.getValue() * tfNormExpl.getValue());
  return result;
}
/** Returns the model name followed by the current k1 and b parameter values. */
@Override
public String toString() {
  final StringBuilder text = new StringBuilder("BM25(k1=");
  text.append(k1).append(",b=").append(b).append(")");
  return text.toString();
}
}

View File

@ -0,0 +1,60 @@
package org.apache.lucene.search.similarities;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.search.Explanation;
/**
 * Base class of the <em>basic model</em> implementations of the DFR framework.
 * A basic model computes the <em>informative content
 * Inf<sub>1</sub> = -log<sub>2</sub>Prob<sub>1</sub></em>.
 *
 * @see DFRSimilarity
 * @lucene.experimental
 */
public abstract class BasicModel {

  /** Returns the informative content score. */
  public abstract float score(BasicStats stats, float tfn);

  /**
   * Returns an explanation for the score.
   * <p>This generic version reports {@code tfn}, the number of documents and
   * the total term frequency, which is what most basic models use to compute
   * Inf<sub>1</sub>. Subclasses that rely on other statistics must override
   * this method.</p>
   */
  public Explanation explain(BasicStats stats, float tfn) {
    final Explanation explanation = new Explanation();
    explanation.setDescription(getClass().getSimpleName() + ", computed from: ");
    explanation.setValue(score(stats, tfn));

    final Explanation tfnDetail = new Explanation(tfn, "tfn");
    explanation.addDetail(tfnDetail);

    final Explanation docCountDetail =
        new Explanation(stats.getNumberOfDocuments(), "numberOfDocuments");
    explanation.addDetail(docCountDetail);

    final Explanation termFreqDetail =
        new Explanation(stats.getTotalTermFreq(), "totalTermFreq");
    explanation.addDetail(termFreqDetail);

    return explanation;
  }

  /**
   * Subclasses must override this method to return the code of the
   * basic model formula. Refer to the original paper for the list.
   */
  @Override
  public abstract String toString();
}

View File

@ -0,0 +1,47 @@
package org.apache.lucene.search.similarities;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import static org.apache.lucene.search.similarities.SimilarityBase.log2;
/**
 * Limiting form of the Bose-Einstein model. The formula used in Lucene differs
 * slightly from the one in the original paper: {@code F} is increased by
 * {@code tfn} and {@code N} is increased by {@code F}.
 * @lucene.experimental
 */
public class BasicModelBE extends BasicModel {

  @Override
  public final float score(BasicStats stats, float tfn) {
    final double adjustedTermFreq = stats.getTotalTermFreq() + tfn;
    // approximation only holds true when F << N, so we use N += F
    final double adjustedDocCount = adjustedTermFreq + stats.getNumberOfDocuments();
    final double combined = adjustedDocCount + adjustedTermFreq;

    final double leading = -log2((adjustedDocCount - 1) * Math.E);
    final double upper = f(combined - 1, combined - tfn - 2);
    final double lower = f(adjustedTermFreq, adjustedTermFreq - tfn);
    return (float) (leading + upper - lower);
  }

  /** The <em>f</em> helper function defined for <em>B<sub>E</sub></em>. */
  private final double f(double n, double m) {
    final double ratioTerm = (m + 0.5) * log2(n / m);
    final double differenceTerm = (n - m) * log2(n);
    return ratioTerm + differenceTerm;
  }

  @Override
  public String toString() {
    return "Be";
  }
}

View File

@ -0,0 +1,52 @@
package org.apache.lucene.search.similarities;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import static org.apache.lucene.search.similarities.SimilarityBase.log2;
/**
 * Approximation of the binomial model with the divergence for DFR.
 * The formula used in Lucene differs slightly from the one in the original
 * paper: to avoid underflow for small values of {@code N} and {@code F},
 * {@code N} is increased by {@code 1} and {@code F} is always increased by
 * {@code tfn}.
 * <p>
 * WARNING: for terms that do not meet the expected random distribution
 * (e.g. stopwords), this model may give poor performance, such as
 * abnormally high scores for low tf values.
 * @lucene.experimental
 */
public class BasicModelD extends BasicModel {

  @Override
  public final float score(BasicStats stats, float tfn) {
    // phi must stay below 1 even for tiny totalTermFreq values, otherwise
    // 1 - phi can go negative and the result becomes NaN. Always adding tfn
    // to totalTermFreq yields such a 'normalized' F.
    final double normalizedF = stats.getTotalTermFreq() + tfn;
    final double phi = (double) tfn / normalizedF;
    final double oneMinusPhi = 1 - phi;
    final double p = 1.0 / (stats.getNumberOfDocuments() + 1);

    final double divergence =
        phi * log2(phi / p) + oneMinusPhi * log2(oneMinusPhi / (1 - p));
    final double correction = 0.5 * log2(1 + 2 * Math.PI * tfn * oneMinusPhi);
    return (float) (divergence * normalizedF + correction);
  }

  @Override
  public String toString() {
    return "D";
  }
}

View File

@ -0,0 +1,41 @@
package org.apache.lucene.search.similarities;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import static org.apache.lucene.search.similarities.SimilarityBase.log2;
/**
 * Geometric as limiting form of the Bose-Einstein model. The formula used in
 * Lucene differs slightly from the one in the original paper: {@code N} is
 * increased by {@code F}, so that {@code lambda = F / (N + F)}.
 * @lucene.experimental
 */
public class BasicModelG extends BasicModel {

  @Override
  public final float score(BasicStats stats, float tfn) {
    final long totalTermFreq = stats.getTotalTermFreq();
    final int numberOfDocuments = stats.getNumberOfDocuments();
    // just like in BE, approximation only holds true when F << N, so we use lambda = F / (N + F)
    final double lambda = totalTermFreq / (double) (numberOfDocuments + totalTermFreq);

    // -log(1 / (lambda + 1)) -> log(lambda + 1)
    final double constantPart = log2(lambda + 1);
    final double frequencyPart = tfn * log2((1 + lambda) / lambda);
    return (float) (constantPart + frequencyPart);
  }

  @Override
  public String toString() {
    return "G";
  }
}

View File

@ -0,0 +1,38 @@
package org.apache.lucene.search.similarities;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import static org.apache.lucene.search.similarities.SimilarityBase.log2;
/**
 * An approximation of the <em>I(n<sub>e</sub>)</em> model.
 * The score is {@code tfn * log2(1 + (N + 1) / (F + 0.5))}, with {@code N} the
 * number of documents and {@code F} the total term frequency.
 * @lucene.experimental
 */
public class BasicModelIF extends BasicModel {

  @Override
  public final float score(BasicStats stats, float tfn) {
    final int numberOfDocuments = stats.getNumberOfDocuments();
    final long totalTermFreq = stats.getTotalTermFreq();
    final double ratio = (numberOfDocuments + 1) / (totalTermFreq + 0.5);
    final float informationContent = (float) (log2(1 + ratio));
    return tfn * informationContent;
  }

  @Override
  public String toString() {
    return "I(F)";
  }
}

View File

@ -0,0 +1,52 @@
package org.apache.lucene.search.similarities;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.search.Explanation;
import static org.apache.lucene.search.similarities.SimilarityBase.log2;
/**
 * The basic tf-idf model of randomness.
 * The score is {@code tfn * log2((N + 1) / (n + 0.5))}, with {@code N} the
 * number of documents and {@code n} the document frequency.
 * @lucene.experimental
 */
public class BasicModelIn extends BasicModel {

  @Override
  public final float score(BasicStats stats, float tfn) {
    final int numberOfDocuments = stats.getNumberOfDocuments();
    final int docFreq = stats.getDocFreq();
    final float idfLike = (float) (log2((numberOfDocuments + 1) / (docFreq + 0.5)));
    return tfn * idfLike;
  }

  /** Reports the document frequency instead of the total term frequency. */
  @Override
  public final Explanation explain(BasicStats stats, float tfn) {
    final Explanation explanation = new Explanation();
    explanation.setDescription(getClass().getSimpleName() + ", computed from: ");
    explanation.setValue(score(stats, tfn));

    final Explanation tfnDetail = new Explanation(tfn, "tfn");
    explanation.addDetail(tfnDetail);

    final Explanation docCountDetail =
        new Explanation(stats.getNumberOfDocuments(), "numberOfDocuments");
    explanation.addDetail(docCountDetail);

    final Explanation docFreqDetail =
        new Explanation(stats.getDocFreq(), "docFreq");
    explanation.addDetail(docFreqDetail);

    return explanation;
  }

  @Override
  public String toString() {
    return "I(n)";
  }
}

View File

@ -0,0 +1,40 @@
package org.apache.lucene.search.similarities;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import static org.apache.lucene.search.similarities.SimilarityBase.log2;
/**
 * Tf-idf model of randomness, based on a mixture of Poisson and inverse
 * document frequency.
 * @lucene.experimental
 */
public class BasicModelIne extends BasicModel {

  @Override
  public final float score(BasicStats stats, float tfn) {
    final int numberOfDocuments = stats.getNumberOfDocuments();
    final long totalTermFreq = stats.getTotalTermFreq();
    // n_e = N * (1 - ((N - 1) / N)^F)
    final double missProbability =
        Math.pow((numberOfDocuments - 1) / (double) numberOfDocuments, totalTermFreq);
    final double ne = numberOfDocuments * (1 - missProbability);
    final float idfLike = (float) (log2((numberOfDocuments + 1) / (ne + 0.5)));
    return tfn * idfLike;
  }

  @Override
  public String toString() {
    return "I(ne)";
  }
}

View File

@ -0,0 +1,46 @@
package org.apache.lucene.search.similarities;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import static org.apache.lucene.search.similarities.SimilarityBase.log2;
/**
 * Implements the Poisson approximation for the binomial model for DFR.
 * <p>
 * WARNING: for terms that do not meet the expected random distribution
 * (e.g. stopwords), this model may give poor performance, such as
 * abnormally high scores for low tf values.
 * @lucene.experimental
 */
public class BasicModelP extends BasicModel {
  /** {@code log2(Math.E)}, precomputed. Declared final so it cannot be reassigned. */
  protected static final double LOG2_E = log2(Math.E);

  /**
   * Computes the informative content from {@code tfn} and
   * {@code lambda = totalTermFreq / numberOfDocuments}.
   */
  @Override
  public final float score(BasicStats stats, float tfn) {
    // float division: the total term frequency is narrowed to float before dividing
    float lambda = (float)stats.getTotalTermFreq() / stats.getNumberOfDocuments();
    return (float)(tfn * log2(tfn / lambda)
        + (lambda + 1 / (12 * tfn) - tfn) * LOG2_E
        + 0.5 * log2(2 * Math.PI * tfn));
  }

  @Override
  public String toString() {
    return "P";
  }
}

View File

@ -0,0 +1,54 @@
package org.apache.lucene.search.similarities;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * A simple {@link Similarity} provider whose {@code get(String field)} returns,
 * for every field, the object passed to its constructor. This class
 * is aimed at non-VSM models, and therefore both the {@link #coord} and
 * {@link #queryNorm} methods return {@code 1}. Use
 * {@link DefaultSimilarityProvider} for {@link DefaultSimilarity}.
 * @lucene.experimental
 */
public class BasicSimilarityProvider implements SimilarityProvider {
  /** The similarity handed out for all fields. */
  private final Similarity similarity;

  /** Creates a provider that always returns {@code sim}. */
  public BasicSimilarityProvider(Similarity sim) {
    this.similarity = sim;
  }

  /** Always {@code 1}: the coordination factor is disabled. */
  @Override
  public float coord(int overlap, int maxOverlap) {
    return 1f;
  }

  /** Always {@code 1}: no query normalization is applied. */
  @Override
  public float queryNorm(float sumOfSquaredWeights) {
    return 1f;
  }

  /** Returns the wrapped similarity; the field name is not consulted. */
  @Override
  public Similarity get(String field) {
    return this.similarity;
  }

  @Override
  public String toString() {
    return "BasicSimilarityProvider(" + this.similarity + ")";
  }
}

View File

@ -0,0 +1,144 @@
package org.apache.lucene.search.similarities;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.index.Terms;
/**
 * Stores all statistics commonly used by ranking methods.
 * @lucene.experimental
 */
public class BasicStats extends Similarity.Stats {
  /** The number of documents. */
  protected int numberOfDocuments;
  /** The total number of tokens in the field. */
  protected long numberOfFieldTokens;
  /** The average field length. */
  protected float avgFieldLength;
  /** The document frequency. */
  protected int docFreq;
  /** The total number of occurrences of this term across all documents. */
  protected long totalTermFreq;

  // -------------------------- Boost-related stuff --------------------------

  /** Query's inner boost. */
  protected final float queryBoost;
  /** Any outer query's boost. */
  protected float topLevelBoost;
  /** Product of the inner and the top-level boost. Most Similarities do not
   * treat the two boosts differently, so they only need this value. Until
   * {@link #normalize(float, float)} is called it equals the inner boost. */
  protected float totalBoost;

  /** Constructor. Sets the query boost. */
  public BasicStats(float queryBoost) {
    this.queryBoost = queryBoost;
    this.totalBoost = queryBoost;
  }

  // ------------------------- Getter/setter methods -------------------------

  /** Returns the number of documents. */
  public int getNumberOfDocuments() {
    return this.numberOfDocuments;
  }

  /** Sets the number of documents. */
  public void setNumberOfDocuments(int numberOfDocuments) {
    this.numberOfDocuments = numberOfDocuments;
  }

  /**
   * Returns the total number of tokens in the field.
   * @see Terms#getSumTotalTermFreq()
   */
  public long getNumberOfFieldTokens() {
    return this.numberOfFieldTokens;
  }

  /**
   * Sets the total number of tokens in the field.
   * @see Terms#getSumTotalTermFreq()
   */
  public void setNumberOfFieldTokens(long numberOfFieldTokens) {
    this.numberOfFieldTokens = numberOfFieldTokens;
  }

  /** Returns the average field length. */
  public float getAvgFieldLength() {
    return this.avgFieldLength;
  }

  /** Sets the average field length. */
  public void setAvgFieldLength(float avgFieldLength) {
    this.avgFieldLength = avgFieldLength;
  }

  /** Returns the document frequency. */
  public int getDocFreq() {
    return this.docFreq;
  }

  /** Sets the document frequency. */
  public void setDocFreq(int docFreq) {
    this.docFreq = docFreq;
  }

  /** Returns the total number of occurrences of this term across all documents. */
  public long getTotalTermFreq() {
    return this.totalTermFreq;
  }

  /** Sets the total number of occurrences of this term across all documents. */
  public void setTotalTermFreq(long totalTermFreq) {
    this.totalTermFreq = totalTermFreq;
  }

  // -------------------------- Boost-related stuff --------------------------

  /** The square of the raw normalization value.
   * @see #rawNormalizationValue() */
  @Override
  public float getValueForNormalization() {
    final float raw = rawNormalizationValue();
    return raw * raw;
  }

  /** Computes the raw normalization value. This basic implementation returns
   * the query boost. Subclasses may override this method to include other
   * factors (such as idf), or to save the value for inclusion in
   * {@link #normalize(float, float)}, etc.
   */
  protected float rawNormalizationValue() {
    return this.queryBoost;
  }

  /** No normalization is done: {@code queryNorm} is not used. The
   * {@code topLevelBoost} is stored and the total boost is recomputed. */
  @Override
  public void normalize(float queryNorm, float topLevelBoost) {
    this.topLevelBoost = topLevelBoost;
    this.totalBoost = this.queryBoost * topLevelBoost;
  }

  /** Returns the total boost. */
  public float getTotalBoost() {
    return this.totalBoost;
  }
}

View File

@ -0,0 +1,86 @@
package org.apache.lucene.search.similarities;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.search.Explanation;
/**
 * Implements the <em>divergence from randomness (DFR)</em> framework
 * introduced in Gianni Amati and Cornelis Joost Van Rijsbergen. 2002.
 * Probabilistic models of information retrieval based on measuring the
 * divergence from randomness. ACM Trans. Inf. Syst. 20, 4 (October 2002),
 * 357-389.
 * <p>The DFR scoring formula is composed of three separate components: the
 * <em>basic model</em>, the <em>aftereffect</em> and an additional
 * <em>normalization</em> component, represented by the classes
 * {@code BasicModel}, {@code AfterEffect} and {@code Normalization},
 * respectively. The names of these classes were chosen to match the names of
 * their counterparts in the Terrier IR engine.</p>
 * <p>Note that <em>qtf</em>, the multiplicity of term-occurrence in the query,
 * is not handled by this implementation.</p>
 * @see BasicModel
 * @see AfterEffect
 * @see Normalization
 * @lucene.experimental
 */
public class DFRSimilarity extends SimilarityBase {
  /** The basic model for information content. */
  protected final BasicModel basicModel;
  /** The first normalization of the information content. */
  protected final AfterEffect afterEffect;
  /** The term frequency normalization. */
  protected final Normalization normalization;

  /**
   * Creates a DFR similarity from its three components.
   * @throws NullPointerException if any of the components is {@code null}
   */
  public DFRSimilarity(BasicModel basicModel,
                       AfterEffect afterEffect,
                       Normalization normalization) {
    final boolean anyMissing =
        basicModel == null || afterEffect == null || normalization == null;
    if (anyMissing) {
      throw new NullPointerException("null parameters not allowed.");
    }
    this.basicModel = basicModel;
    this.afterEffect = afterEffect;
    this.normalization = normalization;
  }

  /** Score = total boost * basic model score * aftereffect score, both at the normalized tf. */
  @Override
  protected float score(BasicStats stats, float freq, float docLen) {
    final float tfn = normalization.tfn(stats, freq, docLen);
    final float boost = stats.getTotalBoost();
    final float informationContent = basicModel.score(stats, tfn);
    final float afterEffectScore = afterEffect.score(stats, tfn);
    return boost * informationContent * afterEffectScore;
  }

  /** Adds the boost (when it is not 1), the normalization, the basic model and the aftereffect. */
  @Override
  protected void explain(Explanation expl,
      BasicStats stats, int doc, float freq, float docLen) {
    if (stats.getTotalBoost() != 1.0f) {
      final Explanation boostDetail = new Explanation(stats.getTotalBoost(), "boost");
      expl.addDetail(boostDetail);
    }

    final Explanation normalizationDetail = normalization.explain(stats, freq, docLen);
    // the normalized term frequency is the value of the normalization explanation
    final float tfn = normalizationDetail.getValue();
    expl.addDetail(normalizationDetail);
    expl.addDetail(basicModel.explain(stats, tfn));
    expl.addDetail(afterEffect.explain(stats, tfn));
  }

  @Override
  public String toString() {
    final StringBuilder name = new StringBuilder("DFR ");
    name.append(basicModel.toString());
    name.append(afterEffect.toString());
    name.append(normalization.toString());
    return name.toString();
  }
}

View File

@ -1,4 +1,4 @@
package org.apache.lucene.search;
package org.apache.lucene.search.similarities;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.util.BytesRef;
@ -85,4 +85,9 @@ public class DefaultSimilarity extends TFIDFSimilarity {
public boolean getDiscountOverlaps() {
// plain accessor: reports the current value of the discountOverlaps field
return discountOverlaps;
}
/** Returns the fixed name {@code "DefaultSimilarity"}. */
@Override
public String toString() {
return "DefaultSimilarity";
}
}

View File

@ -1,4 +1,5 @@
package org.apache.lucene.search;
package org.apache.lucene.search.similarities;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more

View File

@ -0,0 +1,45 @@
package org.apache.lucene.search.similarities;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.search.Explanation;
/**
 * The probabilistic distribution used to model term occurrence
 * in information-based models.
 * @see IBSimilarity
 * @lucene.experimental
 */
public abstract class Distribution {

  /** Computes the score. */
  public abstract float score(BasicStats stats, float tfn, float lambda);

  /** Explains the score. Only the value and the simple class name of the
   * model are reported, since both {@code tfn} and {@code lambda} are
   * explained elsewhere. */
  public Explanation explain(BasicStats stats, float tfn, float lambda) {
    final float value = score(stats, tfn, lambda);
    final String modelName = getClass().getSimpleName();
    return new Explanation(value, modelName);
  }

  /**
   * Subclasses must override this method to return the name of the
   * distribution.
   */
  @Override
  public abstract String toString();
}

View File

@ -0,0 +1,37 @@
package org.apache.lucene.search.similarities;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Log-logistic distribution.
 * <p>Unlike for DFR, the natural logarithm is used, as
 * it is faster to compute and the original paper does not express any
 * preference to a specific base.</p>
 * @lucene.experimental
 */
public class DistributionLL extends Distribution {

  /** Returns {@code -ln(lambda / (tfn + lambda))}; {@code stats} is not used. */
  @Override
  public final float score(BasicStats stats, float tfn, float lambda) {
    // the ratio is computed in float precision before taking the logarithm
    final float ratio = lambda / (tfn + lambda);
    return (float) -Math.log(ratio);
  }

  @Override
  public String toString() {
    return "LL";
  }
}

View File

@ -0,0 +1,42 @@
package org.apache.lucene.search.similarities;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * The smoothed power-law (SPL) distribution for the information-based framework
 * that is described in the original paper.
 * <p>Unlike for DFR, the natural logarithm is used, as
 * it is faster to compute and the original paper does not express any
 * preference to a specific base.</p>
 * @lucene.experimental
 */
public class DistributionSPL extends Distribution {

  /**
   * Returns {@code -ln((lambda^(tfn / (tfn + 1)) - lambda) / (1 - lambda))};
   * {@code stats} is not used.
   */
  @Override
  public final float score(BasicStats stats, float tfn, float lambda) {
    // lambda == 1 would make the denominator (1 - lambda) zero, so 0.99 is used instead
    final float effectiveLambda = (lambda == 1f) ? 0.99f : lambda;
    final float exponent = tfn / (tfn + 1);
    final double numerator = Math.pow(effectiveLambda, exponent) - effectiveLambda;
    final float denominator = 1 - effectiveLambda;
    return (float) -Math.log(numerator / denominator);
  }

  @Override
  public String toString() {
    return "SPL";
  }
}

View File

@ -0,0 +1,94 @@
package org.apache.lucene.search.similarities;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.search.Explanation;
/**
 * Provides a framework for the family of information-based models, as described
 * in St&eacute;phane Clinchant and Eric Gaussier. 2010. Information-based
 * models for ad hoc IR. In Proceeding of the 33rd international ACM SIGIR
 * conference on Research and development in information retrieval (SIGIR '10).
 * ACM, New York, NY, USA, 234-241.
 * <p>The retrieval function is of the form <em>RSV(q, d) = &sum;
 * -x<sup>q</sup><sub>w</sub> log Prob(X<sub>w</sub> &ge;
 * t<sup>d</sup><sub>w</sub> | &lambda;<sub>w</sub>)</em>, where
 * <ul>
 *   <li><em>x<sup>q</sup><sub>w</sub></em> is the query boost;</li>
 *   <li><em>X<sub>w</sub></em> is a random variable that counts the occurrences
 *   of word <em>w</em>;</li>
 *   <li><em>t<sup>d</sup><sub>w</sub></em> is the normalized term frequency;</li>
 *   <li><em>&lambda;<sub>w</sub></em> is a parameter.</li>
 * </ul>
 * </p>
 * <p>The framework described in the paper has many similarities to the DFR
 * framework (see {@link DFRSimilarity}). It is possible that the two
 * Similarities will be merged at one point.</p>
 * @lucene.experimental
 */
public class IBSimilarity extends SimilarityBase {
  /** The probabilistic distribution used to model term occurrence. */
  protected final Distribution distribution;
  /** The <em>lambda (&lambda;<sub>w</sub>)</em> parameter. */
  protected final Lambda lambda;
  /** The term frequency normalization. */
  protected final Normalization normalization;

  /**
   * Creates an information-based similarity from its three components.
   *
   * @param distribution the distribution used to model term occurrence.
   * @param lambda the strategy that computes the lambda parameter.
   * @param normalization the term frequency normalization.
   */
  public IBSimilarity(Distribution distribution,
                      Lambda lambda,
                      Normalization normalization) {
    this.distribution = distribution;
    this.lambda = lambda;
    this.normalization = normalization;
  }

  /**
   * Scores a document as the total boost multiplied by the distribution score
   * of the normalized term frequency and the lambda parameter.
   */
  @Override
  protected float score(BasicStats stats, float freq, float docLen) {
    return stats.getTotalBoost() *
        distribution.score(
            stats,
            normalization.tfn(stats, freq, docLen),
            lambda.lambda(stats));
  }

  /**
   * Adds the boost (only when it differs from 1), the normalization, the
   * lambda and the distribution explanations to {@code expl}, in that order.
   */
  @Override
  protected void explain(
      Explanation expl, BasicStats stats, int doc, float freq, float docLen) {
    if (stats.getTotalBoost() != 1.0f) {
      expl.addDetail(new Explanation(stats.getTotalBoost(), "boost"));
    }
    expl.addDetail(normalization.explain(stats, freq, docLen));
    expl.addDetail(lambda.explain(stats));
    // Feed the distribution with the very same inputs score() uses. The value
    // of a sub-explanation is not guaranteed to be the computed quantity:
    // Normalization.NoNormalization.explain() reports the constant 1, not the
    // term frequency, which would make the explanation disagree with the score.
    expl.addDetail(distribution.explain(
        stats,
        normalization.tfn(stats, freq, docLen),
        lambda.lambda(stats)));
  }

  /**
   * The name of IB methods follow the pattern
   * {@code IB <distribution>-<lambda><normalization>}. The name of the
   * distribution is the same as in the original paper; for the names of lambda
   * parameters, refer to the javadoc of the {@link Lambda} classes.
   */
  @Override
  public String toString() {
    return "IB " + distribution.toString() + "-" + lambda.toString()
        + normalization.toString();
  }
}

View File

@ -0,0 +1,97 @@
package org.apache.lucene.search.similarities;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.search.Explanation;
/**
 * Bayesian smoothing using Dirichlet priors. From Chengxiang Zhai and John
 * Lafferty. 2001. A study of smoothing methods for language models applied to
 * Ad Hoc information retrieval. In Proceedings of the 24th annual international
 * ACM SIGIR conference on Research and development in information retrieval
 * (SIGIR '01). ACM, New York, NY, USA, 334-342.
 * <p>
 * The formula as defined in the paper assigns a negative score to documents
 * that contain the term, but with fewer occurrences than predicted by the
 * collection language model. The Lucene implementation returns {@code 0} for
 * such documents.
 * </p>
 *
 * @lucene.experimental
 */
public class LMDirichletSimilarity extends LMSimilarity {
  /** The &mu; parameter. */
  private final float mu;

  /**
   * @param collectionModel the collection language model to use.
   * @param mu the &mu; parameter.
   */
  public LMDirichletSimilarity(CollectionModel collectionModel, float mu) {
    super(collectionModel);
    this.mu = mu;
  }

  /** @param mu the &mu; parameter. */
  public LMDirichletSimilarity(float mu) {
    this.mu = mu;
  }

  /** Instantiates the similarity with the default &mu; value of 2000. */
  public LMDirichletSimilarity(CollectionModel collectionModel) {
    this(collectionModel, 2000);
  }

  /** Instantiates the similarity with the default &mu; value of 2000. */
  public LMDirichletSimilarity() {
    this(2000);
  }

  /**
   * Computes
   * {@code boost * (ln(1 + freq / (mu * p(w|C))) + ln(mu / (docLen + mu)))},
   * clamped so that it is never negative.
   */
  @Override
  protected float score(BasicStats stats, float freq, float docLen) {
    float score = stats.getTotalBoost() * (float)(Math.log(1 + freq /
        (mu * ((LMStats)stats).getCollectionProbability())) +
        Math.log(mu / (docLen + mu)));
    // Negative scores are replaced by 0, see the class documentation.
    return score > 0.0f ? score : 0.0f;
  }

  /**
   * Adds the boost (only when it differs from 1), &mu;, the term weight and
   * the document norm to {@code expl}, then delegates to the superclass.
   */
  @Override
  protected void explain(Explanation expl, BasicStats stats, int doc,
      float freq, float docLen) {
    if (stats.getTotalBoost() != 1.0f) {
      expl.addDetail(new Explanation(stats.getTotalBoost(), "boost"));
    }

    expl.addDetail(new Explanation(mu, "mu"));
    Explanation weightExpl = new Explanation();
    weightExpl.setValue((float)Math.log(1 + freq /
        (mu * ((LMStats)stats).getCollectionProbability())));
    weightExpl.setDescription("term weight");
    expl.addDetail(weightExpl);
    expl.addDetail(new Explanation(
        (float)Math.log(mu / (docLen + mu)), "document norm"));
    super.explain(expl, stats, doc, freq, docLen);
  }

  /** Returns the &mu; parameter. */
  public float getMu() {
    return mu;
  }

  /**
   * Returns {@code Dirichlet(<mu>)}. The value is formatted with
   * {@link java.util.Locale#ROOT} so that the name does not depend on the
   * default locale of the JVM (e.g. no decimal comma).
   */
  @Override
  public String getName() {
    return String.format(java.util.Locale.ROOT, "Dirichlet(%f)", getMu());
  }
}

View File

@ -0,0 +1,77 @@
package org.apache.lucene.search.similarities;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.search.Explanation;
/**
 * Language model based on the Jelinek-Mercer smoothing method. From Chengxiang
 * Zhai and John Lafferty. 2001. A study of smoothing methods for language
 * models applied to Ad Hoc information retrieval. In Proceedings of the 24th
 * annual international ACM SIGIR conference on Research and development in
 * information retrieval (SIGIR '01). ACM, New York, NY, USA, 334-342.
 * <p>The model has a single parameter, &lambda;. According to said paper, the
 * optimal value depends on both the collection and the query. The optimal value
 * is around {@code 0.1} for title queries and {@code 0.7} for long queries.</p>
 *
 * @lucene.experimental
 */
public class LMJelinekMercerSimilarity extends LMSimilarity {
  /** The &lambda; parameter. */
  private final float lambda;

  /**
   * @param collectionModel the collection language model to use.
   * @param lambda the &lambda; parameter.
   */
  public LMJelinekMercerSimilarity(
      CollectionModel collectionModel, float lambda) {
    super(collectionModel);
    this.lambda = lambda;
  }

  /** @param lambda the &lambda; parameter. */
  public LMJelinekMercerSimilarity(float lambda) {
    this.lambda = lambda;
  }

  /**
   * Computes
   * {@code boost * ln(1 + ((1 - lambda) * freq / docLen) / (lambda * p(w|C)))}.
   */
  @Override
  protected float score(BasicStats stats, float freq, float docLen) {
    return stats.getTotalBoost() *
        (float)Math.log(1 +
            ((1 - lambda) * freq / docLen) /
            (lambda * ((LMStats)stats).getCollectionProbability()));
  }

  /**
   * Adds the boost (only when it differs from 1) and &lambda; to {@code expl},
   * then delegates to the superclass.
   */
  @Override
  protected void explain(Explanation expl, BasicStats stats, int doc,
      float freq, float docLen) {
    if (stats.getTotalBoost() != 1.0f) {
      expl.addDetail(new Explanation(stats.getTotalBoost(), "boost"));
    }
    expl.addDetail(new Explanation(lambda, "lambda"));
    super.explain(expl, stats, doc, freq, docLen);
  }

  /** Returns the &lambda; parameter. */
  public float getLambda() {
    return lambda;
  }

  /**
   * Returns {@code Jelinek-Mercer(<lambda>)}. The value is formatted with
   * {@link java.util.Locale#ROOT} so that the name does not depend on the
   * default locale of the JVM (e.g. no decimal comma).
   */
  @Override
  public String getName() {
    return String.format(java.util.Locale.ROOT, "Jelinek-Mercer(%f)", getLambda());
  }
}

View File

@ -0,0 +1,155 @@
package org.apache.lucene.search.similarities;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.util.TermContext;
/**
 * Abstract superclass for language modeling Similarities. The following inner
 * types are introduced:
 * <ul>
 *   <li>{@link LMStats}, which defines a new statistic, the probability that
 *   the collection language model generates the current term;</li>
 *   <li>{@link CollectionModel}, which is a strategy interface for object that
 *   compute the collection language model {@code p(w|C)};</li>
 *   <li>{@link DefaultCollectionModel}, an implementation of the former, that
 *   computes the term probability as the number of occurrences of the term in the
 *   collection, divided by the total number of tokens.</li>
 * </ul>
 *
 * @lucene.experimental
 */
public abstract class LMSimilarity extends SimilarityBase {
  /** The collection model. */
  protected final CollectionModel collectionModel;

  /** Creates a new instance with the specified collection language model. */
  public LMSimilarity(CollectionModel collectionModel) {
    this.collectionModel = collectionModel;
  }

  /** Creates a new instance with the default collection language model. */
  public LMSimilarity() {
    this(new DefaultCollectionModel());
  }

  /** Creates the {@link LMStats} object that carries the collection probability. */
  @Override
  protected BasicStats newStats(float queryBoost) {
    return new LMStats(queryBoost);
  }

  /**
   * Fills the usual statistics through the superclass and then stores the
   * collection probability of the current term, as computed by the
   * {@link #collectionModel}, in the {@link LMStats} instance.
   */
  @Override
  protected void fillBasicStats(BasicStats stats, IndexSearcher searcher, String fieldName, TermContext termContext) throws IOException {
    super.fillBasicStats(stats, searcher, fieldName, termContext);
    ((LMStats) stats).setCollectionProbability(
        collectionModel.computeProbability(stats));
  }

  /**
   * Adds the collection probability, as computed by the
   * {@link #collectionModel}, to the explanation.
   */
  @Override
  protected void explain(Explanation expl, BasicStats stats, int doc,
      float freq, float docLen) {
    final float probability = collectionModel.computeProbability(stats);
    expl.addDetail(new Explanation(probability, "collection probability"));
  }

  /**
   * Returns the name of the LM method. The values of the parameters should be
   * included as well.
   * <p>Used in {@link #toString()}.</p>
   */
  public abstract String getName();

  /**
   * Returns the name of the LM method. If a custom collection model strategy is
   * used, its name is included as well.
   * @see #getName()
   * @see CollectionModel#getName()
   * @see DefaultCollectionModel
   */
  @Override
  public String toString() {
    final String modelName = collectionModel.getName();
    if (modelName == null) {
      // The default collection model is anonymous: only the method is named.
      return String.format("LM %s", getName());
    }
    return String.format("LM %s - %s", getName(), modelName);
  }

  /** Stores the collection distribution of the current term. */
  public static class LMStats extends BasicStats {
    /** The probability that the current term is generated by the collection. */
    private float collectionProbability;

    /** @param queryBoost the query boost, handed over to {@link BasicStats}. */
    public LMStats(float queryBoost) {
      super(queryBoost);
    }

    /**
     * Returns the probability that the current term is generated by the
     * collection.
     */
    public final float getCollectionProbability() {
      return this.collectionProbability;
    }

    /**
     * Sets the probability that the current term is generated by the
     * collection.
     */
    public final void setCollectionProbability(float collectionProbability) {
      this.collectionProbability = collectionProbability;
    }
  }

  /** A strategy for computing the collection language model. */
  public static interface CollectionModel {
    /**
     * Computes the probability {@code p(w|C)} according to the language model
     * strategy for the current term.
     */
    float computeProbability(BasicStats stats);

    /** The name of the collection model strategy. */
    String getName();
  }

  /**
   * Models {@code p(w|C)} as the number of occurrences of the term in the
   * collection, divided by the total number of tokens {@code + 1}.
   */
  public static class DefaultCollectionModel implements CollectionModel {
    @Override
    public float computeProbability(BasicStats stats) {
      final float occurrences = (float) stats.getTotalTermFreq();
      return occurrences / (stats.getNumberOfFieldTokens() + 1);
    }

    /** Returns {@code null}: the default model has no name of its own. */
    @Override
    public String getName() {
      return null;
    }
  }
}

View File

@ -0,0 +1,42 @@
package org.apache.lucene.search.similarities;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.search.Explanation;
/**
* The <em>lambda (&lambda;<sub>w</sub>)</em> parameter in information-based
* models.
* <p>Concrete subclasses compute the value from the statistics passed in as a
* {@link BasicStats} instance, and provide a matching explanation and a short
* code used when naming the model.</p>
* @see IBSimilarity
* @lucene.experimental
*/
public abstract class Lambda {
/**
* Computes the lambda parameter.
* @param stats the statistics the value is computed from.
* @return the lambda value.
*/
public abstract float lambda(BasicStats stats);
/**
* Explains the lambda parameter.
* @param stats the statistics the value is computed from.
* @return an explanation of the lambda value.
*/
public abstract Explanation explain(BasicStats stats);
/**
* Subclasses must override this method to return the code of the lambda
* formula. Since the original paper is not very clear on this matter, and
* also uses the DFR naming scheme incorrectly, the codes here were chosen
* arbitrarily.
*/
@Override
public abstract String toString();
}

View File

@ -0,0 +1,48 @@
package org.apache.lucene.search.similarities;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.search.Explanation;
/**
 * Computes lambda as {@code docFreq / numberOfDocuments}.
 * @lucene.experimental
 */
public class LambdaDF extends Lambda {
  /** Returns the document frequency divided by the number of documents. */
  @Override
  public final float lambda(BasicStats stats) {
    final float docFreq = (float) stats.getDocFreq();
    return docFreq / stats.getNumberOfDocuments();
  }

  /**
   * Explains the lambda value, listing the document frequency and the number
   * of documents it was computed from.
   */
  @Override
  public final Explanation explain(BasicStats stats) {
    final Explanation lambdaExpl = new Explanation(
        lambda(stats), getClass().getSimpleName() + ", computed from: ");
    lambdaExpl.addDetail(new Explanation(stats.getDocFreq(), "docFreq"));
    lambdaExpl.addDetail(
        new Explanation(stats.getNumberOfDocuments(), "numberOfDocuments"));
    return lambdaExpl;
  }

  /** Returns the code of this lambda formula, {@code "D"}. */
  @Override
  public String toString() {
    return "D";
  }
}

View File

@ -0,0 +1,48 @@
package org.apache.lucene.search.similarities;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.search.Explanation;
/**
 * Computes lambda as {@code totalTermFreq / numberOfDocuments}.
 * @lucene.experimental
 */
public class LambdaTTF extends Lambda {
  /** Returns the total term frequency divided by the number of documents. */
  @Override
  public final float lambda(BasicStats stats) {
    final float totalTermFreq = (float) stats.getTotalTermFreq();
    return totalTermFreq / stats.getNumberOfDocuments();
  }

  /**
   * Explains the lambda value, listing the total term frequency and the number
   * of documents it was computed from.
   */
  @Override
  public final Explanation explain(BasicStats stats) {
    final Explanation lambdaExpl = new Explanation(
        lambda(stats), getClass().getSimpleName() + ", computed from: ");
    lambdaExpl.addDetail(
        new Explanation(stats.getTotalTermFreq(), "totalTermFreq"));
    lambdaExpl.addDetail(
        new Explanation(stats.getNumberOfDocuments(), "numberOfDocuments"));
    return lambdaExpl;
  }

  /** Returns the code of this lambda formula, {@code "L"}. */
  @Override
  public String toString() {
    return "L";
  }
}

View File

@ -0,0 +1,159 @@
package org.apache.lucene.search.similarities;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.TermContext;
/**
 * Implements the CombSUM method for combining evidence from multiple
 * similarity values described in: Joseph A. Shaw, Edward A. Fox.
 * In Text REtrieval Conference (1993), pp. 243-252
 * <p>Document scores are the sum of the scores of the wrapped similarities.
 * Norm computation, slop factor and payload factor are delegated to the first
 * wrapped similarity only.</p>
 * @lucene.experimental
 */
public class MultiSimilarity extends Similarity {
  /** The wrapped similarities whose scores are summed. */
  protected final Similarity[] sims;

  /** @param sims the similarities to combine. */
  public MultiSimilarity(Similarity[] sims) {
    this.sims = sims;
  }

  /** Delegates to the first wrapped similarity. */
  @Override
  public byte computeNorm(FieldInvertState state) {
    final Similarity first = sims[0];
    return first.computeNorm(state);
  }

  /** Collects the stats of every wrapped similarity into a {@link MultiStats}. */
  @Override
  public Stats computeStats(IndexSearcher searcher, String fieldName, float queryBoost, TermContext... termContexts) throws IOException {
    final Stats[] perSimilarity = new Stats[sims.length];
    for (int idx = 0; idx < perSimilarity.length; idx++) {
      perSimilarity[idx] =
          sims[idx].computeStats(searcher, fieldName, queryBoost, termContexts);
    }
    return new MultiStats(perSimilarity);
  }

  /**
   * Creates one exact scorer per wrapped similarity, each fed with its own
   * entry of the {@link MultiStats}, and wraps them in a summing scorer.
   */
  @Override
  public ExactDocScorer exactDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException {
    final ExactDocScorer[] delegates = new ExactDocScorer[sims.length];
    for (int idx = 0; idx < delegates.length; idx++) {
      final Stats own = ((MultiStats) stats).subStats[idx];
      delegates[idx] = sims[idx].exactDocScorer(own, fieldName, context);
    }
    return new MultiExactDocScorer(delegates);
  }

  /**
   * Creates one sloppy scorer per wrapped similarity, each fed with its own
   * entry of the {@link MultiStats}, and wraps them in a summing scorer.
   */
  @Override
  public SloppyDocScorer sloppyDocScorer(Stats stats, String fieldName, AtomicReaderContext context) throws IOException {
    final SloppyDocScorer[] delegates = new SloppyDocScorer[sims.length];
    for (int idx = 0; idx < delegates.length; idx++) {
      final Stats own = ((MultiStats) stats).subStats[idx];
      delegates[idx] = sims[idx].sloppyDocScorer(own, fieldName, context);
    }
    return new MultiSloppyDocScorer(delegates);
  }

  /** Exact scorer that sums the scores of its sub-scorers. */
  public static class MultiExactDocScorer extends ExactDocScorer {
    private final ExactDocScorer[] subScorers;

    MultiExactDocScorer(ExactDocScorer[] subScorers) {
      this.subScorers = subScorers;
    }

    /** Returns the sum of the sub-scorer scores for the document. */
    @Override
    public float score(int doc, int freq) {
      float total = 0.0f;
      for (int idx = 0; idx < subScorers.length; idx++) {
        total += subScorers[idx].score(doc, freq);
      }
      return total;
    }

    /** Explains the score as the sum of the sub-scorer explanations. */
    @Override
    public Explanation explain(int doc, Explanation freq) {
      final int intFreq = (int) freq.getValue();
      final Explanation combined = new Explanation(score(doc, intFreq), "sum of:");
      for (int idx = 0; idx < subScorers.length; idx++) {
        combined.addDetail(subScorers[idx].explain(doc, freq));
      }
      return combined;
    }
  }

  /** Sloppy scorer that sums the scores of its sub-scorers. */
  public static class MultiSloppyDocScorer extends SloppyDocScorer {
    private final SloppyDocScorer[] subScorers;

    MultiSloppyDocScorer(SloppyDocScorer[] subScorers) {
      this.subScorers = subScorers;
    }

    /** Returns the sum of the sub-scorer scores for the document. */
    @Override
    public float score(int doc, float freq) {
      float total = 0.0f;
      for (int idx = 0; idx < subScorers.length; idx++) {
        total += subScorers[idx].score(doc, freq);
      }
      return total;
    }

    /** Explains the score as the sum of the sub-scorer explanations. */
    @Override
    public Explanation explain(int doc, Explanation freq) {
      final Explanation combined =
          new Explanation(score(doc, freq.getValue()), "sum of:");
      for (int idx = 0; idx < subScorers.length; idx++) {
        combined.addDetail(subScorers[idx].explain(doc, freq));
      }
      return combined;
    }

    /** Delegates to the first sub-scorer. */
    @Override
    public float computeSlopFactor(int distance) {
      final SloppyDocScorer first = subScorers[0];
      return first.computeSlopFactor(distance);
    }

    /** Delegates to the first sub-scorer. */
    @Override
    public float computePayloadFactor(int doc, int start, int end, BytesRef payload) {
      final SloppyDocScorer first = subScorers[0];
      return first.computePayloadFactor(doc, start, end, payload);
    }
  }

  /** Holds the stats of every wrapped similarity. */
  public static class MultiStats extends Stats {
    final Stats[] subStats;

    MultiStats(Stats[] subStats) {
      this.subStats = subStats;
    }

    /** Returns the average of the normalization values of the sub-stats. */
    @Override
    public float getValueForNormalization() {
      float total = 0.0f;
      for (int idx = 0; idx < subStats.length; idx++) {
        total += subStats[idx].getValueForNormalization();
      }
      return total / subStats.length;
    }

    /** Forwards the normalization to every sub-stats object. */
    @Override
    public void normalize(float queryNorm, float topLevelBoost) {
      for (int idx = 0; idx < subStats.length; idx++) {
        subStats[idx].normalize(queryNorm, topLevelBoost);
      }
    }
  }
}

View File

@ -0,0 +1,75 @@
package org.apache.lucene.search.similarities;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.lucene.search.Explanation;
/**
 * This class acts as the base class for the implementations of the term
 * frequency normalization methods in the DFR framework.
 *
 * @see DFRSimilarity
 * @lucene.experimental
 */
public abstract class Normalization {
  /** Returns the normalized term frequency.
   * @param len the field length. */
  public abstract float tfn(BasicStats stats, float tf, float len);

  /**
   * Subclasses must override this method to return the code of the
   * normalization formula. Refer to the original paper for the list.
   */
  @Override
  public abstract String toString();

  /** Returns an explanation for the normalized term frequency.
   * <p>The default normalization methods use the field length of the document
   * and the average field length to compute the normalized term frequency.
   * This method provides a generic explanation for such methods.
   * Subclasses that use other statistics must override this method.</p>
   */
  public Explanation explain(BasicStats stats, float tf, float len) {
    final Explanation tfnExpl = new Explanation(
        tfn(stats, tf, len), getClass().getSimpleName() + ", computed from: ");
    tfnExpl.addDetail(new Explanation(tf, "tf"));
    tfnExpl.addDetail(
        new Explanation(stats.getAvgFieldLength(), "avgFieldLength"));
    tfnExpl.addDetail(new Explanation(len, "len"));
    return tfnExpl;
  }

  /** Implementation used when there is no normalization. */
  public static final class NoNormalization extends Normalization {
    /** Returns the term frequency unchanged. */
    @Override
    public final float tfn(BasicStats stats, float tf, float len) {
      return tf;
    }

    /** Returns a fixed explanation with the value 1. */
    @Override
    public final Explanation explain(BasicStats stats, float tf, float len) {
      return new Explanation(1, "no normalization");
    }

    /** Returns the empty string, as there is no normalization code to show. */
    @Override
    public String toString() {
      return "";
    }
  }
}

View File

@ -0,0 +1,34 @@
package org.apache.lucene.search.similarities;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Normalization model that assumes a uniform distribution of the term frequency.
 * @lucene.experimental
 */
public class NormalizationH1 extends Normalization {
  /** Returns {@code tf * avgFieldLength / len}. */
  @Override
  public final float tfn(BasicStats stats, float tf, float len) {
    final float scaledTf = tf * stats.getAvgFieldLength();
    return scaledTf / len;
  }

  /** Returns the code of this normalization, {@code "1"}. */
  @Override
  public String toString() {
    return "1";
  }
}

View File

@ -0,0 +1,37 @@
package org.apache.lucene.search.similarities;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import static org.apache.lucene.search.similarities.SimilarityBase.log2;
/**
 * Normalization model in which the term frequency is inversely related to the
 * length.
 * @lucene.experimental
 */
public class NormalizationH2 extends Normalization {
  /** Returns {@code tf * log2(1 + avgFieldLength / len)}. */
  @Override
  public final float tfn(BasicStats stats, float tf, float len) {
    final double lengthFactor = log2(1 + stats.getAvgFieldLength() / len);
    return (float) (tf * lengthFactor);
  }

  /** Returns the code of this normalization, {@code "2"}. */
  @Override
  public String toString() {
    return "2";
  }
}

View File

@ -0,0 +1,44 @@
package org.apache.lucene.search.similarities;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Dirichlet Priors normalization
 * @lucene.experimental
 */
public class NormalizationH3 extends Normalization {
  /** The &mu; parameter of the normalization. */
  private final float mu;

  /** Creates the normalization with a &mu; of 800. */
  public NormalizationH3() {
    this(800F);
  }

  /** @param mu the &mu; parameter. */
  public NormalizationH3(float mu) {
    this.mu = mu;
  }

  /**
   * Returns
   * {@code (tf + mu * (totalTermFreq / numberOfFieldTokens)) / (len + mu) * mu}.
   */
  @Override
  public float tfn(BasicStats stats, float tf, float len) {
    final float collectionRatio =
        stats.getTotalTermFreq() / (float) stats.getNumberOfFieldTokens();
    final float smoothedTf = tf + mu * collectionRatio;
    return smoothedTf / (len + mu) * mu;
  }

  /** Returns the code of this normalization, including the &mu; value. */
  @Override
  public String toString() {
    return "3(" + mu + ")";
  }
}

View File

@ -0,0 +1,44 @@
package org.apache.lucene.search.similarities;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Pareto-Zipf Normalization
 * @lucene.experimental
 */
public class NormalizationZ extends Normalization {
  /** The exponent applied to the {@code avgFieldLength / len} ratio. */
  final float z;

  /** Creates the normalization with a {@code z} of 0.30. */
  public NormalizationZ() {
    this(0.30F);
  }

  /** @param z the exponent applied to the {@code avgFieldLength / len} ratio. */
  public NormalizationZ(float z) {
    this.z = z;
  }

  /** Returns {@code tf * (avgFieldLength / len)^z}. */
  @Override
  public float tfn(BasicStats stats, float tf, float len) {
    // Read the average field length through the accessor, like the other
    // normalizations do, instead of reaching into the field of BasicStats.
    return (float)(tf * Math.pow(stats.getAvgFieldLength() / len, z));
  }

  /** Returns the code of this normalization, including the {@code z} value. */
  @Override
  public String toString() {
    return "Z(" + z + ")";
  }
}

View File

@ -1,4 +1,4 @@
package org.apache.lucene.search;
package org.apache.lucene.search.similarities;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
@ -25,6 +25,12 @@ import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexReader; // javadoc
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.Terms; // javadoc
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.spans.SpanQuery; // javadoc
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.SmallFloat; // javadoc
@ -140,7 +146,7 @@ public abstract class Similarity {
* <p>
* Term frequencies are integers (the term or phrase's tf)
*/
public abstract class ExactDocScorer {
public static abstract class ExactDocScorer {
/**
* Score a single document
* @param doc document id
@ -169,7 +175,7 @@ public abstract class Similarity {
* <p>
* Term frequencies are floating point values.
*/
public abstract class SloppyDocScorer {
public static abstract class SloppyDocScorer {
/**
* Score a single document
* @param doc document id

View File

@ -0,0 +1,345 @@
package org.apache.lucene.search.similarities;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.SmallFloat;
import org.apache.lucene.util.TermContext;
/**
 * A subclass of {@code Similarity} that provides a simplified API for its
 * descendants. Subclasses are only required to implement the {@link #score}
 * and {@link #toString()} methods. Implementing
 * {@link #explain(Explanation, BasicStats, int, float, float)} is optional,
 * inasmuch as SimilarityBase already provides a basic explanation of the score
 * and the term frequency. However, implementers of a subclass are encouraged to
 * include as much detail about the scoring method as possible.
 * <p>
 * Note: multi-word queries such as phrase queries are scored in a different way
 * than Lucene's default ranking algorithm: whereas it "fakes" an IDF value for
 * the phrase as a whole (since it does not know it), this class instead scores
 * phrases as a summation of the individual term scores.
 * @lucene.experimental
 */
public abstract class SimilarityBase extends Similarity {
  /** For {@link #log2(double)}. Precomputed for efficiency reasons. */
  private static final double LOG_2 = Math.log(2);

  /** @see #setDiscountOverlaps */
  protected boolean discountOverlaps = true;

  /** Determines whether overlap tokens (Tokens with
   *  0 position increment) are ignored when computing
   *  norm.  By default this is true, meaning overlap
   *  tokens do not count when computing norms.
   *
   *  @lucene.experimental
   *
   *  @see #computeNorm
   */
  public void setDiscountOverlaps(boolean v) {
    discountOverlaps = v;
  }

  /** @see #setDiscountOverlaps */
  public boolean getDiscountOverlaps() {
    return discountOverlaps;
  }

  /**
   * Collects the statistics for every term of the query: one
   * {@code BasicStats} per term context, created by {@link #newStats(float)}
   * and populated by {@link #fillBasicStats}. A single term yields its
   * {@code BasicStats} directly; several terms are wrapped in a
   * {@code MultiSimilarity.MultiStats}.
   */
  @Override
  public final Stats computeStats(IndexSearcher searcher, String fieldName,
      float queryBoost, TermContext... termContexts) throws IOException {
    BasicStats[] stats = new BasicStats[termContexts.length];
    for (int i = 0; i < termContexts.length; i++) {
      stats[i] = newStats(queryBoost);
      fillBasicStats(stats[i], searcher, fieldName, termContexts[i]);
    }
    return stats.length == 1 ? stats[0] : new MultiSimilarity.MultiStats(stats);
  }

  /** Factory method to return a custom stats object */
  protected BasicStats newStats(float queryBoost) {
    return new BasicStats(queryBoost);
  }

  /** Fills all member fields defined in {@code BasicStats} in {@code stats}.
   *  Subclasses can override this method to fill additional stats. */
  protected void fillBasicStats(BasicStats stats, IndexSearcher searcher,
      String fieldName, TermContext termContext) throws IOException {
    IndexReader reader = searcher.getIndexReader();
    int numberOfDocuments = reader.maxDoc();

    int docFreq = termContext.docFreq();
    long totalTermFreq = termContext.totalTermFreq();

    // codec does not supply totalTermFreq: substitute docFreq
    if (totalTermFreq == -1) {
      totalTermFreq = docFreq;
    }

    final long numberOfFieldTokens;
    final float avgFieldLength;

    Terms terms = MultiFields.getTerms(reader, fieldName);
    if (terms == null) {
      // field does not exist;
      numberOfFieldTokens = 0;
      avgFieldLength = 1;
    } else {
      long sumTotalTermFreq = terms.getSumTotalTermFreq();

      // We have to provide something if the codec doesn't supply these
      // measures, or if someone omitted frequencies for the field...
      // negative values cause NaN/Inf for some scorers.
      if (sumTotalTermFreq == -1) {
        numberOfFieldTokens = docFreq;
        avgFieldLength = 1;
      } else {
        numberOfFieldTokens = sumTotalTermFreq;
        avgFieldLength = (float)numberOfFieldTokens / numberOfDocuments;
      }
    }

    // TODO: add sumDocFreq for field (numberOfFieldPostings)
    stats.setNumberOfDocuments(numberOfDocuments);
    stats.setNumberOfFieldTokens(numberOfFieldTokens);
    stats.setAvgFieldLength(avgFieldLength);
    stats.setDocFreq(docFreq);
    stats.setTotalTermFreq(totalTermFreq);
  }

  /**
   * Scores the document {@code doc}.
   * <p>Subclasses must apply their scoring formula in this method.</p>
   * @param stats the corpus level statistics.
   * @param freq the term frequency.
   * @param docLen the document length.
   * @return the score.
   */
  protected abstract float score(BasicStats stats, float freq, float docLen);

  /**
   * Subclasses should implement this method to explain the score. {@code expl}
   * already contains the score, the name of the class and the doc id, as well
   * as the term frequency and its explanation; subclasses can add additional
   * clauses to explain details of their scoring formulae.
   * <p>The default implementation does nothing.</p>
   *
   * @param expl the explanation to extend with details.
   * @param stats the corpus level statistics.
   * @param doc the document id.
   * @param freq the term frequency.
   * @param docLen the document length.
   */
  protected void explain(
      Explanation expl, BasicStats stats, int doc, float freq, float docLen) {}

  /**
   * Explains the score. The implementation here provides a basic explanation
   * in the format <em>score(name-of-similarity, doc=doc-id,
   * freq=term-frequency), computed from:</em>, and
   * attaches the score (computed via the {@link #score(BasicStats, float, float)}
   * method) and the explanation for the term frequency. Subclasses content with
   * this format may add additional details in
   * {@link #explain(Explanation, BasicStats, int, float, float)}.
   *
   * @param stats the corpus level statistics.
   * @param doc the document id.
   * @param freq the term frequency and its explanation.
   * @param docLen the document length.
   * @return the explanation.
   */
  protected Explanation explain(
      BasicStats stats, int doc, Explanation freq, float docLen) {
    Explanation result = new Explanation();
    result.setValue(score(stats, freq.getValue(), docLen));
    result.setDescription("score(" + getClass().getSimpleName() +
        ", doc=" + doc + ", freq=" + freq.getValue() +"), computed from:");
    result.addDetail(freq);

    explain(result, stats, doc, freq.getValue(), docLen);

    return result;
  }

  /**
   * Returns a scorer for integer term frequencies. For
   * {@code MultiSimilarity.MultiStats} one scorer per sub-stats is created
   * and the scores are summed up by a
   * {@code MultiSimilarity.MultiExactDocScorer}.
   */
  @Override
  public ExactDocScorer exactDocScorer(Stats stats, String fieldName,
      AtomicReaderContext context) throws IOException {
    byte[] norms = context.reader.norms(fieldName);

    if (stats instanceof MultiSimilarity.MultiStats) {
      // a multi term query (e.g. phrase). return the summation,
      // scoring almost as if it were boolean query
      Stats[] subStats = ((MultiSimilarity.MultiStats) stats).subStats;
      ExactDocScorer[] subScorers = new ExactDocScorer[subStats.length];
      for (int i = 0; i < subScorers.length; i++) {
        subScorers[i] = new BasicExactDocScorer((BasicStats)subStats[i], norms);
      }
      return new MultiSimilarity.MultiExactDocScorer(subScorers);
    } else {
      return new BasicExactDocScorer((BasicStats) stats, norms);
    }
  }

  /**
   * Returns a scorer for floating point term frequencies. For
   * {@code MultiSimilarity.MultiStats} one scorer per sub-stats is created
   * and the scores are summed up by a
   * {@code MultiSimilarity.MultiSloppyDocScorer}.
   */
  @Override
  public SloppyDocScorer sloppyDocScorer(Stats stats, String fieldName,
      AtomicReaderContext context) throws IOException {
    byte[] norms = context.reader.norms(fieldName);

    if (stats instanceof MultiSimilarity.MultiStats) {
      // a multi term query (e.g. phrase). return the summation,
      // scoring almost as if it were boolean query
      Stats[] subStats = ((MultiSimilarity.MultiStats) stats).subStats;
      SloppyDocScorer[] subScorers = new SloppyDocScorer[subStats.length];
      for (int i = 0; i < subScorers.length; i++) {
        subScorers[i] = new BasicSloppyDocScorer((BasicStats)subStats[i], norms);
      }
      return new MultiSimilarity.MultiSloppyDocScorer(subScorers);
    } else {
      return new BasicSloppyDocScorer((BasicStats) stats, norms);
    }
  }

  /**
   * Subclasses must override this method to return the name of the Similarity
   * and preferably the values of parameters (if any) as well.
   */
  @Override
  public abstract String toString();

  // ------------------------------ Norm handling ------------------------------

  /**
   * Norm -> document length map. A norm byte decodes to the float
   * {@code f = boost / sqrt(length)} (see {@link #encodeNormValue(float,float)});
   * the table stores {@code 1 / (f * f)}, i.e. the length for a boost of 1.
   */
  private static final float[] NORM_TABLE = new float[256];

  static {
    for (int i = 0; i < 256; i++) {
      float floatNorm = SmallFloat.byte315ToFloat((byte)i);
      NORM_TABLE[i] = 1.0f / (floatNorm * floatNorm);
    }
  }

  /**
   * Encodes the document length in the same way as {@link TFIDFSimilarity}.
   * <p>The length is the number of tokens of the field, minus the overlap
   * tokens if {@link #getDiscountOverlaps()} is true. The field boost is
   * applied exactly once, by {@link #encodeNormValue(float,float)}.</p>
   */
  @Override
  public byte computeNorm(FieldInvertState state) {
    final float numTerms;
    if (discountOverlaps) {
      numTerms = state.getLength() - state.getNumOverlap();
    } else {
      // Do not divide by the boost here: encodeNormValue() already folds the
      // boost into the norm, so scaling the length as well would count the
      // boost twice and make this branch disagree with the one above.
      numTerms = state.getLength();
    }
    return encodeNormValue(state.getBoost(), numTerms);
  }

  /** Decodes a normalization factor (document length) stored in an index.
   * @see #encodeNormValue(float,float)
   */
  protected float decodeNormValue(byte norm) {
    return NORM_TABLE[norm & 0xFF];  // & 0xFF maps negative bytes to positive above 127
  }

  /** Encodes the length to a byte via SmallFloat. */
  protected byte encodeNormValue(float boost, float length) {
    return SmallFloat.floatToByte315((boost / (float) Math.sqrt(length)));
  }

  // ----------------------------- Static methods ------------------------------

  /** Returns the base two logarithm of {@code x}. */
  public static double log2(double x) {
    // Put this to a 'util' class if we need more of these.
    return Math.log(x) / LOG_2;
  }

  // --------------------------------- Classes ---------------------------------

  /** Delegates the {@link #score(int, int)} and
   * {@link #explain(int, Explanation)} methods to
   * {@link SimilarityBase#score(BasicStats, float, float)} and
   * {@link SimilarityBase#explain(BasicStats, int, Explanation, float)},
   * respectively.
   */
  private class BasicExactDocScorer extends ExactDocScorer {
    private final BasicStats stats;
    /** Per-document norm bytes of the field; null if norms are omitted. */
    private final byte[] norms;

    BasicExactDocScorer(BasicStats stats, byte[] norms) {
      this.stats = stats;
      this.norms = norms;
    }

    @Override
    public float score(int doc, int freq) {
      // We have to supply something in case norms are omitted
      return SimilarityBase.this.score(stats, freq,
          norms == null ? 1F : decodeNormValue(norms[doc]));
    }

    @Override
    public Explanation explain(int doc, Explanation freq) {
      return SimilarityBase.this.explain(stats, doc, freq,
          norms == null ? 1F : decodeNormValue(norms[doc]));
    }
  }

  /** Delegates the {@link #score(int, float)} and
   * {@link #explain(int, Explanation)} methods to
   * {@link SimilarityBase#score(BasicStats, float, float)} and
   * {@link SimilarityBase#explain(BasicStats, int, Explanation, float)},
   * respectively.
   */
  private class BasicSloppyDocScorer extends SloppyDocScorer {
    private final BasicStats stats;
    /** Per-document norm bytes of the field; null if norms are omitted. */
    private final byte[] norms;

    BasicSloppyDocScorer(BasicStats stats, byte[] norms) {
      this.stats = stats;
      this.norms = norms;
    }

    @Override
    public float score(int doc, float freq) {
      // We have to supply something in case norms are omitted
      return SimilarityBase.this.score(stats, freq,
          norms == null ? 1F : decodeNormValue(norms[doc]));
    }

    @Override
    public Explanation explain(int doc, Explanation freq) {
      return SimilarityBase.this.explain(stats, doc, freq,
          norms == null ? 1F : decodeNormValue(norms[doc]));
    }

    /** Returns {@code 1 / (distance + 1)}. */
    @Override
    public float computeSlopFactor(int distance) {
      return 1.0f / (distance + 1);
    }

    /** Payloads do not influence the score: always returns 1. */
    @Override
    public float computePayloadFactor(int doc, int start, int end, BytesRef payload) {
      return 1f;
    }
  }
}

View File

@ -1,4 +1,6 @@
package org.apache.lucene.search;
package org.apache.lucene.search.similarities;
import org.apache.lucene.search.BooleanQuery;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more

View File

@ -1,4 +1,4 @@
package org.apache.lucene.search;
package org.apache.lucene.search.similarities;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
@ -21,6 +21,10 @@ package org.apache.lucene.search;
import java.io.IOException;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.TermContext;
import org.apache.lucene.util.SmallFloat;
@ -303,13 +307,13 @@ import org.apache.lucene.util.SmallFloat;
* two term-queries with that same term and hence the computation would still be correct (although
* not very efficient).
* The default computation for <i>tf(t in d)</i> in
* {@link org.apache.lucene.search.DefaultSimilarity#tf(float) DefaultSimilarity} is:
* {@link org.apache.lucene.search.similarities.DefaultSimilarity#tf(float) DefaultSimilarity} is:
*
* <br>&nbsp;<br>
* <table cellpadding="2" cellspacing="2" border="0" align="center">
* <tr>
* <td valign="middle" align="right" rowspan="1">
* {@link org.apache.lucene.search.DefaultSimilarity#tf(float) tf(t in d)} &nbsp; = &nbsp;
* {@link org.apache.lucene.search.similarities.DefaultSimilarity#tf(float) tf(t in d)} &nbsp; = &nbsp;
* </td>
* <td valign="top" align="center" rowspan="1">
* frequency<sup><big>&frac12;</big></sup>
@ -328,13 +332,13 @@ import org.apache.lucene.util.SmallFloat;
* <i>idf(t)</i> appears for <i>t</i> in both the query and the document,
* hence it is squared in the equation.
* The default computation for <i>idf(t)</i> in
* {@link org.apache.lucene.search.DefaultSimilarity#idf(int, int) DefaultSimilarity} is:
* {@link org.apache.lucene.search.similarities.DefaultSimilarity#idf(int, int) DefaultSimilarity} is:
*
* <br>&nbsp;<br>
* <table cellpadding="2" cellspacing="2" border="0" align="center">
* <tr>
* <td valign="middle" align="right">
* {@link org.apache.lucene.search.DefaultSimilarity#idf(int, int) idf(t)}&nbsp; = &nbsp;
* {@link org.apache.lucene.search.similarities.DefaultSimilarity#idf(int, int) idf(t)}&nbsp; = &nbsp;
* </td>
* <td valign="middle" align="center">
* 1 + log <big>(</big>
@ -376,14 +380,14 @@ import org.apache.lucene.util.SmallFloat;
* This is a search time factor computed by the Similarity in effect at search time.
*
* The default computation in
* {@link org.apache.lucene.search.DefaultSimilarityProvider#queryNorm(float) DefaultSimilarityProvider}
* {@link org.apache.lucene.search.similarities.DefaultSimilarityProvider#queryNorm(float) DefaultSimilarityProvider}
* produces a <a href="http://en.wikipedia.org/wiki/Euclidean_norm#Euclidean_norm">Euclidean norm</a>:
* <br>&nbsp;<br>
* <table cellpadding="1" cellspacing="0" border="0" align="center">
* <tr>
* <td valign="middle" align="right" rowspan="1">
* queryNorm(q) &nbsp; = &nbsp;
* {@link org.apache.lucene.search.DefaultSimilarityProvider#queryNorm(float) queryNorm(sumOfSquaredWeights)}
* {@link org.apache.lucene.search.similarities.DefaultSimilarityProvider#queryNorm(float) queryNorm(sumOfSquaredWeights)}
* &nbsp; = &nbsp;
* </td>
* <td valign="middle" align="center" rowspan="1">

View File

@ -0,0 +1,174 @@
<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
</head>
<body>
This package contains the various ranking models that can be used in Lucene. The
abstract class {@link org.apache.lucene.search.similarities.Similarity} serves
as the base for ranking functions. For searching, users can employ the models
already implemented or create their own by extending one of the classes in this
package.
<h2>Table Of Contents</h2>
<p>
<ol>
<li><a href="#sims">Summary of the Ranking Methods</a></li>
<li><a href="#providers">Similarity Providers</a></li>
<li><a href="#changingSimilarity">Changing the Similarity</a></li>
</ol>
</p>
<a name="sims"></a>
<h2>Summary of the Ranking Methods</h2>
<p>{@link org.apache.lucene.search.similarities.DefaultSimilarity} is the original Lucene
scoring function. It is based on a highly optimized Vector Space Model. For more
information, see {@link org.apache.lucene.search.similarities.TFIDFSimilarity}.</p>
<p>{@link org.apache.lucene.search.similarities.BM25Similarity} is an optimized
implementation of the successful Okapi BM25 model.</p>
<p>{@link org.apache.lucene.search.similarities.SimilarityBase} provides a basic
implementation of the Similarity contract and exposes a highly simplified
interface, which makes it an ideal starting point for new ranking functions.
Lucene ships the following methods built on
{@link org.apache.lucene.search.similarities.SimilarityBase}:
<a name="framework"></a>
<ul>
<li>Amati and Rijsbergen's {@linkplain org.apache.lucene.search.similarities.DFRSimilarity DFR} framework;</li>
<li>Clinchant and Gaussier's {@linkplain org.apache.lucene.search.similarities.IBSimilarity Information-based models}
for IR;</li>
<li>The implementation of two {@linkplain org.apache.lucene.search.similarities.LMSimilarity language models} from
Zhai and Lafferty's paper.</li>
</ul>
Since {@link org.apache.lucene.search.similarities.SimilarityBase} is not
optimized to the same extent as
{@link org.apache.lucene.search.similarities.DefaultSimilarity} and
{@link org.apache.lucene.search.similarities.BM25Similarity}, a difference in
performance is to be expected when using the methods listed above. However,
optimizations can always be implemented in subclasses; see
<a href="#changingSimilarity">below</a>.</p>
<a name="providers"></a>
<h2>Similarity Providers</h2>
<p>{@link org.apache.lucene.search.similarities.SimilarityProvider}s are factories
that return Similarities per-field and compute coordination factors and normalization
values for the query.
{@link org.apache.lucene.search.similarities.DefaultSimilarityProvider} is the
default implementation used by Lucene, geared towards vector-space search: it returns
{@link org.apache.lucene.search.similarities.DefaultSimilarity} for every field,
and implements coordination-level matching and query normalization.
{@link org.apache.lucene.search.similarities.BasicSimilarityProvider} is geared towards
non-vector-space models and does not implement coordination-level matching or query
normalization. It is a convenience implementation that returns an arbitrary
{@link org.apache.lucene.search.similarities.Similarity} for every field.
You can write your own SimilarityProvider to return different Similarities for different
fields: for example you might want to use different parameter values for different fields,
or maybe even entirely different ranking algorithms.
</p>
<a name="changingSimilarity"></a>
<h2>Changing Similarity</h2>
<p>Chances are the available Similarities are sufficient for all
your searching needs.
However, in some applications it may be necessary to customize your <a
href="Similarity.html">Similarity</a> implementation. For instance, some
applications do not need to
distinguish between shorter and longer documents (see <a
href="http://www.gossamer-threads.com/lists/lucene/java-user/38967#38967">a "fair" similarity</a>).</p>
<p>To change {@link org.apache.lucene.search.similarities.Similarity}, one must do so for both indexing and
searching, and the changes must happen before
either of these actions takes place. Although in theory there is nothing stopping you from changing mid-stream, it
just isn't well-defined what is going to happen.
</p>
<p>To make this change, implement your own {@link org.apache.lucene.search.similarities.Similarity} (likely
you'll want to simply subclass an existing method, be it
{@link org.apache.lucene.search.similarities.DefaultSimilarity} or a descendant of
{@link org.apache.lucene.search.similarities.SimilarityBase}) and
{@link org.apache.lucene.search.similarities.SimilarityProvider} (or use
{@link org.apache.lucene.search.similarities.BasicSimilarityProvider}), and
then register the new class by calling
{@link org.apache.lucene.index.IndexWriterConfig#setSimilarityProvider(SimilarityProvider)}
before indexing and
{@link org.apache.lucene.search.IndexSearcher#setSimilarityProvider(SimilarityProvider)}
before searching.
</p>
<h3>Extending {@linkplain org.apache.lucene.search.similarities.SimilarityBase}</h3>
<p>
The easiest way to quickly implement a new ranking method is to extend
{@link org.apache.lucene.search.similarities.SimilarityBase}, which provides
basic implementations of the low-level details. Subclasses are only required to
implement the {@link org.apache.lucene.search.similarities.SimilarityBase#score(BasicStats, float, float)}
and {@link org.apache.lucene.search.similarities.SimilarityBase#toString()}
methods.</p>
<p>Another option is to extend one of the <a href="#framework">frameworks</a>
based on {@link org.apache.lucene.search.similarities.SimilarityBase}. These
Similarities are implemented modularly, e.g.
{@link org.apache.lucene.search.similarities.DFRSimilarity} delegates
computation of the three parts of its formula to the classes
{@link org.apache.lucene.search.similarities.BasicModel},
{@link org.apache.lucene.search.similarities.AfterEffect} and
{@link org.apache.lucene.search.similarities.Normalization}. Instead of
subclassing the Similarity, one can simply introduce a new basic model and tell
{@link org.apache.lucene.search.similarities.DFRSimilarity} to use it.</p>
<h3>Changing {@linkplain org.apache.lucene.search.similarities.DefaultSimilarity}</h3>
<p>
If you are interested in use cases for changing your similarity, see the Lucene users' mailing list at <a
href="http://www.nabble.com/Overriding-Similarity-tf2128934.html">Overriding Similarity</a>.
In summary, here are a few use cases:
<ol>
<li><p>The <code>SweetSpotSimilarity</code> in
<code>org.apache.lucene.misc</code> gives small
increases as the frequency increases a small amount
and then greater increases when you hit the "sweet spot", i.e. where
you think the frequency of terms is more significant.</p></li>
<li><p>Overriding tf &mdash; In some applications, it doesn't matter what the score of a document is as long as a
matching term occurs. In these
cases people have overridden Similarity to return 1 from the tf() method.</p></li>
<li><p>Changing Length Normalization &mdash; By overriding
{@link org.apache.lucene.search.similarities.Similarity#computeNorm(FieldInvertState state)},
it is possible to discount how the length of a field contributes
to a score. In {@link org.apache.lucene.search.similarities.DefaultSimilarity},
lengthNorm = 1 / (numTerms in field)^0.5, but if one changes this to be
1 / (numTerms in field), all fields will be treated
<a href="http://www.gossamer-threads.com/lists/lucene/java-user/38967#38967">"fairly"</a>.</p></li>
</ol>
In general, Chris Hostetter sums it up best in saying (from <a
href="http://www.gossamer-threads.com/lists/lucene/java-user/39125#39125">the Lucene users' mailing list</a>):
<blockquote>[One would override the Similarity in] ... any situation where you know more about your data than just
that
it's "text" is a situation where it *might* make sense to override your
Similarity method.</blockquote>
</p>
</body>
</html>

View File

@ -19,11 +19,9 @@ package org.apache.lucene.search.spans;
import java.io.IOException;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.TFIDFSimilarity;
import org.apache.lucene.search.Weight;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.similarities.Similarity;
/**
* Public for extension only.

View File

@ -21,7 +21,8 @@ import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.IndexReader.ReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.*;
import org.apache.lucene.search.Similarity.SloppyDocScorer;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.Similarity.SloppyDocScorer;
import org.apache.lucene.util.TermContext;
import java.io.IOException;

View File

@ -33,7 +33,7 @@ import org.apache.lucene.document.StringField;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.search.SimilarityProvider;
import org.apache.lucene.search.similarities.SimilarityProvider;
import org.apache.lucene.store.Directory;
import static org.apache.lucene.util.LuceneTestCase.TEST_VERSION_CURRENT;

View File

@ -18,6 +18,7 @@ package org.apache.lucene.search;
*/
import java.io.IOException;
import java.util.Locale;
import java.util.Set;
import java.util.TreeSet;
import java.util.Random;
@ -35,7 +36,7 @@ public class CheckHits {
* different order of operations from the actual scoring method ...
* this allows for a small amount of variation
*/
public static float EXPLAIN_SCORE_TOLERANCE_DELTA = 0.0002f;
public static float EXPLAIN_SCORE_TOLERANCE_DELTA = 0.02f;
/**
* Tests that all documents up to maxDoc which are *not* in the
@ -327,6 +328,10 @@ public class CheckHits {
if (!deep) return;
Explanation detail[] = expl.getDetails();
// TODO: can we improve this entire method? its really geared to work only with TF/IDF
if (expl.getDescription().endsWith("computed from:")) {
return; // something more complicated.
}
if (detail!=null) {
if (detail.length==1) {
// simple containment, unless its a freq of: (which lets a query explain how the freq is calculated),
@ -338,7 +343,7 @@ public class CheckHits {
// - end with one of: "product of:", "sum of:", "max of:", or
// - have "max plus <x> times others" (where <x> is float).
float x = 0;
String descr = expl.getDescription().toLowerCase();
String descr = expl.getDescription().toLowerCase(Locale.ENGLISH);
boolean productOf = descr.endsWith("product of:");
boolean sumOf = descr.endsWith("sum of:");
boolean maxOf = descr.endsWith("max of:");

View File

@ -0,0 +1,158 @@
package org.apache.lucene.search;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import org.apache.lucene.search.similarities.AfterEffect;
import org.apache.lucene.search.similarities.AfterEffectB;
import org.apache.lucene.search.similarities.AfterEffectL;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarities.BasicModel;
import org.apache.lucene.search.similarities.BasicModelBE;
import org.apache.lucene.search.similarities.BasicModelD;
import org.apache.lucene.search.similarities.BasicModelG;
import org.apache.lucene.search.similarities.BasicModelIF;
import org.apache.lucene.search.similarities.BasicModelIn;
import org.apache.lucene.search.similarities.BasicModelIne;
import org.apache.lucene.search.similarities.BasicModelP;
import org.apache.lucene.search.similarities.DFRSimilarity;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.search.similarities.DefaultSimilarityProvider;
import org.apache.lucene.search.similarities.Distribution;
import org.apache.lucene.search.similarities.DistributionLL;
import org.apache.lucene.search.similarities.DistributionSPL;
import org.apache.lucene.search.similarities.IBSimilarity;
import org.apache.lucene.search.similarities.LMDirichletSimilarity;
import org.apache.lucene.search.similarities.LMJelinekMercerSimilarity;
import org.apache.lucene.search.similarities.Lambda;
import org.apache.lucene.search.similarities.LambdaDF;
import org.apache.lucene.search.similarities.LambdaTTF;
import org.apache.lucene.search.similarities.Normalization;
import org.apache.lucene.search.similarities.NormalizationH1;
import org.apache.lucene.search.similarities.NormalizationH2;
import org.apache.lucene.search.similarities.NormalizationH3;
import org.apache.lucene.search.similarities.NormalizationZ;
import org.apache.lucene.search.similarities.Similarity;
/**
 * Similarity provider for tests: every field is assigned one of the
 * similarities in {@link #allSims}, chosen from a list shuffled with the
 * supplied {@link Random}, and coord() / queryNorm() are each either delegated
 * to {@link DefaultSimilarityProvider} or fixed to 1, also decided by that
 * {@link Random} at construction time.
 * <p>
 * The similarity chosen for a field is remembered, so repeated calls of
 * {@link #get(String)} with the same field name return the same instance.
 */
public class RandomSimilarityProvider extends DefaultSimilarityProvider {
  /** Shuffled copy of {@link #allSims} the per-field similarities are taken from. */
  final List<Similarity> knownSims;
  /** Field name -> similarity already handed out for that field. */
  Map<String,Similarity> previousMappings = new HashMap<String,Similarity>();
  /** Mixed with the hash code of the field name to pick a similarity. */
  final int perFieldSeed;
  /** Whether coord() is delegated to the superclass (otherwise it returns 1). */
  final boolean shouldCoord;
  /** Whether queryNorm() is delegated to the superclass (otherwise it returns 1). */
  final boolean shouldQueryNorm;

  /**
   * Draws the per-field seed and the coord / queryNorm switches from
   * {@code random} and shuffles the list of known similarities with it.
   */
  public RandomSimilarityProvider(Random random) {
    perFieldSeed = random.nextInt();
    shouldCoord = random.nextBoolean();
    shouldQueryNorm = random.nextBoolean();
    knownSims = new ArrayList<Similarity>(allSims);
    Collections.shuffle(knownSims, random);
  }

  @Override
  public float coord(int overlap, int maxOverlap) {
    if (shouldCoord) {
      return super.coord(overlap, maxOverlap);
    } else {
      return 1.0f;
    }
  }

  @Override
  public float queryNorm(float sumOfSquaredWeights) {
    if (shouldQueryNorm) {
      return super.queryNorm(sumOfSquaredWeights);
    } else {
      return 1.0f;
    }
  }

  /**
   * Returns the similarity for {@code field}, choosing and remembering one
   * on the first request for that field.
   */
  @Override
  public synchronized Similarity get(String field) {
    assert field != null;
    Similarity sim = previousMappings.get(field);
    if (sim == null) {
      // Take the remainder BEFORE the absolute value: Math.abs(Integer.MIN_VALUE)
      // is still negative and would produce a negative list index. For every
      // other value abs(x % n) equals abs(x) % n, so the mapping is unchanged.
      int slot = Math.abs((perFieldSeed ^ field.hashCode()) % knownSims.size());
      sim = knownSims.get(slot);
      previousMappings.put(field, sim);
    }
    return sim;
  }

  // all the similarities that we rotate through
  /** The DFR basic models to test. */
  static BasicModel[] BASIC_MODELS = {
    new BasicModelBE(), /* TODO: enable new BasicModelD(), */ new BasicModelG(),
    new BasicModelIF(), new BasicModelIn(), new BasicModelIne(),
    /* TODO: enable new BasicModelP() */
  };
  /** The DFR aftereffects to test. */
  static AfterEffect[] AFTER_EFFECTS = {
    new AfterEffectB(), new AfterEffectL(), new AfterEffect.NoAfterEffect()
  };
  /** The DFR normalizations to test. */
  static Normalization[] NORMALIZATIONS = {
    new NormalizationH1(), new NormalizationH2(),
    new NormalizationH3(), new NormalizationZ()
    // TODO: if we enable NoNormalization, we have to deal with
    // a couple tests (e.g. TestDocBoost, TestSort) that expect length normalization
    // new Normalization.NoNormalization()
  };
  /** The distributions for IB. */
  static Distribution[] DISTRIBUTIONS = {
    new DistributionLL(), new DistributionSPL()
  };
  /** Lambdas for IB. */
  static Lambda[] LAMBDAS = {
    new LambdaDF(), new LambdaTTF()
  };
  /** Every similarity a field can be assigned; filled by the static initializer below. */
  static List<Similarity> allSims;
  static {
    allSims = new ArrayList<Similarity>();
    allSims.add(new DefaultSimilarity());
    allSims.add(new BM25Similarity());
    // every combination of the DFR components
    for (BasicModel basicModel : BASIC_MODELS) {
      for (AfterEffect afterEffect : AFTER_EFFECTS) {
        for (Normalization normalization : NORMALIZATIONS) {
          allSims.add(new DFRSimilarity(basicModel, afterEffect, normalization));
        }
      }
    }
    // every combination of the IB components
    for (Distribution distribution : DISTRIBUTIONS) {
      for (Lambda lambda : LAMBDAS) {
        for (Normalization normalization : NORMALIZATIONS) {
          allSims.add(new IBSimilarity(distribution, lambda, normalization));
        }
      }
    }
    /* TODO: enable Dirichlet
    allSims.add(new LMDirichletSimilarity()); */
    allSims.add(new LMJelinekMercerSimilarity(0.1f));
    allSims.add(new LMJelinekMercerSimilarity(0.7f));
  }

  /** Reports the coord / queryNorm switches and the field mappings handed out so far. */
  @Override
  public synchronized String toString() {
    return "RandomSimilarityProvider(queryNorm=" + shouldQueryNorm + ",coord=" + shouldCoord + "): " + previousMappings.toString();
  }
}

View File

@ -52,6 +52,8 @@ import org.apache.lucene.search.FieldCache;
import org.apache.lucene.search.FieldCache.CacheEntry;
import org.apache.lucene.search.AssertingIndexSearcher;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.RandomSimilarityProvider;
import org.apache.lucene.search.similarities.SimilarityProvider;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.FlushInfo;
@ -210,6 +212,8 @@ public abstract class LuceneTestCase extends Assert {
// default codec provider
private static CodecProvider savedCodecProvider;
private static SimilarityProvider similarityProvider;
private static Locale locale;
private static Locale savedLocale;
private static TimeZone timeZone;
@ -393,6 +397,7 @@ public abstract class LuceneTestCase extends Assert {
savedTimeZone = TimeZone.getDefault();
timeZone = TEST_TIMEZONE.equals("random") ? randomTimeZone(random) : TimeZone.getTimeZone(TEST_TIMEZONE);
TimeZone.setDefault(timeZone);
similarityProvider = new RandomSimilarityProvider(random);
testsFailed = false;
}
@ -467,6 +472,7 @@ public abstract class LuceneTestCase extends Assert {
/** print some useful debugging information about the environment */
private static void printDebuggingInformation(String codecDescription) {
System.err.println("NOTE: test params are: codec=" + codecDescription +
", sim=" + similarityProvider +
", locale=" + locale +
", timezone=" + (timeZone == null ? "(null)" : timeZone.getID()));
System.err.println("NOTE: all tests run in this JVM:");
@ -922,6 +928,7 @@ public abstract class LuceneTestCase extends Assert {
/** create a new index writer config with random defaults using the specified random */
public static IndexWriterConfig newIndexWriterConfig(Random r, Version v, Analyzer a) {
IndexWriterConfig c = new IndexWriterConfig(v, a);
c.setSimilarityProvider(similarityProvider);
if (r.nextBoolean()) {
c.setMergeScheduler(new SerialMergeScheduler());
}
@ -1249,7 +1256,9 @@ public abstract class LuceneTestCase extends Assert {
if (maybeWrap && rarely()) {
r = new SlowMultiReaderWrapper(r);
}
return random.nextBoolean() ? new AssertingIndexSearcher(r) : new AssertingIndexSearcher(r.getTopReaderContext());
IndexSearcher ret = random.nextBoolean() ? new AssertingIndexSearcher(r) : new AssertingIndexSearcher(r.getTopReaderContext());
ret.setSimilarityProvider(similarityProvider);
return ret;
} else {
int threads = 0;
final ExecutorService ex = (random.nextBoolean()) ? null
@ -1258,7 +1267,7 @@ public abstract class LuceneTestCase extends Assert {
if (ex != null && VERBOSE) {
System.out.println("NOTE: newSearcher using ExecutorService with " + threads + " threads");
}
return random.nextBoolean() ?
IndexSearcher ret = random.nextBoolean() ?
new AssertingIndexSearcher(r, ex) {
@Override
public void close() throws IOException {
@ -1272,6 +1281,8 @@ public abstract class LuceneTestCase extends Assert {
shutdownExecutorService(ex);
}
};
ret.setSimilarityProvider(similarityProvider);
return ret;
}
}

View File

@ -35,13 +35,13 @@ import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.FieldCache;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.NumericRangeQuery;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.store.CompoundFileDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

View File

@ -27,11 +27,11 @@ import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;

View File

@ -25,7 +25,7 @@ import org.apache.lucene.document.Document;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.store.CompoundFileDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;

View File

@ -40,9 +40,9 @@ import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexReader.FieldOption;
import org.apache.lucene.index.codecs.CodecProvider;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.FieldCache;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;

View File

@ -17,7 +17,7 @@ package org.apache.lucene.index;
* limitations under the License.
*/
import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.TextField;

View File

@ -29,10 +29,10 @@ import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.search.DefaultSimilarityProvider;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.SimilarityProvider;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.search.similarities.DefaultSimilarityProvider;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.SimilarityProvider;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;

View File

@ -23,10 +23,11 @@ import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.MockDirectoryWrapper;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.LuceneTestCase;

View File

@ -35,11 +35,13 @@ import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.search.FieldCache;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.SimilarityProvider;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BitVector;

View File

@ -26,8 +26,8 @@ import java.util.Set;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.index.DocumentsWriterPerThread.IndexingChain;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.search.DefaultSimilarityProvider;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.similarities.DefaultSimilarityProvider;
import org.apache.lucene.util.LuceneTestCase;
import org.junit.Test;

View File

@ -27,9 +27,9 @@ import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.search.DefaultSimilarityProvider;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.search.similarities.DefaultSimilarityProvider;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;

View File

@ -27,10 +27,10 @@ import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.search.DefaultSimilarityProvider;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.SimilarityProvider;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.search.similarities.DefaultSimilarityProvider;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.SimilarityProvider;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;

View File

@ -32,6 +32,9 @@ import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.search.*;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.SimilarityProvider;
import org.apache.lucene.search.similarities.TFIDFSimilarity;
import org.apache.lucene.store.Directory;

View File

@ -25,6 +25,7 @@ import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.TextField;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.search.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
@ -234,7 +235,8 @@ public class TestParallelReader extends LuceneTestCase {
w.addDocument(d2);
w.close();
return new IndexSearcher(dir, false);
IndexReader ir = IndexReader.open(dir, false);
return newSearcher(ir);
}
// Fields 1 & 2 in one index, 3 & 4 in other, with ParallelReader:

View File

@ -25,9 +25,9 @@ import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.search.DefaultSimilarityProvider;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.search.similarities.DefaultSimilarityProvider;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util._TestUtil;

View File

@ -20,9 +20,11 @@ package org.apache.lucene.search;
import java.io.IOException;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.search.Similarity.ExactDocScorer;
import org.apache.lucene.search.Similarity.SloppyDocScorer;
import org.apache.lucene.search.Similarity.Stats;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.SimilarityProvider;
import org.apache.lucene.search.similarities.Similarity.ExactDocScorer;
import org.apache.lucene.search.similarities.Similarity.SloppyDocScorer;
import org.apache.lucene.search.similarities.Similarity.Stats;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.TermContext;
import org.apache.lucene.index.FieldInvertState;

View File

@ -26,6 +26,8 @@ import org.apache.lucene.document.TextField;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.similarities.DefaultSimilarityProvider;
import org.apache.lucene.search.similarities.SimilarityProvider;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.MockDirectoryWrapper;

View File

@ -27,6 +27,9 @@ import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.similarities.DefaultSimilarityProvider;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.SimilarityProvider;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.NamedThreadFactory;
@ -72,6 +75,21 @@ public class TestBooleanQuery extends LuceneTestCase {
IndexReader r = w.getReader();
IndexSearcher s = newSearcher(r);
// this test relies upon coord being the default implementation,
// otherwise scores are different!
final SimilarityProvider delegate = s.getSimilarityProvider();
s.setSimilarityProvider(new DefaultSimilarityProvider() {
@Override
public float queryNorm(float sumOfSquaredWeights) {
return delegate.queryNorm(sumOfSquaredWeights);
}
@Override
public Similarity get(String field) {
return delegate.get(field);
}
});
BooleanQuery q = new BooleanQuery();
q.add(new TermQuery(new Term("field", "a")), BooleanClause.Occur.SHOULD);
@ -81,7 +99,7 @@ public class TestBooleanQuery extends LuceneTestCase {
subQuery.setBoost(0);
q.add(subQuery, BooleanClause.Occur.SHOULD);
float score2 = s.search(q, 10).getMaxScore();
assertEquals(score*.5, score2, 1e-6);
assertEquals(score*.5F, score2, 1e-6);
// LUCENE-2617: make sure that a clause not in the index still contributes to the score via coord factor
BooleanQuery qq = (BooleanQuery)q.clone();
@ -91,14 +109,14 @@ public class TestBooleanQuery extends LuceneTestCase {
phrase.setBoost(0);
qq.add(phrase, BooleanClause.Occur.SHOULD);
score2 = s.search(qq, 10).getMaxScore();
assertEquals(score*(1.0/3), score2, 1e-6);
assertEquals(score*(1/3F), score2, 1e-6);
// now test BooleanScorer2
subQuery = new TermQuery(new Term("field", "b"));
subQuery.setBoost(0);
q.add(subQuery, BooleanClause.Occur.MUST);
score2 = s.search(q, 10).getMaxScore();
assertEquals(score*(2.0/3), score2, 1e-6);
assertEquals(score*(2/3F), score2, 1e-6);
// PhraseQuery w/ no terms added returns a null scorer
PhraseQuery pq = new PhraseQuery();

View File

@ -19,6 +19,7 @@ package org.apache.lucene.search;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.similarities.DefaultSimilarityProvider;
import org.apache.lucene.search.spans.*;
/**

View File

@ -23,6 +23,7 @@ import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.similarities.DefaultSimilarityProvider;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;

View File

@ -29,6 +29,10 @@ import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Weight.ScorerContext;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.search.similarities.DefaultSimilarityProvider;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.SimilarityProvider;
import org.apache.lucene.store.Directory;
import java.text.DecimalFormat;

View File

@ -56,7 +56,8 @@ public class TestDocBoost extends LuceneTestCase {
final float[] scores = new float[4];
newSearcher(reader).search
IndexSearcher searcher = newSearcher(reader);
searcher.search
(new TermQuery(new Term("field", "word")),
new Collector() {
private int base = 0;
@ -82,7 +83,10 @@ public class TestDocBoost extends LuceneTestCase {
float lastScore = 0.0f;
for (int i = 0; i < 2; i++) {
assertTrue(scores[i] > lastScore);
if (VERBOSE) {
System.out.println(searcher.explain(new TermQuery(new Term("field", "word")), i));
}
assertTrue("score: " + scores[i] + " should be > lastScore: " + lastScore, scores[i] > lastScore);
lastScore = scores[i];
}

View File

@ -30,6 +30,9 @@ import org.apache.lucene.index.Term;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.codecs.CodecProvider;
import org.apache.lucene.index.values.IndexDocValues.Source;
import org.apache.lucene.search.similarities.DefaultSimilarityProvider;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.SimilarityProvider;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
@ -71,13 +74,24 @@ public class TestDocValuesScoring extends LuceneTestCase {
// no boosting
IndexSearcher searcher1 = newSearcher(ir);
final SimilarityProvider base = searcher1.getSimilarityProvider();
// boosting
IndexSearcher searcher2 = newSearcher(ir);
searcher2.setSimilarityProvider(new DefaultSimilarityProvider() {
final Similarity fooSim = new BoostingSimilarity(super.get("foo"), "foo_boost");
searcher2.setSimilarityProvider(new SimilarityProvider() {
final Similarity fooSim = new BoostingSimilarity(base.get("foo"), "foo_boost");
public Similarity get(String field) {
return "foo".equals(field) ? fooSim : super.get(field);
return "foo".equals(field) ? fooSim : base.get(field);
}
@Override
public float coord(int overlap, int maxOverlap) {
return base.coord(overlap, maxOverlap);
}
@Override
public float queryNorm(float sumOfSquaredWeights) {
return base.queryNorm(sumOfSquaredWeights);
}
});

View File

@ -23,6 +23,7 @@ import org.apache.lucene.document.TextField;
import org.apache.lucene.index.*;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.search.FieldValueHitQueue.Entry;
import org.apache.lucene.search.similarities.DefaultSimilarityProvider;
import org.apache.lucene.store.*;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.BytesRef;
@ -41,7 +42,8 @@ public class TestElevationComparator extends LuceneTestCase {
directory,
newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).
setMaxBufferedDocs(2).
setMergePolicy(newLogMergePolicy(1000))
setMergePolicy(newLogMergePolicy(1000)).
setSimilarityProvider(new DefaultSimilarityProvider())
);
writer.addDocument(adoc(new String[] {"id", "a", "title", "ipod", "str_s", "a"}));
writer.addDocument(adoc(new String[] {"id", "b", "title", "ipod ipod", "str_s", "b"}));
@ -54,6 +56,7 @@ public class TestElevationComparator extends LuceneTestCase {
writer.close();
IndexSearcher searcher = newSearcher(r);
searcher.setSimilarityProvider(new DefaultSimilarityProvider());
runTest(searcher, true);
runTest(searcher, false);

View File

@ -29,6 +29,9 @@ import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.similarities.DefaultSimilarityProvider;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.SimilarityProvider;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.LuceneTestCase;
@ -104,6 +107,21 @@ public class TestFuzzyQuery2 extends LuceneTestCase {
if (VERBOSE) {
System.out.println("TEST: searcher=" + searcher);
}
// even though this uses a boost-only rewrite, this test relies upon queryNorm being the default implementation,
// otherwise scores are different!
final SimilarityProvider delegate = searcher.getSimilarityProvider();
searcher.setSimilarityProvider(new DefaultSimilarityProvider() {
@Override
public float coord(int overlap, int maxOverlap) {
return delegate.coord(overlap, maxOverlap);
}
@Override
public Similarity get(String field) {
return delegate.get(field);
}
});
writer.close();
String line;
while ((line = reader.readLine()) != null) {

View File

@ -37,6 +37,9 @@ import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.search.similarities.DefaultSimilarityProvider;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.LuceneTestCase;

View File

@ -26,6 +26,9 @@ import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReader.AtomicReaderContext;
import org.apache.lucene.index.RandomIndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.similarities.DefaultSimilarityProvider;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.search.similarities.SimilarityProvider;
import org.apache.lucene.store.Directory;
import org.junit.AfterClass;
import org.junit.BeforeClass;
@ -169,6 +172,19 @@ public class TestMultiTermConstantScore extends BaseTestRangeFilter {
// test for correct application of query normalization
// must use a non score normalizing method for this.
final SimilarityProvider delegate = search.getSimilarityProvider();
search.setSimilarityProvider(new DefaultSimilarityProvider() {
@Override
public float coord(int overlap, int maxOverlap) {
return delegate.coord(overlap, maxOverlap);
}
@Override
public Similarity get(String field) {
return delegate.get(field);
}
});
Query q = csrq("data", "1", "6", T, T);
q.setBoost(100);
search.search(q, null, new Collector() {

View File

@ -23,6 +23,7 @@ import org.apache.lucene.analysis.tokenattributes.*;
import org.apache.lucene.document.*;
import org.apache.lucene.index.*;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.search.similarities.DefaultSimilarityProvider;
import org.apache.lucene.store.*;
import org.apache.lucene.util.Version;
import org.apache.lucene.util._TestUtil;
@ -342,7 +343,10 @@ public class TestPhraseQuery extends LuceneTestCase {
public void testSlopScoring() throws IOException {
Directory directory = newDirectory();
RandomIndexWriter writer = new RandomIndexWriter(random, directory, newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setMergePolicy(newLogMergePolicy()));
RandomIndexWriter writer = new RandomIndexWriter(random, directory,
newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random))
.setMergePolicy(newLogMergePolicy())
.setSimilarityProvider(new DefaultSimilarityProvider()));
Document doc = new Document();
doc.add(newField("field", "foo firstname lastname foo", TextField.TYPE_STORED));
@ -360,6 +364,7 @@ public class TestPhraseQuery extends LuceneTestCase {
writer.close();
IndexSearcher searcher = newSearcher(reader);
searcher.setSimilarityProvider(new DefaultSimilarityProvider());
PhraseQuery query = new PhraseQuery();
query.add(new Term("field", "firstname"));
query.add(new Term("field", "lastname"));

Some files were not shown because too many files have changed in this diff Show More