mirror of https://github.com/apache/lucene.git
LUCENE-9537: Add initial Indri search engine functionality to Lucene
This commit is contained in:
parent
e4cede0e8c
commit
9cc5c9b798
|
@ -1,4 +1,4 @@
|
|||
Lucene Change Log
|
||||
Lucene Change Log
|
||||
|
||||
For more information on past and future Lucene versions, please see:
|
||||
http://s.apache.org/luceneversions
|
||||
|
@ -84,6 +84,17 @@ API Changes
|
|||
|
||||
* LUCENE-9646: Set BM25Similarity discountOverlaps via the constructor (Patrick Marty via Bruno Roustant)
|
||||
|
||||
* LUCENE-9537: Added smoothingScore method and default implementation to
|
||||
Scorable abstract class. The smoothing score allows scorers to calculate a
|
||||
score for a document where the search term or subquery is not present. The
|
||||
smoothing score acts like an idf so that documents that do not have terms or
|
||||
subqueries that are more frequent in the index are not penalized as much as
|
||||
documents that do not have less frequent terms or subqueries and prevents
|
||||
scores which are the product of terms or subqueries from going to zero. Added
|
||||
the implementation of the Indri AND and the IndriDirichletSimilarity from the
|
||||
academic Indri search engine: http://www.lemurproject.org/indri.php.
|
||||
(Cameron VandenBerg)
|
||||
|
||||
Improvements
|
||||
|
||||
* LUCENE-9687: Hunspell support improvements: add SpellChecker API, support default encoding and
|
||||
|
|
|
@ -184,8 +184,8 @@ public class IndexSearcher {
|
|||
/**
|
||||
* Runs searches for each segment separately, using the provided Executor. NOTE: if you are using
|
||||
* {@link NIOFSDirectory}, do not use the shutdownNow method of ExecutorService as this uses
|
||||
* Thread.interrupt under-the-hood which can silently close file descriptors (see <a
|
||||
* href="https://issues.apache.org/jira/browse/LUCENE-2239">LUCENE-2239</a>).
|
||||
* Thread.interrupt under-the-hood which can silently close file descriptors (see <a href=
|
||||
* "https://issues.apache.org/jira/browse/LUCENE-2239">LUCENE-2239</a>).
|
||||
*
|
||||
* @lucene.experimental
|
||||
*/
|
||||
|
@ -199,8 +199,8 @@ public class IndexSearcher {
|
|||
* <p>Given a non-<code>null</code> {@link Executor} this method runs searches for each segment
|
||||
* separately, using the provided Executor. NOTE: if you are using {@link NIOFSDirectory}, do not
|
||||
* use the shutdownNow method of ExecutorService as this uses Thread.interrupt under-the-hood
|
||||
* which can silently close file descriptors (see <a
|
||||
* href="https://issues.apache.org/jira/browse/LUCENE-2239">LUCENE-2239</a>).
|
||||
* which can silently close file descriptors (see <a href=
|
||||
* "https://issues.apache.org/jira/browse/LUCENE-2239">LUCENE-2239</a>).
|
||||
*
|
||||
* @see IndexReaderContext
|
||||
* @see IndexReader#getContext()
|
||||
|
@ -729,7 +729,7 @@ public class IndexSearcher {
|
|||
throws IOException {
|
||||
|
||||
// TODO: should we make this
|
||||
// threaded...? the Collector could be sync'd?
|
||||
// threaded...? the Collector could be sync'd?
|
||||
// always use single thread:
|
||||
for (LeafReaderContext ctx : leaves) { // search each subreader
|
||||
final LeafCollector leafCollector;
|
||||
|
|
|
@ -0,0 +1,35 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
|
||||
/** A Query that matches documents matching combinations of subqueries. */
|
||||
public class IndriAndQuery extends IndriQuery {
|
||||
|
||||
public IndriAndQuery(List<BooleanClause> clauses) {
|
||||
super(clauses);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
|
||||
throws IOException {
|
||||
IndriAndQuery query = this;
|
||||
return new IndriAndWeight(query, searcher, ScoreMode.TOP_SCORES, boost);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,71 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Combines scores of subscorers. If a subscorer does not contain the docId, a smoothing score is
|
||||
* calculated for that document/subscorer combination.
|
||||
*/
|
||||
public class IndriAndScorer extends IndriDisjunctionScorer {
|
||||
|
||||
protected IndriAndScorer(Weight weight, List<Scorer> subScorers, ScoreMode scoreMode, float boost)
|
||||
throws IOException {
|
||||
super(weight, subScorers, scoreMode, boost);
|
||||
}
|
||||
|
||||
@Override
|
||||
public float score(List<Scorer> subScorers) throws IOException {
|
||||
int docId = this.docID();
|
||||
return scoreDoc(subScorers, docId);
|
||||
}
|
||||
|
||||
@Override
|
||||
public float smoothingScore(List<Scorer> subScorers, int docId) throws IOException {
|
||||
return scoreDoc(subScorers, docId);
|
||||
}
|
||||
|
||||
private float scoreDoc(List<Scorer> subScorers, int docId) throws IOException {
|
||||
double score = 0;
|
||||
double boostSum = 0.0;
|
||||
for (Scorer scorer : subScorers) {
|
||||
if (scorer instanceof IndriScorer) {
|
||||
IndriScorer indriScorer = (IndriScorer) scorer;
|
||||
int scorerDocId = indriScorer.docID();
|
||||
// If the query exists in the document, score the document
|
||||
// Otherwise, compute a smoothing score, which acts like an idf
|
||||
// for subqueries/terms
|
||||
double tempScore = 0;
|
||||
if (docId == scorerDocId) {
|
||||
tempScore = indriScorer.score();
|
||||
} else {
|
||||
tempScore = indriScorer.smoothingScore(docId);
|
||||
}
|
||||
tempScore *= indriScorer.getBoost();
|
||||
score += tempScore;
|
||||
boostSum += indriScorer.getBoost();
|
||||
}
|
||||
}
|
||||
if (boostSum == 0) {
|
||||
return 0;
|
||||
} else {
|
||||
return (float) (score / boostSum);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,123 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
|
||||
/** The Weight for IndriAndQuery, used to normalize, score and explain these queries. */
|
||||
public class IndriAndWeight extends Weight {
|
||||
|
||||
private final IndriAndQuery query;
|
||||
private final ArrayList<Weight> weights;
|
||||
private final ScoreMode scoreMode;
|
||||
private final float boost;
|
||||
|
||||
public IndriAndWeight(
|
||||
IndriAndQuery query, IndexSearcher searcher, ScoreMode scoreMode, float boost)
|
||||
throws IOException {
|
||||
super(query);
|
||||
this.query = query;
|
||||
this.boost = boost;
|
||||
this.scoreMode = scoreMode;
|
||||
weights = new ArrayList<>();
|
||||
for (BooleanClause c : query) {
|
||||
Weight w = searcher.createWeight(c.getQuery(), scoreMode, 1.0f);
|
||||
weights.add(w);
|
||||
}
|
||||
}
|
||||
|
||||
private Scorer getScorer(LeafReaderContext context) throws IOException {
|
||||
List<Scorer> subScorers = new ArrayList<>();
|
||||
|
||||
for (Weight w : weights) {
|
||||
Scorer scorer = w.scorer(context);
|
||||
if (scorer != null) {
|
||||
subScorers.add(scorer);
|
||||
}
|
||||
}
|
||||
|
||||
if (subScorers.isEmpty()) {
|
||||
return null;
|
||||
}
|
||||
Scorer scorer = subScorers.get(0);
|
||||
if (subScorers.size() > 1) {
|
||||
scorer = new IndriAndScorer(this, subScorers, scoreMode, boost);
|
||||
}
|
||||
return scorer;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Scorer scorer(LeafReaderContext context) throws IOException {
|
||||
return getScorer(context);
|
||||
}
|
||||
|
||||
@Override
|
||||
public BulkScorer bulkScorer(LeafReaderContext context) throws IOException {
|
||||
Scorer scorer = getScorer(context);
|
||||
if (scorer != null) {
|
||||
BulkScorer bulkScorer = new DefaultBulkScorer(scorer);
|
||||
return bulkScorer;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isCacheable(LeafReaderContext ctx) {
|
||||
for (Weight w : weights) {
|
||||
if (w.isCacheable(ctx) == false) return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Explanation explain(LeafReaderContext context, int doc) throws IOException {
|
||||
List<Explanation> subs = new ArrayList<>();
|
||||
boolean fail = false;
|
||||
Iterator<BooleanClause> cIter = query.iterator();
|
||||
for (Iterator<Weight> wIter = weights.iterator(); wIter.hasNext(); ) {
|
||||
Weight w = wIter.next();
|
||||
BooleanClause c = cIter.next();
|
||||
Explanation e = w.explain(context, doc);
|
||||
if (e.isMatch()) {
|
||||
subs.add(e);
|
||||
} else if (c.isRequired()) {
|
||||
subs.add(
|
||||
Explanation.noMatch(
|
||||
"no match on required clause (" + c.getQuery().toString() + ")", e));
|
||||
fail = true;
|
||||
}
|
||||
}
|
||||
if (fail) {
|
||||
return Explanation.noMatch(
|
||||
"Failure to meet condition(s) of required/prohibited clause(s)", subs);
|
||||
} else {
|
||||
Scorer scorer = scorer(context);
|
||||
if (scorer != null) {
|
||||
int advanced = scorer.iterator().advance(doc);
|
||||
assert advanced == doc;
|
||||
return Explanation.match(scorer.score(), "sum of:", subs);
|
||||
} else {
|
||||
return Explanation.noMatch(
|
||||
"Failure to meet condition(s) of required/prohibited clause(s)", subs);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,77 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* The Indri implemenation of a disjunction scorer which stores the subscorers for the child
|
||||
* queries. The score and smoothingScore methods use the list of all subscorers and not just the
|
||||
* matches so that a smoothingScore can be calculated if there is not an exact match.
|
||||
*/
|
||||
public abstract class IndriDisjunctionScorer extends IndriScorer {
|
||||
|
||||
private final List<Scorer> subScorersList;
|
||||
private final DisiPriorityQueue subScorers;
|
||||
private final DocIdSetIterator approximation;
|
||||
|
||||
protected IndriDisjunctionScorer(
|
||||
Weight weight, List<Scorer> subScorersList, ScoreMode scoreMode, float boost) {
|
||||
super(weight, boost);
|
||||
this.subScorersList = subScorersList;
|
||||
this.subScorers = new DisiPriorityQueue(subScorersList.size());
|
||||
for (Scorer scorer : subScorersList) {
|
||||
final DisiWrapper w = new DisiWrapper(scorer);
|
||||
this.subScorers.add(w);
|
||||
}
|
||||
this.approximation = new DisjunctionDISIApproximation(this.subScorers);
|
||||
}
|
||||
|
||||
@Override
|
||||
public DocIdSetIterator iterator() {
|
||||
return approximation;
|
||||
}
|
||||
|
||||
@Override
|
||||
public float getMaxScore(int upTo) throws IOException {
|
||||
return 0;
|
||||
}
|
||||
|
||||
public List<Scorer> getSubMatches() throws IOException {
|
||||
return subScorersList;
|
||||
}
|
||||
|
||||
abstract float score(List<Scorer> subScorers) throws IOException;
|
||||
|
||||
public abstract float smoothingScore(List<Scorer> subScorers, int docId) throws IOException;
|
||||
|
||||
@Override
|
||||
public float score() throws IOException {
|
||||
return score(getSubMatches());
|
||||
}
|
||||
|
||||
@Override
|
||||
public float smoothingScore(int docId) throws IOException {
|
||||
return smoothingScore(getSubMatches(), docId);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int docID() {
|
||||
return subScorers.top().doc;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,97 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
|
||||
/**
|
||||
* A Basic abstract query that all IndriQueries can extend to implement toString, equals,
|
||||
* getClauses, and iterator.
|
||||
*/
|
||||
public abstract class IndriQuery extends Query implements Iterable<BooleanClause> {
|
||||
|
||||
private List<BooleanClause> clauses;
|
||||
|
||||
public IndriQuery(List<BooleanClause> clauses) {
|
||||
this.clauses = clauses;
|
||||
}
|
||||
|
||||
@Override
|
||||
public abstract Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
|
||||
throws IOException;
|
||||
|
||||
@Override
|
||||
public String toString(String field) {
|
||||
StringBuilder buffer = new StringBuilder();
|
||||
|
||||
int i = 0;
|
||||
for (BooleanClause c : this) {
|
||||
buffer.append(c.getOccur().toString());
|
||||
|
||||
Query subQuery = c.getQuery();
|
||||
if (subQuery instanceof BooleanQuery) { // wrap sub-bools in parens
|
||||
buffer.append("(");
|
||||
buffer.append(subQuery.toString(field));
|
||||
buffer.append(")");
|
||||
} else {
|
||||
buffer.append(subQuery.toString(field));
|
||||
}
|
||||
|
||||
if (i != clauses.size() - 1) {
|
||||
buffer.append(" ");
|
||||
}
|
||||
i += 1;
|
||||
}
|
||||
|
||||
return buffer.toString();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
return sameClassAs(o) && equalsTo(getClass().cast(o));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void visit(QueryVisitor visitor) {
|
||||
visitor.visitLeaf(this);
|
||||
}
|
||||
|
||||
private boolean equalsTo(IndriQuery other) {
|
||||
return clauses.equals(other.clauses);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
int hashCode = Objects.hash(clauses);
|
||||
if (hashCode == 0) {
|
||||
hashCode = 1;
|
||||
}
|
||||
return hashCode;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Iterator<BooleanClause> iterator() {
|
||||
return clauses.iterator();
|
||||
}
|
||||
|
||||
public List<BooleanClause> getClauses() {
|
||||
return this.clauses;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,51 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* The Indri parent scorer that stores the boost so that IndriScorers can use the boost outside of
|
||||
* the term.
|
||||
*/
|
||||
public abstract class IndriScorer extends Scorer {
|
||||
|
||||
private float boost;
|
||||
|
||||
protected IndriScorer(Weight weight, float boost) {
|
||||
super(weight);
|
||||
this.boost = boost;
|
||||
}
|
||||
|
||||
@Override
|
||||
public abstract DocIdSetIterator iterator();
|
||||
|
||||
@Override
|
||||
public abstract float getMaxScore(int upTo) throws IOException;
|
||||
|
||||
@Override
|
||||
public abstract float score() throws IOException;
|
||||
|
||||
public abstract float smoothingScore(int docId) throws IOException;
|
||||
|
||||
@Override
|
||||
public abstract int docID();
|
||||
|
||||
public float getBoost() {
|
||||
return this.boost;
|
||||
}
|
||||
}
|
|
@ -27,6 +27,21 @@ public abstract class Scorable {
|
|||
/** Returns the score of the current document matching the query. */
|
||||
public abstract float score() throws IOException;
|
||||
|
||||
/**
|
||||
* Returns the smoothing score of the current document matching the query. This score is used when
|
||||
* the query/term does not appear in the document, and behaves like an idf. The smoothing score is
|
||||
* particularly important when the Scorer returns a product of probabilities so that the document
|
||||
* score does not go to zero when one probability is zero. This can return 0 or a smoothing score.
|
||||
*
|
||||
* <p>Smoothing scores are described in many papers, including: Metzler, D. and Croft, W. B. ,
|
||||
* "Combining the Language Model and Inference Network Approaches to Retrieval," Information
|
||||
* Processing and Management Special Issue on Bayesian Networks and Information Retrieval, 40(5),
|
||||
* pp.735-750.
|
||||
*/
|
||||
public float smoothingScore(int docId) throws IOException {
|
||||
return 0f;
|
||||
}
|
||||
|
||||
/** Returns the doc ID that is currently being scored. */
|
||||
public abstract int docID();
|
||||
|
||||
|
|
|
@ -75,6 +75,11 @@ public final class TermScorer extends Scorer {
|
|||
return docScorer.score(postingsEnum.docID(), postingsEnum.freq());
|
||||
}
|
||||
|
||||
@Override
|
||||
public float smoothingScore(int docId) throws IOException {
|
||||
return docScorer.score(docId, 0);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int advanceShallow(int target) throws IOException {
|
||||
return impactsDisi.advanceShallow(target);
|
||||
|
|
|
@ -0,0 +1,112 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search.similarities;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import org.apache.lucene.search.Explanation;
|
||||
|
||||
/**
|
||||
* Bayesian smoothing using Dirichlet priors as implemented in the Indri Search engine
|
||||
* (http://www.lemurproject.org/indri.php). Indri Dirichelet Smoothing!
|
||||
*
|
||||
* <pre class="prettyprint">
|
||||
* tf_E + mu*P(t|D) P(t|E)= documentLength + documentMu
|
||||
* mu*P(t|C) + tf_D where P(t|D)= doclen + mu
|
||||
* </pre>
|
||||
*
|
||||
* <p>A larger value for mu, produces more smoothing. Smoothing is most important for short
|
||||
* documents where the probabilities are more granular.
|
||||
*/
|
||||
public class IndriDirichletSimilarity extends LMSimilarity {
|
||||
|
||||
/** The μ parameter. */
|
||||
private final float mu;
|
||||
|
||||
/** Instantiates the similarity with the provided μ parameter. */
|
||||
public IndriDirichletSimilarity(CollectionModel collectionModel, float mu) {
|
||||
super(collectionModel);
|
||||
this.mu = mu;
|
||||
}
|
||||
|
||||
/** Instantiates the similarity with the provided μ parameter. */
|
||||
public IndriDirichletSimilarity(float mu) {
|
||||
this.mu = mu;
|
||||
}
|
||||
|
||||
/** Instantiates the similarity with the default μ value of 2000. */
|
||||
public IndriDirichletSimilarity(CollectionModel collectionModel) {
|
||||
this(collectionModel, 2000);
|
||||
}
|
||||
|
||||
/** Instantiates the similarity with the default μ value of 2000. */
|
||||
public IndriDirichletSimilarity() {
|
||||
this(new IndriCollectionModel(), 2000);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double score(BasicStats stats, double freq, double docLen) {
|
||||
double collectionProbability = ((LMStats) stats).getCollectionProbability();
|
||||
double score = (freq + (mu * collectionProbability)) / (docLen + mu);
|
||||
return (Math.log(score));
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void explain(List<Explanation> subs, BasicStats stats, double freq, double docLen) {
|
||||
if (stats.getBoost() != 1.0f) {
|
||||
subs.add(Explanation.match(stats.getBoost(), "boost"));
|
||||
}
|
||||
|
||||
subs.add(Explanation.match(mu, "mu"));
|
||||
double collectionProbability = ((LMStats) stats).getCollectionProbability();
|
||||
Explanation weightExpl =
|
||||
Explanation.match(
|
||||
(float) Math.log((freq + (mu * collectionProbability)) / (docLen + mu)), "term weight");
|
||||
subs.add(weightExpl);
|
||||
subs.add(Explanation.match((float) Math.log(mu / (docLen + mu)), "document norm"));
|
||||
super.explain(subs, stats, freq, docLen);
|
||||
}
|
||||
|
||||
/** Returns the μ parameter. */
|
||||
public float getMu() {
|
||||
return mu;
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return String.format(Locale.ROOT, "IndriDirichlet(%f)", getMu());
|
||||
}
|
||||
|
||||
/**
|
||||
* Models {@code p(w|C)} as the number of occurrences of the term in the collection, divided by
|
||||
* the total number of tokens {@code + 1}.
|
||||
*/
|
||||
public static class IndriCollectionModel implements CollectionModel {
|
||||
|
||||
/** Sole constructor: parameter-free */
|
||||
public IndriCollectionModel() {}
|
||||
|
||||
@Override
|
||||
public double computeProbability(BasicStats stats) {
|
||||
return ((double) stats.getTotalTermFreq()) / ((double) stats.getNumberOfFieldTokens());
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getName() {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,165 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search;
|
||||
|
||||
import java.util.Arrays;
|
||||
import org.apache.lucene.analysis.MockAnalyzer;
|
||||
import org.apache.lucene.analysis.MockTokenFilter;
|
||||
import org.apache.lucene.analysis.MockTokenizer;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.document.TextField;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.RandomIndexWriter;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.BooleanClause.Occur;
|
||||
import org.apache.lucene.search.similarities.IndriDirichletSimilarity;
|
||||
import org.apache.lucene.search.similarities.Similarity;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.util.LuceneTestCase;
|
||||
|
||||
public class TestIndriAndQuery extends LuceneTestCase {
|
||||
|
||||
/** threshold for comparing floats */
|
||||
public static final float SCORE_COMP_THRESH = 0.0000f;
|
||||
|
||||
public Similarity sim = new IndriDirichletSimilarity();
|
||||
public Directory index;
|
||||
public IndexReader r;
|
||||
public IndexSearcher s;
|
||||
|
||||
@Override
|
||||
public void setUp() throws Exception {
|
||||
super.setUp();
|
||||
|
||||
index = newDirectory();
|
||||
RandomIndexWriter writer =
|
||||
new RandomIndexWriter(
|
||||
random(),
|
||||
index,
|
||||
newIndexWriterConfig(
|
||||
new MockAnalyzer(
|
||||
random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET))
|
||||
.setSimilarity(sim)
|
||||
.setMergePolicy(newLogMergePolicy()));
|
||||
// Query is "President Washington"
|
||||
{
|
||||
Document d1 = new Document();
|
||||
d1.add(newField("id", "d1", TextField.TYPE_STORED));
|
||||
d1.add(
|
||||
newTextField(
|
||||
"body", "President Washington was the first leader of the US", Field.Store.YES));
|
||||
writer.addDocument(d1);
|
||||
}
|
||||
|
||||
{
|
||||
Document d2 = new Document();
|
||||
d2.add(newField("id", "d2", TextField.TYPE_STORED));
|
||||
d2.add(
|
||||
newTextField(
|
||||
"body",
|
||||
"The president is head of the executive branch of government",
|
||||
Field.Store.YES));
|
||||
writer.addDocument(d2);
|
||||
}
|
||||
|
||||
{
|
||||
Document d3 = new Document();
|
||||
d3.add(newField("id", "d3", TextField.TYPE_STORED));
|
||||
d3.add(
|
||||
newTextField(
|
||||
"body", "George Washington was a general in the Revolutionary War", Field.Store.YES));
|
||||
writer.addDocument(d3);
|
||||
}
|
||||
|
||||
{
|
||||
Document d4 = new Document();
|
||||
d4.add(newField("id", "d4", TextField.TYPE_STORED));
|
||||
d4.add(newTextField("body", "A company or college can have a president", Field.Store.YES));
|
||||
writer.addDocument(d4);
|
||||
}
|
||||
|
||||
writer.forceMerge(1);
|
||||
r = getOnlyLeafReader(writer.getReader());
|
||||
writer.close();
|
||||
s = new IndexSearcher(r);
|
||||
s.setSimilarity(sim);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void tearDown() throws Exception {
|
||||
r.close();
|
||||
index.close();
|
||||
super.tearDown();
|
||||
}
|
||||
|
||||
public void testSimpleQuery1() throws Exception {
|
||||
|
||||
BooleanClause clause1 = new BooleanClause(tq("body", "george"), Occur.SHOULD);
|
||||
BooleanClause clause2 = new BooleanClause(tq("body", "washington"), Occur.SHOULD);
|
||||
|
||||
IndriAndQuery q = new IndriAndQuery(Arrays.asList(clause1, clause2));
|
||||
|
||||
ScoreDoc[] h = s.search(q, 1000).scoreDocs;
|
||||
|
||||
try {
|
||||
assertEquals("2 docs should match " + q.toString(), 2, h.length);
|
||||
} catch (Error e) {
|
||||
printHits("testSimpleEqualScores1", h, s);
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
public void testSimpleQuery2() throws Exception {
|
||||
|
||||
BooleanClause clause1 = new BooleanClause(tq("body", "president"), Occur.SHOULD);
|
||||
BooleanClause clause2 = new BooleanClause(tq("body", "washington"), Occur.SHOULD);
|
||||
|
||||
IndriAndQuery q = new IndriAndQuery(Arrays.asList(clause1, clause2));
|
||||
|
||||
ScoreDoc[] h = s.search(q, 1000).scoreDocs;
|
||||
|
||||
try {
|
||||
assertEquals("all docs should match " + q.toString(), 4, h.length);
|
||||
} catch (Error e) {
|
||||
printHits("testSimpleEqualScores1", h, s);
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
/** macro */
|
||||
protected Query tq(String f, String t) {
|
||||
return new TermQuery(new Term(f, t));
|
||||
}
|
||||
|
||||
/** macro */
|
||||
protected Query tq(String f, String t, float b) {
|
||||
Query q = tq(f, t);
|
||||
return new BoostQuery(q, b);
|
||||
}
|
||||
|
||||
protected void printHits(String test, ScoreDoc[] h, IndexSearcher searcher) throws Exception {
|
||||
|
||||
System.err.println("------- " + test + " -------");
|
||||
|
||||
for (int i = 0; i < h.length; i++) {
|
||||
Document d = searcher.doc(h[i].doc);
|
||||
float score = h[i].score;
|
||||
System.err.println("#" + i + ": " + score + " - " + d.get("body"));
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,49 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.lucene.search.similarities;
|
||||
|
||||
import java.util.Random;
|
||||
|
||||
public class TestIndriDirichletSimilarity extends BaseSimilarityTestCase {
|
||||
|
||||
@Override
|
||||
protected Similarity getSimilarity(Random random) {
|
||||
// smoothing parameter mu, unbounded
|
||||
final float mu;
|
||||
switch (random.nextInt(4)) {
|
||||
case 0:
|
||||
// minimum value
|
||||
mu = 0;
|
||||
break;
|
||||
case 1:
|
||||
// tiny value
|
||||
mu = Float.MIN_VALUE;
|
||||
break;
|
||||
case 2:
|
||||
// maximum value
|
||||
// we just limit the test to "reasonable" mu values but don't enforce
|
||||
// this anywhere.
|
||||
mu = Integer.MAX_VALUE;
|
||||
break;
|
||||
default:
|
||||
// random value
|
||||
mu = Integer.MAX_VALUE * random.nextFloat();
|
||||
break;
|
||||
}
|
||||
return new IndriDirichletSimilarity(mu);
|
||||
}
|
||||
}
|
|
@ -368,7 +368,9 @@ public abstract class BaseSimilarityTestCase extends LuceneTestCase {
|
|||
float score = scorer.score(freq, norm);
|
||||
// check that score isn't infinite or negative
|
||||
assertTrue("infinite/NaN score: " + score, Float.isFinite(score));
|
||||
assertTrue("negative score: " + score, score >= 0);
|
||||
if (!(similarity instanceof IndriDirichletSimilarity)) {
|
||||
assertTrue("negative score: " + score, score >= 0);
|
||||
}
|
||||
assertTrue("greater than maxScore: " + score + ">" + maxScore, score <= maxScore);
|
||||
// check explanation matches
|
||||
Explanation explanation =
|
||||
|
@ -395,7 +397,9 @@ public abstract class BaseSimilarityTestCase extends LuceneTestCase {
|
|||
float prevScore = scorer.score(prevFreq, norm);
|
||||
// check that score isn't infinite or negative
|
||||
assertTrue(Float.isFinite(prevScore));
|
||||
assertTrue(prevScore >= 0);
|
||||
if (!(similarity instanceof IndriDirichletSimilarity)) {
|
||||
assertTrue(prevScore >= 0);
|
||||
}
|
||||
// check explanation matches
|
||||
Explanation prevExplanation =
|
||||
scorer.explain(
|
||||
|
@ -419,7 +423,9 @@ public abstract class BaseSimilarityTestCase extends LuceneTestCase {
|
|||
float prevNormScore = scorer.score(freq, norm - 1);
|
||||
// check that score isn't infinite or negative
|
||||
assertTrue(Float.isFinite(prevNormScore));
|
||||
assertTrue(prevNormScore >= 0);
|
||||
if (!(similarity instanceof IndriDirichletSimilarity)) {
|
||||
assertTrue(prevNormScore >= 0);
|
||||
}
|
||||
// check explanation matches
|
||||
Explanation prevNormExplanation =
|
||||
scorer.explain(
|
||||
|
@ -459,7 +465,9 @@ public abstract class BaseSimilarityTestCase extends LuceneTestCase {
|
|||
float prevTermScore = prevTermScorer.score(freq, norm);
|
||||
// check that score isn't infinite or negative
|
||||
assertTrue(Float.isFinite(prevTermScore));
|
||||
assertTrue(prevTermScore >= 0);
|
||||
if (!(similarity instanceof IndriDirichletSimilarity)) {
|
||||
assertTrue(prevTermScore >= 0);
|
||||
}
|
||||
// check explanation matches
|
||||
Explanation prevTermExplanation =
|
||||
prevTermScorer.explain(
|
||||
|
|
|
@ -33,6 +33,9 @@ public class TestFeature extends SolrTestCase {
|
|||
// the FilterFeatureScorer may simply inherit Scorer's default implementation
|
||||
if (scorerClassMethod.getName().equals("twoPhaseIterator")) continue;
|
||||
|
||||
// the FilterFeatureScorer may simply inherit Scorer's default implementation
|
||||
if (scorerClassMethod.getName().equals("smoothingScore")) continue;
|
||||
|
||||
// the FilterFeatureScorer's implementation does not influence its parent Weight
|
||||
if (scorerClassMethod.getName().equals("getWeight")) continue;
|
||||
|
||||
|
|
Loading…
Reference in New Issue