LUCENE-9537: Add initial Indri search engine functionality to Lucene

2021-01-29 14:47:24 -05:00 · 2021-01-29 14:47:24 -05:00 · 9cc5c9b798
parent e4cede0e8c
commit 9cc5c9b798
15 changed files with 832 additions and 10 deletions
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@ -1,4 +1,4 @@
-Lucene Change Log
+Lucene Change Log

 For more information on past and future Lucene versions, please see:
 http://s.apache.org/luceneversions
@ -84,6 +84,17 @@ API Changes

 * LUCENE-9646: Set BM25Similarity discountOverlaps via the constructor (Patrick Marty via Bruno Roustant)

+* LUCENE-9537:  Added smoothingScore method and default implementation to 
+  Scorable abstract class.  The smoothing score allows scorers to calculate a 
+  score for a document where the search term or subquery is not present.  The 
+  smoothing score acts like an idf so that documents that do not have terms or 
+  subqueries that are more frequent in the index are not penalized as much as 
+  documents that do not have less frequent terms or subqueries and prevents 
+  scores which are the product or terms or subqueries from going to zero. Added 
+  the implementation of the Indri AND and the IndriDirichletSimilarity from the 
+  academic Indri search engine: http://www.lemurproject.org/indri.php. 
+  (Cameron VandenBerg)   
+
 Improvements

 * LUCENE-9687: Hunspell support improvements: add SpellChecker API, support default encoding and
--- a/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java
+++ b/lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java
@ -184,8 +184,8 @@ public class IndexSearcher {
  /**
   * Runs searches for each segment separately, using the provided Executor. NOTE: if you are using
   * {@link NIOFSDirectory}, do not use the shutdownNow method of ExecutorService as this uses
-   * Thread.interrupt under-the-hood which can silently close file descriptors (see <a
-   * href="https://issues.apache.org/jira/browse/LUCENE-2239">LUCENE-2239</a>).
+   * Thread.interrupt under-the-hood which can silently close file descriptors (see <a href=
+   * "https://issues.apache.org/jira/browse/LUCENE-2239">LUCENE-2239</a>).
   *
   * @lucene.experimental
   */
@ -199,8 +199,8 @@ public class IndexSearcher {
   * <p>Given a non-<code>null</code> {@link Executor} this method runs searches for each segment
   * separately, using the provided Executor. NOTE: if you are using {@link NIOFSDirectory}, do not
   * use the shutdownNow method of ExecutorService as this uses Thread.interrupt under-the-hood
-   * which can silently close file descriptors (see <a
-   * href="https://issues.apache.org/jira/browse/LUCENE-2239">LUCENE-2239</a>).
+   * which can silently close file descriptors (see <a href=
+   * "https://issues.apache.org/jira/browse/LUCENE-2239">LUCENE-2239</a>).
   *
   * @see IndexReaderContext
   * @see IndexReader#getContext()
@ -729,7 +729,7 @@ public class IndexSearcher {
      throws IOException {

    // TODO: should we make this
-    // threaded...?  the Collector could be sync'd?
+    // threaded...? the Collector could be sync'd?
    // always use single thread:
    for (LeafReaderContext ctx : leaves) { // search each subreader
      final LeafCollector leafCollector;
--- a/lucene/core/src/java/org/apache/lucene/search/IndriAndQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/IndriAndQuery.java
@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.List;
+
+/** A Query that matches documents matching combinations of subqueries. */
+public class IndriAndQuery extends IndriQuery {
+
+  public IndriAndQuery(List<BooleanClause> clauses) {
+    super(clauses);
+  }
+
+  @Override
+  public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
+      throws IOException {
+    IndriAndQuery query = this;
+    return new IndriAndWeight(query, searcher, ScoreMode.TOP_SCORES, boost);
+  }
+}
--- a/lucene/core/src/java/org/apache/lucene/search/IndriAndScorer.java
+++ b/lucene/core/src/java/org/apache/lucene/search/IndriAndScorer.java
@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.List;
+
+/**
+ * Combines scores of subscorers. If a subscorer does not contain the docId, a smoothing score is
+ * calculated for that document/subscorer combination.
+ */
+public class IndriAndScorer extends IndriDisjunctionScorer {
+
+  protected IndriAndScorer(Weight weight, List<Scorer> subScorers, ScoreMode scoreMode, float boost)
+      throws IOException {
+    super(weight, subScorers, scoreMode, boost);
+  }
+
+  @Override
+  public float score(List<Scorer> subScorers) throws IOException {
+    int docId = this.docID();
+    return scoreDoc(subScorers, docId);
+  }
+
+  @Override
+  public float smoothingScore(List<Scorer> subScorers, int docId) throws IOException {
+    return scoreDoc(subScorers, docId);
+  }
+
+  private float scoreDoc(List<Scorer> subScorers, int docId) throws IOException {
+    double score = 0;
+    double boostSum = 0.0;
+    for (Scorer scorer : subScorers) {
+      if (scorer instanceof IndriScorer) {
+        IndriScorer indriScorer = (IndriScorer) scorer;
+        int scorerDocId = indriScorer.docID();
+        // If the query exists in the document, score the document
+        // Otherwise, compute a smoothing score, which acts like an idf
+        // for subqueries/terms
+        double tempScore = 0;
+        if (docId == scorerDocId) {
+          tempScore = indriScorer.score();
+        } else {
+          tempScore = indriScorer.smoothingScore(docId);
+        }
+        tempScore *= indriScorer.getBoost();
+        score += tempScore;
+        boostSum += indriScorer.getBoost();
+      }
+    }
+    if (boostSum == 0) {
+      return 0;
+    } else {
+      return (float) (score / boostSum);
+    }
+  }
+}
--- a/lucene/core/src/java/org/apache/lucene/search/IndriAndWeight.java
+++ b/lucene/core/src/java/org/apache/lucene/search/IndriAndWeight.java
@ -0,0 +1,123 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import org.apache.lucene.index.LeafReaderContext;
+
+/** The Weight for IndriAndQuery, used to normalize, score and explain these queries. */
+public class IndriAndWeight extends Weight {
+
+  private final IndriAndQuery query;
+  private final ArrayList<Weight> weights;
+  private final ScoreMode scoreMode;
+  private final float boost;
+
+  public IndriAndWeight(
+      IndriAndQuery query, IndexSearcher searcher, ScoreMode scoreMode, float boost)
+      throws IOException {
+    super(query);
+    this.query = query;
+    this.boost = boost;
+    this.scoreMode = scoreMode;
+    weights = new ArrayList<>();
+    for (BooleanClause c : query) {
+      Weight w = searcher.createWeight(c.getQuery(), scoreMode, 1.0f);
+      weights.add(w);
+    }
+  }
+
+  private Scorer getScorer(LeafReaderContext context) throws IOException {
+    List<Scorer> subScorers = new ArrayList<>();
+
+    for (Weight w : weights) {
+      Scorer scorer = w.scorer(context);
+      if (scorer != null) {
+        subScorers.add(scorer);
+      }
+    }
+
+    if (subScorers.isEmpty()) {
+      return null;
+    }
+    Scorer scorer = subScorers.get(0);
+    if (subScorers.size() > 1) {
+      scorer = new IndriAndScorer(this, subScorers, scoreMode, boost);
+    }
+    return scorer;
+  }
+
+  @Override
+  public Scorer scorer(LeafReaderContext context) throws IOException {
+    return getScorer(context);
+  }
+
+  @Override
+  public BulkScorer bulkScorer(LeafReaderContext context) throws IOException {
+    Scorer scorer = getScorer(context);
+    if (scorer != null) {
+      BulkScorer bulkScorer = new DefaultBulkScorer(scorer);
+      return bulkScorer;
+    }
+    return null;
+  }
+
+  @Override
+  public boolean isCacheable(LeafReaderContext ctx) {
+    for (Weight w : weights) {
+      if (w.isCacheable(ctx) == false) return false;
+    }
+    return true;
+  }
+
+  @Override
+  public Explanation explain(LeafReaderContext context, int doc) throws IOException {
+    List<Explanation> subs = new ArrayList<>();
+    boolean fail = false;
+    Iterator<BooleanClause> cIter = query.iterator();
+    for (Iterator<Weight> wIter = weights.iterator(); wIter.hasNext(); ) {
+      Weight w = wIter.next();
+      BooleanClause c = cIter.next();
+      Explanation e = w.explain(context, doc);
+      if (e.isMatch()) {
+        subs.add(e);
+      } else if (c.isRequired()) {
+        subs.add(
+            Explanation.noMatch(
+                "no match on required clause (" + c.getQuery().toString() + ")", e));
+        fail = true;
+      }
+    }
+    if (fail) {
+      return Explanation.noMatch(
+          "Failure to meet condition(s) of required/prohibited clause(s)", subs);
+    } else {
+      Scorer scorer = scorer(context);
+      if (scorer != null) {
+        int advanced = scorer.iterator().advance(doc);
+        assert advanced == doc;
+        return Explanation.match(scorer.score(), "sum of:", subs);
+      } else {
+        return Explanation.noMatch(
+            "Failure to meet condition(s) of required/prohibited clause(s)", subs);
+      }
+    }
+  }
+}
--- a/lucene/core/src/java/org/apache/lucene/search/IndriDisjunctionScorer.java
+++ b/lucene/core/src/java/org/apache/lucene/search/IndriDisjunctionScorer.java
@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.List;
+
+/**
+ * The Indri implemenation of a disjunction scorer which stores the subscorers for the child
+ * queries. The score and smoothingScore methods use the list of all subscorers and not just the
+ * matches so that a smoothingScore can be calculated if there is not an exact match.
+ */
+public abstract class IndriDisjunctionScorer extends IndriScorer {
+
+  private final List<Scorer> subScorersList;
+  private final DisiPriorityQueue subScorers;
+  private final DocIdSetIterator approximation;
+
+  protected IndriDisjunctionScorer(
+      Weight weight, List<Scorer> subScorersList, ScoreMode scoreMode, float boost) {
+    super(weight, boost);
+    this.subScorersList = subScorersList;
+    this.subScorers = new DisiPriorityQueue(subScorersList.size());
+    for (Scorer scorer : subScorersList) {
+      final DisiWrapper w = new DisiWrapper(scorer);
+      this.subScorers.add(w);
+    }
+    this.approximation = new DisjunctionDISIApproximation(this.subScorers);
+  }
+
+  @Override
+  public DocIdSetIterator iterator() {
+    return approximation;
+  }
+
+  @Override
+  public float getMaxScore(int upTo) throws IOException {
+    return 0;
+  }
+
+  public List<Scorer> getSubMatches() throws IOException {
+    return subScorersList;
+  }
+
+  abstract float score(List<Scorer> subScorers) throws IOException;
+
+  public abstract float smoothingScore(List<Scorer> subScorers, int docId) throws IOException;
+
+  @Override
+  public float score() throws IOException {
+    return score(getSubMatches());
+  }
+
+  @Override
+  public float smoothingScore(int docId) throws IOException {
+    return smoothingScore(getSubMatches(), docId);
+  }
+
+  @Override
+  public int docID() {
+    return subScorers.top().doc;
+  }
+}
--- a/lucene/core/src/java/org/apache/lucene/search/IndriQuery.java
+++ b/lucene/core/src/java/org/apache/lucene/search/IndriQuery.java
@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Objects;
+
+/**
+ * A Basic abstract query that all IndriQueries can extend to implement toString, equals,
+ * getClauses, and iterator.
+ */
+public abstract class IndriQuery extends Query implements Iterable<BooleanClause> {
+
+  private List<BooleanClause> clauses;
+
+  public IndriQuery(List<BooleanClause> clauses) {
+    this.clauses = clauses;
+  }
+
+  @Override
+  public abstract Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
+      throws IOException;
+
+  @Override
+  public String toString(String field) {
+    StringBuilder buffer = new StringBuilder();
+
+    int i = 0;
+    for (BooleanClause c : this) {
+      buffer.append(c.getOccur().toString());
+
+      Query subQuery = c.getQuery();
+      if (subQuery instanceof BooleanQuery) { // wrap sub-bools in parens
+        buffer.append("(");
+        buffer.append(subQuery.toString(field));
+        buffer.append(")");
+      } else {
+        buffer.append(subQuery.toString(field));
+      }
+
+      if (i != clauses.size() - 1) {
+        buffer.append(" ");
+      }
+      i += 1;
+    }
+
+    return buffer.toString();
+  }
+
+  @Override
+  public boolean equals(Object o) {
+    return sameClassAs(o) && equalsTo(getClass().cast(o));
+  }
+
+  @Override
+  public void visit(QueryVisitor visitor) {
+    visitor.visitLeaf(this);
+  }
+
+  private boolean equalsTo(IndriQuery other) {
+    return clauses.equals(other.clauses);
+  }
+
+  @Override
+  public int hashCode() {
+    int hashCode = Objects.hash(clauses);
+    if (hashCode == 0) {
+      hashCode = 1;
+    }
+    return hashCode;
+  }
+
+  @Override
+  public Iterator<BooleanClause> iterator() {
+    return clauses.iterator();
+  }
+
+  public List<BooleanClause> getClauses() {
+    return this.clauses;
+  }
+}
--- a/lucene/core/src/java/org/apache/lucene/search/IndriScorer.java
+++ b/lucene/core/src/java/org/apache/lucene/search/IndriScorer.java
@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.io.IOException;
+
+/**
+ * The Indri parent scorer that stores the boost so that IndriScorers can use the boost outside of
+ * the term.
+ */
+public abstract class IndriScorer extends Scorer {
+
+  private float boost;
+
+  protected IndriScorer(Weight weight, float boost) {
+    super(weight);
+    this.boost = boost;
+  }
+
+  @Override
+  public abstract DocIdSetIterator iterator();
+
+  @Override
+  public abstract float getMaxScore(int upTo) throws IOException;
+
+  @Override
+  public abstract float score() throws IOException;
+
+  public abstract float smoothingScore(int docId) throws IOException;
+
+  @Override
+  public abstract int docID();
+
+  public float getBoost() {
+    return this.boost;
+  }
+}
--- a/lucene/core/src/java/org/apache/lucene/search/Scorable.java
+++ b/lucene/core/src/java/org/apache/lucene/search/Scorable.java
@ -27,6 +27,21 @@ public abstract class Scorable {
  /** Returns the score of the current document matching the query. */
  public abstract float score() throws IOException;

+  /**
+   * Returns the smoothing score of the current document matching the query. This score is used when
+   * the query/term does not appear in the document, and behaves like an idf. The smoothing score is
+   * particularly important when the Scorer returns a product of probabilities so that the document
+   * score does not go to zero when one probability is zero. This can return 0 or a smoothing score.
+   *
+   * <p>Smoothing scores are described in many papers, including: Metzler, D. and Croft, W. B. ,
+   * "Combining the Language Model and Inference Network Approaches to Retrieval," Information
+   * Processing and Management Special Issue on Bayesian Networks and Information Retrieval, 40(5),
+   * pp.735-750.
+   */
+  public float smoothingScore(int docId) throws IOException {
+    return 0f;
+  }
+
  /** Returns the doc ID that is currently being scored. */
  public abstract int docID();

--- a/lucene/core/src/java/org/apache/lucene/search/TermScorer.java
+++ b/lucene/core/src/java/org/apache/lucene/search/TermScorer.java
@ -75,6 +75,11 @@ public final class TermScorer extends Scorer {
    return docScorer.score(postingsEnum.docID(), postingsEnum.freq());
  }

+  @Override
+  public float smoothingScore(int docId) throws IOException {
+    return docScorer.score(docId, 0);
+  }
+
  @Override
  public int advanceShallow(int target) throws IOException {
    return impactsDisi.advanceShallow(target);
--- a/lucene/core/src/java/org/apache/lucene/search/similarities/IndriDirichletSimilarity.java
+++ b/lucene/core/src/java/org/apache/lucene/search/similarities/IndriDirichletSimilarity.java
@ -0,0 +1,112 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.similarities;
+
+import java.util.List;
+import java.util.Locale;
+import org.apache.lucene.search.Explanation;
+
+/**
+ * Bayesian smoothing using Dirichlet priors as implemented in the Indri Search engine
+ * (http://www.lemurproject.org/indri.php). Indri Dirichelet Smoothing!
+ *
+ * <pre class="prettyprint">
+ * tf_E + mu*P(t|D) P(t|E)= documentLength + documentMu
+ * mu*P(t|C) + tf_D where P(t|D)= doclen + mu
+ * </pre>
+ *
+ * <p>A larger value for mu, produces more smoothing. Smoothing is most important for short
+ * documents where the probabilities are more granular.
+ */
+public class IndriDirichletSimilarity extends LMSimilarity {
+
+  /** The &mu; parameter. */
+  private final float mu;
+
+  /** Instantiates the similarity with the provided &mu; parameter. */
+  public IndriDirichletSimilarity(CollectionModel collectionModel, float mu) {
+    super(collectionModel);
+    this.mu = mu;
+  }
+
+  /** Instantiates the similarity with the provided &mu; parameter. */
+  public IndriDirichletSimilarity(float mu) {
+    this.mu = mu;
+  }
+
+  /** Instantiates the similarity with the default &mu; value of 2000. */
+  public IndriDirichletSimilarity(CollectionModel collectionModel) {
+    this(collectionModel, 2000);
+  }
+
+  /** Instantiates the similarity with the default &mu; value of 2000. */
+  public IndriDirichletSimilarity() {
+    this(new IndriCollectionModel(), 2000);
+  }
+
+  @Override
+  protected double score(BasicStats stats, double freq, double docLen) {
+    double collectionProbability = ((LMStats) stats).getCollectionProbability();
+    double score = (freq + (mu * collectionProbability)) / (docLen + mu);
+    return (Math.log(score));
+  }
+
+  @Override
+  protected void explain(List<Explanation> subs, BasicStats stats, double freq, double docLen) {
+    if (stats.getBoost() != 1.0f) {
+      subs.add(Explanation.match(stats.getBoost(), "boost"));
+    }
+
+    subs.add(Explanation.match(mu, "mu"));
+    double collectionProbability = ((LMStats) stats).getCollectionProbability();
+    Explanation weightExpl =
+        Explanation.match(
+            (float) Math.log((freq + (mu * collectionProbability)) / (docLen + mu)), "term weight");
+    subs.add(weightExpl);
+    subs.add(Explanation.match((float) Math.log(mu / (docLen + mu)), "document norm"));
+    super.explain(subs, stats, freq, docLen);
+  }
+
+  /** Returns the &mu; parameter. */
+  public float getMu() {
+    return mu;
+  }
+
+  public String getName() {
+    return String.format(Locale.ROOT, "IndriDirichlet(%f)", getMu());
+  }
+
+  /**
+   * Models {@code p(w|C)} as the number of occurrences of the term in the collection, divided by
+   * the total number of tokens {@code + 1}.
+   */
+  public static class IndriCollectionModel implements CollectionModel {
+
+    /** Sole constructor: parameter-free */
+    public IndriCollectionModel() {}
+
+    @Override
+    public double computeProbability(BasicStats stats) {
+      return ((double) stats.getTotalTermFreq()) / ((double) stats.getNumberOfFieldTokens());
+    }
+
+    @Override
+    public String getName() {
+      return null;
+    }
+  }
+}
--- a/lucene/core/src/test/org/apache/lucene/search/TestIndriAndQuery.java
+++ b/lucene/core/src/test/org/apache/lucene/search/TestIndriAndQuery.java
@ -0,0 +1,165 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search;
+
+import java.util.Arrays;
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.MockTokenFilter;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.lucene.search.similarities.IndriDirichletSimilarity;
+import org.apache.lucene.search.similarities.Similarity;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+
+public class TestIndriAndQuery extends LuceneTestCase {
+
+  /** threshold for comparing floats */
+  public static final float SCORE_COMP_THRESH = 0.0000f;
+
+  public Similarity sim = new IndriDirichletSimilarity();
+  public Directory index;
+  public IndexReader r;
+  public IndexSearcher s;
+
+  @Override
+  public void setUp() throws Exception {
+    super.setUp();
+
+    index = newDirectory();
+    RandomIndexWriter writer =
+        new RandomIndexWriter(
+            random(),
+            index,
+            newIndexWriterConfig(
+                    new MockAnalyzer(
+                        random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET))
+                .setSimilarity(sim)
+                .setMergePolicy(newLogMergePolicy()));
+    // Query is "President Washington"
+    {
+      Document d1 = new Document();
+      d1.add(newField("id", "d1", TextField.TYPE_STORED));
+      d1.add(
+          newTextField(
+              "body", "President Washington was the first leader of the US", Field.Store.YES));
+      writer.addDocument(d1);
+    }
+
+    {
+      Document d2 = new Document();
+      d2.add(newField("id", "d2", TextField.TYPE_STORED));
+      d2.add(
+          newTextField(
+              "body",
+              "The president is head of the executive branch of government",
+              Field.Store.YES));
+      writer.addDocument(d2);
+    }
+
+    {
+      Document d3 = new Document();
+      d3.add(newField("id", "d3", TextField.TYPE_STORED));
+      d3.add(
+          newTextField(
+              "body", "George Washington was a general in the Revolutionary War", Field.Store.YES));
+      writer.addDocument(d3);
+    }
+
+    {
+      Document d4 = new Document();
+      d4.add(newField("id", "d4", TextField.TYPE_STORED));
+      d4.add(newTextField("body", "A company or college can have a president", Field.Store.YES));
+      writer.addDocument(d4);
+    }
+
+    writer.forceMerge(1);
+    r = getOnlyLeafReader(writer.getReader());
+    writer.close();
+    s = new IndexSearcher(r);
+    s.setSimilarity(sim);
+  }
+
+  @Override
+  public void tearDown() throws Exception {
+    r.close();
+    index.close();
+    super.tearDown();
+  }
+
+  public void testSimpleQuery1() throws Exception {
+
+    BooleanClause clause1 = new BooleanClause(tq("body", "george"), Occur.SHOULD);
+    BooleanClause clause2 = new BooleanClause(tq("body", "washington"), Occur.SHOULD);
+
+    IndriAndQuery q = new IndriAndQuery(Arrays.asList(clause1, clause2));
+
+    ScoreDoc[] h = s.search(q, 1000).scoreDocs;
+
+    try {
+      assertEquals("2 docs should match " + q.toString(), 2, h.length);
+    } catch (Error e) {
+      printHits("testSimpleEqualScores1", h, s);
+      throw e;
+    }
+  }
+
+  public void testSimpleQuery2() throws Exception {
+
+    BooleanClause clause1 = new BooleanClause(tq("body", "president"), Occur.SHOULD);
+    BooleanClause clause2 = new BooleanClause(tq("body", "washington"), Occur.SHOULD);
+
+    IndriAndQuery q = new IndriAndQuery(Arrays.asList(clause1, clause2));
+
+    ScoreDoc[] h = s.search(q, 1000).scoreDocs;
+
+    try {
+      assertEquals("all docs should match " + q.toString(), 4, h.length);
+    } catch (Error e) {
+      printHits("testSimpleEqualScores1", h, s);
+      throw e;
+    }
+  }
+
+  /** macro */
+  protected Query tq(String f, String t) {
+    return new TermQuery(new Term(f, t));
+  }
+
+  /** macro */
+  protected Query tq(String f, String t, float b) {
+    Query q = tq(f, t);
+    return new BoostQuery(q, b);
+  }
+
+  protected void printHits(String test, ScoreDoc[] h, IndexSearcher searcher) throws Exception {
+
+    System.err.println("------- " + test + " -------");
+
+    for (int i = 0; i < h.length; i++) {
+      Document d = searcher.doc(h[i].doc);
+      float score = h[i].score;
+      System.err.println("#" + i + ": " + score + " - " + d.get("body"));
+    }
+  }
+}
--- a/lucene/core/src/test/org/apache/lucene/search/similarities/TestIndriDirichletSimilarity.java
+++ b/lucene/core/src/test/org/apache/lucene/search/similarities/TestIndriDirichletSimilarity.java
@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.similarities;
+
+import java.util.Random;
+
+public class TestIndriDirichletSimilarity extends BaseSimilarityTestCase {
+
+  @Override
+  protected Similarity getSimilarity(Random random) {
+    // smoothing parameter mu, unbounded
+    final float mu;
+    switch (random.nextInt(4)) {
+      case 0:
+        // minimum value
+        mu = 0;
+        break;
+      case 1:
+        // tiny value
+        mu = Float.MIN_VALUE;
+        break;
+      case 2:
+        // maximum value
+        // we just limit the test to "reasonable" mu values but don't enforce
+        // this anywhere.
+        mu = Integer.MAX_VALUE;
+        break;
+      default:
+        // random value
+        mu = Integer.MAX_VALUE * random.nextFloat();
+        break;
+    }
+    return new IndriDirichletSimilarity(mu);
+  }
+}
--- a/lucene/test-framework/src/java/org/apache/lucene/search/similarities/BaseSimilarityTestCase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/search/similarities/BaseSimilarityTestCase.java
@ -368,7 +368,9 @@ public abstract class BaseSimilarityTestCase extends LuceneTestCase {
      float score = scorer.score(freq, norm);
      // check that score isn't infinite or negative
      assertTrue("infinite/NaN score: " + score, Float.isFinite(score));
-      assertTrue("negative score: " + score, score >= 0);
+      if (!(similarity instanceof IndriDirichletSimilarity)) {
+        assertTrue("negative score: " + score, score >= 0);
+      }
      assertTrue("greater than maxScore: " + score + ">" + maxScore, score <= maxScore);
      // check explanation matches
      Explanation explanation =
@ -395,7 +397,9 @@ public abstract class BaseSimilarityTestCase extends LuceneTestCase {
      float prevScore = scorer.score(prevFreq, norm);
      // check that score isn't infinite or negative
      assertTrue(Float.isFinite(prevScore));
-      assertTrue(prevScore >= 0);
+      if (!(similarity instanceof IndriDirichletSimilarity)) {
+        assertTrue(prevScore >= 0);
+      }
      // check explanation matches
      Explanation prevExplanation =
          scorer.explain(
@ -419,7 +423,9 @@ public abstract class BaseSimilarityTestCase extends LuceneTestCase {
        float prevNormScore = scorer.score(freq, norm - 1);
        // check that score isn't infinite or negative
        assertTrue(Float.isFinite(prevNormScore));
-        assertTrue(prevNormScore >= 0);
+        if (!(similarity instanceof IndriDirichletSimilarity)) {
+          assertTrue(prevNormScore >= 0);
+        }
        // check explanation matches
        Explanation prevNormExplanation =
            scorer.explain(
@ -459,7 +465,9 @@ public abstract class BaseSimilarityTestCase extends LuceneTestCase {
        float prevTermScore = prevTermScorer.score(freq, norm);
        // check that score isn't infinite or negative
        assertTrue(Float.isFinite(prevTermScore));
-        assertTrue(prevTermScore >= 0);
+        if (!(similarity instanceof IndriDirichletSimilarity)) {
+          assertTrue(prevTermScore >= 0);
+        }
        // check explanation matches
        Explanation prevTermExplanation =
            prevTermScorer.explain(
--- a/solr/contrib/ltr/src/test/org/apache/solr/ltr/feature/TestFeature.java
+++ b/solr/contrib/ltr/src/test/org/apache/solr/ltr/feature/TestFeature.java
@ -33,6 +33,9 @@ public class TestFeature extends SolrTestCase {
        // the FilterFeatureScorer may simply inherit Scorer's default implementation
        if (scorerClassMethod.getName().equals("twoPhaseIterator")) continue;

+        // the FilterFeatureScorer may simply inherit Scorer's default implementation
+        if (scorerClassMethod.getName().equals("smoothingScore")) continue;
+        
        // the FilterFeatureScorer's implementation does not influence its parent Weight
        if (scorerClassMethod.getName().equals("getWeight")) continue;