Karl Wright 2017-12-13 07:10:13 -05:00
commit 5fe3d9259a
40 changed files with 1173 additions and 193 deletions

View File

@ -37,6 +37,9 @@ Changes in Runtime Behavior
divide-by-zero hacks. IndexSearcher.termStatistics/collectionStatistics return null
instead of returning bogus values for a non-existent term or field. (Robert Muir)
* LUCENE-7996: FunctionQuery and FunctionScoreQuery now return a score of 0
when the function produces a negative value. (Adrien Grand)
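A minimal sketch of the clamping behavior this entry describes (the helper name is illustrative and not part of the commit):

    // Hypothetical illustration only: per LUCENE-7996, a negative function
    // value now yields a score of 0 instead of a negative score.
    static float clampNegativeScore(double functionValue) {
      return (float) Math.max(0d, functionValue);
    }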
Improvements
* LUCENE-7997: Add BaseSimilarityTestCase to sanity check similarities.
@ -44,6 +47,9 @@ Improvements
Add missing range checks for similarity parameters.
Improve BM25 and ClassicSimilarity's explanations. (Robert Muir)
* LUCENE-8011: Improved similarity explanations.
(Mayya Sharipova via Adrien Grand)
Optimizations
* LUCENE-8040: Optimize IndexSearcher.collectionStatistics, avoiding MultiFields/MultiTerms

View File

@ -716,7 +716,7 @@ final class DocumentsWriterFlushControl implements Accountable {
return infoStream;
}
ThreadState findLargestNonPendingWriter() {
synchronized ThreadState findLargestNonPendingWriter() {
ThreadState maxRamUsingThreadState = null;
long maxRamSoFar = 0;
Iterator<ThreadState> activePerThreadsIterator = allActiveThreadStates();

View File

@ -39,10 +39,14 @@ public class AfterEffectB extends AfterEffect {
public final Explanation explain(BasicStats stats, double tfn) {
return Explanation.match(
(float) (scoreTimes1pTfn(stats) / (1 + tfn)),
getClass().getSimpleName() + ", computed from: ",
Explanation.match((float) tfn, "tfn"),
Explanation.match(stats.getTotalTermFreq(), "totalTermFreq"),
Explanation.match(stats.getDocFreq(), "docFreq"));
getClass().getSimpleName()
+ ", computed as (F + 1) / (n * (tfn + 1)) from:",
Explanation.match((float) tfn, "tfn, normalized term frequency"),
Explanation.match(stats.getTotalTermFreq(),
"F, total number of occurrences of term across all documents + 1"),
Explanation.match(stats.getDocFreq(),
"n, number of documents containing term + 1"),
Explanation.match((float) tfn, "tfn, normalized term frequency"));
}
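For reference, the formula quoted in the new explanation message corresponds to

    B = \frac{F + 1}{n \, (tfn + 1)}

with F the total term frequency, n the document frequency, and tfn the normalized term frequency, matching the sub-explanations above (this restates the message, it adds no behavior).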
@Override

View File

@ -37,8 +37,8 @@ public class AfterEffectL extends AfterEffect {
public final Explanation explain(BasicStats stats, double tfn) {
return Explanation.match(
(float) (scoreTimes1pTfn(stats) / (1 + tfn)),
getClass().getSimpleName() + ", computed from: ",
Explanation.match((float) tfn, "tfn"));
getClass().getSimpleName() + ", computed as 1 / (tfn + 1) from:",
Explanation.match((float) tfn, "tfn, normalized term frequency"));
}
@Override

View File

@ -118,16 +118,19 @@ public abstract class Axiomatic extends SimilarityBase {
protected void explain(List<Explanation> subs, BasicStats stats, int doc,
double freq, double docLen) {
if (stats.getBoost() != 1.0d) {
subs.add(Explanation.match((float) stats.getBoost(), "boost"));
subs.add(Explanation.match((float) stats.getBoost(),
"boost, query boost"));
}
subs.add(Explanation.match(this.k, "k"));
subs.add(Explanation.match(this.s, "s"));
subs.add(Explanation.match(this.queryLen, "queryLen"));
subs.add(Explanation.match((float) tf(stats, freq, docLen), "tf"));
subs.add(Explanation.match((float) ln(stats, freq, docLen), "ln"));
subs.add(Explanation.match((float) tfln(stats, freq, docLen), "tfln"));
subs.add(Explanation.match((float) idf(stats, freq, docLen), "idf"));
subs.add(Explanation.match(this.k,
"k, hyperparam for the primitive weighting function"));
subs.add(Explanation.match(this.s,
"s, hyperparam for the growth function"));
subs.add(Explanation.match(this.queryLen, "queryLen, query length"));
subs.add(tfExplain(stats, freq, docLen));
subs.add(lnExplain(stats, freq, docLen));
subs.add(tflnExplain(stats, freq, docLen));
subs.add(idfExplain(stats, freq, docLen));
subs.add(Explanation.match((float) gamma(stats, freq, docLen), "gamma"));
super.explain(subs, stats, doc, freq, docLen);
}
@ -162,4 +165,47 @@ public abstract class Axiomatic extends SimilarityBase {
* compute the gamma component (only for F3EXp and F3LOG)
*/
protected abstract double gamma(BasicStats stats, double freq, double docLen);
/**
* Explain the score of the term frequency component for a single document
* @param stats the corpus level statistics
* @param freq number of occurrences of term in the document
* @param docLen the document length
* @return Explanation of how the tf component was computed
*/
protected abstract Explanation tfExplain(BasicStats stats,
double freq, double docLen);
/**
* Explain the score of the document length component for a single document
* @param stats the corpus level statistics
* @param freq number of occurrences of term in the document
* @param docLen the document length
* @return Explanation of how the ln component was computed
*/
protected abstract Explanation lnExplain(BasicStats stats,
double freq, double docLen);
/**
* Explain the score of the mixed term frequency and
* document length component for a single document
* @param stats the corpus level statistics
* @param freq number of occurrences of term in the document
* @param docLen the document length
* @return Explanation of how the tfln component was computed
*/
protected abstract Explanation tflnExplain(BasicStats stats,
double freq, double docLen);
/**
* Explain the score of the inverted document frequency component
* for a single document
* @param stats the corpus level statistics
* @param freq number of occurrences of term in the document
* @param docLen the document length
* @return Explanation of how the idf component was computed
*/
protected abstract Explanation idfExplain(BasicStats stats, double freq, double docLen);
}

View File

@ -16,6 +16,9 @@
*/
package org.apache.lucene.search.similarities;
import org.apache.lucene.search.Explanation;
/**
* F1EXP is defined as Sum(tf(term_doc_freq)*ln(docLen)*IDF(term))
* where IDF(t) = pow((N+1)/df(t), k) N=total num of docs, df=doc freq
@ -92,4 +95,37 @@ public class AxiomaticF1EXP extends Axiomatic {
protected double gamma(BasicStats stats, double freq, double docLen) {
return 0.0;
}
@Override
protected Explanation tfExplain(BasicStats stats, double freq, double docLen){
return Explanation.match((float) tf(stats, freq, docLen),
"tf, term frequency computed as 1 + log(1 + log(freq)) from:",
Explanation.match((float) freq,
"freq, number of occurrences of term in the document"));
};
@Override
protected Explanation lnExplain(BasicStats stats, double freq, double docLen){
return Explanation.match((float) ln(stats, freq, docLen),
"ln, document length computed as (avgdl + s) / (avgdl + dl * s) from:",
Explanation.match((float) stats.getAvgFieldLength(),
"avgdl, average length of field across all documents"),
Explanation.match((float) docLen,
"dl, length of field"));
};
protected Explanation tflnExplain(BasicStats stats, double freq, double docLen){
return Explanation.match((float) tfln(stats, freq, docLen),
"tfln, mixed term frequency and document length, equals to 1");
};
protected Explanation idfExplain(BasicStats stats, double freq, double docLen){
return Explanation.match((float) idf(stats, freq, docLen),
"idf, inverted document frequency computed as " +
"Math.pow((N + 1) / n, k) from:",
Explanation.match((float) stats.getNumberOfDocuments(),
"N, total number of documents with field"),
Explanation.match((float) stats.getDocFreq(),
"n, number of documents containing term"));
};
}
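Reading the per-component strings added above together, the F1EXP per-term contribution can be summarized as (a restatement of the explanation text; log is the natural logarithm used by the code):

    score(t, d) = \left(1 + \log(1 + \log(freq))\right) \cdot \frac{avgdl + s}{avgdl + dl \cdot s} \cdot \left(\frac{N + 1}{n}\right)^{k}

where freq is the in-document term frequency, dl the field length, avgdl the average field length, N the number of documents, n the document frequency, and s, k the hyperparameters.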

View File

@ -16,6 +16,8 @@
*/
package org.apache.lucene.search.similarities;
import org.apache.lucene.search.Explanation;
/**
* F1LOG is defined as Sum(tf(term_doc_freq)*ln(docLen)*IDF(term))
* where IDF(t) = ln((N+1)/df(t)) N=total num of docs, df=doc freq
@ -85,4 +87,36 @@ public class AxiomaticF1LOG extends Axiomatic {
protected double gamma(BasicStats stats, double freq, double docLen) {
return 0.0;
}
@Override
protected Explanation tfExplain(BasicStats stats, double freq, double docLen){
return Explanation.match((float) tf(stats, freq, docLen),
"tf, term frequency computed as 1 + log(1 + log(freq)) from:",
Explanation.match((float) freq,
"freq, number of occurrences of term in the document"));
};
@Override
protected Explanation lnExplain(BasicStats stats, double freq, double docLen){
return Explanation.match((float) ln(stats, freq, docLen),
"ln, document length computed as (avgdl + s) / (avgdl + dl * s) from:",
Explanation.match((float) stats.getAvgFieldLength(),
"avgdl, average length of field across all documents"),
Explanation.match((float) docLen,
"dl, length of field"));
};
protected Explanation tflnExplain(BasicStats stats, double freq, double docLen){
return Explanation.match((float) tfln(stats, freq, docLen),
"tfln, mixed term frequency and document length, equals to 1");
};
protected Explanation idfExplain(BasicStats stats, double freq, double docLen){
return Explanation.match((float) idf(stats, freq, docLen),
"idf, inverted document frequency computed as log((N + 1) / n) from:",
Explanation.match((float) stats.getNumberOfDocuments(),
"N, total number of documents with field"),
Explanation.match((float) stats.getDocFreq(),
"n, number of documents containing term"));
};
}

View File

@ -16,6 +16,8 @@
*/
package org.apache.lucene.search.similarities;
import org.apache.lucene.search.Explanation;
/**
* F2EXP is defined as Sum(tfln(term_doc_freq, docLen)*IDF(term))
* where IDF(t) = pow((N+1)/df(t), k) N=total num of docs, df=doc freq
@ -91,4 +93,38 @@ public class AxiomaticF2EXP extends Axiomatic {
protected double gamma(BasicStats stats, double freq, double docLen) {
return 0.0;
}
@Override
protected Explanation tfExplain(BasicStats stats, double freq, double docLen){
return Explanation.match((float) tf(stats, freq, docLen),
"tf, term frequency, equals to 1");
};
@Override
protected Explanation lnExplain(BasicStats stats, double freq, double docLen){
return Explanation.match((float) ln(stats, freq, docLen),
"ln, document length, equals to 1");
};
protected Explanation tflnExplain(BasicStats stats, double freq, double docLen){
return Explanation.match((float) tfln(stats, freq, docLen),
"tfln, mixed term frequency and document length, " +
"computed as freq / (freq + s + s * dl / avgdl) from:",
Explanation.match((float) freq,
"freq, number of occurrences of term in the document"),
Explanation.match((float) docLen,
"dl, length of field"),
Explanation.match((float) stats.getAvgFieldLength(),
"avgdl, average length of field across all documents"));
};
protected Explanation idfExplain(BasicStats stats, double freq, double docLen){
return Explanation.match((float) idf(stats, freq, docLen),
"idf, inverted document frequency computed as " +
"Math.pow((N + 1) / n, k) from:",
Explanation.match((float) stats.getNumberOfDocuments(),
"N, total number of documents with field"),
Explanation.match((float) stats.getDocFreq(),
"n, number of documents containing term"));
};
}

View File

@ -16,6 +16,8 @@
*/
package org.apache.lucene.search.similarities;
import org.apache.lucene.search.Explanation;
/**
* F2EXP is defined as Sum(tfln(term_doc_freq, docLen)*IDF(term))
* where IDF(t) = ln((N+1)/df(t)) N=total num of docs, df=doc freq
@ -83,4 +85,37 @@ public class AxiomaticF2LOG extends Axiomatic {
protected double gamma(BasicStats stats, double freq, double docLen) {
return 0.0;
}
@Override
protected Explanation tfExplain(BasicStats stats, double freq, double docLen){
return Explanation.match((float) tf(stats, freq, docLen),
"tf, term frequency, equals to 1");
};
@Override
protected Explanation lnExplain(BasicStats stats, double freq, double docLen){
return Explanation.match((float) ln(stats, freq, docLen),
"ln, document length, equals to 1");
};
protected Explanation tflnExplain(BasicStats stats, double freq, double docLen){
return Explanation.match((float) tfln(stats, freq, docLen),
"tfln, mixed term frequency and document length, " +
"computed as freq / (freq + s + s * dl / avgdl) from:",
Explanation.match((float) freq,
"freq, number of occurrences of term in the document"),
Explanation.match((float) docLen,
"dl, length of field"),
Explanation.match((float) stats.getAvgFieldLength(),
"avgdl, average length of field across all documents"));
};
protected Explanation idfExplain(BasicStats stats, double freq, double docLen){
return Explanation.match((float) idf(stats, freq, docLen),
"idf, inverted document frequency computed as log((N + 1) / n) from:",
Explanation.match((float) stats.getNumberOfDocuments(),
"N, total number of documents with field"),
Explanation.match((float) stats.getDocFreq(),
"n, number of documents containing term"));
};
}

View File

@ -16,6 +16,8 @@
*/
package org.apache.lucene.search.similarities;
import org.apache.lucene.search.Explanation;
/**
* F3EXP is defined as Sum(tf(term_doc_freq)*IDF(term)-gamma(docLen, queryLen))
* where IDF(t) = pow((N+1)/df(t), k) N=total num of docs, df=doc freq
@ -91,4 +93,33 @@ public class AxiomaticF3EXP extends Axiomatic {
protected double gamma(BasicStats stats, double freq, double docLen) {
return (docLen - this.queryLen) * this.s * this.queryLen / stats.getAvgFieldLength();
}
@Override
protected Explanation tfExplain(BasicStats stats, double freq, double docLen){
return Explanation.match((float) tf(stats, freq, docLen),
"tf, term frequency computed as 1 + log(1 + log(freq)) from:",
Explanation.match((float) freq,
"freq, number of occurrences of term in the document"));
};
@Override
protected Explanation lnExplain(BasicStats stats, double freq, double docLen){
return Explanation.match((float) ln(stats, freq, docLen),
"ln, document length, equals to 1");
};
protected Explanation tflnExplain(BasicStats stats, double freq, double docLen){
return Explanation.match((float) tfln(stats, freq, docLen),
"tfln, mixed term frequency and document length, equals to 1");
};
protected Explanation idfExplain(BasicStats stats, double freq, double docLen){
return Explanation.match((float) idf(stats, freq, docLen),
"idf, inverted document frequency computed as " +
"Math.pow((N + 1) / n, k) from:",
Explanation.match((float) stats.getNumberOfDocuments(),
"N, total number of documents with field"),
Explanation.match((float) stats.getDocFreq(),
"n, number of documents containing term"));
};
}

View File

@ -16,6 +16,8 @@
*/
package org.apache.lucene.search.similarities;
import org.apache.lucene.search.Explanation;
/**
* F3EXP is defined as Sum(tf(term_doc_freq)*IDF(term)-gamma(docLen, queryLen))
* where IDF(t) = ln((N+1)/df(t)) N=total num of docs, df=doc freq
@ -80,4 +82,32 @@ public class AxiomaticF3LOG extends Axiomatic {
protected double gamma(BasicStats stats, double freq, double docLen) {
return (docLen - this.queryLen) * this.s * this.queryLen / stats.getAvgFieldLength();
}
@Override
protected Explanation tfExplain(BasicStats stats, double freq, double docLen){
return Explanation.match((float) tf(stats, freq, docLen),
"tf, term frequency computed as 1 + log(1 + log(freq)) from:",
Explanation.match((float) freq,
"freq, number of occurrences of term in the document"));
};
@Override
protected Explanation lnExplain(BasicStats stats, double freq, double docLen){
return Explanation.match((float) ln(stats, freq, docLen),
"ln, document length, equals to 1");
};
protected Explanation tflnExplain(BasicStats stats, double freq, double docLen){
return Explanation.match((float) tfln(stats, freq, docLen),
"tfln, mixed term frequency and document length, equals to 1");
};
protected Explanation idfExplain(BasicStats stats, double freq, double docLen){
return Explanation.match((float) idf(stats, freq, docLen),
"idf, inverted document frequency computed as log((N + 1) / n) from:",
Explanation.match((float) stats.getNumberOfDocuments(),
"N, total number of documents with field"),
Explanation.match((float) stats.getDocFreq(),
"n, number of documents containing term"));
};
}

View File

@ -41,21 +41,13 @@ public abstract class BasicModel {
* non-decreasing with {@code tfn}. */
public abstract double score(BasicStats stats, double tfn, double aeTimes1pTfn);
/**
* Returns an explanation for the score.
* <p>Most basic models use the number of documents and the total term
* frequency to compute Inf<sub>1</sub>. This method provides a generic
* explanation for such models. Subclasses that use other statistics must
* override this method.</p>
* Subclasses must override this method.
*/
public Explanation explain(BasicStats stats, double tfn, double aeTimes1pTfn) {
return Explanation.match(
(float) (score(stats, tfn, aeTimes1pTfn) * (1 + tfn) / aeTimes1pTfn),
getClass().getSimpleName() + ", computed from: ",
Explanation.match(stats.getNumberOfDocuments(), "numberOfDocuments"),
Explanation.match(stats.getTotalTermFreq(), "totalTermFreq"));
}
public abstract Explanation explain (BasicStats stats, double tfn, double aeTimes1pTfn);
/**
* Subclasses must override this method to return the code of the
* basic model formula. Refer to the original paper for the list.

View File

@ -17,6 +17,8 @@
package org.apache.lucene.search.similarities;
import org.apache.lucene.search.Explanation;
import static org.apache.lucene.search.similarities.SimilarityBase.log2;
/**
@ -48,6 +50,26 @@ public class BasicModelG extends BasicModel {
return (B - (B - A) / (1 + tfn)) * aeTimes1pTfn;
}
@Override
public Explanation explain(BasicStats stats, double tfn, double aeTimes1pTfn) {
double F = stats.getTotalTermFreq() + 1;
double N = stats.getNumberOfDocuments();
double lambda = F / (N + F);
Explanation explLambda = Explanation.match((float) lambda,
"lambda, computed as F / (N + F) from:",
Explanation.match((float) F,
"F, total number of occurrences of term across all docs + 1"),
Explanation.match((float) N,
"N, total number of documents with field"));
return Explanation.match(
(float) (score(stats, tfn, aeTimes1pTfn) * (1 + tfn) / aeTimes1pTfn),
getClass().getSimpleName() + ", computed as " +
"log2(lambda + 1) + tfn * log2((1 + lambda) / lambda) from:",
Explanation.match((float) tfn, "tfn, normalized term frequency"),
explLambda);
}
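In equation form, the geometric basic model explanation added here states (restating the strings above):

    \lambda = \frac{F}{N + F}, \qquad Inf_1 = \log_2(\lambda + 1) + tfn \cdot \log_2\frac{1 + \lambda}{\lambda}

with F the total term frequency (+1) and N the number of documents.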
@Override
public String toString() {
return "G";

View File

@ -17,6 +17,8 @@
package org.apache.lucene.search.similarities;
import org.apache.lucene.search.Explanation;
import static org.apache.lucene.search.similarities.SimilarityBase.log2;
/**
@ -42,6 +44,19 @@ public class BasicModelIF extends BasicModel {
return A * aeTimes1pTfn * (1 - 1 / (1 + tfn));
}
@Override
public Explanation explain(BasicStats stats, double tfn, double aeTimes1pTfn) {
return Explanation.match(
(float) (score(stats, tfn, aeTimes1pTfn) * (1 + tfn) / aeTimes1pTfn),
getClass().getSimpleName() + ", computed as " +
"tfn * log2(1 + (N + 1) / (F + 0.5)) from:",
Explanation.match((float) tfn, "tfn, normalized term frequency"),
Explanation.match(stats.getNumberOfDocuments(),
"N, total number of documents with field"),
Explanation.match(stats.getTotalTermFreq(),
"F, total number of occurrences of term across all documents"));
}
@Override
public String toString() {
return "I(F)";

View File

@ -47,9 +47,13 @@ public class BasicModelIn extends BasicModel {
public final Explanation explain(BasicStats stats, double tfn, double aeTimes1pTfn) {
return Explanation.match(
(float) (score(stats, tfn, aeTimes1pTfn) * (1 + tfn) / aeTimes1pTfn),
getClass().getSimpleName() + ", computed from: ",
Explanation.match(stats.getNumberOfDocuments(), "numberOfDocuments"),
Explanation.match(stats.getDocFreq(), "docFreq"));
getClass().getSimpleName() +
", computed as tfn * log2((N + 1) / (n + 0.5)) from:",
Explanation.match((float) tfn, "tfn, normalized term frequency"),
Explanation.match(stats.getNumberOfDocuments(),
"N, total number of documents with field"),
Explanation.match(stats.getDocFreq(),
"n, number of documents containing term"));
}
@Override

View File

@ -17,6 +17,7 @@
package org.apache.lucene.search.similarities;
import org.apache.lucene.search.Explanation;
import static org.apache.lucene.search.similarities.SimilarityBase.log2;
/**
@ -44,6 +45,26 @@ public class BasicModelIne extends BasicModel {
return A * aeTimes1pTfn * (1 - 1 / (1 + tfn));
}
@Override
public Explanation explain(BasicStats stats, double tfn, double aeTimes1pTfn) {
double F = stats.getTotalTermFreq();
double N = stats.getNumberOfDocuments();
double ne = N * (1 - Math.pow((N - 1) / N, F));
Explanation explNe = Explanation.match((float) ne,
"ne, computed as N * (1 - Math.pow((N - 1) / N, F)) from:",
Explanation.match((float) F,
"F, total number of occurrences of term across all docs"),
Explanation.match((float) N,
"N, total number of documents with field"));
return Explanation.match(
(float) (score(stats, tfn, aeTimes1pTfn) * (1 + tfn) / aeTimes1pTfn),
getClass().getSimpleName() + ", computed as " +
"tfn * log2((N + 1) / (ne + 0.5)) from:",
Explanation.match((float) tfn, "tfn, normalized term frequency"),
explNe);
}
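The I(ne) explanation added here corresponds to (restating the strings above):

    n_e = N \left(1 - \left(\frac{N - 1}{N}\right)^{F}\right), \qquad Inf_1 = tfn \cdot \log_2\frac{N + 1}{n_e + 0.5}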
@Override
public String toString() {
return "I(ne)";

View File

@ -77,7 +77,7 @@ public class BooleanSimilarity extends Similarity {
@Override
public Explanation explain(int doc, Explanation freq) throws IOException {
Explanation queryBoostExpl = Explanation.match(boost, "boost");
Explanation queryBoostExpl = Explanation.match(boost, "boost, query boost");
return Explanation.match(
queryBoostExpl.getValue(),
"score(" + getClass().getSimpleName() + ", doc=" + doc + "), computed from:",

View File

@ -17,6 +17,8 @@
package org.apache.lucene.search.similarities;
import org.apache.lucene.search.Explanation;
/**
* Implements the <em>Divergence from Independence (DFI)</em> model based on Chi-square statistics
* (i.e., standardized Chi-squared distance from independence in term frequency tf).
@ -75,6 +77,38 @@ public class DFISimilarity extends SimilarityBase {
return independence;
}
@Override
protected Explanation explain(
BasicStats stats, int doc, Explanation freq, double docLen) {
final double expected = (stats.getTotalTermFreq() + 1) * docLen /
(stats.getNumberOfFieldTokens() + 1);
if (freq.getValue() <= expected){
return Explanation.match((float) 0, "score(" +
getClass().getSimpleName() + ", doc=" + doc + ", freq=" +
freq.getValue() +"), equals to 0");
}
Explanation explExpected = Explanation.match((float) expected,
"expected, computed as (F + 1) * dl / (T + 1) from:",
Explanation.match(stats.getTotalTermFreq(),
"F, total number of occurrences of term across all docs"),
Explanation.match((float) docLen, "dl, length of field"),
Explanation.match(stats.getNumberOfFieldTokens(),
"T, total number of tokens in the field"));
final double measure = independence.score(freq.getValue(), expected);
Explanation explMeasure = Explanation.match((float) measure,
"measure, computed as independence.score(freq, expected) from:",
freq,
explExpected);
return Explanation.match(
(float) score(stats, freq.getValue(), docLen),
"score(" + getClass().getSimpleName() + ", doc=" + doc + ", freq=" +
freq.getValue() +"), computed as boost * log2(measure + 1) from:",
Explanation.match( (float)stats.getBoost(), "boost, query boost"),
explMeasure);
}
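Putting the pieces of the new DFI explanation together (a restatement of the strings above):

    expected = \frac{(F + 1) \cdot dl}{T + 1}, \qquad score = boost \cdot \log_2(measure + 1)

where measure = independence.score(freq, expected), and the score is 0 whenever freq <= expected.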
@Override
public String toString() {
return "DFI(" + independence + ")";

View File

@ -17,6 +17,7 @@
package org.apache.lucene.search.similarities;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.search.Explanation;
@ -122,7 +123,7 @@ public class DFRSimilarity extends SimilarityBase {
protected void explain(List<Explanation> subs,
BasicStats stats, int doc, double freq, double docLen) {
if (stats.getBoost() != 1.0d) {
subs.add(Explanation.match( (float)stats.getBoost(), "boost"));
subs.add(Explanation.match( (float)stats.getBoost(), "boost, query boost"));
}
Explanation normExpl = normalization.explain(stats, freq, docLen);
@ -133,6 +134,20 @@ public class DFRSimilarity extends SimilarityBase {
subs.add(afterEffect.explain(stats, tfn));
}
@Override
protected Explanation explain(
BasicStats stats, int doc, Explanation freq, double docLen) {
List<Explanation> subs = new ArrayList<>();
explain(subs, stats, doc, freq.getValue(), docLen);
return Explanation.match(
(float) score(stats, freq.getValue(), docLen),
"score(" + getClass().getSimpleName() + ", doc=" + doc + ", freq=" +
freq.getValue() +"), computed as boost * " +
"basicModel.score(stats, tfn) * afterEffect.score(stats, tfn) from:",
subs);
}
@Override
public String toString() {
return "DFR " + basicModel.toString() + afterEffect.toString()

View File

@ -17,6 +17,7 @@
package org.apache.lucene.search.similarities;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.search.Explanation;
@ -113,7 +114,7 @@ public class IBSimilarity extends SimilarityBase {
protected void explain(
List<Explanation> subs, BasicStats stats, int doc, double freq, double docLen) {
if (stats.getBoost() != 1.0d) {
subs.add(Explanation.match((float)stats.getBoost(), "boost"));
subs.add(Explanation.match((float)stats.getBoost(), "boost, query boost"));
}
Explanation normExpl = normalization.explain(stats, freq, docLen);
Explanation lambdaExpl = lambda.explain(stats);
@ -121,6 +122,22 @@ public class IBSimilarity extends SimilarityBase {
subs.add(lambdaExpl);
subs.add(distribution.explain(stats, normExpl.getValue(), lambdaExpl.getValue()));
}
@Override
protected Explanation explain(
BasicStats stats, int doc, Explanation freq, double docLen) {
List<Explanation> subs = new ArrayList<>();
explain(subs, stats, doc, freq.getValue(), docLen);
return Explanation.match(
(float) score(stats, freq.getValue(), docLen),
"score(" + getClass().getSimpleName() + ", doc=" + doc + ", freq=" +
freq.getValue() +"), computed as boost * " +
"distribution.score(stats, normalization.tfn(stats, freq," +
" docLen), lambda.lambda(stats)) from:",
subs);
}
/**
* The name of IB methods follow the pattern

View File

@ -17,6 +17,7 @@
package org.apache.lucene.search.similarities;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
@ -86,20 +87,43 @@ public class LMDirichletSimilarity extends LMSimilarity {
protected void explain(List<Explanation> subs, BasicStats stats, int doc,
double freq, double docLen) {
if (stats.getBoost() != 1.0d) {
subs.add(Explanation.match((float) stats.getBoost(), "boost"));
subs.add(Explanation.match((float) stats.getBoost(), "query boost"));
}
double p = ((LMStats)stats).getCollectionProbability();
Explanation explP = Explanation.match((float) p,
"P, probability that the current term is generated by the collection");
Explanation explFreq = Explanation.match((float) freq,
"freq, number of occurrences of term in the document");
subs.add(Explanation.match(mu, "mu"));
Explanation weightExpl = Explanation.match(
(float)Math.log(1 + freq /
(mu * ((LMStats)stats).getCollectionProbability())),
"term weight");
"term weight, computed as log(1 + freq /(mu * P)) from:",
explFreq,
explP);
subs.add(weightExpl);
subs.add(Explanation.match(
(float)Math.log(mu / (docLen + mu)), "document norm"));
(float)Math.log(mu / (docLen + mu)),
"document norm, computed as log(mu / (dl + mu))"));
subs.add(Explanation.match((float) docLen,"dl, length of field"));
super.explain(subs, stats, doc, freq, docLen);
}
@Override
protected Explanation explain(
BasicStats stats, int doc, Explanation freq, double docLen) {
List<Explanation> subs = new ArrayList<>();
explain(subs, stats, doc, freq.getValue(), docLen);
return Explanation.match(
(float) score(stats, freq.getValue(), docLen),
"score(" + getClass().getSimpleName() + ", doc=" + doc + ", freq=" +
freq.getValue() +"), computed as boost * " +
"(term weight + document norm) from:",
subs);
}
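In equation form, the Dirichlet explanation added here reads (restating the strings above):

    weight = \log\left(1 + \frac{freq}{\mu P}\right), \qquad norm = \log\frac{\mu}{dl + \mu}, \qquad score = boost \cdot (weight + norm)

where P is the collection probability of the term.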
/** Returns the &mu; parameter. */
public float getMu() {
return mu;

View File

@ -17,6 +17,7 @@
package org.apache.lucene.search.similarities;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
@ -79,9 +80,31 @@ public class LMJelinekMercerSimilarity extends LMSimilarity {
subs.add(Explanation.match((float) stats.getBoost(), "boost"));
}
subs.add(Explanation.match(lambda, "lambda"));
double p = ((LMStats)stats).getCollectionProbability();
Explanation explP = Explanation.match((float) p,
"P, probability that the current term is generated by the collection");
subs.add(explP);
Explanation explFreq = Explanation.match((float) freq,
"freq, number of occurrences of term in the document");
subs.add(explFreq);
subs.add(Explanation.match((float) docLen,"dl, length of field"));
super.explain(subs, stats, doc, freq, docLen);
}
@Override
protected Explanation explain(
BasicStats stats, int doc, Explanation freq, double docLen) {
List<Explanation> subs = new ArrayList<>();
explain(subs, stats, doc, freq.getValue(), docLen);
return Explanation.match(
(float) score(stats, freq.getValue(), docLen),
"score(" + getClass().getSimpleName() + ", doc=" + doc + ", freq=" +
freq.getValue() +"), computed as boost * " +
"log(1 + ((1 - lambda) * freq / dl) /(lambda * P)) from:",
subs);
}
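The Jelinek-Mercer explanation added here corresponds to (restating the strings above):

    score = boost \cdot \log\left(1 + \frac{(1 - \lambda) \cdot freq / dl}{\lambda P}\right)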
/** Returns the &lambda; parameter. */
public float getLambda() {
return lambda;

View File

@ -37,9 +37,12 @@ public class LambdaDF extends Lambda {
public final Explanation explain(BasicStats stats) {
return Explanation.match(
lambda(stats),
getClass().getSimpleName() + ", computed from: ",
Explanation.match(stats.getDocFreq(), "docFreq"),
Explanation.match(stats.getNumberOfDocuments(), "numberOfDocuments"));
getClass().getSimpleName()
+ ", computed as (n + 1) / (N + 1) from:",
Explanation.match(stats.getDocFreq(),
"n, number of documents containing term"),
Explanation.match(stats.getNumberOfDocuments(),
"N, total number of documents with field"));
}
@Override

View File

@ -33,15 +33,17 @@ public class LambdaTTF extends Lambda {
return (stats.getTotalTermFreq()+1F) / (stats.getNumberOfDocuments()+1F);
}
@Override
public final Explanation explain(BasicStats stats) {
return Explanation.match(
lambda(stats),
getClass().getSimpleName() + ", computed from: ",
Explanation.match(stats.getTotalTermFreq(), "totalTermFreq"),
Explanation.match(stats.getNumberOfDocuments(), "numberOfDocuments"));
getClass().getSimpleName()
+ ", computed as (F + 1) / (N + 1) from:",
Explanation.match(stats.getTotalTermFreq(),
"F, total number of occurrences of term across all documents"),
Explanation.match(stats.getNumberOfDocuments(),
"N, total number of documents with field"));
}
@Override
public String toString() {
return "L";

View File

@ -47,10 +47,12 @@ public abstract class Normalization {
public Explanation explain(BasicStats stats, double tf, double len) {
return Explanation.match(
(float) tfn(stats, tf, len),
getClass().getSimpleName() + ", computed from: ",
Explanation.match((float) tf, "tf"),
Explanation.match((float) stats.getAvgFieldLength(), "avgFieldLength"),
Explanation.match((float) len, "len"));
getClass().getSimpleName() + ", computed from:",
Explanation.match((float) tf,
"tf, number of occurrences of term in the document"),
Explanation.match((float) stats.getAvgFieldLength(),
"avgfl, average length of field across all documents"),
Explanation.match((float) len, "fl, field length of the document"));
}
/** Implementation used when there is no normalization. */

View File

@ -17,6 +17,8 @@
package org.apache.lucene.search.similarities;
import org.apache.lucene.search.Explanation;
/**
* Normalization model that assumes a uniform distribution of the term frequency.
* <p>While this model is parameterless in the
@ -55,6 +57,21 @@ public class NormalizationH1 extends Normalization {
return tf * c * (stats.getAvgFieldLength() / len);
}
@Override
public Explanation explain(BasicStats stats, double tf, double len) {
return Explanation.match(
(float) tfn(stats, tf, len),
getClass().getSimpleName()
+ ", computed as tf * c * (avgfl / fl) from:",
Explanation.match((float) tf,
"tf, number of occurrences of term in the document"),
Explanation.match(c,
"c, hyper-parameter"),
Explanation.match((float) stats.getAvgFieldLength(),
"avgfl, average length of field across all documents"),
Explanation.match((float) len, "fl, field length of the document"));
}
@Override
public String toString() {
return "1";

View File

@ -17,6 +17,8 @@
package org.apache.lucene.search.similarities;
import org.apache.lucene.search.Explanation;
import static org.apache.lucene.search.similarities.SimilarityBase.log2;
/**
@ -57,6 +59,21 @@ public class NormalizationH2 extends Normalization {
return tf * log2(1 + c * stats.getAvgFieldLength() / len);
}
@Override
public Explanation explain(BasicStats stats, double tf, double len) {
return Explanation.match(
(float) tfn(stats, tf, len),
getClass().getSimpleName()
+ ", computed as tf * log2(1 + c * avgfl / fl) from:",
Explanation.match((float) tf,
"tf, number of occurrences of term in the document"),
Explanation.match(c,
"c, hyper-parameter"),
Explanation.match((float) stats.getAvgFieldLength(),
"avgfl, average length of field across all documents"),
Explanation.match((float) len, "fl, field length of the document"));
}
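In equation form, the H2 normalization explanation added here states (restating the strings above):

    tfn = tf \cdot \log_2\left(1 + c \cdot \frac{avgfl}{fl}\right)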
@Override
public String toString() {
return "2";

View File

@ -17,6 +17,8 @@
package org.apache.lucene.search.similarities;
import org.apache.lucene.search.Explanation;
/**
* Dirichlet Priors normalization
* @lucene.experimental
@ -47,6 +49,23 @@ public class NormalizationH3 extends Normalization {
return (tf + mu * ((stats.getTotalTermFreq()+1F) / (stats.getNumberOfFieldTokens()+1F))) / (len + mu) * mu;
}
@Override
public Explanation explain(BasicStats stats, double tf, double len) {
return Explanation.match(
(float) tfn(stats, tf, len),
getClass().getSimpleName()
+ ", computed as (tf + mu * ((F+1) / (T+1))) / (fl + mu) * mu from:",
Explanation.match((float) tf,
"tf, number of occurrences of term in the document"),
Explanation.match(mu,
"mu, smoothing parameter"),
Explanation.match((float) stats.getTotalTermFreq(),
"F, total number of occurrences of term across all documents"),
Explanation.match((float) stats.getNumberOfFieldTokens(),
"T, total number of tokens of the field across all documents"),
Explanation.match((float) len, "fl, field length of the document"));
}
@Override
public String toString() {
return "3(" + mu + ")";

View File

@ -17,6 +17,8 @@
package org.apache.lucene.search.similarities;
import org.apache.lucene.search.Explanation;
/**
* Pareto-Zipf Normalization
* @lucene.experimental
@ -48,6 +50,20 @@ public class NormalizationZ extends Normalization {
return tf * Math.pow(stats.avgFieldLength / len, z);
}
@Override
public Explanation explain(BasicStats stats, double tf, double len) {
return Explanation.match(
(float) tfn(stats, tf, len),
getClass().getSimpleName()
+ ", computed as tf * Math.pow(avgfl / fl, z) from:",
Explanation.match((float) tf,
"tf, number of occurrences of term in the document"),
Explanation.match((float) stats.getAvgFieldLength(),
"avgfl, average length of field across all documents"),
Explanation.match((float) len, "fl, field length of the document"),
Explanation.match(z, "z, relates to specificity of the language"));
}
@Override
public String toString() {
return "Z(" + z + ")";

View File

@ -29,8 +29,12 @@ Apache UIMA 2.3.1
Apache ZooKeeper 3.4.10
Jetty 9.3.20.v20170531
Upgrade Notes
----------------------
(No Changes)
* LUCENE-7996: The 'func' query parser now returns scores that are equal to 0
when a negative value is produced. This change is due to the fact that
Lucene now requires scores to be positive. (Adrien Grand)
================== 7.3.0 ==================
@ -45,9 +49,15 @@ Apache UIMA 2.3.1
Apache ZooKeeper 3.4.10
Jetty 9.3.20.v20170531
Optimizations
----------------------
(No Changes)
* SOLR-11711: Fixed distributed processing of facet.field/facet.pivot sub requests to prevent requesting
unnecessary and excessive '0' count terms from each shard (Houston Putman via hossman)
Other Changes
----------------------
* SOLR-11575: Improve ref-guide solrj snippets via mock 'print()' method (Jason Gerlowski via hossman)
================== 7.2.0 ==================

View File

@ -729,13 +729,13 @@ function stop_solr() {
DIR="$1"
SOLR_PORT="$2"
STOP_PORT="${STOP_PORT:-$(expr $SOLR_PORT - 1000)}"
THIS_STOP_PORT="${STOP_PORT:-$(expr $SOLR_PORT - 1000)}"
STOP_KEY="$3"
SOLR_PID="$4"
if [ "$SOLR_PID" != "" ]; then
echo -e "Sending stop command to Solr running on port $SOLR_PORT ... waiting up to $SOLR_STOP_WAIT seconds to allow Jetty process $SOLR_PID to stop gracefully."
"$JAVA" $SOLR_SSL_OPTS $AUTHC_OPTS -jar "$DIR/start.jar" "STOP.PORT=$STOP_PORT" "STOP.KEY=$STOP_KEY" --stop || true
"$JAVA" $SOLR_SSL_OPTS $AUTHC_OPTS -jar "$DIR/start.jar" "STOP.PORT=$THIS_STOP_PORT" "STOP.KEY=$STOP_KEY" --stop || true
(loops=0
while true
do

View File

@ -571,21 +571,8 @@ public class FacetComponent extends SearchComponent {
// set the initial limit higher to increase accuracy
dff.initialLimit = doOverRequestMath(dff.initialLimit, dff.overrequestRatio,
dff.overrequestCount);
// If option FACET_DISTRIB_MCO is turned on then we will use 1 as the initial
// minCount (unless the user explicitly set it to something less than 1). If
// option FACET_DISTRIB_MCO is turned off then we will use 0 as the initial
// minCount regardless of what the user might have provided (prior to the
// addition of the FACET_DISTRIB_MCO option the default logic was to use 0).
// As described in issues SOLR-8559 and SOLR-8988 the use of 1 provides a
// significant performance boost.
dff.initialMincount = dff.mco ? Math.min(dff.minCount, 1) : 0;
} else {
// if limit==-1, then no need to artificially lower mincount to 0 if
// it's 1
dff.initialMincount = Math.min(dff.minCount, 1);
}
dff.initialMincount = Math.min(dff.minCount, 1);
} else {
// we're sorting by index order.
// if minCount==0, we should always be able to get accurate results w/o
@ -682,10 +669,8 @@ public class FacetComponent extends SearchComponent {
} else if ( FacetParams.FACET_SORT_COUNT.equals(sort) ) {
if ( 0 < requestedLimit ) {
shardLimit = doOverRequestMath(shardLimit, overRequestRatio, overRequestCount);
shardMinCount = 0;
} else {
shardMinCount = Math.min(requestedMinCount, 1);
}
shardMinCount = Math.min(requestedMinCount, 1);
}
sreq.params.set(paramStart + FacetParams.FACET_LIMIT, shardLimit);
sreq.params.set(paramStart + FacetParams.FACET_PIVOT_MINCOUNT, shardMinCount);
@ -1437,7 +1422,6 @@ public class FacetComponent extends SearchComponent {
public int initialLimit; // how many terms requested in first phase
public int initialMincount; // mincount param sent to each shard
public boolean mco;
public double overrequestRatio;
public int overrequestCount;
public boolean needRefinements;
@ -1456,9 +1440,6 @@ public class FacetComponent extends SearchComponent {
= params.getFieldDouble(field, FacetParams.FACET_OVERREQUEST_RATIO, 1.5);
this.overrequestCount
= params.getFieldInt(field, FacetParams.FACET_OVERREQUEST_COUNT, 10);
this.mco
= params.getFieldBool(field, FacetParams.FACET_DISTRIB_MCO, false);
}
void add(int shardNum, NamedList shardCounts, int numRequested) {
@ -1496,10 +1477,10 @@ public class FacetComponent extends SearchComponent {
}
}
// the largest possible missing term is initialMincount if we received
// the largest possible missing term is (initialMincount - 1) if we received
// less than the number requested.
if (numRequested < 0 || numRequested != 0 && numReceived < numRequested) {
last = initialMincount;
last = Math.max(0, initialMincount - 1);
}
missingMaxPossible += last;
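A small numeric illustration of the adjusted bound (values are illustrative): shards only return terms whose count is at least initialMincount, so a term a shard omitted can have occurred at most initialMincount - 1 times there. With initialMincount == 1,

    int last = Math.max(0, initialMincount - 1); // == 0

adds nothing to missingMaxPossible for that shard, whereas the previous code added the full initialMincount.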

View File

@ -53,7 +53,6 @@ import static org.apache.solr.common.params.FacetParams.FACET_OVERREQUEST_RATIO;
import static org.apache.solr.common.params.FacetParams.FACET_PIVOT;
import static org.apache.solr.common.params.FacetParams.FACET_PIVOT_MINCOUNT;
import static org.apache.solr.common.params.FacetParams.FACET_SORT;
import static org.apache.solr.common.params.FacetParams.FACET_DISTRIB_MCO;
/**
* <p>
@ -85,8 +84,6 @@ public class TestCloudPivotFacet extends AbstractFullDistribZkTestBase {
// param used by test purely for tracing & validation
private static String TRACE_MIN = "_test_min";
// param used by test purely for tracing & validation
private static String TRACE_DISTRIB_MIN = "_test_distrib_min";
// param used by test purely for tracing & validation
private static String TRACE_MISS = "_test_miss";
// param used by test purely for tracing & validation
private static String TRACE_SORT = "_test_sort";
@ -199,12 +196,6 @@ public class TestCloudPivotFacet extends AbstractFullDistribZkTestBase {
baseP.add(TRACE_MIN, min);
}
if (random().nextBoolean()) {
pivotP.add(FACET_DISTRIB_MCO, "true");
// trace param for validation
baseP.add(TRACE_DISTRIB_MIN, "true");
}
if (random().nextBoolean()) {
String missing = ""+random().nextBoolean();
pivotP.add(FACET_MISSING, missing);

View File

@ -19,93 +19,57 @@ package org.apache.solr.handler.component;
import java.util.List;
import org.apache.solr.BaseDistributedSearchTestCase;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.response.FieldStatsInfo;
import org.apache.solr.client.solrj.response.PivotField;
import org.apache.solr.common.params.FacetParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.search.facet.DistributedFacetSimpleRefinementLongTailTest;
import org.junit.Test;
/**
* <p>
* test demonstrating how overrequesting helps find top-terms in the "long tail"
* of shards that don't have even distributions of terms (something that can be common
* in cases of custom sharding -- even if you don't know that there is a correlation
* between the property you are sharding on and the property you are faceting on).
*
* NOTE: This test ignores the control collection (in single node mode, there is no
* <p>
* <b>NOTE:</b> This test ignores the control collection (in single node mode, there is no
* need for the overrequesting, all the data is local -- so comparisons with it wouldn't
* be valid in the cases we are testing here)
* </p>
* <p>
* <b>NOTE:</b> uses the same indexed documents as {@link DistributedFacetSimpleRefinementLongTailTest} --
* however the behavior of <code>refine:simple</code> is "simpler" than the refinement logic used by
* <code>facet.pivot</code> so the assertions in this test vary from that test.
* </p>
*/
public class DistributedFacetPivotLongTailTest extends BaseDistributedSearchTestCase {
private int docNumber = 0;
private String STAT_FIELD = null; // will be randomized single value vs multivalued
public DistributedFacetPivotLongTailTest() {
// we need DVs on point fields to compute stats & facets
if (Boolean.getBoolean(NUMERIC_POINTS_SYSPROP)) System.setProperty(NUMERIC_DOCVALUES_SYSPROP,"true");
STAT_FIELD = random().nextBoolean() ? "stat_i1" : "stat_i";
}
public int getDocNum() {
docNumber++;
return docNumber;
}
@Test
@ShardsFixed(num = 3)
public void test() throws Exception {
final SolrClient shard0 = clients.get(0);
final SolrClient shard1 = clients.get(1);
final SolrClient shard2 = clients.get(2);
// the 5 top foo_s terms have 100 docs each on every shard
for (int i = 0; i < 100; i++) {
for (int j = 0; j < 5; j++) {
shard0.add(sdoc("id", getDocNum(), "foo_s", "aaa"+j, "stat_i", j * 13 - i));
shard1.add(sdoc("id", getDocNum(), "foo_s", "aaa"+j, "stat_i", j * 3 + i));
shard2.add(sdoc("id", getDocNum(), "foo_s", "aaa"+j, "stat_i", i * 7 + j));
}
}
// 20 foo_s terms that come in "second" with 50 docs each
// on both shard0 & shard1 ("bbb_")
for (int i = 0; i < 50; i++) {
for (int j = 0; j < 20; j++) {
shard0.add(sdoc("id", getDocNum(), "foo_s", "bbb"+j, "stat_i", 0));
shard1.add(sdoc("id", getDocNum(), "foo_s", "bbb"+j, "stat_i", 1));
}
// distracting term appears on only on shard2 50 times
shard2.add(sdoc("id", getDocNum(), "foo_s", "junkA"));
}
// put "bbb0" on shard2 exactly once to sanity check refinement
shard2.add(sdoc("id", getDocNum(), "foo_s", "bbb0", "stat_i", -2));
// long 'tail' foo_s term appears in 45 docs on every shard
// foo_s:tail is the only term with bar_s sub-pivot terms
for (int i = 0; i < 45; i++) {
// for sub-pivot, shard0 & shard1 have 6 docs each for "tailB"
// but the top 5 terms are ccc(0-4) -- 7 on each shard
// (4 docs each have junk terms)
String sub_term = (i < 35) ? "ccc"+(i % 5) : ((i < 41) ? "tailB" : "junkA");
shard0.add(sdoc("id", getDocNum(), "foo_s", "tail", "bar_s", sub_term, "stat_i", i));
shard1.add(sdoc("id", getDocNum(), "foo_s", "tail", "bar_s", sub_term, "stat_i", i));
// shard2's top 5 sub-pivot terms are junk only it has with 8 docs each
// and 5 docs that use "tailB"
// NOTE: none of these get stat_i ! !
sub_term = (i < 40) ? "junkB"+(i % 5) : "tailB";
shard2.add(sdoc("id", getDocNum(), "foo_s", "tail", "bar_s", sub_term));
}
// really long tail uncommon foo_s terms on shard2
for (int i = 0; i < 30; i++) {
shard2.add(sdoc("id", getDocNum(), "foo_s", "zzz"+i));
}
DistributedFacetSimpleRefinementLongTailTest.buildIndexes(clients, STAT_FIELD);
commit();
sanityCheckIndividualShards();
checkRefinementAndOverrequesting();
doTestDeepPivotStats();
}
private void sanityCheckIndividualShards() throws Exception {
assertEquals("This test assumes exactly 3 shards/clients", 3, clients.size());
SolrParams req = params( "q", "*:*",
"distrib", "false",
"facet", "true",
@ -116,10 +80,11 @@ public class DistributedFacetPivotLongTailTest extends BaseDistributedSearchTest
PivotField pivot = null;
List<PivotField> pivots = null;
List<PivotField>[] shardPivots = new List[3];
shardPivots[0] = shard0.query( req ).getFacetPivot().get("foo_s,bar_s");
shardPivots[1] = shard1.query( req ).getFacetPivot().get("foo_s,bar_s");
shardPivots[2] = shard2.query( req ).getFacetPivot().get("foo_s,bar_s");
List<PivotField>[] shardPivots = new List[clients.size()];
for (int i = 0; i < clients.size(); i++) {
shardPivots[i] = clients.get(i).query( req ).getFacetPivot().get("foo_s,bar_s");
}
// top 5 same on all shards
for (int i = 0; i < 3; i++) {
@ -143,11 +108,9 @@ public class DistributedFacetPivotLongTailTest extends BaseDistributedSearchTest
assertEquals(50, shardPivots[2].get(5).getCount());
assertEquals("tail", shardPivots[2].get(6).getValue());
assertEquals(45, shardPivots[2].get(6).getCount());
assertEquals("bbb0", shardPivots[2].get(7).getValue());
assertEquals(1, shardPivots[2].get(7).getCount());
for (int j = 8; j < 10; j++) {
for (int j = 7; j < 10; j++) {
pivot = shardPivots[2].get(j);
assertTrue(pivot.toString(), pivot.getValue().toString().startsWith("zzz"));
assertTrue(pivot.toString(), pivot.getValue().toString().startsWith("ZZZ"));
assertEquals(pivot.toString(), 1, pivot.getCount());
}
// check sub-shardPivots on "tail" from shard2
@ -161,9 +124,12 @@ public class DistributedFacetPivotLongTailTest extends BaseDistributedSearchTest
pivot = pivots.get(5);
assertEquals("tailB", pivot.getValue());
assertEquals(5, pivot.getCount());
}
private void checkRefinementAndOverrequesting() throws Exception {
// if we disable overrequesting, we don't find the long tail
List<PivotField> pivots = null;
PivotField pivot = null;
pivots = queryServer( params( "q", "*:*",
"shards", getShardsString(),
FacetParams.FACET_OVERREQUEST_COUNT, "0",
@ -172,7 +138,7 @@ public class DistributedFacetPivotLongTailTest extends BaseDistributedSearchTest
"facet.limit", "6",
"facet.pivot", "{!stats=sxy}foo_s,bar_s",
"stats", "true",
"stats.field", "{!tag=sxy}stat_i")
"stats.field", "{!tag=sxy}" + STAT_FIELD)
).getFacetPivot().get("foo_s,bar_s");
assertEquals(6, pivots.size());
for (int i = 0; i < 5; i++) {
@ -185,8 +151,8 @@ public class DistributedFacetPivotLongTailTest extends BaseDistributedSearchTest
assertTrue(pivot.toString(), pivot.getValue().equals("bbb0"));
assertEquals(pivot.toString(), 101, pivot.getCount());
// basic check of refined stats
FieldStatsInfo bbb0Stats = pivot.getFieldStatsInfo().get("stat_i");
assertEquals("stat_i", bbb0Stats.getName());
FieldStatsInfo bbb0Stats = pivot.getFieldStatsInfo().get(STAT_FIELD);
assertEquals(STAT_FIELD, bbb0Stats.getName());
assertEquals(-2.0, bbb0Stats.getMin());
assertEquals(1.0, bbb0Stats.getMax());
assertEquals(101, (long) bbb0Stats.getCount());
@ -295,11 +261,10 @@ public class DistributedFacetPivotLongTailTest extends BaseDistributedSearchTest
assertTrue(pivot.toString(), pivot.getValue().toString().startsWith("ccc"));
assertEquals(pivot.toString(), 14, pivot.getCount());
}
doTestDeepPivotStats();
}
public void doTestDeepPivotStats() throws Exception {
private void doTestDeepPivotStats() throws Exception {
// Deep checking of some Facet stats - no refinement involved here
List<PivotField> pivots =
@ -309,7 +274,7 @@ public class DistributedFacetPivotLongTailTest extends BaseDistributedSearchTest
"rows" , "0",
"facet.pivot","{!stats=s1}foo_s,bar_s",
"stats", "true",
"stats.field", "{!key=avg_price tag=s1}stat_i").getFacetPivot().get("foo_s,bar_s");
"stats.field", "{!key=avg_price tag=s1}" + STAT_FIELD).getFacetPivot().get("foo_s,bar_s");
PivotField aaa0PivotField = pivots.get(0);
assertEquals("aaa0", aaa0PivotField.getValue());
assertEquals(300, aaa0PivotField.getCount());

View File

@ -0,0 +1,428 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.search.facet;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.solr.BaseDistributedSearchTestCase;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.junit.Test;
/**
* A test that demonstrates some of the expected behavior of "long tail" terms when using <code>refine:simple</code>
* <p>
* <b>NOTE:</b> This test ignores the control collection (in single node mode, there is no
* need for the overrequesting, all the data is local -- so comparisons with it wouldn't
* be valid in the cases we are testing here)
* </p>
* <p>
* <b>NOTE:</b> This test is heavily inspired by (and uses the same indexed documents) as
* {@link org.apache.solr.handler.component.DistributedFacetPivotLongTailTest} -- however the behavior of
* <code>refine:simple</code> is "simpler" than the refinement logic used by
* <code>facet.pivot</code> so the assertions in this test vary from that test.
* </p>
*/
public class DistributedFacetSimpleRefinementLongTailTest extends BaseDistributedSearchTestCase {
// TODO: SOLR-11695: need "num_values" and "missing"...
// TODO: add hll & variance - update all assertions to test their values (right after any mention of 'stddev')
private static List<String> ALL_STATS = Arrays.asList("min", "max", "sum", "stddev", "avg", "sumsq", "unique");
private String STAT_FIELD = "stat_i1";
private String ALL_STATS_JSON = "";
public DistributedFacetSimpleRefinementLongTailTest() {
// we need DVs on point fields to compute stats & facets
if (Boolean.getBoolean(NUMERIC_POINTS_SYSPROP)) System.setProperty(NUMERIC_DOCVALUES_SYSPROP,"true");
// TODO: randomizing STAT_FIELD to be multiValued=true blocked by SOLR-11706
// STAT_FIELD = random().nextBoolean() ? "stat_i1" : "stat_i";
for (String stat : ALL_STATS) {
ALL_STATS_JSON += stat + ":'" + stat + "(" + STAT_FIELD + ")',";
}
}
@Test
@ShardsFixed(num = 3)
public void test() throws Exception {
buildIndexes(clients, STAT_FIELD);
commit();
sanityCheckIndividualShards();
checkRefinementAndOverrequesting();
checkSubFacetStats();
}
public static void buildIndexes(final List<SolrClient> clients, final String statField) throws Exception {
assertEquals("This indexing code assumes exactly 3 shards/clients", 3, clients.size());
final AtomicInteger docNum = new AtomicInteger();
final SolrClient shard0 = clients.get(0);
final SolrClient shard1 = clients.get(1);
final SolrClient shard2 = clients.get(2);
// the 5 top foo_s terms have 100 docs each on every shard
for (int i = 0; i < 100; i++) {
for (int j = 0; j < 5; j++) {
shard0.add(sdoc("id", docNum.incrementAndGet(), "foo_s", "aaa"+j, statField, j * 13 - i));
shard1.add(sdoc("id", docNum.incrementAndGet(), "foo_s", "aaa"+j, statField, j * 3 + i));
shard2.add(sdoc("id", docNum.incrementAndGet(), "foo_s", "aaa"+j, statField, i * 7 + j));
}
}
// 20 foo_s terms that come in "second" with 50 docs each
// on both shard0 & shard1 ("bbb_")
for (int i = 0; i < 50; i++) {
for (int j = 0; j < 20; j++) {
shard0.add(sdoc("id", docNum.incrementAndGet(), "foo_s", "bbb"+j, statField, 0));
shard1.add(sdoc("id", docNum.incrementAndGet(), "foo_s", "bbb"+j, statField, 1));
}
// distracting term appears only on shard2, 50 times
shard2.add(sdoc("id", docNum.incrementAndGet(), "foo_s", "junkA"));
}
// put "bbb0" on shard2 exactly once to sanity check refinement
shard2.add(sdoc("id", docNum.incrementAndGet(), "foo_s", "bbb0", statField, -2));
// long 'tail' foo_s term appears in 45 docs on every shard
// foo_s:tail is the only term with bar_s sub-pivot terms
for (int i = 0; i < 45; i++) {
// for sub-pivot, shard0 & shard1 have 6 docs each for "tailB"
// but the top 5 terms are ccc(0-4) -- 7 on each shard
// (4 docs each have junk terms)
String sub_term = (i < 35) ? "ccc"+(i % 5) : ((i < 41) ? "tailB" : "junkA");
shard0.add(sdoc("id", docNum.incrementAndGet(), "foo_s", "tail", "bar_s", sub_term, statField, i));
shard1.add(sdoc("id", docNum.incrementAndGet(), "foo_s", "tail", "bar_s", sub_term, statField, i));
// shard2's top 5 sub-pivot terms are junk terms that only it has, with 8 docs each
// and 5 docs that use "tailB"
// NOTE: none of these get statField ! !
sub_term = (i < 40) ? "junkB"+(i % 5) : "tailB";
shard2.add(sdoc("id", docNum.incrementAndGet(), "foo_s", "tail", "bar_s", sub_term));
}
// really long tail uncommon foo_s terms on shard2
for (int i = 0; i < 30; i++) {
// NOTE: using "Z" here so these sort before bbb0 when they tie for '1' instance each on shard2
shard2.add(sdoc("id", docNum.incrementAndGet(), "foo_s", "ZZZ"+i));
}
}
private void sanityCheckIndividualShards() throws Exception {
// sanity check that our expectations about each shard (non-distrib) are correct
SolrParams req = params( "q", "*:*", "distrib", "false", "json.facet",
" { foo:{ type:terms, limit:10, field:foo_s, facet:{ bar:{ type:terms, limit:10, field:bar_s }}}}");
List<NamedList>[] shardFooBuckets = new List[clients.size()];
for (int i = 0; i < clients.size(); i++) {
shardFooBuckets[i] = (List<NamedList>)
((NamedList<NamedList>)clients.get(i).query( req ).getResponse().get("facets")).get("foo").get("buckets");
}
// top 5 same on all shards
for (int i = 0; i < 3; i++) {
assertEquals(10, shardFooBuckets[i].size());
for (int j = 0; j < 5; j++) {
NamedList bucket = shardFooBuckets[i].get(j);
assertEquals(bucket.toString(), "aaa"+j, bucket.get("val"));
assertEquals(bucket.toString(), 100, bucket.get("count"));
}
}
// top 6-10 same on shard0 & shard1
for (int i = 0; i < 2; i++) {
for (int j = 5; j < 10; j++) {
NamedList bucket = shardFooBuckets[i].get(j);
assertTrue(bucket.toString(), bucket.get("val").toString().startsWith("bbb"));
assertEquals(bucket.toString(), 50, bucket.get("count"));
}
}
// 6-10 on shard2
assertEquals("junkA", shardFooBuckets[2].get(5).get("val"));
assertEquals(50, shardFooBuckets[2].get(5).get("count"));
assertEquals("tail", shardFooBuckets[2].get(6).get("val"));
assertEquals(45, shardFooBuckets[2].get(6).get("count"));
for (int j = 7; j < 10; j++) {
NamedList bucket = shardFooBuckets[2].get(j);
assertTrue(bucket.toString(), bucket.get("val").toString().startsWith("ZZZ"));
assertEquals(bucket.toString(), 1, bucket.get("count"));
}
// check 'bar' sub buckets on "tail" from shard2
{ List<NamedList> bar_buckets = (List<NamedList>) ((NamedList<NamedList>) shardFooBuckets[2].get(6).get("bar")).get("buckets");
assertEquals(6, bar_buckets.size());
for (int j = 0; j < 5; j++) {
NamedList bucket = bar_buckets.get(j);
assertTrue(bucket.toString(), bucket.get("val").toString().startsWith("junkB"));
assertEquals(bucket.toString(), 8, bucket.get("count"));
}
NamedList bucket = bar_buckets.get(5);
assertEquals("tailB", bucket.get("val"));
assertEquals(5, bucket.get("count"));
}
}
private void checkRefinementAndOverrequesting() throws Exception {
// // distributed queries // //
{ // w/o refinement, the default overrequest isn't enough to find the long 'tail' *OR* the correct count for 'bbb0'...
List<NamedList> foo_buckets = (List<NamedList>)
((NamedList<NamedList>)
queryServer( params( "q", "*:*", "shards", getShardsString(), "json.facet",
"{ foo: { type:terms, refine:none, limit:6, field:foo_s } }"
) ).getResponse().get("facets")).get("foo").get("buckets");
assertEquals(6, foo_buckets.size());
for (int i = 0; i < 5; i++) {
NamedList bucket = foo_buckets.get(i);
assertTrue(bucket.toString(), bucket.get("val").toString().startsWith("aaa"));
assertEquals(bucket.toString(), 300L, bucket.get("count"));
}
// this will be short the "+1" for the doc added to shard2...
NamedList bucket = foo_buckets.get(5);
assertTrue(bucket.toString(), bucket.get("val").equals("bbb0")); // 'tail' is missed
assertEquals(bucket.toString(), 100L, bucket.get("count")); // will not include the "+1" for the doc added to shard2
}
// even if we enable refinement, we still won't find the long 'tail' ...
// regardless of whether we use the default overrequest or disable overrequesting...
for (String over : Arrays.asList( "", "overrequest:0,")) {
List<NamedList> foo_buckets = (List<NamedList>)
((NamedList<NamedList>)
queryServer( params( "q", "*:*", "shards", getShardsString(), "json.facet",
"{ foo: { type:terms, refine:simple, limit:6, "+ over +" field:foo_s, facet:{ " + ALL_STATS_JSON +
" bar: { type:terms, refine:simple, limit:6, "+ over +" field:bar_s, facet:{"+ALL_STATS_JSON+"}}}}}"
) ).getResponse().get("facets")).get("foo").get("buckets");
assertEquals(6, foo_buckets.size());
for (int i = 0; i < 5; i++) {
NamedList bucket = foo_buckets.get(i);
assertTrue(bucket.toString(), bucket.get("val").toString().startsWith("aaa"));
assertEquals(bucket.toString(), 300L, bucket.get("count"));
}
// ...but it should have correctly asked shard2 to refine bbb0
NamedList bucket = foo_buckets.get(5);
assertTrue(bucket.toString(), bucket.get("val").equals("bbb0"));
assertEquals(bucket.toString(), 101L, bucket.get("count"));
// ...and the stats under bbb0 should be correct, including the refinement
assertEquals(ALL_STATS.size() + 3, bucket.size()); // val,count,facet
assertEquals(-2L, bucket.get("min")); // this min only exists on shard2
assertEquals(1L, bucket.get("max"));
// assertEquals(101L, bucket.get("num_values")); // TODO: SOLR-11695
// assertEquals(0L, bucket.get("missing")); // TODO: SOLR-11695
assertEquals(48.0D, bucket.get("sum"));
assertEquals(0.475247524752475D, (double) bucket.get("avg"), 0.1E-7);
assertEquals(54.0D, (double) bucket.get("sumsq"), 0.1E-7);
// assertEquals(0.55846323792D, bucket.getStddev(), 0.1E-7); // TODO: SOLR-11725
assertEquals(0.55569169111D, (double) bucket.get("stddev"), 0.1E-7); // json.facet is using the "uncorrected stddev"
}
// with a limit==6, we have to "overrequest >= 20" in order to ensure that 'tail' is included in the top 6
// this is because of how the "simple" refinement process works: the "top buckets" are determined based
// on the info available in the first pass request.
//
// Even though 'tail' is returned in the top6 for shard2, the cumulative total for 'bbb0' from shard0 and shard1 is
// high enough that the simple facet refinement ignores 'tail' because it assumes 'bbb0's final total will be greater.
//
// Meanwhile, for the sub-facet on 'bar', a limit==6 means we should correctly find 'tailB' as the top sub-term of 'tail',
// regardless of how much overrequest is used (or even if we don't have any refinement) since it's always in the top6...
for (String bar_opts : Arrays.asList( "refine:none,",
"refine:simple,",
"refine:none, overrequest:0,",
"refine:simple, overrequest:0," )) {
List<NamedList> buckets = (List<NamedList>)
((NamedList<NamedList>)
queryServer( params( "q", "*:*", "shards", getShardsString(), "json.facet",
"{ foo: { type:terms, limit:6, overrequest:20, refine:simple, field:foo_s, facet:{ " +
" bar: { type:terms, limit:6, " + bar_opts + " field:bar_s }}}}"
) ).getResponse().get("facets")).get("foo").get("buckets");
assertEquals(6, buckets.size());
for (int i = 0; i < 5; i++) {
NamedList bucket = buckets.get(i);
assertTrue(bucket.toString(), bucket.get("val").toString().startsWith("aaa"));
assertEquals(bucket.toString(), 300L, bucket.get("count"));
}
NamedList bucket = buckets.get(5);
assertEquals(bucket.toString(), "tail", bucket.get("val"));
assertEquals(bucket.toString(), 135L, bucket.get("count"));
// check the sub buckets
buckets = ((NamedList<NamedList<List<NamedList>>>) bucket).get("bar").get("buckets");
assertEquals(6, buckets.size());
bucket = buckets.get(0);
assertEquals(bucket.toString(), "tailB", bucket.get("val"));
assertEquals(bucket.toString(), 17L, bucket.get("count"));
for (int i = 1; i < 6; i++) { // ccc(0-4)
bucket = buckets.get(i);
assertTrue(bucket.toString(), bucket.get("val").toString().startsWith("ccc"));
assertEquals(bucket.toString(), 14L, bucket.get("count"));
}
}
// if we lower the limit on the sub-bucket to '5', overrequesting of at least 1 should still ensure
// that we get the correct top5 including "tailB" -- even w/o refinement
for (String bar_opts : Arrays.asList( "refine:none,",
"refine:simple,",
"refine:none, overrequest:1,",
"refine:simple, overrequest:1," )) {
List<NamedList> buckets = (List<NamedList>)
((NamedList<NamedList>)
queryServer( params( "q", "*:*", "shards", getShardsString(), "json.facet",
"{ foo: { type:terms, limit:6, overrequest:20, refine:simple, field:foo_s, facet:{ " +
" bar: { type:terms, limit:5, " + bar_opts + " field:bar_s }}}}"
) ).getResponse().get("facets")).get("foo").get("buckets");
assertEquals(6, buckets.size());
for (int i = 0; i < 5; i++) {
NamedList bucket = buckets.get(i);
assertTrue(bucket.toString(), bucket.get("val").toString().startsWith("aaa"));
assertEquals(bucket.toString(), 300L, bucket.get("count"));
}
NamedList bucket = buckets.get(5);
assertEquals(bucket.toString(), "tail", bucket.get("val"));
assertEquals(bucket.toString(), 135L, bucket.get("count"));
// check the sub buckets
buckets = ((NamedList<NamedList<List<NamedList>>>) bucket).get("bar").get("buckets");
assertEquals(5, buckets.size());
bucket = buckets.get(0);
assertEquals(bucket.toString(), "tailB", bucket.get("val"));
assertEquals(bucket.toString(), 17L, bucket.get("count"));
for (int i = 1; i < 5; i++) { // ccc(0-3)
bucket = buckets.get(i);
assertTrue(bucket.toString(), bucket.get("val").toString().startsWith("ccc"));
assertEquals(bucket.toString(), 14L, bucket.get("count"));
}
}
// however: with a lower sub-facet limit==5, and overrequesting disabled,
// we're going to miss out on tailB even if we have refinement
for (String bar_opts : Arrays.asList( "refine:none, overrequest:0,",
"refine:simple, overrequest:0," )) {
List<NamedList> buckets = (List<NamedList>)
((NamedList<NamedList>)
queryServer( params( "q", "*:*", "shards", getShardsString(), "json.facet",
"{ foo: { type:terms, limit:6, overrequest:20, refine:simple, field:foo_s, facet:{ " +
" bar: { type:terms, limit:5, " + bar_opts + " field:bar_s }}}}"
) ).getResponse().get("facets")).get("foo").get("buckets");
assertEquals(6, buckets.size());
for (int i = 0; i < 5; i++) {
NamedList bucket = buckets.get(i);
assertTrue(bucket.toString(), bucket.get("val").toString().startsWith("aaa"));
assertEquals(bucket.toString(), 300L, bucket.get("count"));
}
NamedList bucket = buckets.get(5);
assertEquals(bucket.toString(), "tail", bucket.get("val"));
assertEquals(bucket.toString(), 135L, bucket.get("count"));
// check the sub buckets
buckets = ((NamedList<NamedList<List<NamedList>>>) bucket).get("bar").get("buckets");
assertEquals(5, buckets.size());
for (int i = 0; i < 5; i++) { // ccc(0-4)
bucket = buckets.get(i);
assertTrue(bucket.toString(), bucket.get("val").toString().startsWith("ccc"));
assertEquals(bucket.toString(), 14L, bucket.get("count"));
}
}
}
private void checkSubFacetStats() throws Exception {
// Deep checking of some Facet stats
// the assertions only care about the first 5 results of each facet, but to get the long tail more are needed
// from the sub-shards. Results should be the same regardless of "high limit" vs "low limit + high overrequest"
checkSubFacetStats("refine:simple, limit: 100,");
checkSubFacetStats("refine:simple, overrequest: 100,");
// and the results shouldn't change if we explicitly disable refinement
checkSubFacetStats("refine:none, limit: 100,");
checkSubFacetStats("refine:none, overrequest: 100,");
}
private void checkSubFacetStats(String extraJson) throws Exception {
String commonJson = "type: terms, " + extraJson;
NamedList<NamedList> all_facets = (NamedList) queryServer
( params( "q", "*:*", "shards", getShardsString(), "rows" , "0", "json.facet",
"{ foo : { " + commonJson + " field: foo_s, facet: { " +
ALL_STATS_JSON + " bar: { " + commonJson + " field: bar_s, facet: { " + ALL_STATS_JSON + "} } } } }"
) ).getResponse().get("facets");
assertNotNull(all_facets);
List<NamedList> foo_buckets = (List) ((NamedList)all_facets.get("foo")).get("buckets");
NamedList aaa0_Bucket = foo_buckets.get(0);
assertEquals(ALL_STATS.size() + 3, aaa0_Bucket.size()); // val,count,facet
assertEquals("aaa0", aaa0_Bucket.get("val"));
assertEquals(300L, aaa0_Bucket.get("count"));
assertEquals(-99L, aaa0_Bucket.get("min"));
assertEquals(693L, aaa0_Bucket.get("max"));
// assertEquals(300L, aaa0_Bucket.get("num_values")); // TODO: SOLR-11695
// assertEquals(0L, aaa0_Bucket.get("missing")); // TODO: SOLR-11695
assertEquals(34650.0D, aaa0_Bucket.get("sum"));
assertEquals(115.5D, (double) aaa0_Bucket.get("avg"), 0.1E-7);
assertEquals(1.674585E7D, (double) aaa0_Bucket.get("sumsq"), 0.1E-7);
// assertEquals(206.4493184076D, (double) aaa0_Bucket.get("stddev"), 0.1E-7); // TODO: SOLR-11725
assertEquals(206.1049489944D, (double) aaa0_Bucket.get("stddev"), 0.1E-7); // json.facet is using the "uncorrected stddev"
NamedList tail_Bucket = foo_buckets.get(5);
assertEquals(ALL_STATS.size() + 3, tail_Bucket.size()); // val,count,facet
assertEquals("tail", tail_Bucket.get("val"));
assertEquals(135L, tail_Bucket.get("count"));
assertEquals(0L, tail_Bucket.get("min"));
assertEquals(44L, tail_Bucket.get("max"));
// assertEquals(90L, tail_Bucket.get("num_values")); // TODO: SOLR-11695
// assertEquals(45L, tail_Bucket.get("missing")); // TODO: SOLR-11695
assertEquals(1980.0D, tail_Bucket.get("sum"));
assertEquals(22.0D, (double) tail_Bucket.get("avg"), 0.1E-7);
assertEquals(58740.0D, (double) tail_Bucket.get("sumsq"), 0.1E-7);
// assertEquals(13.0599310011D, (double) tail_Bucket.get("stddev"), 0.1E-7); // TODO: SOLR-11725
assertEquals(12.9871731592D, (double) tail_Bucket.get("stddev"), 0.1E-7); // json.facet is using the "uncorrected stddev"
List<NamedList> tail_bar_buckets = (List) ((NamedList)tail_Bucket.get("bar")).get("buckets");
NamedList tailB_Bucket = tail_bar_buckets.get(0);
assertEquals(ALL_STATS.size() + 2, tailB_Bucket.size()); // val,count ... NO SUB FACETS
assertEquals("tailB", tailB_Bucket.get("val"));
assertEquals(17L, tailB_Bucket.get("count"));
assertEquals(35L, tailB_Bucket.get("min"));
assertEquals(40L, tailB_Bucket.get("max"));
// assertEquals(12L, tailB_Bucket.get("num_values")); // TODO: SOLR-11695
// assertEquals(5L, tailB_Bucket.get("missing")); // TODO: SOLR-11695
assertEquals(450.0D, tailB_Bucket.get("sum"));
assertEquals(37.5D, (double) tailB_Bucket.get("avg"), 0.1E-7);
assertEquals(16910.0D, (double) tailB_Bucket.get("sumsq"), 0.1E-7);
// assertEquals(1.78376517D, (double) tailB_Bucket.get("stddev"), 0.1E-7); // TODO: SOLR-11725
assertEquals(1.70782513D, (double) tailB_Bucket.get("stddev"), 0.1E-7); // json.facet is using the "uncorrected stddev"
}
}

View File

@ -1885,6 +1885,7 @@ Time in seconds to wait until new replicas are created, and until leader replica
====
This operation does not hold necessary locks on the replicas that belong to the source node. Do not perform other collection operations during this period.
====
[[movereplica]]
== MOVEREPLICA: Move a Replica to a New Node
@ -1913,14 +1914,17 @@ The name of the destination node. This parameter is required.
Request ID to track this action which will be <<Asynchronous Calls,processed asynchronously>>.
[[utilizenode]]
== UTILIZENODE: Utilize a new node
== UTILIZENODE: Utilize a New Node
This command can be used to move some replicas from the existing nodes to a new node or lightly loaded node and reduce the load on them. This uses your autoscaling policies and preferences to identify which replica needs to be moved. It tries to fix any policy violations first and then it tries to move some load off of the most loaded nodes according to the preferences.
This command can be used to move some replicas from the existing nodes to either a new node or a less loaded node to reduce the load on the existing node.
This uses your autoscaling policies and preferences to identify which replica needs to be moved. It tries to fix any policy violations first and then it tries to move some load off of the most loaded nodes according to the preferences.
`/admin/collections?action=UTILIZENODE&node=nodeName`
=== UTILIZENODE Parameters
`node`:: The name of the node that needs to be utilized. This parameter is required
`node`:: The name of the node that needs to be utilized. This parameter is required.
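
As a rough sketch only (the base URL and node name below are placeholders, and `GenericSolrRequest` is simply one convenient way to reach the Collections API from SolrJ), the command could be invoked like this:

[source,java]
----
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrRequest;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.request.GenericSolrRequest;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.util.NamedList;

public class UtilizeNodeExample {
  public static void main(String[] args) throws Exception {
    // Placeholder base URL and node name; substitute values from your own cluster.
    try (SolrClient client = new HttpSolrClient.Builder("http://localhost:8983/solr").build()) {
      ModifiableSolrParams params = new ModifiableSolrParams();
      params.set("action", "UTILIZENODE");
      params.set("node", "192.168.1.5:7574_solr");

      // The Collections API handler answers at /admin/collections.
      GenericSolrRequest request =
          new GenericSolrRequest(SolrRequest.METHOD.GET, "/admin/collections", params);
      NamedList<Object> response = client.request(request);
      System.out.println(response);
    }
  }
}
----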
== Asynchronous Calls

View File

@ -25,7 +25,32 @@ When planning your Solr upgrade, consider the customizations you have made to yo
Detailed steps for upgrading a Solr cluster can be found in the section <<upgrading-a-solr-cluster.adoc#upgrading-a-solr-cluster,Upgrading a Solr Cluster>>.
== Upgrading from Solr 7.0
== Upgrading from 7.x Releases
=== Solr 7.2
See the https://wiki.apache.org/solr/ReleaseNote72[7.2 Release Notes] for an overview of the main new features in Solr 7.2.
Users should be aware of the following major changes from v7.1:
* Starting a query string with <<local-parameters-in-queries.adoc#local-parameters-in-queries,local parameters>> `{!myparser ...}` is used to switch from one query parser to another, and is intended for use by Solr system developers, not end users doing searches. To reduce negative side-effects of unintended hack-ability, Solr now limits the cases when local parameters will be parsed to only contexts in which the default parser is "<<other-parsers.adoc#lucene-query-parser,lucene>>" or "<<other-parsers.adoc#function-query-parser,func>>".
+
So, if `defType=edismax` then `q={!myparser ...}` won't work. In that example, put the desired query parser into the `defType` parameter (a short SolrJ sketch of this pattern follows this list).
+
Another example is if `defType=edismax` then `hl.q={!myparser ...}` won't work for the same reason. In this example, either put the desired query parser into the `hl.qparser` parameter or set `hl.qparser=lucene`. Most users won't run into these cases but some will need to change.
+
If you must have full backwards compatibility, use `luceneMatchVersion=7.1.0` or an earlier version.
* The eDisMax parser by default no longer allows subqueries that specify a Solr parser using either local parameters, or the older `\_query_` magic field trick.
+
For example, `{!prefix f=myfield v=enterp}` or `\_query_:"{!prefix f=myfield v=enterp}"` are not supported by default any longer. If you want to allow power-users to do this, set `uf=*,\_query_` or some other value that includes `\_query_`.
+
If you need full backwards compatibility for the time being, use `luceneMatchVersion=7.1.0` or something earlier.
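
Purely as an illustrative sketch of the `defType` guidance above (the parser name `myparser` is a stand-in, not a real parser), a SolrJ request built for the new behavior might look like this:

[source,java]
----
import org.apache.solr.client.solrj.SolrQuery;

public class DefTypeExample {
  public static void main(String[] args) {
    // Previously a leading "{!myparser ...}" in q could switch parsers even under
    // defType=edismax; now the parser is chosen via defType instead.
    SolrQuery query = new SolrQuery("memory");   // plain user query text
    query.set("defType", "myparser");            // stand-in for whichever parser you need
    System.out.println(query);                   // prints q=memory&defType=myparser
  }
}
----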
=== Solr 7.1
See the https://wiki.apache.org/solr/ReleaseNote71[7.1 Release Notes] for an overview of the main new features of Solr 7.1.
Users should be aware of the following major changes from v7.0:
* The feature to automatically add replicas if a replica goes down, previously available only when storing indexes in HDFS, has been ported to the autoscaling framework. Due to this, `autoAddReplicas` is now available to all users even if their indexes are on local disks.
@ -39,7 +64,7 @@ More information about the changes to this feature can be found in the section <
* Shard and cluster metric reporter configuration now require a class attribute.
** If a reporter configures the `group="shard"` attribute then please also configure the `class="org.apache.solr.metrics.reporters.solr.SolrShardReporter"` attribute.
** If a reporter configures the `group="cluster"` attribute then please also configure the `class="org.apache.solr.metrics.reporters.solr.SolrClusterReporter"` attribute.
** If a reporter configures the `group="cluster"` attribute then please also configure the `class="org.apache.solr.metrics.reporters.solr.SolrClusterReporter"` attribute.
+
See the section <<metrics-reporting.adoc#shard-and-cluster-reporters,Shard and Cluster Reporters>> for more information.
@ -48,12 +73,12 @@ See the section <<metrics-reporting.adoc#shard-and-cluster-reporters,Shard and C
* The `RunExecutableListener` was removed for security reasons. If you want to listen to events caused by updates, commits, or optimize, write your own listener as native Java class as part of a Solr plugin.
* In the XML query parser (`defType=xmlparser` or `{!xmlparser ... }`) the resolving of external entities is now disallowed by default.
== Upgrading from Any 6.x Release
== Upgrading to 7.0 from Any 6.x Release
The upgrade from Solr 6.x to Solr 7 introduces several *major* changes that you should be aware of before upgrading. Please do a thorough review of the section <<major-changes-in-solr-7.adoc#major-changes-in-solr-7,Major Changes in Solr 7>> before starting your upgrade.
== Upgrading from Older Versions of Solr
== Upgrading from pre-6.x Versions of Solr
Users upgrading from versions of Solr prior to 6.x are strongly encouraged to consult {solr-javadocs}/changes/Changes.html[`CHANGES.txt`] for the details of _all_ changes since the version they are upgrading from.
A summary of the significant changes between Solr 5.x and Solr 6.0 can be found in the <<major-changes-from-solr-5-to-solr-6.adoc#major-changes-from-solr-5-to-solr-6,Major Changes from Solr 5 to Solr 6>> section.
A summary of the significant changes between Solr 5.x and Solr 6.0 can be found in the section <<major-changes-from-solr-5-to-solr-6.adoc#major-changes-from-solr-5-to-solr-6,Major Changes from Solr 5 to Solr 6>>.

View File

@ -142,10 +142,13 @@ However, since the first node in the first example had more than 1 replica for a
In the above example the node with port 8983 has two replicas for `shard1` in violation of our policy.
== Suggestions API ==
Suggestions are operations recommended by the system according to the policies and preferences the user has set. Note that the suggestions are made only if there are `violations` to the policies and the collection admin operation would use the preferences to identify the target node.
== Suggestions API
Suggestions are operations recommended by the system according to the policies and preferences the user has set.
Suggestions are made only if there are `violations` to active policies. The `operation` section of the response uses the defined preferences to identify the target node.
The API is available at `/admin/autoscaling/suggestion`. Here is an example output from a suggestion request:
The API is available at `/admin/autoscaling/suggestion`
[source,json]
----
{
@ -191,7 +194,7 @@ The API is available at `/admin/autocaling/suggestion`
"WARNING":"This response format is experimental. It is likely to change in the future."}
----
The operation is an actual API call that can be invoked to remedy the current violation
The suggested `operation` is an API call that can be invoked to remedy the current violation.
== History API
@ -213,14 +216,31 @@ further insight into e.g., exact operations that were computed and/or executed.
Specifically, the following query parameters can be used (they are turned into
filter queries, so an implicit AND is applied):
* `trigger` - trigger name
* `eventType` - event type / trigger type (e.g., `nodeAdded`)
* `collection` - collection name involved in event processing
* `stage` - event processing stage
* `action` - trigger action
* `node` - node name that the event refers to
* `beforeAction` - beforeAction stage
* `afterAction` - afterAction stage
`trigger`::
The name of the trigger.
`eventType`::
The event type or trigger type (e.g., `nodeAdded`).
`collection`::
The name of the collection involved in event processing.
`stage`::
An event processing stage.
`action`::
A trigger action.
`node`::
A node name that the event refers to.
`beforeAction`::
A `beforeAction` stage.
`afterAction`::
An `afterAction` stage.
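
As an input sketch only (the endpoint path `/admin/autoscaling/history` and every parameter value below are illustrative assumptions), a SolrJ request that filters the history by a few of these parameters, relying on the implicit AND, could look like this:

[source,java]
----
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrRequest;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.request.GenericSolrRequest;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.util.NamedList;

public class AutoscalingHistoryExample {
  public static void main(String[] args) throws Exception {
    // Placeholder base URL; the parameter names follow the list above, the values are made up.
    try (SolrClient client = new HttpSolrClient.Builder("http://localhost:8983/solr").build()) {
      ModifiableSolrParams params = new ModifiableSolrParams();
      params.set("trigger", "node_lost_trigger");   // hypothetical trigger name
      params.set("eventType", "nodeLost");          // event type to filter on
      params.set("collection", "techproducts");     // hypothetical collection name

      GenericSolrRequest request =
          new GenericSolrRequest(SolrRequest.METHOD.GET, "/admin/autoscaling/history", params);
      NamedList<Object> history = client.request(request);
      System.out.println(history);
    }
  }
}
----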
// TODO someday add an input example also
.Example output
[source,json]

View File

@ -135,8 +135,12 @@ public interface FacetParams {
* In SOLR-8599 and SOLR-8988, significant performance increase has been seen when enabling this optimization.
*
* Note: enabling this flag has no effect when the conditions above are not met. For those other cases the default behavior is sufficient.
*
* @deprecated
* This option is no longer used, nor will it affect any queries, as the fix has been built in. (SOLR-11711)
* This will be removed entirely in 8.0.0.
*/
@Deprecated
public static final String FACET_DISTRIB_MCO = FACET_DISTRIB + ".mco";
/**

View File

@ -18,14 +18,17 @@
package org.apache.solr.client.ref_guide_examples;
import java.io.File;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.UUID;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrQuery.ORDER;
import org.apache.solr.client.solrj.SolrRequest;
import org.apache.solr.client.solrj.beans.Field;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
@ -54,6 +57,8 @@ public class UsingSolrJRefGuideExamplesTest extends SolrCloudTestCase {
private static final int NUM_INDEXED_DOCUMENTS = 3;
private static final int NUM_LIVE_NODES = 1;
private Queue<String> expectedLines = new ArrayDeque<>();
@BeforeClass
public static void setUpCluster() throws Exception {
@ -69,6 +74,7 @@ public class UsingSolrJRefGuideExamplesTest extends SolrCloudTestCase {
@Override
public void setUp() throws Exception {
super.setUp();
expectedLines.clear();
final SolrClient client = getSolrClient();
final List<TechProduct> products = new ArrayList<TechProduct>();
@ -84,6 +90,7 @@ public class UsingSolrJRefGuideExamplesTest extends SolrCloudTestCase {
@Override
public void tearDown() throws Exception {
super.tearDown();
ensureNoLeftoverOutputExpectations();
final SolrClient client = getSolrClient();
client.deleteByQuery("techproducts", "*:*");
@ -92,45 +99,60 @@ public class UsingSolrJRefGuideExamplesTest extends SolrCloudTestCase {
@Test
public void queryWithRawSolrParamsExample() throws Exception {
expectLine("Found 3 documents");
expectLine("id: 1; name: Fitbit Alta");
expectLine("id: 2; name: Sony Walkman");
expectLine("id: 3; name: Garmin GPS");
// tag::solrj-query-with-raw-solrparams[]
final SolrClient client = getSolrClient();
final Map<String, String> queryParamMap = new HashMap<String, String>();
queryParamMap.put("q", "*:*");
queryParamMap.put("fl", "id, name");
queryParamMap.put("sort", "id asc");
MapSolrParams queryParams = new MapSolrParams(queryParamMap);
final QueryResponse response = client.query("techproducts", queryParams);
final SolrDocumentList documents = response.getResults();
assertEquals(NUM_INDEXED_DOCUMENTS, documents.getNumFound());
print("Found " + documents.getNumFound() + " documents");
for(SolrDocument document : documents) {
assertTrue(document.getFieldNames().contains("id"));
assertTrue(document.getFieldNames().contains("name"));
final String id = (String) document.getFirstValue("id");
final String name = (String) document.getFirstValue("name");
print("id: " + id + "; name: " + name);
}
// end::solrj-query-with-raw-solrparams[]
}
@Test
public void queryWithSolrQueryExample() throws Exception {
final int numResultsToReturn = 1;
final int numResultsToReturn = 3;
expectLine("Found 3 documents");
expectLine("id: 1; name: Fitbit Alta");
expectLine("id: 2; name: Sony Walkman");
expectLine("id: 3; name: Garmin GPS");
final SolrClient client = getSolrClient();
// tag::solrj-query-with-solrquery[]
final SolrQuery query = new SolrQuery("*:*");
query.addField("id");
query.addField("name");
query.setSort("id", ORDER.asc);
query.setRows(numResultsToReturn);
// end::solrj-query-with-solrquery[]
final QueryResponse response = client.query("techproducts", query);
final SolrDocumentList documents = response.getResults();
assertEquals(NUM_INDEXED_DOCUMENTS, documents.getNumFound());
print("Found " + documents.getNumFound() + " documents");
assertEquals(numResultsToReturn, documents.size());
for(SolrDocument document : documents) {
assertTrue(document.getFieldNames().contains("id"));
assertTrue(document.getFieldNames().contains("name"));
final String id = (String) document.getFirstValue("id");
final String name = (String) document.getFirstValue("name");
print("id: "+ id + "; name: " + name);
}
}
@ -167,26 +189,32 @@ public class UsingSolrJRefGuideExamplesTest extends SolrCloudTestCase {
@Test
public void queryBeanValueTypeExample() throws Exception {
expectLine("Found 3 documents");
expectLine("id: 1; name: Fitbit Alta");
expectLine("id: 2; name: Sony Walkman");
expectLine("id: 3; name: Garmin GPS");
// tag::solrj-query-bean-value-type[]
final SolrClient client = getSolrClient();
final SolrQuery query = new SolrQuery("*:*");
query.addField("id");
query.addField("name");
query.setSort("id", ORDER.asc);
final QueryResponse response = client.query("techproducts", query);
final List<TechProduct> products = response.getBeans(TechProduct.class);
// end::solrj-query-bean-value-type[]
assertEquals(NUM_INDEXED_DOCUMENTS, products.size());
print("Found " + products.size() + " documents");
for (TechProduct product : products) {
assertFalse(product.id.isEmpty());
assertFalse(product.name.isEmpty());
print("id: " + product.id + "; name: " + product.name);
}
}
@Test
public void otherSolrApisExample() throws Exception {
expectLine("Found "+NUM_LIVE_NODES+" live nodes");
// tag::solrj-other-apis[]
final SolrClient client = getSolrClient();
@ -196,7 +224,7 @@ public class UsingSolrJRefGuideExamplesTest extends SolrCloudTestCase {
final NamedList<Object> cluster = (NamedList<Object>) response.get("cluster");
final List<String> liveNodes = (List<String>) cluster.get("live_nodes");
assertEquals(NUM_LIVE_NODES, liveNodes.size());
print("Found " + liveNodes.size() + " live nodes");
// end::solrj-other-apis[]
}
@ -231,5 +259,28 @@ public class UsingSolrJRefGuideExamplesTest extends SolrCloudTestCase {
public TechProduct() {}
}
// end::solrj-techproduct-value-type[]
private void expectLine(String expectedLine) {
expectedLines.add(expectedLine);
}
private void print(String actualOutput) {
final String nextExpectedLine = expectedLines.poll();
assertNotNull("No more output expected, but was asked to print: " + actualOutput, nextExpectedLine);
final String unexpectedOutputMessage = "Expected line containing " + nextExpectedLine + ", but printed line was: "
+ actualOutput;
assertTrue(unexpectedOutputMessage, actualOutput.contains(nextExpectedLine));
}
private void ensureNoLeftoverOutputExpectations() {
if (expectedLines.isEmpty()) return;
final StringBuilder builder = new StringBuilder();
builder.append("Leftover output was expected but not printed:");
for (String expectedLine : expectedLines) {
builder.append("\n\t" + expectedLine);
}
fail(builder.toString());
}
}