BlendedTermQuery should ignore fields that don't exist in the index (#41125)
Today the blended term query detects whether a term exists in a field by looking at the term statistics in the index. However, the value used to indicate that a term has no occurrence in a field has changed in Lucene: a non-existing term now returns a doc frequency and a total term frequency of 0. Because of this discrepancy, the blended term query picks 0 as the minimum frequency for a term even if other fields have documents for this term. This confuses the term queries that the blending creates, since some of them contain a custom state that indicates a frequency of 0 even though the term has some occurrences in the field. For these terms an exception is thrown, because the term query always checks that the term state's frequency is greater than 0 if there are documents associated with it. This change fixes the bug by ignoring terms with a doc freq of 0 when the blended term query picks the minimum term frequency among the requested fields. Closes #41118
This commit is contained in:
parent
116167df55
commit
c22a2cea12
|
@ -113,23 +113,17 @@ public abstract class BlendedTermQuery extends Query {
|
||||||
// TODO: Maybe it could also make sense to assume independent distributions of documents and eg. have:
|
// TODO: Maybe it could also make sense to assume independent distributions of documents and eg. have:
|
||||||
// df = df1 + df2 - (df1 * df2 / maxDoc)?
|
// df = df1 + df2 - (df1 * df2 / maxDoc)?
|
||||||
max = Math.max(df, max);
|
max = Math.max(df, max);
|
||||||
if (minSumTTF != -1 && ctx.totalTermFreq() != -1) {
|
if (ctx.totalTermFreq() > 0) {
|
||||||
// we need to find out the minimum sumTTF to adjust the statistics
|
// we need to find out the minimum sumTTF to adjust the statistics
|
||||||
// otherwise the statistics don't match
|
// otherwise the statistics don't match
|
||||||
minSumTTF = Math.min(minSumTTF, reader.getSumTotalTermFreq(terms[i].field()));
|
minSumTTF = Math.min(minSumTTF, reader.getSumTotalTermFreq(terms[i].field()));
|
||||||
} else {
|
|
||||||
minSumTTF = -1;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
if (minSumTTF != -1 && maxDoc > minSumTTF) {
|
|
||||||
maxDoc = (int)minSumTTF;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (max == 0) {
|
if (max == 0) {
|
||||||
return; // we are done that term doesn't exist at all
|
return; // we are done that term doesn't exist at all
|
||||||
}
|
}
|
||||||
long sumTTF = minSumTTF == -1 ? -1 : 0;
|
long sumTTF = 0;
|
||||||
final int[] tieBreak = new int[contexts.length];
|
final int[] tieBreak = new int[contexts.length];
|
||||||
for (int i = 0; i < tieBreak.length; ++i) {
|
for (int i = 0; i < tieBreak.length; ++i) {
|
||||||
tieBreak[i] = i;
|
tieBreak[i] = i;
|
||||||
|
@ -165,11 +159,7 @@ public abstract class BlendedTermQuery extends Query {
|
||||||
}
|
}
|
||||||
contexts[i] = ctx = adjustDF(reader.getContext(), ctx, Math.min(maxDoc, actualDf));
|
contexts[i] = ctx = adjustDF(reader.getContext(), ctx, Math.min(maxDoc, actualDf));
|
||||||
prev = current;
|
prev = current;
|
||||||
if (sumTTF >= 0 && ctx.totalTermFreq() >= 0) {
|
sumTTF += ctx.totalTermFreq();
|
||||||
sumTTF += ctx.totalTermFreq();
|
|
||||||
} else {
|
|
||||||
sumTTF = -1; // omit once TF is omitted anywhere!
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
sumTTF = Math.min(sumTTF, minSumTTF);
|
sumTTF = Math.min(sumTTF, minSumTTF);
|
||||||
for (int i = 0; i < contexts.length; i++) {
|
for (int i = 0; i < contexts.length; i++) {
|
||||||
|
@ -177,17 +167,12 @@ public abstract class BlendedTermQuery extends Query {
|
||||||
if (df == 0) {
|
if (df == 0) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
// the blended sumTTF can't be greater than the sumTTTF on the field
|
contexts[i] = adjustTTF(reader.getContext(), contexts[i], sumTTF);
|
||||||
final long fixedTTF = sumTTF == -1 ? -1 : sumTTF;
|
|
||||||
contexts[i] = adjustTTF(reader.getContext(), contexts[i], fixedTTF);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private TermStates adjustTTF(IndexReaderContext readerContext, TermStates termContext, long sumTTF) throws IOException {
|
private TermStates adjustTTF(IndexReaderContext readerContext, TermStates termContext, long sumTTF) throws IOException {
|
||||||
assert termContext.wasBuiltFor(readerContext);
|
assert termContext.wasBuiltFor(readerContext);
|
||||||
if (sumTTF == -1 && termContext.totalTermFreq() == -1) {
|
|
||||||
return termContext;
|
|
||||||
}
|
|
||||||
TermStates newTermContext = new TermStates(readerContext);
|
TermStates newTermContext = new TermStates(readerContext);
|
||||||
List<LeafReaderContext> leaves = readerContext.leaves();
|
List<LeafReaderContext> leaves = readerContext.leaves();
|
||||||
final int len;
|
final int len;
|
||||||
|
@ -213,12 +198,7 @@ public abstract class BlendedTermQuery extends Query {
|
||||||
private static TermStates adjustDF(IndexReaderContext readerContext, TermStates ctx, int newDocFreq) throws IOException {
|
private static TermStates adjustDF(IndexReaderContext readerContext, TermStates ctx, int newDocFreq) throws IOException {
|
||||||
assert ctx.wasBuiltFor(readerContext);
|
assert ctx.wasBuiltFor(readerContext);
|
||||||
// Use a value of ttf that is consistent with the doc freq (ie. gte)
|
// Use a value of ttf that is consistent with the doc freq (ie. gte)
|
||||||
long newTTF;
|
long newTTF = Math.max(ctx.totalTermFreq(), newDocFreq);
|
||||||
if (ctx.totalTermFreq() < 0) {
|
|
||||||
newTTF = -1;
|
|
||||||
} else {
|
|
||||||
newTTF = Math.max(ctx.totalTermFreq(), newDocFreq);
|
|
||||||
}
|
|
||||||
List<LeafReaderContext> leaves = readerContext.leaves();
|
List<LeafReaderContext> leaves = readerContext.leaves();
|
||||||
final int len;
|
final int len;
|
||||||
if (leaves == null) {
|
if (leaves == null) {
|
||||||
|
|
|
@ -28,10 +28,12 @@ import org.apache.lucene.index.IndexOptions;
|
||||||
import org.apache.lucene.index.IndexWriter;
|
import org.apache.lucene.index.IndexWriter;
|
||||||
import org.apache.lucene.index.MultiReader;
|
import org.apache.lucene.index.MultiReader;
|
||||||
import org.apache.lucene.index.Term;
|
import org.apache.lucene.index.Term;
|
||||||
|
import org.apache.lucene.index.TermStates;
|
||||||
import org.apache.lucene.search.BooleanClause;
|
import org.apache.lucene.search.BooleanClause;
|
||||||
import org.apache.lucene.search.BooleanQuery;
|
import org.apache.lucene.search.BooleanQuery;
|
||||||
import org.apache.lucene.search.DisjunctionMaxQuery;
|
import org.apache.lucene.search.DisjunctionMaxQuery;
|
||||||
import org.apache.lucene.search.IndexSearcher;
|
import org.apache.lucene.search.IndexSearcher;
|
||||||
|
import org.apache.lucene.search.Query;
|
||||||
import org.apache.lucene.search.QueryUtils;
|
import org.apache.lucene.search.QueryUtils;
|
||||||
import org.apache.lucene.search.ScoreDoc;
|
import org.apache.lucene.search.ScoreDoc;
|
||||||
import org.apache.lucene.search.ScoreMode;
|
import org.apache.lucene.search.ScoreMode;
|
||||||
|
@ -52,6 +54,8 @@ import java.util.Set;
|
||||||
|
|
||||||
import static org.hamcrest.Matchers.containsInAnyOrder;
|
import static org.hamcrest.Matchers.containsInAnyOrder;
|
||||||
import static org.hamcrest.Matchers.equalTo;
|
import static org.hamcrest.Matchers.equalTo;
|
||||||
|
import static org.hamcrest.Matchers.greaterThan;
|
||||||
|
import static org.hamcrest.Matchers.instanceOf;
|
||||||
|
|
||||||
public class BlendedTermQueryTests extends ESTestCase {
|
public class BlendedTermQueryTests extends ESTestCase {
|
||||||
public void testDismaxQuery() throws IOException {
|
public void testDismaxQuery() throws IOException {
|
||||||
|
@ -114,6 +118,61 @@ public class BlendedTermQueryTests extends ESTestCase {
|
||||||
assertEquals(Integer.toString(1), reader.document(scoreDocs[0].doc).getField("id").stringValue());
|
assertEquals(Integer.toString(1), reader.document(scoreDocs[0].doc).getField("id").stringValue());
|
||||||
|
|
||||||
}
|
}
|
||||||
|
{
|
||||||
|
// test with an unknown field
|
||||||
|
String[] fields = new String[] {"username", "song", "unknown_field"};
|
||||||
|
Query query = BlendedTermQuery.dismaxBlendedQuery(toTerms(fields, "foo"), 1.0f);
|
||||||
|
Query rewrite = searcher.rewrite(query);
|
||||||
|
assertThat(rewrite, instanceOf(BooleanQuery.class));
|
||||||
|
for (BooleanClause clause : (BooleanQuery) rewrite) {
|
||||||
|
assertThat(clause.getQuery(), instanceOf(TermQuery.class));
|
||||||
|
TermQuery termQuery = (TermQuery) clause.getQuery();
|
||||||
|
TermStates termStates = termQuery.getTermStates();
|
||||||
|
if (termQuery.getTerm().field().equals("unknown_field")) {
|
||||||
|
assertThat(termStates.docFreq(), equalTo(0));
|
||||||
|
assertThat(termStates.totalTermFreq(), equalTo(0L));
|
||||||
|
} else {
|
||||||
|
assertThat(termStates.docFreq(), greaterThan(0));
|
||||||
|
assertThat(termStates.totalTermFreq(), greaterThan(0L));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
assertThat(searcher.search(query, 10).totalHits.value, equalTo((long) iters + username.length));
|
||||||
|
}
|
||||||
|
{
|
||||||
|
// test with an unknown field and an unknown term
|
||||||
|
String[] fields = new String[] {"username", "song", "unknown_field"};
|
||||||
|
Query query = BlendedTermQuery.dismaxBlendedQuery(toTerms(fields, "unknown_term"), 1.0f);
|
||||||
|
Query rewrite = searcher.rewrite(query);
|
||||||
|
assertThat(rewrite, instanceOf(BooleanQuery.class));
|
||||||
|
for (BooleanClause clause : (BooleanQuery) rewrite) {
|
||||||
|
assertThat(clause.getQuery(), instanceOf(TermQuery.class));
|
||||||
|
TermQuery termQuery = (TermQuery) clause.getQuery();
|
||||||
|
TermStates termStates = termQuery.getTermStates();
|
||||||
|
assertThat(termStates.docFreq(), equalTo(0));
|
||||||
|
assertThat(termStates.totalTermFreq(), equalTo(0L));
|
||||||
|
}
|
||||||
|
assertThat(searcher.search(query, 10).totalHits.value, equalTo(0L));
|
||||||
|
}
|
||||||
|
{
|
||||||
|
// test with an unknown field and a term that is present in only one field
|
||||||
|
String[] fields = new String[] {"username", "song", "id", "unknown_field"};
|
||||||
|
Query query = BlendedTermQuery.dismaxBlendedQuery(toTerms(fields, "fan"), 1.0f);
|
||||||
|
Query rewrite = searcher.rewrite(query);
|
||||||
|
assertThat(rewrite, instanceOf(BooleanQuery.class));
|
||||||
|
for (BooleanClause clause : (BooleanQuery) rewrite) {
|
||||||
|
assertThat(clause.getQuery(), instanceOf(TermQuery.class));
|
||||||
|
TermQuery termQuery = (TermQuery) clause.getQuery();
|
||||||
|
TermStates termStates = termQuery.getTermStates();
|
||||||
|
if (termQuery.getTerm().field().equals("username")) {
|
||||||
|
assertThat(termStates.docFreq(), equalTo(1));
|
||||||
|
assertThat(termStates.totalTermFreq(), equalTo(1L));
|
||||||
|
} else {
|
||||||
|
assertThat(termStates.docFreq(), equalTo(0));
|
||||||
|
assertThat(termStates.totalTermFreq(), equalTo(0L));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
assertThat(searcher.search(query, 10).totalHits.value, equalTo(1L));
|
||||||
|
}
|
||||||
reader.close();
|
reader.close();
|
||||||
w.close();
|
w.close();
|
||||||
dir.close();
|
dir.close();
|
||||||
|
|
Loading…
Reference in New Issue