diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index a0d3d69f2a4..bea6a00e346 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -200,6 +200,9 @@ Bug Fixes * LUCENE-6395: Seeking by term ordinal was failing to set the term's bytes in MemoryIndex (Mike McCandless) +* LUCENE-6429: Removed the TermQuery(Term,int) constructor which could lead to + inconsistent term statistics. (Adrien Grand, Robert Muir) + Optimizations * LUCENE-6183, LUCENE-5647: Avoid recompressing stored fields diff --git a/lucene/core/src/java/org/apache/lucene/index/TermContext.java b/lucene/core/src/java/org/apache/lucene/index/TermContext.java index e6945d98f82..4c3190c5a82 100644 --- a/lucene/core/src/java/org/apache/lucene/index/TermContext.java +++ b/lucene/core/src/java/org/apache/lucene/index/TermContext.java @@ -53,6 +53,7 @@ public final class TermContext { assert context != null && context.isTopLevel; topReaderContext = context; docFreq = 0; + totalTermFreq = 0; final int len; if (context.leaves() == null) { len = 1; @@ -107,6 +108,7 @@ public final class TermContext { */ public void clear() { docFreq = 0; + totalTermFreq = 0; Arrays.fill(states, null); } @@ -160,12 +162,6 @@ public final class TermContext { public long totalTermFreq() { return totalTermFreq; } - - /** expert: only available for queries that want to lie about docfreq - * @lucene.internal */ - public void setDocFreq(int docFreq) { - this.docFreq = docFreq; - } /** Returns true if all terms stored here are real (e.g., not auto-prefix terms). * diff --git a/lucene/core/src/java/org/apache/lucene/search/TermQuery.java b/lucene/core/src/java/org/apache/lucene/search/TermQuery.java index eb2846a1172..4aba98ad0ac 100644 --- a/lucene/core/src/java/org/apache/lucene/search/TermQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/TermQuery.java @@ -18,6 +18,7 @@ package org.apache.lucene.search; */ import java.io.IOException; +import java.util.Objects; import java.util.Set; import org.apache.lucene.index.IndexReaderContext; @@ -40,7 +41,6 @@ import org.apache.lucene.util.ToStringUtils; */ public class TermQuery extends Query { private final Term term; - private final int docFreq; private final TermContext perReaderTermState; final class TermWeight extends Weight { @@ -138,16 +138,7 @@ public class TermQuery extends Query { /** Constructs a query for the term t. */ public TermQuery(Term t) { - this(t, -1); - } - - /** - * Expert: constructs a TermQuery that will use the provided docFreq instead - * of looking up the docFreq against the searcher. - */ - public TermQuery(Term t, int docFreq) { - term = t; - this.docFreq = docFreq; + term = Objects.requireNonNull(t); perReaderTermState = null; } @@ -157,9 +148,8 @@ public class TermQuery extends Query { */ public TermQuery(Term t, TermContext states) { assert states != null; - term = t; - docFreq = states.docFreq(); - perReaderTermState = states; + term = Objects.requireNonNull(t); + perReaderTermState = Objects.requireNonNull(states); } /** Returns the term of this query. */ @@ -181,9 +171,6 @@ public class TermQuery extends Query { termState = this.perReaderTermState; } - // we must not ignore the given docFreq - if set use the given value (lie) - if (docFreq != -1) termState.setDocFreq(docFreq); - return new TermWeight(searcher, needsScores, termState); } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/queries/FuzzyLikeThisQuery.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/queries/FuzzyLikeThisQuery.java index 845ac62dc32..7a8b714117e 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/queries/FuzzyLikeThisQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/queries/FuzzyLikeThisQuery.java @@ -27,12 +27,21 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.MultiFields; import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermContext; import org.apache.lucene.index.Terms; -import org.apache.lucene.search.*; -import org.apache.lucene.search.similarities.TFIDFSimilarity; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.BoostAttribute; +import org.apache.lucene.search.ConstantScoreQuery; +import org.apache.lucene.search.MaxNonCompetitiveBoostAttribute; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.similarities.DefaultSimilarity; +import org.apache.lucene.search.similarities.TFIDFSimilarity; import org.apache.lucene.util.AttributeSource; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.PriorityQueue; @@ -256,6 +265,27 @@ public class FuzzyLikeThisQuery extends Query } } + private Query newTermQuery(IndexReader reader, Term term) throws IOException { + if (ignoreTF) { + return new ConstantScoreQuery(new TermQuery(term)); + } else { + // we build an artificial TermContext that will give an overall df and ttf + // equal to 1 + TermContext context = new TermContext(reader.getContext()); + for (LeafReaderContext leafContext : reader.leaves()) { + Terms terms = leafContext.reader().terms(term.field()); + if (terms != null) { + TermsEnum termsEnum = terms.iterator(); + if (termsEnum.seekExact(term.bytes())) { + int freq = 1 - context.docFreq(); // we want the total df and ttf to be 1 + context.register(termsEnum.termState(), leafContext.ord, freq, freq); + } + } + } + return new TermQuery(term, context); + } + } + @Override public Query rewrite(IndexReader reader) throws IOException { @@ -298,7 +328,7 @@ public class FuzzyLikeThisQuery extends Query { //optimize where only one selected variant ScoreTerm st= variants.get(0); - Query tq = ignoreTF ? new ConstantScoreQuery(new TermQuery(st.term)) : new TermQuery(st.term, 1); + Query tq = newTermQuery(reader, st.term); tq.setBoost(st.score); // set the boost to a mix of IDF and score bq.add(tq, BooleanClause.Occur.SHOULD); } @@ -310,7 +340,7 @@ public class FuzzyLikeThisQuery extends Query { ScoreTerm st = iterator2.next(); // found a match - Query tq = ignoreTF ? new ConstantScoreQuery(new TermQuery(st.term)) : new TermQuery(st.term, 1); + Query tq = newTermQuery(reader, st.term); tq.setBoost(st.score); // set the boost using the ScoreTerm's score termVariants.add(tq, BooleanClause.Occur.SHOULD); // add to query }