diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 661377f4661..cb969eb3388 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -52,6 +52,8 @@ Improvements * LUCENE-8937: Avoid agressive stemming on numbers in the FrenchMinimalStemmer. (Adrien Gallou via Tomoko Uchida) + +* LUCENE-8984: MoreLikeThis (MLT) was biased for uncommon fields; createQueue now uses the per-field document count (IndexReader#getDocCount, falling back to numDocs() when it returns -1) instead of the index-wide numDocs(). (Andy Hind via Anshum Gupta) Bug fixes diff --git a/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java b/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java index 4fb6c4f410e..709e56fc5e3 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java @@ -650,13 +650,17 @@ public final class MoreLikeThis { */ private PriorityQueue createQueue(Map> perFieldTermFrequencies) throws IOException { // have collected all words in doc and their freqs - int numDocs = ir.numDocs(); final int limit = Math.min(maxQueryTerms, this.getTermsCount(perFieldTermFrequencies)); FreqQ queue = new FreqQ(limit); // will order words by score for (Map.Entry> entry : perFieldTermFrequencies.entrySet()) { Map perWordTermFrequencies = entry.getValue(); String fieldName = entry.getKey(); + long numDocs = ir.getDocCount(fieldName); + if(numDocs == -1) { + numDocs = ir.numDocs(); + } + for (Map.Entry tfEntry : perWordTermFrequencies.entrySet()) { // for every word String word = tfEntry.getKey(); int tf = tfEntry.getValue().x; // term freq in the source doc diff --git a/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java b/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java index 2061068bc72..eb21d1effe1 100644 --- a/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java +++ b/lucene/queries/src/test/org/apache/lucene/queries/mlt/TestMoreLikeThis.java @@ -23,6 +23,7 @@ import java.util.Arrays; import java.util.Collection; import java.util.HashMap; 
import java.util.HashSet; +import java.util.List; import java.util.Locale; import java.util.Map; @@ -130,6 +131,64 @@ public class TestMoreLikeThis extends LuceneTestCase { writer.addDocument(doc); } + /** LUCENE-8984: a field present in only 20 of the 2000 indexed documents must still contribute both of its terms ("all", "tenth") to the generated query. */ + public void testSmallSampleFromCorpus() throws Throwable { + // add series of docs with terms of decreasing df + Directory dir = newDirectory(); + RandomIndexWriter writer = new RandomIndexWriter(random(), dir); + for (int i = 0; i < 1980; i++) { + Document doc = new Document(); + doc.add(newTextField("text", "filler", Field.Store.YES)); + writer.addDocument(doc); + } + for (int i = 0; i < 18; i++) { + Document doc = new Document(); + doc.add(newTextField("one_percent", "all", Field.Store.YES)); + writer.addDocument(doc); + } + for (int i = 0; i < 2; i++) { + Document doc = new Document(); + doc.add(newTextField("one_percent", "all", Field.Store.YES)); + doc.add(newTextField("one_percent", "tenth", Field.Store.YES)); + writer.addDocument(doc); + } + IndexReader reader = writer.getReader(); + writer.close(); + + // setup MLT query + MoreLikeThis mlt = new MoreLikeThis(reader); + Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false); + mlt.setAnalyzer(analyzer); + mlt.setMaxQueryTerms(3); + mlt.setMinDocFreq(1); + mlt.setMinTermFreq(1); + mlt.setMinWordLen(1); + mlt.setFieldNames(new String[]{"one_percent"}); + + BooleanQuery query = (BooleanQuery) mlt.like("one_percent", new StringReader("tenth tenth all")); + Collection<BooleanClause> clauses = query.clauses(); + + assertEquals(2, clauses.size()); + Term term = ((TermQuery) ((List<BooleanClause>) clauses).get(0).getQuery()).getTerm(); + assertEquals("all", term.text()); + term = ((TermQuery) ((List<BooleanClause>) clauses).get(1).getQuery()).getTerm(); + assertEquals("tenth", term.text()); + + query = (BooleanQuery) mlt.like("one_percent", new StringReader("tenth all all")); + clauses = query.clauses(); + + assertEquals(2, clauses.size()); + term = ((TermQuery) ((List<BooleanClause>) clauses).get(0).getQuery()).getTerm(); + 
assertEquals("all", term.text()); + term = ((TermQuery) ((List<BooleanClause>) clauses).get(1).getQuery()).getTerm(); + assertEquals("tenth", term.text()); + + // release the analyzer, reader and directory opened by this test + analyzer.close(); + reader.close(); + dir.close(); + } + public void testBoostFactor() throws Throwable { Map originalValues = getOriginalValues(); mlt.setFieldNames(new String[] {"text"}); diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 8fb9bcc007b..b1fa38c1802 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -66,6 +66,11 @@ Upgrade Notes * SOLR-13593 SOLR-13690 SOLR-13691: Allow to look up analyzer components by their SPI names in field type configuration. (Tomoko Uchida) +Improvements +---------------------- + +* LUCENE-8984: MoreLikeThis MLT is biased for uncommon fields (Andy Hind via Anshum Gupta) + Other Changes ----------------------