LUCENE-8984: MoreLikeThis MLT is biased for uncommon fields (#871)

* SOLR-13752: MoreLikeThis MLT is biased for uncommon fields
This commit is contained in:
Andrew Hind 2019-09-25 05:58:26 +01:00 committed by Anshum Gupta
parent 4ec4061cbc
commit d279fe8a80
4 changed files with 67 additions and 1 deletions

View File

@ -53,6 +53,8 @@ Improvements
* LUCENE-8937: Avoid aggressive stemming on numbers in the FrenchMinimalStemmer.
(Adrien Gallou via Tomoko Uchida)
* LUCENE-8984: MoreLikeThis MLT is biased for uncommon fields (Andy Hind via Anshum Gupta)
Bug fixes
* LUCENE-8663: NRTCachingDirectory.slowFileExists may open a file while

View File

@ -650,13 +650,17 @@ public final class MoreLikeThis {
*/
private PriorityQueue<ScoreTerm> createQueue(Map<String, Map<String, Int>> perFieldTermFrequencies) throws IOException {
// have collected all words in doc and their freqs
int numDocs = ir.numDocs();
final int limit = Math.min(maxQueryTerms, this.getTermsCount(perFieldTermFrequencies));
FreqQ queue = new FreqQ(limit); // will order words by score
for (Map.Entry<String, Map<String, Int>> entry : perFieldTermFrequencies.entrySet()) {
Map<String, Int> perWordTermFrequencies = entry.getValue();
String fieldName = entry.getKey();
long numDocs = ir.getDocCount(fieldName);
if(numDocs == -1) {
numDocs = ir.numDocs();
}
for (Map.Entry<String, Int> tfEntry : perWordTermFrequencies.entrySet()) { // for every word
String word = tfEntry.getKey();
int tf = tfEntry.getValue().x; // term freq in the source doc

View File

@ -23,6 +23,7 @@ import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
@ -130,6 +131,60 @@ public class TestMoreLikeThis extends LuceneTestCase {
writer.addDocument(doc);
}
/**
 * Regression test for LUCENE-8984: builds a 2000-document index in which the
 * field {@code one_percent} is present in only 20 documents (18 containing
 * just "all", 2 containing both "all" and "tenth"), while the remaining 1980
 * documents only carry a {@code text} field.
 *
 * <p>For both sample texts ("tenth tenth all" and "tenth all all") the
 * generated query is expected to contain exactly two term clauses, with
 * "all" first and "tenth" second.
 *
 * @throws Throwable if indexing, query generation, or resource cleanup fails
 */
public void testSmallSampleFromCorpus() throws Throwable {
  // add series of docs with terms of decreasing df
  Directory dir = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
  // 1980 documents that do not contain the field under test at all
  for (int i = 0; i < 1980; i++) {
    Document doc = new Document();
    doc.add(newTextField("text", "filler", Field.Store.YES));
    writer.addDocument(doc);
  }
  // 18 documents whose "one_percent" field holds only the term "all"
  for (int i = 0; i < 18; i++) {
    Document doc = new Document();
    doc.add(newTextField("one_percent", "all", Field.Store.YES));
    writer.addDocument(doc);
  }
  // 2 documents whose "one_percent" field holds both "all" and "tenth"
  for (int i = 0; i < 2; i++) {
    Document doc = new Document();
    doc.add(newTextField("one_percent", "all", Field.Store.YES));
    doc.add(newTextField("one_percent", "tenth", Field.Store.YES));
    writer.addDocument(doc);
  }
  IndexReader reader = writer.getReader();
  writer.close();

  // setup MLT query
  MoreLikeThis mlt = new MoreLikeThis(reader);
  Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
  mlt.setAnalyzer(analyzer);
  mlt.setMaxQueryTerms(3);
  mlt.setMinDocFreq(1);
  mlt.setMinTermFreq(1);
  mlt.setMinWordLen(1);
  mlt.setFieldNames(new String[]{"one_percent"});

  // sample text in which "tenth" has the higher term frequency
  BooleanQuery query = (BooleanQuery) mlt.like("one_percent", new StringReader("tenth tenth all"));
  Collection<BooleanClause> clauses = query.clauses();
  assertTrue("expected 2 clauses but got " + clauses.size(), clauses.size() == 2);
  Term term = ((TermQuery) ((List<BooleanClause>) clauses).get(0).getQuery()).getTerm();
  assertTrue("expected first term \"all\" but got \"" + term.text() + "\"", term.text().equals("all"));
  term = ((TermQuery) ((List<BooleanClause>) clauses).get(1).getQuery()).getTerm();
  assertTrue("expected second term \"tenth\" but got \"" + term.text() + "\"", term.text().equals("tenth"));

  // sample text in which "all" has the higher term frequency
  query = (BooleanQuery) mlt.like("one_percent", new StringReader("tenth all all"));
  clauses = query.clauses();
  assertTrue("expected 2 clauses but got " + clauses.size(), clauses.size() == 2);
  term = ((TermQuery) ((List<BooleanClause>) clauses).get(0).getQuery()).getTerm();
  assertTrue("expected first term \"all\" but got \"" + term.text() + "\"", term.text().equals("all"));
  term = ((TermQuery) ((List<BooleanClause>) clauses).get(1).getQuery()).getTerm();
  assertTrue("expected second term \"tenth\" but got \"" + term.text() + "\"", term.text().equals("tenth"));

  // clean up: release the reader before the directory it reads from
  reader.close();
  dir.close();
  analyzer.close();
}
public void testBoostFactor() throws Throwable {
Map<String,Float> originalValues = getOriginalValues();
mlt.setFieldNames(new String[] {"text"});

View File

@ -66,6 +66,11 @@ Upgrade Notes
* SOLR-13593 SOLR-13690 SOLR-13691: Allow to look up analyzer components by their SPI names in field type configuration. (Tomoko Uchida)
Improvements
----------------------
* LUCENE-8984: MoreLikeThis MLT is biased for uncommon fields (Andy Hind via Anshum Gupta)
Other Changes
----------------------