mirror of https://github.com/apache/lucene.git
LUCENE-8984: MoreLikeThis MLT is biased for uncommon fields (#871)
* SOLR-13752: MoreLikeThis MLT is biased for uncommon fields
This commit is contained in:
parent
4ec4061cbc
commit
d279fe8a80
|
@ -53,6 +53,8 @@ Improvements
|
|||
* LUCENE-8937: Avoid aggressive stemming on numbers in the FrenchMinimalStemmer.
|
||||
(Adrien Gallou via Tomoko Uchida)
|
||||
|
||||
* LUCENE-8984: MoreLikeThis MLT is biased for uncommon fields (Andy Hind via Anshum Gupta)
|
||||
|
||||
Bug fixes
|
||||
|
||||
* LUCENE-8663: NRTCachingDirectory.slowFileExists may open a file while
|
||||
|
|
|
@ -650,13 +650,17 @@ public final class MoreLikeThis {
|
|||
*/
|
||||
private PriorityQueue<ScoreTerm> createQueue(Map<String, Map<String, Int>> perFieldTermFrequencies) throws IOException {
|
||||
// have collected all words in doc and their freqs
|
||||
int numDocs = ir.numDocs();
|
||||
final int limit = Math.min(maxQueryTerms, this.getTermsCount(perFieldTermFrequencies));
|
||||
FreqQ queue = new FreqQ(limit); // will order words by score
|
||||
for (Map.Entry<String, Map<String, Int>> entry : perFieldTermFrequencies.entrySet()) {
|
||||
Map<String, Int> perWordTermFrequencies = entry.getValue();
|
||||
String fieldName = entry.getKey();
|
||||
|
||||
long numDocs = ir.getDocCount(fieldName);
|
||||
if(numDocs == -1) {
|
||||
numDocs = ir.numDocs();
|
||||
}
|
||||
|
||||
for (Map.Entry<String, Int> tfEntry : perWordTermFrequencies.entrySet()) { // for every word
|
||||
String word = tfEntry.getKey();
|
||||
int tf = tfEntry.getValue().x; // term freq in the source doc
|
||||
|
|
|
@ -23,6 +23,7 @@ import java.util.Arrays;
|
|||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
|
||||
|
@ -130,6 +131,60 @@ public class TestMoreLikeThis extends LuceneTestCase {
|
|||
writer.addDocument(doc);
|
||||
}
|
||||
|
||||
public void testSmallSampleFromCorpus() throws Throwable {
|
||||
// add series of docs with terms of decreasing df
|
||||
Directory dir = newDirectory();
|
||||
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
|
||||
for (int i = 0; i < 1980; i++) {
|
||||
Document doc = new Document();
|
||||
doc.add(newTextField("text", "filler", Field.Store.YES));
|
||||
writer.addDocument(doc);
|
||||
}
|
||||
for (int i = 0; i < 18; i++) {
|
||||
Document doc = new Document();
|
||||
doc.add(newTextField("one_percent", "all", Field.Store.YES));
|
||||
writer.addDocument(doc);
|
||||
}
|
||||
for (int i = 0; i < 2; i++) {
|
||||
Document doc = new Document();
|
||||
doc.add(newTextField("one_percent", "all", Field.Store.YES));
|
||||
doc.add(newTextField("one_percent", "tenth", Field.Store.YES));
|
||||
writer.addDocument(doc);
|
||||
}
|
||||
IndexReader reader = writer.getReader();
|
||||
writer.close();
|
||||
|
||||
// setup MLT query
|
||||
MoreLikeThis mlt = new MoreLikeThis(reader);
|
||||
Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
|
||||
mlt.setAnalyzer(analyzer);
|
||||
mlt.setMaxQueryTerms(3);
|
||||
mlt.setMinDocFreq(1);
|
||||
mlt.setMinTermFreq(1);
|
||||
mlt.setMinWordLen(1);
|
||||
mlt.setFieldNames(new String[]{"one_percent"});
|
||||
|
||||
BooleanQuery query = (BooleanQuery) mlt.like("one_percent", new StringReader("tenth tenth all"));
|
||||
Collection<BooleanClause> clauses = query.clauses();
|
||||
|
||||
assertTrue(clauses.size() == 2);
|
||||
Term term = ((TermQuery) ((List<BooleanClause>) clauses).get(0).getQuery()).getTerm();
|
||||
assertTrue(term.text().equals("all"));
|
||||
term = ((TermQuery) ((List<BooleanClause>) clauses).get(1).getQuery()).getTerm();
|
||||
assertTrue(term.text().equals("tenth"));
|
||||
|
||||
|
||||
query = (BooleanQuery) mlt.like("one_percent", new StringReader("tenth all all"));
|
||||
clauses = query.clauses();
|
||||
|
||||
assertTrue(clauses.size() == 2);
|
||||
term = ((TermQuery) ((List<BooleanClause>) clauses).get(0).getQuery()).getTerm();
|
||||
assertTrue(term.text().equals("all"));
|
||||
term = ((TermQuery) ((List<BooleanClause>) clauses).get(1).getQuery()).getTerm();
|
||||
assertTrue(term.text().equals("tenth"));
|
||||
|
||||
}
|
||||
|
||||
public void testBoostFactor() throws Throwable {
|
||||
Map<String,Float> originalValues = getOriginalValues();
|
||||
mlt.setFieldNames(new String[] {"text"});
|
||||
|
|
|
@ -66,6 +66,11 @@ Upgrade Notes
|
|||
|
||||
* SOLR-13593 SOLR-13690 SOLR-13691: Allow to look up analyzer components by their SPI names in field type configuration. (Tomoko Uchida)
|
||||
|
||||
Improvements
|
||||
----------------------
|
||||
|
||||
* LUCENE-8984: MoreLikeThis MLT is biased for uncommon fields (Andy Hind via Anshum Gupta)
|
||||
|
||||
Other Changes
|
||||
----------------------
|
||||
|
||||
|
|
Loading…
Reference in New Issue