mirror of https://github.com/apache/lucene.git
LUCENE-8984: MoreLikeThis MLT is biased for uncommon fields (#871)
* SOLR-13752: MoreLikeThis MLT is biased for uncommon fields
This commit is contained in:
parent
4ec4061cbc
commit
d279fe8a80
|
@ -52,6 +52,8 @@ Improvements
|
||||||
|
|
||||||
* LUCENE-8937: Avoid aggressive stemming on numbers in the FrenchMinimalStemmer.
|
* LUCENE-8937: Avoid aggressive stemming on numbers in the FrenchMinimalStemmer.
|
||||||
(Adrien Gallou via Tomoko Uchida)
|
(Adrien Gallou via Tomoko Uchida)
|
||||||
|
|
||||||
|
* LUCENE-8984: MoreLikeThis MLT is biased for uncommon fields (Andy Hind via Anshum Gupta)
|
||||||
|
|
||||||
Bug fixes
|
Bug fixes
|
||||||
|
|
||||||
|
|
|
@ -650,13 +650,17 @@ public final class MoreLikeThis {
|
||||||
*/
|
*/
|
||||||
private PriorityQueue<ScoreTerm> createQueue(Map<String, Map<String, Int>> perFieldTermFrequencies) throws IOException {
|
private PriorityQueue<ScoreTerm> createQueue(Map<String, Map<String, Int>> perFieldTermFrequencies) throws IOException {
|
||||||
// have collected all words in doc and their freqs
|
// have collected all words in doc and their freqs
|
||||||
int numDocs = ir.numDocs();
|
|
||||||
final int limit = Math.min(maxQueryTerms, this.getTermsCount(perFieldTermFrequencies));
|
final int limit = Math.min(maxQueryTerms, this.getTermsCount(perFieldTermFrequencies));
|
||||||
FreqQ queue = new FreqQ(limit); // will order words by score
|
FreqQ queue = new FreqQ(limit); // will order words by score
|
||||||
for (Map.Entry<String, Map<String, Int>> entry : perFieldTermFrequencies.entrySet()) {
|
for (Map.Entry<String, Map<String, Int>> entry : perFieldTermFrequencies.entrySet()) {
|
||||||
Map<String, Int> perWordTermFrequencies = entry.getValue();
|
Map<String, Int> perWordTermFrequencies = entry.getValue();
|
||||||
String fieldName = entry.getKey();
|
String fieldName = entry.getKey();
|
||||||
|
|
||||||
|
long numDocs = ir.getDocCount(fieldName);
|
||||||
|
if(numDocs == -1) {
|
||||||
|
numDocs = ir.numDocs();
|
||||||
|
}
|
||||||
|
|
||||||
for (Map.Entry<String, Int> tfEntry : perWordTermFrequencies.entrySet()) { // for every word
|
for (Map.Entry<String, Int> tfEntry : perWordTermFrequencies.entrySet()) { // for every word
|
||||||
String word = tfEntry.getKey();
|
String word = tfEntry.getKey();
|
||||||
int tf = tfEntry.getValue().x; // term freq in the source doc
|
int tf = tfEntry.getValue().x; // term freq in the source doc
|
||||||
|
|
|
@ -23,6 +23,7 @@ import java.util.Arrays;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
|
import java.util.List;
|
||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
|
||||||
|
@ -130,6 +131,60 @@ public class TestMoreLikeThis extends LuceneTestCase {
|
||||||
writer.addDocument(doc);
|
writer.addDocument(doc);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public void testSmallSampleFromCorpus() throws Throwable {
|
||||||
|
// add series of docs with terms of decreasing df
|
||||||
|
Directory dir = newDirectory();
|
||||||
|
RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
|
||||||
|
for (int i = 0; i < 1980; i++) {
|
||||||
|
Document doc = new Document();
|
||||||
|
doc.add(newTextField("text", "filler", Field.Store.YES));
|
||||||
|
writer.addDocument(doc);
|
||||||
|
}
|
||||||
|
for (int i = 0; i < 18; i++) {
|
||||||
|
Document doc = new Document();
|
||||||
|
doc.add(newTextField("one_percent", "all", Field.Store.YES));
|
||||||
|
writer.addDocument(doc);
|
||||||
|
}
|
||||||
|
for (int i = 0; i < 2; i++) {
|
||||||
|
Document doc = new Document();
|
||||||
|
doc.add(newTextField("one_percent", "all", Field.Store.YES));
|
||||||
|
doc.add(newTextField("one_percent", "tenth", Field.Store.YES));
|
||||||
|
writer.addDocument(doc);
|
||||||
|
}
|
||||||
|
IndexReader reader = writer.getReader();
|
||||||
|
writer.close();
|
||||||
|
|
||||||
|
// setup MLT query
|
||||||
|
MoreLikeThis mlt = new MoreLikeThis(reader);
|
||||||
|
Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
|
||||||
|
mlt.setAnalyzer(analyzer);
|
||||||
|
mlt.setMaxQueryTerms(3);
|
||||||
|
mlt.setMinDocFreq(1);
|
||||||
|
mlt.setMinTermFreq(1);
|
||||||
|
mlt.setMinWordLen(1);
|
||||||
|
mlt.setFieldNames(new String[]{"one_percent"});
|
||||||
|
|
||||||
|
BooleanQuery query = (BooleanQuery) mlt.like("one_percent", new StringReader("tenth tenth all"));
|
||||||
|
Collection<BooleanClause> clauses = query.clauses();
|
||||||
|
|
||||||
|
assertTrue(clauses.size() == 2);
|
||||||
|
Term term = ((TermQuery) ((List<BooleanClause>) clauses).get(0).getQuery()).getTerm();
|
||||||
|
assertTrue(term.text().equals("all"));
|
||||||
|
term = ((TermQuery) ((List<BooleanClause>) clauses).get(1).getQuery()).getTerm();
|
||||||
|
assertTrue(term.text().equals("tenth"));
|
||||||
|
|
||||||
|
|
||||||
|
query = (BooleanQuery) mlt.like("one_percent", new StringReader("tenth all all"));
|
||||||
|
clauses = query.clauses();
|
||||||
|
|
||||||
|
assertTrue(clauses.size() == 2);
|
||||||
|
term = ((TermQuery) ((List<BooleanClause>) clauses).get(0).getQuery()).getTerm();
|
||||||
|
assertTrue(term.text().equals("all"));
|
||||||
|
term = ((TermQuery) ((List<BooleanClause>) clauses).get(1).getQuery()).getTerm();
|
||||||
|
assertTrue(term.text().equals("tenth"));
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
public void testBoostFactor() throws Throwable {
|
public void testBoostFactor() throws Throwable {
|
||||||
Map<String,Float> originalValues = getOriginalValues();
|
Map<String,Float> originalValues = getOriginalValues();
|
||||||
mlt.setFieldNames(new String[] {"text"});
|
mlt.setFieldNames(new String[] {"text"});
|
||||||
|
|
|
@ -66,6 +66,11 @@ Upgrade Notes
|
||||||
|
|
||||||
* SOLR-13593 SOLR-13690 SOLR-13691: Allow to look up analyzer components by their SPI names in field type configuration. (Tomoko Uchida)
|
* SOLR-13593 SOLR-13690 SOLR-13691: Allow to look up analyzer components by their SPI names in field type configuration. (Tomoko Uchida)
|
||||||
|
|
||||||
|
Improvements
|
||||||
|
----------------------
|
||||||
|
|
||||||
|
* LUCENE-8984: MoreLikeThis MLT is biased for uncommon fields (Andy Hind via Anshum Gupta)
|
||||||
|
|
||||||
Other Changes
|
Other Changes
|
||||||
----------------------
|
----------------------
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue