MLT Query: Fix exclude with artificial documents

Artificial documents get assigned a random id. When include is set to false
(default), the ids of these documents also get included, when they should
rather be ignored.

Closes #8679
This commit is contained in:
Alex Ksikes 2014-11-27 09:47:02 +01:00
parent fe762c0eb5
commit d7338ffdbc
2 changed files with 24 additions and 18 deletions

View File

@ -27,6 +27,7 @@ import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.ElasticsearchIllegalArgumentException;
import org.elasticsearch.action.termvectors.MultiTermVectorsRequest;
import org.elasticsearch.action.termvectors.TermVectorsRequest;
@ -37,7 +38,6 @@ import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.lucene.search.MoreLikeThisQuery;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.index.analysis.Analysis;
import org.elasticsearch.index.mapper.Uid;
import org.elasticsearch.index.mapper.internal.UidFieldMapper;
import org.elasticsearch.index.search.morelikethis.MoreLikeThisFetchService;
@ -47,6 +47,8 @@ import java.util.Iterator;
import java.util.List;
import java.util.Set;
import static org.elasticsearch.index.mapper.Uid.createUidAsBytes;
/**
*
*/
@ -257,9 +259,7 @@ public class MoreLikeThisQueryParser implements QueryParser {
boolQuery.add(mltQuery, BooleanClause.Occur.SHOULD);
// exclude the items from the search
if (!include) {
TermsFilter filter = new TermsFilter(UidFieldMapper.NAME, Uid.createUids(items.getRequests()));
ConstantScoreQuery query = new ConstantScoreQuery(filter);
boolQuery.add(query, BooleanClause.Occur.MUST_NOT);
handleExclude(boolQuery, items);
}
return boolQuery;
}
@ -305,4 +305,20 @@ public class MoreLikeThisQueryParser implements QueryParser {
}
return moreLikeFields;
}
private void handleExclude(BooleanQuery boolQuery, MultiTermVectorsRequest likeItems) {
// artificial docs get assigned a random id and should be disregarded
List<BytesRef> uids = new ArrayList<>();
for (TermVectorsRequest item : likeItems) {
if (item.doc() != null) {
continue;
}
uids.add(createUidAsBytes(item.type(), item.id()));
}
if (!uids.isEmpty()) {
TermsFilter filter = new TermsFilter(UidFieldMapper.NAME, uids.toArray(new BytesRef[0]));
ConstantScoreQuery query = new ConstantScoreQuery(filter);
boolQuery.add(query, BooleanClause.Occur.MUST_NOT);
}
}
}

View File

@ -20,7 +20,6 @@
package org.elasticsearch.mlt;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.LuceneTestCase;
import org.elasticsearch.action.admin.cluster.health.ClusterHealthStatus;
import org.elasticsearch.action.admin.indices.create.CreateIndexRequestBuilder;
import org.elasticsearch.action.index.IndexRequestBuilder;
@ -569,17 +568,10 @@ public class MoreLikeThisActionTests extends ElasticsearchIntegrationTest {
}
@Test
@LuceneTestCase.AwaitsFix(bugUrl = "alex k working on it")
public void testMoreLikeThisArtificialDocs() throws Exception {
int numFields = randomIntBetween(5, 10);
logger.info("Creating an index with multiple fields ...");
XContentBuilder mapping = jsonBuilder().startObject().startObject("type1").startObject("properties");
for (int i = 0; i < numFields; i++) {
mapping.startObject("field"+i).field("type", "string").endObject();
}
mapping.endObject().endObject().endObject();
assertAcked(prepareCreate("test").addMapping("type1", mapping).get());
createIndex("test");
ensureGreen();
logger.info("Indexing a single document ...");
@ -588,17 +580,15 @@ public class MoreLikeThisActionTests extends ElasticsearchIntegrationTest {
doc.field("field"+i, generateRandomStringArray(5, 10));
}
doc.endObject();
List<IndexRequestBuilder> builders = new ArrayList<>();
builders.add(client().prepareIndex("test", "type1", "1").setSource(doc));
indexRandom(true, builders);
indexRandom(true, client().prepareIndex("test", "type1", "0").setSource(doc));
logger.info("Checking the document matches ...");
MoreLikeThisQueryBuilder mltQuery = moreLikeThisQuery()
.docs((Item) new Item().doc(doc).index("test").type("type1"))
.like((Item) new Item().doc(doc).index("test").type("type1"))
.minTermFreq(0)
.minDocFreq(0)
.maxQueryTerms(100)
.percentTermsToMatch(1); // strict all terms must match!
.minimumShouldMatch("100%"); // strict all terms must match!
SearchResponse response = client().prepareSearch("test").setTypes("type1")
.setQuery(mltQuery).get();
assertSearchResponse(response);