MLT Query: Support for artificial documents

Previously, the only way to specify a document not present in the index was to
use `like_text`. This would usually lead to complex queries made of multiple
MLT queries per document field. This commit adds the ability to the MLT query
to directly specify documents not present in the index (artificial documents).
The syntax is similar to the Percolator API or to the Multi Term Vector API.

Closes #7725
This commit is contained in:
Alex Ksikes 2014-09-15 16:17:49 +02:00
parent 43a1e1c353
commit b118558962
10 changed files with 276 additions and 46 deletions

View File

@ -16,7 +16,7 @@ running it against one or more fields.
}
--------------------------------------------------
Additionally, More Like This can find documents that are "like" a set of
More Like This can find documents that are "like" a set of
chosen documents. The syntax to specify one or more documents is similar to
the <<docs-multi-get,Multi GET API>>, and supports the `ids` or `docs` array.
If only one document is specified, the query behaves the same as the
@ -46,6 +46,41 @@ If only one document is specified, the query behaves the same as the
}
--------------------------------------------------
Additionally, the `doc` syntax of the
<<docs-multi-termvectors,Multi Term Vectors API>> is also supported. This is useful in
order to specify one or more documents not present in the index, and in
this case should be preferred over only using `like_text`.
[source,js]
--------------------------------------------------
{
"more_like_this" : {
"fields" : ["name.first", "name.last"],
"docs" : [
{
"_index" : "test",
"_type" : "type",
"doc" : {
"name": {
"first": "Ben",
"last": "Grimm"
},
"tweet": "You got no idea what I'd... what I'd give to be invisible."
}
},
{
"_index" : "test",
"_type" : "type",
"_id" : "2"
}
],
"min_term_freq" : 1,
"max_query_terms" : 12
}
}
--------------------------------------------------
`more_like_this` can be shortened to `mlt`.
Under the hood, `more_like_this` simply creates multiple `should` clauses in a `bool` query of
@ -81,8 +116,8 @@ for `ids` or `docs`.
not specified.
|`ids` or `docs` |A list of documents following the same syntax as the
<<docs-multi-get,Multi GET API>>. The text is fetched from `fields`
unless specified otherwise in each `doc`.
<<docs-multi-get,Multi GET API>> or <<docs-multi-termvectors,Multi Term Vectors API>>.
The text is fetched from `fields` unless specified otherwise in each `doc`.
|`include` |When using `ids` or `docs`, specifies whether the documents should be
included from the search. Defaults to `false`.

View File

@ -0,0 +1,54 @@
---
"Basic mlt query with docs":
- do:
index:
index: test_1
type: test
id: 1
body: { foo: bar }
- do:
index:
index: test_1
type: test
id: 2
body: { foo: baz }
- do:
index:
index: test_1
type: test
id: 3
body: { foo: foo }
- do:
indices.refresh: {}
- do:
cluster.health:
wait_for_status: green
- do:
search:
index: test_1
type: test
body:
query:
more_like_this:
docs:
-
_index: test_1
_type: test
doc:
foo: bar
-
_index: test_1
_type: test
_id: 2
ids:
- 3
include: true
min_doc_freq: 0
min_term_freq: 0
- match: { hits.total: 3 }

View File

@ -19,10 +19,10 @@
package org.elasticsearch.action.termvector;
import com.google.common.collect.Iterators;
import org.elasticsearch.ElasticsearchIllegalArgumentException;
import org.elasticsearch.ElasticsearchParseException;
import org.elasticsearch.action.*;
import org.elasticsearch.action.get.MultiGetRequest;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.io.stream.StreamInput;
@ -31,12 +31,9 @@ import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.common.xcontent.XContentParser;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.*;
public class MultiTermVectorsRequest extends ActionRequest<MultiTermVectorsRequest> implements CompositeIndicesRequest {
public class MultiTermVectorsRequest extends ActionRequest<MultiTermVectorsRequest> implements Iterable<TermVectorRequest>, CompositeIndicesRequest {
String preference;
List<TermVectorRequest> requests = new ArrayList<>();
@ -53,11 +50,6 @@ public class MultiTermVectorsRequest extends ActionRequest<MultiTermVectorsReque
return this;
}
public MultiTermVectorsRequest add(MultiGetRequest.Item item) {
requests.add(new TermVectorRequest(item));
return this;
}
@Override
public ActionRequestValidationException validate() {
ActionRequestValidationException validationException = null;
@ -81,6 +73,19 @@ public class MultiTermVectorsRequest extends ActionRequest<MultiTermVectorsReque
return requests;
}
@Override
public Iterator<TermVectorRequest> iterator() {
return Iterators.unmodifiableIterator(requests.iterator());
}
public boolean isEmpty() {
return requests.isEmpty() && ids.isEmpty();
}
public List<TermVectorRequest> getRequests() {
return requests;
}
public void add(TermVectorRequest template, BytesReference data) throws Exception {
XContentParser.Token token;
String currentFieldName = null;

View File

@ -23,6 +23,7 @@ import com.google.common.collect.Sets;
import org.elasticsearch.ElasticsearchParseException;
import org.elasticsearch.Version;
import org.elasticsearch.action.ActionRequestValidationException;
import org.elasticsearch.action.DocumentRequest;
import org.elasticsearch.action.ValidateActions;
import org.elasticsearch.action.get.MultiGetRequest;
import org.elasticsearch.action.support.single.shard.SingleShardOperationRequest;
@ -45,7 +46,7 @@ import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
* Note, the {@link #index()}, {@link #type(String)} and {@link #id(String)} are
* required.
*/
public class TermVectorRequest extends SingleShardOperationRequest<TermVectorRequest> {
public class TermVectorRequest extends SingleShardOperationRequest<TermVectorRequest> implements DocumentRequest<TermVectorRequest> {
private String type;
@ -287,7 +288,7 @@ public class TermVectorRequest extends SingleShardOperationRequest<TermVectorReq
* Return only term vectors for special selected fields. Returns the term
* vectors for all fields if selectedFields == null
*/
public TermVectorRequest selectedFields(String[] fields) {
public TermVectorRequest selectedFields(String... fields) {
selectedFields = fields != null && fields.length != 0 ? Sets.newHashSet(fields) : null;
return this;
}

View File

@ -21,7 +21,8 @@ package org.elasticsearch.index.mapper;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.elasticsearch.action.get.MultiGetRequest;
import org.elasticsearch.action.DocumentRequest;
import org.elasticsearch.action.termvector.TermVectorRequest;
import org.elasticsearch.common.lucene.BytesRefs;
import java.util.Collection;
@ -95,11 +96,11 @@ public final class Uid {
return new Uid(uid.substring(0, delimiterIndex), uid.substring(delimiterIndex + 1));
}
public static BytesRef[] createUids(List<MultiGetRequest.Item> items) {
BytesRef[] uids = new BytesRef[items.size()];
public static BytesRef[] createUids(List<? extends DocumentRequest> requests) {
BytesRef[] uids = new BytesRef[requests.size()];
int idx = 0;
for (MultiGetRequest.Item item : items) {
uids[idx++] = createUidAsBytes(item);
for (DocumentRequest item : requests) {
uids[idx++] = createUidAsBytes(item.type(), item.id());
}
return uids;
}
@ -112,10 +113,6 @@ public final class Uid {
return createUidAsBytes(new BytesRef(type), id);
}
public static BytesRef createUidAsBytes(MultiGetRequest.Item item) {
return createUidAsBytes(item.type(), item.id());
}
public static BytesRef createUidAsBytes(BytesRef type, BytesRef id) {
final BytesRef ref = new BytesRef(type.length + 1 + id.length);
System.arraycopy(type.bytes, type.offset, ref.bytes, 0, type.length);

View File

@ -22,9 +22,9 @@ package org.elasticsearch.index.query;
import org.elasticsearch.ElasticsearchIllegalArgumentException;
import org.elasticsearch.action.get.MultiGetRequest;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.lucene.uid.Versions;
import org.elasticsearch.common.xcontent.ToXContent;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.*;
import org.elasticsearch.index.VersionType;
import org.elasticsearch.search.fetch.source.FetchSourceContext;
@ -44,6 +44,8 @@ public class MoreLikeThisQueryBuilder extends BaseQueryBuilder implements Boosta
* A single get item. Pure delegate to multi get.
*/
public static final class Item extends MultiGetRequest.Item implements ToXContent {
private BytesReference doc;
public Item() {
super();
}
@ -52,6 +54,15 @@ public class MoreLikeThisQueryBuilder extends BaseQueryBuilder implements Boosta
super(index, type, id);
}
public BytesReference doc() {
return doc;
}
public Item doc(XContentBuilder doc) {
this.doc = doc.bytes();
return this;
}
@Override
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
builder.startObject();
@ -61,6 +72,17 @@ public class MoreLikeThisQueryBuilder extends BaseQueryBuilder implements Boosta
if (this.id() != null) {
builder.field("_id", this.id());
}
if (this.doc() != null) {
XContentType contentType = XContentFactory.xContentType(doc);
if (contentType == builder.contentType()) {
builder.rawField("doc", doc);
} else {
XContentParser parser = XContentFactory.xContent(contentType).createParser(doc);
parser.nextToken();
builder.field("doc");
builder.copyCurrentStructure(parser);
}
}
if (this.type() != null) {
builder.field("_type", this.type());
}

View File

@ -28,7 +28,8 @@ import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.Query;
import org.elasticsearch.ElasticsearchIllegalArgumentException;
import org.elasticsearch.action.get.MultiGetRequest;
import org.elasticsearch.action.termvector.MultiTermVectorsRequest;
import org.elasticsearch.action.termvector.TermVectorRequest;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.ParseField;
import org.elasticsearch.common.Strings;
@ -41,7 +42,6 @@ import org.elasticsearch.index.mapper.internal.UidFieldMapper;
import org.elasticsearch.index.search.morelikethis.MoreLikeThisFetchService;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
@ -100,7 +100,7 @@ public class MoreLikeThisQueryParser implements QueryParser {
XContentParser.Token token;
String currentFieldName = null;
List<MultiGetRequest.Item> items = new ArrayList<MultiGetRequest.Item>();
MultiTermVectorsRequest items = new MultiTermVectorsRequest();
while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
if (token == XContentParser.Token.FIELD_NAME) {
currentFieldName = parser.currentName();
@ -155,9 +155,21 @@ public class MoreLikeThisQueryParser implements QueryParser {
moreLikeFields.add(parseContext.indexName(parser.text()));
}
} else if (Fields.DOCUMENT_IDS.match(currentFieldName, parseContext.parseFlags())) {
MultiGetRequest.parseIds(parser, items);
while ((token = parser.nextToken()) != XContentParser.Token.END_ARRAY) {
if (!token.isValue()) {
throw new ElasticsearchIllegalArgumentException("ids array element should only contain ids");
}
items.add(new TermVectorRequest().id(parser.text()));
}
} else if (Fields.DOCUMENTS.match(currentFieldName, parseContext.parseFlags())) {
MultiGetRequest.parseDocuments(parser, items);
while ((token = parser.nextToken()) != XContentParser.Token.END_ARRAY) {
if (token != XContentParser.Token.START_OBJECT) {
throw new ElasticsearchIllegalArgumentException("docs array element should include an object");
}
TermVectorRequest termVectorRequest = new TermVectorRequest();
TermVectorRequest.parseRequest(termVectorRequest, parser);
items.add(termVectorRequest);
}
} else {
throw new QueryParsingException(parseContext.index(), "[mlt] query does not support [" + currentFieldName + "]");
}
@ -197,7 +209,7 @@ public class MoreLikeThisQueryParser implements QueryParser {
// handle items
if (!items.isEmpty()) {
// set default index, type and fields if not specified
for (MultiGetRequest.Item item : items) {
for (TermVectorRequest item : items) {
if (item.index() == null) {
item.index(parseContext.index().name());
}
@ -209,11 +221,12 @@ public class MoreLikeThisQueryParser implements QueryParser {
item.type(parseContext.queryTypes().iterator().next());
}
}
if (item.fields() == null && item.fetchSourceContext() == null) {
// default fields if not present but don't override for artificial docs
if (item.selectedFields() == null && item.doc() == null) {
if (useDefaultField) {
item.fields("*");
item.selectedFields("*");
} else {
item.fields(moreLikeFields.toArray(new String[moreLikeFields.size()]));
item.selectedFields(moreLikeFields.toArray(new String[moreLikeFields.size()]));
}
}
}
@ -224,7 +237,7 @@ public class MoreLikeThisQueryParser implements QueryParser {
boolQuery.add(mltQuery, BooleanClause.Occur.SHOULD);
// exclude the items from the search
if (!include) {
TermsFilter filter = new TermsFilter(UidFieldMapper.NAME, Uid.createUids(items));
TermsFilter filter = new TermsFilter(UidFieldMapper.NAME, Uid.createUids(items.getRequests()));
ConstantScoreQuery query = new ConstantScoreQuery(filter);
boolQuery.add(query, BooleanClause.Occur.MUST_NOT);
}

View File

@ -47,11 +47,7 @@ public class MoreLikeThisFetchService extends AbstractComponent {
this.client = client;
}
public Fields[] fetch(List<MultiGetRequest.Item> items) throws IOException {
MultiTermVectorsRequest request = new MultiTermVectorsRequest();
for (MultiGetRequest.Item item : items) {
request.add(item);
}
public Fields[] fetch(MultiTermVectorsRequest request) throws IOException {
List<Fields> likeFields = new ArrayList<>();
MultiTermVectorsResponse responses = client.multiTermVectors(request).actionGet();
for (MultiTermVectorsItemResponse response : responses) {

View File

@ -35,7 +35,9 @@ import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.NumericUtils;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.ElasticsearchIllegalArgumentException;
import org.elasticsearch.action.get.MultiGetRequest;
import org.elasticsearch.action.termvector.MultiTermVectorsRequest;
import org.elasticsearch.action.termvector.TermVectorRequest;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.bytes.BytesArray;
import org.elasticsearch.common.compress.CompressedString;
import org.elasticsearch.common.lucene.Lucene;
@ -1666,10 +1668,11 @@ public class SimpleIndexQueryParserTests extends ElasticsearchSingleNodeTest {
super(null, ImmutableSettings.Builder.EMPTY_SETTINGS);
}
public Fields[] fetch(List<MultiGetRequest.Item> items) throws IOException {
@Override
public Fields[] fetch(MultiTermVectorsRequest items) throws IOException {
List<Fields> likeTexts = new ArrayList<>();
for (MultiGetRequest.Item item : items) {
likeTexts.add(generateFields(item.fields(), item.id()));
for (TermVectorRequest item : items) {
likeTexts.add(generateFields(item.selectedFields().toArray(Strings.EMPTY_ARRAY), item.id()));
}
return likeTexts.toArray(Fields.EMPTY_ARRAY);
}

View File

@ -29,8 +29,10 @@ import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.action.search.SearchType;
import org.elasticsearch.client.Client;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.index.query.MoreLikeThisQueryBuilder;
import org.elasticsearch.index.query.MoreLikeThisQueryBuilder.Item;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.builder.SearchSourceBuilder;
@ -565,4 +567,106 @@ public class MoreLikeThisActionTests extends ElasticsearchIntegrationTest {
}
}
}
@Test
public void testMoreLikeThisArtificialDocs() throws Exception {
int numFields = randomIntBetween(5, 10);
logger.info("Creating an index with multiple fields ...");
XContentBuilder mapping = jsonBuilder().startObject().startObject("type1").startObject("properties");
for (int i = 0; i < numFields; i++) {
mapping.startObject("field"+i).field("type", "string").endObject();
}
mapping.endObject().endObject().endObject();
assertAcked(prepareCreate("test").addMapping("type1", mapping).get());
ensureGreen();
logger.info("Indexing a single document ...");
XContentBuilder doc = jsonBuilder().startObject();
for (int i = 0; i < numFields; i++) {
doc.field("field"+i, generateRandomStringArray(5, 10));
}
doc.endObject();
List<IndexRequestBuilder> builders = new ArrayList<>();
builders.add(client().prepareIndex("test", "type1", "1").setSource(doc));
indexRandom(true, builders);
logger.info("Checking the document matches ...");
MoreLikeThisQueryBuilder mltQuery = moreLikeThisQuery()
.docs((Item) new Item().doc(doc).index("test").type("type1"))
.minTermFreq(0)
.minDocFreq(0)
.maxQueryTerms(100)
.percentTermsToMatch(1); // strict all terms must match!
SearchResponse response = client().prepareSearch("test").setTypes("type1")
.setQuery(mltQuery).get();
assertSearchResponse(response);
assertHitCount(response, 1);
}
@Test
public void testMoreLikeThisMalformedArtificialDocs() throws Exception {
logger.info("Creating an index with a single document ...");
indexRandom(true, client().prepareIndex("test", "type1", "1").setSource(jsonBuilder()
.startObject()
.field("text", "Hello World!")
.field("date", "2009-01-01")
.endObject()));
logger.info("Checking with a malformed field value ...");
XContentBuilder malformedFieldDoc = jsonBuilder()
.startObject()
.field("text", "Hello World!")
.field("date", "this is not a date!")
.endObject();
MoreLikeThisQueryBuilder mltQuery = moreLikeThisQuery()
.docs((Item) new Item().doc(malformedFieldDoc).index("test").type("type1"))
.minTermFreq(0)
.minDocFreq(0)
.percentTermsToMatch(0);
SearchResponse response = client().prepareSearch("test").setTypes("type1")
.setQuery(mltQuery).get();
assertSearchResponse(response);
assertHitCount(response, 0);
logger.info("Checking with an empty document ...");
XContentBuilder emptyDoc = jsonBuilder().startObject().endObject();
mltQuery = moreLikeThisQuery()
.docs((Item) new Item().doc(emptyDoc).index("test").type("type1"))
.minTermFreq(0)
.minDocFreq(0)
.percentTermsToMatch(0);
response = client().prepareSearch("test").setTypes("type1")
.setQuery(mltQuery).get();
assertSearchResponse(response);
assertHitCount(response, 0);
logger.info("Checking when document is malformed ...");
XContentBuilder malformedDoc = jsonBuilder().startObject();
mltQuery = moreLikeThisQuery()
.docs((Item) new Item().doc(malformedDoc).index("test").type("type1"))
.minTermFreq(0)
.minDocFreq(0)
.percentTermsToMatch(0);
response = client().prepareSearch("test").setTypes("type1")
.setQuery(mltQuery).get();
assertSearchResponse(response);
assertHitCount(response, 0);
logger.info("Checking the document matches otherwise ...");
XContentBuilder normalDoc = jsonBuilder()
.startObject()
.field("text", "Hello World!")
.field("date", "1000-01-01") // should be properly parsed but ignored ...
.endObject();
mltQuery = moreLikeThisQuery()
.docs((Item) new Item().doc(normalDoc).index("test").type("type1"))
.minTermFreq(0)
.minDocFreq(0)
.percentTermsToMatch(1); // strict all terms must match but date is ignored
response = client().prepareSearch("test").setTypes("type1")
.setQuery(mltQuery).get();
assertSearchResponse(response);
assertHitCount(response, 1);
}
}