From a13336da5482d5ab2ebdf7cbb8120b551848e279 Mon Sep 17 00:00:00 2001 From: Alex Ksikes Date: Thu, 10 Sep 2015 19:03:50 +0200 Subject: [PATCH] Refactors MoreLikeThisQueryBuilder and Parser Relates to #10217 This PR is against the query-refactoring branch. Closes #13486 --- .../common/io/stream/StreamInput.java | 7 + .../common/io/stream/StreamOutput.java | 12 + .../org/elasticsearch/index/VersionType.java | 25 +- .../index/query/MoreLikeThisQueryBuilder.java | 698 +++++++++++++----- .../index/query/MoreLikeThisQueryParser.java | 282 ++----- .../index/query/QueryShardContext.java | 11 +- .../query/MoreLikeThisQueryBuilderTests.java | 289 ++++++++ .../morelikethis/ItemSerializationTests.java | 60 -- .../search/morelikethis/MoreLikeThisIT.java | 34 +- .../ContextAndHeaderTransportIT.java | 2 +- .../migrate_query_refactoring.asciidoc | 11 + 11 files changed, 948 insertions(+), 483 deletions(-) create mode 100644 core/src/test/java/org/elasticsearch/index/query/MoreLikeThisQueryBuilderTests.java delete mode 100644 core/src/test/java/org/elasticsearch/search/morelikethis/ItemSerializationTests.java diff --git a/core/src/main/java/org/elasticsearch/common/io/stream/StreamInput.java b/core/src/main/java/org/elasticsearch/common/io/stream/StreamInput.java index c2bbaa3d5e9..8743d11ca18 100644 --- a/core/src/main/java/org/elasticsearch/common/io/stream/StreamInput.java +++ b/core/src/main/java/org/elasticsearch/common/io/stream/StreamInput.java @@ -350,6 +350,13 @@ public abstract class StreamInput extends InputStream { return ret; } + public String[] readOptionalStringArray() throws IOException { + if (readBoolean()) { + return readStringArray(); + } + return null; + } + @Nullable @SuppressWarnings("unchecked") public Map readMap() throws IOException { diff --git a/core/src/main/java/org/elasticsearch/common/io/stream/StreamOutput.java b/core/src/main/java/org/elasticsearch/common/io/stream/StreamOutput.java index a8089198f29..fe4026e2a58 100644 --- 
a/core/src/main/java/org/elasticsearch/common/io/stream/StreamOutput.java +++ b/core/src/main/java/org/elasticsearch/common/io/stream/StreamOutput.java @@ -316,6 +316,18 @@ public abstract class StreamOutput extends OutputStream { } } + /** + * Writes a string array, for nullable string, writes false. + */ + public void writeOptionalStringArray(@Nullable String[] array) throws IOException { + if (array == null) { + writeBoolean(false); + } else { + writeBoolean(true); + writeStringArray(array); + } + } + public void writeMap(@Nullable Map map) throws IOException { writeGenericValue(map); } diff --git a/core/src/main/java/org/elasticsearch/index/VersionType.java b/core/src/main/java/org/elasticsearch/index/VersionType.java index 7800226c90c..a5d8cae2453 100644 --- a/core/src/main/java/org/elasticsearch/index/VersionType.java +++ b/core/src/main/java/org/elasticsearch/index/VersionType.java @@ -18,12 +18,17 @@ */ package org.elasticsearch.index; +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.common.io.stream.Writeable; import org.elasticsearch.common.lucene.uid.Versions; +import java.io.IOException; + /** * */ -public enum VersionType { +public enum VersionType implements Writeable { INTERNAL((byte) 0) { @Override public boolean isVersionConflictForWrites(long currentVersion, long expectedVersion) { @@ -219,6 +224,8 @@ public enum VersionType { private final byte value; + private static final VersionType PROTOTYPE = INTERNAL; + VersionType(byte value) { this.value = value; } @@ -304,4 +311,20 @@ public enum VersionType { } throw new IllegalArgumentException("No version type match [" + value + "]"); } + + @Override + public VersionType readFrom(StreamInput in) throws IOException { + int ordinal = in.readVInt(); + assert (ordinal == 0 || ordinal == 1 || ordinal == 2 || ordinal == 3); + return VersionType.values()[ordinal]; + } + + public static VersionType 
readVersionTypeFrom(StreamInput in) throws IOException { + return PROTOTYPE.readFrom(in); + } + + @Override + public void writeTo(StreamOutput out) throws IOException { + out.writeVInt(ordinal()); + } } diff --git a/core/src/main/java/org/elasticsearch/index/query/MoreLikeThisQueryBuilder.java b/core/src/main/java/org/elasticsearch/index/query/MoreLikeThisQueryBuilder.java index 52b0e363fa9..f2c236a4201 100644 --- a/core/src/main/java/org/elasticsearch/index/query/MoreLikeThisQueryBuilder.java +++ b/core/src/main/java/org/elasticsearch/index/query/MoreLikeThisQueryBuilder.java @@ -19,25 +19,40 @@ package org.elasticsearch.index.query; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.index.Fields; +import org.apache.lucene.queries.TermsQuery; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.util.BytesRef; import org.elasticsearch.ElasticsearchParseException; import org.elasticsearch.ExceptionsHelper; -import org.elasticsearch.action.termvectors.TermVectorsRequest; +import org.elasticsearch.action.termvectors.*; +import org.elasticsearch.client.Client; import org.elasticsearch.common.Nullable; import org.elasticsearch.common.ParseField; import org.elasticsearch.common.ParseFieldMatcher; +import org.elasticsearch.common.Strings; import org.elasticsearch.common.bytes.BytesReference; +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.io.stream.StreamOutput; +import org.elasticsearch.common.io.stream.Writeable; +import org.elasticsearch.common.lucene.search.MoreLikeThisQuery; +import org.elasticsearch.common.lucene.search.XMoreLikeThis; import org.elasticsearch.common.lucene.uid.Versions; -import org.elasticsearch.common.xcontent.ToXContent; -import org.elasticsearch.common.xcontent.XContentBuilder; -import org.elasticsearch.common.xcontent.XContentFactory; -import 
org.elasticsearch.common.xcontent.XContentParser; -import org.elasticsearch.common.xcontent.XContentType; +import org.elasticsearch.common.xcontent.*; import org.elasticsearch.index.VersionType; +import org.elasticsearch.index.analysis.Analysis; +import org.elasticsearch.index.mapper.MappedFieldType; +import org.elasticsearch.index.mapper.internal.UidFieldMapper; +import org.elasticsearch.search.internal.SearchContext; import java.io.IOException; import java.util.*; import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; +import static org.elasticsearch.index.mapper.Uid.createUidAsBytes; /** * A more like this query that finds documents that are "like" the provided set of document(s). @@ -46,10 +61,50 @@ import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; */ public class MoreLikeThisQueryBuilder extends AbstractQueryBuilder { + public static final String NAME = "mlt"; + + public static final int DEFAULT_MAX_QUERY_TERMS = XMoreLikeThis.DEFAULT_MAX_QUERY_TERMS; + public static final int DEFAULT_MIN_TERM_FREQ = XMoreLikeThis.DEFAULT_MIN_TERM_FREQ; + public static final int DEFAULT_MIN_DOC_FREQ = XMoreLikeThis.DEFAULT_MIN_DOC_FREQ; + public static final int DEFAULT_MAX_DOC_FREQ = XMoreLikeThis.DEFAULT_MAX_DOC_FREQ; + public static final int DEFAULT_MIN_WORD_LENGTH = XMoreLikeThis.DEFAULT_MIN_WORD_LENGTH; + public static final int DEFAULT_MAX_WORD_LENGTH = XMoreLikeThis.DEFAULT_MAX_WORD_LENGTH; + public static final String DEFAULT_MINIMUM_SHOULD_MATCH = MoreLikeThisQuery.DEFAULT_MINIMUM_SHOULD_MATCH; + public static final float DEFAULT_BOOST_TERMS = 0; // no boost terms + public static final boolean DEFAULT_INCLUDE = false; + public static final boolean DEFAULT_FAIL_ON_UNSUPPORTED_FIELDS = true; + + // document inputs + private final List fields; + private List likeTexts = new ArrayList<>(); + private List unlikeTexts = new ArrayList<>(); + private List likeItems = new ArrayList<>(); + private List unlikeItems = new 
ArrayList<>(); + + // term selection parameters + private int maxQueryTerms = DEFAULT_MAX_QUERY_TERMS; + private int minTermFreq = DEFAULT_MIN_TERM_FREQ; + private int minDocFreq = DEFAULT_MIN_DOC_FREQ; + private int maxDocFreq = DEFAULT_MAX_DOC_FREQ; + private int minWordLength = DEFAULT_MIN_WORD_LENGTH; + private int maxWordLength = DEFAULT_MAX_WORD_LENGTH; + private String[] stopWords; + private String analyzer; + + // query formation parameters + private String minimumShouldMatch = DEFAULT_MINIMUM_SHOULD_MATCH; + private float boostTerms = DEFAULT_BOOST_TERMS; + private boolean include = DEFAULT_INCLUDE; + + // other parameters + private boolean failOnUnsupportedField = DEFAULT_FAIL_ON_UNSUPPORTED_FIELDS; + + static final MoreLikeThisQueryBuilder PROTOTYPE = new MoreLikeThisQueryBuilder(); + /** * A single item to be used for a {@link MoreLikeThisQueryBuilder}. */ - public static final class Item implements ToXContent { + public static final class Item implements ToXContent, Writeable { public static final Item[] EMPTY_ARRAY = new Item[0]; public interface Field { @@ -74,6 +129,8 @@ public class MoreLikeThisQueryBuilder extends AbstractQueryBuilder fields = new ArrayList<>(); @@ -270,6 +313,10 @@ public class MoreLikeThisQueryBuilder extends AbstractQueryBuilder) in.readGenericValue(); + item.routing = in.readOptionalString(); + item.version = in.readLong(); + item.versionType = VersionType.readVersionTypeFrom(in); + return item; + } + + public static Item readItemFrom(StreamInput in) throws IOException { + return PROTOTYPE.readFrom(in); + } + + @Override + public void writeTo(StreamOutput out) throws IOException { + out.writeOptionalString(index); + out.writeOptionalString(type); + out.writeBoolean(doc != null); + if (doc != null) { + out.writeGenericValue(doc); + } else { + out.writeString(id); + } + out.writeOptionalStringArray(fields); + out.writeGenericValue(perFieldAnalyzer); + out.writeOptionalString(routing); + out.writeLong(version); + 
versionType.writeTo(out); + } + @Override public int hashCode() { return Objects.hash(index, type, id, doc, Arrays.hashCode(fields), perFieldAnalyzer, routing, @@ -349,36 +435,6 @@ public class MoreLikeThisQueryBuilder extends AbstractQueryBuilder likeTexts = new ArrayList<>(); - private List unlikeTexts = new ArrayList<>(); - private List likeItems = new ArrayList<>(); - private List unlikeItems = new ArrayList<>(); - private final String[] fields; - - // term selection parameters - private int maxQueryTerms = -1; - private int minTermFreq = -1; - private int minDocFreq = -1; - private int maxDocFreq = -1; - private int minWordLength = -1; - private int maxWordLength = -1; - private String[] stopWords = null; - private String analyzer; - - // query formation parameters - private String minimumShouldMatch = null; - private float boostTerms = -1; - private Boolean include = null; - - // other parameters - private Boolean failOnUnsupportedField; - - static final MoreLikeThisQueryBuilder PROTOTYPE = new MoreLikeThisQueryBuilder(); - - /** * Constructs a new more like this query which uses the "_all" field. */ @@ -392,17 +448,34 @@ public class MoreLikeThisQueryBuilder extends AbstractQueryBuilder fields) { this.fields = fields; } + public List fields() { + return fields; + } + /** * Sets the text to use in order to find documents that are "like" this. * * @param likeTexts the text to use when generating the 'More Like This' query. */ public MoreLikeThisQueryBuilder like(String... likeTexts) { - this.likeTexts = new ArrayList<>(); - return addLikeText(likeTexts); + this.likeTexts = Collections.unmodifiableList(Arrays.asList(likeTexts)); + return this; + } + + public List likeTexts() { + return likeTexts; } /** @@ -411,56 +484,36 @@ public class MoreLikeThisQueryBuilder extends AbstractQueryBuilder(); - return addLikeItem(likeItems); - } - - /** - * Adds some text to use in order to find documents that are "like" this. 
- */ - public MoreLikeThisQueryBuilder addLikeText(String... likeTexts) { - Collections.addAll(this.likeTexts, likeTexts); + this.likeItems = Collections.unmodifiableList(Arrays.asList(likeItems)); return this; } - /** - * Adds a document to use in order to find documents that are "like" this. - */ - public MoreLikeThisQueryBuilder addLikeItem(Item... likeItems) { - Collections.addAll(this.likeItems, likeItems); - return this; + public List likeItems() { + return likeItems; } /** * Sets the text from which the terms should not be selected from. */ public MoreLikeThisQueryBuilder unlike(String... unlikeTexts) { - this.unlikeTexts = new ArrayList<>(); - return addUnlikeText(unlikeTexts); + this.unlikeTexts = Collections.unmodifiableList(Arrays.asList(unlikeTexts)); + return this; + } + + public List unlikeTexts() { + return unlikeTexts; } /** * Sets the documents from which the terms should not be selected from. */ public MoreLikeThisQueryBuilder unlike(Item... unlikeItems) { - this.unlikeItems = new ArrayList<>(); - return addUnlikeItem(unlikeItems); - } - - /** - * Adds some text to use in order to find documents that are "unlike" this. - */ - public MoreLikeThisQueryBuilder addUnlikeText(String... unlikeTexts) { - Collections.addAll(this.unlikeTexts, unlikeTexts); + this.unlikeItems = Collections.unmodifiableList(Arrays.asList(unlikeItems)); return this; } - /** - * Adds a document to use in order to find documents that are "unlike" this. - */ - public MoreLikeThisQueryBuilder addUnlikeItem(Item... unlikeItems) { - Collections.addAll(this.unlikeItems, unlikeItems); - return this; + public List unlikeItems() { + return unlikeItems; } /** @@ -472,6 +525,10 @@ public class MoreLikeThisQueryBuilder extends AbstractQueryBuilder2. @@ -481,6 +538,10 @@ public class MoreLikeThisQueryBuilder extends AbstractQueryBuilder5. @@ -490,6 +551,10 @@ public class MoreLikeThisQueryBuilder extends AbstractQueryBuilder0. 
@@ -508,6 +577,10 @@ public class MoreLikeThisQueryBuilder extends AbstractQueryBuilder0). @@ -517,6 +590,34 @@ public class MoreLikeThisQueryBuilder extends AbstractQueryBuilder + *

Any word in this set is considered "uninteresting" and ignored. Even if your Analyzer allows stopwords, you + * might want to tell the MoreLikeThis code to ignore them, as for the purposes of document similarity it seems + * reasonable to assume that "a stop word is never interesting". + */ + public MoreLikeThisQueryBuilder stopWords(String... stopWords) { + this.stopWords = stopWords; + return this; + } + + public MoreLikeThisQueryBuilder stopWords(List stopWords) { + if (stopWords == null) { + throw new IllegalArgumentException("requires stopwords to be non-null"); + } + this.stopWords = stopWords.toArray(new String[stopWords.size()]); + return this; + } + + public String[] stopWords() { + return stopWords; + } + /** * The analyzer that will be used to analyze the text. Defaults to the analyzer associated with the fied. */ @@ -525,6 +626,10 @@ public class MoreLikeThisQueryBuilder extends AbstractQueryBuilder30%. @@ -532,18 +637,29 @@ public class MoreLikeThisQueryBuilder extends AbstractQueryBuilder1. + * Sets the boost factor to use when boosting terms. Defaults to 0 (deactivated). */ public MoreLikeThisQueryBuilder boostTerms(float boostTerms) { this.boostTerms = boostTerms; return this; } + public float boostTerms() { + return boostTerms; + } + /** * Whether to include the input documents. 
Defaults to false */ @@ -552,14 +668,22 @@ public class MoreLikeThisQueryBuilder extends AbstractQueryBuilder 0) { + builder.field(MoreLikeThisQueryParser.Field.MAX_QUERY_TERMS.getPreferredName(), maxQueryTerms); + builder.field(MoreLikeThisQueryParser.Field.MIN_TERM_FREQ.getPreferredName(), minTermFreq); + builder.field(MoreLikeThisQueryParser.Field.MIN_DOC_FREQ.getPreferredName(), minDocFreq); + builder.field(MoreLikeThisQueryParser.Field.MAX_DOC_FREQ.getPreferredName(), maxDocFreq); + builder.field(MoreLikeThisQueryParser.Field.MIN_WORD_LENGTH.getPreferredName(), minWordLength); + builder.field(MoreLikeThisQueryParser.Field.MAX_WORD_LENGTH.getPreferredName(), maxWordLength); + if (stopWords != null) { builder.field(MoreLikeThisQueryParser.Field.STOP_WORDS.getPreferredName(), stopWords); } if (analyzer != null) { builder.field(MoreLikeThisQueryParser.Field.ANALYZER.getPreferredName(), analyzer); } - if (minimumShouldMatch != null) { - builder.field(MoreLikeThisQueryParser.Field.MINIMUM_SHOULD_MATCH.getPreferredName(), minimumShouldMatch); - } - if (boostTerms != -1) { - builder.field(MoreLikeThisQueryParser.Field.BOOST_TERMS.getPreferredName(), boostTerms); - } - if (include != null) { - builder.field(MoreLikeThisQueryParser.Field.INCLUDE.getPreferredName(), include); - } - if (failOnUnsupportedField != null) { - builder.field(MoreLikeThisQueryParser.Field.FAIL_ON_UNSUPPORTED_FIELD.getPreferredName(), failOnUnsupportedField); - } + builder.field(MoreLikeThisQueryParser.Field.MINIMUM_SHOULD_MATCH.getPreferredName(), minimumShouldMatch); + builder.field(MoreLikeThisQueryParser.Field.BOOST_TERMS.getPreferredName(), boostTerms); + builder.field(MoreLikeThisQueryParser.Field.INCLUDE.getPreferredName(), include); + builder.field(MoreLikeThisQueryParser.Field.FAIL_ON_UNSUPPORTED_FIELD.getPreferredName(), failOnUnsupportedField); printBoostAndQueryName(builder); builder.endObject(); } @@ -679,4 +746,299 @@ public class MoreLikeThisQueryBuilder extends 
AbstractQueryBuilder(Arrays.asList(stopWords))); + } + + // sets boost terms + if (boostTerms != 0) { + mltQuery.setBoostTerms(true); + mltQuery.setBoostTermsFactor(boostTerms); + } + + // set analyzer + Analyzer analyzerObj = context.analysisService().analyzer(analyzer); + if (analyzerObj == null) { + analyzerObj = context.mapperService().searchAnalyzer(); + } + mltQuery.setAnalyzer(analyzerObj); + + // set like text fields + boolean useDefaultField = (fields == null); + List moreLikeFields = new ArrayList<>(); + if (useDefaultField) { + moreLikeFields = Collections.singletonList(context.defaultField()); + } else { + for (String field : fields) { + MappedFieldType fieldType = context.fieldMapper(field); + moreLikeFields.add(fieldType == null ? field : fieldType.names().indexName()); + } + } + + // possibly remove unsupported fields + removeUnsupportedFields(moreLikeFields, analyzerObj, failOnUnsupportedField); + if (moreLikeFields.isEmpty()) { + return null; + } + mltQuery.setMoreLikeFields(moreLikeFields.toArray(Strings.EMPTY_ARRAY)); + + // handle like texts + if (likeTexts.isEmpty() == false) { + mltQuery.setLikeText(likeTexts); + } + if (unlikeTexts.isEmpty() == false) { + mltQuery.setUnlikeText(unlikeTexts); + } + + // handle items + if (likeItems.isEmpty() == false) { + return handleItems(context, mltQuery, likeItems, unlikeItems, include, moreLikeFields, useDefaultField); + } else { + return mltQuery; + } + } + + private static List removeUnsupportedFields(List moreLikeFields, Analyzer analyzer, boolean failOnUnsupportedField) throws IOException { + for (Iterator it = moreLikeFields.iterator(); it.hasNext(); ) { + final String fieldName = it.next(); + if (!Analysis.generatesCharacterTokenStream(analyzer, fieldName)) { + if (failOnUnsupportedField) { + throw new IllegalArgumentException("more_like_this doesn't support binary/numeric fields: [" + fieldName + "]"); + } else { + it.remove(); + } + } + } + return moreLikeFields; + } + + private Query 
handleItems(QueryShardContext context, MoreLikeThisQuery mltQuery, List likeItems, List unlikeItems, + boolean include, List moreLikeFields, boolean useDefaultField) throws IOException { + // set default index, type and fields if not specified + for (Item item : likeItems) { + setDefaultIndexTypeFields(context, item, moreLikeFields, useDefaultField); + } + for (Item item : unlikeItems) { + setDefaultIndexTypeFields(context, item, moreLikeFields, useDefaultField); + } + + // fetching the items with multi-termvectors API + MultiTermVectorsResponse responses = fetchResponse(context.getClient(), likeItems, unlikeItems, SearchContext.current()); + + // getting the Fields for liked items + mltQuery.setLikeText(getFieldsFor(responses, likeItems)); + + // getting the Fields for unliked items + if (!unlikeItems.isEmpty()) { + org.apache.lucene.index.Fields[] unlikeFields = getFieldsFor(responses, unlikeItems); + if (unlikeFields.length > 0) { + mltQuery.setUnlikeText(unlikeFields); + } + } + + BooleanQuery boolQuery = new BooleanQuery(); + boolQuery.add(mltQuery, BooleanClause.Occur.SHOULD); + + // exclude the items from the search + if (!include) { + handleExclude(boolQuery, likeItems); + } + return boolQuery; + } + + private static void setDefaultIndexTypeFields(QueryShardContext context, Item item, List moreLikeFields, + boolean useDefaultField) { + if (item.index() == null) { + item.index(context.index().name()); + } + if (item.type() == null) { + if (context.queryTypes().size() > 1) { + throw new QueryShardException(context, + "ambiguous type for item with id: " + item.id() + " and index: " + item.index()); + } else { + item.type(context.queryTypes().iterator().next()); + } + } + // default fields if not present but don't override for artificial docs + if ((item.fields() == null || item.fields().length == 0) && item.doc() == null) { + if (useDefaultField) { + item.fields("*"); + } else { + item.fields(moreLikeFields.toArray(new String[moreLikeFields.size()])); + } + } 
+ } + + private MultiTermVectorsResponse fetchResponse(Client client, List likeItems, @Nullable List unlikeItems, + SearchContext searchContext) throws IOException { + MultiTermVectorsRequest request = new MultiTermVectorsRequest(); + for (Item item : likeItems) { + request.add(item.toTermVectorsRequest()); + } + if (unlikeItems != null) { + for (Item item : unlikeItems) { + request.add(item.toTermVectorsRequest()); + } + } + request.copyContextAndHeadersFrom(searchContext); + return client.multiTermVectors(request).actionGet(); + } + + private static Fields[] getFieldsFor(MultiTermVectorsResponse responses, List items) throws IOException { + List likeFields = new ArrayList<>(); + + Set selectedItems = new HashSet<>(); + for (Item request : items) { + selectedItems.add(new Item(request.index(), request.type(), request.id())); + } + + for (MultiTermVectorsItemResponse response : responses) { + if (!hasResponseFromRequest(response, selectedItems)) { + continue; + } + if (response.isFailed()) { + continue; + } + TermVectorsResponse getResponse = response.getResponse(); + if (!getResponse.isExists()) { + continue; + } + likeFields.add(getResponse.getFields()); + } + return likeFields.toArray(Fields.EMPTY_ARRAY); + } + + private static boolean hasResponseFromRequest(MultiTermVectorsItemResponse response, Set selectedItems) { + return selectedItems.contains(new Item(response.getIndex(), response.getType(), response.getId())); + } + + private static void handleExclude(BooleanQuery boolQuery, List likeItems) { + // artificial docs get assigned a random id and should be disregarded + List uids = new ArrayList<>(); + for (Item item : likeItems) { + if (item.doc() != null) { + continue; + } + uids.add(createUidAsBytes(item.type(), item.id())); + } + if (!uids.isEmpty()) { + TermsQuery query = new TermsQuery(UidFieldMapper.NAME, uids.toArray(new BytesRef[0])); + boolQuery.add(query, BooleanClause.Occur.MUST_NOT); + } + } + + @Override + public QueryValidationException 
validate() { + QueryValidationException validationException = null; + if (likeTexts.isEmpty() && likeItems.isEmpty()) { + validationException = addValidationError("requires 'like' to be specified.", validationException); + } + if (fields != null && fields.isEmpty()) { + validationException = addValidationError("requires 'fields' to be specified", validationException); + } + return validationException; + } + + @Override + protected MoreLikeThisQueryBuilder doReadFrom(StreamInput in) throws IOException { + MoreLikeThisQueryBuilder moreLikeThisQueryBuilder = new MoreLikeThisQueryBuilder((List) in.readGenericValue()); + moreLikeThisQueryBuilder.likeTexts = (List) in.readGenericValue(); + moreLikeThisQueryBuilder.unlikeTexts = (List) in.readGenericValue(); + moreLikeThisQueryBuilder.likeItems = readItems(in); + moreLikeThisQueryBuilder.unlikeItems = readItems(in); + moreLikeThisQueryBuilder.maxQueryTerms = in.readVInt(); + moreLikeThisQueryBuilder.minTermFreq = in.readVInt(); + moreLikeThisQueryBuilder.minDocFreq = in.readVInt(); + moreLikeThisQueryBuilder.maxDocFreq = in.readVInt(); + moreLikeThisQueryBuilder.minWordLength = in.readVInt(); + moreLikeThisQueryBuilder.maxWordLength = in.readVInt(); + moreLikeThisQueryBuilder.stopWords = in.readOptionalStringArray(); + moreLikeThisQueryBuilder.analyzer = in.readOptionalString(); + moreLikeThisQueryBuilder.minimumShouldMatch = in.readString(); + moreLikeThisQueryBuilder.boostTerms = (Float) in.readGenericValue(); + moreLikeThisQueryBuilder.include = in.readBoolean(); + moreLikeThisQueryBuilder.failOnUnsupportedField = in.readBoolean(); + return moreLikeThisQueryBuilder; + } + + private static List readItems(StreamInput in) throws IOException { + List items = new ArrayList<>(); + int size = in.readVInt(); + for (int i = 0; i < size; i++) { + items.add(Item.readItemFrom(in)); + } + return items; + } + + @Override + protected void doWriteTo(StreamOutput out) throws IOException { + out.writeGenericValue(fields); + 
out.writeGenericValue(likeTexts); + out.writeGenericValue(unlikeTexts); + writeItems(likeItems, out); + writeItems(unlikeItems, out); + out.writeVInt(maxQueryTerms); + out.writeVInt(minTermFreq); + out.writeVInt(minDocFreq); + out.writeVInt(maxDocFreq); + out.writeVInt(minWordLength); + out.writeVInt(maxWordLength); + out.writeOptionalStringArray(stopWords); + out.writeOptionalString(analyzer); + out.writeString(minimumShouldMatch); + out.writeGenericValue(boostTerms); + out.writeBoolean(include); + out.writeBoolean(failOnUnsupportedField); + } + + private static void writeItems(List items, StreamOutput out) throws IOException { + out.writeVInt(items.size()); + for (Item item : items) { + item.writeTo(out); + } + } + + @Override + protected int doHashCode() { + return Objects.hash(fields, likeTexts, unlikeTexts, likeItems, unlikeItems, maxQueryTerms, minTermFreq, + minDocFreq, maxDocFreq, minWordLength, maxWordLength, Arrays.hashCode(stopWords), analyzer, minimumShouldMatch, + boostTerms, include, failOnUnsupportedField); + } + + @Override + protected boolean doEquals(MoreLikeThisQueryBuilder other) { + return Objects.equals(fields, other.fields) && + Objects.equals(likeTexts, other.likeTexts) && + Objects.equals(unlikeTexts, other.unlikeTexts) && + Objects.equals(likeItems, other.likeItems) && + Objects.equals(unlikeItems, other.unlikeItems) && + Objects.equals(maxQueryTerms, other.maxQueryTerms) && + Objects.equals(minTermFreq, other.minTermFreq) && + Objects.equals(minDocFreq, other.minDocFreq) && + Objects.equals(maxDocFreq, other.maxDocFreq) && + Objects.equals(minWordLength, other.minWordLength) && + Objects.equals(maxWordLength, other.maxWordLength) && + Arrays.equals(stopWords, other.stopWords) && // otherwise we are comparing pointers + Objects.equals(analyzer, other.analyzer) && + Objects.equals(minimumShouldMatch, other.minimumShouldMatch) && + Objects.equals(boostTerms, other.boostTerms) && + Objects.equals(include, other.include) && + 
Objects.equals(failOnUnsupportedField, other.failOnUnsupportedField); + } } diff --git a/core/src/main/java/org/elasticsearch/index/query/MoreLikeThisQueryParser.java b/core/src/main/java/org/elasticsearch/index/query/MoreLikeThisQueryParser.java index 07adeeceedb..97e9084ae0c 100644 --- a/core/src/main/java/org/elasticsearch/index/query/MoreLikeThisQueryParser.java +++ b/core/src/main/java/org/elasticsearch/index/query/MoreLikeThisQueryParser.java @@ -19,43 +19,20 @@ package org.elasticsearch.index.query; -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.index.Fields; -import org.apache.lucene.queries.TermsQuery; -import org.apache.lucene.search.BooleanClause; -import org.apache.lucene.search.BooleanQuery; -import org.apache.lucene.search.Query; -import org.apache.lucene.util.BytesRef; -import org.elasticsearch.action.termvectors.*; -import org.elasticsearch.client.Client; -import org.elasticsearch.common.Nullable; import org.elasticsearch.common.ParseField; -import org.elasticsearch.common.Strings; -import org.elasticsearch.common.lucene.search.MoreLikeThisQuery; import org.elasticsearch.common.xcontent.XContentParser; -import org.elasticsearch.index.analysis.Analysis; -import org.elasticsearch.index.mapper.MappedFieldType; -import org.elasticsearch.index.mapper.internal.UidFieldMapper; import org.elasticsearch.index.query.MoreLikeThisQueryBuilder.Item; -import org.elasticsearch.search.internal.SearchContext; import java.io.IOException; import java.util.ArrayList; -import java.util.Collections; -import java.util.HashSet; -import java.util.Iterator; -import java.util.LinkedList; import java.util.List; -import java.util.Set; - -import static org.elasticsearch.index.mapper.Uid.createUidAsBytes; /** * Parser for the The More Like This Query (MLT Query) which finds documents that are "like" a given set of documents. * * The documents are provided as a set of strings and/or a list of {@link Item}. 
*/ -public class MoreLikeThisQueryParser extends BaseQueryParserTemp { +public class MoreLikeThisQueryParser extends BaseQueryParser { public interface Field { ParseField FIELDS = new ParseField("fields"); @@ -84,23 +61,34 @@ public class MoreLikeThisQueryParser extends BaseQueryParserTemp { } @Override - public Query parse(QueryShardContext context) throws IOException, QueryParsingException { - QueryParseContext parseContext = context.parseContext(); + public MoreLikeThisQueryBuilder fromXContent(QueryParseContext parseContext) throws IOException, QueryParsingException { XContentParser parser = parseContext.parser(); - MoreLikeThisQuery mltQuery = new MoreLikeThisQuery(); - mltQuery.setSimilarity(context.searchSimilarity()); - + // document inputs + List fields = null; List likeTexts = new ArrayList<>(); List unlikeTexts = new ArrayList<>(); List likeItems = new ArrayList<>(); List unlikeItems = new ArrayList<>(); - List moreLikeFields = null; - Analyzer analyzer = null; - boolean include = false; + // term selection parameters + int maxQueryTerms = MoreLikeThisQueryBuilder.DEFAULT_MAX_QUERY_TERMS; + int minTermFreq = MoreLikeThisQueryBuilder.DEFAULT_MIN_TERM_FREQ; + int minDocFreq = MoreLikeThisQueryBuilder.DEFAULT_MIN_DOC_FREQ; + int maxDocFreq = MoreLikeThisQueryBuilder.DEFAULT_MAX_DOC_FREQ; + int minWordLength = MoreLikeThisQueryBuilder.DEFAULT_MIN_WORD_LENGTH; + int maxWordLength = MoreLikeThisQueryBuilder.DEFAULT_MAX_WORD_LENGTH; + List stopWords = null; + String analyzer = null; - boolean failOnUnsupportedField = true; + // query formation parameters + String minimumShouldMatch = MoreLikeThisQueryBuilder.DEFAULT_MINIMUM_SHOULD_MATCH; + float boostTerms = MoreLikeThisQueryBuilder.DEFAULT_BOOST_TERMS; + boolean include = MoreLikeThisQueryBuilder.DEFAULT_INCLUDE; + + // other parameters + boolean failOnUnsupportedField = MoreLikeThisQueryBuilder.DEFAULT_FAIL_ON_UNSUPPORTED_FIELDS; + float boost = AbstractQueryBuilder.DEFAULT_BOOST; String queryName = null; 
XContentParser.Token token; @@ -116,37 +104,29 @@ public class MoreLikeThisQueryParser extends BaseQueryParserTemp { } else if (parseContext.parseFieldMatcher().match(currentFieldName, Field.LIKE_TEXT)) { likeTexts.add(parser.text()); } else if (parseContext.parseFieldMatcher().match(currentFieldName, Field.MAX_QUERY_TERMS)) { - mltQuery.setMaxQueryTerms(parser.intValue()); + maxQueryTerms = parser.intValue(); } else if (parseContext.parseFieldMatcher().match(currentFieldName, Field.MIN_TERM_FREQ)) { - mltQuery.setMinTermFrequency(parser.intValue()); + minTermFreq =parser.intValue(); } else if (parseContext.parseFieldMatcher().match(currentFieldName, Field.MIN_DOC_FREQ)) { - mltQuery.setMinDocFreq(parser.intValue()); + minDocFreq = parser.intValue(); } else if (parseContext.parseFieldMatcher().match(currentFieldName, Field.MAX_DOC_FREQ)) { - mltQuery.setMaxDocFreq(parser.intValue()); + maxDocFreq = parser.intValue(); } else if (parseContext.parseFieldMatcher().match(currentFieldName, Field.MIN_WORD_LENGTH)) { - mltQuery.setMinWordLen(parser.intValue()); + minWordLength = parser.intValue(); } else if (parseContext.parseFieldMatcher().match(currentFieldName, Field.MAX_WORD_LENGTH)) { - mltQuery.setMaxWordLen(parser.intValue()); + maxWordLength = parser.intValue(); } else if (parseContext.parseFieldMatcher().match(currentFieldName, Field.ANALYZER)) { - analyzer = context.analysisService().analyzer(parser.text()); + analyzer = parser.text(); } else if (parseContext.parseFieldMatcher().match(currentFieldName, Field.MINIMUM_SHOULD_MATCH)) { - mltQuery.setMinimumShouldMatch(parser.text()); + minimumShouldMatch = parser.text(); } else if (parseContext.parseFieldMatcher().match(currentFieldName, Field.BOOST_TERMS)) { - float boostFactor = parser.floatValue(); - if (boostFactor != 0) { - mltQuery.setBoostTerms(true); - mltQuery.setBoostTermsFactor(boostFactor); - } - } else if (parseContext.parseFieldMatcher().match(currentFieldName, Field.MINIMUM_SHOULD_MATCH)) { - 
mltQuery.setMinimumShouldMatch(parser.text()); - } else if ("analyzer".equals(currentFieldName)) { - analyzer = context.analysisService().analyzer(parser.text()); + boostTerms = parser.floatValue(); } else if (parseContext.parseFieldMatcher().match(currentFieldName, Field.INCLUDE)) { include = parser.booleanValue(); } else if (parseContext.parseFieldMatcher().match(currentFieldName, Field.FAIL_ON_UNSUPPORTED_FIELD)) { failOnUnsupportedField = parser.booleanValue(); } else if ("boost".equals(currentFieldName)) { - mltQuery.setBoost(parser.floatValue()); + boost = parser.floatValue(); } else if ("_name".equals(currentFieldName)) { queryName = parser.text(); } else { @@ -154,11 +134,9 @@ public class MoreLikeThisQueryParser extends BaseQueryParserTemp { } } else if (token == XContentParser.Token.START_ARRAY) { if (parseContext.parseFieldMatcher().match(currentFieldName, Field.FIELDS)) { - moreLikeFields = new LinkedList<>(); + fields = new ArrayList<>(); while ((token = parser.nextToken()) != XContentParser.Token.END_ARRAY) { - String field = parser.text(); - MappedFieldType fieldType = context.fieldMapper(field); - moreLikeFields.add(fieldType == null ? 
field : fieldType.names().indexName()); + fields.add(parser.text()); } } else if (parseContext.parseFieldMatcher().match(currentFieldName, Field.LIKE)) { while ((token = parser.nextToken()) != XContentParser.Token.END_ARRAY) { @@ -183,11 +161,10 @@ public class MoreLikeThisQueryParser extends BaseQueryParserTemp { likeItems.add(Item.parse(parser, parseContext.parseFieldMatcher(), new Item())); } } else if (parseContext.parseFieldMatcher().match(currentFieldName, Field.STOP_WORDS)) { - Set stopWords = new HashSet<>(); + stopWords = new ArrayList<>(); while ((token = parser.nextToken()) != XContentParser.Token.END_ARRAY) { stopWords.add(parser.text()); } - mltQuery.setStopWords(stopWords); } else { throw new QueryParsingException(parseContext, "[mlt] query does not support [" + currentFieldName + "]"); } @@ -205,48 +182,32 @@ public class MoreLikeThisQueryParser extends BaseQueryParserTemp { if (likeTexts.isEmpty() && likeItems.isEmpty()) { throw new QueryParsingException(parseContext, "more_like_this requires 'like' to be specified"); } - if (moreLikeFields != null && moreLikeFields.isEmpty()) { + if (fields != null && fields.isEmpty()) { throw new QueryParsingException(parseContext, "more_like_this requires 'fields' to be non-empty"); } - // set analyzer - if (analyzer == null) { - analyzer = context.mapperService().searchAnalyzer(); - } - mltQuery.setAnalyzer(analyzer); - - // set like text fields - boolean useDefaultField = (moreLikeFields == null); - if (useDefaultField) { - moreLikeFields = Collections.singletonList(context.defaultField()); - } - - // possibly remove unsupported fields - removeUnsupportedFields(moreLikeFields, analyzer, failOnUnsupportedField); - if (moreLikeFields.isEmpty()) { - return null; - } - mltQuery.setMoreLikeFields(moreLikeFields.toArray(Strings.EMPTY_ARRAY)); - - // support for named query - if (queryName != null) { - context.addNamedQuery(queryName, mltQuery); - } - - // handle like texts - if (!likeTexts.isEmpty()) { - 
mltQuery.setLikeText(likeTexts); - } - if (!unlikeTexts.isEmpty()) { - mltQuery.setUnlikeText(unlikeTexts); - } - - // handle items - if (!likeItems.isEmpty()) { - return handleItems(context, mltQuery, likeItems, unlikeItems, include, moreLikeFields, useDefaultField); - } else { - return mltQuery; + MoreLikeThisQueryBuilder moreLikeThisQueryBuilder = new MoreLikeThisQueryBuilder(fields) + .like(likeTexts.toArray(new String[likeTexts.size()])) + .unlike(unlikeTexts.toArray(new String[unlikeTexts.size()])) + .like(likeItems.toArray(new Item[likeItems.size()])) + .unlike(unlikeItems.toArray(new Item[unlikeItems.size()])) + .maxQueryTerms(maxQueryTerms) + .minTermFreq(minTermFreq) + .minDocFreq(minDocFreq) + .maxDocFreq(maxDocFreq) + .minWordLength(minWordLength) + .maxWordLength(maxWordLength) + .analyzer(analyzer) + .minimumShouldMatch(minimumShouldMatch) + .boostTerms(boostTerms) + .include(include) + .failOnUnsupportedField(failOnUnsupportedField) + .boost(boost) + .queryName(queryName); + if (stopWords != null) { + moreLikeThisQueryBuilder.stopWords(stopWords); } + return moreLikeThisQueryBuilder; } private static void parseLikeField(QueryParseContext parseContext, List texts, List items) throws IOException { @@ -260,139 +221,8 @@ public class MoreLikeThisQueryParser extends BaseQueryParserTemp { } } - private static List removeUnsupportedFields(List moreLikeFields, Analyzer analyzer, boolean failOnUnsupportedField) throws IOException { - for (Iterator it = moreLikeFields.iterator(); it.hasNext(); ) { - final String fieldName = it.next(); - if (!Analysis.generatesCharacterTokenStream(analyzer, fieldName)) { - if (failOnUnsupportedField) { - throw new IllegalArgumentException("more_like_this doesn't support binary/numeric fields: [" + fieldName + "]"); - } else { - it.remove(); - } - } - } - return moreLikeFields; - } - - private Query handleItems(QueryShardContext context, MoreLikeThisQuery mltQuery, List likeItems, List unlikeItems, - boolean include, List 
moreLikeFields, boolean useDefaultField) throws IOException { - - QueryParseContext parseContext = context.parseContext(); - // set default index, type and fields if not specified - for (Item item : likeItems) { - setDefaultIndexTypeFields(parseContext, item, moreLikeFields, useDefaultField); - } - for (Item item : unlikeItems) { - setDefaultIndexTypeFields(parseContext, item, moreLikeFields, useDefaultField); - } - - // fetching the items with multi-termvectors API - MultiTermVectorsResponse responses = fetchResponse(context.getClient(), likeItems, unlikeItems, SearchContext.current()); - - // getting the Fields for liked items - mltQuery.setLikeText(getFieldsFor(responses, likeItems)); - - // getting the Fields for unliked items - if (!unlikeItems.isEmpty()) { - org.apache.lucene.index.Fields[] unlikeFields = getFieldsFor(responses, unlikeItems); - if (unlikeFields.length > 0) { - mltQuery.setUnlikeText(unlikeFields); - } - } - - BooleanQuery boolQuery = new BooleanQuery(); - boolQuery.add(mltQuery, BooleanClause.Occur.SHOULD); - - // exclude the items from the search - if (!include) { - handleExclude(boolQuery, likeItems); - } - return boolQuery; - } - - private static void setDefaultIndexTypeFields(QueryParseContext parseContext, Item item, List moreLikeFields, - boolean useDefaultField) { - if (item.index() == null) { - item.index(parseContext.index().name()); - } - if (item.type() == null) { - if (parseContext.shardContext().queryTypes().size() > 1) { - throw new QueryParsingException(parseContext, - "ambiguous type for item with id: " + item.id() + " and index: " + item.index()); - } else { - item.type(parseContext.shardContext().queryTypes().iterator().next()); - } - } - // default fields if not present but don't override for artificial docs - if ((item.fields() == null || item.fields().length == 0) && item.doc() == null) { - if (useDefaultField) { - item.fields("*"); - } else { - item.fields(moreLikeFields.toArray(new String[moreLikeFields.size()])); - } - 
} - } - - private static void handleExclude(BooleanQuery boolQuery, List likeItems) { - // artificial docs get assigned a random id and should be disregarded - List uids = new ArrayList<>(); - for (Item item : likeItems) { - if (item.doc() != null) { - continue; - } - uids.add(createUidAsBytes(item.type(), item.id())); - } - if (!uids.isEmpty()) { - TermsQuery query = new TermsQuery(UidFieldMapper.NAME, uids.toArray(new BytesRef[0])); - boolQuery.add(query, BooleanClause.Occur.MUST_NOT); - } - } - @Override public MoreLikeThisQueryBuilder getBuilderPrototype() { return MoreLikeThisQueryBuilder.PROTOTYPE; } - - private MultiTermVectorsResponse fetchResponse(Client client, List likeItems, @Nullable List unlikeItems, - SearchContext searchContext) throws IOException { - MultiTermVectorsRequest request = new MultiTermVectorsRequest(); - for (Item item : likeItems) { - request.add(item.toTermVectorsRequest()); - } - if (unlikeItems != null) { - for (Item item : unlikeItems) { - request.add(item.toTermVectorsRequest()); - } - } - request.copyContextAndHeadersFrom(searchContext); - return client.multiTermVectors(request).actionGet(); - } - - private static Fields[] getFieldsFor(MultiTermVectorsResponse responses, List items) throws IOException { - List likeFields = new ArrayList<>(); - - Set selectedItems = new HashSet<>(); - for (Item request : items) { - selectedItems.add(new Item(request.index(), request.type(), request.id())); - } - - for (MultiTermVectorsItemResponse response : responses) { - if (!hasResponseFromRequest(response, selectedItems)) { - continue; - } - if (response.isFailed()) { - continue; - } - TermVectorsResponse getResponse = response.getResponse(); - if (!getResponse.isExists()) { - continue; - } - likeFields.add(getResponse.getFields()); - } - return likeFields.toArray(Fields.EMPTY_ARRAY); - } - - private static boolean hasResponseFromRequest(MultiTermVectorsItemResponse response, Set selectedItems) { - return selectedItems.contains(new 
Item(response.getIndex(), response.getType(), response.getId())); - } } diff --git a/core/src/main/java/org/elasticsearch/index/query/QueryShardContext.java b/core/src/main/java/org/elasticsearch/index/query/QueryShardContext.java index 8ce547ed45d..6dd2e499b0a 100644 --- a/core/src/main/java/org/elasticsearch/index/query/QueryShardContext.java +++ b/core/src/main/java/org/elasticsearch/index/query/QueryShardContext.java @@ -20,7 +20,6 @@ package org.elasticsearch.index.query; import com.google.common.collect.ImmutableMap; - import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.queryparser.classic.MapperQueryParser; import org.apache.lucene.queryparser.classic.QueryParserSettings; @@ -33,21 +32,15 @@ import org.elasticsearch.client.Client; import org.elasticsearch.cluster.metadata.IndexMetaData; import org.elasticsearch.common.ParseFieldMatcher; import org.elasticsearch.common.bytes.BytesReference; -import org.elasticsearch.common.geo.builders.ShapeBuilder; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.xcontent.XContentParser; import org.elasticsearch.index.Index; import org.elasticsearch.index.analysis.AnalysisService; import org.elasticsearch.index.fielddata.IndexFieldData; -import org.elasticsearch.index.mapper.ContentPath; -import org.elasticsearch.index.mapper.MappedFieldType; -import org.elasticsearch.index.mapper.Mapper; -import org.elasticsearch.index.mapper.MapperBuilders; -import org.elasticsearch.index.mapper.MapperService; +import org.elasticsearch.index.mapper.*; import org.elasticsearch.index.mapper.core.StringFieldMapper; import org.elasticsearch.index.mapper.object.ObjectMapper; import org.elasticsearch.index.query.support.NestedScope; -import org.elasticsearch.indices.cache.query.terms.TermsLookup; import org.elasticsearch.script.ExecutableScript; import org.elasticsearch.script.ScriptContext; import org.elasticsearch.script.ScriptService; @@ -56,11 +49,9 @@ import 
org.elasticsearch.search.fetch.innerhits.InnerHitsContext; import org.elasticsearch.search.internal.SearchContext; import org.elasticsearch.search.lookup.SearchLookup; -import java.io.IOException; import java.util.Arrays; import java.util.Collection; import java.util.HashMap; -import java.util.List; import java.util.Map; /** diff --git a/core/src/test/java/org/elasticsearch/index/query/MoreLikeThisQueryBuilderTests.java b/core/src/test/java/org/elasticsearch/index/query/MoreLikeThisQueryBuilderTests.java new file mode 100644 index 00000000000..6eb53ace9f9 --- /dev/null +++ b/core/src/test/java/org/elasticsearch/index/query/MoreLikeThisQueryBuilderTests.java @@ -0,0 +1,289 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.index.query; + +import org.apache.lucene.analysis.core.WhitespaceAnalyzer; +import org.apache.lucene.index.Fields; +import org.apache.lucene.index.MultiFields; +import org.apache.lucene.index.memory.MemoryIndex; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.Query; +import org.elasticsearch.ElasticsearchException; +import org.elasticsearch.action.termvectors.*; +import org.elasticsearch.common.ParseFieldMatcher; +import org.elasticsearch.common.Strings; +import org.elasticsearch.common.io.stream.BytesStreamOutput; +import org.elasticsearch.common.io.stream.StreamInput; +import org.elasticsearch.common.lucene.search.MoreLikeThisQuery; +import org.elasticsearch.common.xcontent.ToXContent; +import org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.common.xcontent.XContentFactory; +import org.elasticsearch.common.xcontent.XContentParser; +import org.elasticsearch.index.VersionType; +import org.elasticsearch.index.query.MoreLikeThisQueryBuilder.Item; +import org.hamcrest.Matchers; +import org.junit.Before; +import org.junit.Test; + +import java.io.IOException; +import java.util.Arrays; +import java.util.EnumSet; +import java.util.HashMap; +import java.util.Map; +import java.util.stream.Stream; + +import static org.hamcrest.Matchers.is; + +public class MoreLikeThisQueryBuilderTests extends AbstractQueryTestCase { + + private static String[] randomFields; + private static Item[] randomLikeItems; + private static Item[] randomUnlikeItems; + + @Before + public void setup() { + // MLT only supports string fields, unsupported fields are tested below + randomFields = randomStringFields(); + // we also preset the item requests + randomLikeItems = new Item[randomIntBetween(1, 3)]; + for (int i = 0; i < randomLikeItems.length; i++) { + randomLikeItems[i] = generateRandomItem(); + } + // and for the unlike items too + randomUnlikeItems = new Item[randomIntBetween(1, 3)]; + for (int i = 0; 
i < randomUnlikeItems.length; i++) { + randomUnlikeItems[i] = generateRandomItem(); + } + } + + private static String[] randomStringFields() { + String[] mappedStringFields = new String[]{STRING_FIELD_NAME, STRING_FIELD_NAME_2}; + String[] unmappedStringFields = generateRandomStringArray(2, 5, false, false); + return Stream.concat(Arrays.stream(mappedStringFields), Arrays.stream(unmappedStringFields)).toArray(String[]::new); + } + + private Item generateRandomItem() { + String index = randomBoolean() ? getIndex().getName() : null; + String type = getRandomType(); // set to one type to avoid ambiguous types + // indexed item or artificial document + Item item; + if (randomBoolean()) { + item = new Item(index, type, randomAsciiOfLength(10)); + } else { + item = new Item(index, type, randomArtificialDoc()); + } + // if no field is specified MLT uses all mapped fields for this item + if (randomBoolean()) { + item.fields(randomFrom(randomFields)); + } + // per field analyzer + if (randomBoolean()) { + item.perFieldAnalyzer(randomPerFieldAnalyzer()); + } + if (randomBoolean()) { + item.routing(randomAsciiOfLength(10)); + } + if (randomBoolean()) { + item.version(randomInt(5)); + } + if (randomBoolean()) { + item.versionType(randomFrom(VersionType.values())); + } + return item; + } + + private XContentBuilder randomArtificialDoc() { + XContentBuilder doc; + try { + doc = XContentFactory.jsonBuilder().startObject(); + for (String field : randomFields) { + doc.field(field, randomAsciiOfLength(10)); + } + } catch (IOException e) { + throw new ElasticsearchException("Unable to generate random artificial doc!"); + } + return doc; + } + + private Map randomPerFieldAnalyzer() { + Map perFieldAnalyzer = new HashMap<>(); + for (String field : randomFields) { + perFieldAnalyzer.put(field, randomAnalyzer()); + } + return perFieldAnalyzer; + } + + @Override + protected MoreLikeThisQueryBuilder doCreateTestQueryBuilder() { + MoreLikeThisQueryBuilder queryBuilder; + if 
(randomBoolean()) { // for the default field + queryBuilder = new MoreLikeThisQueryBuilder(); + } else { + queryBuilder = new MoreLikeThisQueryBuilder(randomFields); + } + // like field is required + if (randomBoolean()) { + queryBuilder.like(generateRandomStringArray(5, 5, false, false)); + } else { + queryBuilder.like(randomLikeItems); + } + if (randomBoolean()) { + queryBuilder.unlike(generateRandomStringArray(5, 5, false, false)); + } + if (randomBoolean()) { + queryBuilder.unlike(randomUnlikeItems); + } + if (randomBoolean()) { + queryBuilder.maxQueryTerms(randomInt(25)); + } + if (randomBoolean()) { + queryBuilder.minTermFreq(randomInt(5)); + } + if (randomBoolean()) { + queryBuilder.minDocFreq(randomInt(5)); + } + if (randomBoolean()) { + queryBuilder.maxDocFreq(randomInt(100)); + } + if (randomBoolean()) { + queryBuilder.minWordLength(randomInt(5)); + } + if (randomBoolean()) { + queryBuilder.maxWordLength(randomInt(25)); + } + if (randomBoolean()) { + queryBuilder.stopWords(generateRandomStringArray(5, 5, false, false)); + } + if (randomBoolean()) { + queryBuilder.analyzer(randomAnalyzer()); // fix the analyzer? 
+ } + if (randomBoolean()) { + queryBuilder.minimumShouldMatch(randomMinimumShouldMatch()); + } + if (randomBoolean()) { + queryBuilder.boostTerms(randomFloat() * 10); + } + if (randomBoolean()) { + queryBuilder.include(randomBoolean()); + } + if (randomBoolean()) { + queryBuilder.failOnUnsupportedField(randomBoolean()); + } + return queryBuilder; + } + + @Override + protected MultiTermVectorsResponse executeMultiTermVectors(MultiTermVectorsRequest mtvRequest) { + try { + MultiTermVectorsItemResponse[] responses = new MultiTermVectorsItemResponse[mtvRequest.size()]; + int i = 0; + for (TermVectorsRequest request : mtvRequest) { + TermVectorsResponse response = new TermVectorsResponse(request.index(), request.type(), request.id()); + response.setExists(true); + Fields generatedFields; + if (request.doc() != null) { + generatedFields = generateFields(randomFields, request.doc().toUtf8()); + } else { + generatedFields = generateFields(request.selectedFields().toArray(new String[0]), request.id()); + } + EnumSet flags = EnumSet.of(TermVectorsRequest.Flag.Positions, TermVectorsRequest.Flag.Offsets); + response.setFields(generatedFields, request.selectedFields(), flags, generatedFields); + responses[i++] = new MultiTermVectorsItemResponse(response, null); + } + return new MultiTermVectorsResponse(responses); + } catch (IOException ex) { + throw new ElasticsearchException("boom", ex); + } + } + + /** + * Here we could go overboard and use a pre-generated indexed random document for a given Item, + * but for now we'd prefer to simply return the id as the content of the document and that for + * every field. 
+ */ + private static Fields generateFields(String[] fieldNames, String text) throws IOException { + MemoryIndex index = new MemoryIndex(); + for (String fieldName : fieldNames) { + index.addField(fieldName, text, new WhitespaceAnalyzer()); + } + return MultiFields.getFields(index.createSearcher().getIndexReader()); + } + + @Override + protected void doAssertLuceneQuery(MoreLikeThisQueryBuilder queryBuilder, Query query, QueryShardContext context) throws IOException { + if (!queryBuilder.likeItems().isEmpty()) { + assertThat(query, Matchers.instanceOf(BooleanQuery.class)); + } else { + // we rely on integration tests for a deeper check here + assertThat(query, Matchers.instanceOf(MoreLikeThisQuery.class)); + } + } + + @Test + public void testValidate() { + MoreLikeThisQueryBuilder queryBuilder = new MoreLikeThisQueryBuilder(Strings.EMPTY_ARRAY); + assertThat(queryBuilder.validate().validationErrors().size(), is(2)); + + queryBuilder = new MoreLikeThisQueryBuilder(Strings.EMPTY_ARRAY).like("some text"); + assertThat(queryBuilder.validate().validationErrors().size(), is(1)); + + queryBuilder = new MoreLikeThisQueryBuilder("field").like(Strings.EMPTY_ARRAY); + assertThat(queryBuilder.validate().validationErrors().size(), is(1)); + + queryBuilder = new MoreLikeThisQueryBuilder("field").like(Item.EMPTY_ARRAY); + assertThat(queryBuilder.validate().validationErrors().size(), is(1)); + + queryBuilder = new MoreLikeThisQueryBuilder("field").like("some text"); + assertNull(queryBuilder.validate()); + } + + @Test + public void testUnsupportedFields() throws IOException { + assumeTrue("test runs only when at least a type is registered", getCurrentTypes().length > 0); + String unsupportedField = randomFrom(INT_FIELD_NAME, DOUBLE_FIELD_NAME, DATE_FIELD_NAME); + MoreLikeThisQueryBuilder queryBuilder = new MoreLikeThisQueryBuilder(unsupportedField) + .like("some text") + .failOnUnsupportedField(true); + try { + queryBuilder.toQuery(createShardContext()); + fail("should have failed 
with IllegalArgumentException for field: " + unsupportedField); + } catch (IllegalArgumentException e) { + assertThat(e.getMessage(), Matchers.containsString("more_like_this doesn't support binary/numeric fields")); + } + } + + @Test + public void testItemSerialization() throws IOException { + Item expectedItem = generateRandomItem(); + BytesStreamOutput output = new BytesStreamOutput(); + expectedItem.writeTo(output); + Item newItem = Item.readItemFrom(StreamInput.wrap(output.bytes())); + assertEquals(expectedItem, newItem); + } + + @Test + public void testItemFromXContent() throws IOException { + Item expectedItem = generateRandomItem(); + String json = expectedItem.toXContent(XContentFactory.jsonBuilder(), ToXContent.EMPTY_PARAMS).string(); + XContentParser parser = XContentFactory.xContent(json).createParser(json); + Item newItem = Item.parse(parser, ParseFieldMatcher.STRICT, new Item()); + assertEquals(expectedItem, newItem); + } +} diff --git a/core/src/test/java/org/elasticsearch/search/morelikethis/ItemSerializationTests.java b/core/src/test/java/org/elasticsearch/search/morelikethis/ItemSerializationTests.java deleted file mode 100644 index 5f5f42aa7b2..00000000000 --- a/core/src/test/java/org/elasticsearch/search/morelikethis/ItemSerializationTests.java +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Licensed to Elasticsearch under one or more contributor - * license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright - * ownership. Elasticsearch licenses this file to you under - * the Apache License, Version 2.0 (the "License"); you may - * not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.elasticsearch.search.morelikethis; - -import com.carrotsearch.randomizedtesting.generators.RandomPicks; -import org.elasticsearch.common.ParseFieldMatcher; -import org.elasticsearch.common.xcontent.ToXContent; -import org.elasticsearch.common.xcontent.XContentFactory; -import org.elasticsearch.common.xcontent.XContentParser; -import org.elasticsearch.index.VersionType; -import org.elasticsearch.index.query.MoreLikeThisQueryBuilder.Item; -import org.elasticsearch.test.ESTestCase; -import org.junit.Test; - -import java.util.Random; - -public class ItemSerializationTests extends ESTestCase { - - private Item generateRandomItem(int arraySize, int stringSize) { - String index = randomAsciiOfLength(stringSize); - String type = randomAsciiOfLength(stringSize); - String id = String.valueOf(Math.abs(randomInt())); - String[] fields = generateRandomStringArray(arraySize, stringSize, true); - String routing = randomBoolean() ? 
randomAsciiOfLength(stringSize) : null; - long version = Math.abs(randomLong()); - VersionType versionType = RandomPicks.randomFrom(new Random(), VersionType.values()); - return new Item(index, type, id).fields(fields).routing(routing).version(version).versionType(versionType); - } - - @Test - public void testItemSerialization() throws Exception { - int numOfTrials = 100; - int maxArraySize = 7; - int maxStringSize = 8; - for (int i = 0; i < numOfTrials; i++) { - Item item1 = generateRandomItem(maxArraySize, maxStringSize); - String json = item1.toXContent(XContentFactory.jsonBuilder(), ToXContent.EMPTY_PARAMS).string(); - XContentParser parser = XContentFactory.xContent(json).createParser(json); - Item item2 = Item.parse(parser, ParseFieldMatcher.STRICT, new Item()); - assertEquals(item1, item2); - } - } -} diff --git a/core/src/test/java/org/elasticsearch/search/morelikethis/MoreLikeThisIT.java b/core/src/test/java/org/elasticsearch/search/morelikethis/MoreLikeThisIT.java index bbc992f75ae..dc98d0d9312 100644 --- a/core/src/test/java/org/elasticsearch/search/morelikethis/MoreLikeThisIT.java +++ b/core/src/test/java/org/elasticsearch/search/morelikethis/MoreLikeThisIT.java @@ -72,7 +72,7 @@ public class MoreLikeThisIT extends ESIntegTestCase { logger.info("Running moreLikeThis"); SearchResponse response = client().prepareSearch().setQuery( - new MoreLikeThisQueryBuilder().addLikeItem(new Item("test", "type1", "1")).minTermFreq(1).minDocFreq(1)).get(); + new MoreLikeThisQueryBuilder().like(new Item("test", "type1", "1")).minTermFreq(1).minDocFreq(1)).get(); assertHitCount(response, 1l); } @@ -92,7 +92,7 @@ public class MoreLikeThisIT extends ESIntegTestCase { logger.info("Running moreLikeThis"); SearchResponse response = client().prepareSearch().setQuery( - new MoreLikeThisQueryBuilder().addLikeItem(new Item("test", "type1", "1")).minTermFreq(1).minDocFreq(1)).get(); + new MoreLikeThisQueryBuilder().like(new Item("test", "type1", 
"1")).minTermFreq(1).minDocFreq(1)).get(); assertHitCount(response, 0l); } @@ -119,24 +119,24 @@ public class MoreLikeThisIT extends ESIntegTestCase { logger.info("Running moreLikeThis on index"); SearchResponse response = client().prepareSearch().setQuery( - new MoreLikeThisQueryBuilder().addLikeItem(new Item("test", "type1", "1")).minTermFreq(1).minDocFreq(1)).get(); + new MoreLikeThisQueryBuilder().like(new Item("test", "type1", "1")).minTermFreq(1).minDocFreq(1)).get(); assertHitCount(response, 2l); logger.info("Running moreLikeThis on beta shard"); response = client().prepareSearch("beta").setQuery( - new MoreLikeThisQueryBuilder().addLikeItem(new Item("test", "type1", "1")).minTermFreq(1).minDocFreq(1)).get(); + new MoreLikeThisQueryBuilder().like(new Item("test", "type1", "1")).minTermFreq(1).minDocFreq(1)).get(); assertHitCount(response, 1l); assertThat(response.getHits().getAt(0).id(), equalTo("3")); logger.info("Running moreLikeThis on release shard"); response = client().prepareSearch("release").setQuery( - new MoreLikeThisQueryBuilder().addLikeItem(new Item("test", "type1", "1")).minTermFreq(1).minDocFreq(1)).get(); + new MoreLikeThisQueryBuilder().like(new Item("test", "type1", "1")).minTermFreq(1).minDocFreq(1)).get(); assertHitCount(response, 1l); assertThat(response.getHits().getAt(0).id(), equalTo("2")); logger.info("Running moreLikeThis on alias with node client"); response = internalCluster().clientNodeClient().prepareSearch("beta").setQuery( - new MoreLikeThisQueryBuilder().addLikeItem(new Item("test", "type1", "1")).minTermFreq(1).minDocFreq(1)).get(); + new MoreLikeThisQueryBuilder().like(new Item("test", "type1", "1")).minTermFreq(1).minDocFreq(1)).get(); assertHitCount(response, 1l); assertThat(response.getHits().getAt(0).id(), equalTo("3")); } @@ -156,11 +156,11 @@ public class MoreLikeThisIT extends ESIntegTestCase { assertThat(ensureGreen(), equalTo(ClusterHealthStatus.GREEN)); SearchResponse response = client().prepareSearch().setQuery( 
- new MoreLikeThisQueryBuilder().addLikeItem(new Item("foo", "bar", "1"))).get(); + new MoreLikeThisQueryBuilder().like(new Item("foo", "bar", "1"))).get(); assertNoFailures(response); assertThat(response, notNullValue()); response = client().prepareSearch().setQuery( - new MoreLikeThisQueryBuilder().addLikeItem(new Item("foo", "bar", "1"))).get(); + new MoreLikeThisQueryBuilder().like(new Item("foo", "bar", "1"))).get(); assertNoFailures(response); assertThat(response, notNullValue()); } @@ -182,7 +182,7 @@ public class MoreLikeThisIT extends ESIntegTestCase { client().admin().indices().prepareRefresh("foo").execute().actionGet(); SearchResponse response = client().prepareSearch().setQuery( - new MoreLikeThisQueryBuilder().addLikeItem(new Item("foo", "bar", "1").routing("2"))).get(); + new MoreLikeThisQueryBuilder().like(new Item("foo", "bar", "1").routing("2"))).get(); assertNoFailures(response); assertThat(response, notNullValue()); } @@ -205,7 +205,7 @@ public class MoreLikeThisIT extends ESIntegTestCase { .execute().actionGet(); client().admin().indices().prepareRefresh("foo").execute().actionGet(); SearchResponse response = client().prepareSearch().setQuery( - new MoreLikeThisQueryBuilder().addLikeItem(new Item("foo", "bar", "1").routing("4000"))).get(); + new MoreLikeThisQueryBuilder().like(new Item("foo", "bar", "1").routing("4000"))).get(); assertNoFailures(response); assertThat(response, notNullValue()); } @@ -233,12 +233,12 @@ public class MoreLikeThisIT extends ESIntegTestCase { // Implicit list of fields -> ignore numeric fields SearchResponse searchResponse = client().prepareSearch().setQuery( - new MoreLikeThisQueryBuilder().addLikeItem(new Item("test", "type", "1")).minTermFreq(1).minDocFreq(1)).get(); + new MoreLikeThisQueryBuilder().like(new Item("test", "type", "1")).minTermFreq(1).minDocFreq(1)).get(); assertHitCount(searchResponse, 1l); // Explicit list of fields including numeric fields -> fail assertThrows(client().prepareSearch().setQuery( - 
new MoreLikeThisQueryBuilder("string_value", "int_value").addLikeItem(new Item("test", "type", "1")).minTermFreq(1).minDocFreq(1)), SearchPhaseExecutionException.class); + new MoreLikeThisQueryBuilder("string_value", "int_value").like(new Item("test", "type", "1")).minTermFreq(1).minDocFreq(1)), SearchPhaseExecutionException.class); // mlt query with no field -> OK searchResponse = client().prepareSearch().setQuery(moreLikeThisQuery().likeText("index").minTermFreq(1).minDocFreq(1)).execute().actionGet(); @@ -295,16 +295,16 @@ public class MoreLikeThisIT extends ESIntegTestCase { logger.info("Running More Like This with include true"); SearchResponse response = client().prepareSearch().setQuery( - new MoreLikeThisQueryBuilder().addLikeItem(new Item("test", "type1", "1")).minTermFreq(1).minDocFreq(1).include(true).minimumShouldMatch("0%")).get(); + new MoreLikeThisQueryBuilder().like(new Item("test", "type1", "1")).minTermFreq(1).minDocFreq(1).include(true).minimumShouldMatch("0%")).get(); assertOrderedSearchHits(response, "1", "2"); response = client().prepareSearch().setQuery( - new MoreLikeThisQueryBuilder().addLikeItem(new Item("test", "type1", "2")).minTermFreq(1).minDocFreq(1).include(true).minimumShouldMatch("0%")).get(); + new MoreLikeThisQueryBuilder().like(new Item("test", "type1", "2")).minTermFreq(1).minDocFreq(1).include(true).minimumShouldMatch("0%")).get(); assertOrderedSearchHits(response, "2", "1"); logger.info("Running More Like This with include false"); response = client().prepareSearch().setQuery( - new MoreLikeThisQueryBuilder().addLikeItem(new Item("test", "type1", "1")).minTermFreq(1).minDocFreq(1).minimumShouldMatch("0%")).get(); + new MoreLikeThisQueryBuilder().like(new Item("test", "type1", "1")).minTermFreq(1).minDocFreq(1).minimumShouldMatch("0%")).get(); assertSearchHits(response, "2"); } @@ -355,7 +355,7 @@ public class MoreLikeThisIT extends ESIntegTestCase { logger.info("Running MoreLikeThis"); MoreLikeThisQueryBuilder queryBuilder = 
QueryBuilders.moreLikeThisQuery("text").include(true).minTermFreq(1).minDocFreq(1) - .addLikeItem(new Item("test", "type0", "0")); + .like(new Item("test", "type0", "0")); String[] types = new String[numOfTypes]; for (int i = 0; i < numOfTypes; i++) { @@ -573,7 +573,7 @@ public class MoreLikeThisIT extends ESIntegTestCase { docs.add(new Item("test", "type1", i+"")); mltQuery = moreLikeThisQuery() .like(new Item("test", "type1", doc)) - .ignoreLike(docs.toArray(Item.EMPTY_ARRAY)) + .unlike(docs.toArray(Item.EMPTY_ARRAY)) .minTermFreq(0) .minDocFreq(0) .maxQueryTerms(100) diff --git a/core/src/test/java/org/elasticsearch/transport/ContextAndHeaderTransportIT.java b/core/src/test/java/org/elasticsearch/transport/ContextAndHeaderTransportIT.java index c3871abc986..b5604ec217c 100644 --- a/core/src/test/java/org/elasticsearch/transport/ContextAndHeaderTransportIT.java +++ b/core/src/test/java/org/elasticsearch/transport/ContextAndHeaderTransportIT.java @@ -230,7 +230,7 @@ public class ContextAndHeaderTransportIT extends ESIntegTestCase { transportClient().admin().indices().prepareRefresh(lookupIndex, queryIndex).get(); MoreLikeThisQueryBuilder moreLikeThisQueryBuilder = QueryBuilders.moreLikeThisQuery("name") - .addLikeItem(new Item(lookupIndex, "type", "1")) + .like(new Item(lookupIndex, "type", "1")) .minTermFreq(1) .minDocFreq(1); diff --git a/docs/reference/migration/migrate_query_refactoring.asciidoc b/docs/reference/migration/migrate_query_refactoring.asciidoc index 560de94cd90..6fe987c3c94 100644 --- a/docs/reference/migration/migrate_query_refactoring.asciidoc +++ b/docs/reference/migration/migrate_query_refactoring.asciidoc @@ -88,3 +88,14 @@ makes the type / path parameter mandatory. Moving MatchQueryBuilder.Type and MatchQueryBuilder.ZeroTermsQuery enum to MatchQuery.Type. Also reusing new Operator enum. + +==== MoreLikeThisQueryBuilder + +Removed `MoreLikeThisQueryBuilder.Item#id(String id)`, `Item#doc(BytesReference doc)`, +`Item#doc(XContentBuilder doc)`. 
Use provided constructors instead. + +Removed `MoreLikeThisQueryBuilder#addLike` and `addUnlike` in favor of using the `like` +and `unlike` methods. + +The deprecated `docs(Item... docs)`, `ignoreLike(Item... docs)`, +`ignoreLike(String... likeText)`, `addItem(Item... likeItems)` have been removed.