More Like This Query: Switch to using the multi-termvectors API
The term vector API can now generate term vectors on the fly when the terms are not stored in the index. This commit exploits that new functionality for the MLT query: the terms of each item are now retrieved directly using the multi-termvectors API, instead of being generated from texts fetched with the multi-get API. Closes #7014
parent c4bed91262
commit f1a6b4e9fe
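In short, the parser now fetches the term vectors for all items in a single multi-termvectors round trip and hands the returned Fields directly to the MLT query, instead of re-analyzing text fetched with multi-get. The following condensed sketch of the new flow is assembled from the MoreLikeThisFetchService and MoreLikeThisQueryParser hunks below; it is illustrative only and assumes `items`, `client`, and `mltQuery` are in scope:

    // build one multi-termvectors request out of the parsed MLT items
    MultiTermVectorsRequest request = new MultiTermVectorsRequest();
    for (MultiGetRequest.Item item : items) {
        request.add(item); // converted via the new TermVectorRequest(MultiGetRequest.Item) constructor
    }
    // term vectors are generated on the fly when they are not stored in the index
    MultiTermVectorsResponse responses = client.multiTermVectors(request).actionGet();
    List<Fields> likeFields = new ArrayList<>();
    for (MultiTermVectorsItemResponse response : responses) {
        if (response.isFailed() || !response.getResponse().isExists()) {
            continue; // skip failed or missing items, as the fetch service does
        }
        likeFields.add(response.getResponse().getFields());
    }
    // the MLT query consumes the terms directly; no re-analysis of fetched text
    mltQuery.setLikeText(likeFields.toArray(Fields.EMPTY_ARRAY));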
@@ -119,7 +119,7 @@ boost factor.

|`boost` |Sets the boost value of the query. Defaults to `1.0`.

-|`analyzer` |The analyzer that will be used to analyze the text.
-Defaults to the analyzer associated with the field.
+|`analyzer` |The analyzer that will be used to analyze the `like text`.
+Defaults to the analyzer associated with the first field in `fields`.

|=======================================================================
@ -22,6 +22,7 @@ package org.elasticsearch.action.termvector;
|
||||||
import org.elasticsearch.ElasticsearchIllegalArgumentException;
|
import org.elasticsearch.ElasticsearchIllegalArgumentException;
|
||||||
import org.elasticsearch.ElasticsearchParseException;
|
import org.elasticsearch.ElasticsearchParseException;
|
||||||
import org.elasticsearch.action.*;
|
import org.elasticsearch.action.*;
|
||||||
|
import org.elasticsearch.action.get.MultiGetRequest;
|
||||||
import org.elasticsearch.common.Nullable;
|
import org.elasticsearch.common.Nullable;
|
||||||
import org.elasticsearch.common.bytes.BytesReference;
|
import org.elasticsearch.common.bytes.BytesReference;
|
||||||
import org.elasticsearch.common.io.stream.StreamInput;
|
import org.elasticsearch.common.io.stream.StreamInput;
|
||||||
|
@@ -52,6 +53,11 @@ public class MultiTermVectorsRequest extends ActionRequest<MultiTermVectorsRequest>
         return this;
     }

+    public MultiTermVectorsRequest add(MultiGetRequest.Item item) {
+        requests.add(new TermVectorRequest(item));
+        return this;
+    }
+
     @Override
     public ActionRequestValidationException validate() {
         ActionRequestValidationException validationException = null;
@@ -170,26 +170,54 @@ public final class TermVectorFields extends Fields {
         if (!fieldMap.containsKey(field)) {
             return null; // we don't have it.
         }
-        long offset = fieldMap.lget();
-        final BytesStreamInput perFieldTermVectorInput = new BytesStreamInput(this.termVectors);
-        perFieldTermVectorInput.reset();
-        perFieldTermVectorInput.skip(offset);
+        long readOffset = fieldMap.lget();
+        return new TermVector(termVectors, readOffset);
+    }
+
+    @Override
+    public int size() {
+        return fieldMap.size();
+    }
+
+    private final class TermVector extends Terms {
+
+        private final BytesStreamInput perFieldTermVectorInput;
+        private final long readOffset;
+
+        private long numTerms;
+        private boolean hasPositions;
+        private boolean hasOffsets;
+        private boolean hasPayloads;
+        private long sumTotalTermFreq;
+        private long sumDocFreq;
+        private int docCount;
+
+        public TermVector(BytesReference termVectors, long readOffset) throws IOException {
+            this.perFieldTermVectorInput = new BytesStreamInput(termVectors);
+            this.readOffset = readOffset;
+            reset();
+        }
+
+        private void reset() throws IOException {
+            this.perFieldTermVectorInput.reset();
+            this.perFieldTermVectorInput.skip(readOffset);

             // read how many terms....
-            final long numTerms = perFieldTermVectorInput.readVLong();
+            this.numTerms = perFieldTermVectorInput.readVLong();
             // ...if positions etc. were stored....
-            final boolean hasPositions = perFieldTermVectorInput.readBoolean();
-            final boolean hasOffsets = perFieldTermVectorInput.readBoolean();
-            final boolean hasPayloads = perFieldTermVectorInput.readBoolean();
+            this.hasPositions = perFieldTermVectorInput.readBoolean();
+            this.hasOffsets = perFieldTermVectorInput.readBoolean();
+            this.hasPayloads = perFieldTermVectorInput.readBoolean();
             // read the field statistics
-            final long sumTotalTermFreq = hasFieldStatistic ? readPotentiallyNegativeVLong(perFieldTermVectorInput) : -1;
-            final long sumDocFreq = hasFieldStatistic ? readPotentiallyNegativeVLong(perFieldTermVectorInput) : -1;
-            final int docCount = hasFieldStatistic ? readPotentiallyNegativeVInt(perFieldTermVectorInput) : -1;
+            this.sumTotalTermFreq = hasFieldStatistic ? readPotentiallyNegativeVLong(perFieldTermVectorInput) : -1;
+            this.sumDocFreq = hasFieldStatistic ? readPotentiallyNegativeVLong(perFieldTermVectorInput) : -1;
+            this.docCount = hasFieldStatistic ? readPotentiallyNegativeVInt(perFieldTermVectorInput) : -1;
+        }

-        return new Terms() {
-
         @Override
         public TermsEnum iterator(TermsEnum reuse) throws IOException {
+            // reset before asking for an iterator
+            reset();
             // convert bytes ref for the terms to actual data
             return new TermsEnum() {
                 int currentTerm = 0;
@@ -214,7 +242,6 @@ public final class TermVectorFields extends Fields {
                     if (hasTermStatistic) {
                         docFreq = readPotentiallyNegativeVInt(perFieldTermVectorInput);
                         totalTermFrequency = readPotentiallyNegativeVLong(perFieldTermVectorInput);
-
                     }

                     freq = readPotentiallyNegativeVInt(perFieldTermVectorInput);
@@ -231,7 +258,6 @@ public final class TermVectorFields extends Fields {
                     } else {
                         return null;
                     }
-
                 }

                 private void writeInfos(final BytesStreamInput input) throws IOException {
@@ -258,7 +284,6 @@ public final class TermVectorFields extends Fields {
                 }

                 private void growBuffers() {
-
                     if (hasPositions) {
                         positions = grow(positions, freq);
                     }
@@ -317,8 +342,8 @@ public final class TermVectorFields extends Fields {

                 @Override
                 public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) throws IOException {
-                    final TermVectorsDocsAndPosEnum retVal = (reuse instanceof TermVectorsDocsAndPosEnum ? (TermVectorsDocsAndPosEnum) reuse
-                            : new TermVectorsDocsAndPosEnum());
+                    final TermVectorDocsAndPosEnum retVal = (reuse instanceof TermVectorDocsAndPosEnum ? (TermVectorDocsAndPosEnum) reuse
+                            : new TermVectorDocsAndPosEnum());
                     return retVal.reset(hasPositions ? positions : null, hasOffsets ? startOffsets : null, hasOffsets ? endOffsets
                             : null, hasPayloads ? payloads : null, freq);
                 }
@@ -370,16 +395,9 @@ public final class TermVectorFields extends Fields {
                 public boolean hasPayloads() {
                     return hasPayloads;
                 }
-
-            };
         }

-        @Override
-        public int size() {
-            return fieldMap.size();
-        }
-
-        private final class TermVectorsDocsAndPosEnum extends DocsAndPositionsEnum {
+        private final class TermVectorDocsAndPosEnum extends DocsAndPositionsEnum {
             private boolean hasPositions;
             private boolean hasOffsets;
             private boolean hasPayloads;
@@ -24,6 +24,7 @@ import org.elasticsearch.ElasticsearchParseException;
 import org.elasticsearch.Version;
 import org.elasticsearch.action.ActionRequestValidationException;
 import org.elasticsearch.action.ValidateActions;
+import org.elasticsearch.action.get.MultiGetRequest;
 import org.elasticsearch.action.support.single.shard.SingleShardOperationRequest;
 import org.elasticsearch.common.io.stream.StreamInput;
 import org.elasticsearch.common.io.stream.StreamOutput;
@@ -86,6 +87,14 @@ public class TermVectorRequest extends SingleShardOperationRequest<TermVectorRequest>
         }
     }

+    public TermVectorRequest(MultiGetRequest.Item item) {
+        super(item.index());
+        this.id = item.id();
+        this.type = item.type();
+        this.selectedFields(item.fields());
+        this.routing(item.routing());
+    }
+
     public EnumSet<Flag> getFlags() {
         return flagsEnum;
     }
@@ -20,6 +20,7 @@
 package org.elasticsearch.common.lucene.search;

 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.index.Fields;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.BooleanQuery;
@@ -46,6 +47,7 @@ public class MoreLikeThisQuery extends Query {
     private TFIDFSimilarity similarity;

     private String[] likeText;
+    private Fields[] likeFields;
     private String[] moreLikeFields;
     private Analyzer analyzer;
     private float percentTermsToMatch = DEFAULT_PERCENT_TERMS_TO_MATCH;
@@ -148,12 +150,18 @@ public class MoreLikeThisQuery extends Query {
         mlt.setBoost(boostTerms);
         mlt.setBoostFactor(boostTermsFactor);

+        BooleanQuery bq = new BooleanQuery();
+        if (this.likeFields != null) {
+            bq.add((BooleanQuery) mlt.like(this.likeFields), BooleanClause.Occur.SHOULD);
+        }
+        if (this.likeText != null) {
             Reader[] readers = new Reader[likeText.length];
             for (int i = 0; i < readers.length; i++) {
                 readers[i] = new FastStringReader(likeText[i]);
             }
             //LUCENE 4 UPGRADE this mapps the 3.6 behavior (only use the first field)
-        BooleanQuery bq = (BooleanQuery) mlt.like(moreLikeFields[0], readers);
+            bq.add((BooleanQuery) mlt.like(moreLikeFields[0], readers), BooleanClause.Occur.SHOULD);
+        }

         BooleanClause[] clauses = bq.getClauses();
         bq.setMinimumNumberShouldMatch((int) (clauses.length * percentTermsToMatch));
@@ -183,6 +191,14 @@ public class MoreLikeThisQuery extends Query {
         this.likeText = likeText;
     }

+    public Fields[] getLikeFields() {
+        return likeFields;
+    }
+
+    public void setLikeText(Fields... likeFields) {
+        this.likeFields = likeFields;
+    }
+
     public void setLikeText(List<String> likeText) {
         setLikeText(likeText.toArray(Strings.EMPTY_ARRAY));
     }
@@ -53,11 +53,7 @@ import org.elasticsearch.common.io.FastStringReader;

 import java.io.IOException;
 import java.io.Reader;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.Set;
+import java.util.*;


 /**
@@ -618,6 +614,49 @@ public final class XMoreLikeThis {
         return createQuery(createQueue(words));
     }

+    /**
+     * Return a query that will return docs like the passed Terms.
+     *
+     * @return a query that will return docs like the passed Terms.
+     */
+    public Query like(Terms... likeTerms) throws IOException {
+        Map<String, Int> termFreqMap = new HashMap<>();
+        for (Terms vector : likeTerms) {
+            addTermFrequencies(termFreqMap, vector);
+        }
+        return createQuery(createQueue(termFreqMap));
+    }
+
+    /**
+     * Return a query that will return docs like the passed Fields.
+     *
+     * @return a query that will return docs like the passed Fields.
+     */
+    public Query like(Fields... likeFields) throws IOException {
+        // get all field names
+        Set<String> fieldNames = new HashSet<>();
+        for (Fields fields : likeFields) {
+            for (String fieldName : fields) {
+                fieldNames.add(fieldName);
+            }
+        }
+        // to create one query per field name only
+        BooleanQuery bq = new BooleanQuery();
+        for (String fieldName : fieldNames) {
+            Map<String, Int> termFreqMap = new HashMap<>();
+            this.setFieldNames(new String[]{fieldName});
+            for (Fields fields : likeFields) {
+                Terms vector = fields.terms(fieldName);
+                if (vector != null) {
+                    addTermFrequencies(termFreqMap, vector);
+                }
+            }
+            Query query = createQuery(createQueue(termFreqMap));
+            bq.add(query, BooleanClause.Occur.SHOULD);
+        }
+        return bq;
+    }
+
     /**
      * Create the More like query from a PriorityQueue
      */
@@ -773,7 +812,9 @@ public final class XMoreLikeThis {
                 if (isNoiseWord(term)) {
                     continue;
                 }
-                final int freq = (int) termsEnum.totalTermFreq();
+
+                DocsEnum docs = termsEnum.docs(null, null);
+                final int freq = docs.freq();

                 // increment frequency
                 Int cnt = termFreqMap.get(term);
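A note on the hunk above: term frequencies are now read from the term vector itself rather than from the corpus-wide statistic. Because XMoreLikeThis consumes a single-document term vector here, the DocsEnum spans exactly one pseudo-document, so docs.freq() is the term's frequency within the liked document, which is what the scoring queue expects. A minimal sketch of the distinction, assuming a `termsEnum` positioned on a term of a one-document term vector (note that idiomatic Lucene positions the enum with nextDoc() before reading freq()):

    // corpus-wide statistic: total occurrences of the term across all documents
    long totalFreq = termsEnum.totalTermFreq();

    // per-document frequency: occurrences within the single document behind this term vector
    DocsEnum docs = termsEnum.docs(null, null);
    docs.nextDoc();         // position on the only document in the enum
    int freq = docs.freq(); // frequency of the term in that document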
@@ -20,7 +20,6 @@
 package org.elasticsearch.index.query;

 import com.google.common.collect.Lists;
-import com.google.common.collect.ObjectArrays;
 import com.google.common.collect.Sets;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.queries.TermsFilter;
@@ -40,10 +39,12 @@ import org.elasticsearch.index.analysis.Analysis;
 import org.elasticsearch.index.mapper.Uid;
 import org.elasticsearch.index.mapper.internal.UidFieldMapper;
 import org.elasticsearch.index.search.morelikethis.MoreLikeThisFetchService;
-import org.elasticsearch.index.search.morelikethis.MoreLikeThisFetchService.LikeText;

 import java.io.IOException;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Set;

 /**
  *
@@ -201,54 +202,25 @@ public class MoreLikeThisQueryParser implements QueryParser {
                 }
                 if (item.fields() == null && item.fetchSourceContext() == null) {
                     item.fields(moreLikeFields.toArray(new String[moreLikeFields.size()]));
-                } else {
-                    // TODO how about fields content fetched from _source?
-                    removeUnsupportedFields(item, analyzer, failOnUnsupportedField);
                 }
             }
-            // fetching the items with multi-get
-            List<LikeText> likeTexts = fetchService.fetch(items);
-            // collapse the text onto the same field name
-            Collection<LikeText> likeTextsCollapsed = collapseTextOnField(likeTexts);
-            // right now we are just building a boolean query
+            // fetching the items with multi-termvectors API
             BooleanQuery boolQuery = new BooleanQuery();
-            for (LikeText likeText : likeTextsCollapsed) {
-                addMoreLikeThis(boolQuery, mltQuery, likeText);
-            }
+            org.apache.lucene.index.Fields[] likeFields = fetchService.fetch(items);
+            mltQuery.setLikeText(likeFields);
+            boolQuery.add(mltQuery, BooleanClause.Occur.SHOULD);
             // exclude the items from the search
             if (!include) {
                 TermsFilter filter = new TermsFilter(UidFieldMapper.NAME, Uid.createUids(items));
                 ConstantScoreQuery query = new ConstantScoreQuery(filter);
                 boolQuery.add(query, BooleanClause.Occur.MUST_NOT);
             }
-            // add the possible mlt query with like_text
-            if (mltQuery.getLikeText() != null) {
-                boolQuery.add(mltQuery, BooleanClause.Occur.SHOULD);
-            }
             return boolQuery;
         }

         return mltQuery;
     }

-    private void addMoreLikeThis(BooleanQuery boolQuery, MoreLikeThisQuery mltQuery, LikeText likeText) {
-        MoreLikeThisQuery mlt = new MoreLikeThisQuery();
-        mlt.setMoreLikeFields(new String[] {likeText.field});
-        mlt.setLikeText(likeText.text);
-        mlt.setAnalyzer(mltQuery.getAnalyzer());
-        mlt.setPercentTermsToMatch(mltQuery.getPercentTermsToMatch());
-        mlt.setBoostTerms(mltQuery.isBoostTerms());
-        mlt.setBoostTermsFactor(mltQuery.getBoostTermsFactor());
-        mlt.setMinDocFreq(mltQuery.getMinDocFreq());
-        mlt.setMaxDocFreq(mltQuery.getMaxDocFreq());
-        mlt.setMinWordLen(mltQuery.getMinWordLen());
-        mlt.setMaxWordLen(mltQuery.getMaxWordLen());
-        mlt.setMinTermFrequency(mltQuery.getMinTermFrequency());
-        mlt.setMaxQueryTerms(mltQuery.getMaxQueryTerms());
-        mlt.setStopWords(mltQuery.getStopWords());
-        boolQuery.add(mlt, BooleanClause.Occur.SHOULD);
-    }
-
     private List<String> removeUnsupportedFields(List<String> moreLikeFields, Analyzer analyzer, boolean failOnUnsupportedField) throws IOException {
         for (Iterator<String> it = moreLikeFields.iterator(); it.hasNext(); ) {
             final String fieldName = it.next();
@@ -262,22 +234,4 @@ public class MoreLikeThisQueryParser implements QueryParser {
         }
         return moreLikeFields;
     }
-
-    public static Collection<LikeText> collapseTextOnField (Collection<LikeText> likeTexts) {
-        Map<String, LikeText> collapsedTexts = new HashMap<>();
-        for (LikeText likeText : likeTexts) {
-            String field = likeText.field;
-            String[] text = likeText.text;
-            if (collapsedTexts.containsKey(field)) {
-                text = ObjectArrays.concat(collapsedTexts.get(field).text, text, String.class);
-            }
-            collapsedTexts.put(field, new LikeText(field, text));
-        }
-        return collapsedTexts.values();
-    }
-
-    private void removeUnsupportedFields(MultiGetRequest.Item item, Analyzer analyzer, boolean failOnUnsupportedField) throws IOException {
-        item.fields((String[]) removeUnsupportedFields(Arrays.asList(item.fields()), analyzer, failOnUnsupportedField).toArray());
-    }
-
 }
@@ -19,15 +19,16 @@

 package org.elasticsearch.index.search.morelikethis;

-import org.elasticsearch.action.get.GetResponse;
-import org.elasticsearch.action.get.MultiGetItemResponse;
+import org.apache.lucene.index.Fields;
 import org.elasticsearch.action.get.MultiGetRequest;
-import org.elasticsearch.action.get.MultiGetResponse;
+import org.elasticsearch.action.termvector.MultiTermVectorsItemResponse;
+import org.elasticsearch.action.termvector.MultiTermVectorsRequest;
+import org.elasticsearch.action.termvector.MultiTermVectorsResponse;
+import org.elasticsearch.action.termvector.TermVectorResponse;
 import org.elasticsearch.client.Client;
 import org.elasticsearch.common.component.AbstractComponent;
 import org.elasticsearch.common.inject.Inject;
 import org.elasticsearch.common.settings.Settings;
-import org.elasticsearch.index.get.GetField;

 import java.io.IOException;
 import java.util.ArrayList;
@@ -38,21 +39,6 @@ import java.util.List;
  */
 public class MoreLikeThisFetchService extends AbstractComponent {

-    public static final class LikeText {
-        public final String field;
-        public final String[] text;
-
-        public LikeText(String field, String text) {
-            this.field = field;
-            this.text = new String[]{text};
-        }
-
-        public LikeText(String field, String... text) {
-            this.field = field;
-            this.text = text;
-        }
-    }
-
     private final Client client;

     @Inject
@@ -61,30 +47,23 @@ public class MoreLikeThisFetchService extends AbstractComponent {
         this.client = client;
     }

-    public List<LikeText> fetch(List<MultiGetRequest.Item> items) throws IOException {
-        MultiGetRequest request = new MultiGetRequest();
+    public Fields[] fetch(List<MultiGetRequest.Item> items) throws IOException {
+        MultiTermVectorsRequest request = new MultiTermVectorsRequest();
         for (MultiGetRequest.Item item : items) {
             request.add(item);
         }
-        MultiGetResponse responses = client.multiGet(request).actionGet();
-        List<LikeText> likeTexts = new ArrayList<>();
-        for (MultiGetItemResponse response : responses) {
+        List<Fields> likeFields = new ArrayList<>();
+        MultiTermVectorsResponse responses = client.multiTermVectors(request).actionGet();
+        for (MultiTermVectorsItemResponse response : responses) {
             if (response.isFailed()) {
                 continue;
             }
-            GetResponse getResponse = response.getResponse();
+            TermVectorResponse getResponse = response.getResponse();
             if (!getResponse.isExists()) {
                 continue;
             }
-            for (GetField getField : getResponse.getFields().values()) {
-                String[] text = new String[getField.getValues().size()];
-                for (int i = 0; i < text.length; i++) {
-                    text[i] = getField.getValues().get(i).toString();
-                }
-                likeTexts.add(new LikeText(getField.getName(), text));
-            }
+            likeFields.add(getResponse.getFields());
         }
-        return likeTexts;
+        return likeFields.toArray(Fields.EMPTY_ARRAY);
     }
 }
@@ -21,18 +21,23 @@ package org.elasticsearch.index.query;


 import com.google.common.collect.Lists;
-import org.apache.lucene.index.Term;
+import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
+import org.apache.lucene.index.*;
+import org.apache.lucene.index.memory.MemoryIndex;
 import org.apache.lucene.queries.*;
 import org.apache.lucene.sandbox.queries.FuzzyLikeThisQuery;
 import org.apache.lucene.search.*;
 import org.apache.lucene.search.spans.*;
 import org.apache.lucene.spatial.prefix.IntersectsPrefixTreeFilter;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.NumericUtils;
+import org.apache.lucene.util.UnicodeUtil;
 import org.elasticsearch.ElasticsearchException;
 import org.elasticsearch.action.get.MultiGetRequest;
 import org.elasticsearch.common.bytes.BytesArray;
 import org.elasticsearch.common.compress.CompressedString;
+import org.elasticsearch.common.lucene.Lucene;
 import org.elasticsearch.common.lucene.search.*;
 import org.elasticsearch.common.lucene.search.function.BoostScoreFunction;
 import org.elasticsearch.common.lucene.search.function.FunctionScoreQuery;
@@ -48,7 +53,6 @@ import org.elasticsearch.index.search.geo.GeoDistanceFilter;
 import org.elasticsearch.index.search.geo.GeoPolygonFilter;
 import org.elasticsearch.index.search.geo.InMemoryGeoBoundingBoxFilter;
 import org.elasticsearch.index.search.morelikethis.MoreLikeThisFetchService;
-import org.elasticsearch.index.search.morelikethis.MoreLikeThisFetchService.LikeText;
 import org.elasticsearch.index.service.IndexService;
 import org.elasticsearch.test.ElasticsearchSingleNodeTest;
 import org.hamcrest.Matchers;
@@ -1591,37 +1595,24 @@ public class SimpleIndexQueryParserTests extends ElasticsearchSingleNodeTest {
         MoreLikeThisQueryParser parser = (MoreLikeThisQueryParser) queryParser.queryParser("more_like_this");
         parser.setFetchService(new MockMoreLikeThisFetchService());

-        List<LikeText> likeTexts = new ArrayList<>();
-        likeTexts.add(new LikeText("name.first", new String[]{
-                "test person 1 name.first", "test person 2 name.first", "test person 3 name.first", "test person 4 name.first"}));
-        likeTexts.add(new LikeText("name.last", new String[]{
-                "test person 1 name.last", "test person 2 name.last", "test person 3 name.last", "test person 4 name.last"}));
-
         IndexQueryParserService queryParser = queryParser();
         String query = copyToStringFromClasspath("/org/elasticsearch/index/query/mlt-items.json");
         Query parsedQuery = queryParser.parse(query).query();
         assertThat(parsedQuery, instanceOf(BooleanQuery.class));
         BooleanQuery booleanQuery = (BooleanQuery) parsedQuery;
-        assertThat(booleanQuery.getClauses().length, is(likeTexts.size() + 1));
+        assertThat(booleanQuery.getClauses().length, is(1));

-        // check each clause is for each item
-        BooleanClause[] boolClauses = booleanQuery.getClauses();
-        for (int i = 0; i < likeTexts.size(); i++) {
-            BooleanClause booleanClause = booleanQuery.getClauses()[i];
-            assertThat(booleanClause.getOccur(), is(BooleanClause.Occur.SHOULD));
-            assertThat(booleanClause.getQuery(), instanceOf(MoreLikeThisQuery.class));
-            MoreLikeThisQuery mltQuery = (MoreLikeThisQuery) booleanClause.getQuery();
-            assertThat(mltQuery.getLikeTexts(), is(likeTexts.get(i).text));
-            assertThat(mltQuery.getMoreLikeFields()[0], equalTo(likeTexts.get(i).field));
+        BooleanClause itemClause = booleanQuery.getClauses()[0];
+        assertThat(itemClause.getOccur(), is(BooleanClause.Occur.SHOULD));
+        assertThat(itemClause.getQuery(), instanceOf(MoreLikeThisQuery.class));
+        MoreLikeThisQuery mltQuery = (MoreLikeThisQuery) itemClause.getQuery();
+
+        // check each Fields is for each item
+        for (int id = 1; id <= 4; id++) {
+            Fields fields = mltQuery.getLikeFields()[id - 1];
+            assertThat(termsToString(fields.terms("name.first")), is(String.valueOf(id)));
+            assertThat(termsToString(fields.terms("name.last")), is(String.valueOf(id)));
         }
-
-        // check last clause is for 'like_text'
-        BooleanClause boolClause = boolClauses[boolClauses.length - 1];
-        assertThat(boolClause.getOccur(), is(BooleanClause.Occur.SHOULD));
-        assertThat(boolClause.getQuery(), instanceOf(MoreLikeThisQuery.class));
-        MoreLikeThisQuery mltQuery = (MoreLikeThisQuery) boolClause.getQuery();
-        assertArrayEquals("Not the same more like this 'fields'", new String[] {"name.first", "name.last"}, mltQuery.getMoreLikeFields());
-        assertThat(mltQuery.getLikeText(), equalTo("Apache Lucene"));
     }

     private static class MockMoreLikeThisFetchService extends MoreLikeThisFetchService {
@@ -1630,17 +1621,34 @@ public class SimpleIndexQueryParserTests extends ElasticsearchSingleNodeTest {
             super(null, ImmutableSettings.Builder.EMPTY_SETTINGS);
         }

-        public List<LikeText> fetch(List<MultiGetRequest.Item> items) throws IOException {
-            List<LikeText> likeTexts = new ArrayList<>();
-            for (MultiGetRequest.Item item : items) {
-                for (String field : item.fields()) {
-                    LikeText likeText = new LikeText(
-                            field, item.index() + " " + item.type() + " " + item.id() + " " + field);
-                    likeTexts.add(likeText);
-                }
-            }
-            return likeTexts;
-        }
-    }
+        public Fields[] fetch(List<MultiGetRequest.Item> items) throws IOException {
+            List<Fields> likeTexts = new ArrayList<>();
+            for (MultiGetRequest.Item item : items) {
+                likeTexts.add(generateFields(item.fields(), item.id()));
+            }
+            return likeTexts.toArray(Fields.EMPTY_ARRAY);
+        }
+    }
+
+    private static Fields generateFields(String[] fieldNames, String text) throws IOException {
+        MemoryIndex index = new MemoryIndex();
+        for (String fieldName : fieldNames) {
+            index.addField(fieldName, text, new WhitespaceAnalyzer(Lucene.VERSION));
+        }
+        return MultiFields.getFields(index.createSearcher().getIndexReader());
+    }
+
+    private static String termsToString(Terms terms) throws IOException {
+        String strings = "";
+        TermsEnum termsEnum = terms.iterator(null);
+        CharsRef spare = new CharsRef();
+        BytesRef text;
+        while((text = termsEnum.next()) != null) {
+            UnicodeUtil.UTF8toUTF16(text, spare);
+            String term = spare.toString();
+            strings += term;
+        }
+        return strings;
+    }

     @Test