More Like This Query: Switch to using the multi-termvectors API

The term vector API can now generate term vectors on the fly if the terms are
not already stored in the index. This commit exploits this new functionality
for the MLT query: the terms are now retrieved directly with the
multi-termvectors API, instead of being generated from the texts fetched with
the multi-get API.

Closes #7014
Alex Ksikes 2014-07-23 16:58:47 +02:00
parent c4bed91262
commit f1a6b4e9fe
9 changed files with 371 additions and 340 deletions
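
The gist of the change, condensed from the MoreLikeThisQueryParser and MoreLikeThisFetchService diffs below into one illustrative method (the method name and wiring are this sketch's own, not part of the commit):

    // a minimal sketch, assuming an injected fetchService and already-parsed items
    private Query mltOnItems(MoreLikeThisQuery mltQuery, MoreLikeThisFetchService fetchService,
                             List<MultiGetRequest.Item> items, boolean include) throws IOException {
        BooleanQuery boolQuery = new BooleanQuery();
        // one multi-termvectors round-trip replaces the old multi-get plus re-analysis
        mltQuery.setLikeText(fetchService.fetch(items));
        boolQuery.add(mltQuery, BooleanClause.Occur.SHOULD);
        if (!include) {
            // keep the input documents out of the result set
            TermsFilter filter = new TermsFilter(UidFieldMapper.NAME, Uid.createUids(items));
            boolQuery.add(new ConstantScoreQuery(filter), BooleanClause.Occur.MUST_NOT);
        }
        return boolQuery;
    }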

View File

@@ -119,7 +119,7 @@ boost factor.
|`boost` |Sets the boost value of the query. Defaults to `1.0`.
|`analyzer` |The analyzer that will be used to analyze the text.
Defaults to the analyzer associated with the field.
|`analyzer` |The analyzer that will be used to analyze the `like_text`.
Defaults to the analyzer associated with the first field in `fields`.
|=======================================================================
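
For illustration, the same fallback seen from the internal query class (a minimal sketch; the WhitespaceAnalyzer stands in for whatever analyzer `name.first` would resolve to):

    MoreLikeThisQuery mltQuery = new MoreLikeThisQuery();
    mltQuery.setMoreLikeFields(new String[]{"name.first", "name.last"});
    mltQuery.setLikeText("lucene");
    // without this call, the parser falls back to the analyzer of "name.first"
    mltQuery.setAnalyzer(new WhitespaceAnalyzer(Lucene.VERSION));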

View File

@@ -22,6 +22,7 @@ package org.elasticsearch.action.termvector;
import org.elasticsearch.ElasticsearchIllegalArgumentException;
import org.elasticsearch.ElasticsearchParseException;
import org.elasticsearch.action.*;
import org.elasticsearch.action.get.MultiGetRequest;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.io.stream.StreamInput;
@@ -52,6 +53,11 @@ public class MultiTermVectorsRequest extends ActionRequest<MultiTermVectorsReque
return this;
}
public MultiTermVectorsRequest add(MultiGetRequest.Item item) {
requests.add(new TermVectorRequest(item));
return this;
}
@Override
public ActionRequestValidationException validate() {
ActionRequestValidationException validationException = null;
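
The new overload lets callers turn multi-get items straight into term vector requests; a minimal usage sketch (index, type and ids are made up):

    MultiTermVectorsRequest request = new MultiTermVectorsRequest();
    // each multi-get item becomes one TermVectorRequest
    request.add(new MultiGetRequest.Item("test", "person", "1").fields("name.first"));
    request.add(new MultiGetRequest.Item("test", "person", "2").fields("name.first"));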

View File

@@ -170,208 +170,8 @@ public final class TermVectorFields extends Fields {
if (!fieldMap.containsKey(field)) {
return null; // we don't have it.
}
long offset = fieldMap.lget();
final BytesStreamInput perFieldTermVectorInput = new BytesStreamInput(this.termVectors);
perFieldTermVectorInput.reset();
perFieldTermVectorInput.skip(offset);
// read how many terms....
final long numTerms = perFieldTermVectorInput.readVLong();
// ...if positions etc. were stored....
final boolean hasPositions = perFieldTermVectorInput.readBoolean();
final boolean hasOffsets = perFieldTermVectorInput.readBoolean();
final boolean hasPayloads = perFieldTermVectorInput.readBoolean();
// read the field statistics
final long sumTotalTermFreq = hasFieldStatistic ? readPotentiallyNegativeVLong(perFieldTermVectorInput) : -1;
final long sumDocFreq = hasFieldStatistic ? readPotentiallyNegativeVLong(perFieldTermVectorInput) : -1;
final int docCount = hasFieldStatistic ? readPotentiallyNegativeVInt(perFieldTermVectorInput) : -1;
return new Terms() {
@Override
public TermsEnum iterator(TermsEnum reuse) throws IOException {
// convert bytes ref for the terms to actual data
return new TermsEnum() {
int currentTerm = 0;
int freq = 0;
int docFreq = -1;
long totalTermFrequency = -1;
int[] positions = new int[1];
int[] startOffsets = new int[1];
int[] endOffsets = new int[1];
BytesRef[] payloads = new BytesRef[1];
final BytesRef spare = new BytesRef();
@Override
public BytesRef next() throws IOException {
if (currentTerm++ < numTerms) {
// term string. first the size...
int termVectorSize = perFieldTermVectorInput.readVInt();
spare.grow(termVectorSize);
// ...then the value.
perFieldTermVectorInput.readBytes(spare.bytes, 0, termVectorSize);
spare.length = termVectorSize;
if (hasTermStatistic) {
docFreq = readPotentiallyNegativeVInt(perFieldTermVectorInput);
totalTermFrequency = readPotentiallyNegativeVLong(perFieldTermVectorInput);
}
freq = readPotentiallyNegativeVInt(perFieldTermVectorInput);
// grow the arrays to read the values. this is just
// for performance reasons. Re-use memory instead of
// realloc.
growBuffers();
// finally, read the values into the arrays
// currentPosition etc. so that we can just iterate
// later
writeInfos(perFieldTermVectorInput);
return spare;
} else {
return null;
}
}
private void writeInfos(final BytesStreamInput input) throws IOException {
for (int i = 0; i < freq; i++) {
if (hasPositions) {
positions[i] = input.readVInt();
}
if (hasOffsets) {
startOffsets[i] = input.readVInt();
endOffsets[i] = input.readVInt();
}
if (hasPayloads) {
int payloadLength = input.readVInt();
if (payloads[i] == null) {
payloads[i] = new BytesRef(payloadLength);
} else {
payloads[i].grow(payloadLength);
}
input.readBytes(payloads[i].bytes, 0, payloadLength);
payloads[i].length = payloadLength;
payloads[i].offset = 0;
}
}
}
private void growBuffers() {
if (hasPositions) {
positions = grow(positions, freq);
}
if (hasOffsets) {
startOffsets = grow(startOffsets, freq);
endOffsets = grow(endOffsets, freq);
}
if (hasPayloads) {
if (payloads.length < freq) {
final BytesRef[] newArray = new BytesRef[ArrayUtil.oversize(freq, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
System.arraycopy(payloads, 0, newArray, 0, payloads.length);
payloads = newArray;
}
}
}
@Override
public Comparator<BytesRef> getComparator() {
return BytesRef.getUTF8SortedAsUnicodeComparator();
}
@Override
public SeekStatus seekCeil(BytesRef text) throws IOException {
throw new UnsupportedOperationException();
}
@Override
public void seekExact(long ord) throws IOException {
throw new UnsupportedOperationException("Seek is not supported");
}
@Override
public BytesRef term() throws IOException {
return spare;
}
@Override
public long ord() throws IOException {
throw new UnsupportedOperationException("ordinals are not supported");
}
@Override
public int docFreq() throws IOException {
return docFreq;
}
@Override
public long totalTermFreq() throws IOException {
return totalTermFrequency;
}
@Override
public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags) throws IOException {
return docsAndPositions(liveDocs, reuse instanceof DocsAndPositionsEnum ? (DocsAndPositionsEnum) reuse : null, 0);
}
@Override
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) throws IOException {
final TermVectorsDocsAndPosEnum retVal = (reuse instanceof TermVectorsDocsAndPosEnum ? (TermVectorsDocsAndPosEnum) reuse
: new TermVectorsDocsAndPosEnum());
return retVal.reset(hasPositions ? positions : null, hasOffsets ? startOffsets : null, hasOffsets ? endOffsets
: null, hasPayloads ? payloads : null, freq);
}
};
}
@Override
public Comparator<BytesRef> getComparator() {
return BytesRef.getUTF8SortedAsUnicodeComparator();
}
@Override
public long size() throws IOException {
return numTerms;
}
@Override
public long getSumTotalTermFreq() throws IOException {
return sumTotalTermFreq;
}
@Override
public long getSumDocFreq() throws IOException {
return sumDocFreq;
}
@Override
public int getDocCount() throws IOException {
return docCount;
}
@Override
public boolean hasFreqs() {
return true;
}
@Override
public boolean hasOffsets() {
return hasOffsets;
}
@Override
public boolean hasPositions() {
return hasPositions;
}
@Override
public boolean hasPayloads() {
return hasPayloads;
}
};
long readOffset = fieldMap.lget();
return new TermVector(termVectors, readOffset);
}
@Override
@@ -379,7 +179,225 @@ public final class TermVectorFields extends Fields {
return fieldMap.size();
}
private final class TermVectorsDocsAndPosEnum extends DocsAndPositionsEnum {
private final class TermVector extends Terms {
private final BytesStreamInput perFieldTermVectorInput;
private final long readOffset;
private long numTerms;
private boolean hasPositions;
private boolean hasOffsets;
private boolean hasPayloads;
private long sumTotalTermFreq;
private long sumDocFreq;
private int docCount;
public TermVector(BytesReference termVectors, long readOffset) throws IOException {
this.perFieldTermVectorInput = new BytesStreamInput(termVectors);
this.readOffset = readOffset;
reset();
}
private void reset() throws IOException {
this.perFieldTermVectorInput.reset();
this.perFieldTermVectorInput.skip(readOffset);
// read how many terms....
this.numTerms = perFieldTermVectorInput.readVLong();
// ...if positions etc. were stored....
this.hasPositions = perFieldTermVectorInput.readBoolean();
this.hasOffsets = perFieldTermVectorInput.readBoolean();
this.hasPayloads = perFieldTermVectorInput.readBoolean();
// read the field statistics
this.sumTotalTermFreq = hasFieldStatistic ? readPotentiallyNegativeVLong(perFieldTermVectorInput) : -1;
this.sumDocFreq = hasFieldStatistic ? readPotentiallyNegativeVLong(perFieldTermVectorInput) : -1;
this.docCount = hasFieldStatistic ? readPotentiallyNegativeVInt(perFieldTermVectorInput) : -1;
}
@Override
public TermsEnum iterator(TermsEnum reuse) throws IOException {
// reset before asking for an iterator
reset();
// convert bytes ref for the terms to actual data
return new TermsEnum() {
int currentTerm = 0;
int freq = 0;
int docFreq = -1;
long totalTermFrequency = -1;
int[] positions = new int[1];
int[] startOffsets = new int[1];
int[] endOffsets = new int[1];
BytesRef[] payloads = new BytesRef[1];
final BytesRef spare = new BytesRef();
@Override
public BytesRef next() throws IOException {
if (currentTerm++ < numTerms) {
// term string. first the size...
int termVectorSize = perFieldTermVectorInput.readVInt();
spare.grow(termVectorSize);
// ...then the value.
perFieldTermVectorInput.readBytes(spare.bytes, 0, termVectorSize);
spare.length = termVectorSize;
if (hasTermStatistic) {
docFreq = readPotentiallyNegativeVInt(perFieldTermVectorInput);
totalTermFrequency = readPotentiallyNegativeVLong(perFieldTermVectorInput);
}
freq = readPotentiallyNegativeVInt(perFieldTermVectorInput);
// grow the arrays to read the values. this is just
// for performance reasons. Re-use memory instead of
// realloc.
growBuffers();
// finally, read the values into the arrays
// currentPosition etc. so that we can just iterate
// later
writeInfos(perFieldTermVectorInput);
return spare;
} else {
return null;
}
}
private void writeInfos(final BytesStreamInput input) throws IOException {
for (int i = 0; i < freq; i++) {
if (hasPositions) {
positions[i] = input.readVInt();
}
if (hasOffsets) {
startOffsets[i] = input.readVInt();
endOffsets[i] = input.readVInt();
}
if (hasPayloads) {
int payloadLength = input.readVInt();
if (payloads[i] == null) {
payloads[i] = new BytesRef(payloadLength);
} else {
payloads[i].grow(payloadLength);
}
input.readBytes(payloads[i].bytes, 0, payloadLength);
payloads[i].length = payloadLength;
payloads[i].offset = 0;
}
}
}
private void growBuffers() {
if (hasPositions) {
positions = grow(positions, freq);
}
if (hasOffsets) {
startOffsets = grow(startOffsets, freq);
endOffsets = grow(endOffsets, freq);
}
if (hasPayloads) {
if (payloads.length < freq) {
final BytesRef[] newArray = new BytesRef[ArrayUtil.oversize(freq, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
System.arraycopy(payloads, 0, newArray, 0, payloads.length);
payloads = newArray;
}
}
}
@Override
public Comparator<BytesRef> getComparator() {
return BytesRef.getUTF8SortedAsUnicodeComparator();
}
@Override
public SeekStatus seekCeil(BytesRef text) throws IOException {
throw new UnsupportedOperationException();
}
@Override
public void seekExact(long ord) throws IOException {
throw new UnsupportedOperationException("Seek is not supported");
}
@Override
public BytesRef term() throws IOException {
return spare;
}
@Override
public long ord() throws IOException {
throw new UnsupportedOperationException("ordinals are not supported");
}
@Override
public int docFreq() throws IOException {
return docFreq;
}
@Override
public long totalTermFreq() throws IOException {
return totalTermFrequency;
}
@Override
public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags) throws IOException {
return docsAndPositions(liveDocs, reuse instanceof DocsAndPositionsEnum ? (DocsAndPositionsEnum) reuse : null, 0);
}
@Override
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, int flags) throws IOException {
final TermVectorDocsAndPosEnum retVal = (reuse instanceof TermVectorDocsAndPosEnum ? (TermVectorDocsAndPosEnum) reuse
: new TermVectorDocsAndPosEnum());
return retVal.reset(hasPositions ? positions : null, hasOffsets ? startOffsets : null, hasOffsets ? endOffsets
: null, hasPayloads ? payloads : null, freq);
}
};
}
@Override
public Comparator<BytesRef> getComparator() {
return BytesRef.getUTF8SortedAsUnicodeComparator();
}
@Override
public long size() throws IOException {
return numTerms;
}
@Override
public long getSumTotalTermFreq() throws IOException {
return sumTotalTermFreq;
}
@Override
public long getSumDocFreq() throws IOException {
return sumDocFreq;
}
@Override
public int getDocCount() throws IOException {
return docCount;
}
@Override
public boolean hasFreqs() {
return true;
}
@Override
public boolean hasOffsets() {
return hasOffsets;
}
@Override
public boolean hasPositions() {
return hasPositions;
}
@Override
public boolean hasPayloads() {
return hasPayloads;
}
}
private final class TermVectorDocsAndPosEnum extends DocsAndPositionsEnum {
private boolean hasPositions;
private boolean hasOffsets;
private boolean hasPayloads;
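
For reference, the per-field byte layout decoded by reset() and the TermsEnum above, reconstructed from the reads in this class:

    // numTerms            : VLong
    // hasPositions        : boolean
    // hasOffsets          : boolean
    // hasPayloads         : boolean
    // if hasFieldStatistic:
    //   sumTotalTermFreq  : VLong
    //   sumDocFreq        : VLong
    //   docCount          : VInt
    // then, numTerms times:
    //   termLength        : VInt, followed by termLength bytes
    //   if hasTermStatistic:
    //     docFreq         : VInt
    //     totalTermFreq   : VLong
    //   freq              : VInt
    //   then, freq times:
    //     position               : VInt (if hasPositions)
    //     startOffset, endOffset : VInt, VInt (if hasOffsets)
    //     payloadLength + bytes  : VInt, then payloadLength bytes (if hasPayloads)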

View File

@@ -24,6 +24,7 @@ import org.elasticsearch.ElasticsearchParseException;
import org.elasticsearch.Version;
import org.elasticsearch.action.ActionRequestValidationException;
import org.elasticsearch.action.ValidateActions;
import org.elasticsearch.action.get.MultiGetRequest;
import org.elasticsearch.action.support.single.shard.SingleShardOperationRequest;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
@@ -68,7 +69,7 @@ public class TermVectorRequest extends SingleShardOperationRequest<TermVectorReq
this.id = id;
this.type = type;
}
/**
* Constructs a new term vector request for a document that will be fetch
* from the provided index. Use {@link #type(String)} and
@@ -86,6 +87,14 @@ public class TermVectorRequest extends SingleShardOperationRequest<TermVectorReq
}
}
public TermVectorRequest(MultiGetRequest.Item item) {
super(item.index());
this.id = item.id();
this.type = item.type();
this.selectedFields(item.fields());
this.routing(item.routing());
}
public EnumSet<Flag> getFlags() {
return flagsEnum;
}
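
A short sketch of the new constructor in use (values are illustrative):

    MultiGetRequest.Item item = new MultiGetRequest.Item("test", "person", "1")
            .fields("name.first", "name.last")
            .routing("user1");
    // copies index, type, id, selected fields and routing from the item
    TermVectorRequest request = new TermVectorRequest(item);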

View File

@@ -20,6 +20,7 @@
package org.elasticsearch.common.lucene.search;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
@@ -46,6 +47,7 @@ public class MoreLikeThisQuery extends Query {
private TFIDFSimilarity similarity;
private String[] likeText;
private Fields[] likeFields;
private String[] moreLikeFields;
private Analyzer analyzer;
private float percentTermsToMatch = DEFAULT_PERCENT_TERMS_TO_MATCH;
@@ -148,12 +150,18 @@ public class MoreLikeThisQuery extends Query {
mlt.setBoost(boostTerms);
mlt.setBoostFactor(boostTermsFactor);
Reader[] readers = new Reader[likeText.length];
for (int i = 0; i < readers.length; i++) {
readers[i] = new FastStringReader(likeText[i]);
BooleanQuery bq = new BooleanQuery();
if (this.likeFields != null) {
bq.add((BooleanQuery) mlt.like(this.likeFields), BooleanClause.Occur.SHOULD);
}
if (this.likeText != null) {
Reader[] readers = new Reader[likeText.length];
for (int i = 0; i < readers.length; i++) {
readers[i] = new FastStringReader(likeText[i]);
}
//LUCENE 4 UPGRADE this maps the 3.6 behavior (only use the first field)
bq.add((BooleanQuery) mlt.like(moreLikeFields[0], readers), BooleanClause.Occur.SHOULD);
}
//LUCENE 4 UPGRADE this maps the 3.6 behavior (only use the first field)
BooleanQuery bq = (BooleanQuery) mlt.like(moreLikeFields[0], readers);
BooleanClause[] clauses = bq.getClauses();
bq.setMinimumNumberShouldMatch((int) (clauses.length * percentTermsToMatch));
@@ -183,6 +191,14 @@ public class MoreLikeThisQuery extends Query {
this.likeText = likeText;
}
public Fields[] getLikeFields() {
return likeFields;
}
public void setLikeText(Fields... likeFields) {
this.likeFields = likeFields;
}
public void setLikeText(List<String> likeText) {
setLikeText(likeText.toArray(Strings.EMPTY_ARRAY));
}
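
Note that the new overload keeps the setLikeText name while taking pre-built term vectors; in the parser it is fed directly from the fetch service (a sketch, names as in the diffs here):

    // term vectors fetched via multi-termvectors are handed to the query as-is,
    // so the like text no longer has to be re-analyzed
    Fields[] likeFields = fetchService.fetch(items);
    mltQuery.setLikeText(likeFields);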

View File

@@ -53,11 +53,7 @@ import org.elasticsearch.common.io.FastStringReader;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.*;
/**
@@ -618,6 +614,49 @@ public final class XMoreLikeThis {
return createQuery(createQueue(words));
}
/**
* Return a query that will return docs like the passed Terms.
*
* @return a query that will return docs like the passed Terms.
*/
public Query like(Terms... likeTerms) throws IOException {
Map<String, Int> termFreqMap = new HashMap<>();
for (Terms vector : likeTerms) {
addTermFrequencies(termFreqMap, vector);
}
return createQuery(createQueue(termFreqMap));
}
/**
* Return a query that will return docs like the passed Fields.
*
* @return a query that will return docs like the passed Fields.
*/
public Query like(Fields... likeFields) throws IOException {
// get all field names
Set<String> fieldNames = new HashSet<>();
for (Fields fields : likeFields) {
for (String fieldName : fields) {
fieldNames.add(fieldName);
}
}
// to create one query per field name only
BooleanQuery bq = new BooleanQuery();
for (String fieldName : fieldNames) {
Map<String, Int> termFreqMap = new HashMap<>();
this.setFieldNames(new String[]{fieldName});
for (Fields fields : likeFields) {
Terms vector = fields.terms(fieldName);
if (vector != null) {
addTermFrequencies(termFreqMap, vector);
}
}
Query query = createQuery(createQueue(termFreqMap));
bq.add(query, BooleanClause.Occur.SHOULD);
}
return bq;
}
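A hedged usage sketch for the new overload (the reader could come from a MemoryIndex, as the tests in this commit do):

    Fields fields = MultiFields.getFields(reader);
    // one term-frequency query per field name, OR'ed together
    Query query = mlt.like(fields);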
/**
* Create the More like query from a PriorityQueue
*/
@@ -773,7 +812,9 @@ public final class XMoreLikeThis {
if (isNoiseWord(term)) {
continue;
}
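// NOTE: for a term vector, docs.freq() is the term's frequency within this document;
// totalTermFreq() would be the index-wide count (or -1 without term statistics)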
final int freq = (int) termsEnum.totalTermFreq();
DocsEnum docs = termsEnum.docs(null, null);
final int freq = docs.freq();
// increment frequency
Int cnt = termFreqMap.get(term);

View File

@@ -20,7 +20,6 @@
package org.elasticsearch.index.query;
import com.google.common.collect.Lists;
import com.google.common.collect.ObjectArrays;
import com.google.common.collect.Sets;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.queries.TermsFilter;
@@ -40,10 +39,12 @@ import org.elasticsearch.index.analysis.Analysis;
import org.elasticsearch.index.mapper.Uid;
import org.elasticsearch.index.mapper.internal.UidFieldMapper;
import org.elasticsearch.index.search.morelikethis.MoreLikeThisFetchService;
import org.elasticsearch.index.search.morelikethis.MoreLikeThisFetchService.LikeText;
import java.io.IOException;
import java.util.*;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
/**
*
@@ -201,54 +202,25 @@ public class MoreLikeThisQueryParser implements QueryParser {
}
if (item.fields() == null && item.fetchSourceContext() == null) {
item.fields(moreLikeFields.toArray(new String[moreLikeFields.size()]));
} else {
// TODO how about fields content fetched from _source?
removeUnsupportedFields(item, analyzer, failOnUnsupportedField);
}
}
// fetching the items with multi-get
List<LikeText> likeTexts = fetchService.fetch(items);
// collapse the text onto the same field name
Collection<LikeText> likeTextsCollapsed = collapseTextOnField(likeTexts);
// right now we are just building a boolean query
// fetching the items with multi-termvectors API
BooleanQuery boolQuery = new BooleanQuery();
for (LikeText likeText : likeTextsCollapsed) {
addMoreLikeThis(boolQuery, mltQuery, likeText);
}
org.apache.lucene.index.Fields[] likeFields = fetchService.fetch(items);
mltQuery.setLikeText(likeFields);
boolQuery.add(mltQuery, BooleanClause.Occur.SHOULD);
// exclude the items from the search
if (!include) {
TermsFilter filter = new TermsFilter(UidFieldMapper.NAME, Uid.createUids(items));
ConstantScoreQuery query = new ConstantScoreQuery(filter);
boolQuery.add(query, BooleanClause.Occur.MUST_NOT);
}
// add the possible mlt query with like_text
if (mltQuery.getLikeText() != null) {
boolQuery.add(mltQuery, BooleanClause.Occur.SHOULD);
}
return boolQuery;
}
return mltQuery;
}
private void addMoreLikeThis(BooleanQuery boolQuery, MoreLikeThisQuery mltQuery, LikeText likeText) {
MoreLikeThisQuery mlt = new MoreLikeThisQuery();
mlt.setMoreLikeFields(new String[] {likeText.field});
mlt.setLikeText(likeText.text);
mlt.setAnalyzer(mltQuery.getAnalyzer());
mlt.setPercentTermsToMatch(mltQuery.getPercentTermsToMatch());
mlt.setBoostTerms(mltQuery.isBoostTerms());
mlt.setBoostTermsFactor(mltQuery.getBoostTermsFactor());
mlt.setMinDocFreq(mltQuery.getMinDocFreq());
mlt.setMaxDocFreq(mltQuery.getMaxDocFreq());
mlt.setMinWordLen(mltQuery.getMinWordLen());
mlt.setMaxWordLen(mltQuery.getMaxWordLen());
mlt.setMinTermFrequency(mltQuery.getMinTermFrequency());
mlt.setMaxQueryTerms(mltQuery.getMaxQueryTerms());
mlt.setStopWords(mltQuery.getStopWords());
boolQuery.add(mlt, BooleanClause.Occur.SHOULD);
}
private List<String> removeUnsupportedFields(List<String> moreLikeFields, Analyzer analyzer, boolean failOnUnsupportedField) throws IOException {
for (Iterator<String> it = moreLikeFields.iterator(); it.hasNext(); ) {
final String fieldName = it.next();
@@ -262,22 +234,4 @@ public class MoreLikeThisQueryParser implements QueryParser {
}
return moreLikeFields;
}
public static Collection<LikeText> collapseTextOnField (Collection<LikeText> likeTexts) {
Map<String, LikeText> collapsedTexts = new HashMap<>();
for (LikeText likeText : likeTexts) {
String field = likeText.field;
String[] text = likeText.text;
if (collapsedTexts.containsKey(field)) {
text = ObjectArrays.concat(collapsedTexts.get(field).text, text, String.class);
}
collapsedTexts.put(field, new LikeText(field, text));
}
return collapsedTexts.values();
}
private void removeUnsupportedFields(MultiGetRequest.Item item, Analyzer analyzer, boolean failOnUnsupportedField) throws IOException {
item.fields((String[]) removeUnsupportedFields(Arrays.asList(item.fields()), analyzer, failOnUnsupportedField).toArray());
}
}

View File

@@ -19,15 +19,16 @@
package org.elasticsearch.index.search.morelikethis;
import org.elasticsearch.action.get.GetResponse;
import org.elasticsearch.action.get.MultiGetItemResponse;
import org.apache.lucene.index.Fields;
import org.elasticsearch.action.get.MultiGetRequest;
import org.elasticsearch.action.get.MultiGetResponse;
import org.elasticsearch.action.termvector.MultiTermVectorsItemResponse;
import org.elasticsearch.action.termvector.MultiTermVectorsRequest;
import org.elasticsearch.action.termvector.MultiTermVectorsResponse;
import org.elasticsearch.action.termvector.TermVectorResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.common.component.AbstractComponent;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.get.GetField;
import java.io.IOException;
import java.util.ArrayList;
@@ -38,21 +39,6 @@ import java.util.List;
*/
public class MoreLikeThisFetchService extends AbstractComponent {
public static final class LikeText {
public final String field;
public final String[] text;
public LikeText(String field, String text) {
this.field = field;
this.text = new String[]{text};
}
public LikeText(String field, String... text) {
this.field = field;
this.text = text;
}
}
private final Client client;
@Inject
@@ -61,30 +47,23 @@ public class MoreLikeThisFetchService extends AbstractComponent {
this.client = client;
}
public List<LikeText> fetch(List<MultiGetRequest.Item> items) throws IOException {
MultiGetRequest request = new MultiGetRequest();
public Fields[] fetch(List<MultiGetRequest.Item> items) throws IOException {
MultiTermVectorsRequest request = new MultiTermVectorsRequest();
for (MultiGetRequest.Item item : items) {
request.add(item);
}
MultiGetResponse responses = client.multiGet(request).actionGet();
List<LikeText> likeTexts = new ArrayList<>();
for (MultiGetItemResponse response : responses) {
List<Fields> likeFields = new ArrayList<>();
MultiTermVectorsResponse responses = client.multiTermVectors(request).actionGet();
for (MultiTermVectorsItemResponse response : responses) {
if (response.isFailed()) {
continue;
}
GetResponse getResponse = response.getResponse();
TermVectorResponse getResponse = response.getResponse();
if (!getResponse.isExists()) {
continue;
}
for (GetField getField : getResponse.getFields().values()) {
String[] text = new String[getField.getValues().size()];
for (int i = 0; i < text.length; i++) {
text[i] = getField.getValues().get(i).toString();
}
likeTexts.add(new LikeText(getField.getName(), text));
}
likeFields.add(getResponse.getFields());
}
return likeTexts;
return likeFields.toArray(Fields.EMPTY_ARRAY);
}
}

View File

@@ -21,18 +21,23 @@ package org.elasticsearch.index.query;
import com.google.common.collect.Lists;
import org.apache.lucene.index.Term;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.index.*;
import org.apache.lucene.index.memory.MemoryIndex;
import org.apache.lucene.queries.*;
import org.apache.lucene.sandbox.queries.FuzzyLikeThisQuery;
import org.apache.lucene.search.*;
import org.apache.lucene.search.spans.*;
import org.apache.lucene.spatial.prefix.IntersectsPrefixTreeFilter;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.NumericUtils;
import org.apache.lucene.util.UnicodeUtil;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.action.get.MultiGetRequest;
import org.elasticsearch.common.bytes.BytesArray;
import org.elasticsearch.common.compress.CompressedString;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.common.lucene.search.*;
import org.elasticsearch.common.lucene.search.function.BoostScoreFunction;
import org.elasticsearch.common.lucene.search.function.FunctionScoreQuery;
@@ -48,7 +53,6 @@ import org.elasticsearch.index.search.geo.GeoDistanceFilter;
import org.elasticsearch.index.search.geo.GeoPolygonFilter;
import org.elasticsearch.index.search.geo.InMemoryGeoBoundingBoxFilter;
import org.elasticsearch.index.search.morelikethis.MoreLikeThisFetchService;
import org.elasticsearch.index.search.morelikethis.MoreLikeThisFetchService.LikeText;
import org.elasticsearch.index.service.IndexService;
import org.elasticsearch.test.ElasticsearchSingleNodeTest;
import org.hamcrest.Matchers;
@@ -1591,37 +1595,24 @@ public class SimpleIndexQueryParserTests extends ElasticsearchSingleNodeTest {
MoreLikeThisQueryParser parser = (MoreLikeThisQueryParser) queryParser.queryParser("more_like_this");
parser.setFetchService(new MockMoreLikeThisFetchService());
List<LikeText> likeTexts = new ArrayList<>();
likeTexts.add(new LikeText("name.first", new String[]{
"test person 1 name.first", "test person 2 name.first", "test person 3 name.first", "test person 4 name.first"}));
likeTexts.add(new LikeText("name.last", new String[]{
"test person 1 name.last", "test person 2 name.last", "test person 3 name.last", "test person 4 name.last"}));
IndexQueryParserService queryParser = queryParser();
String query = copyToStringFromClasspath("/org/elasticsearch/index/query/mlt-items.json");
Query parsedQuery = queryParser.parse(query).query();
assertThat(parsedQuery, instanceOf(BooleanQuery.class));
BooleanQuery booleanQuery = (BooleanQuery) parsedQuery;
assertThat(booleanQuery.getClauses().length, is(likeTexts.size() + 1));
assertThat(booleanQuery.getClauses().length, is(1));
// check each clause is for each item
BooleanClause[] boolClauses = booleanQuery.getClauses();
for (int i = 0; i < likeTexts.size(); i++) {
BooleanClause booleanClause = booleanQuery.getClauses()[i];
assertThat(booleanClause.getOccur(), is(BooleanClause.Occur.SHOULD));
assertThat(booleanClause.getQuery(), instanceOf(MoreLikeThisQuery.class));
MoreLikeThisQuery mltQuery = (MoreLikeThisQuery) booleanClause.getQuery();
assertThat(mltQuery.getLikeTexts(), is(likeTexts.get(i).text));
assertThat(mltQuery.getMoreLikeFields()[0], equalTo(likeTexts.get(i).field));
BooleanClause itemClause = booleanQuery.getClauses()[0];
assertThat(itemClause.getOccur(), is(BooleanClause.Occur.SHOULD));
assertThat(itemClause.getQuery(), instanceOf(MoreLikeThisQuery.class));
MoreLikeThisQuery mltQuery = (MoreLikeThisQuery) itemClause.getQuery();
// check each Fields is for each item
for (int id = 1; id <= 4; id++) {
Fields fields = mltQuery.getLikeFields()[id - 1];
assertThat(termsToString(fields.terms("name.first")), is(String.valueOf(id)));
assertThat(termsToString(fields.terms("name.last")), is(String.valueOf(id)));
}
// check last clause is for 'like_text'
BooleanClause boolClause = boolClauses[boolClauses.length - 1];
assertThat(boolClause.getOccur(), is(BooleanClause.Occur.SHOULD));
assertThat(boolClause.getQuery(), instanceOf(MoreLikeThisQuery.class));
MoreLikeThisQuery mltQuery = (MoreLikeThisQuery) boolClause.getQuery();
assertArrayEquals("Not the same more like this 'fields'", new String[] {"name.first", "name.last"}, mltQuery.getMoreLikeFields());
assertThat(mltQuery.getLikeText(), equalTo("Apache Lucene"));
}
private static class MockMoreLikeThisFetchService extends MoreLikeThisFetchService {
@@ -1630,19 +1621,36 @@ public class SimpleIndexQueryParserTests extends ElasticsearchSingleNodeTest {
super(null, ImmutableSettings.Builder.EMPTY_SETTINGS);
}
public List<LikeText> fetch(List<MultiGetRequest.Item> items) throws IOException {
List<LikeText> likeTexts = new ArrayList<>();
for (MultiGetRequest.Item item: items) {
for (String field : item.fields()) {
LikeText likeText = new LikeText(
field, item.index() + " " + item.type() + " " + item.id() + " " + field);
likeTexts.add(likeText);
}
public Fields[] fetch(List<MultiGetRequest.Item> items) throws IOException {
List<Fields> likeTexts = new ArrayList<>();
for (MultiGetRequest.Item item : items) {
likeTexts.add(generateFields(item.fields(), item.id()));
}
return likeTexts;
return likeTexts.toArray(Fields.EMPTY_ARRAY);
}
}
private static Fields generateFields(String[] fieldNames, String text) throws IOException {
MemoryIndex index = new MemoryIndex();
for (String fieldName : fieldNames) {
index.addField(fieldName, text, new WhitespaceAnalyzer(Lucene.VERSION));
}
return MultiFields.getFields(index.createSearcher().getIndexReader());
}
private static String termsToString(Terms terms) throws IOException {
String strings = "";
TermsEnum termsEnum = terms.iterator(null);
CharsRef spare = new CharsRef();
BytesRef text;
while((text = termsEnum.next()) != null) {
UnicodeUtil.UTF8toUTF16(text, spare);
String term = spare.toString();
strings += term;
}
return strings;
}
@Test
public void testFuzzyLikeThisBuilder() throws Exception {
IndexQueryParserService queryParser = queryParser();