Term Vectors: terms filtering

This adds a new feature to the Term Vectors API which allows for filtering of
terms based on their tf-idf scores. With the `dfs` option enabled, this can be
useful for finding a good characteristic vector of a document or a set of documents.
The parameters are similar to the ones used in the MLT Query.
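
As a sketch, the new filter can also be driven through the Java API added in
this commit; the index, type, id and field names below are made up for
illustration:

[source,java]
--------------------------------------------------
// Hypothetical usage sketch; index, type, id and field names are made up.
TermVectorsRequest.FilterSettings filterSettings = new TermVectorsRequest.FilterSettings();
filterSettings.maxNumTerms = 3;  // keep only the three best terms per field
filterSettings.minDocFreq = 1;   // ignore terms missing from the index

TermVectorsResponse response = client.prepareTermVectors("imdb", "movies", "1")
        .setSelectedFields("plot")
        .setTermStatistics(true)
        .setFieldStatistics(true)
        .setFilterSettings(filterSettings)
        .get();
--------------------------------------------------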

Closes #9561
Alex Ksikes 2015-01-30 14:27:08 +01:00
parent 82df50a039
commit d339ee4005
12 changed files with 781 additions and 36 deletions

View File

@ -85,6 +85,34 @@ Setting `dfs` to `true` (default is `false`) will return the term statistics
or the field statistics of the entire index, and not just at the shard. Use it
with caution as distributed frequencies can have a serious performance impact.
[float]
==== Terms Filtering

coming[2.0]

With the parameter `filter`, the terms returned can also be filtered based
on their tf-idf scores. This can be useful for finding a good
characteristic vector of a document. The feature works in a similar manner to
the <<mlt-query-term-selection,second phase>> of the
<<query-dsl-mlt-query,More Like This Query>>. See <<docs-termvectors-terms-filtering,example 5>>
for usage.
The following sub-parameters are supported:
[horizontal]
`max_num_terms`::
The maximum number of terms to return per field. Defaults to `25`.
`min_term_freq`::
Ignore words with less than this frequency in the source doc. Defaults to `1`.
`max_term_freq`::
Ignore words with more than this frequency in the source doc. Defaults to unbounded.
`min_doc_freq`::
Ignore terms which do not occur in at least this many docs. Defaults to `1`.
`max_doc_freq`::
Ignore words which occur in more than this many docs. Defaults to unbounded.
`min_word_length`::
The minimum word length below which words will be ignored. Defaults to `0`.
`max_word_length`::
The maximum word length above which words will be ignored. Defaults to unbounded (`0`).
[float]
=== Behaviour
@ -337,3 +365,75 @@ Response:
}
}
--------------------------------------------------
[float]
[[docs-termvectors-terms-filtering]]
=== Example 5
Finally, the terms returned can be filtered based on their tf-idf scores. In
the example below we obtain the three most "interesting" keywords from an
artificial document with the given "plot" field value. We also ask for
distributed frequencies (`dfs`) to obtain more accurate results. Notice that
the keyword "Tony" and the stop words are absent from the response, as their
tf-idf scores are too low.
[source,js]
--------------------------------------------------
GET /imdb/movies/_termvectors
{
"doc": {
"plot": "When wealthy industrialist Tony Stark is forced to build an armored suit after a life-threatening incident, he ultimately decides to use its technology to fight against evil."
},
"term_statistics" : true,
"field_statistics" : true,
"dfs": true,
"positions": false,
"offsets": false,
"filter" : {
"max_num_terms" : 3,
"min_term_freq" : 1,
"min_doc_freq" : 1
}
}
--------------------------------------------------
Response:
[source,js]
--------------------------------------------------
{
"_index": "imdb",
"_type": "movies",
"_version": 0,
"found": true,
"term_vectors": {
"plot": {
"field_statistics": {
"sum_doc_freq": 3384269,
"doc_count": 176214,
"sum_ttf": 3753460
},
"terms": {
"armored": {
"doc_freq": 27,
"ttf": 27,
"term_freq": 1,
"score": 9.74725
},
"industrialist": {
"doc_freq": 88,
"ttf": 88,
"term_freq": 1,
"score": 8.590818
},
"stark": {
"doc_freq": 44,
"ttf": 47,
"term_freq": 1,
"score": 9.272792
}
}
}
}
}
--------------------------------------------------

View File

@ -178,6 +178,7 @@ The text to find documents like it.
A list of documents following the same syntax as the <<docs-multi-get,Multi GET API>>.
[float]
[[mlt-query-term-selection]]
==== Term Selection Parameters
[horizontal]

View File

@ -21,7 +21,11 @@ package org.elasticsearch.action.termvectors;
import com.carrotsearch.hppc.ObjectLongOpenHashMap;
import com.carrotsearch.hppc.cursors.ObjectLongCursor;
import org.apache.lucene.index.*;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.BoostAttribute;
import org.apache.lucene.util.*;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.io.stream.BytesStreamInput;
@ -40,7 +44,7 @@ import static org.apache.lucene.util.ArrayUtil.grow;
* <tt>-1</tt>, if no positions were returned by the {@link TermVectorsRequest}.
* <p/>
* The data is stored in two byte arrays ({@code headerRef} and
* {@code termVectors}, both {@link ByteRef}) that have the following format:
* {@code termVectors}, both {@link BytesRef}) that have the following format:
* <p/>
* {@code headerRef}: Stores offsets per field in the {@code termVectors} array
* and some header information as {@link BytesRef}. Format is
@ -113,6 +117,7 @@ public final class TermVectorsFields extends Fields {
private final BytesReference termVectors;
final boolean hasTermStatistic;
final boolean hasFieldStatistic;
public final boolean hasScores;
/**
* @param headerRef Stores offsets per field in the {@code termVectors} and some
@ -130,6 +135,7 @@ public final class TermVectorsFields extends Fields {
assert version == -1;
hasTermStatistic = header.readBoolean();
hasFieldStatistic = header.readBoolean();
hasScores = header.readBoolean();
final int numFields = header.readVInt();
for (int i = 0; i < numFields; i++) {
fieldMap.put((header.readString()), header.readVLong());
@ -226,6 +232,7 @@ public final class TermVectorsFields extends Fields {
int[] endOffsets = new int[1];
BytesRefBuilder[] payloads = new BytesRefBuilder[1];
final BytesRefBuilder spare = new BytesRefBuilder();
BoostAttribute boostAtt = this.attributes().addAttribute(BoostAttribute.class);
@Override
public BytesRef next() throws IOException {
@ -250,6 +257,11 @@ public final class TermVectorsFields extends Fields {
// currentPosition etc. so that we can just iterate
// later
writeInfos(perFieldTermVectorInput);
// read the score if available
if (hasScores) {
boostAtt.setBoost(perFieldTermVectorInput.readFloat());
}
return spare.get();
} else {
@ -482,5 +494,4 @@ public final class TermVectorsFields extends Fields {
long readPotentiallyNegativeVLong(BytesStreamInput stream) throws IOException {
return stream.readVLong() - 1;
}
}
}

View File

@ -0,0 +1,325 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.action.termvectors;
import com.google.common.util.concurrent.AtomicLongMap;
import org.apache.lucene.index.*;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.search.similarities.TFIDFSimilarity;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.search.dfs.AggregatedDfs;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
public class TermVectorsFilter {
public static final int DEFAULT_MAX_QUERY_TERMS = 25;
public static final int DEFAULT_MIN_TERM_FREQ = 0;
public static final int DEFAULT_MAX_TERM_FREQ = Integer.MAX_VALUE;
public static final int DEFAULT_MIN_DOC_FREQ = 0;
public static final int DEFAULT_MAX_DOC_FREQ = Integer.MAX_VALUE;
public static final int DEFAULT_MIN_WORD_LENGTH = 0;
public static final int DEFAULT_MAX_WORD_LENGTH = 0;
private int maxNumTerms = DEFAULT_MAX_QUERY_TERMS;
private int minTermFreq = DEFAULT_MIN_TERM_FREQ;
private int maxTermFreq = DEFAULT_MAX_TERM_FREQ;
private int minDocFreq = DEFAULT_MIN_DOC_FREQ;
private int maxDocFreq = DEFAULT_MAX_DOC_FREQ;
private int minWordLength = DEFAULT_MIN_WORD_LENGTH;
private int maxWordLength = DEFAULT_MAX_WORD_LENGTH;
private Fields fields;
private Fields topLevelFields;
private final Set<String> selectedFields;
private AggregatedDfs dfs;
private Map<Term, ScoreTerm> scoreTerms;
private AtomicLongMap<String> sizes;
private TFIDFSimilarity similarity;
public TermVectorsFilter(Fields termVectorsByField, Fields topLevelFields, Set<String> selectedFields, @Nullable AggregatedDfs dfs) {
this.fields = termVectorsByField;
this.topLevelFields = topLevelFields;
this.selectedFields = selectedFields;
this.dfs = dfs;
this.scoreTerms = new HashMap<>();
this.sizes = AtomicLongMap.create();
this.similarity = new DefaultSimilarity();
}
public void setSettings(TermVectorsRequest.FilterSettings settings) {
if (settings.maxNumTerms != null) {
setMaxNumTerms(settings.maxNumTerms);
}
if (settings.minTermFreq != null) {
setMinTermFreq(settings.minTermFreq);
}
if (settings.maxTermFreq != null) {
setMaxTermFreq(settings.maxTermFreq);
}
if (settings.minDocFreq != null) {
setMinDocFreq(settings.minDocFreq);
}
if (settings.maxDocFreq != null) {
setMaxDocFreq(settings.maxDocFreq);
}
if (settings.minWordLength != null) {
setMinWordLength(settings.minWordLength);
}
if (settings.maxWordLength != null) {
setMaxWordLength(settings.maxWordLength);
}
}
public ScoreTerm getScoreTerm(Term term) {
return scoreTerms.get(term);
}
public boolean hasScoreTerm(Term term) {
return getScoreTerm(term) != null;
}
public long size(String fieldName) {
return sizes.get(fieldName);
}
public int getMaxNumTerms() {
return maxNumTerms;
}
public int getMinTermFreq() {
return minTermFreq;
}
public int getMaxTermFreq() {
return maxTermFreq;
}
public int getMinDocFreq() {
return minDocFreq;
}
public int getMaxDocFreq() {
return maxDocFreq;
}
public int getMinWordLength() {
return minWordLength;
}
public int getMaxWordLength() {
return maxWordLength;
}
public void setMaxNumTerms(int maxNumTerms) {
this.maxNumTerms = maxNumTerms;
}
public void setMinTermFreq(int minTermFreq) {
this.minTermFreq = minTermFreq;
}
public void setMaxTermFreq(int maxTermFreq) {
this.maxTermFreq = maxTermFreq;
}
public void setMinDocFreq(int minDocFreq) {
this.minDocFreq = minDocFreq;
}
public void setMaxDocFreq(int maxDocFreq) {
this.maxDocFreq = maxDocFreq;
}
public void setMinWordLength(int minWordLength) {
this.minWordLength = minWordLength;
}
public void setMaxWordLength(int maxWordLength) {
this.maxWordLength = maxWordLength;
}
public static final class ScoreTerm {
public String field;
public String word;
public float score;
ScoreTerm(String field, String word, float score) {
this.field = field;
this.word = word;
this.score = score;
}
void update(String field, String word, float score) {
this.field = field;
this.word = word;
this.score = score;
}
}
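/**
 * Scores each term of the selected fields by tf-idf and, per field, keeps the
 * best {@code maxNumTerms} in a priority queue, skipping terms that fall
 * outside the configured frequency and word-length bounds.
 */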
public void selectBestTerms() throws IOException {
PostingsEnum docsEnum = null;
for (String fieldName : fields) {
if ((selectedFields != null) && (!selectedFields.contains(fieldName))) {
continue;
}
Terms terms = fields.terms(fieldName);
Terms topLevelTerms = topLevelFields.terms(fieldName);
// if no terms found, take the retrieved term vector fields for stats
if (topLevelTerms == null) {
topLevelTerms = terms;
}
long numDocs = getDocCount(fieldName, topLevelTerms);
// one queue per field name
ScoreTermsQueue queue = new ScoreTermsQueue(Math.min(maxNumTerms, (int) terms.size()));
// select terms with highest tf-idf
TermsEnum termsEnum = terms.iterator();
TermsEnum topLevelTermsEnum = topLevelTerms.iterator();
while (termsEnum.next() != null) {
BytesRef termBytesRef = termsEnum.term();
boolean foundTerm = topLevelTermsEnum.seekExact(termBytesRef);
assert foundTerm : "Term: " + termBytesRef.utf8ToString() + " not found!";
Term term = new Term(fieldName, termBytesRef);
// remove noise words
int freq = getTermFreq(termsEnum, docsEnum);
if (isNoise(term.bytes().utf8ToString(), freq)) {
continue;
}
// filter based on the term's document frequency
long docFreq = getTermStatistics(topLevelTermsEnum, term).docFreq();
if (!isAccepted(docFreq)) {
continue;
}
// filter based on score
float score = computeScore(docFreq, freq, numDocs);
queue.addOrUpdate(new ScoreTerm(term.field(), term.bytes().utf8ToString(), score));
}
// retain the best terms for quick lookups
ScoreTerm scoreTerm;
while ((scoreTerm = queue.pop()) != null) {
scoreTerms.put(new Term(scoreTerm.field, scoreTerm.word), scoreTerm);
sizes.incrementAndGet(scoreTerm.field);
}
}
}
private boolean isNoise(String word, int freq) {
// filter out words based on length
int len = word.length();
if (minWordLength > 0 && len < minWordLength) {
return true;
}
if (maxWordLength > 0 && len > maxWordLength) {
return true;
}
// filter out words that don't occur enough times in the source
if (minTermFreq > 0 && freq < minTermFreq) {
return true;
}
// filter out words that occur too many times in the source
if (freq > maxTermFreq) {
return true;
}
return false;
}
private boolean isAccepted(long docFreq) {
// filter out words that don't occur in enough docs
if (minDocFreq > 0 && docFreq < minDocFreq) {
return false;
}
// filter out words that occur in too many docs
if (docFreq > maxDocFreq) {
return false;
}
// index update problem?
if (docFreq == 0) {
return false;
}
return true;
}
private long getDocCount(String fieldName, Terms topLevelTerms) throws IOException {
if (dfs != null) {
return dfs.fieldStatistics().get(fieldName).docCount();
}
return topLevelTerms.getDocCount();
}
private TermStatistics getTermStatistics(TermsEnum termsEnum, Term term) throws IOException {
if (dfs != null) {
return dfs.termStatistics().get(term);
}
return new TermStatistics(termsEnum.term(), termsEnum.docFreq(), termsEnum.totalTermFreq());
}
private int getTermFreq(TermsEnum termsEnum, PostingsEnum docsEnum) throws IOException {
docsEnum = termsEnum.postings(null, docsEnum);
docsEnum.nextDoc();
return docsEnum.freq();
}
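// tf-idf; with Lucene's DefaultSimilarity, idf(docFreq, numDocs) = 1 + log(numDocs / (docFreq + 1))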
private float computeScore(long docFreq, int freq, long numDocs) {
return freq * similarity.idf(docFreq, numDocs);
}
private static class ScoreTermsQueue extends org.apache.lucene.util.PriorityQueue<ScoreTerm> {
private final int limit;
ScoreTermsQueue(int maxSize) {
super(maxSize);
this.limit = maxSize;
}
@Override
protected boolean lessThan(ScoreTerm a, ScoreTerm b) {
return a.score < b.score;
}
public void addOrUpdate(ScoreTerm scoreTerm) {
if (this.size() < limit) {
// there is still space in the queue
this.add(scoreTerm);
} else {
// otherwise update the smallest in the queue in place and update the queue
ScoreTerm scoreTermTop = this.top();
if (scoreTermTop.score < scoreTerm.score) {
scoreTermTop.update(scoreTerm.field, scoreTerm.word, scoreTerm.score);
this.updateTop();
}
}
}
}
}
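
As an aside, the bounded-queue selection that `ScoreTermsQueue` performs with
Lucene's `PriorityQueue` can be sketched stand-alone with
`java.util.PriorityQueue`. This is a hypothetical example, not part of the
commit; the words and scores are made up (loosely echoing example 5):

[source,java]
--------------------------------------------------
import java.util.AbstractMap.SimpleEntry;
import java.util.Comparator;
import java.util.Map;
import java.util.PriorityQueue;

// Hypothetical sketch: a min-heap bounded at N keeps the N highest-scoring terms.
public class TopTermsSketch {
    public static void main(String[] args) {
        final int limit = 3;
        PriorityQueue<Map.Entry<String, Float>> queue = new PriorityQueue<>(limit,
                new Comparator<Map.Entry<String, Float>>() {
                    @Override
                    public int compare(Map.Entry<String, Float> a, Map.Entry<String, Float> b) {
                        return Float.compare(a.getValue(), b.getValue());
                    }
                });
        Object[][] scored = {
                {"armored", 9.75f}, {"industrialist", 8.59f}, {"stark", 9.27f},
                {"tony", 2.10f}, {"the", 0.30f}  // made-up scores for noise words
        };
        for (Object[] term : scored) {
            queue.offer(new SimpleEntry<>((String) term[0], (Float) term[1]));
            if (queue.size() > limit) {
                queue.poll(); // evict the lowest-scoring entry once over the limit
            }
        }
        while (!queue.isEmpty()) { // prints the lowest surviving score first
            Map.Entry<String, Float> e = queue.poll();
            System.out.println(e.getKey() + " " + e.getValue());
        }
    }
}
--------------------------------------------------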

View File

@ -28,6 +28,7 @@ import org.elasticsearch.action.DocumentRequest;
import org.elasticsearch.action.ValidateActions;
import org.elasticsearch.action.get.MultiGetRequest;
import org.elasticsearch.action.support.single.shard.SingleShardOperationRequest;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
@ -74,6 +75,54 @@ public class TermVectorsRequest extends SingleShardOperationRequest<TermVectorsR
private Map<String, String> perFieldAnalyzer;
private FilterSettings filterSettings;
public static final class FilterSettings {
public Integer maxNumTerms;
public Integer minTermFreq;
public Integer maxTermFreq;
public Integer minDocFreq;
public Integer maxDocFreq;
public Integer minWordLength;
public Integer maxWordLength;
public FilterSettings() {
}
public FilterSettings(@Nullable Integer maxNumTerms, @Nullable Integer minTermFreq, @Nullable Integer maxTermFreq,
@Nullable Integer minDocFreq, @Nullable Integer maxDocFreq, @Nullable Integer minWordLength,
@Nullable Integer maxWordLength) {
this.maxNumTerms = maxNumTerms;
this.minTermFreq = minTermFreq;
this.maxTermFreq = maxTermFreq;
this.minDocFreq = minDocFreq;
this.maxDocFreq = maxDocFreq;
this.minWordLength = minWordLength;
this.maxWordLength = maxWordLength;
}
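// Each setting is serialized as an optional vint, so an unset parameter costs a single boolean on the wire.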
public void readFrom(StreamInput in) throws IOException {
maxNumTerms = in.readOptionalVInt();
minTermFreq = in.readOptionalVInt();
maxTermFreq = in.readOptionalVInt();
minDocFreq = in.readOptionalVInt();
maxDocFreq = in.readOptionalVInt();
minWordLength = in.readOptionalVInt();
maxWordLength = in.readOptionalVInt();
}
public void writeTo(StreamOutput out) throws IOException {
out.writeOptionalVInt(maxNumTerms);
out.writeOptionalVInt(minTermFreq);
out.writeOptionalVInt(maxTermFreq);
out.writeOptionalVInt(minDocFreq);
out.writeOptionalVInt(maxDocFreq);
out.writeOptionalVInt(minWordLength);
out.writeOptionalVInt(maxWordLength);
}
}
private EnumSet<Flag> flagsEnum = EnumSet.of(Flag.Positions, Flag.Offsets, Flag.Payloads,
Flag.FieldStatistics);
@ -375,6 +424,21 @@ public class TermVectorsRequest extends SingleShardOperationRequest<TermVectorsR
return this;
}
/**
* Return the settings for filtering out terms.
*/
public FilterSettings filterSettings() {
return this.filterSettings;
}
/**
* Sets the settings for filtering out terms.
*/
public TermVectorsRequest filterSettings(FilterSettings settings) {
this.filterSettings = settings;
return this;
}
public long version() {
return version;
}
@ -454,6 +518,10 @@ public class TermVectorsRequest extends SingleShardOperationRequest<TermVectorsR
if (in.readBoolean()) {
perFieldAnalyzer = readPerFieldAnalyzer(in.readMap());
}
if (in.readBoolean()) {
filterSettings = new FilterSettings();
filterSettings.readFrom(in);
}
realtime = in.readBoolean();
versionType = VersionType.fromValue(in.readByte());
version = in.readLong();
@ -488,6 +556,10 @@ public class TermVectorsRequest extends SingleShardOperationRequest<TermVectorsR
if (perFieldAnalyzer != null) {
out.writeGenericValue(perFieldAnalyzer);
}
out.writeBoolean(filterSettings != null);
if (filterSettings != null) {
filterSettings.writeTo(out);
}
out.writeBoolean(realtime());
out.writeByte(versionType.getValue());
out.writeLong(version);
@ -533,6 +605,8 @@ public class TermVectorsRequest extends SingleShardOperationRequest<TermVectorsR
termVectorsRequest.dfs(parser.booleanValue());
} else if (currentFieldName.equals("per_field_analyzer") || currentFieldName.equals("perFieldAnalyzer")) {
termVectorsRequest.perFieldAnalyzer(readPerFieldAnalyzer(parser.map()));
} else if (currentFieldName.equals("filter")) {
termVectorsRequest.filterSettings(readFilterSettings(parser, termVectorsRequest));
} else if ("_index".equals(currentFieldName)) { // the following is important for multi request parsing.
termVectorsRequest.index = parser.text();
} else if ("_type".equals(currentFieldName)) {
@ -577,4 +651,35 @@ public class TermVectorsRequest extends SingleShardOperationRequest<TermVectorsR
}
return mapStrStr;
}
private static FilterSettings readFilterSettings(XContentParser parser, TermVectorsRequest termVectorsRequest) throws IOException {
FilterSettings settings = new FilterSettings();
XContentParser.Token token;
String currentFieldName = null;
while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
if (token == XContentParser.Token.FIELD_NAME) {
currentFieldName = parser.currentName();
} else if (currentFieldName != null) {
if (currentFieldName.equals("max_num_terms")) {
settings.maxNumTerms = parser.intValue();
} else if (currentFieldName.equals("min_term_freq")) {
settings.minTermFreq = parser.intValue();
} else if (currentFieldName.equals("max_term_freq")) {
settings.maxTermFreq = parser.intValue();
} else if (currentFieldName.equals("min_doc_freq")) {
settings.minDocFreq = parser.intValue();
} else if (currentFieldName.equals("max_doc_freq")) {
settings.maxDocFreq = parser.intValue();
} else if (currentFieldName.equals("min_word_length")) {
settings.minWordLength = parser.intValue();
} else if (currentFieldName.equals("max_word_length")) {
settings.maxWordLength = parser.intValue();
} else {
throw new ElasticsearchParseException("The parameter " + currentFieldName
+ " is not valid for the filter parameter of a term vectors request!");
}
}
}
return settings;
}
}

View File

@ -200,6 +200,14 @@ public class TermVectorsRequestBuilder extends ActionRequestBuilder<TermVectorsR
return this;
}
/**
* Sets the settings for filtering out terms.
*/
public TermVectorsRequestBuilder setFilterSettings(TermVectorsRequest.FilterSettings filterSettings) {
request.filterSettings(filterSettings);
return this;
}
@Override
protected void doExecute(ActionListener<TermVectorsResponse> listener) {
client.termVectors(request, listener);

View File

@ -20,11 +20,11 @@
package org.elasticsearch.action.termvectors;
import com.google.common.collect.Iterators;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.BoostAttribute;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;
@ -55,6 +55,7 @@ public class TermVectorsResponse extends ActionResponse implements ToXContent {
public static final XContentBuilderString TTF = new XContentBuilderString("ttf");
public static final XContentBuilderString DOC_FREQ = new XContentBuilderString("doc_freq");
public static final XContentBuilderString TERM_FREQ = new XContentBuilderString("term_freq");
public static final XContentBuilderString SCORE = new XContentBuilderString("score");
// field statistics strings
public static final XContentBuilderString FIELD_STATISTICS = new XContentBuilderString("field_statistics");
@ -86,6 +87,7 @@ public class TermVectorsResponse extends ActionResponse implements ToXContent {
private boolean exists = false;
private boolean artificial = false;
private long tookInMillis;
private boolean hasScores = false;
private boolean sourceCopied = false;
@ -146,7 +148,9 @@ public class TermVectorsResponse extends ActionResponse implements ToXContent {
headerRef = headerRef.copyBytesArray();
termVectors = termVectors.copyBytesArray();
}
return new TermVectorsFields(headerRef, termVectors);
TermVectorsFields termVectorsFields = new TermVectorsFields(headerRef, termVectors);
hasScores = termVectorsFields.hasScores;
return termVectorsFields;
} else {
return new Fields() {
@Override
@ -202,14 +206,15 @@ public class TermVectorsResponse extends ActionResponse implements ToXContent {
buildFieldStatistics(builder, curTerms);
builder.startObject(FieldStrings.TERMS);
TermsEnum termIter = curTerms.iterator();
BoostAttribute boostAtt = termIter.attributes().addAttribute(BoostAttribute.class);
for (int i = 0; i < curTerms.size(); i++) {
buildTerm(builder, spare, curTerms, termIter);
buildTerm(builder, spare, curTerms, termIter, boostAtt);
}
builder.endObject();
builder.endObject();
}
private void buildTerm(XContentBuilder builder, final CharsRefBuilder spare, Terms curTerms, TermsEnum termIter) throws IOException {
private void buildTerm(XContentBuilder builder, final CharsRefBuilder spare, Terms curTerms, TermsEnum termIter, BoostAttribute boostAtt) throws IOException {
// start term, optimized writing
BytesRef term = termIter.next();
spare.copyUTF8Bytes(term);
@ -222,6 +227,7 @@ public class TermVectorsResponse extends ActionResponse implements ToXContent {
initMemory(curTerms, termFreq);
initValues(curTerms, posEnum, termFreq);
buildValues(builder, curTerms, termFreq);
buildScore(builder, boostAtt);
builder.endObject();
}
@ -332,6 +338,12 @@ public class TermVectorsResponse extends ActionResponse implements ToXContent {
public long getTookInMillis() {
return tookInMillis;
}
private void buildScore(XContentBuilder builder, BoostAttribute boostAtt) throws IOException {
if (hasScores) {
builder.field(FieldStrings.SCORE, boostAtt.getBoost());
}
}
public boolean isExists() {
return exists;
@ -342,14 +354,15 @@ public class TermVectorsResponse extends ActionResponse implements ToXContent {
}
public void setFields(Fields termVectorsByField, Set<String> selectedFields, EnumSet<Flag> flags, Fields topLevelFields) throws IOException {
setFields(termVectorsByField, selectedFields, flags, topLevelFields, null);
setFields(termVectorsByField, selectedFields, flags, topLevelFields, null, null);
}
public void setFields(Fields termVectorsByField, Set<String> selectedFields, EnumSet<Flag> flags, Fields topLevelFields, @Nullable AggregatedDfs dfs) throws IOException {
public void setFields(Fields termVectorsByField, Set<String> selectedFields, EnumSet<Flag> flags, Fields topLevelFields, @Nullable AggregatedDfs dfs,
TermVectorsFilter termVectorsFilter) throws IOException {
TermVectorsWriter tvw = new TermVectorsWriter(this);
if (termVectorsByField != null) {
tvw.setFields(termVectorsByField, selectedFields, flags, topLevelFields, dfs);
tvw.setFields(termVectorsByField, selectedFields, flags, topLevelFields, dfs, termVectorsFilter);
}
}

View File

@ -50,10 +50,13 @@ final class TermVectorsWriter {
response = termVectorsResponse;
}
void setFields(Fields termVectorsByField, Set<String> selectedFields, EnumSet<Flag> flags, Fields topLevelFields, @Nullable AggregatedDfs dfs) throws IOException {
void setFields(Fields termVectorsByField, Set<String> selectedFields, EnumSet<Flag> flags, Fields topLevelFields,
@Nullable AggregatedDfs dfs, @Nullable TermVectorsFilter termVectorsFilter) throws IOException {
int numFieldsWritten = 0;
PostingsEnum docsAndPosEnum = null;
PostingsEnum docsEnum = null;
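// the presence of a filter implies that per-term scores were computed and must be written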
boolean hasScores = termVectorsFilter != null;
for (String field : termVectorsByField) {
if ((selectedFields != null) && (!selectedFields.contains(field))) {
continue;
@ -71,7 +74,13 @@ final class TermVectorsWriter {
boolean positions = flags.contains(Flag.Positions) && fieldTermVector.hasPositions();
boolean offsets = flags.contains(Flag.Offsets) && fieldTermVector.hasOffsets();
boolean payloads = flags.contains(Flag.Payloads) && fieldTermVector.hasPayloads();
startField(field, fieldTermVector.size(), positions, offsets, payloads);
long termsSize = fieldTermVector.size();
if (hasScores) {
termsSize = Math.min(termsSize, termVectorsFilter.size(field));
}
startField(field, termsSize, positions, offsets, payloads);
if (flags.contains(Flag.FieldStatistics)) {
if (dfs != null) {
writeFieldStatistics(dfs.fieldStatistics().get(field));
@ -81,15 +90,21 @@ final class TermVectorsWriter {
}
TermsEnum iterator = fieldTermVector.iterator();
final boolean useDocsAndPos = positions || offsets || payloads;
while (iterator.next() != null) { // iterate all terms of the
// current field
// get the doc frequency
BytesRef term = iterator.term();
boolean foundTerm = topLevelIterator.seekExact(term);
startTerm(term);
while (iterator.next() != null) { // iterate all terms of the current field
BytesRef termBytesRef = iterator.term();
boolean foundTerm = topLevelIterator.seekExact(termBytesRef);
Term term = new Term(field, termBytesRef);
// with filtering we only keep the best terms
if (hasScores && !termVectorsFilter.hasScoreTerm(term)) {
continue;
}
startTerm(termBytesRef);
if (flags.contains(Flag.TermStatistics)) {
// get the doc frequency
if (dfs != null) {
writeTermStatistics(dfs.termStatistics().get(new Term(field, term.utf8ToString())));
writeTermStatistics(dfs.termStatistics().get(term));
} else {
writeTermStatistics(topLevelIterator);
}
@ -102,14 +117,17 @@ final class TermVectorsWriter {
// get the frequency from a PostingsEnum.
docsEnum = writeTermWithDocsOnly(iterator, docsEnum);
}
if (hasScores) {
writeScoreTerm(termVectorsFilter.getScoreTerm(term));
}
}
numFieldsWritten++;
}
response.setTermVectorsField(output);
response.setHeader(writeHeader(numFieldsWritten, flags.contains(Flag.TermStatistics), flags.contains(Flag.FieldStatistics)));
response.setHeader(writeHeader(numFieldsWritten, flags.contains(Flag.TermStatistics), flags.contains(Flag.FieldStatistics), hasScores));
}
private BytesReference writeHeader(int numFieldsWritten, boolean getTermStatistics, boolean getFieldStatistics) throws IOException {
private BytesReference writeHeader(int numFieldsWritten, boolean getTermStatistics, boolean getFieldStatistics, boolean scores) throws IOException {
// now, write the information about offset of the terms in the
// termVectors field
BytesStreamOutput header = new BytesStreamOutput();
@ -117,6 +135,7 @@ final class TermVectorsWriter {
header.writeInt(CURRENT_VERSION);
header.writeBoolean(getTermStatistics);
header.writeBoolean(getFieldStatistics);
header.writeBoolean(scores);
header.writeVInt(numFieldsWritten);
for (int i = 0; i < fields.size(); i++) {
header.writeString(fields.get(i));
@ -205,7 +224,6 @@ final class TermVectorsWriter {
private void startTerm(BytesRef term) throws IOException {
output.writeVInt(term.length);
output.writeBytes(term.bytes, term.offset, term.length);
}
private void writeTermStatistics(TermsEnum topLevelIterator) throws IOException {
@ -250,6 +268,10 @@ final class TermVectorsWriter {
writePotentiallyNegativeVInt(dc);
}
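// negative scores are clamped to zero before transport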
private void writeScoreTerm(TermVectorsFilter.ScoreTerm scoreTerm) throws IOException {
output.writeFloat(Math.max(0, scoreTerm.score));
}
private void writePotentiallyNegativeVInt(int value) throws IOException {
// term freq etc. can be negative if not present... we transport that
// further...
@ -261,5 +283,4 @@ final class TermVectorsWriter {
// further...
output.writeVLong(Math.max(0, value + 1));
}
}

View File

@ -229,6 +229,14 @@ public abstract class StreamInput extends InputStream {
return null;
}
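// Reads an optional vint written by StreamOutput#writeOptionalVInt: a presence flag, then the value.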
@Nullable
public Integer readOptionalVInt() throws IOException {
if (readBoolean()) {
return readVInt();
}
return null;
}
private final CharsRefBuilder spare = new CharsRefBuilder();
public String readString() throws IOException {

View File

@ -176,6 +176,15 @@ public abstract class StreamOutput extends OutputStream {
}
}
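// Mirrors StreamInput#readOptionalVInt: a presence boolean, followed by the vint when non-null.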
public void writeOptionalVInt(@Nullable Integer integer) throws IOException {
if (integer == null) {
writeBoolean(false);
} else {
writeBoolean(true);
writeVInt(integer);
}
}
public void writeOptionalText(@Nullable Text text) throws IOException {
if (text == null) {
writeInt(-1);

View File

@ -23,6 +23,7 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.*;
import org.apache.lucene.index.memory.MemoryIndex;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.action.termvectors.TermVectorsFilter;
import org.elasticsearch.action.termvectors.TermVectorsRequest;
import org.elasticsearch.action.termvectors.TermVectorsResponse;
import org.elasticsearch.action.termvectors.dfs.DfsOnlyRequest;
@ -82,8 +83,10 @@ public class ShardTermVectorsService extends AbstractIndexShardComponent {
Engine.GetResult get = indexShard.get(new Engine.Get(request.realtime(), uidTerm).version(request.version()).versionType(request.versionType()));
Fields termVectorsByField = null;
boolean docFromTranslog = get.source() != null;
AggregatedDfs dfs = null;
TermVectorsFilter termVectorsFilter = null;
/* a document fetched from the translog is treated as an artificial document */
if (docFromTranslog) {
@ -102,22 +105,18 @@ public class ShardTermVectorsService extends AbstractIndexShardComponent {
Versions.DocIdAndVersion docIdAndVersion = get.docIdAndVersion();
/* from an artificial document */
if (request.doc() != null) {
Fields termVectorsByField = generateTermVectorsFromDoc(request, !docFromTranslog);
termVectorsByField = generateTermVectorsFromDoc(request, !docFromTranslog);
// if no document indexed in shard, take the queried document itself for stats
if (topLevelFields == null) {
topLevelFields = termVectorsByField;
}
if (termVectorsByField != null && useDfs(request)) {
dfs = getAggregatedDfs(termVectorsByField, request);
}
termVectorsResponse.setFields(termVectorsByField, request.selectedFields(), request.getFlags(), topLevelFields, dfs);
termVectorsResponse.setExists(true);
termVectorsResponse.setArtificial(!docFromTranslog);
termVectorsResponse.setExists(true);
}
/* or from an existing document */
else if (docIdAndVersion != null) {
// fields with stored term vectors
Fields termVectorsByField = docIdAndVersion.context.reader().getTermVectors(docIdAndVersion.docId);
termVectorsByField = docIdAndVersion.context.reader().getTermVectors(docIdAndVersion.docId);
Set<String> selectedFields = request.selectedFields();
// generate tvs for fields where analyzer is overridden
if (selectedFields == null && request.perFieldAnalyzer() != null) {
@ -127,15 +126,31 @@ public class ShardTermVectorsService extends AbstractIndexShardComponent {
if (selectedFields != null) {
termVectorsByField = addGeneratedTermVectors(get, termVectorsByField, request, selectedFields);
}
if (termVectorsByField != null && useDfs(request)) {
dfs = getAggregatedDfs(termVectorsByField, request);
}
termVectorsResponse.setFields(termVectorsByField, request.selectedFields(), request.getFlags(), topLevelFields, dfs);
termVectorsResponse.setDocVersion(docIdAndVersion.version);
termVectorsResponse.setExists(true);
} else {
}
/* no term vectors generated or found */
else {
termVectorsResponse.setExists(false);
}
/* if there are term vectors, optionally compute dfs and/or apply terms filtering */
if (termVectorsByField != null) {
if (useDfs(request)) {
dfs = getAggregatedDfs(termVectorsByField, request);
}
if (request.filterSettings() != null) {
termVectorsFilter = new TermVectorsFilter(termVectorsByField, topLevelFields, request.selectedFields(), dfs);
termVectorsFilter.setSettings(request.filterSettings());
try {
termVectorsFilter.selectBestTerms();
} catch (IOException e) {
throw new ElasticsearchException("failed to select best terms", e);
}
}
// write term vectors
termVectorsResponse.setFields(termVectorsByField, request.selectedFields(), request.getFlags(), topLevelFields, dfs, termVectorsFilter);
}
} catch (Throwable ex) {
throw new ElasticsearchException("failed to execute term vector request", ex);
} finally {

View File

@ -1225,4 +1225,133 @@ public class GetTermVectorsTests extends AbstractTermVectorsTests {
assertThat(response.getIndex(), equalTo("test"));
assertThat(response.getVersion(), equalTo(2l));
}
@Test
public void testFilterLength() throws ExecutionException, InterruptedException, IOException {
logger.info("Setting up the index ...");
ImmutableSettings.Builder settings = settingsBuilder()
.put(indexSettings())
.put("index.analysis.analyzer", "keyword");
assertAcked(prepareCreate("test")
.setSettings(settings)
.addMapping("type1", "tags", "type=string"));
ensureYellow();
int numTerms = scaledRandomIntBetween(10, 50);
logger.info("Indexing one document with tags of increasing length ...");
List<String> tags = new ArrayList<>();
for (int i = 0; i < numTerms; i++) {
String tag = "a";
for (int j = 0; j < i; j++) {
tag += "a";
}
tags.add(tag);
}
indexRandom(true, client().prepareIndex("test", "type1", "1").setSource("tags", tags));
logger.info("Checking best tags by longest to shortest size ...");
TermVectorsRequest.FilterSettings filterSettings = new TermVectorsRequest.FilterSettings();
filterSettings.maxNumTerms = numTerms;
TermVectorsResponse response;
for (int i = 0; i < numTerms; i++) {
filterSettings.minWordLength = numTerms - i;
response = client().prepareTermVectors("test", "type1", "1")
.setSelectedFields("tags")
.setFieldStatistics(true)
.setTermStatistics(true)
.setFilterSettings(filterSettings)
.get();
checkBestTerms(response.getFields().terms("tags"), tags.subList((numTerms - i - 1), numTerms));
}
}
@Test
public void testFilterTermFreq() throws ExecutionException, InterruptedException, IOException {
logger.info("Setting up the index ...");
ImmutableSettings.Builder settings = settingsBuilder()
.put(indexSettings())
.put("index.analysis.analyzer", "keyword");
assertAcked(prepareCreate("test")
.setSettings(settings)
.addMapping("type1", "tags", "type=string"));
ensureYellow();
logger.info("Indexing one document with tags of increasing frequencies ...");
int numTerms = scaledRandomIntBetween(10, 50);
List<String> tags = new ArrayList<>();
List<String> uniqueTags = new ArrayList<>();
String tag;
for (int i = 0; i < numTerms; i++) {
tag = "tag_" + i;
tags.add(tag);
for (int j = 0; j < i; j++) {
tags.add(tag);
}
uniqueTags.add(tag);
}
indexRandom(true, client().prepareIndex("test", "type1", "1").setSource("tags", tags));
logger.info("Checking best tags by highest to lowest term freq ...");
TermVectorsRequest.FilterSettings filterSettings = new TermVectorsRequest.FilterSettings();
TermVectorsResponse response;
for (int i = 0; i < numTerms; i++) {
filterSettings.maxNumTerms = i + 1;
response = client().prepareTermVectors("test", "type1", "1")
.setSelectedFields("tags")
.setFieldStatistics(true)
.setTermStatistics(true)
.setFilterSettings(filterSettings)
.get();
checkBestTerms(response.getFields().terms("tags"), uniqueTags.subList((numTerms - i - 1), numTerms));
}
}
@Test
public void testFilterDocFreq() throws ExecutionException, InterruptedException, IOException {
logger.info("Setting up the index ...");
ImmutableSettings.Builder settings = settingsBuilder()
.put(indexSettings())
.put("index.analysis.analyzer", "keyword")
.put("index.number_of_shards", 1); // no dfs
assertAcked(prepareCreate("test")
.setSettings(settings)
.addMapping("type1", "tags", "type=string"));
ensureYellow();
int numDocs = scaledRandomIntBetween(10, 50); // as many terms as there are docs
logger.info("Indexing {} documents with tags of increasing dfs ...", numDocs);
List<IndexRequestBuilder> builders = new ArrayList<>();
List<String> tags = new ArrayList<>();
for (int i = 0; i < numDocs; i++) {
tags.add("tag_" + i);
builders.add(client().prepareIndex("test", "type1", i + "").setSource("tags", tags));
}
indexRandom(true, builders);
logger.info("Checking best terms by highest to lowest idf ...");
TermVectorsRequest.FilterSettings filterSettings = new TermVectorsRequest.FilterSettings();
TermVectorsResponse response;
for (int i = 0; i < numDocs; i++) {
filterSettings.maxNumTerms = i + 1;
response = client().prepareTermVectors("test", "type1", (numDocs - 1) + "")
.setSelectedFields("tags")
.setFieldStatistics(true)
.setTermStatistics(true)
.setFilterSettings(filterSettings)
.get();
checkBestTerms(response.getFields().terms("tags"), tags.subList((numDocs - i - 1), numDocs));
}
}
private void checkBestTerms(Terms terms, List<String> expectedTerms) throws IOException {
final TermsEnum termsEnum = terms.iterator();
List<String> bestTerms = new ArrayList<>();
BytesRef text;
while((text = termsEnum.next()) != null) {
bestTerms.add(text.utf8ToString());
}
Collections.sort(expectedTerms);
Collections.sort(bestTerms);
assertArrayEquals(expectedTerms.toArray(), bestTerms.toArray());
}
}