Term Vectors: terms filtering
This adds a new feature to the Term Vectors API which allows for filtering of terms based on their tf-idf scores. With the `dfs` option enabled, this can be useful for finding a good characteristic vector of a document or a set of documents. The parameters are similar to the ones used in the MLT Query. Closes #9561
parent 82df50a039 · commit d339ee4005
@@ -85,6 +85,34 @@ Setting `dfs` to `true` (default is `false`) will return the term statistics
or the field statistics of the entire index, and not just at the shard. Use it
with caution as distributed frequencies can have a serious performance impact.

[float]
==== Terms Filtering coming[2.0]

With the parameter `filter`, the terms returned can also be filtered based
on their tf-idf scores. This can be useful in order to find out a good
characteristic vector of a document. This feature works in a similar manner to
the <<mlt-query-term-selection,second phase>> of the
<<query-dsl-mlt-query,More Like This Query>>. See <<docs-termvectors-terms-filtering,example 5>>
for usage.

The following sub-parameters are supported:

[horizontal]
`max_num_terms`::
  Maximum number of terms to be returned per field. Defaults to `25`.
`min_term_freq`::
  Ignore words with less than this frequency in the source doc. Defaults to `1`.
`max_term_freq`::
  Ignore words with more than this frequency in the source doc. Defaults to unbounded.
`min_doc_freq`::
  Ignore terms which do not occur in at least this many docs. Defaults to `1`.
`max_doc_freq`::
  Ignore words which occur in more than this many docs. Defaults to unbounded.
`min_word_length`::
  The minimum word length below which words will be ignored. Defaults to `0`.
`max_word_length`::
  The maximum word length above which words will be ignored. Defaults to unbounded (`0`).

[float]
=== Behaviour
@@ -337,3 +365,75 @@ Response:
    }
}
--------------------------------------------------

[float]
[[docs-termvectors-terms-filtering]]
=== Example 5

Finally, the terms returned can be filtered based on their tf-idf scores. In
the example below, we obtain the three most "interesting" keywords from the
artificial document having the given "plot" field value. Additionally, we ask
for distributed frequencies to obtain more accurate results. Notice that
neither the keyword "Tony" nor any of the stop words are part of the
response, as their tf-idf scores are too low.

[source,js]
--------------------------------------------------
GET /imdb/movies/_termvectors
{
    "doc": {
        "plot": "When wealthy industrialist Tony Stark is forced to build an armored suit after a life-threatening incident, he ultimately decides to use its technology to fight against evil."
    },
    "term_statistics": true,
    "field_statistics": true,
    "dfs": true,
    "positions": false,
    "offsets": false,
    "filter": {
        "max_num_terms": 3,
        "min_term_freq": 1,
        "min_doc_freq": 1
    }
}
--------------------------------------------------

Response:

[source,js]
--------------------------------------------------
{
    "_index": "imdb",
    "_type": "movies",
    "_version": 0,
    "found": true,
    "term_vectors": {
        "plot": {
            "field_statistics": {
                "sum_doc_freq": 3384269,
                "doc_count": 176214,
                "sum_ttf": 3753460
            },
            "terms": {
                "armored": {
                    "doc_freq": 27,
                    "ttf": 27,
                    "term_freq": 1,
                    "score": 9.74725
                },
                "industrialist": {
                    "doc_freq": 88,
                    "ttf": 88,
                    "term_freq": 1,
                    "score": 8.590818
                },
                "stark": {
                    "doc_freq": 44,
                    "ttf": 47,
                    "term_freq": 1,
                    "score": 9.272792
                }
            }
        }
    }
}
--------------------------------------------------
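The scores above can be derived by hand: `TermVectorsFilter` (added in this
commit) computes `score = term_freq * idf`, using the idf of Lucene's
`DefaultSimilarity`, i.e. `1 + ln(doc_count / (doc_freq + 1))`. A minimal,
self-contained sketch (plain Java, outside the ES codebase) reproducing the
score of "armored" from the numbers in the response:

[source,java]
--------------------------------------------------
public class ScoreCheck {
    // idf as in Lucene's DefaultSimilarity: 1 + ln(numDocs / (docFreq + 1))
    static float idf(long docFreq, long numDocs) {
        return (float) (Math.log(numDocs / (double) (docFreq + 1)) + 1.0);
    }

    public static void main(String[] args) {
        long docCount = 176214; // "doc_count" from field_statistics
        long docFreq = 27;      // "doc_freq" of "armored"
        int termFreq = 1;       // "term_freq" of "armored"
        System.out.println(termFreq * idf(docFreq, docCount)); // ~9.74725
    }
}
--------------------------------------------------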
@@ -178,6 +178,7 @@ The text to find documents like it.
A list of documents following the same syntax as the <<docs-multi-get,Multi GET API>>.

[float]
[[mlt-query-term-selection]]
==== Term Selection Parameters

[horizontal]
@@ -21,7 +21,11 @@ package org.elasticsearch.action.termvectors;

import com.carrotsearch.hppc.ObjectLongOpenHashMap;
import com.carrotsearch.hppc.cursors.ObjectLongCursor;
import org.apache.lucene.index.*;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.BoostAttribute;
import org.apache.lucene.util.*;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.io.stream.BytesStreamInput;
@@ -40,7 +44,7 @@ import static org.apache.lucene.util.ArrayUtil.grow;
 * <tt>-1,</tt>, if no positions were returned by the {@link TermVectorsRequest}.
 * <p/>
 * The data is stored in two byte arrays ({@code headerRef} and
 * {@code termVectors}, both {@link ByteRef}) that have the following format:
 * {@code termVectors}, both {@link BytesRef}) that have the following format:
 * <p/>
 * {@code headerRef}: Stores offsets per field in the {@code termVectors} array
 * and some header information as {@link BytesRef}. Format is
@@ -113,6 +117,7 @@ public final class TermVectorsFields extends Fields {
    private final BytesReference termVectors;
    final boolean hasTermStatistic;
    final boolean hasFieldStatistic;
    public final boolean hasScores;

    /**
     * @param headerRef Stores offsets per field in the {@code termVectors} and some
@@ -130,6 +135,7 @@ public final class TermVectorsFields extends Fields {
        assert version == -1;
        hasTermStatistic = header.readBoolean();
        hasFieldStatistic = header.readBoolean();
        hasScores = header.readBoolean();
        final int numFields = header.readVInt();
        for (int i = 0; i < numFields; i++) {
            fieldMap.put((header.readString()), header.readVLong());
@@ -226,6 +232,7 @@ public final class TermVectorsFields extends Fields {
        int[] endOffsets = new int[1];
        BytesRefBuilder[] payloads = new BytesRefBuilder[1];
        final BytesRefBuilder spare = new BytesRefBuilder();
        BoostAttribute boostAtt = this.attributes().addAttribute(BoostAttribute.class);

        @Override
        public BytesRef next() throws IOException {
@@ -250,6 +257,11 @@ public final class TermVectorsFields extends Fields {
                // currentPosition etc. so that we can just iterate
                // later
                writeInfos(perFieldTermVectorInput);

                // read the score if available
                if (hasScores) {
                    boostAtt.setBoost(perFieldTermVectorInput.readFloat());
                }
                return spare.get();

            } else {
@@ -482,5 +494,4 @@ public final class TermVectorsFields extends Fields {
        long readPotentiallyNegativeVLong(BytesStreamInput stream) throws IOException {
            return stream.readVLong() - 1;
        }

    }
}
@@ -0,0 +1,325 @@
/*
 * Licensed to Elasticsearch under one or more contributor
 * license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright
 * ownership. Elasticsearch licenses this file to you under
 * the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.elasticsearch.action.termvectors;

import com.google.common.util.concurrent.AtomicLongMap;
import org.apache.lucene.index.*;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.similarities.DefaultSimilarity;
import org.apache.lucene.search.similarities.TFIDFSimilarity;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.search.dfs.AggregatedDfs;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;

public class TermVectorsFilter {
    public static final int DEFAULT_MAX_QUERY_TERMS = 25;
    public static final int DEFAULT_MIN_TERM_FREQ = 0;
    public static final int DEFAULT_MAX_TERM_FREQ = Integer.MAX_VALUE;
    public static final int DEFAULT_MIN_DOC_FREQ = 0;
    public static final int DEFAULT_MAX_DOC_FREQ = Integer.MAX_VALUE;
    public static final int DEFAULT_MIN_WORD_LENGTH = 0;
    public static final int DEFAULT_MAX_WORD_LENGTH = 0;

    private int maxNumTerms = DEFAULT_MAX_QUERY_TERMS;
    private int minTermFreq = DEFAULT_MIN_TERM_FREQ;
    private int maxTermFreq = DEFAULT_MAX_TERM_FREQ;
    private int minDocFreq = DEFAULT_MIN_DOC_FREQ;
    private int maxDocFreq = DEFAULT_MAX_DOC_FREQ;
    private int minWordLength = DEFAULT_MIN_WORD_LENGTH;
    private int maxWordLength = DEFAULT_MAX_WORD_LENGTH;

    private Fields fields;
    private Fields topLevelFields;
    private final Set<String> selectedFields;
    private AggregatedDfs dfs;
    private Map<Term, ScoreTerm> scoreTerms;
    private AtomicLongMap<String> sizes;
    private TFIDFSimilarity similarity;

    public TermVectorsFilter(Fields termVectorsByField, Fields topLevelFields, Set<String> selectedFields, @Nullable AggregatedDfs dfs) {
        this.fields = termVectorsByField;
        this.topLevelFields = topLevelFields;
        this.selectedFields = selectedFields;

        this.dfs = dfs;
        this.scoreTerms = new HashMap<>();
        this.sizes = AtomicLongMap.create();
        this.similarity = new DefaultSimilarity();
    }

    public void setSettings(TermVectorsRequest.FilterSettings settings) {
        if (settings.maxNumTerms != null) {
            setMaxNumTerms(settings.maxNumTerms);
        }
        if (settings.minTermFreq != null) {
            setMinTermFreq(settings.minTermFreq);
        }
        if (settings.maxTermFreq != null) {
            setMaxTermFreq(settings.maxTermFreq);
        }
        if (settings.minDocFreq != null) {
            setMinDocFreq(settings.minDocFreq);
        }
        if (settings.maxDocFreq != null) {
            setMaxDocFreq(settings.maxDocFreq);
        }
        if (settings.minWordLength != null) {
            setMinWordLength(settings.minWordLength);
        }
        if (settings.maxWordLength != null) {
            setMaxWordLength(settings.maxWordLength);
        }
    }

    public ScoreTerm getScoreTerm(Term term) {
        return scoreTerms.get(term);
    }

    public boolean hasScoreTerm(Term term) {
        return getScoreTerm(term) != null;
    }

    public long size(String fieldName) {
        return sizes.get(fieldName);
    }

    public int getMaxNumTerms() {
        return maxNumTerms;
    }

    public int getMinTermFreq() {
        return minTermFreq;
    }

    public int getMaxTermFreq() {
        return maxTermFreq;
    }

    public int getMinDocFreq() {
        return minDocFreq;
    }

    public int getMaxDocFreq() {
        return maxDocFreq;
    }

    public int getMinWordLength() {
        return minWordLength;
    }

    public int getMaxWordLength() {
        return maxWordLength;
    }

    public void setMaxNumTerms(int maxNumTerms) {
        this.maxNumTerms = maxNumTerms;
    }

    public void setMinTermFreq(int minTermFreq) {
        this.minTermFreq = minTermFreq;
    }

    public void setMaxTermFreq(int maxTermFreq) {
        this.maxTermFreq = maxTermFreq;
    }

    public void setMinDocFreq(int minDocFreq) {
        this.minDocFreq = minDocFreq;
    }

    public void setMaxDocFreq(int maxDocFreq) {
        this.maxDocFreq = maxDocFreq;
    }

    public void setMinWordLength(int minWordLength) {
        this.minWordLength = minWordLength;
    }

    public void setMaxWordLength(int maxWordLength) {
        this.maxWordLength = maxWordLength;
    }

    public static final class ScoreTerm {
        public String field;
        public String word;
        public float score;

        ScoreTerm(String field, String word, float score) {
            this.field = field;
            this.word = word;
            this.score = score;
        }

        void update(String field, String word, float score) {
            this.field = field;
            this.word = word;
            this.score = score;
        }
    }

    public void selectBestTerms() throws IOException {
        PostingsEnum docsEnum = null;

        for (String fieldName : fields) {
            if ((selectedFields != null) && (!selectedFields.contains(fieldName))) {
                continue;
            }

            Terms terms = fields.terms(fieldName);
            Terms topLevelTerms = topLevelFields.terms(fieldName);

            // if no terms found, take the retrieved term vector fields for stats
            if (topLevelTerms == null) {
                topLevelTerms = terms;
            }

            long numDocs = getDocCount(fieldName, topLevelTerms);

            // one queue per field name
            ScoreTermsQueue queue = new ScoreTermsQueue(Math.min(maxNumTerms, (int) terms.size()));

            // select terms with highest tf-idf
            TermsEnum termsEnum = terms.iterator();
            TermsEnum topLevelTermsEnum = topLevelTerms.iterator();
            while (termsEnum.next() != null) {
                BytesRef termBytesRef = termsEnum.term();
                boolean foundTerm = topLevelTermsEnum.seekExact(termBytesRef);
                assert foundTerm : "Term: " + termBytesRef.utf8ToString() + " not found!";

                Term term = new Term(fieldName, termBytesRef);

                // remove noise words
                int freq = getTermFreq(termsEnum, docsEnum);
                if (isNoise(term.bytes().utf8ToString(), freq)) {
                    continue;
                }

                // now call on docFreq
                long docFreq = getTermStatistics(topLevelTermsEnum, term).docFreq();
                if (!isAccepted(docFreq)) {
                    continue;
                }

                // filter based on score
                float score = computeScore(docFreq, freq, numDocs);
                queue.addOrUpdate(new ScoreTerm(term.field(), term.bytes().utf8ToString(), score));
            }

            // retain the best terms for quick lookups
            ScoreTerm scoreTerm;
            while ((scoreTerm = queue.pop()) != null) {
                scoreTerms.put(new Term(scoreTerm.field, scoreTerm.word), scoreTerm);
                sizes.incrementAndGet(scoreTerm.field);
            }
        }
    }

    private boolean isNoise(String word, int freq) {
        // filter out words based on length
        int len = word.length();
        if (minWordLength > 0 && len < minWordLength) {
            return true;
        }
        if (maxWordLength > 0 && len > maxWordLength) {
            return true;
        }
        // filter out words that don't occur enough times in the source
        if (minTermFreq > 0 && freq < minTermFreq) {
            return true;
        }
        // filter out words that occur too many times in the source
        if (freq > maxTermFreq) {
            return true;
        }
        return false;
    }

    private boolean isAccepted(long docFreq) {
        // filter out words that don't occur in enough docs
        if (minDocFreq > 0 && docFreq < minDocFreq) {
            return false;
        }
        // filter out words that occur in too many docs
        if (docFreq > maxDocFreq) {
            return false;
        }
        // index update problem?
        if (docFreq == 0) {
            return false;
        }
        return true;
    }

    private long getDocCount(String fieldName, Terms topLevelTerms) throws IOException {
        if (dfs != null) {
            return dfs.fieldStatistics().get(fieldName).docCount();
        }
        return topLevelTerms.getDocCount();
    }

    private TermStatistics getTermStatistics(TermsEnum termsEnum, Term term) throws IOException {
        if (dfs != null) {
            return dfs.termStatistics().get(term);
        }
        return new TermStatistics(termsEnum.term(), termsEnum.docFreq(), termsEnum.totalTermFreq());
    }

    private int getTermFreq(TermsEnum termsEnum, PostingsEnum docsEnum) throws IOException {
        docsEnum = termsEnum.postings(null, docsEnum);
        docsEnum.nextDoc();
        return docsEnum.freq();
    }

    private float computeScore(long docFreq, int freq, long numDocs) {
        return freq * similarity.idf(docFreq, numDocs);
    }

    private static class ScoreTermsQueue extends org.apache.lucene.util.PriorityQueue<ScoreTerm> {
        private final int limit;

        ScoreTermsQueue(int maxSize) {
            super(maxSize);
            this.limit = maxSize;
        }

        @Override
        protected boolean lessThan(ScoreTerm a, ScoreTerm b) {
            return a.score < b.score;
        }

        public void addOrUpdate(ScoreTerm scoreTerm) {
            if (this.size() < limit) {
                // there is still space in the queue
                this.add(scoreTerm);
            } else {
                // otherwise update the smallest in the queue in place and update the queue
                ScoreTerm scoreTermTop = this.top();
                if (scoreTermTop.score < scoreTerm.score) {
                    scoreTermTop.update(scoreTerm.field, scoreTerm.word, scoreTerm.score);
                    this.updateTop();
                }
            }
        }
    }
}
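`ScoreTermsQueue.addOrUpdate` above is a standard fixed-capacity top-N selection: fill a min-heap up to `limit`, then let a candidate displace the current minimum only if it scores higher. A self-contained sketch of the same technique with `java.util.PriorityQueue` (the `TopN` class is illustrative, not part of this commit):

[source,java]
--------------------------------------------------
import java.util.PriorityQueue;

// Top-N selection as in ScoreTermsQueue: keep the N highest scores by
// evicting the current minimum once the queue is at capacity.
class TopN {
    private final int limit;
    private final PriorityQueue<Float> queue = new PriorityQueue<>(); // min-heap

    TopN(int limit) { this.limit = limit; }

    void addOrUpdate(float score) {
        if (queue.size() < limit) {
            queue.add(score);     // there is still space in the queue
        } else if (queue.peek() < score) {
            queue.poll();         // drop the current minimum
            queue.add(score);     // and keep the better candidate
        }
    }

    public static void main(String[] args) {
        TopN top3 = new TopN(3);
        for (float s : new float[]{9.7f, 2.1f, 8.6f, 9.3f, 0.5f}) {
            top3.addOrUpdate(s);
        }
        System.out.println(top3.queue); // the three best, in heap order: [8.6, 9.7, 9.3]
    }
}
--------------------------------------------------

Lucene's `PriorityQueue` avoids the poll/add pair by updating the top element in place (`updateTop()`), but the effect is the same.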
@@ -28,6 +28,7 @@ import org.elasticsearch.action.DocumentRequest;
import org.elasticsearch.action.ValidateActions;
import org.elasticsearch.action.get.MultiGetRequest;
import org.elasticsearch.action.support.single.shard.SingleShardOperationRequest;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
@@ -74,6 +75,54 @@ public class TermVectorsRequest extends SingleShardOperationRequest<TermVectorsR

    private Map<String, String> perFieldAnalyzer;

    private FilterSettings filterSettings;

    public static final class FilterSettings {
        public Integer maxNumTerms;
        public Integer minTermFreq;
        public Integer maxTermFreq;
        public Integer minDocFreq;
        public Integer maxDocFreq;
        public Integer minWordLength;
        public Integer maxWordLength;

        public FilterSettings() {

        }

        public FilterSettings(@Nullable Integer maxNumTerms, @Nullable Integer minTermFreq, @Nullable Integer maxTermFreq,
                              @Nullable Integer minDocFreq, @Nullable Integer maxDocFreq, @Nullable Integer minWordLength,
                              @Nullable Integer maxWordLength) {
            this.maxNumTerms = maxNumTerms;
            this.minTermFreq = minTermFreq;
            this.maxTermFreq = maxTermFreq;
            this.minDocFreq = minDocFreq;
            this.maxDocFreq = maxDocFreq;
            this.minWordLength = minWordLength;
            this.maxWordLength = maxWordLength;
        }

        public void readFrom(StreamInput in) throws IOException {
            maxNumTerms = in.readOptionalVInt();
            minTermFreq = in.readOptionalVInt();
            maxTermFreq = in.readOptionalVInt();
            minDocFreq = in.readOptionalVInt();
            maxDocFreq = in.readOptionalVInt();
            minWordLength = in.readOptionalVInt();
            maxWordLength = in.readOptionalVInt();
        }

        public void writeTo(StreamOutput out) throws IOException {
            out.writeOptionalVInt(maxNumTerms);
            out.writeOptionalVInt(minTermFreq);
            out.writeOptionalVInt(maxTermFreq);
            out.writeOptionalVInt(minDocFreq);
            out.writeOptionalVInt(maxDocFreq);
            out.writeOptionalVInt(minWordLength);
            out.writeOptionalVInt(maxWordLength);
        }
    }

    private EnumSet<Flag> flagsEnum = EnumSet.of(Flag.Positions, Flag.Offsets, Flag.Payloads,
            Flag.FieldStatistics);

@@ -375,6 +424,21 @@ public class TermVectorsRequest extends SingleShardOperationRequest<TermVectorsR
        return this;
    }

    /**
     * Return the settings for filtering out terms.
     */
    public FilterSettings filterSettings() {
        return this.filterSettings;
    }

    /**
     * Sets the settings for filtering out terms.
     */
    public TermVectorsRequest filterSettings(FilterSettings settings) {
        this.filterSettings = settings != null ? settings : null;
        return this;
    }

    public long version() {
        return version;
    }
@@ -454,6 +518,10 @@ public class TermVectorsRequest extends SingleShardOperationRequest<TermVectorsR
        if (in.readBoolean()) {
            perFieldAnalyzer = readPerFieldAnalyzer(in.readMap());
        }
        if (in.readBoolean()) {
            filterSettings = new FilterSettings();
            filterSettings.readFrom(in);
        }
        realtime = in.readBoolean();
        versionType = VersionType.fromValue(in.readByte());
        version = in.readLong();
@@ -488,6 +556,10 @@ public class TermVectorsRequest extends SingleShardOperationRequest<TermVectorsR
        if (perFieldAnalyzer != null) {
            out.writeGenericValue(perFieldAnalyzer);
        }
        out.writeBoolean(filterSettings != null);
        if (filterSettings != null) {
            filterSettings.writeTo(out);
        }
        out.writeBoolean(realtime());
        out.writeByte(versionType.getValue());
        out.writeLong(version);
@@ -533,6 +605,8 @@ public class TermVectorsRequest extends SingleShardOperationRequest<TermVectorsR
            termVectorsRequest.dfs(parser.booleanValue());
        } else if (currentFieldName.equals("per_field_analyzer") || currentFieldName.equals("perFieldAnalyzer")) {
            termVectorsRequest.perFieldAnalyzer(readPerFieldAnalyzer(parser.map()));
        } else if (currentFieldName.equals("filter")) {
            termVectorsRequest.filterSettings(readFilterSettings(parser, termVectorsRequest));
        } else if ("_index".equals(currentFieldName)) { // the following is important for multi request parsing.
            termVectorsRequest.index = parser.text();
        } else if ("_type".equals(currentFieldName)) {
@@ -577,4 +651,35 @@ public class TermVectorsRequest extends SingleShardOperationRequest<TermVectorsR
        }
        return mapStrStr;
    }

    private static FilterSettings readFilterSettings(XContentParser parser, TermVectorsRequest termVectorsRequest) throws IOException {
        FilterSettings settings = new FilterSettings();
        XContentParser.Token token;
        String currentFieldName = null;
        while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
            if (token == XContentParser.Token.FIELD_NAME) {
                currentFieldName = parser.currentName();
            } else if (currentFieldName != null) {
                if (currentFieldName.equals("max_num_terms")) {
                    settings.maxNumTerms = parser.intValue();
                } else if (currentFieldName.equals("min_term_freq")) {
                    settings.minTermFreq = parser.intValue();
                } else if (currentFieldName.equals("max_term_freq")) {
                    settings.maxTermFreq = parser.intValue();
                } else if (currentFieldName.equals("min_doc_freq")) {
                    settings.minDocFreq = parser.intValue();
                } else if (currentFieldName.equals("max_doc_freq")) {
                    settings.maxDocFreq = parser.intValue();
                } else if (currentFieldName.equals("min_word_length")) {
                    settings.minWordLength = parser.intValue();
                } else if (currentFieldName.equals("max_word_length")) {
                    settings.maxWordLength = parser.intValue();
                } else {
                    throw new ElasticsearchParseException("The parameter " + currentFieldName
                            + " is not valid for filter parameter for term vector request!");
                }
            }
        }
        return settings;
    }
}

@@ -200,6 +200,14 @@ public class TermVectorsRequestBuilder extends ActionRequestBuilder<TermVectorsR
        return this;
    }

    /**
     * Sets the settings for filtering out terms.
     */
    public TermVectorsRequestBuilder setFilterSettings(TermVectorsRequest.FilterSettings filterSettings) {
        request.filterSettings(filterSettings);
        return this;
    }

    @Override
    protected void doExecute(ActionListener<TermVectorsResponse> listener) {
        client.termVectors(request, listener);
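For reference, a sketch of how a client would drive the new builder method, mirroring the tests added at the end of this diff (index `test`, type `type1`, id `1` are placeholder values taken from those tests, and a connected `client` is assumed):

[source,java]
--------------------------------------------------
TermVectorsRequest.FilterSettings filterSettings = new TermVectorsRequest.FilterSettings();
filterSettings.maxNumTerms = 3;   // keep only the 3 best-scoring terms
filterSettings.minTermFreq = 1;
filterSettings.minDocFreq = 1;

TermVectorsResponse response = client.prepareTermVectors("test", "type1", "1")
        .setSelectedFields("tags")
        .setFieldStatistics(true)
        .setTermStatistics(true)
        .setFilterSettings(filterSettings)
        .get();
--------------------------------------------------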
@@ -20,11 +20,11 @@
package org.elasticsearch.action.termvectors;

import com.google.common.collect.Iterators;

import org.apache.lucene.index.Fields;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.BoostAttribute;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;
@@ -55,6 +55,7 @@ public class TermVectorsResponse extends ActionResponse implements ToXContent {
    public static final XContentBuilderString TTF = new XContentBuilderString("ttf");
    public static final XContentBuilderString DOC_FREQ = new XContentBuilderString("doc_freq");
    public static final XContentBuilderString TERM_FREQ = new XContentBuilderString("term_freq");
    public static final XContentBuilderString SCORE = new XContentBuilderString("score");

    // field statistics strings
    public static final XContentBuilderString FIELD_STATISTICS = new XContentBuilderString("field_statistics");
@@ -86,6 +87,7 @@ public class TermVectorsResponse extends ActionResponse implements ToXContent {
    private boolean exists = false;
    private boolean artificial = false;
    private long tookInMillis;
    private boolean hasScores = false;

    private boolean sourceCopied = false;

@@ -146,7 +148,9 @@ public class TermVectorsResponse extends ActionResponse implements ToXContent {
                headerRef = headerRef.copyBytesArray();
                termVectors = termVectors.copyBytesArray();
            }
            return new TermVectorsFields(headerRef, termVectors);
            TermVectorsFields termVectorsFields = new TermVectorsFields(headerRef, termVectors);
            hasScores = termVectorsFields.hasScores;
            return termVectorsFields;
        } else {
            return new Fields() {
                @Override
@@ -202,14 +206,15 @@ public class TermVectorsResponse extends ActionResponse implements ToXContent {
        buildFieldStatistics(builder, curTerms);
        builder.startObject(FieldStrings.TERMS);
        TermsEnum termIter = curTerms.iterator();
        BoostAttribute boostAtt = termIter.attributes().addAttribute(BoostAttribute.class);
        for (int i = 0; i < curTerms.size(); i++) {
            buildTerm(builder, spare, curTerms, termIter);
            buildTerm(builder, spare, curTerms, termIter, boostAtt);
        }
        builder.endObject();
        builder.endObject();
    }

    private void buildTerm(XContentBuilder builder, final CharsRefBuilder spare, Terms curTerms, TermsEnum termIter) throws IOException {
    private void buildTerm(XContentBuilder builder, final CharsRefBuilder spare, Terms curTerms, TermsEnum termIter, BoostAttribute boostAtt) throws IOException {
        // start term, optimized writing
        BytesRef term = termIter.next();
        spare.copyUTF8Bytes(term);
@@ -222,6 +227,7 @@ public class TermVectorsResponse extends ActionResponse implements ToXContent {
        initMemory(curTerms, termFreq);
        initValues(curTerms, posEnum, termFreq);
        buildValues(builder, curTerms, termFreq);
        buildScore(builder, boostAtt);
        builder.endObject();
    }

@@ -332,6 +338,12 @@ public class TermVectorsResponse extends ActionResponse implements ToXContent {
    public long getTookInMillis() {
        return tookInMillis;
    }

    private void buildScore(XContentBuilder builder, BoostAttribute boostAtt) throws IOException {
        if (hasScores) {
            builder.field(FieldStrings.SCORE, boostAtt.getBoost());
        }
    }

    public boolean isExists() {
        return exists;
@@ -342,14 +354,15 @@ public class TermVectorsResponse extends ActionResponse implements ToXContent {
    }

    public void setFields(Fields termVectorsByField, Set<String> selectedFields, EnumSet<Flag> flags, Fields topLevelFields) throws IOException {
        setFields(termVectorsByField, selectedFields, flags, topLevelFields, null);
        setFields(termVectorsByField, selectedFields, flags, topLevelFields, null, null);
    }

    public void setFields(Fields termVectorsByField, Set<String> selectedFields, EnumSet<Flag> flags, Fields topLevelFields, @Nullable AggregatedDfs dfs) throws IOException {
    public void setFields(Fields termVectorsByField, Set<String> selectedFields, EnumSet<Flag> flags, Fields topLevelFields, @Nullable AggregatedDfs dfs,
                          TermVectorsFilter termVectorsFilter) throws IOException {
        TermVectorsWriter tvw = new TermVectorsWriter(this);

        if (termVectorsByField != null) {
            tvw.setFields(termVectorsByField, selectedFields, flags, topLevelFields, dfs);
            tvw.setFields(termVectorsByField, selectedFields, flags, topLevelFields, dfs, termVectorsFilter);
        }
    }

@@ -50,10 +50,13 @@ final class TermVectorsWriter {
        response = termVectorsResponse;
    }

    void setFields(Fields termVectorsByField, Set<String> selectedFields, EnumSet<Flag> flags, Fields topLevelFields, @Nullable AggregatedDfs dfs) throws IOException {
    void setFields(Fields termVectorsByField, Set<String> selectedFields, EnumSet<Flag> flags, Fields topLevelFields,
                   @Nullable AggregatedDfs dfs, @Nullable TermVectorsFilter termVectorsFilter) throws IOException {
        int numFieldsWritten = 0;
        PostingsEnum docsAndPosEnum = null;
        PostingsEnum docsEnum = null;
        boolean hasScores = termVectorsFilter != null;

        for (String field : termVectorsByField) {
            if ((selectedFields != null) && (!selectedFields.contains(field))) {
                continue;
@@ -71,7 +74,13 @@ final class TermVectorsWriter {
            boolean positions = flags.contains(Flag.Positions) && fieldTermVector.hasPositions();
            boolean offsets = flags.contains(Flag.Offsets) && fieldTermVector.hasOffsets();
            boolean payloads = flags.contains(Flag.Payloads) && fieldTermVector.hasPayloads();
            startField(field, fieldTermVector.size(), positions, offsets, payloads);

            long termsSize = fieldTermVector.size();
            if (hasScores) {
                termsSize = Math.min(termsSize, termVectorsFilter.size(field));
            }
            startField(field, termsSize, positions, offsets, payloads);

            if (flags.contains(Flag.FieldStatistics)) {
                if (dfs != null) {
                    writeFieldStatistics(dfs.fieldStatistics().get(field));
@@ -81,15 +90,21 @@ final class TermVectorsWriter {
            }
            TermsEnum iterator = fieldTermVector.iterator();
            final boolean useDocsAndPos = positions || offsets || payloads;
            while (iterator.next() != null) { // iterate all terms of the
                // current field
                // get the doc frequency
                BytesRef term = iterator.term();
                boolean foundTerm = topLevelIterator.seekExact(term);
                startTerm(term);
            while (iterator.next() != null) { // iterate all terms of the current field
                BytesRef termBytesRef = iterator.term();
                boolean foundTerm = topLevelIterator.seekExact(termBytesRef);
                Term term = new Term(field, termBytesRef);

                // with filtering we only keep the best terms
                if (hasScores && !termVectorsFilter.hasScoreTerm(term)) {
                    continue;
                }

                startTerm(termBytesRef);
                if (flags.contains(Flag.TermStatistics)) {
                    // get the doc frequency
                    if (dfs != null) {
                        writeTermStatistics(dfs.termStatistics().get(new Term(field, term.utf8ToString())));
                        writeTermStatistics(dfs.termStatistics().get(term));
                    } else {
                        writeTermStatistics(topLevelIterator);
                    }
@@ -102,14 +117,17 @@ final class TermVectorsWriter {
                    // get the frequency from a PostingsEnum.
                    docsEnum = writeTermWithDocsOnly(iterator, docsEnum);
                }
                if (hasScores) {
                    writeScoreTerm(termVectorsFilter.getScoreTerm(term));
                }
            }
            numFieldsWritten++;
        }
        response.setTermVectorsField(output);
        response.setHeader(writeHeader(numFieldsWritten, flags.contains(Flag.TermStatistics), flags.contains(Flag.FieldStatistics)));
        response.setHeader(writeHeader(numFieldsWritten, flags.contains(Flag.TermStatistics), flags.contains(Flag.FieldStatistics), hasScores));
    }

    private BytesReference writeHeader(int numFieldsWritten, boolean getTermStatistics, boolean getFieldStatistics) throws IOException {
    private BytesReference writeHeader(int numFieldsWritten, boolean getTermStatistics, boolean getFieldStatistics, boolean scores) throws IOException {
        // now, write the information about offset of the terms in the
        // termVectors field
        BytesStreamOutput header = new BytesStreamOutput();
@@ -117,6 +135,7 @@ final class TermVectorsWriter {
        header.writeInt(CURRENT_VERSION);
        header.writeBoolean(getTermStatistics);
        header.writeBoolean(getFieldStatistics);
        header.writeBoolean(scores);
        header.writeVInt(numFieldsWritten);
        for (int i = 0; i < fields.size(); i++) {
            header.writeString(fields.get(i));
@@ -205,7 +224,6 @@ final class TermVectorsWriter {
    private void startTerm(BytesRef term) throws IOException {
        output.writeVInt(term.length);
        output.writeBytes(term.bytes, term.offset, term.length);

    }

    private void writeTermStatistics(TermsEnum topLevelIterator) throws IOException {
@@ -250,6 +268,10 @@ final class TermVectorsWriter {
        writePotentiallyNegativeVInt(dc);
    }

    private void writeScoreTerm(TermVectorsFilter.ScoreTerm scoreTerm) throws IOException {
        output.writeFloat(Math.max(0, scoreTerm.score));
    }

    private void writePotentiallyNegativeVInt(int value) throws IOException {
        // term freq etc. can be negative if not present... we transport that
        // further...
@@ -261,5 +283,4 @@ final class TermVectorsWriter {
        // further...
        output.writeVLong(Math.max(0, value + 1));
    }

}
@@ -229,6 +229,14 @@ public abstract class StreamInput extends InputStream {
        return null;
    }

    @Nullable
    public Integer readOptionalVInt() throws IOException {
        if (readBoolean()) {
            return readVInt();
        }
        return null;
    }

    private final CharsRefBuilder spare = new CharsRefBuilder();

    public String readString() throws IOException {

@@ -176,6 +176,15 @@ public abstract class StreamOutput extends OutputStream {
        }
    }

    public void writeOptionalVInt(@Nullable Integer integer) throws IOException {
        if (integer == null) {
            writeBoolean(false);
        } else {
            writeBoolean(true);
            writeVInt(integer);
        }
    }

    public void writeOptionalText(@Nullable Text text) throws IOException {
        if (text == null) {
            writeInt(-1);
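The `writeOptionalVInt`/`readOptionalVInt` pair added above encodes a presence flag (one boolean byte) followed by the value as a variable-length int. A self-contained sketch of that wire format using plain `java.io` (the vint codec below mirrors Lucene's variable-length int encoding; `OptionalVIntDemo` is illustrative, not part of this commit):

[source,java]
--------------------------------------------------
import java.io.*;

public class OptionalVIntDemo {
    // 7 payload bits per byte, high bit set means "more bytes follow"
    static void writeVInt(DataOutputStream out, int i) throws IOException {
        while ((i & ~0x7F) != 0) {
            out.writeByte((byte) ((i & 0x7F) | 0x80));
            i >>>= 7;
        }
        out.writeByte((byte) i);
    }

    static int readVInt(DataInputStream in) throws IOException {
        byte b = in.readByte();
        int i = b & 0x7F;
        for (int shift = 7; (b & 0x80) != 0; shift += 7) {
            b = in.readByte();
            i |= (b & 0x7F) << shift;
        }
        return i;
    }

    static void writeOptionalVInt(DataOutputStream out, Integer v) throws IOException {
        out.writeBoolean(v != null); // presence flag
        if (v != null) writeVInt(out, v);
    }

    static Integer readOptionalVInt(DataInputStream in) throws IOException {
        return in.readBoolean() ? readVInt(in) : null;
    }

    public static void main(String[] args) throws IOException {
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        DataOutputStream out = new DataOutputStream(bytes);
        writeOptionalVInt(out, 300);  // a set sub-parameter, e.g. maxNumTerms
        writeOptionalVInt(out, null); // an unset filter sub-parameter

        DataInputStream in = new DataInputStream(new ByteArrayInputStream(bytes.toByteArray()));
        System.out.println(readOptionalVInt(in)); // 300
        System.out.println(readOptionalVInt(in)); // null
    }
}
--------------------------------------------------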
@@ -23,6 +23,7 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.*;
import org.apache.lucene.index.memory.MemoryIndex;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.action.termvectors.TermVectorsFilter;
import org.elasticsearch.action.termvectors.TermVectorsRequest;
import org.elasticsearch.action.termvectors.TermVectorsResponse;
import org.elasticsearch.action.termvectors.dfs.DfsOnlyRequest;
@@ -82,8 +83,10 @@ public class ShardTermVectorsService extends AbstractIndexShardComponent {

        Engine.GetResult get = indexShard.get(new Engine.Get(request.realtime(), uidTerm).version(request.version()).versionType(request.versionType()));

        Fields termVectorsByField = null;
        boolean docFromTranslog = get.source() != null;
        AggregatedDfs dfs = null;
        TermVectorsFilter termVectorsFilter = null;

        /* fetched from translog is treated as an artificial document */
        if (docFromTranslog) {
@@ -102,22 +105,18 @@ public class ShardTermVectorsService extends AbstractIndexShardComponent {
            Versions.DocIdAndVersion docIdAndVersion = get.docIdAndVersion();
            /* from an artificial document */
            if (request.doc() != null) {
                Fields termVectorsByField = generateTermVectorsFromDoc(request, !docFromTranslog);
                termVectorsByField = generateTermVectorsFromDoc(request, !docFromTranslog);
                // if no document indexed in shard, take the queried document itself for stats
                if (topLevelFields == null) {
                    topLevelFields = termVectorsByField;
                }
                if (termVectorsByField != null && useDfs(request)) {
                    dfs = getAggregatedDfs(termVectorsByField, request);
                }
                termVectorsResponse.setFields(termVectorsByField, request.selectedFields(), request.getFlags(), topLevelFields, dfs);
                termVectorsResponse.setExists(true);
                termVectorsResponse.setArtificial(!docFromTranslog);
                termVectorsResponse.setExists(true);
            }
            /* or from an existing document */
            else if (docIdAndVersion != null) {
                // fields with stored term vectors
                Fields termVectorsByField = docIdAndVersion.context.reader().getTermVectors(docIdAndVersion.docId);
                termVectorsByField = docIdAndVersion.context.reader().getTermVectors(docIdAndVersion.docId);
                Set<String> selectedFields = request.selectedFields();
                // generate tvs for fields where analyzer is overridden
                if (selectedFields == null && request.perFieldAnalyzer() != null) {
@@ -127,15 +126,31 @@ public class ShardTermVectorsService extends AbstractIndexShardComponent {
                if (selectedFields != null) {
                    termVectorsByField = addGeneratedTermVectors(get, termVectorsByField, request, selectedFields);
                }
                if (termVectorsByField != null && useDfs(request)) {
                    dfs = getAggregatedDfs(termVectorsByField, request);
                }
                termVectorsResponse.setFields(termVectorsByField, request.selectedFields(), request.getFlags(), topLevelFields, dfs);
                termVectorsResponse.setDocVersion(docIdAndVersion.version);
                termVectorsResponse.setExists(true);
            } else {
            }
            /* no term vectors generated or found */
            else {
                termVectorsResponse.setExists(false);
            }
            /* if there are term vectors, optionally compute dfs and/or terms filtering */
            if (termVectorsByField != null) {
                if (useDfs(request)) {
                    dfs = getAggregatedDfs(termVectorsByField, request);
                }

                if (request.filterSettings() != null) {
                    termVectorsFilter = new TermVectorsFilter(termVectorsByField, topLevelFields, request.selectedFields(), dfs);
                    termVectorsFilter.setSettings(request.filterSettings());
                    try {
                        termVectorsFilter.selectBestTerms();
                    } catch (IOException e) {
                        throw new ElasticsearchException("failed to select best terms", e);
                    }
                }
                // write term vectors
                termVectorsResponse.setFields(termVectorsByField, request.selectedFields(), request.getFlags(), topLevelFields, dfs, termVectorsFilter);
            }
        } catch (Throwable ex) {
            throw new ElasticsearchException("failed to execute term vector request", ex);
        } finally {
@@ -1225,4 +1225,133 @@ public class GetTermVectorsTests extends AbstractTermVectorsTests {
        assertThat(response.getIndex(), equalTo("test"));
        assertThat(response.getVersion(), equalTo(2l));
    }

    @Test
    public void testFilterLength() throws ExecutionException, InterruptedException, IOException {
        logger.info("Setting up the index ...");
        ImmutableSettings.Builder settings = settingsBuilder()
                .put(indexSettings())
                .put("index.analysis.analyzer", "keyword");
        assertAcked(prepareCreate("test")
                .setSettings(settings)
                .addMapping("type1", "tags", "type=string"));
        ensureYellow();

        int numTerms = scaledRandomIntBetween(10, 50);
        logger.info("Indexing one document with tags of increasing length ...");
        List<String> tags = new ArrayList<>();
        for (int i = 0; i < numTerms; i++) {
            String tag = "a";
            for (int j = 0; j < i; j++) {
                tag += "a";
            }
            tags.add(tag);
        }
        indexRandom(true, client().prepareIndex("test", "type1", "1").setSource("tags", tags));

        logger.info("Checking best tags by longest to shortest size ...");
        TermVectorsRequest.FilterSettings filterSettings = new TermVectorsRequest.FilterSettings();
        filterSettings.maxNumTerms = numTerms;
        TermVectorsResponse response;
        for (int i = 0; i < numTerms; i++) {
            filterSettings.minWordLength = numTerms - i;
            response = client().prepareTermVectors("test", "type1", "1")
                    .setSelectedFields("tags")
                    .setFieldStatistics(true)
                    .setTermStatistics(true)
                    .setFilterSettings(filterSettings)
                    .get();
            checkBestTerms(response.getFields().terms("tags"), tags.subList((numTerms - i - 1), numTerms));
        }
    }

    @Test
    public void testFilterTermFreq() throws ExecutionException, InterruptedException, IOException {
        logger.info("Setting up the index ...");
        ImmutableSettings.Builder settings = settingsBuilder()
                .put(indexSettings())
                .put("index.analysis.analyzer", "keyword");
        assertAcked(prepareCreate("test")
                .setSettings(settings)
                .addMapping("type1", "tags", "type=string"));
        ensureYellow();

        logger.info("Indexing one document with tags of increasing frequencies ...");
        int numTerms = scaledRandomIntBetween(10, 50);
        List<String> tags = new ArrayList<>();
        List<String> uniqueTags = new ArrayList<>();
        String tag;
        for (int i = 0; i < numTerms; i++) {
            tag = "tag_" + i;
            tags.add(tag);
            for (int j = 0; j < i; j++) {
                tags.add(tag);
            }
            uniqueTags.add(tag);
        }
        indexRandom(true, client().prepareIndex("test", "type1", "1").setSource("tags", tags));

        logger.info("Checking best tags by highest to lowest term freq ...");
        TermVectorsRequest.FilterSettings filterSettings = new TermVectorsRequest.FilterSettings();
        TermVectorsResponse response;
        for (int i = 0; i < numTerms; i++) {
            filterSettings.maxNumTerms = i + 1;
            response = client().prepareTermVectors("test", "type1", "1")
                    .setSelectedFields("tags")
                    .setFieldStatistics(true)
                    .setTermStatistics(true)
                    .setFilterSettings(filterSettings)
                    .get();
            checkBestTerms(response.getFields().terms("tags"), uniqueTags.subList((numTerms - i - 1), numTerms));
        }
    }

    @Test
    public void testFilterDocFreq() throws ExecutionException, InterruptedException, IOException {
        logger.info("Setting up the index ...");
        ImmutableSettings.Builder settings = settingsBuilder()
                .put(indexSettings())
                .put("index.analysis.analyzer", "keyword")
                .put("index.number_of_shards", 1); // no dfs
        assertAcked(prepareCreate("test")
                .setSettings(settings)
                .addMapping("type1", "tags", "type=string"));
        ensureYellow();

        int numDocs = scaledRandomIntBetween(10, 50); // as many terms as there are docs
        logger.info("Indexing {} documents with tags of increasing dfs ...", numDocs);
        List<IndexRequestBuilder> builders = new ArrayList<>();
        List<String> tags = new ArrayList<>();
        for (int i = 0; i < numDocs; i++) {
            tags.add("tag_" + i);
            builders.add(client().prepareIndex("test", "type1", i + "").setSource("tags", tags));
        }
        indexRandom(true, builders);

        logger.info("Checking best terms by highest to lowest idf ...");
        TermVectorsRequest.FilterSettings filterSettings = new TermVectorsRequest.FilterSettings();
        TermVectorsResponse response;
        for (int i = 0; i < numDocs; i++) {
            filterSettings.maxNumTerms = i + 1;
            response = client().prepareTermVectors("test", "type1", (numDocs - 1) + "")
                    .setSelectedFields("tags")
                    .setFieldStatistics(true)
                    .setTermStatistics(true)
                    .setFilterSettings(filterSettings)
                    .get();
            checkBestTerms(response.getFields().terms("tags"), tags.subList((numDocs - i - 1), numDocs));
        }
    }

    private void checkBestTerms(Terms terms, List<String> expectedTerms) throws IOException {
        final TermsEnum termsEnum = terms.iterator();
        List<String> bestTerms = new ArrayList<>();
        BytesRef text;
        while ((text = termsEnum.next()) != null) {
            bestTerms.add(text.utf8ToString());
        }
        Collections.sort(expectedTerms);
        Collections.sort(bestTerms);
        assertArrayEquals(expectedTerms.toArray(), bestTerms.toArray());
    }
}