Term Vectors: support for realtime

By default term vectors are now realtime, as opposed to previously near
realtime. If they are not found in the index, they will be generated on the
fly. The document is fetched from the transaction log and treated as an
artificial document. One can set the `realtime` parameter to `false` in order to
disable this functionality. This consequently makes the MLT query realtime in
fetching documents, as it used to be before switching from using
the multi get API to the mtv API.

Closes #7846
This commit is contained in:
Alex Ksikes 2014-09-23 23:21:42 +02:00
parent 1cc5da43b3
commit c4830cf862
10 changed files with 113 additions and 28 deletions

View File

@ -3,9 +3,9 @@
Returns information and statistics on terms in the fields of a particular
document. The document could be stored in the index or artificially provided
by the user Note that for documents stored in the index, this
is a near realtime API as the term vectors are not available until the next
refresh.
by the user coming[1.4.0]. Term vectors are now <<realtime,realtime>>, as opposed to
previously near realtime coming[1.5.0]. The functionality is disabled by setting the
`realtime` parameter to `false`.
[source,js]
--------------------------------------------------

View File

@ -74,6 +74,11 @@
"type" : "string",
"description" : "Parent id of documents. Applies to all returned documents unless otherwise specified in body \"params\" or \"docs\".",
"required" : false
},
"realtime": {
"type" : "boolean",
"description" : "Specifies if requests are real-time as opposed to near-real-time (default: true).",
"required" : false
}
}
},

View File

@ -72,6 +72,11 @@
"type" : "string",
"description" : "Parent id of documents.",
"required" : false
},
"realtime": {
"type" : "boolean",
"description" : "Specifies if request is real-time as opposed to near-real-time (default: true).",
"required" : false
}
}
},

View File

@ -29,6 +29,7 @@ setup:
index: testidx
type: doc
id: 1
realtime: 0
- match: { "_index": "testidx" }
- match: { "_type": "doc" }

View File

@ -0,0 +1,40 @@
---
"Realtime Term Vectors":
- do:
indices.create:
index: test_1
body:
settings:
index:
refresh_interval: -1
number_of_replicas: 0
- do:
cluster.health:
wait_for_status: green
- do:
index:
index: test_1
type: test
id: 1
body: { foo: bar }
- do:
termvector:
index: test_1
type: test
id: 1
realtime: 0
- is_false: found
- do:
termvector:
index: test_1
type: test
id: 1
realtime: 1
- is_true: found

View File

@ -63,6 +63,8 @@ public class TermVectorRequest extends SingleShardOperationRequest<TermVectorReq
// TODO: change to String[]
private Set<String> selectedFields;
Boolean realtime;
private EnumSet<Flag> flagsEnum = EnumSet.of(Flag.Positions, Flag.Offsets, Flag.Payloads,
Flag.FieldStatistics);
@ -95,6 +97,7 @@ public class TermVectorRequest extends SingleShardOperationRequest<TermVectorReq
if (other.selectedFields != null) {
this.selectedFields = new HashSet<>(other.selectedFields);
}
this.realtime = other.realtime();
}
public TermVectorRequest(MultiGetRequest.Item item) {
@ -150,9 +153,18 @@ public class TermVectorRequest extends SingleShardOperationRequest<TermVectorReq
* Sets an artificial document for which term vectors are requested.
*/
public TermVectorRequest doc(XContentBuilder documentBuilder) {
return this.doc(documentBuilder.bytes(), true);
}
/**
* Sets an artificial document for which term vectors are requested.
*/
public TermVectorRequest doc(BytesReference doc, boolean generateRandomId) {
// assign a random id to this artificial document, for routing
this.id(String.valueOf(randomInt.getAndAdd(1)));
this.doc = documentBuilder.bytes();
if (generateRandomId) {
this.id(String.valueOf(randomInt.getAndAdd(1)));
}
this.doc = doc;
return this;
}
@ -293,6 +305,15 @@ public class TermVectorRequest extends SingleShardOperationRequest<TermVectorReq
return this;
}
/**
 * Returns whether this request should be executed in realtime.
 *
 * @return {@code true} when no explicit value has been set (the field is
 *         {@code null}) or when it was set to {@code true}; {@code false} otherwise
 */
public boolean realtime() {
    // An unset (null) flag counts as enabled; otherwise the stored value is unboxed.
    return realtime == null || realtime;
}
/**
 * Sets the realtime flag of this request.
 *
 * @param realtime the value to store; may be {@code null}, in which case
 *                 {@link #realtime()} reports {@code true}
 * @return this request, to allow call chaining
 */
public TermVectorRequest realtime(Boolean realtime) {
this.realtime = realtime;
return this;
}
private void setFlag(Flag flag, boolean set) {
if (set && !flagsEnum.contains(flag)) {
flagsEnum.add(flag);
@ -353,6 +374,9 @@ public class TermVectorRequest extends SingleShardOperationRequest<TermVectorReq
selectedFields.add(in.readString());
}
}
if (in.getVersion().onOrAfter(Version.V_1_5_0)) {
this.realtime = in.readBoolean();
}
}
@Override
@ -386,6 +410,9 @@ public class TermVectorRequest extends SingleShardOperationRequest<TermVectorReq
} else {
out.writeVInt(0);
}
if (out.getVersion().onOrAfter(Version.V_1_5_0)) {
out.writeBoolean(realtime());
}
}
public static enum Flag {

View File

@ -126,6 +126,11 @@ public class TermVectorRequestBuilder extends ActionRequestBuilder<TermVectorReq
return this;
}
/**
 * Sets the realtime flag on the underlying request by forwarding the value
 * to {@code request.realtime(Boolean)}.
 *
 * @param realtime the value passed through to the request unchanged
 * @return this builder, to allow call chaining
 */
public TermVectorRequestBuilder setRealtime(Boolean realtime) {
request.realtime(realtime);
return this;
}
@Override
protected void doExecute(ActionListener<TermVectorResponse> listener) {
client.termVector(request, listener);

View File

@ -74,6 +74,16 @@ public class ShardTermVectorService extends AbstractIndexShardComponent {
IndexReader topLevelReader = searcher.reader();
final TermVectorResponse termVectorResponse = new TermVectorResponse(concreteIndex, request.type(), request.id());
final Term uidTerm = new Term(UidFieldMapper.NAME, Uid.createUidAsBytes(request.type(), request.id()));
Engine.GetResult get = indexShard.get(new Engine.Get(request.realtime(), uidTerm));
boolean docFromTranslog = get.source() != null;
/* fetched from translog is treated as an artificial document */
if (docFromTranslog) {
request.doc(get.source().source, false);
termVectorResponse.setDocVersion(get.version());
}
/* handle potential wildcards in fields */
if (request.selectedFields() != null) {
handleFieldWildcards(request);
@ -81,27 +91,25 @@ public class ShardTermVectorService extends AbstractIndexShardComponent {
try {
Fields topLevelFields = MultiFields.getFields(topLevelReader);
Versions.DocIdAndVersion docIdAndVersion = get.docIdAndVersion();
/* from an artificial document */
if (request.doc() != null) {
Fields termVectorsByField = generateTermVectorsFromDoc(request);
Fields termVectorsByField = generateTermVectorsFromDoc(request, !docFromTranslog);
// if no document indexed in shard, take the queried document itself for stats
if (topLevelFields == null) {
topLevelFields = termVectorsByField;
}
termVectorResponse.setFields(termVectorsByField, request.selectedFields(), request.getFlags(), topLevelFields);
termVectorResponse.setExists(true);
termVectorResponse.setArtificial(true);
return termVectorResponse;
termVectorResponse.setArtificial(!docFromTranslog);
}
/* or from an existing document */
final Term uidTerm = new Term(UidFieldMapper.NAME, Uid.createUidAsBytes(request.type(), request.id()));
Versions.DocIdAndVersion docIdAndVersion = Versions.loadDocIdAndVersion(topLevelReader, uidTerm);
if (docIdAndVersion != null) {
else if (docIdAndVersion != null) {
// fields with stored term vectors
Fields termVectorsByField = docIdAndVersion.context.reader().getTermVectors(docIdAndVersion.docId);
// fields without term vectors
if (request.selectedFields() != null) {
termVectorsByField = addGeneratedTermVectors(termVectorsByField, request, uidTerm, false);
termVectorsByField = addGeneratedTermVectors(get, termVectorsByField, request);
}
termVectorResponse.setFields(termVectorsByField, request.selectedFields(), request.getFlags(), topLevelFields);
termVectorResponse.setDocVersion(docIdAndVersion.version);
@ -113,6 +121,7 @@ public class ShardTermVectorService extends AbstractIndexShardComponent {
throw new ElasticsearchException("failed to execute term vector request", ex);
} finally {
searcher.close();
get.release();
}
return termVectorResponse;
}
@ -137,7 +146,7 @@ public class ShardTermVectorService extends AbstractIndexShardComponent {
return true;
}
private Fields addGeneratedTermVectors(Fields termVectorsByField, TermVectorRequest request, Term uidTerm, boolean realTime) throws IOException {
private Fields addGeneratedTermVectors(Engine.GetResult get, Fields termVectorsByField, TermVectorRequest request) throws IOException {
/* only keep valid fields */
Set<String> validFields = new HashSet<>();
for (String field : request.selectedFields()) {
@ -157,18 +166,9 @@ public class ShardTermVectorService extends AbstractIndexShardComponent {
}
/* generate term vectors from fetched document fields */
Engine.GetResult get = indexShard.get(new Engine.Get(realTime, uidTerm));
Fields generatedTermVectors;
try {
if (!get.exists()) {
return termVectorsByField;
}
GetResult getResult = indexShard.getService().get(
get, request.id(), request.type(), validFields.toArray(Strings.EMPTY_ARRAY), null, false);
generatedTermVectors = generateTermVectors(getResult.getFields().values(), request.offsets());
} finally {
get.release();
}
GetResult getResult = indexShard.getService().get(
get, request.id(), request.type(), validFields.toArray(Strings.EMPTY_ARRAY), null, false);
Fields generatedTermVectors = generateTermVectors(getResult.getFields().values(), request.offsets());
/* merge with existing Fields */
if (termVectorsByField == null) {
@ -195,7 +195,7 @@ public class ShardTermVectorService extends AbstractIndexShardComponent {
return MultiFields.getFields(index.createSearcher().getIndexReader());
}
private Fields generateTermVectorsFromDoc(TermVectorRequest request) throws IOException {
private Fields generateTermVectorsFromDoc(TermVectorRequest request, boolean doAllFields) throws IOException {
// parse the document, at the moment we do update the mapping, just like percolate
ParsedDocument parsedDocument = parseDocument(indexShard.shardId().getIndex(), request.type(), request.doc());
@ -214,6 +214,9 @@ public class ShardTermVectorService extends AbstractIndexShardComponent {
if (!isValidField(fieldMapper)) {
continue;
}
if (request.selectedFields() == null && !doAllFields && !fieldMapper.fieldType().storeTermVectors()) {
continue;
}
if (request.selectedFields() != null && !request.selectedFields().contains(field.name())) {
continue;
}

View File

@ -77,6 +77,7 @@ public class RestTermVectorAction extends BaseRestHandler {
termVectorRequest.positions(request.paramAsBoolean("positions", termVectorRequest.positions()));
termVectorRequest.payloads(request.paramAsBoolean("payloads", termVectorRequest.payloads()));
termVectorRequest.routing(request.param("routing"));
termVectorRequest.realtime(request.paramAsBoolean("realtime", null));
termVectorRequest.parent(request.param("parent"));
termVectorRequest.preference(request.param("preference"));
termVectorRequest.termStatistics(request.paramAsBoolean("termStatistics", termVectorRequest.termStatistics()));

View File

@ -32,8 +32,6 @@ import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.xcontent.ToXContent;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.index.mapper.core.AbstractFieldMapper;
import org.elasticsearch.index.service.IndexService;
import org.elasticsearch.indices.IndicesService;
import org.junit.Test;
import java.io.IOException;