Term Vectors API: Computes term vectors on the fly if not stored in the index.

Adds the ability for the Term Vector API to generate term vectors for chosen
fields, even though they haven't been explicitly stored in the index.

Relates to #5184
Closes #6567
Alex Ksikes 2014-06-17 19:08:38 +02:00
parent 6a25d9b7b5
commit f22f3db30f
4 changed files with 400 additions and 76 deletions

View File

@@ -19,7 +19,7 @@ retrieved either with a parameter in the url
curl -XGET 'http://localhost:9200/twitter/tweet/1/_termvector?fields=text,...'
--------------------------------------------------
or adding by adding the requested fields in the request body (see
or by adding the requested fields in the request body (see
example below).
[float]
@@ -38,9 +38,11 @@ statistics are returned for all fields but no term statistics.
* term payloads (`payloads` : true), as base64 encoded bytes
If the requested information wasn't stored in the index, it will be
omitted without further warning. See <<mapping-types,type mapping>>
computed on the fly if possible. See <<mapping-types,type mapping>>
for how to configure your index to store term vectors.
coming[1.4.0,The ability to compute term vectors on the fly is only available from 1.4.0 onwards (see below)]
[WARNING]
======
Start and end offsets assume UTF-16 encoding is being used. If you want to use
@@ -84,7 +86,7 @@ are therefore only useful as relative measures whereas the absolute
numbers have no meaning in this context.
[float]
=== Example
=== Example 1
First, we create an index that stores term vectors, payloads etc.:
@@ -222,3 +224,22 @@ Response:
}
}
--------------------------------------------------
[float]
=== Example 2 coming[1.4.0]
Additionally, term vectors which are not explicitly stored in the index are automatically
computed on the fly: the values of the requested fields are fetched from the document and
re-analyzed with the field's index-time analyzer. The following request returns all information
and statistics for the fields in document `1`, even though the terms haven't been explicitly
stored in the index. Note that for the field `text`, the terms are not re-generated, since its
term vectors are already stored in the index.
[source,js]
--------------------------------------------------
curl -XGET 'http://localhost:9200/twitter/tweet/1/_termvector?pretty=true' -d '{
"fields" : ["text", "some_field_without_term_vectors"],
"offsets" : true,
"positions" : true,
"term_statistics" : true,
"field_statistics" : true
}'
--------------------------------------------------

View File

@@ -197,23 +197,23 @@ final class TermVectorWriter {
private void writeTermStatistics(TermsEnum topLevelIterator) throws IOException {
int docFreq = topLevelIterator.docFreq();
assert (docFreq >= 0);
assert (docFreq >= -1);
writePotentiallyNegativeVInt(docFreq);
long ttf = topLevelIterator.totalTermFreq();
assert (ttf >= 0);
assert (ttf >= -1);
writePotentiallyNegativeVLong(ttf);
}
private void writeFieldStatistics(Terms topLevelTerms) throws IOException {
long sttf = topLevelTerms.getSumTotalTermFreq();
assert (sttf >= 0);
assert (sttf >= -1);
writePotentiallyNegativeVLong(sttf);
long sdf = topLevelTerms.getSumDocFreq();
assert (sdf >= 0);
assert (sdf >= -1);
writePotentiallyNegativeVLong(sdf);
int dc = topLevelTerms.getDocCount();
assert (dc >= 0);
assert (dc >= -1);
writePotentiallyNegativeVInt(dc);
}
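The assertions above are relaxed from `>= 0` to `>= -1` because Lucene reports a statistic it cannot provide as -1 (for example, statistics for vectors that were generated on the fly rather than read from the index). A VInt cannot encode a negative number, which is presumably why the `writePotentiallyNegative*` helpers shift the value by one before serializing, with the reader undoing the shift. A minimal sketch under that assumption (the helper bodies and the class below are illustrative, not part of this diff):

[source,java]
--------------------------------------------------
import java.io.IOException;

import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;

// Sketch: carry Lucene's -1 "not available" sentinel over a VInt by
// shifting the value range up by one on write and down by one on read.
final class PotentiallyNegativeVInts {

    static void write(StreamOutput out, int value) throws IOException {
        out.writeVInt(Math.max(0, value + 1)); // -1 -> 0, 0 -> 1, ...
    }

    static int read(StreamInput in) throws IOException {
        return in.readVInt() - 1; // undo the shift
    }
}
--------------------------------------------------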

View File

@@ -19,35 +19,40 @@
package org.elasticsearch.index.termvectors;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Term;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.*;
import org.apache.lucene.index.memory.MemoryIndex;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.action.termvector.TermVectorRequest;
import org.elasticsearch.action.termvector.TermVectorResponse;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.lucene.uid.Versions;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.engine.Engine;
import org.elasticsearch.index.mapper.MapperService;
import org.elasticsearch.index.get.GetField;
import org.elasticsearch.index.get.GetResult;
import org.elasticsearch.index.mapper.FieldMapper;
import org.elasticsearch.index.mapper.Uid;
import org.elasticsearch.index.mapper.core.StringFieldMapper;
import org.elasticsearch.index.mapper.internal.UidFieldMapper;
import org.elasticsearch.index.settings.IndexSettings;
import org.elasticsearch.index.shard.AbstractIndexShardComponent;
import org.elasticsearch.index.shard.ShardId;
import org.elasticsearch.index.shard.service.IndexShard;
import java.io.IOException;
import java.util.*;
/**
*/
public class ShardTermVectorService extends AbstractIndexShardComponent {
private IndexShard indexShard;
private MapperService mapperService;
@Inject
public ShardTermVectorService(ShardId shardId, @IndexSettings Settings indexSettings, MapperService mapperService) {
public ShardTermVectorService(ShardId shardId, @IndexSettings Settings indexSettings) {
super(shardId, indexSettings);
}
@@ -66,8 +71,11 @@ public class ShardTermVectorService extends AbstractIndexShardComponent {
Fields topLevelFields = MultiFields.getFields(topLevelReader);
Versions.DocIdAndVersion docIdAndVersion = Versions.loadDocIdAndVersion(topLevelReader, uidTerm);
if (docIdAndVersion != null) {
Fields termVectorsByField = docIdAndVersion.context.reader().getTermVectors(docIdAndVersion.docId);
/* generate term vectors if not available */
if (request.selectedFields() != null) {
termVectorsByField = generateTermVectorsIfNeeded(termVectorsByField, request, uidTerm, false);
}
termVectorResponse.setFields(termVectorsByField, request.selectedFields(), request.getFlags(), topLevelFields);
termVectorResponse.setExists(true);
termVectorResponse.setDocVersion(docIdAndVersion.version);
@@ -81,4 +89,102 @@ public class ShardTermVectorService extends AbstractIndexShardComponent {
}
return termVectorResponse;
}
private Fields generateTermVectorsIfNeeded(Fields termVectorsByField, TermVectorRequest request, Term uidTerm, boolean realTime) throws IOException {
List<String> validFields = new ArrayList<>();
for (String field : request.selectedFields()) {
FieldMapper fieldMapper = indexShard.mapperService().smartNameFieldMapper(field);
// only string fields can be re-analyzed on the fly
if (!(fieldMapper instanceof StringFieldMapper)) {
continue;
}
// term vectors for this field are already stored in the index
if (fieldMapper.fieldType().storeTermVectors()) {
continue;
}
// skip fields which are not indexed
if (!fieldMapper.fieldType().indexed()) {
continue;
}
validFields.add(field);
}
if (validFields.isEmpty()) {
return termVectorsByField;
}
Engine.GetResult get = indexShard.get(new Engine.Get(realTime, uidTerm));
Fields generatedTermVectors;
try {
if (!get.exists()) {
return termVectorsByField;
}
// TODO: support for fetchSourceContext?
GetResult getResult = indexShard.getService().get(
get, request.id(), request.type(), validFields.toArray(Strings.EMPTY_ARRAY), null);
generatedTermVectors = generateTermVectors(getResult.getFields().values(), request.offsets());
} finally {
get.release();
}
if (termVectorsByField == null) {
return generatedTermVectors;
} else {
return mergeFields(request.selectedFields().toArray(Strings.EMPTY_ARRAY), termVectorsByField, generatedTermVectors);
}
}
private Fields generateTermVectors(Collection<GetField> getFields, boolean withOffsets) throws IOException {
// store document in memory index
MemoryIndex index = new MemoryIndex(withOffsets);
for (GetField getField : getFields) {
String field = getField.getName();
Analyzer analyzer = indexShard.mapperService().smartNameFieldMapper(field).indexAnalyzer();
if (analyzer == null) {
analyzer = indexShard.mapperService().analysisService().defaultIndexAnalyzer();
}
for (Object text : getField.getValues()) {
index.addField(field, text.toString(), analyzer);
}
}
// and read vectors from it
return MultiFields.getFields(index.createSearcher().getIndexReader());
}
private Fields mergeFields(String[] fieldNames, Fields... fieldsObject) throws IOException {
ParallelFields parallelFields = new ParallelFields();
for (Fields fieldObject : fieldsObject) {
assert fieldObject != null;
for (String fieldName : fieldNames) {
Terms terms = fieldObject.terms(fieldName);
if (terms != null) {
parallelFields.addField(fieldName, terms);
}
}
}
return parallelFields;
}
// Poached from Lucene ParallelAtomicReader
private static final class ParallelFields extends Fields {
final Map<String,Terms> fields = new TreeMap<>();
ParallelFields() {
}
void addField(String fieldName, Terms terms) {
fields.put(fieldName, terms);
}
@Override
public Iterator<String> iterator() {
return Collections.unmodifiableSet(fields.keySet()).iterator();
}
@Override
public Terms terms(String field) {
return fields.get(field);
}
@Override
public int size() {
return fields.size();
}
}
}
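The heart of the new on-the-fly path is Lucene's `MemoryIndex`: the requested field values are analyzed into a throwaway single-document index, and the term vectors are then read back through the normal `Fields` API, as `generateTermVectors` does above. A standalone sketch, assuming Lucene 4.x (the class below is illustrative, not part of the commit):

[source,java]
--------------------------------------------------
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.memory.MemoryIndex;
import org.apache.lucene.util.BytesRef;

final class OnTheFlyTermVectorsSketch {

    // Analyze one field value into a MemoryIndex and return its Fields,
    // mirroring what generateTermVectors() does for each requested field.
    static Fields generate(String field, String value, Analyzer analyzer,
                           boolean withOffsets) throws IOException {
        MemoryIndex index = new MemoryIndex(withOffsets); // positions are always kept
        index.addField(field, value, analyzer);
        return MultiFields.getFields(index.createSearcher().getIndexReader());
    }

    static void printTerms(Fields fields, String field) throws IOException {
        Terms terms = fields.terms(field);
        TermsEnum iterator = terms.iterator(null); // Lucene 4.x signature
        for (BytesRef term = iterator.next(); term != null; term = iterator.next()) {
            System.out.println(term.utf8ToString() + " freq=" + iterator.totalTermFreq());
        }
    }
}
--------------------------------------------------

Note that a `MemoryIndex` holds a single document, so frequencies read from generated vectors are local to that document; it also does not support payloads, as the tests below acknowledge.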

View File

@@ -26,10 +26,11 @@ import org.apache.lucene.index.*;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.action.ActionFuture;
import org.elasticsearch.action.index.IndexRequestBuilder;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.index.mapper.core.AbstractFieldMapper;
import org.hamcrest.Matchers;
import org.junit.Test;
import java.io.IOException;
@@ -37,11 +38,12 @@ import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ExecutionException;
import static org.elasticsearch.common.settings.ImmutableSettings.settingsBuilder;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertThrows;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.*;
public class GetTermVectorTests extends AbstractTermVectorTests {
@@ -64,11 +66,9 @@ public class GetTermVectorTests extends AbstractTermVectorTests {
for (int i = 0; i < 20; i++) {
ActionFuture<TermVectorResponse> termVector = client().termVector(new TermVectorRequest("test", "type1", "" + i));
TermVectorResponse actionGet = termVector.actionGet();
assertThat(actionGet, Matchers.notNullValue());
assertThat(actionGet.isExists(), Matchers.equalTo(false));
assertThat(actionGet, notNullValue());
assertThat(actionGet.isExists(), equalTo(false));
}
}
@Test
@@ -84,23 +84,23 @@ public class GetTermVectorTests extends AbstractTermVectorTests {
assertAcked(prepareCreate("test").addMapping("type1", mapping));
ensureYellow();
// when indexing a field that simply has a question mark, the term
// vectors will be null
client().prepareIndex("test", "type1", "0").setSource("existingfield", "?").execute().actionGet();
refresh();
String[] selectedFields = { "existingfield" };
ActionFuture<TermVectorResponse> termVector = client().termVector(
new TermVectorRequest("test", "type1", "0").selectedFields(selectedFields));
// lets see if the null term vectors are caught...
termVector.actionGet();
TermVectorResponse actionGet = termVector.actionGet();
assertThat(actionGet.isExists(), Matchers.equalTo(true));
ActionFuture<TermVectorResponse> termVector = client().termVector(new TermVectorRequest("test", "type1", "0")
.selectedFields(new String[]{"existingfield"}));
// let's see if the null term vectors are caught...
TermVectorResponse actionGet = termVector.actionGet();
assertThat(actionGet, notNullValue());
assertThat(actionGet.isExists(), equalTo(true));
assertThat(actionGet.getFields().terms("existingfield"), nullValue());
}
@Test
public void testExistingFieldButNotInDocNPE() throws Exception {
XContentBuilder mapping = XContentFactory.jsonBuilder().startObject().startObject("type1")
.startObject("properties")
.startObject("existingfield")
@@ -110,21 +110,62 @@ public class GetTermVectorTests extends AbstractTermVectorTests {
.endObject()
.endObject().endObject();
assertAcked(prepareCreate("test").addMapping("type1", mapping));
ensureYellow();
// index a document without the requested field, so its term
// vectors will be null
client().prepareIndex("test", "type1", "0").setSource("anotherexistingfield", 1).execute().actionGet();
refresh();
String[] selectedFields = { "existingfield" };
ActionFuture<TermVectorResponse> termVector = client().termVector(
new TermVectorRequest("test", "type1", "0").selectedFields(selectedFields));
ActionFuture<TermVectorResponse> termVector = client().termVector(new TermVectorRequest("test", "type1", "0")
.selectedFields(new String[]{"existingfield"}));
// let's see if the null term vectors are caught...
TermVectorResponse actionGet = termVector.actionGet();
assertThat(actionGet.isExists(), Matchers.equalTo(true));
assertThat(actionGet, notNullValue());
assertThat(actionGet.isExists(), equalTo(true));
assertThat(actionGet.getFields().terms("existingfield"), nullValue());
}
@Test
public void testNotIndexedField() throws Exception {
// must be of type string and indexed.
assertAcked(prepareCreate("test").addMapping("type1",
"field0", "type=integer,", // no tvs
"field1", "type=string,index=no", // no tvs
"field2", "type=string,index=no,store=yes", // no tvs
"field3", "type=string,index=no,term_vector=yes", // no tvs
"field4", "type=string,index=not_analyzed", // yes tvs
"field5", "type=string,index=analyzed")); // yes tvs
ensureYellow();
List<IndexRequestBuilder> indexBuilders = new ArrayList<>();
for (int i = 0; i < 6; i++) {
indexBuilders.add(client().prepareIndex()
.setIndex("test")
.setType("type1")
.setId(String.valueOf(i))
.setSource("field" + i, i));
}
indexRandom(true, indexBuilders);
for (int i = 0; i < 4; i++) {
TermVectorResponse resp = client().prepareTermVector("test", "type1", String.valueOf(i))
.setSelectedFields("field" + i)
.get();
assertThat(resp, notNullValue());
assertThat(resp.isExists(), equalTo(true));
assertThat("field" + i + " :", resp.getFields().terms("field" + i), nullValue());
}
for (int i = 4; i < 6; i++) {
TermVectorResponse resp = client().prepareTermVector("test", "type1", String.valueOf(i))
.setSelectedFields("field" + i)
.get();
assertThat("field" + i + " :", resp.getFields().terms("field" + i), notNullValue());
}
}
@Test
public void testSimpleTermVectors() throws ElasticsearchException, IOException {
@@ -151,11 +192,6 @@ public class GetTermVectorTests extends AbstractTermVectorTests {
.endObject()).execute().actionGet();
refresh();
}
String[] values = {"brown", "dog", "fox", "jumps", "lazy", "over", "quick", "the"};
int[] freq = {1, 1, 1, 1, 1, 1, 1, 2};
int[][] pos = {{2}, {8}, {3}, {4}, {7}, {5}, {1}, {0, 6}};
int[][] startOffset = {{10}, {40}, {16}, {20}, {35}, {26}, {4}, {0, 31}};
int[][] endOffset = {{15}, {43}, {19}, {25}, {39}, {30}, {9}, {3, 34}};
for (int i = 0; i < 10; i++) {
TermVectorRequestBuilder resp = client().prepareTermVector("test", "type1", Integer.toString(i)).setPayloads(true)
.setOffsets(true).setPositions(true).setSelectedFields();
@@ -163,35 +199,7 @@ public class GetTermVectorTests extends AbstractTermVectorTests {
assertThat("doc id: " + i + " doesn't exists but should", response.isExists(), equalTo(true));
Fields fields = response.getFields();
assertThat(fields.size(), equalTo(1));
Terms terms = fields.terms("field");
assertThat(terms.size(), equalTo(8l));
TermsEnum iterator = terms.iterator(null);
for (int j = 0; j < values.length; j++) {
String string = values[j];
BytesRef next = iterator.next();
assertThat(next, Matchers.notNullValue());
assertThat("expected " + string, string, equalTo(next.utf8ToString()));
assertThat(next, Matchers.notNullValue());
// do not test ttf or doc frequency, because here we have many
// shards and do not know how documents are distributed
DocsAndPositionsEnum docsAndPositions = iterator.docsAndPositions(null, null);
assertThat(docsAndPositions.nextDoc(), equalTo(0));
assertThat(freq[j], equalTo(docsAndPositions.freq()));
int[] termPos = pos[j];
int[] termStartOffset = startOffset[j];
int[] termEndOffset = endOffset[j];
assertThat(termPos.length, equalTo(freq[j]));
assertThat(termStartOffset.length, equalTo(freq[j]));
assertThat(termEndOffset.length, equalTo(freq[j]));
for (int k = 0; k < freq[j]; k++) {
int nextPosition = docsAndPositions.nextPosition();
assertThat("term: " + string, nextPosition, equalTo(termPos[k]));
assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
}
}
assertThat(iterator.next(), Matchers.nullValue());
checkBrownFoxTermVector(fields, "field", true);
}
}
@@ -287,9 +295,9 @@ public class GetTermVectorTests extends AbstractTermVectorTests {
for (int j = 0; j < values.length; j++) {
String string = values[j];
BytesRef next = iterator.next();
assertThat(infoString, next, Matchers.notNullValue());
assertThat(infoString, next, notNullValue());
assertThat(infoString + "expected " + string, string, equalTo(next.utf8ToString()));
assertThat(infoString, next, Matchers.notNullValue());
assertThat(infoString, next, notNullValue());
// do not test ttf or doc frequency, because here we have
// many shards and do not know how documents are distributed
DocsAndPositionsEnum docsAndPositions = iterator.docsAndPositions(null, null);
@@ -316,7 +324,6 @@ public class GetTermVectorTests extends AbstractTermVectorTests {
} else {
assertThat(infoString + "positions for term: ", nextPosition, equalTo(-1));
}
// only return something useful if requested and stored
if (isPayloadRequested && storePayloads) {
assertThat(infoString + "payloads for term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef(
@@ -337,9 +344,8 @@ public class GetTermVectorTests extends AbstractTermVectorTests {
}
}
assertThat(iterator.next(), Matchers.nullValue());
assertThat(iterator.next(), nullValue());
}
}
}
@@ -427,7 +433,7 @@ public class GetTermVectorTests extends AbstractTermVectorTests {
DocsAndPositionsEnum docsAndPositions = iterator.docsAndPositions(null, null);
assertThat(docsAndPositions.nextDoc(), equalTo(0));
List<BytesRef> curPayloads = payloads.get(term);
assertThat(term, curPayloads, Matchers.notNullValue());
assertThat(term, curPayloads, notNullValue());
assertNotNull(docsAndPositions);
for (int k = 0; k < docsAndPositions.freq(); k++) {
docsAndPositions.nextPosition();
@@ -440,8 +446,9 @@ public class GetTermVectorTests extends AbstractTermVectorTests {
}
}
}
assertThat(iterator.next(), Matchers.nullValue());
assertThat(iterator.next(), nullValue());
}
private String createRandomDelimiter(String[] tokens) {
String delimiter = "";
boolean isTokenOrWhitespace = true;
@@ -459,6 +466,7 @@ public class GetTermVectorTests extends AbstractTermVectorTests {
}
return delimiter;
}
private String createString(String[] tokens, Map<String, List<BytesRef>> payloads, int encoding, char delimiter) {
String resultString = "";
ObjectIntOpenHashMap<String> payloadCounter = new ObjectIntOpenHashMap<>();
@@ -543,4 +551,193 @@ public class GetTermVectorTests extends AbstractTermVectorTests {
}
return finalTokens;
}
// like testSimpleTermVectors but we create fields with no term vectors
@Test
public void testSimpleTermVectorsWithGenerate() throws ElasticsearchException, IOException {
String[] fieldNames = new String[10];
for (int i = 0; i < fieldNames.length; i++) {
fieldNames[i] = "field" + i;
}
XContentBuilder mapping = XContentFactory.jsonBuilder().startObject().startObject("type1").startObject("properties");
XContentBuilder source = XContentFactory.jsonBuilder().startObject();
for (String field : fieldNames) {
mapping.startObject(field)
.field("type", "string")
.field("term_vector", randomBoolean() ? "with_positions_offsets_payloads" : "no")
.field("analyzer", "tv_test")
.endObject();
source.field(field, "the quick brown fox jumps over the lazy dog");
}
mapping.endObject().endObject().endObject();
source.endObject();
assertAcked(prepareCreate("test")
.addMapping("type1", mapping)
.setSettings(settingsBuilder()
.put(indexSettings())
.put("index.analysis.analyzer.tv_test.tokenizer", "whitespace")
.putArray("index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase")));
ensureGreen();
for (int i = 0; i < 10; i++) {
client().prepareIndex("test", "type1", Integer.toString(i))
.setSource(source)
.execute().actionGet();
refresh();
}
for (int i = 0; i < 10; i++) {
TermVectorResponse response = client().prepareTermVector("test", "type1", Integer.toString(i))
.setPayloads(true)
.setOffsets(true)
.setPositions(true)
.setSelectedFields(fieldNames)
.execute().actionGet();
assertThat("doc id: " + i + " doesn't exists but should", response.isExists(), equalTo(true));
Fields fields = response.getFields();
assertThat(fields.size(), equalTo(fieldNames.length));
for (String fieldName : fieldNames) {
// MemoryIndex does not support payloads
checkBrownFoxTermVector(fields, fieldName, false);
}
}
}
private void checkBrownFoxTermVector(Fields fields, String fieldName, boolean withPayloads) throws ElasticsearchException, IOException {
String[] values = {"brown", "dog", "fox", "jumps", "lazy", "over", "quick", "the"};
int[] freq = {1, 1, 1, 1, 1, 1, 1, 2};
int[][] pos = {{2}, {8}, {3}, {4}, {7}, {5}, {1}, {0, 6}};
int[][] startOffset = {{10}, {40}, {16}, {20}, {35}, {26}, {4}, {0, 31}};
int[][] endOffset = {{15}, {43}, {19}, {25}, {39}, {30}, {9}, {3, 34}};
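// For reference, the analyzed sentence with position and [start,end) offsets
// for each token ("the quick brown fox jumps over the lazy dog", whitespace tokenized):
//   the[0: 0,3)  quick[1: 4,9)  brown[2: 10,15)  fox[3: 16,19)  jumps[4: 20,25)
//   over[5: 26,30)  the[6: 31,34)  lazy[7: 35,39)  dog[8: 40,43)
// which is what the pos/startOffset/endOffset arrays above encode, ordered by term.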
Terms terms = fields.terms(fieldName);
assertThat(terms.size(), equalTo(8L));
TermsEnum iterator = terms.iterator(null);
for (int j = 0; j < values.length; j++) {
String string = values[j];
BytesRef next = iterator.next();
assertThat(next, notNullValue());
assertThat("expected " + string, string, equalTo(next.utf8ToString()));
assertThat(next, notNullValue());
// do not test ttf or doc frequency, because here we have many
// shards and do not know how documents are distributed
DocsAndPositionsEnum docsAndPositions = iterator.docsAndPositions(null, null);
assertThat(docsAndPositions.nextDoc(), equalTo(0));
assertThat(freq[j], equalTo(docsAndPositions.freq()));
int[] termPos = pos[j];
int[] termStartOffset = startOffset[j];
int[] termEndOffset = endOffset[j];
assertThat(termPos.length, equalTo(freq[j]));
assertThat(termStartOffset.length, equalTo(freq[j]));
assertThat(termEndOffset.length, equalTo(freq[j]));
for (int k = 0; k < freq[j]; k++) {
int nextPosition = docsAndPositions.nextPosition();
assertThat("term: " + string, nextPosition, equalTo(termPos[k]));
assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k]));
assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k]));
if (withPayloads) {
assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word")));
}
}
}
assertThat(iterator.next(), nullValue());
}
@Test
public void testDuelWithAndWithoutTermVectors() throws ElasticsearchException, IOException, ExecutionException, InterruptedException {
// setup indices
String[] indexNames = new String[] {"with_tv", "without_tv"};
ImmutableSettings.Builder settings = settingsBuilder()
.put(indexSettings())
.put("index.analysis.analyzer", "standard");
assertAcked(prepareCreate(indexNames[0])
.setSettings(settings)
.addMapping("type1", "field1", "type=string,term_vector=with_positions_offsets"));
assertAcked(prepareCreate(indexNames[1])
.setSettings(settings)
.addMapping("type1", "field1", "type=string,term_vector=no"));
ensureGreen();
// index documents with and without term vectors
String[] content = new String[]{
"Generating a random permutation of a sequence (such as when shuffling cards).",
"Selecting a random sample of a population (important in statistical sampling).",
"Allocating experimental units via random assignment to a treatment or control condition.",
"Generating random numbers: see Random number generation.",
"Selecting a random sample of a population (important in statistical sampling).",
"Allocating experimental units via random assignment to a treatment or control condition.",
"Transforming a data stream (such as when using a scrambler in telecommunications)."};
List<IndexRequestBuilder> indexBuilders = new ArrayList<>();
for (int i = 0; i < content.length; i++) {
for (String indexName : indexNames) {
indexBuilders.add(client().prepareIndex()
.setIndex(indexName)
.setType("type1")
.setId(String.valueOf(i))
.setSource("field1", content[i]));
}
}
indexRandom(true, indexBuilders);
// request tvs and compare from each index
for (int i = 0; i < content.length; i++) {
Fields[] fields = new Fields[2];
int idx = 0;
for (String indexName : indexNames) {
TermVectorResponse resp = client().prepareTermVector(indexName, "type1", String.valueOf(i))
.setOffsets(true)
.setPositions(true)
.setSelectedFields("field1")
.get();
assertThat("doc with index: test_with_tv, type1 and id: " + i, resp.isExists(), equalTo(true));
fields[idx++] = resp.getFields();
}
compareTermVectors("field1", fields[0], fields[1]);
}
}
private void compareTermVectors(String fieldName, Fields fields0, Fields fields1) throws IOException {
Terms terms0 = fields0.terms(fieldName);
Terms terms1 = fields1.terms(fieldName);
assertThat(terms0, notNullValue());
assertThat(terms1, notNullValue());
assertThat(terms0.size(), equalTo(terms1.size()));
TermsEnum iter0 = terms0.iterator(null);
TermsEnum iter1 = terms1.iterator(null);
for (int i = 0; i < terms0.size(); i++) {
BytesRef next0 = iter0.next();
assertThat(next0, notNullValue());
BytesRef next1 = iter1.next();
assertThat(next1, notNullValue());
// compare field value
String string0 = next0.utf8ToString();
String string1 = next1.utf8ToString();
assertThat("expected: " + string0, string0, equalTo(string1));
// compare df and ttf
assertThat("term: " + string0, iter0.docFreq(), equalTo(iter1.docFreq()));
assertThat("term: " + string0, iter0.totalTermFreq(), equalTo(iter1.totalTermFreq()));
// compare freq and docs
DocsAndPositionsEnum docsAndPositions0 = iter0.docsAndPositions(null, null);
DocsAndPositionsEnum docsAndPositions1 = iter1.docsAndPositions(null, null);
assertThat("term: " + string0, docsAndPositions0.nextDoc(), equalTo(docsAndPositions1.nextDoc()));
assertThat("term: " + string0, docsAndPositions0.freq(), equalTo(docsAndPositions1.freq()));
// compare position, start offsets and end offsets
for (int j = 0; j < docsAndPositions0.freq(); j++) {
assertThat("term: " + string0, docsAndPositions0.nextPosition(), equalTo(docsAndPositions1.nextPosition()));
assertThat("term: " + string0, docsAndPositions0.startOffset(), equalTo(docsAndPositions1.startOffset()));
assertThat("term: " + string0, docsAndPositions0.endOffset(), equalTo(docsAndPositions1.endOffset()));
}
}
assertThat(iter0.next(), nullValue());
assertThat(iter1.next(), nullValue());
}
}