diff --git a/docs/reference/docs/termvectors.asciidoc b/docs/reference/docs/termvectors.asciidoc index b44f4fc23fc..0031e4fc8da 100644 --- a/docs/reference/docs/termvectors.asciidoc +++ b/docs/reference/docs/termvectors.asciidoc @@ -19,7 +19,7 @@ retrieved either with a parameter in the url curl -XGET 'http://localhost:9200/twitter/tweet/1/_termvector?fields=text,...' -------------------------------------------------- -or adding by adding the requested fields in the request body (see +or by adding the requested fields in the request body (see example below). [float] @@ -38,9 +38,11 @@ statistics are returned for all fields but no term statistics. * term payloads (`payloads` : true), as base64 encoded bytes If the requested information wasn't stored in the index, it will be -omitted without further warning. See <> +computed on the fly if possible. See <> for how to configure your index to store term vectors. +coming[1.4.0,The ability to compute term vectors on the fly is only available from 1.4.0 onwards (see below)] + [WARNING] ====== Start and end offsets assume UTF-16 encoding is being used. If you want to use @@ -84,7 +86,7 @@ are therefore only useful as relative measures whereas the absolute numbers have no meaning in this context. [float] -=== Example +=== Example 1 First, we create an index that stores term vectors, payloads etc. : @@ -222,3 +224,22 @@ Response: } } -------------------------------------------------- + +[float] +=== Example 2 coming[1.4.0] + +Additionally, term vectors which are not explicitly stored in the index are automatically +computed on the fly. The following request returns all information and statistics for the +fields in document `1`, even though the terms haven't been explicitly stored in the index. +Note that for the field `text`, the terms are not re-generated. 
+ +[source,js] +-------------------------------------------------- +curl -XGET 'http://localhost:9200/twitter/tweet/1/_termvector?pretty=true' -d '{ + "fields" : ["text", "some_field_without_term_vectors"], + "offsets" : true, + "positions" : true, + "term_statistics" : true, + "field_statistics" : true +}' +-------------------------------------------------- diff --git a/src/main/java/org/elasticsearch/action/termvector/TermVectorWriter.java b/src/main/java/org/elasticsearch/action/termvector/TermVectorWriter.java index 3d6b05ea9d6..10509662cbc 100644 --- a/src/main/java/org/elasticsearch/action/termvector/TermVectorWriter.java +++ b/src/main/java/org/elasticsearch/action/termvector/TermVectorWriter.java @@ -197,23 +197,23 @@ final class TermVectorWriter { private void writeTermStatistics(TermsEnum topLevelIterator) throws IOException { int docFreq = topLevelIterator.docFreq(); - assert (docFreq >= 0); + assert (docFreq >= -1); writePotentiallyNegativeVInt(docFreq); long ttf = topLevelIterator.totalTermFreq(); - assert (ttf >= 0); + assert (ttf >= -1); writePotentiallyNegativeVLong(ttf); } private void writeFieldStatistics(Terms topLevelTerms) throws IOException { long sttf = topLevelTerms.getSumTotalTermFreq(); - assert (sttf >= 0); + assert (sttf >= -1); writePotentiallyNegativeVLong(sttf); long sdf = topLevelTerms.getSumDocFreq(); - assert (sdf >= 0); + assert (sdf >= -1); writePotentiallyNegativeVLong(sdf); int dc = topLevelTerms.getDocCount(); - assert (dc >= 0); + assert (dc >= -1); writePotentiallyNegativeVInt(dc); } diff --git a/src/main/java/org/elasticsearch/index/termvectors/ShardTermVectorService.java b/src/main/java/org/elasticsearch/index/termvectors/ShardTermVectorService.java index f31b3583896..2ed0c88e72b 100644 --- a/src/main/java/org/elasticsearch/index/termvectors/ShardTermVectorService.java +++ b/src/main/java/org/elasticsearch/index/termvectors/ShardTermVectorService.java @@ -19,35 +19,40 @@ package org.elasticsearch.index.termvectors; -import 
org.apache.lucene.index.Fields; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.MultiFields; -import org.apache.lucene.index.Term; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.index.*; +import org.apache.lucene.index.memory.MemoryIndex; import org.elasticsearch.ElasticsearchException; import org.elasticsearch.action.termvector.TermVectorRequest; import org.elasticsearch.action.termvector.TermVectorResponse; +import org.elasticsearch.common.Strings; import org.elasticsearch.common.inject.Inject; import org.elasticsearch.common.lucene.uid.Versions; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.index.engine.Engine; -import org.elasticsearch.index.mapper.MapperService; +import org.elasticsearch.index.get.GetField; +import org.elasticsearch.index.get.GetResult; +import org.elasticsearch.index.mapper.FieldMapper; import org.elasticsearch.index.mapper.Uid; +import org.elasticsearch.index.mapper.core.StringFieldMapper; import org.elasticsearch.index.mapper.internal.UidFieldMapper; import org.elasticsearch.index.settings.IndexSettings; import org.elasticsearch.index.shard.AbstractIndexShardComponent; import org.elasticsearch.index.shard.ShardId; import org.elasticsearch.index.shard.service.IndexShard; +import java.io.IOException; +import java.util.*; + /** */ public class ShardTermVectorService extends AbstractIndexShardComponent { private IndexShard indexShard; - private MapperService mapperService; @Inject - public ShardTermVectorService(ShardId shardId, @IndexSettings Settings indexSettings, MapperService mapperService) { + public ShardTermVectorService(ShardId shardId, @IndexSettings Settings indexSettings) { super(shardId, indexSettings); } @@ -66,8 +71,11 @@ public class ShardTermVectorService extends AbstractIndexShardComponent { Fields topLevelFields = MultiFields.getFields(topLevelReader); Versions.DocIdAndVersion docIdAndVersion = Versions.loadDocIdAndVersion(topLevelReader, 
uidTerm); if (docIdAndVersion != null) { - Fields termVectorsByField = docIdAndVersion.context.reader().getTermVectors(docIdAndVersion.docId); + /* generate term vectors if not available */ + if (request.selectedFields() != null) { + termVectorsByField = generateTermVectorsIfNeeded(termVectorsByField, request, uidTerm, false); + } termVectorResponse.setFields(termVectorsByField, request.selectedFields(), request.getFlags(), topLevelFields); termVectorResponse.setExists(true); termVectorResponse.setDocVersion(docIdAndVersion.version); @@ -81,4 +89,102 @@ public class ShardTermVectorService extends AbstractIndexShardComponent { } return termVectorResponse; } + + private Fields generateTermVectorsIfNeeded(Fields termVectorsByField, TermVectorRequest request, Term uidTerm, boolean realTime) throws IOException { + List validFields = new ArrayList<>(); + for (String field : request.selectedFields()) { + FieldMapper fieldMapper = indexShard.mapperService().smartNameFieldMapper(field); + if (!(fieldMapper instanceof StringFieldMapper)) { + continue; + } + if (fieldMapper.fieldType().storeTermVectors()) { + continue; + } + // only disallow fields which are not indexed + if (!fieldMapper.fieldType().indexed()) { + continue; + } + validFields.add(field); + } + if (validFields.isEmpty()) { + return termVectorsByField; + } + + Engine.GetResult get = indexShard.get(new Engine.Get(realTime, uidTerm)); + Fields generatedTermVectors; + try { + if (!get.exists()) { + return termVectorsByField; + } + // TODO: support for fetchSourceContext? 
+ GetResult getResult = indexShard.getService().get( + get, request.id(), request.type(), validFields.toArray(Strings.EMPTY_ARRAY), null); + generatedTermVectors = generateTermVectors(getResult.getFields().values(), request.offsets()); + } finally { + get.release(); + } + if (termVectorsByField == null) { + return generatedTermVectors; + } else { + return mergeFields(request.selectedFields().toArray(Strings.EMPTY_ARRAY), termVectorsByField, generatedTermVectors); + } + } + + private Fields generateTermVectors(Collection getFields, boolean withOffsets) throws IOException { + // store document in memory index + MemoryIndex index = new MemoryIndex(withOffsets); + for (GetField getField : getFields) { + String field = getField.getName(); + Analyzer analyzer = indexShard.mapperService().smartNameFieldMapper(field).indexAnalyzer(); + if (analyzer == null) { + analyzer = indexShard.mapperService().analysisService().defaultIndexAnalyzer(); + } + for (Object text : getField.getValues()) { + index.addField(field, text.toString(), analyzer); + } + } + // and read vectors from it + return MultiFields.getFields(index.createSearcher().getIndexReader()); + } + + private Fields mergeFields(String[] fieldNames, Fields... 
fieldsObject) throws IOException { + ParallelFields parallelFields = new ParallelFields(); + for (Fields fieldObject : fieldsObject) { + assert fieldObject != null; + for (String fieldName : fieldNames) { + Terms terms = fieldObject.terms(fieldName); + if (terms != null) { + parallelFields.addField(fieldName, terms); + } + } + } + return parallelFields; + } + + // Poached from Lucene ParallelAtomicReader + private static final class ParallelFields extends Fields { + final Map fields = new TreeMap<>(); + + ParallelFields() { + } + + void addField(String fieldName, Terms terms) { + fields.put(fieldName, terms); + } + + @Override + public Iterator iterator() { + return Collections.unmodifiableSet(fields.keySet()).iterator(); + } + + @Override + public Terms terms(String field) { + return fields.get(field); + } + + @Override + public int size() { + return fields.size(); + } + } } diff --git a/src/test/java/org/elasticsearch/action/termvector/GetTermVectorTests.java b/src/test/java/org/elasticsearch/action/termvector/GetTermVectorTests.java index 2b0a073f24c..653ce09a950 100644 --- a/src/test/java/org/elasticsearch/action/termvector/GetTermVectorTests.java +++ b/src/test/java/org/elasticsearch/action/termvector/GetTermVectorTests.java @@ -26,10 +26,11 @@ import org.apache.lucene.index.*; import org.apache.lucene.util.BytesRef; import org.elasticsearch.ElasticsearchException; import org.elasticsearch.action.ActionFuture; +import org.elasticsearch.action.index.IndexRequestBuilder; +import org.elasticsearch.common.settings.ImmutableSettings; import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.common.xcontent.XContentFactory; import org.elasticsearch.index.mapper.core.AbstractFieldMapper; -import org.hamcrest.Matchers; import org.junit.Test; import java.io.IOException; @@ -37,11 +38,12 @@ import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.concurrent.ExecutionException; import 
static org.elasticsearch.common.settings.ImmutableSettings.settingsBuilder; import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked; import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertThrows; -import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.*; public class GetTermVectorTests extends AbstractTermVectorTests { @@ -64,11 +66,9 @@ public class GetTermVectorTests extends AbstractTermVectorTests { for (int i = 0; i < 20; i++) { ActionFuture termVector = client().termVector(new TermVectorRequest("test", "type1", "" + i)); TermVectorResponse actionGet = termVector.actionGet(); - assertThat(actionGet, Matchers.notNullValue()); - assertThat(actionGet.isExists(), Matchers.equalTo(false)); - + assertThat(actionGet, notNullValue()); + assertThat(actionGet.isExists(), equalTo(false)); } - } @Test @@ -84,23 +84,23 @@ public class GetTermVectorTests extends AbstractTermVectorTests { assertAcked(prepareCreate("test").addMapping("type1", mapping)); ensureYellow(); + // when indexing a field that simply has a question mark, the term // vectors will be null client().prepareIndex("test", "type1", "0").setSource("existingfield", "?").execute().actionGet(); refresh(); - String[] selectedFields = { "existingfield" }; - ActionFuture termVector = client().termVector( - new TermVectorRequest("test", "type1", "0").selectedFields(selectedFields)); - // lets see if the null term vectors are caught... - termVector.actionGet(); - TermVectorResponse actionGet = termVector.actionGet(); - assertThat(actionGet.isExists(), Matchers.equalTo(true)); + ActionFuture termVector = client().termVector(new TermVectorRequest("test", "type1", "0") + .selectedFields(new String[]{"existingfield"})); + // lets see if the null term vectors are caught... 
+ TermVectorResponse actionGet = termVector.actionGet(); + assertThat(actionGet, notNullValue()); + assertThat(actionGet.isExists(), equalTo(true)); + assertThat(actionGet.getFields().terms("existingfield"), nullValue()); } @Test public void testExistingFieldButNotInDocNPE() throws Exception { - XContentBuilder mapping = XContentFactory.jsonBuilder().startObject().startObject("type1") .startObject("properties") .startObject("existingfield") @@ -110,21 +110,62 @@ public class GetTermVectorTests extends AbstractTermVectorTests { .endObject() .endObject().endObject(); assertAcked(prepareCreate("test").addMapping("type1", mapping)); + ensureYellow(); + // when indexing a field that simply has a question mark, the term // vectors will be null client().prepareIndex("test", "type1", "0").setSource("anotherexistingfield", 1).execute().actionGet(); refresh(); - String[] selectedFields = { "existingfield" }; - ActionFuture termVector = client().termVector( - new TermVectorRequest("test", "type1", "0").selectedFields(selectedFields)); + ActionFuture termVector = client().termVector(new TermVectorRequest("test", "type1", "0") + .selectedFields(new String[]{"existingfield"})); + // lets see if the null term vectors are caught... TermVectorResponse actionGet = termVector.actionGet(); - assertThat(actionGet.isExists(), Matchers.equalTo(true)); - + assertThat(actionGet, notNullValue()); + assertThat(actionGet.isExists(), equalTo(true)); + assertThat(actionGet.getFields().terms("existingfield"), nullValue()); } + @Test + public void testNotIndexedField() throws Exception { + // must be of type string and indexed. 
+ assertAcked(prepareCreate("test").addMapping("type1", + "field0", "type=integer,", // no tvs + "field1", "type=string,index=no", // no tvs + "field2", "type=string,index=no,store=yes", // no tvs + "field3", "type=string,index=no,term_vector=yes", // no tvs + "field4", "type=string,index=not_analyzed", // yes tvs + "field5", "type=string,index=analyzed")); // yes tvs + ensureYellow(); + + List indexBuilders = new ArrayList<>(); + for (int i = 0; i < 6; i++) { + indexBuilders.add(client().prepareIndex() + .setIndex("test") + .setType("type1") + .setId(String.valueOf(i)) + .setSource("field" + i, i)); + } + indexRandom(true, indexBuilders); + + for (int i = 0; i < 4; i++) { + TermVectorResponse resp = client().prepareTermVector("test", "type1", String.valueOf(i)) + .setSelectedFields("field" + i) + .get(); + assertThat(resp, notNullValue()); + assertThat(resp.isExists(), equalTo(true)); + assertThat("field" + i + " :", resp.getFields().terms("field" + i), nullValue()); + } + + for (int i = 4; i < 6; i++) { + TermVectorResponse resp = client().prepareTermVector("test", "type1", String.valueOf(i)) + .setSelectedFields("field" + i) + .get(); + assertThat("field" + i + " :", resp.getFields().terms("field" + i), notNullValue()); + } + } @Test public void testSimpleTermVectors() throws ElasticsearchException, IOException { @@ -151,11 +192,6 @@ public class GetTermVectorTests extends AbstractTermVectorTests { .endObject()).execute().actionGet(); refresh(); } - String[] values = {"brown", "dog", "fox", "jumps", "lazy", "over", "quick", "the"}; - int[] freq = {1, 1, 1, 1, 1, 1, 1, 2}; - int[][] pos = {{2}, {8}, {3}, {4}, {7}, {5}, {1}, {0, 6}}; - int[][] startOffset = {{10}, {40}, {16}, {20}, {35}, {26}, {4}, {0, 31}}; - int[][] endOffset = {{15}, {43}, {19}, {25}, {39}, {30}, {9}, {3, 34}}; for (int i = 0; i < 10; i++) { TermVectorRequestBuilder resp = client().prepareTermVector("test", "type1", Integer.toString(i)).setPayloads(true) 
.setOffsets(true).setPositions(true).setSelectedFields(); @@ -163,35 +199,7 @@ public class GetTermVectorTests extends AbstractTermVectorTests { assertThat("doc id: " + i + " doesn't exists but should", response.isExists(), equalTo(true)); Fields fields = response.getFields(); assertThat(fields.size(), equalTo(1)); - Terms terms = fields.terms("field"); - assertThat(terms.size(), equalTo(8l)); - TermsEnum iterator = terms.iterator(null); - for (int j = 0; j < values.length; j++) { - String string = values[j]; - BytesRef next = iterator.next(); - assertThat(next, Matchers.notNullValue()); - assertThat("expected " + string, string, equalTo(next.utf8ToString())); - assertThat(next, Matchers.notNullValue()); - // do not test ttf or doc frequency, because here we have many - // shards and do not know how documents are distributed - DocsAndPositionsEnum docsAndPositions = iterator.docsAndPositions(null, null); - assertThat(docsAndPositions.nextDoc(), equalTo(0)); - assertThat(freq[j], equalTo(docsAndPositions.freq())); - int[] termPos = pos[j]; - int[] termStartOffset = startOffset[j]; - int[] termEndOffset = endOffset[j]; - assertThat(termPos.length, equalTo(freq[j])); - assertThat(termStartOffset.length, equalTo(freq[j])); - assertThat(termEndOffset.length, equalTo(freq[j])); - for (int k = 0; k < freq[j]; k++) { - int nextPosition = docsAndPositions.nextPosition(); - assertThat("term: " + string, nextPosition, equalTo(termPos[k])); - assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k])); - assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k])); - assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word"))); - } - } - assertThat(iterator.next(), Matchers.nullValue()); + checkBrownFoxTermVector(fields, "field", true); } } @@ -287,9 +295,9 @@ public class GetTermVectorTests extends AbstractTermVectorTests { for (int j = 0; j < values.length; j++) { String string = 
values[j]; BytesRef next = iterator.next(); - assertThat(infoString, next, Matchers.notNullValue()); + assertThat(infoString, next, notNullValue()); assertThat(infoString + "expected " + string, string, equalTo(next.utf8ToString())); - assertThat(infoString, next, Matchers.notNullValue()); + assertThat(infoString, next, notNullValue()); // do not test ttf or doc frequency, because here we have // many shards and do not know how documents are distributed DocsAndPositionsEnum docsAndPositions = iterator.docsAndPositions(null, null); @@ -316,7 +324,6 @@ public class GetTermVectorTests extends AbstractTermVectorTests { } else { assertThat(infoString + "positions for term: ", nextPosition, equalTo(-1)); } - // only return something useful if requested and stored if (isPayloadRequested && storePayloads) { assertThat(infoString + "payloads for term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef( @@ -337,9 +344,8 @@ public class GetTermVectorTests extends AbstractTermVectorTests { } } - assertThat(iterator.next(), Matchers.nullValue()); + assertThat(iterator.next(), nullValue()); } - } } @@ -427,7 +433,7 @@ public class GetTermVectorTests extends AbstractTermVectorTests { DocsAndPositionsEnum docsAndPositions = iterator.docsAndPositions(null, null); assertThat(docsAndPositions.nextDoc(), equalTo(0)); List curPayloads = payloads.get(term); - assertThat(term, curPayloads, Matchers.notNullValue()); + assertThat(term, curPayloads, notNullValue()); assertNotNull(docsAndPositions); for (int k = 0; k < docsAndPositions.freq(); k++) { docsAndPositions.nextPosition(); @@ -440,8 +446,9 @@ public class GetTermVectorTests extends AbstractTermVectorTests { } } } - assertThat(iterator.next(), Matchers.nullValue()); + assertThat(iterator.next(), nullValue()); } + private String createRandomDelimiter(String[] tokens) { String delimiter = ""; boolean isTokenOrWhitespace = true; @@ -459,6 +466,7 @@ public class GetTermVectorTests extends AbstractTermVectorTests { } return 
delimiter; } + private String createString(String[] tokens, Map> payloads, int encoding, char delimiter) { String resultString = ""; ObjectIntOpenHashMap payloadCounter = new ObjectIntOpenHashMap<>(); @@ -543,4 +551,193 @@ public class GetTermVectorTests extends AbstractTermVectorTests { } return finalTokens; } + + // like testSimpleTermVectors but we create fields with no term vectors + @Test + public void testSimpleTermVectorsWithGenerate() throws ElasticsearchException, IOException { + String[] fieldNames = new String[10]; + for (int i = 0; i < fieldNames.length; i++) { + fieldNames[i] = "field" + String.valueOf(i); + } + + XContentBuilder mapping = XContentFactory.jsonBuilder().startObject().startObject("type1").startObject("properties"); + XContentBuilder source = XContentFactory.jsonBuilder().startObject(); + for (String field : fieldNames) { + mapping.startObject(field) + .field("type", "string") + .field("term_vector", randomBoolean() ? "with_positions_offsets_payloads" : "no") + .field("analyzer", "tv_test") + .endObject(); + source.field(field, "the quick brown fox jumps over the lazy dog"); + } + mapping.endObject().endObject().endObject(); + source.endObject(); + + assertAcked(prepareCreate("test") + .addMapping("type1", mapping) + .setSettings(settingsBuilder() + .put(indexSettings()) + .put("index.analysis.analyzer.tv_test.tokenizer", "whitespace") + .putArray("index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase"))); + + ensureGreen(); + + for (int i = 0; i < 10; i++) { + client().prepareIndex("test", "type1", Integer.toString(i)) + .setSource(source) + .execute().actionGet(); + refresh(); + } + + for (int i = 0; i < 10; i++) { + TermVectorResponse response = client().prepareTermVector("test", "type1", Integer.toString(i)) + .setPayloads(true) + .setOffsets(true) + .setPositions(true) + .setSelectedFields(fieldNames) + .execute().actionGet(); + assertThat("doc id: " + i + " doesn't exists but should", response.isExists(), 
equalTo(true)); + Fields fields = response.getFields(); + assertThat(fields.size(), equalTo(fieldNames.length)); + for (String fieldName : fieldNames) { + // MemoryIndex does not support payloads + checkBrownFoxTermVector(fields, fieldName, false); + } + } + } + + private void checkBrownFoxTermVector(Fields fields, String fieldName, boolean withPayloads) throws ElasticsearchException, IOException { + String[] values = {"brown", "dog", "fox", "jumps", "lazy", "over", "quick", "the"}; + int[] freq = {1, 1, 1, 1, 1, 1, 1, 2}; + int[][] pos = {{2}, {8}, {3}, {4}, {7}, {5}, {1}, {0, 6}}; + int[][] startOffset = {{10}, {40}, {16}, {20}, {35}, {26}, {4}, {0, 31}}; + int[][] endOffset = {{15}, {43}, {19}, {25}, {39}, {30}, {9}, {3, 34}}; + + Terms terms = fields.terms(fieldName); + assertThat(terms.size(), equalTo(8l)); + TermsEnum iterator = terms.iterator(null); + for (int j = 0; j < values.length; j++) { + String string = values[j]; + BytesRef next = iterator.next(); + assertThat(next, notNullValue()); + assertThat("expected " + string, string, equalTo(next.utf8ToString())); + assertThat(next, notNullValue()); + // do not test ttf or doc frequency, because here we have many + // shards and do not know how documents are distributed + DocsAndPositionsEnum docsAndPositions = iterator.docsAndPositions(null, null); + assertThat(docsAndPositions.nextDoc(), equalTo(0)); + assertThat(freq[j], equalTo(docsAndPositions.freq())); + int[] termPos = pos[j]; + int[] termStartOffset = startOffset[j]; + int[] termEndOffset = endOffset[j]; + assertThat(termPos.length, equalTo(freq[j])); + assertThat(termStartOffset.length, equalTo(freq[j])); + assertThat(termEndOffset.length, equalTo(freq[j])); + for (int k = 0; k < freq[j]; k++) { + int nextPosition = docsAndPositions.nextPosition(); + assertThat("term: " + string, nextPosition, equalTo(termPos[k])); + assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k])); + assertThat("term: " + string, 
docsAndPositions.endOffset(), equalTo(termEndOffset[k])); + if (withPayloads) { + assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word"))); + } + } + } + assertThat(iterator.next(), nullValue()); + } + + @Test + public void testDuelWithAndWithoutTermVectors() throws ElasticsearchException, IOException, ExecutionException, InterruptedException { + // setup indices + String[] indexNames = new String[] {"with_tv", "without_tv"}; + ImmutableSettings.Builder settings = settingsBuilder() + .put(indexSettings()) + .put("index.analysis.analyzer", "standard"); + assertAcked(prepareCreate(indexNames[0]) + .setSettings(settings) + .addMapping("type1", "field1", "type=string,term_vector=with_positions_offsets")); + assertAcked(prepareCreate(indexNames[1]) + .setSettings(settings) + .addMapping("type1", "field1", "type=string,term_vector=no")); + ensureGreen(); + + // index documents with and without term vectors + String[] content = new String[]{ + "Generating a random permutation of a sequence (such as when shuffling cards).", + "Selecting a random sample of a population (important in statistical sampling).", + "Allocating experimental units via random assignment to a treatment or control condition.", + "Generating random numbers: see Random number generation.", + "Selecting a random sample of a population (important in statistical sampling).", + "Allocating experimental units via random assignment to a treatment or control condition.", + "Transforming a data stream (such as when using a scrambler in telecommunications)."}; + + List indexBuilders = new ArrayList<>(); + for (int i = 0; i < content.length; i++) { + for (String indexName : indexNames) { + indexBuilders.add(client().prepareIndex() + .setIndex(indexName) + .setType("type1") + .setId(String.valueOf(i)) + .setSource("field1", content[i])); + } + } + indexRandom(true, indexBuilders); + + // request tvs and compare from each index + for (int i = 0; i < content.length; i++) { + 
Fields[] fields = new Fields[2]; + int idx = 0; + for (String indexName : indexNames) { + TermVectorResponse resp = client().prepareTermVector(indexName, "type1", String.valueOf(i)) + .setOffsets(true) + .setPositions(true) + .setSelectedFields("field1") + .get(); + assertThat("doc with index: test_with_tv, type1 and id: " + i, resp.isExists(), equalTo(true)); + fields[idx++] = resp.getFields(); + } + compareTermVectors("field1", fields[0], fields[1]); + } + } + + private void compareTermVectors(String fieldName, Fields fields0, Fields fields1) throws IOException { + Terms terms0 = fields0.terms(fieldName); + Terms terms1 = fields1.terms(fieldName); + assertThat(terms0, notNullValue()); + assertThat(terms1, notNullValue()); + assertThat(terms0.size(), equalTo(terms1.size())); + + TermsEnum iter0 = terms0.iterator(null); + TermsEnum iter1 = terms1.iterator(null); + for (int i = 0; i < terms0.size(); i++) { + BytesRef next0 = iter0.next(); + assertThat(next0, notNullValue()); + BytesRef next1 = iter1.next(); + assertThat(next1, notNullValue()); + + // compare field value + String string0 = next0.utf8ToString(); + String string1 = next1.utf8ToString(); + assertThat("expected: " + string0, string0, equalTo(string1)); + + // compare df and ttf + assertThat("term: " + string0, iter0.docFreq(), equalTo(iter1.docFreq())); + assertThat("term: " + string0, iter0.totalTermFreq(), equalTo(iter1.totalTermFreq())); + + // compare freq and docs + DocsAndPositionsEnum docsAndPositions0 = iter0.docsAndPositions(null, null); + DocsAndPositionsEnum docsAndPositions1 = iter1.docsAndPositions(null, null); + assertThat("term: " + string0, docsAndPositions0.nextDoc(), equalTo(docsAndPositions1.nextDoc())); + assertThat("term: " + string0, docsAndPositions0.freq(), equalTo(docsAndPositions1.freq())); + + // compare position, start offsets and end offsets + for (int j = 0; j < docsAndPositions0.freq(); j++) { + assertThat("term: " + string0, docsAndPositions0.nextPosition(), 
equalTo(docsAndPositions1.nextPosition())); + assertThat("term: " + string0, docsAndPositions0.startOffset(), equalTo(docsAndPositions1.startOffset())); + assertThat("term: " + string0, docsAndPositions0.endOffset(), equalTo(docsAndPositions1.endOffset())); + } + } + assertThat(iter0.next(), nullValue()); + assertThat(iter1.next(), nullValue()); + } }