From f22f3db30f9dcbedf1b1a4991b948a2ad8097830 Mon Sep 17 00:00:00 2001 From: Alex Ksikes Date: Tue, 17 Jun 2014 19:08:38 +0200 Subject: [PATCH] Term Vectors API: Computes term vectors on the fly if not stored in the index. Adds the ability to the Term Vector API to generate term vectors for some chosen fields, even though they haven't been explicitly stored in the index. Relates to #5184 Closes #6567 --- docs/reference/docs/termvectors.asciidoc | 27 +- .../action/termvector/TermVectorWriter.java | 10 +- .../termvectors/ShardTermVectorService.java | 122 ++++++- .../action/termvector/GetTermVectorTests.java | 317 ++++++++++++++---- 4 files changed, 400 insertions(+), 76 deletions(-) diff --git a/docs/reference/docs/termvectors.asciidoc b/docs/reference/docs/termvectors.asciidoc index b44f4fc23fc..0031e4fc8da 100644 --- a/docs/reference/docs/termvectors.asciidoc +++ b/docs/reference/docs/termvectors.asciidoc @@ -19,7 +19,7 @@ retrieved either with a parameter in the url curl -XGET 'http://localhost:9200/twitter/tweet/1/_termvector?fields=text,...' -------------------------------------------------- -or adding by adding the requested fields in the request body (see +or by adding the requested fields in the request body (see example below). [float] @@ -38,9 +38,11 @@ statistics are returned for all fields but no term statistics. * term payloads (`payloads` : true), as base64 encoded bytes If the requested information wasn't stored in the index, it will be -omitted without further warning. See <> +computed on the fly if possible. See <> for how to configure your index to store term vectors. +coming[1.4.0,The ability to compute term vectors on the fly is only available from 1.4.0 onwards (see below)] + [WARNING] ====== Start and end offsets assume UTF-16 encoding is being used. If you want to use @@ -84,7 +86,7 @@ are therefore only useful as relative measures whereas the absolute numbers have no meaning in this context. 
[float] -=== Example +=== Example 1 First, we create an index that stores term vectors, payloads etc. : @@ -222,3 +224,22 @@ Response: } } -------------------------------------------------- + +[float] +=== Example 2 coming[1.4.0] + +Additionally, term vectors which are not explicitly stored in the index are automatically +computed on the fly. The following request returns all information and statistics for the +fields in document `1`, even though the terms haven't been explicitly stored in the index. +Note that for the field `text`, the terms are not re-generated. + +[source,js] +-------------------------------------------------- +curl -XGET 'http://localhost:9200/twitter/tweet/1/_termvector?pretty=true' -d '{ + "fields" : ["text", "some_field_without_term_vectors"], + "offsets" : true, + "positions" : true, + "term_statistics" : true, + "field_statistics" : true +}' +-------------------------------------------------- diff --git a/src/main/java/org/elasticsearch/action/termvector/TermVectorWriter.java b/src/main/java/org/elasticsearch/action/termvector/TermVectorWriter.java index 3d6b05ea9d6..10509662cbc 100644 --- a/src/main/java/org/elasticsearch/action/termvector/TermVectorWriter.java +++ b/src/main/java/org/elasticsearch/action/termvector/TermVectorWriter.java @@ -197,23 +197,23 @@ final class TermVectorWriter { private void writeTermStatistics(TermsEnum topLevelIterator) throws IOException { int docFreq = topLevelIterator.docFreq(); - assert (docFreq >= 0); + assert (docFreq >= -1); writePotentiallyNegativeVInt(docFreq); long ttf = topLevelIterator.totalTermFreq(); - assert (ttf >= 0); + assert (ttf >= -1); writePotentiallyNegativeVLong(ttf); } private void writeFieldStatistics(Terms topLevelTerms) throws IOException { long sttf = topLevelTerms.getSumTotalTermFreq(); - assert (sttf >= 0); + assert (sttf >= -1); writePotentiallyNegativeVLong(sttf); long sdf = topLevelTerms.getSumDocFreq(); - assert (sdf >= 0); + assert (sdf >= -1); 
writePotentiallyNegativeVLong(sdf); int dc = topLevelTerms.getDocCount(); - assert (dc >= 0); + assert (dc >= -1); writePotentiallyNegativeVInt(dc); } diff --git a/src/main/java/org/elasticsearch/index/termvectors/ShardTermVectorService.java b/src/main/java/org/elasticsearch/index/termvectors/ShardTermVectorService.java index f31b3583896..2ed0c88e72b 100644 --- a/src/main/java/org/elasticsearch/index/termvectors/ShardTermVectorService.java +++ b/src/main/java/org/elasticsearch/index/termvectors/ShardTermVectorService.java @@ -19,35 +19,40 @@ package org.elasticsearch.index.termvectors; -import org.apache.lucene.index.Fields; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.MultiFields; -import org.apache.lucene.index.Term; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.index.*; +import org.apache.lucene.index.memory.MemoryIndex; import org.elasticsearch.ElasticsearchException; import org.elasticsearch.action.termvector.TermVectorRequest; import org.elasticsearch.action.termvector.TermVectorResponse; +import org.elasticsearch.common.Strings; import org.elasticsearch.common.inject.Inject; import org.elasticsearch.common.lucene.uid.Versions; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.index.engine.Engine; -import org.elasticsearch.index.mapper.MapperService; +import org.elasticsearch.index.get.GetField; +import org.elasticsearch.index.get.GetResult; +import org.elasticsearch.index.mapper.FieldMapper; import org.elasticsearch.index.mapper.Uid; +import org.elasticsearch.index.mapper.core.StringFieldMapper; import org.elasticsearch.index.mapper.internal.UidFieldMapper; import org.elasticsearch.index.settings.IndexSettings; import org.elasticsearch.index.shard.AbstractIndexShardComponent; import org.elasticsearch.index.shard.ShardId; import org.elasticsearch.index.shard.service.IndexShard; +import java.io.IOException; +import java.util.*; + /** */ public class ShardTermVectorService 
extends AbstractIndexShardComponent { private IndexShard indexShard; - private MapperService mapperService; @Inject - public ShardTermVectorService(ShardId shardId, @IndexSettings Settings indexSettings, MapperService mapperService) { + public ShardTermVectorService(ShardId shardId, @IndexSettings Settings indexSettings) { super(shardId, indexSettings); } @@ -66,8 +71,11 @@ public class ShardTermVectorService extends AbstractIndexShardComponent { Fields topLevelFields = MultiFields.getFields(topLevelReader); Versions.DocIdAndVersion docIdAndVersion = Versions.loadDocIdAndVersion(topLevelReader, uidTerm); if (docIdAndVersion != null) { - Fields termVectorsByField = docIdAndVersion.context.reader().getTermVectors(docIdAndVersion.docId); + /* generate term vectors if not available */ + if (request.selectedFields() != null) { + termVectorsByField = generateTermVectorsIfNeeded(termVectorsByField, request, uidTerm, false); + } termVectorResponse.setFields(termVectorsByField, request.selectedFields(), request.getFlags(), topLevelFields); termVectorResponse.setExists(true); termVectorResponse.setDocVersion(docIdAndVersion.version); @@ -81,4 +89,102 @@ public class ShardTermVectorService extends AbstractIndexShardComponent { } return termVectorResponse; } + + private Fields generateTermVectorsIfNeeded(Fields termVectorsByField, TermVectorRequest request, Term uidTerm, boolean realTime) throws IOException { + List validFields = new ArrayList<>(); + for (String field : request.selectedFields()) { + FieldMapper fieldMapper = indexShard.mapperService().smartNameFieldMapper(field); + if (!(fieldMapper instanceof StringFieldMapper)) { + continue; + } + if (fieldMapper.fieldType().storeTermVectors()) { + continue; + } + // only disallow fields which are not indexed + if (!fieldMapper.fieldType().indexed()) { + continue; + } + validFields.add(field); + } + if (validFields.isEmpty()) { + return termVectorsByField; + } + + Engine.GetResult get = indexShard.get(new 
Engine.Get(realTime, uidTerm)); + Fields generatedTermVectors; + try { + if (!get.exists()) { + return termVectorsByField; + } + // TODO: support for fetchSourceContext? + GetResult getResult = indexShard.getService().get( + get, request.id(), request.type(), validFields.toArray(Strings.EMPTY_ARRAY), null); + generatedTermVectors = generateTermVectors(getResult.getFields().values(), request.offsets()); + } finally { + get.release(); + } + if (termVectorsByField == null) { + return generatedTermVectors; + } else { + return mergeFields(request.selectedFields().toArray(Strings.EMPTY_ARRAY), termVectorsByField, generatedTermVectors); + } + } + + private Fields generateTermVectors(Collection getFields, boolean withOffsets) throws IOException { + // store document in memory index + MemoryIndex index = new MemoryIndex(withOffsets); + for (GetField getField : getFields) { + String field = getField.getName(); + Analyzer analyzer = indexShard.mapperService().smartNameFieldMapper(field).indexAnalyzer(); + if (analyzer == null) { + analyzer = indexShard.mapperService().analysisService().defaultIndexAnalyzer(); + } + for (Object text : getField.getValues()) { + index.addField(field, text.toString(), analyzer); + } + } + // and read vectors from it + return MultiFields.getFields(index.createSearcher().getIndexReader()); + } + + private Fields mergeFields(String[] fieldNames, Fields... 
fieldsObject) throws IOException { + ParallelFields parallelFields = new ParallelFields(); + for (Fields fieldObject : fieldsObject) { + assert fieldObject != null; + for (String fieldName : fieldNames) { + Terms terms = fieldObject.terms(fieldName); + if (terms != null) { + parallelFields.addField(fieldName, terms); + } + } + } + return parallelFields; + } + + // Poached from Lucene ParallelAtomicReader + private static final class ParallelFields extends Fields { + final Map fields = new TreeMap<>(); + + ParallelFields() { + } + + void addField(String fieldName, Terms terms) { + fields.put(fieldName, terms); + } + + @Override + public Iterator iterator() { + return Collections.unmodifiableSet(fields.keySet()).iterator(); + } + + @Override + public Terms terms(String field) { + return fields.get(field); + } + + @Override + public int size() { + return fields.size(); + } + } } diff --git a/src/test/java/org/elasticsearch/action/termvector/GetTermVectorTests.java b/src/test/java/org/elasticsearch/action/termvector/GetTermVectorTests.java index 2b0a073f24c..653ce09a950 100644 --- a/src/test/java/org/elasticsearch/action/termvector/GetTermVectorTests.java +++ b/src/test/java/org/elasticsearch/action/termvector/GetTermVectorTests.java @@ -26,10 +26,11 @@ import org.apache.lucene.index.*; import org.apache.lucene.util.BytesRef; import org.elasticsearch.ElasticsearchException; import org.elasticsearch.action.ActionFuture; +import org.elasticsearch.action.index.IndexRequestBuilder; +import org.elasticsearch.common.settings.ImmutableSettings; import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.common.xcontent.XContentFactory; import org.elasticsearch.index.mapper.core.AbstractFieldMapper; -import org.hamcrest.Matchers; import org.junit.Test; import java.io.IOException; @@ -37,11 +38,12 @@ import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.concurrent.ExecutionException; import 
static org.elasticsearch.common.settings.ImmutableSettings.settingsBuilder; import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked; import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertThrows; -import static org.hamcrest.Matchers.equalTo; +import static org.hamcrest.Matchers.*; public class GetTermVectorTests extends AbstractTermVectorTests { @@ -64,11 +66,9 @@ public class GetTermVectorTests extends AbstractTermVectorTests { for (int i = 0; i < 20; i++) { ActionFuture termVector = client().termVector(new TermVectorRequest("test", "type1", "" + i)); TermVectorResponse actionGet = termVector.actionGet(); - assertThat(actionGet, Matchers.notNullValue()); - assertThat(actionGet.isExists(), Matchers.equalTo(false)); - + assertThat(actionGet, notNullValue()); + assertThat(actionGet.isExists(), equalTo(false)); } - } @Test @@ -84,23 +84,23 @@ public class GetTermVectorTests extends AbstractTermVectorTests { assertAcked(prepareCreate("test").addMapping("type1", mapping)); ensureYellow(); + // when indexing a field that simply has a question mark, the term // vectors will be null client().prepareIndex("test", "type1", "0").setSource("existingfield", "?").execute().actionGet(); refresh(); - String[] selectedFields = { "existingfield" }; - ActionFuture termVector = client().termVector( - new TermVectorRequest("test", "type1", "0").selectedFields(selectedFields)); - // lets see if the null term vectors are caught... - termVector.actionGet(); - TermVectorResponse actionGet = termVector.actionGet(); - assertThat(actionGet.isExists(), Matchers.equalTo(true)); + ActionFuture termVector = client().termVector(new TermVectorRequest("test", "type1", "0") + .selectedFields(new String[]{"existingfield"})); + // lets see if the null term vectors are caught... 
+ TermVectorResponse actionGet = termVector.actionGet(); + assertThat(actionGet, notNullValue()); + assertThat(actionGet.isExists(), equalTo(true)); + assertThat(actionGet.getFields().terms("existingfield"), nullValue()); } @Test public void testExistingFieldButNotInDocNPE() throws Exception { - XContentBuilder mapping = XContentFactory.jsonBuilder().startObject().startObject("type1") .startObject("properties") .startObject("existingfield") @@ -110,21 +110,62 @@ public class GetTermVectorTests extends AbstractTermVectorTests { .endObject() .endObject().endObject(); assertAcked(prepareCreate("test").addMapping("type1", mapping)); + ensureYellow(); + // when indexing a field that simply has a question mark, the term // vectors will be null client().prepareIndex("test", "type1", "0").setSource("anotherexistingfield", 1).execute().actionGet(); refresh(); - String[] selectedFields = { "existingfield" }; - ActionFuture termVector = client().termVector( - new TermVectorRequest("test", "type1", "0").selectedFields(selectedFields)); + ActionFuture termVector = client().termVector(new TermVectorRequest("test", "type1", "0") + .selectedFields(new String[]{"existingfield"})); + // lets see if the null term vectors are caught... TermVectorResponse actionGet = termVector.actionGet(); - assertThat(actionGet.isExists(), Matchers.equalTo(true)); - + assertThat(actionGet, notNullValue()); + assertThat(actionGet.isExists(), equalTo(true)); + assertThat(actionGet.getFields().terms("existingfield"), nullValue()); } + @Test + public void testNotIndexedField() throws Exception { + // must be of type string and indexed. 
+ assertAcked(prepareCreate("test").addMapping("type1", + "field0", "type=integer,", // no tvs + "field1", "type=string,index=no", // no tvs + "field2", "type=string,index=no,store=yes", // no tvs + "field3", "type=string,index=no,term_vector=yes", // no tvs + "field4", "type=string,index=not_analyzed", // yes tvs + "field5", "type=string,index=analyzed")); // yes tvs + ensureYellow(); + + List indexBuilders = new ArrayList<>(); + for (int i = 0; i < 6; i++) { + indexBuilders.add(client().prepareIndex() + .setIndex("test") + .setType("type1") + .setId(String.valueOf(i)) + .setSource("field" + i, i)); + } + indexRandom(true, indexBuilders); + + for (int i = 0; i < 4; i++) { + TermVectorResponse resp = client().prepareTermVector("test", "type1", String.valueOf(i)) + .setSelectedFields("field" + i) + .get(); + assertThat(resp, notNullValue()); + assertThat(resp.isExists(), equalTo(true)); + assertThat("field" + i + " :", resp.getFields().terms("field" + i), nullValue()); + } + + for (int i = 4; i < 6; i++) { + TermVectorResponse resp = client().prepareTermVector("test", "type1", String.valueOf(i)) + .setSelectedFields("field" + i) + .get(); + assertThat("field" + i + " :", resp.getFields().terms("field" + i), notNullValue()); + } + } @Test public void testSimpleTermVectors() throws ElasticsearchException, IOException { @@ -151,11 +192,6 @@ public class GetTermVectorTests extends AbstractTermVectorTests { .endObject()).execute().actionGet(); refresh(); } - String[] values = {"brown", "dog", "fox", "jumps", "lazy", "over", "quick", "the"}; - int[] freq = {1, 1, 1, 1, 1, 1, 1, 2}; - int[][] pos = {{2}, {8}, {3}, {4}, {7}, {5}, {1}, {0, 6}}; - int[][] startOffset = {{10}, {40}, {16}, {20}, {35}, {26}, {4}, {0, 31}}; - int[][] endOffset = {{15}, {43}, {19}, {25}, {39}, {30}, {9}, {3, 34}}; for (int i = 0; i < 10; i++) { TermVectorRequestBuilder resp = client().prepareTermVector("test", "type1", Integer.toString(i)).setPayloads(true) 
.setOffsets(true).setPositions(true).setSelectedFields(); @@ -163,35 +199,7 @@ public class GetTermVectorTests extends AbstractTermVectorTests { assertThat("doc id: " + i + " doesn't exists but should", response.isExists(), equalTo(true)); Fields fields = response.getFields(); assertThat(fields.size(), equalTo(1)); - Terms terms = fields.terms("field"); - assertThat(terms.size(), equalTo(8l)); - TermsEnum iterator = terms.iterator(null); - for (int j = 0; j < values.length; j++) { - String string = values[j]; - BytesRef next = iterator.next(); - assertThat(next, Matchers.notNullValue()); - assertThat("expected " + string, string, equalTo(next.utf8ToString())); - assertThat(next, Matchers.notNullValue()); - // do not test ttf or doc frequency, because here we have many - // shards and do not know how documents are distributed - DocsAndPositionsEnum docsAndPositions = iterator.docsAndPositions(null, null); - assertThat(docsAndPositions.nextDoc(), equalTo(0)); - assertThat(freq[j], equalTo(docsAndPositions.freq())); - int[] termPos = pos[j]; - int[] termStartOffset = startOffset[j]; - int[] termEndOffset = endOffset[j]; - assertThat(termPos.length, equalTo(freq[j])); - assertThat(termStartOffset.length, equalTo(freq[j])); - assertThat(termEndOffset.length, equalTo(freq[j])); - for (int k = 0; k < freq[j]; k++) { - int nextPosition = docsAndPositions.nextPosition(); - assertThat("term: " + string, nextPosition, equalTo(termPos[k])); - assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k])); - assertThat("term: " + string, docsAndPositions.endOffset(), equalTo(termEndOffset[k])); - assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word"))); - } - } - assertThat(iterator.next(), Matchers.nullValue()); + checkBrownFoxTermVector(fields, "field", true); } } @@ -287,9 +295,9 @@ public class GetTermVectorTests extends AbstractTermVectorTests { for (int j = 0; j < values.length; j++) { String string = 
values[j]; BytesRef next = iterator.next(); - assertThat(infoString, next, Matchers.notNullValue()); + assertThat(infoString, next, notNullValue()); assertThat(infoString + "expected " + string, string, equalTo(next.utf8ToString())); - assertThat(infoString, next, Matchers.notNullValue()); + assertThat(infoString, next, notNullValue()); // do not test ttf or doc frequency, because here we have // many shards and do not know how documents are distributed DocsAndPositionsEnum docsAndPositions = iterator.docsAndPositions(null, null); @@ -316,7 +324,6 @@ public class GetTermVectorTests extends AbstractTermVectorTests { } else { assertThat(infoString + "positions for term: ", nextPosition, equalTo(-1)); } - // only return something useful if requested and stored if (isPayloadRequested && storePayloads) { assertThat(infoString + "payloads for term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef( @@ -337,9 +344,8 @@ public class GetTermVectorTests extends AbstractTermVectorTests { } } - assertThat(iterator.next(), Matchers.nullValue()); + assertThat(iterator.next(), nullValue()); } - } } @@ -427,7 +433,7 @@ public class GetTermVectorTests extends AbstractTermVectorTests { DocsAndPositionsEnum docsAndPositions = iterator.docsAndPositions(null, null); assertThat(docsAndPositions.nextDoc(), equalTo(0)); List curPayloads = payloads.get(term); - assertThat(term, curPayloads, Matchers.notNullValue()); + assertThat(term, curPayloads, notNullValue()); assertNotNull(docsAndPositions); for (int k = 0; k < docsAndPositions.freq(); k++) { docsAndPositions.nextPosition(); @@ -440,8 +446,9 @@ public class GetTermVectorTests extends AbstractTermVectorTests { } } } - assertThat(iterator.next(), Matchers.nullValue()); + assertThat(iterator.next(), nullValue()); } + private String createRandomDelimiter(String[] tokens) { String delimiter = ""; boolean isTokenOrWhitespace = true; @@ -459,6 +466,7 @@ public class GetTermVectorTests extends AbstractTermVectorTests { } return 
delimiter; } + private String createString(String[] tokens, Map> payloads, int encoding, char delimiter) { String resultString = ""; ObjectIntOpenHashMap payloadCounter = new ObjectIntOpenHashMap<>(); @@ -543,4 +551,193 @@ public class GetTermVectorTests extends AbstractTermVectorTests { } return finalTokens; } + + // like testSimpleTermVectors but we create fields with no term vectors + @Test + public void testSimpleTermVectorsWithGenerate() throws ElasticsearchException, IOException { + String[] fieldNames = new String[10]; + for (int i = 0; i < fieldNames.length; i++) { + fieldNames[i] = "field" + String.valueOf(i); + } + + XContentBuilder mapping = XContentFactory.jsonBuilder().startObject().startObject("type1").startObject("properties"); + XContentBuilder source = XContentFactory.jsonBuilder().startObject(); + for (String field : fieldNames) { + mapping.startObject(field) + .field("type", "string") + .field("term_vector", randomBoolean() ? "with_positions_offsets_payloads" : "no") + .field("analyzer", "tv_test") + .endObject(); + source.field(field, "the quick brown fox jumps over the lazy dog"); + } + mapping.endObject().endObject().endObject(); + source.endObject(); + + assertAcked(prepareCreate("test") + .addMapping("type1", mapping) + .setSettings(settingsBuilder() + .put(indexSettings()) + .put("index.analysis.analyzer.tv_test.tokenizer", "whitespace") + .putArray("index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase"))); + + ensureGreen(); + + for (int i = 0; i < 10; i++) { + client().prepareIndex("test", "type1", Integer.toString(i)) + .setSource(source) + .execute().actionGet(); + refresh(); + } + + for (int i = 0; i < 10; i++) { + TermVectorResponse response = client().prepareTermVector("test", "type1", Integer.toString(i)) + .setPayloads(true) + .setOffsets(true) + .setPositions(true) + .setSelectedFields(fieldNames) + .execute().actionGet(); + assertThat("doc id: " + i + " doesn't exists but should", response.isExists(), 
equalTo(true)); + Fields fields = response.getFields(); + assertThat(fields.size(), equalTo(fieldNames.length)); + for (String fieldName : fieldNames) { + // MemoryIndex does not support payloads + checkBrownFoxTermVector(fields, fieldName, false); + } + } + } + + private void checkBrownFoxTermVector(Fields fields, String fieldName, boolean withPayloads) throws ElasticsearchException, IOException { + String[] values = {"brown", "dog", "fox", "jumps", "lazy", "over", "quick", "the"}; + int[] freq = {1, 1, 1, 1, 1, 1, 1, 2}; + int[][] pos = {{2}, {8}, {3}, {4}, {7}, {5}, {1}, {0, 6}}; + int[][] startOffset = {{10}, {40}, {16}, {20}, {35}, {26}, {4}, {0, 31}}; + int[][] endOffset = {{15}, {43}, {19}, {25}, {39}, {30}, {9}, {3, 34}}; + + Terms terms = fields.terms(fieldName); + assertThat(terms.size(), equalTo(8l)); + TermsEnum iterator = terms.iterator(null); + for (int j = 0; j < values.length; j++) { + String string = values[j]; + BytesRef next = iterator.next(); + assertThat(next, notNullValue()); + assertThat("expected " + string, string, equalTo(next.utf8ToString())); + assertThat(next, notNullValue()); + // do not test ttf or doc frequency, because here we have many + // shards and do not know how documents are distributed + DocsAndPositionsEnum docsAndPositions = iterator.docsAndPositions(null, null); + assertThat(docsAndPositions.nextDoc(), equalTo(0)); + assertThat(freq[j], equalTo(docsAndPositions.freq())); + int[] termPos = pos[j]; + int[] termStartOffset = startOffset[j]; + int[] termEndOffset = endOffset[j]; + assertThat(termPos.length, equalTo(freq[j])); + assertThat(termStartOffset.length, equalTo(freq[j])); + assertThat(termEndOffset.length, equalTo(freq[j])); + for (int k = 0; k < freq[j]; k++) { + int nextPosition = docsAndPositions.nextPosition(); + assertThat("term: " + string, nextPosition, equalTo(termPos[k])); + assertThat("term: " + string, docsAndPositions.startOffset(), equalTo(termStartOffset[k])); + assertThat("term: " + string, 
docsAndPositions.endOffset(), equalTo(termEndOffset[k])); + if (withPayloads) { + assertThat("term: " + string, docsAndPositions.getPayload(), equalTo(new BytesRef("word"))); + } + } + } + assertThat(iterator.next(), nullValue()); + } + + @Test + public void testDuelWithAndWithoutTermVectors() throws ElasticsearchException, IOException, ExecutionException, InterruptedException { + // setup indices + String[] indexNames = new String[] {"with_tv", "without_tv"}; + ImmutableSettings.Builder settings = settingsBuilder() + .put(indexSettings()) + .put("index.analysis.analyzer", "standard"); + assertAcked(prepareCreate(indexNames[0]) + .setSettings(settings) + .addMapping("type1", "field1", "type=string,term_vector=with_positions_offsets")); + assertAcked(prepareCreate(indexNames[1]) + .setSettings(settings) + .addMapping("type1", "field1", "type=string,term_vector=no")); + ensureGreen(); + + // index documents with and without term vectors + String[] content = new String[]{ + "Generating a random permutation of a sequence (such as when shuffling cards).", + "Selecting a random sample of a population (important in statistical sampling).", + "Allocating experimental units via random assignment to a treatment or control condition.", + "Generating random numbers: see Random number generation.", + "Selecting a random sample of a population (important in statistical sampling).", + "Allocating experimental units via random assignment to a treatment or control condition.", + "Transforming a data stream (such as when using a scrambler in telecommunications)."}; + + List indexBuilders = new ArrayList<>(); + for (int i = 0; i < content.length; i++) { + for (String indexName : indexNames) { + indexBuilders.add(client().prepareIndex() + .setIndex(indexName) + .setType("type1") + .setId(String.valueOf(i)) + .setSource("field1", content[i])); + } + } + indexRandom(true, indexBuilders); + + // request tvs and compare from each index + for (int i = 0; i < content.length; i++) { + 
Fields[] fields = new Fields[2]; + int idx = 0; + for (String indexName : indexNames) { + TermVectorResponse resp = client().prepareTermVector(indexName, "type1", String.valueOf(i)) + .setOffsets(true) + .setPositions(true) + .setSelectedFields("field1") + .get(); + assertThat("doc with index: test_with_tv, type1 and id: " + i, resp.isExists(), equalTo(true)); + fields[idx++] = resp.getFields(); + } + compareTermVectors("field1", fields[0], fields[1]); + } + } + + private void compareTermVectors(String fieldName, Fields fields0, Fields fields1) throws IOException { + Terms terms0 = fields0.terms(fieldName); + Terms terms1 = fields1.terms(fieldName); + assertThat(terms0, notNullValue()); + assertThat(terms1, notNullValue()); + assertThat(terms0.size(), equalTo(terms1.size())); + + TermsEnum iter0 = terms0.iterator(null); + TermsEnum iter1 = terms1.iterator(null); + for (int i = 0; i < terms0.size(); i++) { + BytesRef next0 = iter0.next(); + assertThat(next0, notNullValue()); + BytesRef next1 = iter1.next(); + assertThat(next1, notNullValue()); + + // compare field value + String string0 = next0.utf8ToString(); + String string1 = next1.utf8ToString(); + assertThat("expected: " + string0, string0, equalTo(string1)); + + // compare df and ttf + assertThat("term: " + string0, iter0.docFreq(), equalTo(iter1.docFreq())); + assertThat("term: " + string0, iter0.totalTermFreq(), equalTo(iter1.totalTermFreq())); + + // compare freq and docs + DocsAndPositionsEnum docsAndPositions0 = iter0.docsAndPositions(null, null); + DocsAndPositionsEnum docsAndPositions1 = iter1.docsAndPositions(null, null); + assertThat("term: " + string0, docsAndPositions0.nextDoc(), equalTo(docsAndPositions1.nextDoc())); + assertThat("term: " + string0, docsAndPositions0.freq(), equalTo(docsAndPositions1.freq())); + + // compare position, start offsets and end offsets + for (int j = 0; j < docsAndPositions0.freq(); j++) { + assertThat("term: " + string0, docsAndPositions0.nextPosition(), 
equalTo(docsAndPositions1.nextPosition())); + assertThat("term: " + string0, docsAndPositions0.startOffset(), equalTo(docsAndPositions1.startOffset())); + assertThat("term: " + string0, docsAndPositions0.endOffset(), equalTo(docsAndPositions1.endOffset())); + } + } + assertThat(iter0.next(), nullValue()); + assertThat(iter1.next(), nullValue()); + } }