Refactor term vector api

This is necessary to allow adding a multi term vector request
This commit is contained in:
Boaz Leskes 2013-08-09 16:43:49 +02:00 committed by Britta Weber
parent a09f217b45
commit 18c71b16b5
10 changed files with 910 additions and 610 deletions

View File

@ -19,18 +19,9 @@
package org.elasticsearch.action.termvector;
import static org.apache.lucene.util.ArrayUtil.grow;
import gnu.trove.impl.Constants;
import gnu.trove.map.hash.TObjectLongHashMap;
import java.io.IOException;
import java.util.Comparator;
import java.util.Iterator;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.*;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
@ -38,16 +29,22 @@ import org.apache.lucene.util.RamUsageEstimator;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.io.stream.BytesStreamInput;
import java.io.IOException;
import java.util.Comparator;
import java.util.Iterator;
import static org.apache.lucene.util.ArrayUtil.grow;
/**
* This class represents the result of a {@link TermVectorRequest}. It works
* exactly like the {@link Fields} class except for one thing: it can return
* offsets and payloads even if positions are not present. You must still call
* nextPosition() to move the counter, although this method only returns
* <tt>-1</tt> if no positions were returned by the {@link TermVectorRequest}.
*
* <p/>
* The data is stored in two byte arrays ({@code headerRef} and
* {@code termVectors}, both {@link BytesRef}) that have the following format:
* <p>
* <p/>
* {@code headerRef}: Stores offsets per field in the {@code termVectors} array
* and some header information as {@link BytesRef}. Format is
* <ul>
@ -64,9 +61,9 @@ import org.elasticsearch.common.io.stream.BytesStreamInput;
* <li>vint: offset in {@code termVectors} for last field</li>
* </ul>
* </ul>
*
* <p/>
* termVectors: Stores the actual term vectors as a {@link BytesRef}.
*
* <p/>
* Term vectors for each field are stored in blocks, one for each field. The
* offsets in {@code headerRef} are used to find where the block for a field
* starts. Each block begins with a
@ -84,14 +81,14 @@ import org.elasticsearch.common.io.stream.BytesStreamInput;
* <li>vint: number of documents in the shard that has an entry for this field
* (docCount)</li>
* </ul>
*
* <p/>
* After that, for each term it stores
* <ul>
* <ul>
* <li>vint: term lengths</li>
* <li>BytesRef: term name</li>
* </ul>
*
* <p/>
* If term statistics are requested ({@code hasTermStatistics} is true, see
* {@code headerRef}):
* <ul>
@ -111,7 +108,6 @@ import org.elasticsearch.common.io.stream.BytesStreamInput;
* <li>BytesRef: payload_freqency (if payloads == true)</li>
* <ul>
* </ul> </ul>
*
*/
public final class TermVectorFields extends Fields {
@ -122,17 +118,14 @@ public final class TermVectorFields extends Fields {
final boolean hasFieldStatistic;
/**
* @param headerRef
* Stores offsets per field in the {@code termVectors} and some
* @param headerRef Stores offsets per field in the {@code termVectors} and some
* header information as {@link BytesRef}.
*
* @param termVectors
* Stores the actual term vectors as a {@link BytesRef}.
*
* @param termVectors Stores the actual term vectors as a {@link BytesRef}.
*/
public TermVectorFields(BytesReference headerRef, BytesReference termVectors) throws IOException {
BytesStreamInput header = new BytesStreamInput(headerRef);
fieldMap = new TObjectLongHashMap<String>();
fieldMap = new TObjectLongHashMap<String>(Constants.DEFAULT_CAPACITY, Constants.DEFAULT_LOAD_FACTOR, -1);
// here we read the header to fill the field offset map
String headerString = header.readString();
assert headerString.equals("TV");
@ -159,6 +152,9 @@ public final class TermVectorFields extends Fields {
// first, find where in the termVectors bytes the actual term vector for
// this field is stored
Long offset = fieldMap.get(field);
if (offset.longValue() < 0) {
return null; // we don't have it.
}
final BytesStreamInput perFieldTermVectorInput = new BytesStreamInput(this.termVectors);
perFieldTermVectorInput.reset();
perFieldTermVectorInput.skip(offset.longValue());
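For orientation, a minimal sketch (plain java.io instead of the BytesStreamInput used above; class and helper names are hypothetical) of how a consumer might walk the {@code headerRef} layout documented in the javadoc: a marker string, header flags, then one (field name, vint offset) pair per field. The real class keeps the offsets in a TObjectLongHashMap with -1 as the no-entry value, which is why terms(field) can return null for unknown fields.

import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

final class HeaderSketch {
    // Decodes a vint: 7 payload bits per byte, high bit set means "more bytes follow".
    static int readVInt(DataInputStream in) throws IOException {
        int shift = 0, result = 0;
        int b;
        do {
            b = in.readByte();
            result |= (b & 0x7F) << shift;
            shift += 7;
        } while ((b & 0x80) != 0);
        return result;
    }

    // Walks the per-field offset table; numFields and the string encoding are
    // simplifications of the real header.
    static Map<String, Long> readFieldOffsets(byte[] headerRef, int numFields) throws IOException {
        DataInputStream in = new DataInputStream(new ByteArrayInputStream(headerRef));
        // ... read and verify the "TV" marker and the hasTermStatistics /
        // hasFieldStatistics flags here (elided in this sketch) ...
        Map<String, Long> fieldMap = new HashMap<String, Long>();
        for (int i = 0; i < numFields; i++) {
            String field = in.readUTF();            // real code uses ES string encoding
            long offset = readVInt(in);             // start of this field's block in termVectors
            fieldMap.put(field, offset);
        }
        return fieldMap;
    }
}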

View File

@ -19,23 +19,22 @@
package org.elasticsearch.action.termvector;
import java.io.IOException;
import java.util.EnumSet;
import java.util.HashSet;
import java.util.Set;
import com.google.common.collect.Sets;
import org.elasticsearch.ElasticSearchParseException;
import org.elasticsearch.action.ActionRequestValidationException;
import org.elasticsearch.action.ValidateActions;
import org.elasticsearch.action.support.single.shard.SingleShardOperationRequest;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.xcontent.XContentParser;
import com.google.common.collect.Sets;
import java.io.IOException;
import java.util.*;
/**
* Request returning the term vector (doc frequency, positions, offsets) for a
* document.
* <p>
* <p/>
* Note, the {@link #index()}, {@link #type(String)} and {@link #id(String)} are
* required.
*/
@ -49,6 +48,7 @@ public class TermVectorRequest extends SingleShardOperationRequest<TermVectorReq
protected String preference;
// TODO: change to String[]
private Set<String> selectedFields;
private EnumSet<Flag> flagsEnum = EnumSet.of(Flag.Positions, Flag.Offsets, Flag.Payloads,
@ -246,6 +246,13 @@ public class TermVectorRequest extends SingleShardOperationRequest<TermVectorReq
return validationException;
}
public static TermVectorRequest readTermVectorRequest(StreamInput in) throws IOException {
TermVectorRequest termVectorRequest = new TermVectorRequest();
termVectorRequest.readFrom(in);
return termVectorRequest;
}
@Override
public void readFrom(StreamInput in) throws IOException {
super.readFrom(in);
@ -300,4 +307,60 @@ public class TermVectorRequest extends SingleShardOperationRequest<TermVectorReq
// the ordinal for encoding! Only append to the end!
Positions, Offsets, Payloads, FieldStatistics, TermStatistics;
}
/**
* Populates a request object (pre-populated with defaults) based on a parser.
*
* @param termVectorRequest
* @param parser
* @throws IOException
*/
public static void parseRequest(TermVectorRequest termVectorRequest, XContentParser parser) throws IOException {
XContentParser.Token token;
String currentFieldName = null;
List<String> fields = new ArrayList<String>();
while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
if (token == XContentParser.Token.FIELD_NAME) {
currentFieldName = parser.currentName();
} else if (currentFieldName != null) {
if (currentFieldName.equals("fields")) {
if (token == XContentParser.Token.START_ARRAY) {
while (parser.nextToken() != XContentParser.Token.END_ARRAY) {
fields.add(parser.text());
}
} else {
throw new ElasticSearchParseException(
"The parameter fields must be given as an array! Use syntax : \"fields\" : [\"field1\", \"field2\",...]");
}
} else if (currentFieldName.equals("offsets")) {
termVectorRequest.offsets(parser.booleanValue());
} else if (currentFieldName.equals("positions")) {
termVectorRequest.positions(parser.booleanValue());
} else if (currentFieldName.equals("payloads")) {
termVectorRequest.payloads(parser.booleanValue());
} else if (currentFieldName.equals("term_statistics") || currentFieldName.equals("termStatistics")) {
termVectorRequest.termStatistics(parser.booleanValue());
} else if (currentFieldName.equals("field_statistics") || currentFieldName.equals("fieldStatistics")) {
termVectorRequest.fieldStatistics(parser.booleanValue());
} else if ("_index".equals(currentFieldName)) { // the following is important for multi request parsing.
termVectorRequest.index = parser.text();
} else if ("_type".equals(currentFieldName)) {
termVectorRequest.type = parser.text();
} else if ("_id".equals(currentFieldName)) {
termVectorRequest.id = parser.text();
} else if ("_routing".equals(currentFieldName) || "routing".equals(currentFieldName)) {
termVectorRequest.routing = parser.text();
} else {
throw new ElasticSearchParseException("The parameter " + currentFieldName
+ " is not valid for term vector request!");
}
}
}
if (fields.size() > 0) {
String[] fieldsAsArray = new String[fields.size()];
termVectorRequest.selectedFields(fields.toArray(fieldsAsArray));
}
}
}
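A minimal usage sketch of the new static parser, mirroring how the REST layer (below) feeds it; the index/type/id values and the body are made up:

BytesReference body = new BytesArray(
        "{\"fields\" : [\"title\", \"desc\"], \"offsets\" : true, \"positions\" : false}");
TermVectorRequest request = new TermVectorRequest("test", "type1", "1");
XContentParser parser = XContentFactory.xContent(body).createParser(body);
try {
    TermVectorRequest.parseRequest(request, parser); // fills fields and flags from the body
} finally {
    parser.close(); // the caller owns the parser, as in RestTermVectorAction below
}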

View File

@ -320,10 +320,11 @@ public class TermVectorResponse extends ActionResponse implements ToXContent {
this.exists = exists;
}
public void setFields(Fields fields, Set<String> selectedFields, EnumSet<Flag> flags, Fields topLevelFields) throws IOException {
public void setFields(Fields termVectorsByField, Set<String> selectedFields, EnumSet<Flag> flags, Fields topLevelFields) throws IOException {
TermVectorWriter tvw = new TermVectorWriter(this);
if (fields != null) {
tvw.setFields(fields, selectedFields, flags, topLevelFields);
if (termVectorsByField != null) {
tvw.setFields(termVectorsByField, selectedFields, flags, topLevelFields);
}
}
@ -342,4 +343,16 @@ public class TermVectorResponse extends ActionResponse implements ToXContent {
}
public String getIndex() {
return index;
}
public String getType() {
return type;
}
public String getId() {
return id;
}
}
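With the new accessors a caller can tell which document the returned vectors belong to; a rough consumption sketch (field name assumed):

TermVectorResponse response = client().prepareTermVector("test", "type1", "1")
        .setPositions(true).setOffsets(true).execute().actionGet();
if (response.isExists()) {
    String location = response.getIndex() + "/" + response.getType() + "/" + response.getId();
    Terms terms = response.getFields().terms("title"); // null if the field was not returned
}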

View File

@ -18,22 +18,18 @@
*/
package org.elasticsearch.action.termvector;
import org.apache.lucene.index.*;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.action.termvector.TermVectorRequest.Flag;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.io.stream.BytesStreamOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.EnumSet;
import java.util.List;
import java.util.Set;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.action.termvector.TermVectorRequest.Flag;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.io.stream.BytesStreamOutput;
// package only - this is an internal class!
final class TermVectorWriter {
final List<String> fields = new ArrayList<String>();
@ -49,30 +45,30 @@ final class TermVectorWriter {
response = termVectorResponse;
}
void setFields(Fields fields, Set<String> selectedFields, EnumSet<Flag> flags, Fields topLevelFields) throws IOException {
void setFields(Fields termVectorsByField, Set<String> selectedFields, EnumSet<Flag> flags, Fields topLevelFields) throws IOException {
int numFieldsWritten = 0;
TermsEnum iterator = null;
DocsAndPositionsEnum docsAndPosEnum = null;
DocsEnum docsEnum = null;
TermsEnum topLevelIterator = null;
for (String field : fields) {
for (String field : termVectorsByField) {
if ((selectedFields != null) && (!selectedFields.contains(field))) {
continue;
}
Terms terms = fields.terms(field);
Terms fieldTermVector = termVectorsByField.terms(field);
Terms topLevelTerms = topLevelFields.terms(field);
topLevelIterator = topLevelTerms.iterator(topLevelIterator);
boolean positions = flags.contains(Flag.Positions) && terms.hasPositions();
boolean offsets = flags.contains(Flag.Offsets) && terms.hasOffsets();
boolean payloads = flags.contains(Flag.Payloads) && terms.hasPayloads();
startField(field, terms.size(), positions, offsets, payloads);
boolean positions = flags.contains(Flag.Positions) && fieldTermVector.hasPositions();
boolean offsets = flags.contains(Flag.Offsets) && fieldTermVector.hasOffsets();
boolean payloads = flags.contains(Flag.Payloads) && fieldTermVector.hasPayloads();
startField(field, fieldTermVector.size(), positions, offsets, payloads);
if (flags.contains(Flag.FieldStatistics)) {
writeFieldStatistics(topLevelTerms);
}
iterator = terms.iterator(iterator);
iterator = fieldTermVector.iterator(iterator);
final boolean useDocsAndPos = positions || offsets || payloads;
while (iterator.next() != null) { // iterate all terms of the
// current field
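The surrounding loop follows the standard Lucene 4.x Fields/TermsEnum idiom; stripped to its core it looks roughly like this (per-term handling elided):

for (String field : termVectorsByField) {            // Fields implements Iterable<String>
    Terms terms = termVectorsByField.terms(field);
    TermsEnum termsEnum = terms.iterator(null);      // pass a previous enum to reuse it
    BytesRef term;
    while ((term = termsEnum.next()) != null) {
        long ttf = termsEnum.totalTermFreq();        // -1 when the statistic is not stored
        // write term bytes, frequencies, positions/offsets/payloads here
    }
}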

View File

@ -19,36 +19,28 @@
package org.elasticsearch.rest.action.termvector;
import static org.elasticsearch.rest.RestRequest.Method.GET;
import static org.elasticsearch.rest.RestRequest.Method.POST;
import static org.elasticsearch.rest.RestStatus.OK;
import static org.elasticsearch.rest.action.support.RestXContentBuilder.restContentBuilder;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.StringTokenizer;
import org.elasticsearch.ElasticSearchParseException;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.termvector.TermVectorRequest;
import org.elasticsearch.action.termvector.TermVectorResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.rest.BaseRestHandler;
import org.elasticsearch.rest.RestChannel;
import org.elasticsearch.rest.RestController;
import org.elasticsearch.rest.RestRequest;
import org.elasticsearch.rest.XContentRestResponse;
import org.elasticsearch.rest.XContentThrowableRestResponse;
import org.elasticsearch.rest.*;
import org.elasticsearch.rest.action.support.RestXContentBuilder;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import static org.elasticsearch.rest.RestRequest.Method.GET;
import static org.elasticsearch.rest.RestRequest.Method.POST;
import static org.elasticsearch.rest.RestStatus.BAD_REQUEST;
import static org.elasticsearch.rest.RestStatus.OK;
import static org.elasticsearch.rest.action.support.RestXContentBuilder.restContentBuilder;
/**
* This class parses the json request and translates it into a
@ -70,19 +62,24 @@ public class RestTermVectorAction extends BaseRestHandler {
termVectorRequest.routing(request.param("routing"));
termVectorRequest.parent(request.param("parent"));
termVectorRequest.preference(request.param("preference"));
XContentParser parser = null;
if (request.hasContent()) {
try {
parseRequest(request.content(), termVectorRequest);
parser = XContentFactory.xContent(request.content()).createParser(request.content());
TermVectorRequest.parseRequest(termVectorRequest, parser);
} catch (IOException e) {
try {
XContentBuilder builder = RestXContentBuilder.restContentBuilder(request);
channel.sendResponse(new XContentRestResponse(request, BAD_REQUEST, builder.startObject().field("error", e.getMessage()).endObject()));
} catch (IOException e1) {
Set<String> selectedFields = termVectorRequest.selectedFields();
String fieldString = "all";
if (selectedFields != null) {
Strings.arrayToDelimitedString(termVectorRequest.selectedFields().toArray(new String[1]), " ");
logger.warn("Failed to send response", e1);
return;
}
} finally {
if (parser != null) {
parser.close();
}
logger.error("Something is wrong with your parameters for the term vector request. I am using parameters "
+ "\n positions :" + termVectorRequest.positions() + "\n offsets :" + termVectorRequest.offsets() + "\n payloads :"
+ termVectorRequest.payloads() + "\n termStatistics :" + termVectorRequest.termStatistics()
+ "\n fieldStatistics :" + termVectorRequest.fieldStatistics() + "\nfields " + fieldString, (Object) null);
}
}
readURIParameters(termVectorRequest, request);
@ -142,47 +139,4 @@ public class RestTermVectorAction extends BaseRestHandler {
}
}
static public void parseRequest(BytesReference cont, TermVectorRequest termVectorRequest) throws IOException {
XContentParser parser = XContentFactory.xContent(cont).createParser(cont);
try {
XContentParser.Token token;
String currentFieldName = null;
List<String> fields = new ArrayList<String>();
while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
if (token == XContentParser.Token.FIELD_NAME) {
currentFieldName = parser.currentName();
} else if (currentFieldName != null) {
if (currentFieldName.equals("fields")) {
if (token == XContentParser.Token.START_ARRAY) {
while (parser.nextToken() != XContentParser.Token.END_ARRAY) {
fields.add(parser.text());
}
} else {
throw new ElasticSearchParseException(
"The parameter fields must be given as an array! Use syntax : \"fields\" : [\"field1\", \"field2\",...]");
}
} else if (currentFieldName.equals("offsets")) {
termVectorRequest.offsets(parser.booleanValue());
} else if (currentFieldName.equals("positions")) {
termVectorRequest.positions(parser.booleanValue());
} else if (currentFieldName.equals("payloads")) {
termVectorRequest.payloads(parser.booleanValue());
} else if (currentFieldName.equals("term_statistics") || currentFieldName.equals("termStatistics")) {
termVectorRequest.termStatistics(parser.booleanValue());
} else if (currentFieldName.equals("field_statistics") || currentFieldName.equals("fieldStatistics")) {
termVectorRequest.fieldStatistics(parser.booleanValue());
} else {
throw new ElasticSearchParseException("The parameter " + currentFieldName
+ " is not valid for term vector request!");
}
}
}
String[] fieldsAsArray = new String[fields.size()];
termVectorRequest.selectedFields(fields.toArray(fieldsAsArray));
} finally {
parser.close();
}
}
}

View File

@ -323,6 +323,11 @@ public abstract class AbstractSharedClusterTest extends ElasticsearchTestCase {
return client().prepareIndex(index, type).setSource(source).execute().actionGet();
}
protected IndexResponse index(String index, String type, String id, Map<String, Object> source) {
return client().prepareIndex(index, type, id).setSource(source).execute().actionGet();
}
protected GetResponse get(String index, String type, String id) {
return client().prepareGet(index, type, id).execute().actionGet();
}

View File

@ -0,0 +1,411 @@
package org.elasticsearch.test.integration.termvectors;
/*
* Licensed to ElasticSearch under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. ElasticSearch licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.document.*;
import org.apache.lucene.index.*;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.elasticsearch.Version;
import org.elasticsearch.action.termvector.TermVectorRequestBuilder;
import org.elasticsearch.action.termvector.TermVectorResponse;
import org.elasticsearch.common.inject.internal.Join;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.indices.IndexMissingException;
import org.elasticsearch.test.integration.AbstractSharedClusterTest;
import java.io.IOException;
import java.io.Reader;
import java.util.*;
import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
import static org.hamcrest.Matchers.equalTo;
public abstract class AbstractTermVectorTests extends AbstractSharedClusterTest {
protected static class TestFieldSetting {
final public String name;
final public boolean storedOffset;
final public boolean storedPayloads;
final public boolean storedPositions;
public TestFieldSetting(String name, boolean storedOffset, boolean storedPayloads, boolean storedPositions) {
this.name = name;
this.storedOffset = storedOffset;
this.storedPayloads = storedPayloads;
this.storedPositions = storedPositions;
}
public void addToMappings(XContentBuilder mappingsBuilder) throws IOException {
mappingsBuilder.startObject(name);
mappingsBuilder.field("type", "string");
String tv_settings;
if (storedPositions && storedOffset && storedPayloads) {
tv_settings = "with_positions_offsets_payloads";
} else if (storedPositions && storedOffset) {
tv_settings = "with_positions_offsets";
} else if (storedPayloads) {
tv_settings = "with_positions_payloads";
} else if (storedPositions) {
tv_settings = "with_positions";
} else if (storedOffset) {
tv_settings = "with_offsets";
} else {
tv_settings = "yes";
}
mappingsBuilder.field("term_vector", tv_settings);
if (storedPayloads) {
mappingsBuilder.field("analyzer", "tv_test");
}
mappingsBuilder.endObject();
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder("name: ").append(name).append(" tv_with:");
if (storedPayloads) {
sb.append("payloads,");
}
if (storedOffset) {
sb.append("offsets,");
}
if (storedPositions) {
sb.append("positions,");
}
return sb.toString();
}
}
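Given a mapping builder, addToMappings translates the three booleans into one of Elasticsearch's term_vector mapping values; roughly (output formatting assumed):

XContentBuilder builder = jsonBuilder().startObject();
new TestFieldSetting("field_with_positions_offsets_payloads", true, true, true).addToMappings(builder);
builder.endObject();
// yields approximately:
// {"field_with_positions_offsets_payloads" : {"type" : "string",
//    "term_vector" : "with_positions_offsets_payloads", "analyzer" : "tv_test"}}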
protected static class TestDoc {
final public String id;
final public TestFieldSetting[] fieldSettings;
final public String[] fieldContent;
public String index = "test";
public String type = "type1";
public TestDoc(String id, TestFieldSetting[] fieldSettings, String[] fieldContent) {
this.id = id;
this.fieldSettings = fieldSettings;
this.fieldContent = fieldContent;
}
public TestDoc index(String index) {
this.index = index;
return this;
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder("index:").append(index).append(" type:").append(type).append(" id:").append(id);
for (int i = 0; i < fieldSettings.length; i++) {
TestFieldSetting f = fieldSettings[i];
sb.append("\n").append("Field: ").append(f).append("\n content:").append(fieldContent[i]);
}
sb.append("\n");
return sb.toString();
}
}
protected static class TestConfig {
final public TestDoc doc;
final public String[] selectedFields;
final public boolean requestPositions;
final public boolean requestOffsets;
final public boolean requestPayloads;
public Class expectedException = null;
public TestConfig(TestDoc doc, String[] selectedFields, boolean requestPositions, boolean requestOffsets, boolean requestPayloads) {
this.doc = doc;
this.selectedFields = selectedFields;
this.requestPositions = requestPositions;
this.requestOffsets = requestOffsets;
this.requestPayloads = requestPayloads;
}
public TestConfig expectedException(Class exceptionClass) {
this.expectedException = exceptionClass;
return this;
}
@Override
public String toString() {
String requested = "";
if (requestOffsets) {
requested += "offsets,";
}
if (requestPositions) {
requested += "position,";
}
if (requestPayloads) {
requested += "payload,";
}
Locale aLocale = new Locale("en", "US");
return String.format(aLocale, "(doc: %s\n requested: %s, fields: %s)", doc, requested,
selectedFields == null ? "NULL" : Join.join(",", selectedFields));
}
}
protected void createIndexBasedOnFieldSettings(TestFieldSetting[] fieldSettings, int number_of_shards) throws IOException {
wipeIndex("test");
XContentBuilder mappingBuilder = jsonBuilder();
mappingBuilder.startObject().startObject("type1").startObject("properties");
for (TestFieldSetting field : fieldSettings) {
field.addToMappings(mappingBuilder);
}
ImmutableSettings.Builder settings = ImmutableSettings.settingsBuilder()
.put("index.analysis.analyzer.tv_test.tokenizer", "standard")
.putArray("index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase");
if (number_of_shards > 0) {
settings.put("number_of_shards", number_of_shards);
}
mappingBuilder.endObject().endObject().endObject();
run(prepareCreate("test").addMapping("type1", mappingBuilder).setSettings(settings));
ensureYellow();
}
/**
* Generate test documents. The returned documents are already indexed.
*/
protected TestDoc[] generateTestDocs(int numberOfDocs, TestFieldSetting[] fieldSettings) {
String[] fieldContentOptions = new String[] { "Generating a random permutation of a sequence (such as when shuffling cards).",
"Selecting a random sample of a population (important in statistical sampling).",
"Allocating experimental units via random assignment to a treatment or control condition.",
"Generating random numbers: see Random number generation.",
"Transforming a data stream (such as when using a scrambler in telecommunications)." };
String[] contentArray = new String[fieldSettings.length];
Map<String, Object> docSource = new HashMap<String, Object>();
TestDoc[] testDocs = new TestDoc[numberOfDocs];
for (int docId = 0; docId < numberOfDocs; docId++) {
docSource.clear();
for (int i = 0; i < contentArray.length; i++) {
contentArray[i] = fieldContentOptions[randomInt(fieldContentOptions.length - 1)];
docSource.put(fieldSettings[i].name, contentArray[i]);
}
TestDoc doc = new TestDoc(Integer.toString(docId), fieldSettings, contentArray.clone());
index(doc.index, doc.type, doc.id, docSource);
testDocs[docId] = doc;
}
refresh();
return testDocs;
}
protected TestConfig[] generateTestConfigs(int numberOfTests, TestDoc[] testDocs, TestFieldSetting[] fieldSettings) {
ArrayList<TestConfig> configs = new ArrayList<TestConfig>();
for (int i = 0; i < numberOfTests; i++) {
ArrayList<String> selectedFields = null;
if (randomBoolean()) {
// used field selection
selectedFields = new ArrayList<String>();
if (randomBoolean()) {
selectedFields.add("Doesnt_exist"); // this will be ignored.
}
for (TestFieldSetting field : fieldSettings)
if (randomBoolean()) {
selectedFields.add(field.name);
}
if (selectedFields.size() == 0) {
selectedFields = null; // 0 length set is not supported.
}
}
TestConfig config = new TestConfig(testDocs[randomInt(testDocs.length - 1)], selectedFields == null ? null
: selectedFields.toArray(new String[] {}), randomBoolean(), randomBoolean(), randomBoolean());
configs.add(config);
}
// always adds a test that fails
configs.add(new TestConfig(new TestDoc("doesnt_exist", new TestFieldSetting[] {}, new String[] {}).index("doesn't_exist"),
new String[] { "doesnt_exist" }, true, true, true).expectedException(IndexMissingException.class));
refresh();
return configs.toArray(new TestConfig[] {});
}
protected TestFieldSetting[] getFieldSettings() {
return new TestFieldSetting[] { new TestFieldSetting("field_with_positions", false, false, true),
new TestFieldSetting("field_with_offsets", true, false, false),
new TestFieldSetting("field_with_only_tv", false, false, false),
new TestFieldSetting("field_with_positions_offsets", false, false, true),
new TestFieldSetting("field_with_positions_payloads", false, true, true)
};
}
protected DirectoryReader indexDocsWithLucene(TestDoc[] testDocs) throws IOException {
Map<String, Analyzer> mapping = new HashMap<String, Analyzer>();
for (TestFieldSetting field : testDocs[0].fieldSettings) {
if (field.storedPayloads) {
mapping.put(field.name, new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new StandardTokenizer(Version.CURRENT.luceneVersion, reader);
TokenFilter filter = new LowerCaseFilter(Version.CURRENT.luceneVersion, tokenizer);
filter = new TypeAsPayloadTokenFilter(filter);
return new TokenStreamComponents(tokenizer, filter);
}
});
}
}
PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new StandardAnalyzer(Version.CURRENT.luceneVersion), mapping);
Directory dir = new RAMDirectory();
IndexWriterConfig conf = new IndexWriterConfig(Version.CURRENT.luceneVersion, wrapper);
conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
IndexWriter writer = new IndexWriter(dir, conf);
for (TestDoc doc : testDocs) {
Document d = new Document();
d.add(new Field("id", doc.id, StringField.TYPE_STORED));
for (int i = 0; i < doc.fieldContent.length; i++) {
FieldType type = new FieldType(TextField.TYPE_STORED);
TestFieldSetting fieldSetting = doc.fieldSettings[i];
type.setStoreTermVectorOffsets(fieldSetting.storedOffset);
type.setStoreTermVectorPayloads(fieldSetting.storedPayloads);
type.setStoreTermVectorPositions(fieldSetting.storedPositions || fieldSetting.storedPayloads || fieldSetting.storedOffset);
type.setStoreTermVectors(true);
type.freeze();
d.add(new Field(fieldSetting.name, doc.fieldContent[i], type));
}
writer.updateDocument(new Term("id", doc.id), d);
writer.commit();
}
writer.close();
return DirectoryReader.open(dir);
}
protected void validateResponse(TermVectorResponse esResponse, Fields luceneFields, TestConfig testConfig) throws IOException {
TestDoc testDoc = testConfig.doc;
HashSet<String> selectedFields = testConfig.selectedFields == null ? null : new HashSet<String>(
Arrays.asList(testConfig.selectedFields));
Fields esTermVectorFields = esResponse.getFields();
for (TestFieldSetting field : testDoc.fieldSettings) {
Terms esTerms = esTermVectorFields.terms(field.name);
if (selectedFields != null && !selectedFields.contains(field.name)) {
assertNull(esTerms);
continue;
}
assertNotNull(esTerms);
Terms luceneTerms = luceneFields.terms(field.name);
TermsEnum esTermEnum = esTerms.iterator(null);
TermsEnum luceneTermEnum = luceneTerms.iterator(null);
while (esTermEnum.next() != null) {
assertNotNull(luceneTermEnum.next());
assertThat(esTermEnum.totalTermFreq(), equalTo(luceneTermEnum.totalTermFreq()));
DocsAndPositionsEnum esDocsPosEnum = esTermEnum.docsAndPositions(null, null, 0);
DocsAndPositionsEnum luceneDocsPosEnum = luceneTermEnum.docsAndPositions(null, null, 0);
if (luceneDocsPosEnum == null) {
// test we expect that...
assertFalse(field.storedOffset);
assertFalse(field.storedPayloads);
assertFalse(field.storedPositions);
continue;
}
String currentTerm = esTermEnum.term().utf8ToString();
assertThat("Token mismatch for field: " + field.name, currentTerm, equalTo(luceneTermEnum.term().utf8ToString()));
esDocsPosEnum.nextDoc();
luceneDocsPosEnum.nextDoc();
int freq = esDocsPosEnum.freq();
assertThat(freq, equalTo(luceneDocsPosEnum.freq()));
for (int i = 0; i < freq; i++) {
String failDesc = " (field:" + field.name + " term:" + currentTerm + ")";
int lucenePos = luceneDocsPosEnum.nextPosition();
int esPos = esDocsPosEnum.nextPosition();
if (field.storedPositions && testConfig.requestPositions) {
assertThat("Position test failed" + failDesc, lucenePos, equalTo(esPos));
} else {
assertThat("Missing position test failed" + failDesc, esPos, equalTo(-1));
}
if (field.storedOffset && testConfig.requestOffsets) {
assertThat("Offset test failed" + failDesc, luceneDocsPosEnum.startOffset(), equalTo(esDocsPosEnum.startOffset()));
assertThat("Offset test failed" + failDesc, luceneDocsPosEnum.endOffset(), equalTo(esDocsPosEnum.endOffset()));
} else {
assertThat("Missing offset test failed" + failDesc, esDocsPosEnum.startOffset(), equalTo(-1));
assertThat("Missing offset test failed" + failDesc, esDocsPosEnum.endOffset(), equalTo(-1));
}
if (field.storedPayloads && testConfig.requestPayloads) {
assertThat("Payload test failed" + failDesc, luceneDocsPosEnum.getPayload(), equalTo(esDocsPosEnum.getPayload()));
} else {
assertThat("Missing payload test failed" + failDesc, esDocsPosEnum.getPayload(), equalTo(null));
}
}
}
assertNull("Es returned terms are done but lucene isn't", luceneTermEnum.next());
}
}
protected TermVectorRequestBuilder getRequestForConfig(TestConfig config) {
return client().prepareTermVector(config.doc.index, config.doc.type, config.doc.id).setPayloads(config.requestPayloads)
.setOffsets(config.requestOffsets).setPositions(config.requestPositions).setFieldStatistics(true).setTermStatistics(true)
.setSelectedFields(config.selectedFields);
}
protected Fields getTermVectorsFromLucene(DirectoryReader directoryReader, TestDoc doc) throws IOException {
IndexSearcher searcher = new IndexSearcher(directoryReader);
TopDocs search = searcher.search(new TermQuery(new Term("id", doc.id)), 1);
ScoreDoc[] scoreDocs = search.scoreDocs;
assert (scoreDocs.length == 1);
return directoryReader.getTermVectors(scoreDocs[0].doc);
}
}

View File

@ -25,12 +25,9 @@ import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.ElasticSearchException;
import org.elasticsearch.action.termvector.TermVectorRequest;
import org.elasticsearch.action.termvector.TermVectorRequestBuilder;
import org.elasticsearch.action.termvector.TermVectorResponse;
import org.elasticsearch.common.io.BytesStream;
import org.elasticsearch.common.io.stream.InputStreamStreamInput;
import org.elasticsearch.common.io.stream.OutputStreamStreamOutput;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentFactory;
@ -38,53 +35,13 @@ import org.elasticsearch.test.integration.AbstractSharedClusterTest;
import org.hamcrest.Matchers;
import org.junit.Test;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.Random;
import static org.hamcrest.Matchers.equalTo;
public class GetTermVectorTestsCheckDocFreq extends AbstractSharedClusterTest {
public class GetTermVectorCheckDocFreqTests extends AbstractSharedClusterTest {
@Test
public void streamRequest() throws IOException {
Random random = getRandom();
for (int i = 0; i < 10; i++) {
TermVectorRequest request = new TermVectorRequest("index", "type", "id");
request.offsets(random.nextBoolean());
request.fieldStatistics(random.nextBoolean());
request.payloads(random.nextBoolean());
request.positions(random.nextBoolean());
request.termStatistics(random.nextBoolean());
String parent = random.nextBoolean() ? "someParent" : null;
request.parent(parent);
String pref = random.nextBoolean() ? "somePreference" : null;
request.preference(pref);
// write
ByteArrayOutputStream outBuffer = new ByteArrayOutputStream();
OutputStreamStreamOutput out = new OutputStreamStreamOutput(outBuffer);
request.writeTo(out);
// read
ByteArrayInputStream esInBuffer = new ByteArrayInputStream(outBuffer.toByteArray());
InputStreamStreamInput esBuffer = new InputStreamStreamInput(esInBuffer);
TermVectorRequest req2 = new TermVectorRequest(null, null, null);
req2.readFrom(esBuffer);
assertThat(request.offsets(), equalTo(req2.offsets()));
assertThat(request.fieldStatistics(), equalTo(req2.fieldStatistics()));
assertThat(request.payloads(), equalTo(req2.payloads()));
assertThat(request.positions(), equalTo(req2.positions()));
assertThat(request.termStatistics(), equalTo(req2.termStatistics()));
assertThat(request.preference(), equalTo(pref));
assertThat(request.routing(), equalTo(parent));
}
}
@Test
public void testSimpleTermVectors() throws ElasticSearchException, IOException {

View File

@ -19,271 +19,37 @@
package org.elasticsearch.test.integration.termvectors;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.document.*;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.*;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.ElasticSearchException;
import org.elasticsearch.action.ActionFuture;
import org.elasticsearch.action.termvector.TermVectorRequest;
import org.elasticsearch.action.termvector.TermVectorRequest.Flag;
import org.elasticsearch.action.termvector.TermVectorRequestBuilder;
import org.elasticsearch.action.termvector.TermVectorResponse;
import org.elasticsearch.common.bytes.BytesArray;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.io.stream.InputStreamStreamInput;
import org.elasticsearch.common.io.stream.OutputStreamStreamOutput;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.index.mapper.MapperParsingException;
import org.elasticsearch.index.mapper.core.AbstractFieldMapper;
import org.elasticsearch.index.mapper.core.TypeParsers;
import org.elasticsearch.index.mapper.internal.AllFieldMapper;
import org.elasticsearch.rest.action.termvector.RestTermVectorAction;
import org.elasticsearch.test.integration.AbstractSharedClusterTest;
import org.hamcrest.Matchers;
import org.junit.Test;
import java.io.*;
import java.util.*;
import java.io.IOException;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertThrows;
import static org.hamcrest.Matchers.equalTo;
public class GetTermVectorTests extends AbstractSharedClusterTest {
@Test
public void streamTest() throws Exception {
TermVectorResponse outResponse = new TermVectorResponse("a", "b", "c");
outResponse.setExists(true);
writeStandardTermVector(outResponse);
// write
ByteArrayOutputStream outBuffer = new ByteArrayOutputStream();
OutputStreamStreamOutput out = new OutputStreamStreamOutput(outBuffer);
outResponse.writeTo(out);
// read
ByteArrayInputStream esInBuffer = new ByteArrayInputStream(outBuffer.toByteArray());
InputStreamStreamInput esBuffer = new InputStreamStreamInput(esInBuffer);
TermVectorResponse inResponse = new TermVectorResponse("a", "b", "c");
inResponse.readFrom(esBuffer);
// see if correct
checkIfStandardTermVector(inResponse);
outResponse = new TermVectorResponse("a", "b", "c");
writeEmptyTermVector(outResponse);
// write
outBuffer = new ByteArrayOutputStream();
out = new OutputStreamStreamOutput(outBuffer);
outResponse.writeTo(out);
// read
esInBuffer = new ByteArrayInputStream(outBuffer.toByteArray());
esBuffer = new InputStreamStreamInput(esInBuffer);
inResponse = new TermVectorResponse("a", "b", "c");
inResponse.readFrom(esBuffer);
assertTrue(inResponse.isExists());
public class GetTermVectorTests extends AbstractTermVectorTests {
}
private void checkIfStandardTermVector(TermVectorResponse inResponse) throws IOException {
Fields fields = inResponse.getFields();
assertThat(fields.terms("title"), Matchers.notNullValue());
assertThat(fields.terms("desc"), Matchers.notNullValue());
assertThat(fields.size(), equalTo(2));
}
private void writeEmptyTermVector(TermVectorResponse outResponse) throws IOException {
Directory dir = FSDirectory.open(new File("/tmp/foo"));
IndexWriterConfig conf = new IndexWriterConfig(TEST_VERSION_CURRENT, new StandardAnalyzer(TEST_VERSION_CURRENT));
conf.setOpenMode(OpenMode.CREATE);
IndexWriter writer = new IndexWriter(dir, conf);
FieldType type = new FieldType(TextField.TYPE_STORED);
type.setStoreTermVectorOffsets(true);
type.setStoreTermVectorPayloads(false);
type.setStoreTermVectorPositions(true);
type.setStoreTermVectors(true);
type.freeze();
Document d = new Document();
d.add(new Field("id", "abc", StringField.TYPE_STORED));
writer.updateDocument(new Term("id", "abc"), d);
writer.commit();
writer.close();
DirectoryReader dr = DirectoryReader.open(dir);
IndexSearcher s = new IndexSearcher(dr);
TopDocs search = s.search(new TermQuery(new Term("id", "abc")), 1);
ScoreDoc[] scoreDocs = search.scoreDocs;
int doc = scoreDocs[0].doc;
Fields fields = dr.getTermVectors(doc);
EnumSet<Flag> flags = EnumSet.of(Flag.Positions, Flag.Offsets);
outResponse.setFields(fields, null, flags, fields);
outResponse.setExists(true);
}
private void writeStandardTermVector(TermVectorResponse outResponse) throws IOException {
Directory dir = FSDirectory.open(new File("/tmp/foo"));
IndexWriterConfig conf = new IndexWriterConfig(TEST_VERSION_CURRENT, new StandardAnalyzer(TEST_VERSION_CURRENT));
conf.setOpenMode(OpenMode.CREATE);
IndexWriter writer = new IndexWriter(dir, conf);
FieldType type = new FieldType(TextField.TYPE_STORED);
type.setStoreTermVectorOffsets(true);
type.setStoreTermVectorPayloads(false);
type.setStoreTermVectorPositions(true);
type.setStoreTermVectors(true);
type.freeze();
Document d = new Document();
d.add(new Field("id", "abc", StringField.TYPE_STORED));
d.add(new Field("title", "the1 quick brown fox jumps over the1 lazy dog", type));
d.add(new Field("desc", "the1 quick brown fox jumps over the1 lazy dog", type));
writer.updateDocument(new Term("id", "abc"), d);
writer.commit();
writer.close();
DirectoryReader dr = DirectoryReader.open(dir);
IndexSearcher s = new IndexSearcher(dr);
TopDocs search = s.search(new TermQuery(new Term("id", "abc")), 1);
ScoreDoc[] scoreDocs = search.scoreDocs;
int doc = scoreDocs[0].doc;
Fields fields = dr.getTermVectors(doc);
EnumSet<Flag> flags = EnumSet.of(Flag.Positions, Flag.Offsets);
outResponse.setFields(fields, null, flags, fields);
}
private Fields buildWithLuceneAndReturnFields(String docId, String[] fields, String[] content, boolean[] withPositions,
boolean[] withOffsets, boolean[] withPayloads) throws IOException {
assert (fields.length == withPayloads.length);
assert (content.length == withPayloads.length);
assert (withPositions.length == withPayloads.length);
assert (withOffsets.length == withPayloads.length);
Map<String, Analyzer> mapping = new HashMap<String, Analyzer>();
for (int i = 0; i < withPayloads.length; i++) {
if (withPayloads[i]) {
mapping.put(fields[i], new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
TokenFilter filter = new LowerCaseFilter(TEST_VERSION_CURRENT, tokenizer);
filter = new TypeAsPayloadTokenFilter(filter);
return new TokenStreamComponents(tokenizer, filter);
}
});
}
}
PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new StandardAnalyzer(TEST_VERSION_CURRENT), mapping);
Directory dir = FSDirectory.open(new File("/tmp/foo"));
IndexWriterConfig conf = new IndexWriterConfig(TEST_VERSION_CURRENT, wrapper);
conf.setOpenMode(OpenMode.CREATE);
IndexWriter writer = new IndexWriter(dir, conf);
Document d = new Document();
for (int i = 0; i < fields.length; i++) {
d.add(new Field("id", docId, StringField.TYPE_STORED));
FieldType type = new FieldType(TextField.TYPE_STORED);
type.setStoreTermVectorOffsets(withOffsets[i]);
type.setStoreTermVectorPayloads(withPayloads[i]);
type.setStoreTermVectorPositions(withPositions[i] || withOffsets[i] || withPayloads[i]);
type.setStoreTermVectors(true);
type.freeze();
d.add(new Field(fields[i], content[i], type));
writer.updateDocument(new Term("id", docId), d);
writer.commit();
}
writer.close();
DirectoryReader dr = DirectoryReader.open(dir);
IndexSearcher s = new IndexSearcher(dr);
TopDocs search = s.search(new TermQuery(new Term("id", docId)), 1);
ScoreDoc[] scoreDocs = search.scoreDocs;
assert (scoreDocs.length == 1);
int doc = scoreDocs[0].doc;
Fields returnFields = dr.getTermVectors(doc);
return returnFields;
}
@Test
public void testRestRequestParsing() throws Exception {
BytesReference inputBytes = new BytesArray(
" {\"fields\" : [\"a\", \"b\",\"c\"], \"offsets\":false, \"positions\":false, \"payloads\":true}");
TermVectorRequest tvr = new TermVectorRequest(null, null, null);
RestTermVectorAction.parseRequest(inputBytes, tvr);
Set<String> fields = tvr.selectedFields();
assertThat(fields.contains("a"), equalTo(true));
assertThat(fields.contains("b"), equalTo(true));
assertThat(fields.contains("c"), equalTo(true));
assertThat(tvr.offsets(), equalTo(false));
assertThat(tvr.positions(), equalTo(false));
assertThat(tvr.payloads(), equalTo(true));
String additionalFields = "b,c ,d, e ";
RestTermVectorAction.addFieldStringsFromParameter(tvr, additionalFields);
assertThat(tvr.selectedFields().size(), equalTo(5));
assertThat(fields.contains("d"), equalTo(true));
assertThat(fields.contains("e"), equalTo(true));
additionalFields = "";
RestTermVectorAction.addFieldStringsFromParameter(tvr, additionalFields);
inputBytes = new BytesArray(" {\"offsets\":false, \"positions\":false, \"payloads\":true}");
tvr = new TermVectorRequest(null, null, null);
RestTermVectorAction.parseRequest(inputBytes, tvr);
additionalFields = "";
RestTermVectorAction.addFieldStringsFromParameter(tvr, additionalFields);
assertThat(tvr.selectedFields(), equalTo(null));
additionalFields = "b,c ,d, e ";
RestTermVectorAction.addFieldStringsFromParameter(tvr, additionalFields);
assertThat(tvr.selectedFields().size(), equalTo(4));
}
@Test
public void testRestRequestParsingThrowsException() throws Exception {
BytesReference inputBytes = new BytesArray(
" {\"fields\" : \"a, b,c \", \"offsets\":false, \"positions\":false, \"payloads\":true, \"meaningless_term\":2}");
TermVectorRequest tvr = new TermVectorRequest(null, null, null);
boolean threwException = false;
try {
RestTermVectorAction.parseRequest(inputBytes, tvr);
} catch (Exception e) {
threwException = true;
}
assertThat(threwException, equalTo(true));
}
@Test
public void testNoSuchDoc() throws Exception {
run(addMapping(prepareCreate("test"), "type1", new Object[] { "field", "type", "string", "term_vector",
"with_positions_offsets_payloads" }));
run(addMapping(prepareCreate("test"), "type1", new Object[]{"field", "type", "string", "term_vector",
"with_positions_offsets_payloads"}));
ensureYellow();
@ -346,7 +112,7 @@ public class GetTermVectorTests extends AbstractSharedClusterTest {
public void testSimpleTermVectors() throws ElasticSearchException, IOException {
run(addMapping(prepareCreate("test"), "type1",
new Object[] { "field", "type", "string", "term_vector", "with_positions_offsets_payloads", "analyzer", "tv_test" })
new Object[]{"field", "type", "string", "term_vector", "with_positions_offsets_payloads", "analyzer", "tv_test"})
.setSettings(
ImmutableSettings.settingsBuilder().put("index.analysis.analyzer.tv_test.tokenizer", "whitespace")
.putArray("index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase")));
@ -359,11 +125,11 @@ public class GetTermVectorTests extends AbstractSharedClusterTest {
.endObject()).execute().actionGet();
refresh();
}
String[] values = { "brown", "dog", "fox", "jumps", "lazy", "over", "quick", "the" };
int[] freq = { 1, 1, 1, 1, 1, 1, 1, 2 };
int[][] pos = { { 2 }, { 8 }, { 3 }, { 4 }, { 7 }, { 5 }, { 1 }, { 0, 6 } };
int[][] startOffset = { { 10 }, { 40 }, { 16 }, { 20 }, { 35 }, { 26 }, { 4 }, { 0, 31 } };
int[][] endOffset = { { 15 }, { 43 }, { 19 }, { 25 }, { 39 }, { 30 }, { 9 }, { 3, 34 } };
String[] values = {"brown", "dog", "fox", "jumps", "lazy", "over", "quick", "the"};
int[] freq = {1, 1, 1, 1, 1, 1, 1, 2};
int[][] pos = {{2}, {8}, {3}, {4}, {7}, {5}, {1}, {0, 6}};
int[][] startOffset = {{10}, {40}, {16}, {20}, {35}, {26}, {4}, {0, 31}};
int[][] endOffset = {{15}, {43}, {19}, {25}, {39}, {30}, {9}, {3, 34}};
for (int i = 0; i < 10; i++) {
TermVectorRequestBuilder resp = client().prepareTermVector("test", "type1", Integer.toString(i)).setPayloads(true)
.setOffsets(true).setPositions(true).setSelectedFields();
@ -405,9 +171,8 @@ public class GetTermVectorTests extends AbstractSharedClusterTest {
@Test
public void testRandomSingleTermVectors() throws ElasticSearchException, IOException {
Random random = getRandom();
FieldType ft = new FieldType();
int config = random.nextInt(6);
int config = randomInt(6);
boolean storePositions = false;
boolean storeOffsets = false;
boolean storePayloads = false;
@ -451,7 +216,7 @@ public class GetTermVectorTests extends AbstractSharedClusterTest {
String optionString = AbstractFieldMapper.termVectorOptionsToString(ft);
run(addMapping(prepareCreate("test"), "type1",
new Object[] { "field", "type", "string", "term_vector", optionString, "analyzer", "tv_test" }).setSettings(
new Object[]{"field", "type", "string", "term_vector", optionString, "analyzer", "tv_test"}).setSettings(
ImmutableSettings.settingsBuilder().put("index.analysis.analyzer.tv_test.tokenizer", "whitespace")
.putArray("index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase")));
ensureYellow();
@ -463,15 +228,15 @@ public class GetTermVectorTests extends AbstractSharedClusterTest {
.endObject()).execute().actionGet();
refresh();
}
String[] values = { "brown", "dog", "fox", "jumps", "lazy", "over", "quick", "the" };
int[] freq = { 1, 1, 1, 1, 1, 1, 1, 2 };
int[][] pos = { { 2 }, { 8 }, { 3 }, { 4 }, { 7 }, { 5 }, { 1 }, { 0, 6 } };
int[][] startOffset = { { 10 }, { 40 }, { 16 }, { 20 }, { 35 }, { 26 }, { 4 }, { 0, 31 } };
int[][] endOffset = { { 15 }, { 43 }, { 19 }, { 25 }, { 39 }, { 30 }, { 9 }, { 3, 34 } };
String[] values = {"brown", "dog", "fox", "jumps", "lazy", "over", "quick", "the"};
int[] freq = {1, 1, 1, 1, 1, 1, 1, 2};
int[][] pos = {{2}, {8}, {3}, {4}, {7}, {5}, {1}, {0, 6}};
int[][] startOffset = {{10}, {40}, {16}, {20}, {35}, {26}, {4}, {0, 31}};
int[][] endOffset = {{15}, {43}, {19}, {25}, {39}, {30}, {9}, {3, 34}};
boolean isPayloadRequested = random.nextBoolean();
boolean isOffsetRequested = random.nextBoolean();
boolean isPositionsRequested = random.nextBoolean();
boolean isPayloadRequested = randomBoolean();
boolean isOffsetRequested = randomBoolean();
boolean isPositionsRequested = randomBoolean();
String infoString = createInfoString(isPositionsRequested, isOffsetRequested, isPayloadRequested, optionString);
for (int i = 0; i < 10; i++) {
TermVectorRequestBuilder resp = client().prepareTermVector("test", "type1", Integer.toString(i))
@ -552,147 +317,31 @@ public class GetTermVectorTests extends AbstractSharedClusterTest {
}
@Test
public void testDuellESLucene() throws Exception {
public void testDuelESLucene() throws Exception {
TestFieldSetting[] testFieldSettings = getFieldSettings();
createIndexBasedOnFieldSettings(testFieldSettings, -1);
TestDoc[] testDocs = generateTestDocs(5, testFieldSettings);
String[] fieldNames = { "field_that_should_not_be_requested", "field_with_positions", "field_with_offsets", "field_with_only_tv",
"field_with_positions_offsets", "field_with_positions_payloads" };
run(addMapping(prepareCreate("test"), "type1",
new Object[] { fieldNames[0], "type", "string", "term_vector", "with_positions_offsets" },
new Object[] { fieldNames[1], "type", "string", "term_vector", "with_positions" },
new Object[] { fieldNames[2], "type", "string", "term_vector", "with_offsets" },
new Object[] { fieldNames[3], "type", "string", "store_term_vectors", "yes" },
new Object[] { fieldNames[4], "type", "string", "term_vector", "with_positions_offsets" },
new Object[] { fieldNames[5], "type", "string", "term_vector", "with_positions_payloads", "analyzer", "tv_test" })
.setSettings(
ImmutableSettings.settingsBuilder().put("index.analysis.analyzer.tv_test.tokenizer", "standard")
.putArray("index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase")));
// for (int i=0;i<testDocs.length;i++)
// logger.info("Doc: {}",testDocs[i]);
DirectoryReader directoryReader = indexDocsWithLucene(testDocs);
TestConfig[] testConfigs = generateTestConfigs(20, testDocs, testFieldSettings);
ensureYellow();
// would also work with XContentBuilder xcb = new XContentBuilder();
// now, create the same thing with lucene and see if the returned stuff
// is the same
String[] fieldContent = { "the quick shard jumps over the stupid brain", "here is another field",
"And yet another field withut any use.", "I am out of ideas on what to type here.",
"The last field for which offsets are stored but not positons.",
"The last field for which offsets are stored but not positons." };
boolean[] storeOffsets = { true, false, true, false, true, false };
boolean[] storePositions = { true, true, false, false, true, true };
boolean[] storePayloads = { false, false, false, false, false, true };
Map<String, Object> testSource = new HashMap<String, Object>();
for (int i = 0; i < fieldNames.length; i++) {
testSource.put(fieldNames[i], fieldContent[i]);
}
client().prepareIndex("test", "type1", "1").setSource(testSource).execute().actionGet();
refresh();
String[] selectedFields = { fieldNames[1], fieldNames[2], fieldNames[3], fieldNames[4], fieldNames[5] };
testForConfig(fieldNames, fieldContent, storeOffsets, storePositions, storePayloads, selectedFields, false, false, false);
testForConfig(fieldNames, fieldContent, storeOffsets, storePositions, storePayloads, selectedFields, true, false, false);
testForConfig(fieldNames, fieldContent, storeOffsets, storePositions, storePayloads, selectedFields, false, true, false);
testForConfig(fieldNames, fieldContent, storeOffsets, storePositions, storePayloads, selectedFields, true, true, false);
testForConfig(fieldNames, fieldContent, storeOffsets, storePositions, storePayloads, selectedFields, true, false, true);
testForConfig(fieldNames, fieldContent, storeOffsets, storePositions, storePayloads, selectedFields, true, true, true);
}
private void testForConfig(String[] fieldNames, String[] fieldContent, boolean[] storeOffsets, boolean[] storePositions,
boolean[] storePayloads, String[] selectedFields, boolean withPositions, boolean withOffsets, boolean withPayloads)
throws IOException {
TermVectorRequestBuilder resp = client().prepareTermVector("test", "type1", "1").setPayloads(withPayloads).setOffsets(withOffsets)
.setPositions(withPositions).setFieldStatistics(true).setTermStatistics(true).setSelectedFields(selectedFields);
TermVectorResponse response = resp.execute().actionGet();
// build the same with lucene and compare the Fields
Fields luceneFields = buildWithLuceneAndReturnFields("1", fieldNames, fieldContent, storePositions, storeOffsets, storePayloads);
HashMap<String, Boolean> storeOfsetsMap = new HashMap<String, Boolean>();
HashMap<String, Boolean> storePositionsMap = new HashMap<String, Boolean>();
HashMap<String, Boolean> storePayloadsMap = new HashMap<String, Boolean>();
for (int i = 0; i < storePositions.length; i++) {
storeOfsetsMap.put(fieldNames[i], storeOffsets[i]);
storePositionsMap.put(fieldNames[i], storePositions[i]);
storePayloadsMap.put(fieldNames[i], storePayloads[i]);
}
compareLuceneESTermVectorResults(response.getFields(), luceneFields, storePositionsMap, storeOfsetsMap, storePayloadsMap,
withPositions, withOffsets, withPayloads, selectedFields);
}
private void compareLuceneESTermVectorResults(Fields fields, Fields luceneFields, HashMap<String, Boolean> storePositionsMap,
HashMap<String, Boolean> storeOfsetsMap, HashMap<String, Boolean> storePayloadsMap, boolean getPositions, boolean getOffsets,
boolean getPayloads, String[] selectedFields) throws IOException {
HashSet<String> selectedFieldsMap = new HashSet<String>(Arrays.asList(selectedFields));
Iterator<String> luceneFieldNames = luceneFields.iterator();
assertThat(luceneFields.size(), equalTo(storeOfsetsMap.size()));
assertThat(fields.size(), equalTo(selectedFields.length));
while (luceneFieldNames.hasNext()) {
String luceneFieldName = luceneFieldNames.next();
if (!selectedFieldsMap.contains(luceneFieldName))
for (TestConfig test : testConfigs) {
try {
TermVectorRequestBuilder request = getRequestForConfig(test);
if (test.expectedException != null) {
assertThrows(request, test.expectedException);
continue;
Terms esTerms = fields.terms(luceneFieldName);
Terms luceneTerms = luceneFields.terms(luceneFieldName);
TermsEnum esTermEnum = esTerms.iterator(null);
TermsEnum luceneTermEnum = luceneTerms.iterator(null);
int numTerms = 0;
while (esTermEnum.next() != null) {
luceneTermEnum.next();
assertThat(esTermEnum.totalTermFreq(), equalTo(luceneTermEnum.totalTermFreq()));
DocsAndPositionsEnum esDocsPosEnum = esTermEnum.docsAndPositions(null, null, 0);
DocsAndPositionsEnum luceneDocsPosEnum = luceneTermEnum.docsAndPositions(null, null, 0);
if (luceneDocsPosEnum == null) {
assertThat(storeOfsetsMap.get(luceneFieldName), equalTo(false));
assertThat(storePayloadsMap.get(luceneFieldName), equalTo(false));
assertThat(storePositionsMap.get(luceneFieldName), equalTo(false));
continue;
}
numTerms++;
assertThat("failed for field: " + luceneFieldName, esTermEnum.term().utf8ToString(), equalTo(luceneTermEnum.term()
.utf8ToString()));
esDocsPosEnum.nextDoc();
luceneDocsPosEnum.nextDoc();
int freq = esDocsPosEnum.freq();
assertThat(freq, equalTo(luceneDocsPosEnum.freq()));
for (int i = 0; i < freq; i++) {
int lucenePos = luceneDocsPosEnum.nextPosition();
int esPos = esDocsPosEnum.nextPosition();
if (storePositionsMap.get(luceneFieldName) && getPositions) {
assertThat(luceneFieldName, lucenePos, equalTo(esPos));
} else {
assertThat(esPos, equalTo(-1));
}
if (storeOffsetsMap.get(luceneFieldName) && getOffsets) {
assertThat(luceneDocsPosEnum.startOffset(), equalTo(esDocsPosEnum.startOffset()));
assertThat(luceneDocsPosEnum.endOffset(), equalTo(esDocsPosEnum.endOffset()));
} else {
assertThat(esDocsPosEnum.startOffset(), equalTo(-1));
assertThat(esDocsPosEnum.endOffset(), equalTo(-1));
}
if (storePayloadsMap.get(luceneFieldName) && getPayloads) {
assertThat(luceneFieldName, luceneDocsPosEnum.getPayload(), equalTo(esDocsPosEnum.getPayload()));
} else {
assertThat(esDocsPosEnum.getPayload(), equalTo(null));
}
}
}
}
}
@Test
View File
@ -0,0 +1,256 @@
/*
* Licensed to ElasticSearch and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. ElasticSearch licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.test.unit.termvectors;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.*;
import org.apache.lucene.index.*;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.elasticsearch.action.termvector.TermVectorRequest;
import org.elasticsearch.action.termvector.TermVectorRequest.Flag;
import org.elasticsearch.action.termvector.TermVectorResponse;
import org.elasticsearch.common.bytes.BytesArray;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.io.stream.InputStreamStreamInput;
import org.elasticsearch.common.io.stream.OutputStreamStreamOutput;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.rest.action.termvector.RestTermVectorAction;
import org.hamcrest.Matchers;
import org.junit.Test;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.EnumSet;
import java.util.Set;
import static org.hamcrest.Matchers.equalTo;
public class TermVectorUnitTests extends org.elasticsearch.test.integration.ElasticsearchTestCase {
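// Round-trips a TermVectorResponse through its stream serialization and checks
// that the term vectors and the exists flag survive the trip.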
@Test
public void streamResponse() throws Exception {
TermVectorResponse outResponse = new TermVectorResponse("a", "b", "c");
outResponse.setExists(true);
writeStandardTermVector(outResponse);
// write
ByteArrayOutputStream outBuffer = new ByteArrayOutputStream();
OutputStreamStreamOutput out = new OutputStreamStreamOutput(outBuffer);
outResponse.writeTo(out);
// read
ByteArrayInputStream esInBuffer = new ByteArrayInputStream(outBuffer.toByteArray());
InputStreamStreamInput esBuffer = new InputStreamStreamInput(esInBuffer);
TermVectorResponse inResponse = new TermVectorResponse("a", "b", "c");
inResponse.readFrom(esBuffer);
// see if correct
checkIfStandardTermVector(inResponse);
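// repeat the round trip with a response that carries no term vectors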
outResponse = new TermVectorResponse("a", "b", "c");
writeEmptyTermVector(outResponse);
// write
outBuffer = new ByteArrayOutputStream();
out = new OutputStreamStreamOutput(outBuffer);
outResponse.writeTo(out);
// read
esInBuffer = new ByteArrayInputStream(outBuffer.toByteArray());
esBuffer = new InputStreamStreamInput(esInBuffer);
inResponse = new TermVectorResponse("a", "b", "c");
inResponse.readFrom(esBuffer);
assertTrue(inResponse.isExists());
}
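// Indexes a document whose only field is an untokenized "id", so the reader has
// no term vectors to hand to the response.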
private void writeEmptyTermVector(TermVectorResponse outResponse) throws IOException {
Directory dir = new RAMDirectory();
IndexWriterConfig conf = new IndexWriterConfig(TEST_VERSION_CURRENT, new StandardAnalyzer(TEST_VERSION_CURRENT));
conf.setOpenMode(OpenMode.CREATE);
IndexWriter writer = new IndexWriter(dir, conf);
Document d = new Document();
d.add(new Field("id", "abc", StringField.TYPE_STORED));
writer.updateDocument(new Term("id", "abc"), d);
writer.commit();
writer.close();
DirectoryReader dr = DirectoryReader.open(dir);
IndexSearcher s = new IndexSearcher(dr);
TopDocs search = s.search(new TermQuery(new Term("id", "abc")), 1);
ScoreDoc[] scoreDocs = search.scoreDocs;
int doc = scoreDocs[0].doc;
Fields fields = dr.getTermVectors(doc);
EnumSet<Flag> flags = EnumSet.of(Flag.Positions, Flag.Offsets);
outResponse.setFields(fields, null, flags, fields);
outResponse.setExists(true);
}
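// Builds an index with two text fields that store term vectors with positions
// and offsets (no payloads) and copies the resulting vectors into the response.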
private void writeStandardTermVector(TermVectorResponse outResponse) throws IOException {
Directory dir = FSDirectory.open(new File("/tmp/foo"));
IndexWriterConfig conf = new IndexWriterConfig(TEST_VERSION_CURRENT, new StandardAnalyzer(TEST_VERSION_CURRENT));
conf.setOpenMode(OpenMode.CREATE);
IndexWriter writer = new IndexWriter(dir, conf);
FieldType type = new FieldType(TextField.TYPE_STORED);
type.setStoreTermVectorOffsets(true);
type.setStoreTermVectorPayloads(false);
type.setStoreTermVectorPositions(true);
type.setStoreTermVectors(true);
type.freeze();
Document d = new Document();
d.add(new Field("id", "abc", StringField.TYPE_STORED));
d.add(new Field("title", "the1 quick brown fox jumps over the1 lazy dog", type));
d.add(new Field("desc", "the1 quick brown fox jumps over the1 lazy dog", type));
writer.updateDocument(new Term("id", "abc"), d);
writer.commit();
writer.close();
DirectoryReader dr = DirectoryReader.open(dir);
IndexSearcher s = new IndexSearcher(dr);
TopDocs search = s.search(new TermQuery(new Term("id", "abc")), 1);
ScoreDoc[] scoreDocs = search.scoreDocs;
int doc = scoreDocs[0].doc;
Fields termVectors = dr.getTermVectors(doc);
EnumSet<Flag> flags = EnumSet.of(Flag.Positions, Flag.Offsets);
outResponse.setFields(termVectors, null, flags, termVectors);
}
private void checkIfStandardTermVector(TermVectorResponse inResponse) throws IOException {
Fields fields = inResponse.getFields();
assertThat(fields.terms("title"), Matchers.notNullValue());
assertThat(fields.terms("desc"), Matchers.notNullValue());
assertThat(fields.size(), equalTo(2));
}
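// Parses a JSON request body into a TermVectorRequest and verifies the selected
// fields and flags; also exercises how addFieldStringsFromParameter merges a
// comma-separated field list into the current selection.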
@Test
public void testRestRequestParsing() throws Exception {
BytesReference inputBytes = new BytesArray(
" {\"fields\" : [\"a\", \"b\",\"c\"], \"offsets\":false, \"positions\":false, \"payloads\":true}");
TermVectorRequest tvr = new TermVectorRequest(null, null, null);
XContentParser parser = XContentFactory.xContent(XContentType.JSON).createParser(inputBytes);
TermVectorRequest.parseRequest(tvr, parser);
Set<String> fields = tvr.selectedFields();
assertThat(fields.contains("a"), equalTo(true));
assertThat(fields.contains("b"), equalTo(true));
assertThat(fields.contains("c"), equalTo(true));
assertThat(tvr.offsets(), equalTo(false));
assertThat(tvr.positions(), equalTo(false));
assertThat(tvr.payloads(), equalTo(true));
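// "b" and "c" are already selected; the assertions below show the parameter is
// trimmed and de-duplicated, so only "d" and "e" are added (5 fields in total)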
String additionalFields = "b,c ,d, e ";
RestTermVectorAction.addFieldStringsFromParameter(tvr, additionalFields);
assertThat(tvr.selectedFields().size(), equalTo(5));
assertThat(fields.contains("d"), equalTo(true));
assertThat(fields.contains("e"), equalTo(true));
additionalFields = "";
RestTermVectorAction.addFieldStringsFromParameter(tvr, additionalFields);
inputBytes = new BytesArray(" {\"offsets\":false, \"positions\":false, \"payloads\":true}");
tvr = new TermVectorRequest(null, null, null);
parser = XContentFactory.xContent(XContentType.JSON).createParser(inputBytes);
TermVectorRequest.parseRequest(tvr, parser);
additionalFields = "";
RestTermVectorAction.addFieldStringsFromParameter(tvr, additionalFields);
assertThat(tvr.selectedFields(), equalTo(null));
additionalFields = "b,c ,d, e ";
RestTermVectorAction.addFieldStringsFromParameter(tvr, additionalFields);
assertThat(tvr.selectedFields().size(), equalTo(4));
}
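// An unknown key in the request body ("meaningless_term") must cause
// parseRequest to throw.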
@Test
public void testRequestParsingThrowsException() throws Exception {
BytesReference inputBytes = new BytesArray(
" {\"fields\" : \"a, b,c \", \"offsets\":false, \"positions\":false, \"payloads\":true, \"meaningless_term\":2}");
TermVectorRequest tvr = new TermVectorRequest(null, null, null);
boolean threwException = false;
try {
XContentParser parser = XContentFactory.xContent(XContentType.JSON).createParser(inputBytes);
TermVectorRequest.parseRequest(tvr, parser);
} catch (Exception e) {
threwException = true;
}
assertThat(threwException, equalTo(true));
}
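// Round-trips requests with randomized flags through stream serialization and
// verifies each flag is restored on the receiving side.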
@Test
public void streamRequest() throws IOException {
for (int i = 0; i < 10; i++) {
TermVectorRequest request = new TermVectorRequest("index", "type", "id");
request.offsets(randomBoolean());
request.fieldStatistics(randomBoolean());
request.payloads(randomBoolean());
request.positions(randomBoolean());
request.termStatistics(randomBoolean());
String parent = randomBoolean() ? "someParent" : null;
request.parent(parent);
String pref = randomBoolean() ? "somePreference" : null;
request.preference(pref);
// write
ByteArrayOutputStream outBuffer = new ByteArrayOutputStream();
OutputStreamStreamOutput out = new OutputStreamStreamOutput(outBuffer);
request.writeTo(out);
// read
ByteArrayInputStream esInBuffer = new ByteArrayInputStream(outBuffer.toByteArray());
InputStreamStreamInput esBuffer = new InputStreamStreamInput(esInBuffer);
TermVectorRequest req2 = new TermVectorRequest(null, null, null);
req2.readFrom(esBuffer);
assertThat(request.offsets(), equalTo(req2.offsets()));
assertThat(request.fieldStatistics(), equalTo(req2.fieldStatistics()));
assertThat(request.payloads(), equalTo(req2.payloads()));
assertThat(request.positions(), equalTo(req2.positions()));
assertThat(request.termStatistics(), equalTo(req2.termStatistics()));
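// preference and routing are asserted on the outgoing request: setting a parent
// is expected to map it onto the routing value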
assertThat(request.preference(), equalTo(pref));
assertThat(request.routing(), equalTo(parent));
}
}
}