Refactor term vector api

This is necessary to allow adding a multi term vector request.
Boaz Leskes 2013-08-09 16:43:49 +02:00 committed by Britta Weber
parent a09f217b45
commit 18c71b16b5
10 changed files with 910 additions and 610 deletions

View File

@@ -19,18 +19,9 @@
package org.elasticsearch.action.termvector; package org.elasticsearch.action.termvector;
import static org.apache.lucene.util.ArrayUtil.grow; import gnu.trove.impl.Constants;
import gnu.trove.map.hash.TObjectLongHashMap; import gnu.trove.map.hash.TObjectLongHashMap;
import org.apache.lucene.index.*;
import java.io.IOException;
import java.util.Comparator;
import java.util.Iterator;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Bits; import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
@@ -38,16 +29,22 @@ import org.apache.lucene.util.RamUsageEstimator;
import org.elasticsearch.common.bytes.BytesReference; import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.io.stream.BytesStreamInput; import org.elasticsearch.common.io.stream.BytesStreamInput;
import java.io.IOException;
import java.util.Comparator;
import java.util.Iterator;
import static org.apache.lucene.util.ArrayUtil.grow;
/** /**
* This class represents the result of a {@link TermVectorRequest}. It works * This class represents the result of a {@link TermVectorRequest}. It works
* exactly like the {@link Fields} class except for one thing: It can return * exactly like the {@link Fields} class except for one thing: It can return
* offsets and payloads even if positions are not present. You must call * offsets and payloads even if positions are not present. You must call
* nextPosition() anyway to move the counter although this method only returns * nextPosition() anyway to move the counter although this method only returns
* <tt>-1</tt>, if no positions were returned by the {@link TermVectorRequest}. * <tt>-1</tt>, if no positions were returned by the {@link TermVectorRequest}.
* * <p/>
* The data is stored in two byte arrays ({@code headerRef} and * The data is stored in two byte arrays ({@code headerRef} and
* {@code termVectors}, both {@link BytesRef}) that have the following format: * {@code termVectors}, both {@link BytesRef}) that have the following format:
* <p> * <p/>
* {@code headerRef}: Stores offsets per field in the {@code termVectors} array * {@code headerRef}: Stores offsets per field in the {@code termVectors} array
* and some header information as {@link BytesRef}. Format is * and some header information as {@link BytesRef}. Format is
* <ul> * <ul>
@@ -64,9 +61,9 @@ import org.elasticsearch.common.io.stream.BytesStreamInput;
* <li>vint: offset in {@code termVectors} for last field</li> * <li>vint: offset in {@code termVectors} for last field</li>
* </ul> * </ul>
* </ul> * </ul>
* * <p/>
* termVectors: Stores the actual term vectors as a {@link BytesRef}. * termVectors: Stores the actual term vectors as a {@link BytesRef}.
* * <p/>
* Term vectors for each fields are stored in blocks, one for each field. The * Term vectors for each fields are stored in blocks, one for each field. The
* offsets in {@code headerRef} are used to find where the block for a field * offsets in {@code headerRef} are used to find where the block for a field
* starts. Each block begins with a * starts. Each block begins with a
@@ -84,14 +81,14 @@ import org.elasticsearch.common.io.stream.BytesStreamInput;
* <li>vint: number of documents in the shard that has an entry for this field * <li>vint: number of documents in the shard that has an entry for this field
* (docCount)</li> * (docCount)</li>
* </ul> * </ul>
* * <p/>
* After that, for each term it stores * After that, for each term it stores
* <ul> * <ul>
* <ul> * <ul>
* <li>vint: term lengths</li> * <li>vint: term lengths</li>
* <li>BytesRef: term name</li> * <li>BytesRef: term name</li>
* </ul> * </ul>
* * <p/>
* If term statistics are requested ({@code hasTermStatistics} is true, see * If term statistics are requested ({@code hasTermStatistics} is true, see
* {@code headerRef}): * {@code headerRef}):
* <ul> * <ul>
@@ -111,7 +108,6 @@ import org.elasticsearch.common.io.stream.BytesStreamInput;
* <li>BytesRef: payload_freqency (if payloads == true)</li> * <li>BytesRef: payload_freqency (if payloads == true)</li>
* <ul> * <ul>
* </ul> </ul> * </ul> </ul>
*
*/ */
public final class TermVectorFields extends Fields { public final class TermVectorFields extends Fields {
@@ -122,17 +118,14 @@ public final class TermVectorFields extends Fields {
final boolean hasFieldStatistic; final boolean hasFieldStatistic;
/** /**
* @param headerRef * @param headerRef Stores offsets per field in the {@code termVectors} and some
* Stores offsets per field in the {@code termVectors} and some * header information as {@link BytesRef}.
* header information as {@link BytesRef}. * @param termVectors Stores the actual term vectors as a {@link BytesRef}.
*
* @param termVectors
* Stores the actual term vectors as a {@link BytesRef}.
*
*/ */
public TermVectorFields(BytesReference headerRef, BytesReference termVectors) throws IOException { public TermVectorFields(BytesReference headerRef, BytesReference termVectors) throws IOException {
BytesStreamInput header = new BytesStreamInput(headerRef); BytesStreamInput header = new BytesStreamInput(headerRef);
fieldMap = new TObjectLongHashMap<String>(); fieldMap = new TObjectLongHashMap<String>(Constants.DEFAULT_CAPACITY, Constants.DEFAULT_LOAD_FACTOR, -1);
// here we read the header to fill the field offset map // here we read the header to fill the field offset map
String headerString = header.readString(); String headerString = header.readString();
assert headerString.equals("TV"); assert headerString.equals("TV");
@@ -159,6 +152,9 @@ public final class TermVectorFields extends Fields {
// first, find where in the termVectors bytes the actual term vector for // first, find where in the termVectors bytes the actual term vector for
// this field is stored // this field is stored
Long offset = fieldMap.get(field); Long offset = fieldMap.get(field);
if (offset.longValue() < 0) {
return null; // we don't have it.
}
final BytesStreamInput perFieldTermVectorInput = new BytesStreamInput(this.termVectors); final BytesStreamInput perFieldTermVectorInput = new BytesStreamInput(this.termVectors);
perFieldTermVectorInput.reset(); perFieldTermVectorInput.reset();
perFieldTermVectorInput.skip(offset.longValue()); perFieldTermVectorInput.skip(offset.longValue());
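Taken together, the Javadoc above and this constructor describe a two-buffer layout: a small header mapping field names to offsets, plus one decodable block per field in {@code termVectors}. The sketch below illustrates that lookup path using the same classes the diff touches; the header fields beyond the "TV" marker are elided here because the exact wire layout is not shown in this hunk.

```java
import gnu.trove.impl.Constants;
import gnu.trove.map.hash.TObjectLongHashMap;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.io.stream.BytesStreamInput;

import java.io.IOException;

// Illustrative sketch only; the real header also carries a version and the
// hasTermStatistics/hasFieldStatistic flags before the per-field offsets.
class TermVectorLookupSketch {
    private final TObjectLongHashMap<String> fieldMap =
            new TObjectLongHashMap<String>(Constants.DEFAULT_CAPACITY, Constants.DEFAULT_LOAD_FACTOR, -1);
    private final BytesReference termVectors;

    TermVectorLookupSketch(BytesReference headerRef, BytesReference termVectors) throws IOException {
        this.termVectors = termVectors;
        BytesStreamInput header = new BytesStreamInput(headerRef);
        String marker = header.readString();
        assert marker.equals("TV"); // magic marker written by TermVectorWriter
        // ... then, per field, something like:
        // fieldMap.put(header.readString(), header.readVLong());
    }

    BytesStreamInput blockFor(String field) throws IOException {
        long offset = fieldMap.get(field); // -1 (the configured no-entry value) means "not present"
        if (offset < 0) {
            return null; // mirrors the new null-return added to terms(field)
        }
        BytesStreamInput perField = new BytesStreamInput(termVectors);
        perField.skip(offset); // jump to the start of this field's block
        return perField;
    }
}
```

Configuring the Trove map with -1 as its no-entry value is what makes the new `if (offset.longValue() < 0) return null;` guard in terms(field) work: a lookup miss no longer yields the old default of 0, which would have silently pointed at the first field's block.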

View File

@@ -19,23 +19,22 @@
package org.elasticsearch.action.termvector; package org.elasticsearch.action.termvector;
import java.io.IOException; import com.google.common.collect.Sets;
import java.util.EnumSet; import org.elasticsearch.ElasticSearchParseException;
import java.util.HashSet;
import java.util.Set;
import org.elasticsearch.action.ActionRequestValidationException; import org.elasticsearch.action.ActionRequestValidationException;
import org.elasticsearch.action.ValidateActions; import org.elasticsearch.action.ValidateActions;
import org.elasticsearch.action.support.single.shard.SingleShardOperationRequest; import org.elasticsearch.action.support.single.shard.SingleShardOperationRequest;
import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput; import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.xcontent.XContentParser;
import com.google.common.collect.Sets; import java.io.IOException;
import java.util.*;
/** /**
* Request returning the term vector (doc frequency, positions, offsets) for a * Request returning the term vector (doc frequency, positions, offsets) for a
* document. * document.
* <p> * <p/>
* Note, the {@link #index()}, {@link #type(String)} and {@link #id(String)} are * Note, the {@link #index()}, {@link #type(String)} and {@link #id(String)} are
* required. * required.
*/ */
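For orientation, such a request is normally assembled through the client-side builder; below is a hedged usage sketch (the index, type, id and field name are placeholders, and the setters mirror the flag accessors defined in this class):

```java
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.Terms;
import org.elasticsearch.action.termvector.TermVectorResponse;
import org.elasticsearch.client.Client;

import java.io.IOException;

class TermVectorUsageSketch {
    // "test", "type1", "0" and "field" are placeholders, not values from this commit.
    static Terms fetch(Client client) throws IOException {
        TermVectorResponse response = client.prepareTermVector("test", "type1", "0")
                .setPositions(true).setOffsets(true).setPayloads(true)
                .setTermStatistics(true).setFieldStatistics(true)
                .setSelectedFields("field")
                .execute().actionGet();
        Fields fields = response.getFields();  // Lucene Fields view over the response
        return fields.terms("field");          // null when the field has no term vector
    }
}
```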
@@ -49,6 +48,7 @@ public class TermVectorRequest extends SingleShardOperationRequest<TermVectorReq
protected String preference; protected String preference;
// TODO: change to String[]
private Set<String> selectedFields; private Set<String> selectedFields;
private EnumSet<Flag> flagsEnum = EnumSet.of(Flag.Positions, Flag.Offsets, Flag.Payloads, private EnumSet<Flag> flagsEnum = EnumSet.of(Flag.Positions, Flag.Offsets, Flag.Payloads,
@@ -67,7 +67,7 @@ public class TermVectorRequest extends SingleShardOperationRequest<TermVectorReq
this.id = id; this.id = id;
this.type = type; this.type = type;
} }
public EnumSet<Flag> getFlags() { public EnumSet<Flag> getFlags() {
return flagsEnum; return flagsEnum;
} }
@@ -135,7 +135,7 @@ public class TermVectorRequest extends SingleShardOperationRequest<TermVectorReq
/** /**
* @returns <code>true</code> if term offsets should be returned. Otherwise * @returns <code>true</code> if term offsets should be returned. Otherwise
* <code>false</code> * <code>false</code>
*/ */
public boolean offsets() { public boolean offsets() {
return flagsEnum.contains(Flag.Offsets); return flagsEnum.contains(Flag.Offsets);
@@ -159,7 +159,7 @@ public class TermVectorRequest extends SingleShardOperationRequest<TermVectorReq
/** /**
* @returns <code>true</code> if term payloads should be returned. Otherwise * @returns <code>true</code> if term payloads should be returned. Otherwise
* <code>false</code> * <code>false</code>
*/ */
public boolean payloads() { public boolean payloads() {
return flagsEnum.contains(Flag.Payloads); return flagsEnum.contains(Flag.Payloads);
@@ -175,7 +175,7 @@ public class TermVectorRequest extends SingleShardOperationRequest<TermVectorReq
/** /**
* @returns <code>true</code> if term statistics should be returned. * @returns <code>true</code> if term statistics should be returned.
* Otherwise <code>false</code> * Otherwise <code>false</code>
*/ */
public boolean termStatistics() { public boolean termStatistics() {
return flagsEnum.contains(Flag.TermStatistics); return flagsEnum.contains(Flag.TermStatistics);
@@ -191,7 +191,7 @@ public class TermVectorRequest extends SingleShardOperationRequest<TermVectorReq
/** /**
* @returns <code>true</code> if field statistics should be returned. * @returns <code>true</code> if field statistics should be returned.
* Otherwise <code>false</code> * Otherwise <code>false</code>
*/ */
public boolean fieldStatistics() { public boolean fieldStatistics() {
return flagsEnum.contains(Flag.FieldStatistics); return flagsEnum.contains(Flag.FieldStatistics);
@@ -246,6 +246,13 @@ public class TermVectorRequest extends SingleShardOperationRequest<TermVectorReq
return validationException; return validationException;
} }
public static TermVectorRequest readTermVectorRequest(StreamInput in) throws IOException {
TermVectorRequest termVectorRequest = new TermVectorRequest();
termVectorRequest.readFrom(in);
return termVectorRequest;
}
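This factory is the read half of transport serialization; paired with writeTo it round-trips a request, much as the streamRequest test removed later in this commit did. A minimal sketch:

```java
import org.elasticsearch.action.termvector.TermVectorRequest;
import org.elasticsearch.common.io.stream.InputStreamStreamInput;
import org.elasticsearch.common.io.stream.OutputStreamStreamOutput;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;

class TermVectorStreamSketch {
    static TermVectorRequest roundTrip(TermVectorRequest request) throws IOException {
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        request.writeTo(new OutputStreamStreamOutput(buffer));     // serialize

        InputStreamStreamInput in =
                new InputStreamStreamInput(new ByteArrayInputStream(buffer.toByteArray()));
        return TermVectorRequest.readTermVectorRequest(in);        // deserialize via the new factory
    }
}
```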
@Override @Override
public void readFrom(StreamInput in) throws IOException { public void readFrom(StreamInput in) throws IOException {
super.readFrom(in); super.readFrom(in);
@@ -300,4 +307,60 @@ public class TermVectorRequest extends SingleShardOperationRequest<TermVectorReq
// the ordinal for encoding! Only append to the end! // the ordinal for encoding! Only append to the end!
Positions, Offsets, Payloads, FieldStatistics, TermStatistics; Positions, Offsets, Payloads, FieldStatistics, TermStatistics;
} }
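The warning to only append flags exists because the enum presumably crosses the wire by ordinal; the actual encoding is not shown in this hunk, so the following is only an illustration of why appending is safe while reordering silently corrupts previously serialized requests:

```java
import java.util.EnumSet;

class FlagEncodingSketch {
    enum Flag { Positions, Offsets, Payloads, FieldStatistics, TermStatistics }

    // One bit per ordinal: inserting a constant mid-enum shifts every later
    // ordinal, so bits written by an old node would decode to the wrong flags.
    static long encode(EnumSet<Flag> flags) {
        long bits = 0;
        for (Flag f : flags) {
            bits |= 1L << f.ordinal();
        }
        return bits;
    }

    static EnumSet<Flag> decode(long bits) {
        EnumSet<Flag> flags = EnumSet.noneOf(Flag.class);
        for (Flag f : Flag.values()) {
            if ((bits & (1L << f.ordinal())) != 0) {
                flags.add(f);
            }
        }
        return flags;
    }
}
```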
/**
* Populates a request object (pre-populated with defaults) based on a parser.
*
* @param termVectorRequest
* @param parser
* @throws IOException
*/
public static void parseRequest(TermVectorRequest termVectorRequest, XContentParser parser) throws IOException {
XContentParser.Token token;
String currentFieldName = null;
List<String> fields = new ArrayList<String>();
while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
if (token == XContentParser.Token.FIELD_NAME) {
currentFieldName = parser.currentName();
} else if (currentFieldName != null) {
if (currentFieldName.equals("fields")) {
if (token == XContentParser.Token.START_ARRAY) {
while (parser.nextToken() != XContentParser.Token.END_ARRAY) {
fields.add(parser.text());
}
} else {
throw new ElasticSearchParseException(
"The parameter fields must be given as an array! Use syntax : \"fields\" : [\"field1\", \"field2\",...]");
}
} else if (currentFieldName.equals("offsets")) {
termVectorRequest.offsets(parser.booleanValue());
} else if (currentFieldName.equals("positions")) {
termVectorRequest.positions(parser.booleanValue());
} else if (currentFieldName.equals("payloads")) {
termVectorRequest.payloads(parser.booleanValue());
} else if (currentFieldName.equals("term_statistics") || currentFieldName.equals("termStatistics")) {
termVectorRequest.termStatistics(parser.booleanValue());
} else if (currentFieldName.equals("field_statistics") || currentFieldName.equals("fieldStatistics")) {
termVectorRequest.fieldStatistics(parser.booleanValue());
} else if ("_index".equals(currentFieldName)) { // the following is important for multi request parsing.
termVectorRequest.index = parser.text();
} else if ("_type".equals(currentFieldName)) {
termVectorRequest.type = parser.text();
} else if ("_id".equals(currentFieldName)) {
termVectorRequest.id = parser.text();
} else if ("_routing".equals(currentFieldName) || "routing".equals(currentFieldName)) {
termVectorRequest.routing = parser.text();
} else {
throw new ElasticSearchParseException("The parameter " + currentFieldName
+ " is not valid for term vector request!");
}
}
}
if (fields.size() > 0) {
String[] fieldsAsArray = new String[fields.size()];
termVectorRequest.selectedFields(fields.toArray(fieldsAsArray));
}
}
} }
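The REST layer now owns the parser lifecycle (see RestTermVectorAction below); a minimal sketch of driving the relocated parseRequest directly, with an illustrative JSON body:

```java
import org.elasticsearch.action.termvector.TermVectorRequest;
import org.elasticsearch.common.bytes.BytesArray;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.common.xcontent.XContentParser;

import java.io.IOException;

class ParseRequestSketch {
    static TermVectorRequest parse() throws IOException {
        BytesReference content = new BytesArray(
                "{\"fields\" : [\"a\", \"b\"], \"offsets\" : false, \"payloads\" : true}");
        TermVectorRequest request = new TermVectorRequest("index", "type", "id");
        XContentParser parser = XContentFactory.xContent(content).createParser(content);
        try {
            TermVectorRequest.parseRequest(request, parser); // fills fields and flags from the body
        } finally {
            parser.close(); // the caller owns the parser, mirroring RestTermVectorAction
        }
        return request;
    }
}
```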

View File

@@ -300,8 +300,8 @@ public class TermVectorResponse extends ActionResponse implements ToXContent {
builder.field(FieldStrings.SUM_TTF, sumTotalTermFrequencies); builder.field(FieldStrings.SUM_TTF, sumTotalTermFrequencies);
builder.endObject(); builder.endObject();
} else if (docCount == -1) { // this should only be -1 if the field } else if (docCount == -1) { // this should only be -1 if the field
// statistics were not requested at all. In // statistics were not requested at all. In
// this case all 3 values should be -1 // this case all 3 values should be -1
assert ((sumDocFreq == -1)) : "docCount was -1 but sumDocFreq ain't!"; assert ((sumDocFreq == -1)) : "docCount was -1 but sumDocFreq ain't!";
assert ((sumTotalTermFrequencies == -1)) : "docCount was -1 but sumTotalTermFrequencies ain't!"; assert ((sumTotalTermFrequencies == -1)) : "docCount was -1 but sumTotalTermFrequencies ain't!";
} else { } else {
@@ -320,10 +320,11 @@ public class TermVectorResponse extends ActionResponse implements ToXContent {
this.exists = exists; this.exists = exists;
} }
public void setFields(Fields fields, Set<String> selectedFields, EnumSet<Flag> flags, Fields topLevelFields) throws IOException { public void setFields(Fields termVectorsByField, Set<String> selectedFields, EnumSet<Flag> flags, Fields topLevelFields) throws IOException {
TermVectorWriter tvw = new TermVectorWriter(this); TermVectorWriter tvw = new TermVectorWriter(this);
if (fields != null) {
tvw.setFields(fields, selectedFields, flags, topLevelFields); if (termVectorsByField != null) {
tvw.setFields(termVectorsByField, selectedFields, flags, topLevelFields);
} }
} }
@@ -342,4 +343,16 @@ public class TermVectorResponse extends ActionResponse implements ToXContent {
} }
public String getIndex() {
return index;
}
public String getType() {
return type;
}
public String getId() {
return id;
}
} }

View File

@@ -18,29 +18,25 @@
*/ */
package org.elasticsearch.action.termvector; package org.elasticsearch.action.termvector;
import org.apache.lucene.index.*;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.action.termvector.TermVectorRequest.Flag;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.io.stream.BytesStreamOutput;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.EnumSet; import java.util.EnumSet;
import java.util.List; import java.util.List;
import java.util.Set; import java.util.Set;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.action.termvector.TermVectorRequest.Flag;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.io.stream.BytesStreamOutput;
// package only - this is an internal class! // package only - this is an internal class!
final class TermVectorWriter { final class TermVectorWriter {
final List<String> fields = new ArrayList<String>(); final List<String> fields = new ArrayList<String>();
final List<Long> fieldOffset = new ArrayList<Long>(); final List<Long> fieldOffset = new ArrayList<Long>();
final BytesStreamOutput output = new BytesStreamOutput(1); // can we somehow final BytesStreamOutput output = new BytesStreamOutput(1); // can we somehow
// predict the // predict the
// size here? // size here?
private static final String HEADER = "TV"; private static final String HEADER = "TV";
private static final int CURRENT_VERSION = -1; private static final int CURRENT_VERSION = -1;
TermVectorResponse response = null; TermVectorResponse response = null;
@@ -49,33 +45,33 @@ final class TermVectorWriter {
response = termVectorResponse; response = termVectorResponse;
} }
void setFields(Fields fields, Set<String> selectedFields, EnumSet<Flag> flags, Fields topLevelFields) throws IOException { void setFields(Fields termVectorsByField, Set<String> selectedFields, EnumSet<Flag> flags, Fields topLevelFields) throws IOException {
int numFieldsWritten = 0; int numFieldsWritten = 0;
TermsEnum iterator = null; TermsEnum iterator = null;
DocsAndPositionsEnum docsAndPosEnum = null; DocsAndPositionsEnum docsAndPosEnum = null;
DocsEnum docsEnum = null; DocsEnum docsEnum = null;
TermsEnum topLevelIterator = null; TermsEnum topLevelIterator = null;
for (String field : fields) { for (String field : termVectorsByField) {
if ((selectedFields != null) && (!selectedFields.contains(field))) { if ((selectedFields != null) && (!selectedFields.contains(field))) {
continue; continue;
} }
Terms terms = fields.terms(field); Terms fieldTermVector = termVectorsByField.terms(field);
Terms topLevelTerms = topLevelFields.terms(field); Terms topLevelTerms = topLevelFields.terms(field);
topLevelIterator = topLevelTerms.iterator(topLevelIterator); topLevelIterator = topLevelTerms.iterator(topLevelIterator);
boolean positions = flags.contains(Flag.Positions) && terms.hasPositions(); boolean positions = flags.contains(Flag.Positions) && fieldTermVector.hasPositions();
boolean offsets = flags.contains(Flag.Offsets) && terms.hasOffsets(); boolean offsets = flags.contains(Flag.Offsets) && fieldTermVector.hasOffsets();
boolean payloads = flags.contains(Flag.Payloads) && terms.hasPayloads(); boolean payloads = flags.contains(Flag.Payloads) && fieldTermVector.hasPayloads();
startField(field, terms.size(), positions, offsets, payloads); startField(field, fieldTermVector.size(), positions, offsets, payloads);
if (flags.contains(Flag.FieldStatistics)) { if (flags.contains(Flag.FieldStatistics)) {
writeFieldStatistics(topLevelTerms); writeFieldStatistics(topLevelTerms);
} }
iterator = terms.iterator(iterator); iterator = fieldTermVector.iterator(iterator);
final boolean useDocsAndPos = positions || offsets || payloads; final boolean useDocsAndPos = positions || offsets || payloads;
while (iterator.next() != null) { // iterate all terms of the while (iterator.next() != null) { // iterate all terms of the
// current field // current field
// get the doc frequency // get the doc frequency
BytesRef term = iterator.term(); BytesRef term = iterator.term();
boolean foundTerm = topLevelIterator.seekExact(term, false); boolean foundTerm = topLevelIterator.seekExact(term, false);
@@ -127,7 +123,7 @@ final class TermVectorWriter {
} }
private DocsAndPositionsEnum writeTermWithDocsAndPos(TermsEnum iterator, DocsAndPositionsEnum docsAndPosEnum, boolean positions, private DocsAndPositionsEnum writeTermWithDocsAndPos(TermsEnum iterator, DocsAndPositionsEnum docsAndPosEnum, boolean positions,
boolean offsets, boolean payloads) throws IOException { boolean offsets, boolean payloads) throws IOException {
docsAndPosEnum = iterator.docsAndPositions(null, docsAndPosEnum); docsAndPosEnum = iterator.docsAndPositions(null, docsAndPosEnum);
// for each term (iterator next) in this field (field) // for each term (iterator next) in this field (field)
// iterate over the docs (should only be one) // iterate over the docs (should only be one)
@@ -164,7 +160,7 @@ final class TermVectorWriter {
writePotentiallyNegativeVInt(termFreq); writePotentiallyNegativeVInt(termFreq);
} }
private void writeOffsets(int startOffset, int endOffset) throws IOException { private void writeOffsets(int startOffset, int endOffset) throws IOException {
assert (startOffset >= 0); assert (startOffset >= 0);
assert (endOffset >= 0); assert (endOffset >= 0);

View File

@@ -19,36 +19,28 @@
package org.elasticsearch.rest.action.termvector; package org.elasticsearch.rest.action.termvector;
import static org.elasticsearch.rest.RestRequest.Method.GET;
import static org.elasticsearch.rest.RestRequest.Method.POST;
import static org.elasticsearch.rest.RestStatus.OK;
import static org.elasticsearch.rest.action.support.RestXContentBuilder.restContentBuilder;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.StringTokenizer;
import org.elasticsearch.ElasticSearchParseException;
import org.elasticsearch.action.ActionListener; import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.termvector.TermVectorRequest; import org.elasticsearch.action.termvector.TermVectorRequest;
import org.elasticsearch.action.termvector.TermVectorResponse; import org.elasticsearch.action.termvector.TermVectorResponse;
import org.elasticsearch.client.Client; import org.elasticsearch.client.Client;
import org.elasticsearch.common.Strings; import org.elasticsearch.common.Strings;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.inject.Inject; import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentFactory; import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.common.xcontent.XContentParser; import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.rest.BaseRestHandler; import org.elasticsearch.rest.*;
import org.elasticsearch.rest.RestChannel; import org.elasticsearch.rest.action.support.RestXContentBuilder;
import org.elasticsearch.rest.RestController;
import org.elasticsearch.rest.RestRequest; import java.io.IOException;
import org.elasticsearch.rest.XContentRestResponse; import java.util.HashSet;
import org.elasticsearch.rest.XContentThrowableRestResponse; import java.util.Set;
import static org.elasticsearch.rest.RestRequest.Method.GET;
import static org.elasticsearch.rest.RestRequest.Method.POST;
import static org.elasticsearch.rest.RestStatus.BAD_REQUEST;
import static org.elasticsearch.rest.RestStatus.OK;
import static org.elasticsearch.rest.action.support.RestXContentBuilder.restContentBuilder;
/** /**
* This class parses the json request and translates it into a * This class parses the json request and translates it into a
@@ -70,19 +62,24 @@ public class RestTermVectorAction extends BaseRestHandler {
termVectorRequest.routing(request.param("routing")); termVectorRequest.routing(request.param("routing"));
termVectorRequest.parent(request.param("parent")); termVectorRequest.parent(request.param("parent"));
termVectorRequest.preference(request.param("preference")); termVectorRequest.preference(request.param("preference"));
XContentParser parser = null;
if (request.hasContent()) { if (request.hasContent()) {
try { try {
parseRequest(request.content(), termVectorRequest); parser = XContentFactory.xContent(request.content()).createParser(request.content());
} catch (IOException e1) { TermVectorRequest.parseRequest(termVectorRequest, parser);
Set<String> selectedFields = termVectorRequest.selectedFields(); } catch (IOException e) {
String fieldString = "all"; try {
if (selectedFields != null) { XContentBuilder builder = RestXContentBuilder.restContentBuilder(request);
Strings.arrayToDelimitedString(termVectorRequest.selectedFields().toArray(new String[1]), " "); channel.sendResponse(new XContentRestResponse(request, BAD_REQUEST, builder.startObject().field("error", e.getMessage()).endObject()));
} catch (IOException e1) {
logger.warn("Failed to send response", e1);
return;
}
} finally {
if (parser != null) {
parser.close();
} }
logger.error("Something is wrong with your parameters for the term vector request. I am using parameters "
+ "\n positions :" + termVectorRequest.positions() + "\n offsets :" + termVectorRequest.offsets() + "\n payloads :"
+ termVectorRequest.payloads() + "\n termStatistics :" + termVectorRequest.termStatistics()
+ "\n fieldStatistics :" + termVectorRequest.fieldStatistics() + "\nfields " + fieldString, (Object) null);
} }
} }
readURIParameters(termVectorRequest, request); readURIParameters(termVectorRequest, request);
@@ -142,47 +139,4 @@ public class RestTermVectorAction extends BaseRestHandler {
} }
} }
static public void parseRequest(BytesReference cont, TermVectorRequest termVectorRequest) throws IOException {
XContentParser parser = XContentFactory.xContent(cont).createParser(cont);
try {
XContentParser.Token token;
String currentFieldName = null;
List<String> fields = new ArrayList<String>();
while ((token = parser.nextToken()) != XContentParser.Token.END_OBJECT) {
if (token == XContentParser.Token.FIELD_NAME) {
currentFieldName = parser.currentName();
} else if (currentFieldName != null) {
if (currentFieldName.equals("fields")) {
if (token == XContentParser.Token.START_ARRAY) {
while (parser.nextToken() != XContentParser.Token.END_ARRAY) {
fields.add(parser.text());
}
} else {
throw new ElasticSearchParseException(
"The parameter fields must be given as an array! Use syntax : \"fields\" : [\"field1\", \"field2\",...]");
}
} else if (currentFieldName.equals("offsets")) {
termVectorRequest.offsets(parser.booleanValue());
} else if (currentFieldName.equals("positions")) {
termVectorRequest.positions(parser.booleanValue());
} else if (currentFieldName.equals("payloads")) {
termVectorRequest.payloads(parser.booleanValue());
} else if (currentFieldName.equals("term_statistics") || currentFieldName.equals("termStatistics")) {
termVectorRequest.termStatistics(parser.booleanValue());
} else if (currentFieldName.equals("field_statistics") || currentFieldName.equals("fieldStatistics")) {
termVectorRequest.fieldStatistics(parser.booleanValue());
} else {
throw new ElasticSearchParseException("The parameter " + currentFieldName
+ " is not valid for term vector request!");
}
}
}
String[] fieldsAsArray = new String[fields.size()];
termVectorRequest.selectedFields(fields.toArray(fieldsAsArray));
} finally {
parser.close();
}
}
} }

View File

@@ -323,6 +323,11 @@ public abstract class AbstractSharedClusterTest extends ElasticsearchTestCase {
return client().prepareIndex(index, type).setSource(source).execute().actionGet(); return client().prepareIndex(index, type).setSource(source).execute().actionGet();
} }
protected IndexResponse index(String index, String type, String id, Map<String, Object> source) {
return client().prepareIndex(index, type, id).setSource(source).execute().actionGet();
}
protected GetResponse get(String index, String type, String id) { protected GetResponse get(String index, String type, String id) {
return client().prepareGet(index, type, id).execute().actionGet(); return client().prepareGet(index, type, id).execute().actionGet();
} }

View File

@@ -0,0 +1,411 @@
package org.elasticsearch.test.integration.termvectors;
/*
* Licensed to ElasticSearch under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. ElasticSearch licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.document.*;
import org.apache.lucene.index.*;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.elasticsearch.Version;
import org.elasticsearch.action.termvector.TermVectorRequestBuilder;
import org.elasticsearch.action.termvector.TermVectorResponse;
import org.elasticsearch.common.inject.internal.Join;
import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.indices.IndexMissingException;
import org.elasticsearch.test.integration.AbstractSharedClusterTest;
import java.io.IOException;
import java.io.Reader;
import java.util.*;
import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
import static org.hamcrest.Matchers.equalTo;
public abstract class AbstractTermVectorTests extends AbstractSharedClusterTest {
protected static class TestFieldSetting {
final public String name;
final public boolean storedOffset;
final public boolean storedPayloads;
final public boolean storedPositions;
public TestFieldSetting(String name, boolean storedOffset, boolean storedPayloads, boolean storedPositions) {
this.name = name;
this.storedOffset = storedOffset;
this.storedPayloads = storedPayloads;
this.storedPositions = storedPositions;
}
public void addToMappings(XContentBuilder mappingsBuilder) throws IOException {
mappingsBuilder.startObject(name);
mappingsBuilder.field("type", "string");
String tv_settings;
if (storedPositions && storedOffset && storedPayloads) {
tv_settings = "with_positions_offsets_payloads";
} else if (storedPositions && storedOffset) {
tv_settings = "with_positions_offsets";
} else if (storedPayloads) {
tv_settings = "with_positions_payloads";
} else if (storedPositions) {
tv_settings = "with_positions";
} else if (storedOffset) {
tv_settings = "with_offsets";
} else {
tv_settings = "yes";
}
mappingsBuilder.field("term_vector", tv_settings);
if (storedPayloads) {
mappingsBuilder.field("analyzer", "tv_test");
}
mappingsBuilder.endObject();
}
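As a concrete illustration, a field stored with positions and offsets makes this method emit roughly the following property (sketched with the same jsonBuilder style used elsewhere in this class; the field name is a placeholder):

```java
import org.elasticsearch.common.xcontent.XContentBuilder;

import java.io.IOException;

import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;

class MappingSketch {
    // Roughly what addToMappings contributes for storedPositions=true, storedOffset=true:
    // "myField" : { "type" : "string", "term_vector" : "with_positions_offsets" }
    static XContentBuilder fieldMapping() throws IOException {
        return jsonBuilder().startObject()
                .startObject("myField")
                .field("type", "string")
                .field("term_vector", "with_positions_offsets")
                .endObject()
                .endObject();
    }
}
```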
@Override
public String toString() {
StringBuilder sb = new StringBuilder("name: ").append(name).append(" tv_with:");
if (storedPayloads) {
sb.append("payloads,");
}
if (storedOffset) {
sb.append("offsets,");
}
if (storedPositions) {
sb.append("positions,");
}
return sb.toString();
}
}
protected static class TestDoc {
final public String id;
final public TestFieldSetting[] fieldSettings;
final public String[] fieldContent;
public String index = "test";
public String type = "type1";
public TestDoc(String id, TestFieldSetting[] fieldSettings, String[] fieldContent) {
this.id = id;
this.fieldSettings = fieldSettings;
this.fieldContent = fieldContent;
}
public TestDoc index(String index) {
this.index = index;
return this;
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder("index:").append(index).append(" type:").append(type).append(" id:").append(id);
for (int i = 0; i < fieldSettings.length; i++) {
TestFieldSetting f = fieldSettings[i];
sb.append("\n").append("Field: ").append(f).append("\n content:").append(fieldContent[i]);
}
sb.append("\n");
return sb.toString();
}
}
protected static class TestConfig {
final public TestDoc doc;
final public String[] selectedFields;
final public boolean requestPositions;
final public boolean requestOffsets;
final public boolean requestPayloads;
public Class expectedException = null;
public TestConfig(TestDoc doc, String[] selectedFields, boolean requestPositions, boolean requestOffsets, boolean requestPayloads) {
this.doc = doc;
this.selectedFields = selectedFields;
this.requestPositions = requestPositions;
this.requestOffsets = requestOffsets;
this.requestPayloads = requestPayloads;
}
public TestConfig expectedException(Class exceptionClass) {
this.expectedException = exceptionClass;
return this;
}
@Override
public String toString() {
String requested = "";
if (requestOffsets) {
requested += "offsets,";
}
if (requestPositions) {
requested += "position,";
}
if (requestPayloads) {
requested += "payload,";
}
Locale aLocale = new Locale("en", "US");
return String.format(aLocale, "(doc: %s\n requested: %s, fields: %s)", doc, requested,
selectedFields == null ? "NULL" : Join.join(",", selectedFields));
}
}
protected void createIndexBasedOnFieldSettings(TestFieldSetting[] fieldSettings, int number_of_shards) throws IOException {
wipeIndex("test");
XContentBuilder mappingBuilder = jsonBuilder();
mappingBuilder.startObject().startObject("type1").startObject("properties");
for (TestFieldSetting field : fieldSettings) {
field.addToMappings(mappingBuilder);
}
ImmutableSettings.Builder settings = ImmutableSettings.settingsBuilder()
.put("index.analysis.analyzer.tv_test.tokenizer", "standard")
.putArray("index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase");
if (number_of_shards > 0) {
settings.put("number_of_shards", number_of_shards);
}
mappingBuilder.endObject().endObject().endObject();
run(prepareCreate("test").addMapping("type1", mappingBuilder).setSettings(settings));
ensureYellow();
}
/**
* Generate test documents. The returned documents are already indexed.
*/
protected TestDoc[] generateTestDocs(int numberOfDocs, TestFieldSetting[] fieldSettings) {
String[] fieldContentOptions = new String[] { "Generating a random permutation of a sequence (such as when shuffling cards).",
"Selecting a random sample of a population (important in statistical sampling).",
"Allocating experimental units via random assignment to a treatment or control condition.",
"Generating random numbers: see Random number generation.",
"Transforming a data stream (such as when using a scrambler in telecommunications)." };
String[] contentArray = new String[fieldSettings.length];
Map<String, Object> docSource = new HashMap<String, Object>();
TestDoc[] testDocs = new TestDoc[numberOfDocs];
for (int docId = 0; docId < numberOfDocs; docId++) {
docSource.clear();
for (int i = 0; i < contentArray.length; i++) {
contentArray[i] = fieldContentOptions[randomInt(fieldContentOptions.length - 1)];
docSource.put(fieldSettings[i].name, contentArray[i]);
}
TestDoc doc = new TestDoc(Integer.toString(docId), fieldSettings, contentArray.clone());
index(doc.index, doc.type, doc.id, docSource);
testDocs[docId] = doc;
}
refresh();
return testDocs;
}
protected TestConfig[] generateTestConfigs(int numberOfTests, TestDoc[] testDocs, TestFieldSetting[] fieldSettings) {
ArrayList<TestConfig> configs = new ArrayList<TestConfig>();
for (int i = 0; i < numberOfTests; i++) {
ArrayList<String> selectedFields = null;
if (randomBoolean()) {
// used field selection
selectedFields = new ArrayList<String>();
if (randomBoolean()) {
selectedFields.add("Doesnt_exist"); // this will be ignored.
}
for (TestFieldSetting field : fieldSettings)
if (randomBoolean()) {
selectedFields.add(field.name);
}
if (selectedFields.size() == 0) {
selectedFields = null; // 0 length set is not supported.
}
}
TestConfig config = new TestConfig(testDocs[randomInt(testDocs.length - 1)], selectedFields == null ? null
: selectedFields.toArray(new String[] {}), randomBoolean(), randomBoolean(), randomBoolean());
configs.add(config);
}
// always adds a test that fails
configs.add(new TestConfig(new TestDoc("doesnt_exist", new TestFieldSetting[] {}, new String[] {}).index("doesn't_exist"),
new String[] { "doesnt_exist" }, true, true, true).expectedException(IndexMissingException.class));
refresh();
return configs.toArray(new TestConfig[] {});
}
protected TestFieldSetting[] getFieldSettings() {
return new TestFieldSetting[] { new TestFieldSetting("field_with_positions", false, false, true),
new TestFieldSetting("field_with_offsets", true, false, false),
new TestFieldSetting("field_with_only_tv", false, false, false),
new TestFieldSetting("field_with_positions_offsets", false, false, true),
new TestFieldSetting("field_with_positions_payloads", false, true, true)
};
}
protected DirectoryReader indexDocsWithLucene(TestDoc[] testDocs) throws IOException {
Map<String, Analyzer> mapping = new HashMap<String, Analyzer>();
for (TestFieldSetting field : testDocs[0].fieldSettings) {
if (field.storedPayloads) {
mapping.put(field.name, new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new StandardTokenizer(Version.CURRENT.luceneVersion, reader);
TokenFilter filter = new LowerCaseFilter(Version.CURRENT.luceneVersion, tokenizer);
filter = new TypeAsPayloadTokenFilter(filter);
return new TokenStreamComponents(tokenizer, filter);
}
});
}
}
PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new StandardAnalyzer(Version.CURRENT.luceneVersion), mapping);
Directory dir = new RAMDirectory();
IndexWriterConfig conf = new IndexWriterConfig(Version.CURRENT.luceneVersion, wrapper);
conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
IndexWriter writer = new IndexWriter(dir, conf);
for (TestDoc doc : testDocs) {
Document d = new Document();
d.add(new Field("id", doc.id, StringField.TYPE_STORED));
for (int i = 0; i < doc.fieldContent.length; i++) {
FieldType type = new FieldType(TextField.TYPE_STORED);
TestFieldSetting fieldSetting = doc.fieldSettings[i];
type.setStoreTermVectorOffsets(fieldSetting.storedOffset);
type.setStoreTermVectorPayloads(fieldSetting.storedPayloads);
type.setStoreTermVectorPositions(fieldSetting.storedPositions || fieldSetting.storedPayloads || fieldSetting.storedOffset);
type.setStoreTermVectors(true);
type.freeze();
d.add(new Field(fieldSetting.name, doc.fieldContent[i], type));
}
writer.updateDocument(new Term("id", doc.id), d);
writer.commit();
}
writer.close();
return DirectoryReader.open(dir);
}
protected void validateResponse(TermVectorResponse esResponse, Fields luceneFields, TestConfig testConfig) throws IOException {
TestDoc testDoc = testConfig.doc;
HashSet<String> selectedFields = testConfig.selectedFields == null ? null : new HashSet<String>(
Arrays.asList(testConfig.selectedFields));
Fields esTermVectorFields = esResponse.getFields();
for (TestFieldSetting field : testDoc.fieldSettings) {
Terms esTerms = esTermVectorFields.terms(field.name);
if (selectedFields != null && !selectedFields.contains(field.name)) {
assertNull(esTerms);
continue;
}
assertNotNull(esTerms);
Terms luceneTerms = luceneFields.terms(field.name);
TermsEnum esTermEnum = esTerms.iterator(null);
TermsEnum luceneTermEnum = luceneTerms.iterator(null);
while (esTermEnum.next() != null) {
assertNotNull(luceneTermEnum.next());
assertThat(esTermEnum.totalTermFreq(), equalTo(luceneTermEnum.totalTermFreq()));
DocsAndPositionsEnum esDocsPosEnum = esTermEnum.docsAndPositions(null, null, 0);
DocsAndPositionsEnum luceneDocsPosEnum = luceneTermEnum.docsAndPositions(null, null, 0);
if (luceneDocsPosEnum == null) {
// test we expect that...
assertFalse(field.storedOffset);
assertFalse(field.storedPayloads);
assertFalse(field.storedPositions);
continue;
}
String currentTerm = esTermEnum.term().utf8ToString();
assertThat("Token mismatch for field: " + field.name, currentTerm, equalTo(luceneTermEnum.term().utf8ToString()));
esDocsPosEnum.nextDoc();
luceneDocsPosEnum.nextDoc();
int freq = esDocsPosEnum.freq();
assertThat(freq, equalTo(luceneDocsPosEnum.freq()));
for (int i = 0; i < freq; i++) {
String failDesc = " (field:" + field.name + " term:" + currentTerm + ")";
int lucenePos = luceneDocsPosEnum.nextPosition();
int esPos = esDocsPosEnum.nextPosition();
if (field.storedPositions && testConfig.requestPositions) {
assertThat("Position test failed" + failDesc, lucenePos, equalTo(esPos));
} else {
assertThat("Missing position test failed" + failDesc, esPos, equalTo(-1));
}
if (field.storedOffset && testConfig.requestOffsets) {
assertThat("Offset test failed" + failDesc, luceneDocsPosEnum.startOffset(), equalTo(esDocsPosEnum.startOffset()));
assertThat("Offset test failed" + failDesc, luceneDocsPosEnum.endOffset(), equalTo(esDocsPosEnum.endOffset()));
} else {
assertThat("Missing offset test failed" + failDesc, esDocsPosEnum.startOffset(), equalTo(-1));
assertThat("Missing offset test failed" + failDesc, esDocsPosEnum.endOffset(), equalTo(-1));
}
if (field.storedPayloads && testConfig.requestPayloads) {
assertThat("Payload test failed" + failDesc, luceneDocsPosEnum.getPayload(), equalTo(esDocsPosEnum.getPayload()));
} else {
assertThat("Missing payload test failed" + failDesc, esDocsPosEnum.getPayload(), equalTo(null));
}
}
}
assertNull("Es returned terms are done but lucene isn't", luceneTermEnum.next());
}
}
protected TermVectorRequestBuilder getRequestForConfig(TestConfig config) {
return client().prepareTermVector(config.doc.index, config.doc.type, config.doc.id).setPayloads(config.requestPayloads)
.setOffsets(config.requestOffsets).setPositions(config.requestPositions).setFieldStatistics(true).setTermStatistics(true)
.setSelectedFields(config.selectedFields);
}
protected Fields getTermVectorsFromLucene(DirectoryReader directoryReader, TestDoc doc) throws IOException {
IndexSearcher searcher = new IndexSearcher(directoryReader);
TopDocs search = searcher.search(new TermQuery(new Term("id", doc.id)), 1);
ScoreDoc[] scoreDocs = search.scoreDocs;
assert (scoreDocs.length == 1);
return directoryReader.getTermVectors(scoreDocs[0].doc);
}
}
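A concrete test built on this harness would wire the helpers together roughly as follows (a sketch: the document and config counts are arbitrary, and the deliberately failing config is skipped for brevity):

```java
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Fields;
import org.elasticsearch.action.termvector.TermVectorResponse;
import org.junit.Test;

import java.io.IOException;

public class RandomTermVectorTestsSketch extends AbstractTermVectorTests {
    @Test
    public void testRandomDocsAgainstLucene() throws IOException {
        TestFieldSetting[] fieldSettings = getFieldSettings();
        createIndexBasedOnFieldSettings(fieldSettings, -1);   // <= 0 keeps the default shard count
        TestDoc[] docs = generateTestDocs(20, fieldSettings); // indexed into ES as a side effect

        DirectoryReader reader = indexDocsWithLucene(docs);   // the same docs, in plain Lucene
        for (TestConfig config : generateTestConfigs(50, docs, fieldSettings)) {
            if (config.expectedException != null) {
                continue; // sketch: skip the config that targets a missing index
            }
            TermVectorResponse response = getRequestForConfig(config).execute().actionGet();
            Fields luceneFields = getTermVectorsFromLucene(reader, config.doc);
            validateResponse(response, luceneFields, config);
        }
        reader.close();
    }
}
```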

View File

@@ -25,12 +25,9 @@ import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum; import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.elasticsearch.ElasticSearchException; import org.elasticsearch.ElasticSearchException;
import org.elasticsearch.action.termvector.TermVectorRequest;
import org.elasticsearch.action.termvector.TermVectorRequestBuilder; import org.elasticsearch.action.termvector.TermVectorRequestBuilder;
import org.elasticsearch.action.termvector.TermVectorResponse; import org.elasticsearch.action.termvector.TermVectorResponse;
import org.elasticsearch.common.io.BytesStream; import org.elasticsearch.common.io.BytesStream;
import org.elasticsearch.common.io.stream.InputStreamStreamInput;
import org.elasticsearch.common.io.stream.OutputStreamStreamOutput;
import org.elasticsearch.common.settings.ImmutableSettings; import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.common.xcontent.XContentFactory; import org.elasticsearch.common.xcontent.XContentFactory;
@@ -38,53 +35,13 @@ import org.elasticsearch.test.integration.AbstractSharedClusterTest;
import org.hamcrest.Matchers; import org.hamcrest.Matchers;
import org.junit.Test; import org.junit.Test;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException; import java.io.IOException;
import java.util.Random;
import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.equalTo;
public class GetTermVectorTestsCheckDocFreq extends AbstractSharedClusterTest { public class GetTermVectorCheckDocFreqTests extends AbstractSharedClusterTest {
@Test
public void streamRequest() throws IOException {
Random random = getRandom();
for (int i = 0; i < 10; i++) {
TermVectorRequest request = new TermVectorRequest("index", "type", "id");
request.offsets(random.nextBoolean());
request.fieldStatistics(random.nextBoolean());
request.payloads(random.nextBoolean());
request.positions(random.nextBoolean());
request.termStatistics(random.nextBoolean());
String parent = random.nextBoolean() ? "someParent" : null;
request.parent(parent);
String pref = random.nextBoolean() ? "somePreference" : null;
request.preference(pref);
// write
ByteArrayOutputStream outBuffer = new ByteArrayOutputStream();
OutputStreamStreamOutput out = new OutputStreamStreamOutput(outBuffer);
request.writeTo(out);
// read
ByteArrayInputStream esInBuffer = new ByteArrayInputStream(outBuffer.toByteArray());
InputStreamStreamInput esBuffer = new InputStreamStreamInput(esInBuffer);
TermVectorRequest req2 = new TermVectorRequest(null, null, null);
req2.readFrom(esBuffer);
assertThat(request.offsets(), equalTo(req2.offsets()));
assertThat(request.fieldStatistics(), equalTo(req2.fieldStatistics()));
assertThat(request.payloads(), equalTo(req2.payloads()));
assertThat(request.positions(), equalTo(req2.positions()));
assertThat(request.termStatistics(), equalTo(req2.termStatistics()));
assertThat(request.preference(), equalTo(pref));
assertThat(request.routing(), equalTo(parent));
}
}
@Test @Test
public void testSimpleTermVectors() throws ElasticSearchException, IOException { public void testSimpleTermVectors() throws ElasticSearchException, IOException {

View File

@@ -19,271 +19,37 @@
package org.elasticsearch.test.integration.termvectors; package org.elasticsearch.test.integration.termvectors;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.document.FieldType;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.payloads.TypeAsPayloadTokenFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.document.*;
import org.apache.lucene.index.*; import org.apache.lucene.index.*;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRef;
import org.elasticsearch.ElasticSearchException; import org.elasticsearch.ElasticSearchException;
import org.elasticsearch.action.ActionFuture; import org.elasticsearch.action.ActionFuture;
import org.elasticsearch.action.termvector.TermVectorRequest; import org.elasticsearch.action.termvector.TermVectorRequest;
import org.elasticsearch.action.termvector.TermVectorRequest.Flag;
import org.elasticsearch.action.termvector.TermVectorRequestBuilder; import org.elasticsearch.action.termvector.TermVectorRequestBuilder;
import org.elasticsearch.action.termvector.TermVectorResponse; import org.elasticsearch.action.termvector.TermVectorResponse;
import org.elasticsearch.common.bytes.BytesArray;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.io.stream.InputStreamStreamInput;
import org.elasticsearch.common.io.stream.OutputStreamStreamOutput;
import org.elasticsearch.common.settings.ImmutableSettings; import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.xcontent.XContentFactory; import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.index.mapper.MapperParsingException; import org.elasticsearch.index.mapper.MapperParsingException;
import org.elasticsearch.index.mapper.core.AbstractFieldMapper; import org.elasticsearch.index.mapper.core.AbstractFieldMapper;
import org.elasticsearch.index.mapper.core.TypeParsers; import org.elasticsearch.index.mapper.core.TypeParsers;
import org.elasticsearch.index.mapper.internal.AllFieldMapper; import org.elasticsearch.index.mapper.internal.AllFieldMapper;
import org.elasticsearch.rest.action.termvector.RestTermVectorAction;
import org.elasticsearch.test.integration.AbstractSharedClusterTest;
import org.hamcrest.Matchers; import org.hamcrest.Matchers;
import org.junit.Test; import org.junit.Test;
import java.io.*; import java.io.IOException;
import java.util.*;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertThrows;
import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.equalTo;
public class GetTermVectorTests extends AbstractSharedClusterTest { public class GetTermVectorTests extends AbstractTermVectorTests {
@Test
public void streamTest() throws Exception {
TermVectorResponse outResponse = new TermVectorResponse("a", "b", "c");
outResponse.setExists(true);
writeStandardTermVector(outResponse);
// write
ByteArrayOutputStream outBuffer = new ByteArrayOutputStream();
OutputStreamStreamOutput out = new OutputStreamStreamOutput(outBuffer);
outResponse.writeTo(out);
// read
ByteArrayInputStream esInBuffer = new ByteArrayInputStream(outBuffer.toByteArray());
InputStreamStreamInput esBuffer = new InputStreamStreamInput(esInBuffer);
TermVectorResponse inResponse = new TermVectorResponse("a", "b", "c");
inResponse.readFrom(esBuffer);
// see if correct
checkIfStandardTermVector(inResponse);
outResponse = new TermVectorResponse("a", "b", "c");
writeEmptyTermVector(outResponse);
// write
outBuffer = new ByteArrayOutputStream();
out = new OutputStreamStreamOutput(outBuffer);
outResponse.writeTo(out);
// read
esInBuffer = new ByteArrayInputStream(outBuffer.toByteArray());
esBuffer = new InputStreamStreamInput(esInBuffer);
inResponse = new TermVectorResponse("a", "b", "c");
inResponse.readFrom(esBuffer);
assertTrue(inResponse.isExists());
}
private void checkIfStandardTermVector(TermVectorResponse inResponse) throws IOException {
Fields fields = inResponse.getFields();
assertThat(fields.terms("title"), Matchers.notNullValue());
assertThat(fields.terms("desc"), Matchers.notNullValue());
assertThat(fields.size(), equalTo(2));
}
private void writeEmptyTermVector(TermVectorResponse outResponse) throws IOException {
Directory dir = FSDirectory.open(new File("/tmp/foo"));
IndexWriterConfig conf = new IndexWriterConfig(TEST_VERSION_CURRENT, new StandardAnalyzer(TEST_VERSION_CURRENT));
conf.setOpenMode(OpenMode.CREATE);
IndexWriter writer = new IndexWriter(dir, conf);
FieldType type = new FieldType(TextField.TYPE_STORED);
type.setStoreTermVectorOffsets(true);
type.setStoreTermVectorPayloads(false);
type.setStoreTermVectorPositions(true);
type.setStoreTermVectors(true);
type.freeze();
Document d = new Document();
d.add(new Field("id", "abc", StringField.TYPE_STORED));
writer.updateDocument(new Term("id", "abc"), d);
writer.commit();
writer.close();
DirectoryReader dr = DirectoryReader.open(dir);
IndexSearcher s = new IndexSearcher(dr);
TopDocs search = s.search(new TermQuery(new Term("id", "abc")), 1);
ScoreDoc[] scoreDocs = search.scoreDocs;
int doc = scoreDocs[0].doc;
Fields fields = dr.getTermVectors(doc);
EnumSet<Flag> flags = EnumSet.of(Flag.Positions, Flag.Offsets);
outResponse.setFields(fields, null, flags, fields);
outResponse.setExists(true);
}
private void writeStandardTermVector(TermVectorResponse outResponse) throws IOException {
Directory dir = FSDirectory.open(new File("/tmp/foo"));
IndexWriterConfig conf = new IndexWriterConfig(TEST_VERSION_CURRENT, new StandardAnalyzer(TEST_VERSION_CURRENT));
conf.setOpenMode(OpenMode.CREATE);
IndexWriter writer = new IndexWriter(dir, conf);
FieldType type = new FieldType(TextField.TYPE_STORED);
type.setStoreTermVectorOffsets(true);
type.setStoreTermVectorPayloads(false);
type.setStoreTermVectorPositions(true);
type.setStoreTermVectors(true);
type.freeze();
Document d = new Document();
d.add(new Field("id", "abc", StringField.TYPE_STORED));
d.add(new Field("title", "the1 quick brown fox jumps over the1 lazy dog", type));
d.add(new Field("desc", "the1 quick brown fox jumps over the1 lazy dog", type));
writer.updateDocument(new Term("id", "abc"), d);
writer.commit();
writer.close();
DirectoryReader dr = DirectoryReader.open(dir);
IndexSearcher s = new IndexSearcher(dr);
TopDocs search = s.search(new TermQuery(new Term("id", "abc")), 1);
ScoreDoc[] scoreDocs = search.scoreDocs;
int doc = scoreDocs[0].doc;
Fields fields = dr.getTermVectors(doc);
EnumSet<Flag> flags = EnumSet.of(Flag.Positions, Flag.Offsets);
outResponse.setFields(fields, null, flags, fields);
}
private Fields buildWithLuceneAndReturnFields(String docId, String[] fields, String[] content, boolean[] withPositions,
boolean[] withOffsets, boolean[] withPayloads) throws IOException {
assert (fields.length == withPayloads.length);
assert (content.length == withPayloads.length);
assert (withPositions.length == withPayloads.length);
assert (withOffsets.length == withPayloads.length);
Map<String, Analyzer> mapping = new HashMap<String, Analyzer>();
for (int i = 0; i < withPayloads.length; i++) {
if (withPayloads[i]) {
mapping.put(fields[i], new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, reader);
TokenFilter filter = new LowerCaseFilter(TEST_VERSION_CURRENT, tokenizer);
filter = new TypeAsPayloadTokenFilter(filter);
return new TokenStreamComponents(tokenizer, filter);
}
});
}
}
PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new StandardAnalyzer(TEST_VERSION_CURRENT), mapping);
Directory dir = FSDirectory.open(new File("/tmp/foo"));
IndexWriterConfig conf = new IndexWriterConfig(TEST_VERSION_CURRENT, wrapper);
conf.setOpenMode(OpenMode.CREATE);
IndexWriter writer = new IndexWriter(dir, conf);
Document d = new Document();
for (int i = 0; i < fields.length; i++) {
d.add(new Field("id", docId, StringField.TYPE_STORED));
FieldType type = new FieldType(TextField.TYPE_STORED);
type.setStoreTermVectorOffsets(withOffsets[i]);
type.setStoreTermVectorPayloads(withPayloads[i]);
type.setStoreTermVectorPositions(withPositions[i] || withOffsets[i] || withPayloads[i]);
type.setStoreTermVectors(true);
type.freeze();
d.add(new Field(fields[i], content[i], type));
writer.updateDocument(new Term("id", docId), d);
writer.commit();
}
writer.close();
DirectoryReader dr = DirectoryReader.open(dir);
IndexSearcher s = new IndexSearcher(dr);
TopDocs search = s.search(new TermQuery(new Term("id", docId)), 1);
ScoreDoc[] scoreDocs = search.scoreDocs;
assert (scoreDocs.length == 1);
int doc = scoreDocs[0].doc;
Fields returnFields = dr.getTermVectors(doc);
return returnFields;
}
@Test
public void testRestRequestParsing() throws Exception {
BytesReference inputBytes = new BytesArray(
" {\"fields\" : [\"a\", \"b\",\"c\"], \"offsets\":false, \"positions\":false, \"payloads\":true}");
TermVectorRequest tvr = new TermVectorRequest(null, null, null);
RestTermVectorAction.parseRequest(inputBytes, tvr);
Set<String> fields = tvr.selectedFields();
assertThat(fields.contains("a"), equalTo(true));
assertThat(fields.contains("b"), equalTo(true));
assertThat(fields.contains("c"), equalTo(true));
assertThat(tvr.offsets(), equalTo(false));
assertThat(tvr.positions(), equalTo(false));
assertThat(tvr.payloads(), equalTo(true));
String additionalFields = "b,c ,d, e ";
RestTermVectorAction.addFieldStringsFromParameter(tvr, additionalFields);
assertThat(tvr.selectedFields().size(), equalTo(5));
assertThat(fields.contains("d"), equalTo(true));
assertThat(fields.contains("e"), equalTo(true));
additionalFields = "";
RestTermVectorAction.addFieldStringsFromParameter(tvr, additionalFields);
inputBytes = new BytesArray(" {\"offsets\":false, \"positions\":false, \"payloads\":true}");
tvr = new TermVectorRequest(null, null, null);
RestTermVectorAction.parseRequest(inputBytes, tvr);
additionalFields = "";
RestTermVectorAction.addFieldStringsFromParameter(tvr, additionalFields);
assertThat(tvr.selectedFields(), equalTo(null));
additionalFields = "b,c ,d, e ";
RestTermVectorAction.addFieldStringsFromParameter(tvr, additionalFields);
assertThat(tvr.selectedFields().size(), equalTo(4));
}
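
// An unknown key in the request body ("meaningless_term") must cause
// parsing to fail.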
@Test
public void testRestRequestParsingThrowsException() throws Exception {
BytesReference inputBytes = new BytesArray(
" {\"fields\" : \"a, b,c \", \"offsets\":false, \"positions\":false, \"payloads\":true, \"meaningless_term\":2}");
TermVectorRequest tvr = new TermVectorRequest(null, null, null);
boolean threwException = false;
try {
RestTermVectorAction.parseRequest(inputBytes, tvr);
} catch (Exception e) {
threwException = true;
}
assertThat(threwException, equalTo(true));
}
@Test
public void testNoSuchDoc() throws Exception {
run(addMapping(prepareCreate("test"), "type1", new Object[]{"field", "type", "string", "term_vector",
"with_positions_offsets_payloads"}));
ensureYellow();
@ -346,7 +112,7 @@ public class GetTermVectorTests extends AbstractSharedClusterTest {
public void testSimpleTermVectors() throws ElasticSearchException, IOException {
run(addMapping(prepareCreate("test"), "type1",
new Object[]{"field", "type", "string", "term_vector", "with_positions_offsets_payloads", "analyzer", "tv_test"})
.setSettings(
ImmutableSettings.settingsBuilder().put("index.analysis.analyzer.tv_test.tokenizer", "whitespace")
.putArray("index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase")));
@ -354,16 +120,16 @@ public class GetTermVectorTests extends AbstractSharedClusterTest {
for (int i = 0; i < 10; i++) {
client().prepareIndex("test", "type1", Integer.toString(i))
.setSource(XContentFactory.jsonBuilder().startObject().field("field", "the quick brown fox jumps over the lazy dog")
// 0the3 4quick9 10brown15 16fox19 20jumps25 26over30
// 31the34 35lazy39 40dog43
.endObject()).execute().actionGet();
refresh();
}
String[] values = {"brown", "dog", "fox", "jumps", "lazy", "over", "quick", "the"};
int[] freq = {1, 1, 1, 1, 1, 1, 1, 2};
int[][] pos = {{2}, {8}, {3}, {4}, {7}, {5}, {1}, {0, 6}};
int[][] startOffset = {{10}, {40}, {16}, {20}, {35}, {26}, {4}, {0, 31}};
int[][] endOffset = {{15}, {43}, {19}, {25}, {39}, {30}, {9}, {3, 34}};
for (int i = 0; i < 10; i++) {
TermVectorRequestBuilder resp = client().prepareTermVector("test", "type1", Integer.toString(i)).setPayloads(true)
.setOffsets(true).setPositions(true).setSelectedFields();
@ -405,44 +171,43 @@ public class GetTermVectorTests extends AbstractSharedClusterTest {
@Test
public void testRandomSingleTermVectors() throws ElasticSearchException, IOException {
FieldType ft = new FieldType();
int config = randomInt(6);
boolean storePositions = false;
boolean storeOffsets = false;
boolean storePayloads = false;
boolean storeTermVectors = false;
switch (config) {
case 0: {
// do nothing
break;
}
case 1: {
storeTermVectors = true;
break;
}
case 2: {
storeTermVectors = true;
storePositions = true;
break;
}
case 3: {
storeTermVectors = true;
storeOffsets = true;
break;
}
case 4: {
storeTermVectors = true;
storePositions = true;
storeOffsets = true;
break;
}
case 5: {
storeTermVectors = true;
storePositions = true;
storePayloads = true;
break;
}
case 6: {
storeTermVectors = true;
storePositions = true;
storeOffsets = true;
storePayloads = true;
break;
}
}
ft.setStoreTermVectors(storeTermVectors);
ft.setStoreTermVectorOffsets(storeOffsets);
@ -451,27 +216,27 @@ public class GetTermVectorTests extends AbstractSharedClusterTest {
String optionString = AbstractFieldMapper.termVectorOptionsToString(ft);
run(addMapping(prepareCreate("test"), "type1",
new Object[]{"field", "type", "string", "term_vector", optionString, "analyzer", "tv_test"}).setSettings(
ImmutableSettings.settingsBuilder().put("index.analysis.analyzer.tv_test.tokenizer", "whitespace")
.putArray("index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase")));
ensureYellow();
for (int i = 0; i < 10; i++) {
client().prepareIndex("test", "type1", Integer.toString(i))
.setSource(XContentFactory.jsonBuilder().startObject().field("field", "the quick brown fox jumps over the lazy dog")
// 0the3 4quick9 10brown15 16fox19 20jumps25 26over30
// 31the34 35lazy39 40dog43
.endObject()).execute().actionGet();
refresh();
}
String[] values = {"brown", "dog", "fox", "jumps", "lazy", "over", "quick", "the"};
int[] freq = {1, 1, 1, 1, 1, 1, 1, 2};
int[][] pos = {{2}, {8}, {3}, {4}, {7}, {5}, {1}, {0, 6}};
int[][] startOffset = {{10}, {40}, {16}, {20}, {35}, {26}, {4}, {0, 31}};
int[][] endOffset = {{15}, {43}, {19}, {25}, {39}, {30}, {9}, {3, 34}};
boolean isPayloadRequested = randomBoolean();
boolean isOffsetRequested = randomBoolean();
boolean isPositionsRequested = randomBoolean();
String infoString = createInfoString(isPositionsRequested, isOffsetRequested, isPayloadRequested, optionString);
for (int i = 0; i < 10; i++) {
TermVectorRequestBuilder resp = client().prepareTermVector("test", "type1", Integer.toString(i))
@ -544,7 +309,7 @@ public class GetTermVectorTests extends AbstractSharedClusterTest {
}

private String createInfoString(boolean isPositionsRequested, boolean isOffsetRequested, boolean isPayloadRequested,
String optionString) {
String ret = "Store config: " + optionString + "\n" + "Requested: pos-"
+ (isPositionsRequested ? "yes" : "no") + ", offsets-" + (isOffsetRequested ? "yes" : "no") + ", payload-"
+ (isPayloadRequested ? "yes" : "no") + "\n";
@ -552,147 +317,31 @@ public class GetTermVectorTests extends AbstractSharedClusterTest {
}

@Test
public void testDuellESLucene() throws Exception {
String[] fieldNames = { "field_that_should_not_be_requested", "field_with_positions", "field_with_offsets", "field_with_only_tv",
"field_with_positions_offsets", "field_with_positions_payloads" };
run(addMapping(prepareCreate("test"), "type1",
new Object[] { fieldNames[0], "type", "string", "term_vector", "with_positions_offsets" },
new Object[] { fieldNames[1], "type", "string", "term_vector", "with_positions" },
new Object[] { fieldNames[2], "type", "string", "term_vector", "with_offsets" },
new Object[] { fieldNames[3], "type", "string", "store_term_vectors", "yes" },
new Object[] { fieldNames[4], "type", "string", "term_vector", "with_positions_offsets" },
new Object[] { fieldNames[5], "type", "string", "term_vector", "with_positions_payloads", "analyzer", "tv_test" })
.setSettings(
ImmutableSettings.settingsBuilder().put("index.analysis.analyzer.tv_test.tokenizer", "standard")
.putArray("index.analysis.analyzer.tv_test.filter", "type_as_payload", "lowercase")));
ensureYellow();
// would also work with XContentBuilder xcb = new XContentBuilder();

// now, create the same thing with lucene and see if the returned stuff
// is the same
String[] fieldContent = { "the quick shard jumps over the stupid brain", "here is another field",
"And yet another field without any use.", "I am out of ideas on what to type here.",
"The last field for which offsets are stored but not positions.",
"The last field for which offsets are stored but not positions." };
boolean[] storeOffsets = { true, false, true, false, true, false };
boolean[] storePositions = { true, true, false, false, true, true };
boolean[] storePayloads = { false, false, false, false, false, true };
Map<String, Object> testSource = new HashMap<String, Object>();
for (int i = 0; i < fieldNames.length; i++) {
testSource.put(fieldNames[i], fieldContent[i]);
}
client().prepareIndex("test", "type1", "1").setSource(testSource).execute().actionGet();
refresh();
String[] selectedFields = { fieldNames[1], fieldNames[2], fieldNames[3], fieldNames[4], fieldNames[5] };
testForConfig(fieldNames, fieldContent, storeOffsets, storePositions, storePayloads, selectedFields, false, false, false);
testForConfig(fieldNames, fieldContent, storeOffsets, storePositions, storePayloads, selectedFields, true, false, false);
testForConfig(fieldNames, fieldContent, storeOffsets, storePositions, storePayloads, selectedFields, false, true, false);
testForConfig(fieldNames, fieldContent, storeOffsets, storePositions, storePayloads, selectedFields, true, true, false);
testForConfig(fieldNames, fieldContent, storeOffsets, storePositions, storePayloads, selectedFields, true, false, true);
testForConfig(fieldNames, fieldContent, storeOffsets, storePositions, storePayloads, selectedFields, true, true, true);
}

private void testForConfig(String[] fieldNames, String[] fieldContent, boolean[] storeOffsets, boolean[] storePositions,
boolean[] storePayloads, String[] selectedFields, boolean withPositions, boolean withOffsets, boolean withPayloads)
throws IOException {
TermVectorRequestBuilder resp = client().prepareTermVector("test", "type1", "1").setPayloads(withPayloads).setOffsets(withOffsets)
.setPositions(withPositions).setFieldStatistics(true).setTermStatistics(true).setSelectedFields(selectedFields);
TermVectorResponse response = resp.execute().actionGet();
// build the same with lucene and compare the Fields
Fields luceneFields = buildWithLuceneAndReturnFields("1", fieldNames, fieldContent, storePositions, storeOffsets, storePayloads);
HashMap<String, Boolean> storeOffsetsMap = new HashMap<String, Boolean>();
HashMap<String, Boolean> storePositionsMap = new HashMap<String, Boolean>();
HashMap<String, Boolean> storePayloadsMap = new HashMap<String, Boolean>();
for (int i = 0; i < storePositions.length; i++) {
storeOffsetsMap.put(fieldNames[i], storeOffsets[i]);
storePositionsMap.put(fieldNames[i], storePositions[i]);
storePayloadsMap.put(fieldNames[i], storePayloads[i]);
}
compareLuceneESTermVectorResults(response.getFields(), luceneFields, storePositionsMap, storeOffsetsMap, storePayloadsMap,
withPositions, withOffsets, withPayloads, selectedFields);
}

private void compareLuceneESTermVectorResults(Fields fields, Fields luceneFields, HashMap<String, Boolean> storePositionsMap,
HashMap<String, Boolean> storeOffsetsMap, HashMap<String, Boolean> storePayloadsMap, boolean getPositions, boolean getOffsets,
boolean getPayloads, String[] selectedFields) throws IOException {
HashSet<String> selectedFieldsMap = new HashSet<String>(Arrays.asList(selectedFields));
Iterator<String> luceneFieldNames = luceneFields.iterator();
assertThat(luceneFields.size(), equalTo(storeOffsetsMap.size()));
assertThat(fields.size(), equalTo(selectedFields.length));
while (luceneFieldNames.hasNext()) {
String luceneFieldName = luceneFieldNames.next();
if (!selectedFieldsMap.contains(luceneFieldName))
continue;
Terms esTerms = fields.terms(luceneFieldName);
Terms luceneTerms = luceneFields.terms(luceneFieldName);
TermsEnum esTermEnum = esTerms.iterator(null);
TermsEnum luceneTermEnum = luceneTerms.iterator(null);
int numTerms = 0;
while (esTermEnum.next() != null) {
luceneTermEnum.next();
assertThat(esTermEnum.totalTermFreq(), equalTo(luceneTermEnum.totalTermFreq()));
DocsAndPositionsEnum esDocsPosEnum = esTermEnum.docsAndPositions(null, null, 0);
DocsAndPositionsEnum luceneDocsPosEnum = luceneTermEnum.docsAndPositions(null, null, 0);
if (luceneDocsPosEnum == null) {
assertThat(storeOffsetsMap.get(luceneFieldName), equalTo(false));
assertThat(storePayloadsMap.get(luceneFieldName), equalTo(false));
assertThat(storePositionsMap.get(luceneFieldName), equalTo(false));
continue;
}
numTerms++;
assertThat("failed for field: " + luceneFieldName, esTermEnum.term().utf8ToString(), equalTo(luceneTermEnum.term()
.utf8ToString()));
esDocsPosEnum.nextDoc();
luceneDocsPosEnum.nextDoc();
int freq = (int) esDocsPosEnum.freq();
assertThat(freq, equalTo(luceneDocsPosEnum.freq()));
for (int i = 0; i < freq; i++) {
int lucenePos = luceneDocsPosEnum.nextPosition();
int esPos = esDocsPosEnum.nextPosition();
if (storePositionsMap.get(luceneFieldName) && getPositions) {
assertThat(luceneFieldName, lucenePos, equalTo(esPos));
} else {
assertThat(esPos, equalTo(-1));
}
if (storeOffsetsMap.get(luceneFieldName) && getOffsets) {
assertThat(luceneDocsPosEnum.startOffset(), equalTo(esDocsPosEnum.startOffset()));
assertThat(luceneDocsPosEnum.endOffset(), equalTo(esDocsPosEnum.endOffset()));
} else {
assertThat(esDocsPosEnum.startOffset(), equalTo(-1));
assertThat(esDocsPosEnum.endOffset(), equalTo(-1));
}
if (storePayloadsMap.get(luceneFieldName) && getPayloads) {
assertThat(luceneFieldName, luceneDocsPosEnum.getPayload(), equalTo(esDocsPosEnum.getPayload()));
} else {
assertThat(esDocsPosEnum.getPayload(), equalTo(null));
}
}
}
}
}

@Test
public void testDuelESLucene() throws Exception {
TestFieldSetting[] testFieldSettings = getFieldSettings();
createIndexBasedOnFieldSettings(testFieldSettings, -1);
TestDoc[] testDocs = generateTestDocs(5, testFieldSettings);

// for (int i = 0; i < testDocs.length; i++)
// logger.info("Doc: {}", testDocs[i]);
DirectoryReader directoryReader = indexDocsWithLucene(testDocs);
TestConfig[] testConfigs = generateTestConfigs(20, testDocs, testFieldSettings);

for (TestConfig test : testConfigs) {
try {
TermVectorRequestBuilder request = getRequestForConfig(test);
if (test.expectedException != null) {
assertThrows(request, test.expectedException);
continue;
}

TermVectorResponse response = run(request);
Fields luceneTermVectors = getTermVectorsFromLucene(directoryReader, test.doc);
validateResponse(response, luceneTermVectors, test);
} catch (Throwable t) {
throw new Exception("Test exception while running " + test.toString(), t);
}
}
}

@Test

View File

@ -0,0 +1,256 @@
/*
* Licensed to ElasticSearch and Shay Banon under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. ElasticSearch licenses this
* file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.test.unit.termvectors;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.*;
import org.apache.lucene.index.*;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;
import org.elasticsearch.action.termvector.TermVectorRequest;
import org.elasticsearch.action.termvector.TermVectorRequest.Flag;
import org.elasticsearch.action.termvector.TermVectorResponse;
import org.elasticsearch.common.bytes.BytesArray;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.io.stream.InputStreamStreamInput;
import org.elasticsearch.common.io.stream.OutputStreamStreamOutput;
import org.elasticsearch.common.xcontent.XContentFactory;
import org.elasticsearch.common.xcontent.XContentParser;
import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.rest.action.termvector.RestTermVectorAction;
import org.hamcrest.Matchers;
import org.junit.Test;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.util.EnumSet;
import java.util.Set;
import static org.hamcrest.Matchers.equalTo;
public class TermVectorUnitTests extends org.elasticsearch.test.integration.ElasticsearchTestCase {
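
// Round-trips a populated and an empty TermVectorResponse through the
// stream (wire) format and verifies the fields survive serialization.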
@Test
public void streamResponse() throws Exception {
TermVectorResponse outResponse = new TermVectorResponse("a", "b", "c");
outResponse.setExists(true);
writeStandardTermVector(outResponse);
// write
ByteArrayOutputStream outBuffer = new ByteArrayOutputStream();
OutputStreamStreamOutput out = new OutputStreamStreamOutput(outBuffer);
outResponse.writeTo(out);
// read
ByteArrayInputStream esInBuffer = new ByteArrayInputStream(outBuffer.toByteArray());
InputStreamStreamInput esBuffer = new InputStreamStreamInput(esInBuffer);
TermVectorResponse inResponse = new TermVectorResponse("a", "b", "c");
inResponse.readFrom(esBuffer);
// see if correct
checkIfStandardTermVector(inResponse);
outResponse = new TermVectorResponse("a", "b", "c");
writeEmptyTermVector(outResponse);
// write
outBuffer = new ByteArrayOutputStream();
out = new OutputStreamStreamOutput(outBuffer);
outResponse.writeTo(out);
// read
esInBuffer = new ByteArrayInputStream(outBuffer.toByteArray());
esBuffer = new InputStreamStreamInput(esInBuffer);
inResponse = new TermVectorResponse("a", "b", "c");
inResponse.readFrom(esBuffer);
assertTrue(inResponse.isExists());
}
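
// Writes a response for a document that has no fields with term vectors;
// only the untokenized id field is indexed.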
private void writeEmptyTermVector(TermVectorResponse outResponse) throws IOException {
Directory dir = new RAMDirectory();
IndexWriterConfig conf = new IndexWriterConfig(TEST_VERSION_CURRENT, new StandardAnalyzer(TEST_VERSION_CURRENT));
conf.setOpenMode(OpenMode.CREATE);
IndexWriter writer = new IndexWriter(dir, conf);
FieldType type = new FieldType(TextField.TYPE_STORED);
type.setStoreTermVectorOffsets(true);
type.setStoreTermVectorPayloads(false);
type.setStoreTermVectorPositions(true);
type.setStoreTermVectors(true);
type.freeze();
Document d = new Document();
d.add(new Field("id", "abc", StringField.TYPE_STORED));
writer.updateDocument(new Term("id", "abc"), d);
writer.commit();
writer.close();
DirectoryReader dr = DirectoryReader.open(dir);
IndexSearcher s = new IndexSearcher(dr);
TopDocs search = s.search(new TermQuery(new Term("id", "abc")), 1);
ScoreDoc[] scoreDocs = search.scoreDocs;
int doc = scoreDocs[0].doc;
Fields fields = dr.getTermVectors(doc);
EnumSet<Flag> flags = EnumSet.of(Flag.Positions, Flag.Offsets);
outResponse.setFields(fields, null, flags, fields);
outResponse.setExists(true);
}
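
// Indexes one document with term vectors (positions, offsets) on the
// "title" and "desc" fields and copies the resulting vectors into the response.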
private void writeStandardTermVector(TermVectorResponse outResponse) throws IOException {
Directory dir = new RAMDirectory();
IndexWriterConfig conf = new IndexWriterConfig(TEST_VERSION_CURRENT, new StandardAnalyzer(TEST_VERSION_CURRENT));
conf.setOpenMode(OpenMode.CREATE);
IndexWriter writer = new IndexWriter(dir, conf);
FieldType type = new FieldType(TextField.TYPE_STORED);
type.setStoreTermVectorOffsets(true);
type.setStoreTermVectorPayloads(false);
type.setStoreTermVectorPositions(true);
type.setStoreTermVectors(true);
type.freeze();
Document d = new Document();
d.add(new Field("id", "abc", StringField.TYPE_STORED));
d.add(new Field("title", "the1 quick brown fox jumps over the1 lazy dog", type));
d.add(new Field("desc", "the1 quick brown fox jumps over the1 lazy dog", type));
writer.updateDocument(new Term("id", "abc"), d);
writer.commit();
writer.close();
DirectoryReader dr = DirectoryReader.open(dir);
IndexSearcher s = new IndexSearcher(dr);
TopDocs search = s.search(new TermQuery(new Term("id", "abc")), 1);
ScoreDoc[] scoreDocs = search.scoreDocs;
int doc = scoreDocs[0].doc;
Fields termVectors = dr.getTermVectors(doc);
EnumSet<Flag> flags = EnumSet.of(Flag.Positions, Flag.Offsets);
outResponse.setFields(termVectors, null, flags, termVectors);
}
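
// Verifies that the deserialized response still contains exactly the two
// fields written by writeStandardTermVector().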
private void checkIfStandardTermVector(TermVectorResponse inResponse) throws IOException {
Fields fields = inResponse.getFields();
assertThat(fields.terms("title"), Matchers.notNullValue());
assertThat(fields.terms("desc"), Matchers.notNullValue());
assertThat(fields.size(), equalTo(2));
}
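
// Same parsing checks as the REST layer performs: a JSON body first, then
// additional fields from the comma-separated "fields" URL parameter.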
@Test
public void testRestRequestParsing() throws Exception {
BytesReference inputBytes = new BytesArray(
" {\"fields\" : [\"a\", \"b\",\"c\"], \"offsets\":false, \"positions\":false, \"payloads\":true}");
TermVectorRequest tvr = new TermVectorRequest(null, null, null);
XContentParser parser = XContentFactory.xContent(XContentType.JSON).createParser(inputBytes);
TermVectorRequest.parseRequest(tvr, parser);
Set<String> fields = tvr.selectedFields();
assertThat(fields.contains("a"), equalTo(true));
assertThat(fields.contains("b"), equalTo(true));
assertThat(fields.contains("c"), equalTo(true));
assertThat(tvr.offsets(), equalTo(false));
assertThat(tvr.positions(), equalTo(false));
assertThat(tvr.payloads(), equalTo(true));
String additionalFields = "b,c ,d, e ";
RestTermVectorAction.addFieldStringsFromParameter(tvr, additionalFields);
assertThat(tvr.selectedFields().size(), equalTo(5));
assertThat(fields.contains("d"), equalTo(true));
assertThat(fields.contains("e"), equalTo(true));
additionalFields = "";
RestTermVectorAction.addFieldStringsFromParameter(tvr, additionalFields);
inputBytes = new BytesArray(" {\"offsets\":false, \"positions\":false, \"payloads\":true}");
tvr = new TermVectorRequest(null, null, null);
parser = XContentFactory.xContent(XContentType.JSON).createParser(inputBytes);
TermVectorRequest.parseRequest(tvr, parser);
additionalFields = "";
RestTermVectorAction.addFieldStringsFromParameter(tvr, additionalFields);
assertThat(tvr.selectedFields(), equalTo(null));
additionalFields = "b,c ,d, e ";
RestTermVectorAction.addFieldStringsFromParameter(tvr, additionalFields);
assertThat(tvr.selectedFields().size(), equalTo(4));
}
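
// An unknown key in the request body ("meaningless_term") must make the
// parser throw.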
@Test
public void testRequestParsingThrowsException() throws Exception {
BytesReference inputBytes = new BytesArray(
" {\"fields\" : \"a, b,c \", \"offsets\":false, \"positions\":false, \"payloads\":true, \"meaningless_term\":2}");
TermVectorRequest tvr = new TermVectorRequest(null, null, null);
boolean threwException = false;
try {
XContentParser parser = XContentFactory.xContent(XContentType.JSON).createParser(inputBytes);
TermVectorRequest.parseRequest(tvr, parser);
} catch (Exception e) {
threwException = true;
}
assertThat(threwException, equalTo(true));
}
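
// Round-trips randomized requests through the wire format and compares
// every flag on the deserialized copy with the original.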
@Test
public void streamRequest() throws IOException {
for (int i = 0; i < 10; i++) {
TermVectorRequest request = new TermVectorRequest("index", "type", "id");
request.offsets(randomBoolean());
request.fieldStatistics(randomBoolean());
request.payloads(randomBoolean());
request.positions(randomBoolean());
request.termStatistics(randomBoolean());
String parent = randomBoolean() ? "someParent" : null;
request.parent(parent);
String pref = randomBoolean() ? "somePreference" : null;
request.preference(pref);
// write
ByteArrayOutputStream outBuffer = new ByteArrayOutputStream();
OutputStreamStreamOutput out = new OutputStreamStreamOutput(outBuffer);
request.writeTo(out);
// read
ByteArrayInputStream esInBuffer = new ByteArrayInputStream(outBuffer.toByteArray());
InputStreamStreamInput esBuffer = new InputStreamStreamInput(esInBuffer);
TermVectorRequest req2 = new TermVectorRequest(null, null, null);
req2.readFrom(esBuffer);
assertThat(request.offsets(), equalTo(req2.offsets()));
assertThat(request.fieldStatistics(), equalTo(req2.fieldStatistics()));
assertThat(request.payloads(), equalTo(req2.payloads()));
assertThat(request.positions(), equalTo(req2.positions()));
assertThat(request.termStatistics(), equalTo(req2.termStatistics()));
assertThat(req2.preference(), equalTo(pref));
assertThat(req2.routing(), equalTo(parent));
}
}
}