Analysis: Add multi-valued text support
Add support for passing text as an array (multi-valued) to AnalyzeRequestBuilder.
Add the same multi-valued text support to the Analyze REST API.
Add docs.

Closes #3023

parent 66921ffa50
commit 3a1a4d3e89
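For orientation before the diff, a minimal sketch (not part of the commit) of the client-side flow this change enables. The Client instance, index name "test", and field name "simple" are placeholders.

import org.elasticsearch.action.admin.indices.analyze.AnalyzeResponse;
import org.elasticsearch.client.Client;

public class MultiValuedAnalyzeExample {

    // Sketch only: assumes an already-connected Client and an index "test"
    // with a mapped string field "simple".
    static void analyzeMultiValued(Client client) {
        AnalyzeResponse response = client.admin().indices()
                .prepareAnalyze()                             // new no-arg overload (see below)
                .setIndex("test")
                .setField("simple")
                .setText("this is a test", "the second text") // new varargs setter
                .get();
        for (AnalyzeResponse.AnalyzeToken token : response.getTokens()) {
            // positions and offsets continue across the two values
            System.out.println(token.getTerm() + " pos=" + token.getPosition());
        }
    }
}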
@@ -18,6 +18,19 @@ curl -XGET 'localhost:9200/_analyze' -d '
 coming[2.0.0, body based parameters were added in 2.0.0]
 
+If the text parameter is provided as an array of strings, it is analyzed as a multi-valued field.
+
+[source,js]
+--------------------------------------------------
+curl -XGET 'localhost:9200/_analyze' -d '
+{
+  "analyzer" : "standard",
+  "text" : ["this is a test", "the second text"]
+}'
+--------------------------------------------------
+
+coming[2.0.0, body based parameters were added in 2.0.0]
+
 Or by building a custom transient analyzer out of tokenizers,
 token filters and char filters. Token filters can use the shorter 'filters'
 parameter name:
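In the response, positions and offsets continue across the array elements rather than restarting per value; assuming Lucene's defaults for a directly-specified analyzer (position increment gap 0, offset gap 1), "the" in the example above would come back at position 4 and start offset 15 rather than 0. The transport-level change that produces this is in TransportAnalyzeAction further down.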
@@ -37,7 +37,7 @@
       "description" : "With `true`, specify that a local shard should be used if available, with `false`, use a random shard (default: true)"
     },
     "text": {
-      "type" : "string",
+      "type" : "list",
       "description" : "The text on which the analysis should be performed (when request body is not used)"
     },
     "tokenizer": {
@@ -63,3 +63,11 @@ setup:
         body: { "text": "Bar Foo", "filters": ["lowercase"], "tokenizer": keyword }
   - length: {tokens: 1 }
   - match: { tokens.0.token: bar foo }
+---
+"Array text":
+  - do:
+      indices.analyze:
+        body: { "text": ["Foo Bar", "Baz"], "filters": ["lowercase"], "tokenizer": keyword }
+  - length: {tokens: 2 }
+  - match: { tokens.0.token: foo bar }
+  - match: { tokens.1.token: baz }
@@ -36,7 +36,7 @@ import static org.elasticsearch.action.ValidateActions.addValidationError;
  */
 public class AnalyzeRequest extends SingleCustomOperationRequest<AnalyzeRequest> {
 
-    private String text;
+    private String[] text;
 
     private String analyzer;
@@ -61,11 +61,11 @@ public class AnalyzeRequest extends SingleCustomOperationRequest<AnalyzeRequest>
         this.index(index);
     }
 
-    public String text() {
+    public String[] text() {
         return this.text;
     }
 
-    public AnalyzeRequest text(String text) {
+    public AnalyzeRequest text(String... text) {
         this.text = text;
         return this;
     }
@@ -118,7 +118,7 @@ public class AnalyzeRequest extends SingleCustomOperationRequest<AnalyzeRequest>
     @Override
     public ActionRequestValidationException validate() {
         ActionRequestValidationException validationException = super.validate();
-        if (text == null) {
+        if (text == null || text.length == 0) {
             validationException = addValidationError("text is missing", validationException);
         }
         if (tokenFilters == null) {
@@ -133,7 +133,7 @@ public class AnalyzeRequest extends SingleCustomOperationRequest<AnalyzeRequest>
     @Override
     public void readFrom(StreamInput in) throws IOException {
         super.readFrom(in);
-        text = in.readString();
+        text = in.readStringArray();
         analyzer = in.readOptionalString();
         tokenizer = in.readOptionalString();
         tokenFilters = in.readStringArray();
@@ -144,7 +144,7 @@ public class AnalyzeRequest extends SingleCustomOperationRequest<AnalyzeRequest>
     @Override
     public void writeTo(StreamOutput out) throws IOException {
         super.writeTo(out);
-        out.writeString(text);
+        out.writeStringArray(text);
         out.writeOptionalString(analyzer);
         out.writeOptionalString(tokenizer);
         out.writeStringArray(tokenFilters);
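Switching readString/writeString to the array variants is a wire-format change for the analyze request, so it only interoperates with nodes carrying the same change; presumably that is acceptable here because both sides of the stream change together in the same release line.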
@@ -30,7 +30,7 @@ public class AnalyzeRequestBuilder extends SingleCustomOperationRequestBuilder<A
         super(client, action, new AnalyzeRequest());
     }
 
-    public AnalyzeRequestBuilder(ElasticsearchClient client, AnalyzeAction action, String index, String text) {
+    public AnalyzeRequestBuilder(ElasticsearchClient client, AnalyzeAction action, String index, String... text) {
         super(client, action, new AnalyzeRequest(index).text(text));
     }
@@ -86,4 +86,12 @@ public class AnalyzeRequestBuilder extends SingleCustomOperationRequestBuilder<A
         request.charFilters(charFilters);
         return this;
     }
+
+    /**
+     * Sets texts to analyze
+     */
+    public AnalyzeRequestBuilder setText(String... texts) {
+        request.text(texts);
+        return this;
+    }
 }
@@ -25,6 +25,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.IOUtils;
 import org.elasticsearch.ElasticsearchException;
 import org.elasticsearch.Version;
 import org.elasticsearch.action.support.ActionFilters;
@@ -210,38 +211,43 @@ public class TransportAnalyzeAction extends TransportSingleCustomOperationAction
 
         List<AnalyzeResponse.AnalyzeToken> tokens = Lists.newArrayList();
         TokenStream stream = null;
-        try {
-            stream = analyzer.tokenStream(field, request.text());
-            stream.reset();
-            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
-            PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
-            OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
-            TypeAttribute type = stream.addAttribute(TypeAttribute.class);
-
-            int position = -1;
-            while (stream.incrementToken()) {
-                int increment = posIncr.getPositionIncrement();
-                if (increment > 0) {
-                    position = position + increment;
-                }
-                tokens.add(new AnalyzeResponse.AnalyzeToken(term.toString(), position, offset.startOffset(), offset.endOffset(), type.type()));
-            }
-            stream.end();
-        } catch (IOException e) {
-            throw new ElasticsearchException("failed to analyze", e);
-        } finally {
-            if (stream != null) {
-                try {
-                    stream.close();
-                } catch (IOException e) {
-                    // ignore
-                }
-            }
-            if (closeAnalyzer) {
-                analyzer.close();
+        int lastPosition = -1;
+        int lastOffset = 0;
+        for (String text : request.text()) {
+            try {
+                stream = analyzer.tokenStream(field, text);
+                stream.reset();
+                CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
+                PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
+                OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
+                TypeAttribute type = stream.addAttribute(TypeAttribute.class);
+
+                while (stream.incrementToken()) {
+                    int increment = posIncr.getPositionIncrement();
+                    if (increment > 0) {
+                        lastPosition = lastPosition + increment;
+                    }
+                    tokens.add(new AnalyzeResponse.AnalyzeToken(term.toString(), lastPosition, lastOffset + offset.startOffset(), lastOffset + offset.endOffset(), type.type()));
+                }
+                stream.end();
+                lastOffset += offset.endOffset();
+                lastPosition += posIncr.getPositionIncrement();
+
+                lastPosition += analyzer.getPositionIncrementGap(field);
+                lastOffset += analyzer.getOffsetGap(field);
+
+            } catch (IOException e) {
+                throw new ElasticsearchException("failed to analyze", e);
+            } finally {
+                IOUtils.closeWhileHandlingException(stream);
             }
         }
 
+        if (closeAnalyzer) {
+            analyzer.close();
+        }
+
         return new AnalyzeResponse(tokens);
     }
 }
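The loop above is the heart of the change: within each value, tokens are reported at lastPosition with offsets rebased on lastOffset, and after each value both counters advance past the value's end plus the analyzer's configured gaps. A standalone sketch of the same accumulation against a plain Lucene analyzer, assuming the Lucene 5.x API of this era; the field name "f" and WhitespaceAnalyzer are arbitrary choices, not part of the commit:

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

public class MultiValuedTokenWalk {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new WhitespaceAnalyzer();
        int lastPosition = -1; // last emitted position; -1 before the first token
        int lastOffset = 0;    // offset base for the current value
        for (String text : new String[]{"this is a test", "the second text"}) {
            try (TokenStream stream = analyzer.tokenStream("f", text)) {
                CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
                PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
                OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
                stream.reset();
                while (stream.incrementToken()) {
                    int increment = posIncr.getPositionIncrement();
                    if (increment > 0) {
                        lastPosition += increment;
                    }
                    System.out.printf("%s pos=%d offsets=[%d,%d]%n", term, lastPosition,
                            lastOffset + offset.startOffset(), lastOffset + offset.endOffset());
                }
                stream.end();
                // carry over: trailing position increment, end offset, and the gaps
                lastPosition += posIncr.getPositionIncrement();
                lastOffset += offset.endOffset();
                lastPosition += analyzer.getPositionIncrementGap("f");
                lastOffset += analyzer.getOffsetGap("f");
            }
        }
        analyzer.close();
    }
}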
@@ -587,6 +587,12 @@ public interface IndicesAdminClient extends ElasticsearchClient {
      */
     AnalyzeRequestBuilder prepareAnalyze(String text);
 
+    /**
+     * Analyze text/texts.
+     *
+     */
+    AnalyzeRequestBuilder prepareAnalyze();
+
     /**
      * Puts an index template.
      */
@@ -1478,6 +1478,11 @@ public abstract class AbstractClient extends AbstractComponent implements Client
         return new AnalyzeRequestBuilder(this, AnalyzeAction.INSTANCE, null, text);
     }
 
+    @Override
+    public AnalyzeRequestBuilder prepareAnalyze() {
+        return new AnalyzeRequestBuilder(this, AnalyzeAction.INSTANCE);
+    }
+
     @Override
     public ActionFuture<PutIndexTemplateResponse> putTemplate(final PutIndexTemplateRequest request) {
         return execute(PutIndexTemplateAction.INSTANCE, request);
@@ -22,6 +22,7 @@ import com.google.common.collect.Lists;
 import org.elasticsearch.action.admin.indices.analyze.AnalyzeRequest;
 import org.elasticsearch.action.admin.indices.analyze.AnalyzeResponse;
 import org.elasticsearch.client.Client;
+import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.bytes.BytesReference;
 import org.elasticsearch.common.inject.Inject;
 import org.elasticsearch.common.settings.Settings;
@@ -55,10 +56,10 @@ public class RestAnalyzeAction extends BaseRestHandler {
     @Override
     public void handleRequest(final RestRequest request, final RestChannel channel, final Client client) {
 
-        String text = request.param("text");
+        String[] texts = request.paramAsStringArrayOrEmptyIfAll("text");
 
         AnalyzeRequest analyzeRequest = new AnalyzeRequest(request.param("index"));
-        analyzeRequest.text(text);
+        analyzeRequest.text(texts);
         analyzeRequest.preferLocal(request.paramAsBoolean("prefer_local", analyzeRequest.preferLocalShard()));
         analyzeRequest.analyzer(request.param("analyzer"));
         analyzeRequest.field(request.param("field"));
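Because paramAsStringArrayOrEmptyIfAll is the standard REST-parameter helper, the URL form inherits Elasticsearch's usual list handling: a comma-separated text value is split into multiple strings and _all yields an empty array, which the body-handling code below then treats the same as a missing parameter.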
@@ -69,9 +70,9 @@ public class RestAnalyzeAction extends BaseRestHandler {
         if (RestActions.hasBodyContent(request)) {
             XContentType type = RestActions.guessBodyContentType(request);
             if (type == null) {
-                if (text == null) {
-                    text = RestActions.getRestContent(request).toUtf8();
-                    analyzeRequest.text(text);
+                if (texts == null || texts.length == 0) {
+                    texts = new String[]{ RestActions.getRestContent(request).toUtf8() };
+                    analyzeRequest.text(texts);
                 }
             } else {
                 // NOTE: if rest request with xcontent body has request parameters, the parameters does not override xcontent values
@@ -95,7 +96,16 @@ public class RestAnalyzeAction extends BaseRestHandler {
             } else if ("prefer_local".equals(currentFieldName) && token == XContentParser.Token.VALUE_BOOLEAN) {
                 analyzeRequest.preferLocal(parser.booleanValue());
             } else if ("text".equals(currentFieldName) && token == XContentParser.Token.VALUE_STRING) {
                 analyzeRequest.text(parser.text());
+            } else if ("text".equals(currentFieldName) && token == XContentParser.Token.START_ARRAY) {
+                List<String> texts = Lists.newArrayList();
+                while ((token = parser.nextToken()) != XContentParser.Token.END_ARRAY) {
+                    if (token.isValue() == false) {
+                        throw new IllegalArgumentException(currentFieldName + " array element should only contain text");
+                    }
+                    texts.add(parser.text());
+                }
+                analyzeRequest.text(texts.toArray(Strings.EMPTY_ARRAY));
             } else if ("analyzer".equals(currentFieldName) && token == XContentParser.Token.VALUE_STRING) {
                 analyzeRequest.analyzer(parser.text());
             } else if ("field".equals(currentFieldName) && token == XContentParser.Token.VALUE_STRING) {
@@ -110,7 +120,7 @@ public class RestAnalyzeAction extends BaseRestHandler {
                     }
                     filters.add(parser.text());
                 }
-                analyzeRequest.tokenFilters(filters.toArray(new String[0]));
+                analyzeRequest.tokenFilters(filters.toArray(Strings.EMPTY_ARRAY));
             } else if ("char_filters".equals(currentFieldName) && token == XContentParser.Token.START_ARRAY) {
                 List<String> charFilters = Lists.newArrayList();
                 while ((token = parser.nextToken()) != XContentParser.Token.END_ARRAY) {
@@ -119,7 +129,7 @@ public class RestAnalyzeAction extends BaseRestHandler {
                     }
                     charFilters.add(parser.text());
                 }
-                analyzeRequest.tokenFilters(charFilters.toArray(new String[0]));
+                analyzeRequest.tokenFilters(charFilters.toArray(Strings.EMPTY_ARRAY));
             } else {
                 throw new IllegalArgumentException("Unknown parameter [" + currentFieldName + "] in request body or parameter is of the wrong type[" + token + "] ");
             }
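(Unrelated to the multi-value change itself: both of the array branches above end in analyzeRequest.tokenFilters(...). The char_filters branch presumably means charFilters(...); the diff carries that pre-existing quirk over while only swapping new String[0] for Strings.EMPTY_ARRAY.)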
@@ -158,18 +158,7 @@ public class AnalyzeActionTests extends ElasticsearchIntegrationTest {
         ensureGreen();
 
-        client().admin().indices().preparePutMapping("test")
-                .setType("document").setSource(
-                "{\n" +
-                "    \"document\":{\n" +
-                "        \"properties\":{\n" +
-                "            \"simple\":{\n" +
-                "                \"type\":\"string\",\n" +
-                "                \"analyzer\": \"simple\"\n" +
-                "            }\n" +
-                "        }\n" +
-                "    }\n" +
-                "}"
-        ).get();
+        client().admin().indices().preparePutMapping("test")
+                .setType("document").setSource("simple", "type=string,analyzer=simple").get();
 
         for (int i = 0; i < 10; i++) {
             final AnalyzeRequestBuilder requestBuilder = client().admin().indices().prepareAnalyze("THIS IS A TEST");
|
|||
|
||||
RestAnalyzeAction.buildFromContent(content, analyzeRequest);
|
||||
|
||||
assertThat(analyzeRequest.text(), equalTo("THIS IS A TEST"));
|
||||
assertThat(analyzeRequest.text().length, equalTo(1));
|
||||
assertThat(analyzeRequest.text(), equalTo(new String[]{"THIS IS A TEST"}));
|
||||
assertThat(analyzeRequest.tokenizer(), equalTo("keyword"));
|
||||
assertThat(analyzeRequest.tokenFilters(), equalTo(new String[]{"lowercase"}));
|
||||
}
|
||||
|
@ -239,7 +229,6 @@ public class AnalyzeActionTests extends ElasticsearchIntegrationTest {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testParseXContentForAnalyzeRequestWithUnknownParamThrowsException() throws Exception {
|
||||
AnalyzeRequest analyzeRequest = new AnalyzeRequest("for test");
|
||||
|
@ -258,4 +247,35 @@ public class AnalyzeActionTests extends ElasticsearchIntegrationTest {
|
|||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void analyzerWithMultiValues() throws Exception {
|
||||
|
||||
assertAcked(prepareCreate("test").addAlias(new Alias("alias")));
|
||||
ensureGreen();
|
||||
|
||||
client().admin().indices().preparePutMapping("test")
|
||||
.setType("document").setSource("simple", "type=string,analyzer=simple,position_offset_gap=100").get();
|
||||
|
||||
String[] texts = new String[]{"THIS IS A TEST", "THE SECOND TEXT"};
|
||||
|
||||
final AnalyzeRequestBuilder requestBuilder = client().admin().indices().prepareAnalyze();
|
||||
requestBuilder.setText(texts);
|
||||
requestBuilder.setIndex(indexOrAlias());
|
||||
requestBuilder.setField("simple");
|
||||
AnalyzeResponse analyzeResponse = requestBuilder.get();
|
||||
assertThat(analyzeResponse.getTokens().size(), equalTo(7));
|
||||
AnalyzeResponse.AnalyzeToken token = analyzeResponse.getTokens().get(3);
|
||||
assertThat(token.getTerm(), equalTo("test"));
|
||||
assertThat(token.getPosition(), equalTo(3));
|
||||
assertThat(token.getStartOffset(), equalTo(10));
|
||||
assertThat(token.getEndOffset(), equalTo(14));
|
||||
|
||||
token = analyzeResponse.getTokens().get(5);
|
||||
assertThat(token.getTerm(), equalTo("second"));
|
||||
assertThat(token.getPosition(), equalTo(105));
|
||||
assertThat(token.getStartOffset(), equalTo(19));
|
||||
assertThat(token.getEndOffset(), equalTo(25));
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
|
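A quick sanity check of the expected numbers: the simple analyzer lowercases and splits on non-letters, so the two values yield seven tokens, with token 3 ("test") at position 3 and offsets 10-14 inside the first value. For the second value, positions resume after the mapped position_offset_gap of 100, putting "second" at 105, and offsets are rebased past the first value's end offset (14) plus the analyzer's offset gap (1 by default), hence 19-25.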