Analysis: Add multi-valued text support

Add support for passing the text parameter as an array of strings (multi-valued) to AnalyzeRequestBuilder
Add support for passing the text parameter as an array of strings (multi-valued) to the Analyze REST API
Add docs

Closes #3023
This commit is contained in:
Jun Ohtani 2015-04-27 10:55:21 +09:00
parent 66921ffa50
commit 3a1a4d3e89
10 changed files with 133 additions and 57 deletions

View File

@ -18,6 +18,19 @@ curl -XGET 'localhost:9200/_analyze' -d '
coming[2.0.0, body based parameters were added in 2.0.0]
If the text parameter is provided as an array of strings, it is analyzed as a multi-valued field.
[source,js]
--------------------------------------------------
curl -XGET 'localhost:9200/_analyze' -d '
{
"analyzer" : "standard",
"text" : ["this is a test", "the second text"]
}'
--------------------------------------------------
coming[2.0.0, body based parameters were added in 2.0.0]
Or by building a custom transient analyzer out of tokenizers,
token filters and char filters. Token filters can use the shorter 'filters'
parameter name:

View File

@ -37,7 +37,7 @@
"description" : "With `true`, specify that a local shard should be used if available, with `false`, use a random shard (default: true)"
},
"text": {
"type" : "string",
"type" : "list",
"description" : "The text on which the analysis should be performed (when request body is not used)"
},
"tokenizer": {

View File

@ -63,3 +63,11 @@ setup:
body: { "text": "Bar Foo", "filters": ["lowercase"], "tokenizer": keyword }
- length: {tokens: 1 }
- match: { tokens.0.token: bar foo }
---
"Array text":
- do:
indices.analyze:
body: { "text": ["Foo Bar", "Baz"], "filters": ["lowercase"], "tokenizer": keyword }
- length: {tokens: 2 }
- match: { tokens.0.token: foo bar }
- match: { tokens.1.token: baz }

View File

@ -36,7 +36,7 @@ import static org.elasticsearch.action.ValidateActions.addValidationError;
*/
public class AnalyzeRequest extends SingleCustomOperationRequest<AnalyzeRequest> {
private String text;
private String[] text;
private String analyzer;
@ -61,11 +61,11 @@ public class AnalyzeRequest extends SingleCustomOperationRequest<AnalyzeRequest>
this.index(index);
}
public String text() {
public String[] text() {
return this.text;
}
public AnalyzeRequest text(String text) {
public AnalyzeRequest text(String... text) {
this.text = text;
return this;
}
@ -118,7 +118,7 @@ public class AnalyzeRequest extends SingleCustomOperationRequest<AnalyzeRequest>
@Override
public ActionRequestValidationException validate() {
ActionRequestValidationException validationException = super.validate();
if (text == null) {
if (text == null || text.length == 0) {
validationException = addValidationError("text is missing", validationException);
}
if (tokenFilters == null) {
@ -133,7 +133,7 @@ public class AnalyzeRequest extends SingleCustomOperationRequest<AnalyzeRequest>
@Override
public void readFrom(StreamInput in) throws IOException {
super.readFrom(in);
text = in.readString();
text = in.readStringArray();
analyzer = in.readOptionalString();
tokenizer = in.readOptionalString();
tokenFilters = in.readStringArray();
@ -144,7 +144,7 @@ public class AnalyzeRequest extends SingleCustomOperationRequest<AnalyzeRequest>
@Override
public void writeTo(StreamOutput out) throws IOException {
super.writeTo(out);
out.writeString(text);
out.writeStringArray(text);
out.writeOptionalString(analyzer);
out.writeOptionalString(tokenizer);
out.writeStringArray(tokenFilters);

View File

@ -30,7 +30,7 @@ public class AnalyzeRequestBuilder extends SingleCustomOperationRequestBuilder<A
super(client, action, new AnalyzeRequest());
}
public AnalyzeRequestBuilder(ElasticsearchClient client, AnalyzeAction action, String index, String text) {
public AnalyzeRequestBuilder(ElasticsearchClient client, AnalyzeAction action, String index, String... text) {
super(client, action, new AnalyzeRequest(index).text(text));
}
@ -86,4 +86,12 @@ public class AnalyzeRequestBuilder extends SingleCustomOperationRequestBuilder<A
request.charFilters(charFilters);
return this;
}
/**
* Sets texts to analyze
*/
public AnalyzeRequestBuilder setText(String... texts) {
request.text(texts);
return this;
}
}

View File

@ -25,6 +25,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.IOUtils;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.Version;
import org.elasticsearch.action.support.ActionFilters;
@ -210,38 +211,43 @@ public class TransportAnalyzeAction extends TransportSingleCustomOperationAction
List<AnalyzeResponse.AnalyzeToken> tokens = Lists.newArrayList();
TokenStream stream = null;
try {
stream = analyzer.tokenStream(field, request.text());
stream.reset();
CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
TypeAttribute type = stream.addAttribute(TypeAttribute.class);
int lastPosition = -1;
int lastOffset = 0;
for (String text : request.text()) {
try {
stream = analyzer.tokenStream(field, text);
stream.reset();
CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
TypeAttribute type = stream.addAttribute(TypeAttribute.class);
while (stream.incrementToken()) {
int increment = posIncr.getPositionIncrement();
if (increment > 0) {
lastPosition = lastPosition + increment;
}
tokens.add(new AnalyzeResponse.AnalyzeToken(term.toString(), lastPosition, lastOffset + offset.startOffset(), lastOffset + offset.endOffset(), type.type()));
int position = -1;
while (stream.incrementToken()) {
int increment = posIncr.getPositionIncrement();
if (increment > 0) {
position = position + increment;
}
tokens.add(new AnalyzeResponse.AnalyzeToken(term.toString(), position, offset.startOffset(), offset.endOffset(), type.type()));
}
stream.end();
} catch (IOException e) {
throw new ElasticsearchException("failed to analyze", e);
} finally {
if (stream != null) {
try {
stream.close();
} catch (IOException e) {
// ignore
}
}
if (closeAnalyzer) {
analyzer.close();
stream.end();
lastOffset += offset.endOffset();
lastPosition += posIncr.getPositionIncrement();
lastPosition += analyzer.getPositionIncrementGap(field);
lastOffset += analyzer.getOffsetGap(field);
} catch (IOException e) {
throw new ElasticsearchException("failed to analyze", e);
} finally {
IOUtils.closeWhileHandlingException(stream);
}
}
if (closeAnalyzer) {
analyzer.close();
}
return new AnalyzeResponse(tokens);
}
}

View File

@ -587,6 +587,12 @@ public interface IndicesAdminClient extends ElasticsearchClient {
*/
AnalyzeRequestBuilder prepareAnalyze(String text);
/**
* Analyzes the provided text or texts.
*/
AnalyzeRequestBuilder prepareAnalyze();
/**
* Puts an index template.
*/

View File

@ -1478,6 +1478,11 @@ public abstract class AbstractClient extends AbstractComponent implements Client
return new AnalyzeRequestBuilder(this, AnalyzeAction.INSTANCE, null, text);
}
@Override
public AnalyzeRequestBuilder prepareAnalyze() {
return new AnalyzeRequestBuilder(this, AnalyzeAction.INSTANCE);
}
@Override
public ActionFuture<PutIndexTemplateResponse> putTemplate(final PutIndexTemplateRequest request) {
return execute(PutIndexTemplateAction.INSTANCE, request);

View File

@ -22,6 +22,7 @@ import com.google.common.collect.Lists;
import org.elasticsearch.action.admin.indices.analyze.AnalyzeRequest;
import org.elasticsearch.action.admin.indices.analyze.AnalyzeResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.settings.Settings;
@ -55,10 +56,10 @@ public class RestAnalyzeAction extends BaseRestHandler {
@Override
public void handleRequest(final RestRequest request, final RestChannel channel, final Client client) {
String text = request.param("text");
String[] texts = request.paramAsStringArrayOrEmptyIfAll("text");
AnalyzeRequest analyzeRequest = new AnalyzeRequest(request.param("index"));
analyzeRequest.text(text);
analyzeRequest.text(texts);
analyzeRequest.preferLocal(request.paramAsBoolean("prefer_local", analyzeRequest.preferLocalShard()));
analyzeRequest.analyzer(request.param("analyzer"));
analyzeRequest.field(request.param("field"));
@ -69,9 +70,9 @@ public class RestAnalyzeAction extends BaseRestHandler {
if (RestActions.hasBodyContent(request)) {
XContentType type = RestActions.guessBodyContentType(request);
if (type == null) {
if (text == null) {
text = RestActions.getRestContent(request).toUtf8();
analyzeRequest.text(text);
if (texts == null || texts.length == 0) {
texts = new String[]{ RestActions.getRestContent(request).toUtf8() };
analyzeRequest.text(texts);
}
} else {
// NOTE: if a REST request with an xcontent body also has request parameters, the parameters do not override the xcontent values
@ -95,7 +96,16 @@ public class RestAnalyzeAction extends BaseRestHandler {
} else if ("prefer_local".equals(currentFieldName) && token == XContentParser.Token.VALUE_BOOLEAN) {
analyzeRequest.preferLocal(parser.booleanValue());
} else if ("text".equals(currentFieldName) && token == XContentParser.Token.VALUE_STRING) {
analyzeRequest.text(parser.text());
analyzeRequest.text(parser.text());
} else if ("text".equals(currentFieldName) && token == XContentParser.Token.START_ARRAY) {
List<String> texts = Lists.newArrayList();
while ((token = parser.nextToken()) != XContentParser.Token.END_ARRAY) {
if (token.isValue() == false) {
throw new IllegalArgumentException(currentFieldName + " array element should only contain text");
}
texts.add(parser.text());
}
analyzeRequest.text(texts.toArray(Strings.EMPTY_ARRAY));
} else if ("analyzer".equals(currentFieldName) && token == XContentParser.Token.VALUE_STRING) {
analyzeRequest.analyzer(parser.text());
} else if ("field".equals(currentFieldName) && token == XContentParser.Token.VALUE_STRING) {
@ -110,7 +120,7 @@ public class RestAnalyzeAction extends BaseRestHandler {
}
filters.add(parser.text());
}
analyzeRequest.tokenFilters(filters.toArray(new String[0]));
analyzeRequest.tokenFilters(filters.toArray(Strings.EMPTY_ARRAY));
} else if ("char_filters".equals(currentFieldName) && token == XContentParser.Token.START_ARRAY) {
List<String> charFilters = Lists.newArrayList();
while ((token = parser.nextToken()) != XContentParser.Token.END_ARRAY) {
@ -119,7 +129,7 @@ public class RestAnalyzeAction extends BaseRestHandler {
}
charFilters.add(parser.text());
}
analyzeRequest.tokenFilters(charFilters.toArray(new String[0]));
analyzeRequest.tokenFilters(charFilters.toArray(Strings.EMPTY_ARRAY));
} else {
throw new IllegalArgumentException("Unknown parameter [" + currentFieldName + "] in request body or parameter is of the wrong type[" + token + "] ");
}

View File

@ -158,18 +158,7 @@ public class AnalyzeActionTests extends ElasticsearchIntegrationTest {
ensureGreen();
client().admin().indices().preparePutMapping("test")
.setType("document").setSource(
"{\n" +
" \"document\":{\n" +
" \"properties\":{\n" +
" \"simple\":{\n" +
" \"type\":\"string\",\n" +
" \"analyzer\": \"simple\"\n" +
" }\n" +
" }\n" +
" }\n" +
"}"
).get();
.setType("document").setSource("simple", "type=string,analyzer=simple").get();
for (int i = 0; i < 10; i++) {
final AnalyzeRequestBuilder requestBuilder = client().admin().indices().prepareAnalyze("THIS IS A TEST");
@ -220,7 +209,8 @@ public class AnalyzeActionTests extends ElasticsearchIntegrationTest {
RestAnalyzeAction.buildFromContent(content, analyzeRequest);
assertThat(analyzeRequest.text(), equalTo("THIS IS A TEST"));
assertThat(analyzeRequest.text().length, equalTo(1));
assertThat(analyzeRequest.text(), equalTo(new String[]{"THIS IS A TEST"}));
assertThat(analyzeRequest.tokenizer(), equalTo("keyword"));
assertThat(analyzeRequest.tokenFilters(), equalTo(new String[]{"lowercase"}));
}
@ -239,7 +229,6 @@ public class AnalyzeActionTests extends ElasticsearchIntegrationTest {
}
}
@Test
public void testParseXContentForAnalyzeRequestWithUnknownParamThrowsException() throws Exception {
AnalyzeRequest analyzeRequest = new AnalyzeRequest("for test");
@ -258,4 +247,35 @@ public class AnalyzeActionTests extends ElasticsearchIntegrationTest {
}
}
@Test
public void analyzerWithMultiValues() throws Exception {
assertAcked(prepareCreate("test").addAlias(new Alias("alias")));
ensureGreen();
client().admin().indices().preparePutMapping("test")
.setType("document").setSource("simple", "type=string,analyzer=simple,position_offset_gap=100").get();
String[] texts = new String[]{"THIS IS A TEST", "THE SECOND TEXT"};
final AnalyzeRequestBuilder requestBuilder = client().admin().indices().prepareAnalyze();
requestBuilder.setText(texts);
requestBuilder.setIndex(indexOrAlias());
requestBuilder.setField("simple");
AnalyzeResponse analyzeResponse = requestBuilder.get();
assertThat(analyzeResponse.getTokens().size(), equalTo(7));
AnalyzeResponse.AnalyzeToken token = analyzeResponse.getTokens().get(3);
assertThat(token.getTerm(), equalTo("test"));
assertThat(token.getPosition(), equalTo(3));
assertThat(token.getStartOffset(), equalTo(10));
assertThat(token.getEndOffset(), equalTo(14));
token = analyzeResponse.getTokens().get(5);
assertThat(token.getTerm(), equalTo("second"));
assertThat(token.getPosition(), equalTo(105));
assertThat(token.getStartOffset(), equalTo(19));
assertThat(token.getEndOffset(), equalTo(25));
}
}