Analysis: Add multi-valued text support

Add support for passing the text parameter as an array of strings (multi-valued) to AnalyzeRequestBuilder
Add support for passing the text parameter as an array of strings (multi-valued) to the Analyze REST API
Add docs

Closes #3023
This commit is contained in:
Jun Ohtani 2015-04-27 10:55:21 +09:00
parent 66921ffa50
commit 3a1a4d3e89
10 changed files with 133 additions and 57 deletions

View File

@ -18,6 +18,19 @@ curl -XGET 'localhost:9200/_analyze' -d '
coming[2.0.0, body based parameters were added in 2.0.0]
If the text parameter is provided as an array of strings, it is analyzed as a multi-valued field.
[source,js]
--------------------------------------------------
curl -XGET 'localhost:9200/_analyze' -d '
{
"analyzer" : "standard",
"text" : ["this is a test", "the second text"]
}'
--------------------------------------------------
coming[2.0.0, body based parameters were added in 2.0.0]
Or by building a custom transient analyzer out of tokenizers,
token filters and char filters. Token filters can use the shorter 'filters'
parameter name:

View File

@ -37,7 +37,7 @@
"description" : "With `true`, specify that a local shard should be used if available, with `false`, use a random shard (default: true)"
},
"text": {
"type" : "string",
"type" : "list",
"description" : "The text on which the analysis should be performed (when request body is not used)"
},
"tokenizer": {

View File

@ -63,3 +63,11 @@ setup:
body: { "text": "Bar Foo", "filters": ["lowercase"], "tokenizer": keyword }
- length: {tokens: 1 }
- match: { tokens.0.token: bar foo }
---
"Array text":
- do:
indices.analyze:
body: { "text": ["Foo Bar", "Baz"], "filters": ["lowercase"], "tokenizer": keyword }
- length: {tokens: 2 }
- match: { tokens.0.token: foo bar }
- match: { tokens.1.token: baz }

View File

@ -36,7 +36,7 @@ import static org.elasticsearch.action.ValidateActions.addValidationError;
*/
public class AnalyzeRequest extends SingleCustomOperationRequest<AnalyzeRequest> {
private String text;
private String[] text;
private String analyzer;
@ -61,11 +61,11 @@ public class AnalyzeRequest extends SingleCustomOperationRequest<AnalyzeRequest>
this.index(index);
}
public String text() {
public String[] text() {
return this.text;
}
public AnalyzeRequest text(String text) {
public AnalyzeRequest text(String... text) {
this.text = text;
return this;
}
@ -118,7 +118,7 @@ public class AnalyzeRequest extends SingleCustomOperationRequest<AnalyzeRequest>
@Override
public ActionRequestValidationException validate() {
ActionRequestValidationException validationException = super.validate();
if (text == null) {
if (text == null || text.length == 0) {
validationException = addValidationError("text is missing", validationException);
}
if (tokenFilters == null) {
@ -133,7 +133,7 @@ public class AnalyzeRequest extends SingleCustomOperationRequest<AnalyzeRequest>
@Override
public void readFrom(StreamInput in) throws IOException {
super.readFrom(in);
text = in.readString();
text = in.readStringArray();
analyzer = in.readOptionalString();
tokenizer = in.readOptionalString();
tokenFilters = in.readStringArray();
@ -144,7 +144,7 @@ public class AnalyzeRequest extends SingleCustomOperationRequest<AnalyzeRequest>
@Override
public void writeTo(StreamOutput out) throws IOException {
super.writeTo(out);
out.writeString(text);
out.writeStringArray(text);
out.writeOptionalString(analyzer);
out.writeOptionalString(tokenizer);
out.writeStringArray(tokenFilters);

View File

@ -30,7 +30,7 @@ public class AnalyzeRequestBuilder extends SingleCustomOperationRequestBuilder<A
super(client, action, new AnalyzeRequest());
}
public AnalyzeRequestBuilder(ElasticsearchClient client, AnalyzeAction action, String index, String text) {
public AnalyzeRequestBuilder(ElasticsearchClient client, AnalyzeAction action, String index, String... text) {
super(client, action, new AnalyzeRequest(index).text(text));
}
@ -86,4 +86,12 @@ public class AnalyzeRequestBuilder extends SingleCustomOperationRequestBuilder<A
request.charFilters(charFilters);
return this;
}
/**
* Sets texts to analyze
*/
public AnalyzeRequestBuilder setText(String... texts) {
request.text(texts);
return this;
}
}

View File

@ -25,6 +25,7 @@ import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.IOUtils;
import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.Version;
import org.elasticsearch.action.support.ActionFilters;
@ -210,38 +211,43 @@ public class TransportAnalyzeAction extends TransportSingleCustomOperationAction
List<AnalyzeResponse.AnalyzeToken> tokens = Lists.newArrayList();
TokenStream stream = null;
try {
stream = analyzer.tokenStream(field, request.text());
stream.reset();
CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
TypeAttribute type = stream.addAttribute(TypeAttribute.class);
int lastPosition = -1;
int lastOffset = 0;
for (String text : request.text()) {
try {
stream = analyzer.tokenStream(field, text);
stream.reset();
CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
TypeAttribute type = stream.addAttribute(TypeAttribute.class);
while (stream.incrementToken()) {
int increment = posIncr.getPositionIncrement();
if (increment > 0) {
lastPosition = lastPosition + increment;
}
tokens.add(new AnalyzeResponse.AnalyzeToken(term.toString(), lastPosition, lastOffset + offset.startOffset(), lastOffset + offset.endOffset(), type.type()));
int position = -1;
while (stream.incrementToken()) {
int increment = posIncr.getPositionIncrement();
if (increment > 0) {
position = position + increment;
}
tokens.add(new AnalyzeResponse.AnalyzeToken(term.toString(), position, offset.startOffset(), offset.endOffset(), type.type()));
}
stream.end();
} catch (IOException e) {
throw new ElasticsearchException("failed to analyze", e);
} finally {
if (stream != null) {
try {
stream.close();
} catch (IOException e) {
// ignore
}
}
if (closeAnalyzer) {
analyzer.close();
stream.end();
lastOffset += offset.endOffset();
lastPosition += posIncr.getPositionIncrement();
lastPosition += analyzer.getPositionIncrementGap(field);
lastOffset += analyzer.getOffsetGap(field);
} catch (IOException e) {
throw new ElasticsearchException("failed to analyze", e);
} finally {
IOUtils.closeWhileHandlingException(stream);
}
}
if (closeAnalyzer) {
analyzer.close();
}
return new AnalyzeResponse(tokens);
}
}

View File

@ -587,6 +587,12 @@ public interface IndicesAdminClient extends ElasticsearchClient {
*/
AnalyzeRequestBuilder prepareAnalyze(String text);
/**
* Analyzes the provided text or texts.
*/
AnalyzeRequestBuilder prepareAnalyze();
/**
* Puts an index template.
*/

View File

@ -1478,6 +1478,11 @@ public abstract class AbstractClient extends AbstractComponent implements Client
return new AnalyzeRequestBuilder(this, AnalyzeAction.INSTANCE, null, text);
}
@Override
public AnalyzeRequestBuilder prepareAnalyze() {
return new AnalyzeRequestBuilder(this, AnalyzeAction.INSTANCE);
}
@Override
public ActionFuture<PutIndexTemplateResponse> putTemplate(final PutIndexTemplateRequest request) {
return execute(PutIndexTemplateAction.INSTANCE, request);

View File

@ -22,6 +22,7 @@ import com.google.common.collect.Lists;
import org.elasticsearch.action.admin.indices.analyze.AnalyzeRequest;
import org.elasticsearch.action.admin.indices.analyze.AnalyzeResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.common.Strings;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.settings.Settings;
@ -55,10 +56,10 @@ public class RestAnalyzeAction extends BaseRestHandler {
@Override
public void handleRequest(final RestRequest request, final RestChannel channel, final Client client) {
String text = request.param("text");
String[] texts = request.paramAsStringArrayOrEmptyIfAll("text");
AnalyzeRequest analyzeRequest = new AnalyzeRequest(request.param("index"));
analyzeRequest.text(text);
analyzeRequest.text(texts);
analyzeRequest.preferLocal(request.paramAsBoolean("prefer_local", analyzeRequest.preferLocalShard()));
analyzeRequest.analyzer(request.param("analyzer"));
analyzeRequest.field(request.param("field"));
@ -69,9 +70,9 @@ public class RestAnalyzeAction extends BaseRestHandler {
if (RestActions.hasBodyContent(request)) {
XContentType type = RestActions.guessBodyContentType(request);
if (type == null) {
if (text == null) {
text = RestActions.getRestContent(request).toUtf8();
analyzeRequest.text(text);
if (texts == null || texts.length == 0) {
texts = new String[]{ RestActions.getRestContent(request).toUtf8() };
analyzeRequest.text(texts);
}
} else {
// NOTE: if a REST request with an xcontent body also has request parameters, the parameters do not override the xcontent values
@ -95,7 +96,16 @@ public class RestAnalyzeAction extends BaseRestHandler {
} else if ("prefer_local".equals(currentFieldName) && token == XContentParser.Token.VALUE_BOOLEAN) {
analyzeRequest.preferLocal(parser.booleanValue());
} else if ("text".equals(currentFieldName) && token == XContentParser.Token.VALUE_STRING) {
analyzeRequest.text(parser.text());
analyzeRequest.text(parser.text());
} else if ("text".equals(currentFieldName) && token == XContentParser.Token.START_ARRAY) {
List<String> texts = Lists.newArrayList();
while ((token = parser.nextToken()) != XContentParser.Token.END_ARRAY) {
if (token.isValue() == false) {
throw new IllegalArgumentException(currentFieldName + " array element should only contain text");
}
texts.add(parser.text());
}
analyzeRequest.text(texts.toArray(Strings.EMPTY_ARRAY));
} else if ("analyzer".equals(currentFieldName) && token == XContentParser.Token.VALUE_STRING) {
analyzeRequest.analyzer(parser.text());
} else if ("field".equals(currentFieldName) && token == XContentParser.Token.VALUE_STRING) {
@ -110,7 +120,7 @@ public class RestAnalyzeAction extends BaseRestHandler {
}
filters.add(parser.text());
}
analyzeRequest.tokenFilters(filters.toArray(new String[0]));
analyzeRequest.tokenFilters(filters.toArray(Strings.EMPTY_ARRAY));
} else if ("char_filters".equals(currentFieldName) && token == XContentParser.Token.START_ARRAY) {
List<String> charFilters = Lists.newArrayList();
while ((token = parser.nextToken()) != XContentParser.Token.END_ARRAY) {
@ -119,7 +129,7 @@ public class RestAnalyzeAction extends BaseRestHandler {
}
charFilters.add(parser.text());
}
analyzeRequest.tokenFilters(charFilters.toArray(new String[0]));
analyzeRequest.tokenFilters(charFilters.toArray(Strings.EMPTY_ARRAY));
} else {
throw new IllegalArgumentException("Unknown parameter [" + currentFieldName + "] in request body or parameter is of the wrong type[" + token + "] ");
}

View File

@ -158,18 +158,7 @@ public class AnalyzeActionTests extends ElasticsearchIntegrationTest {
ensureGreen();
client().admin().indices().preparePutMapping("test")
.setType("document").setSource(
"{\n" +
" \"document\":{\n" +
" \"properties\":{\n" +
" \"simple\":{\n" +
" \"type\":\"string\",\n" +
" \"analyzer\": \"simple\"\n" +
" }\n" +
" }\n" +
" }\n" +
"}"
).get();
.setType("document").setSource("simple", "type=string,analyzer=simple").get();
for (int i = 0; i < 10; i++) {
final AnalyzeRequestBuilder requestBuilder = client().admin().indices().prepareAnalyze("THIS IS A TEST");
@ -220,7 +209,8 @@ public class AnalyzeActionTests extends ElasticsearchIntegrationTest {
RestAnalyzeAction.buildFromContent(content, analyzeRequest);
assertThat(analyzeRequest.text(), equalTo("THIS IS A TEST"));
assertThat(analyzeRequest.text().length, equalTo(1));
assertThat(analyzeRequest.text(), equalTo(new String[]{"THIS IS A TEST"}));
assertThat(analyzeRequest.tokenizer(), equalTo("keyword"));
assertThat(analyzeRequest.tokenFilters(), equalTo(new String[]{"lowercase"}));
}
@ -239,7 +229,6 @@ public class AnalyzeActionTests extends ElasticsearchIntegrationTest {
}
}
@Test
public void testParseXContentForAnalyzeRequestWithUnknownParamThrowsException() throws Exception {
AnalyzeRequest analyzeRequest = new AnalyzeRequest("for test");
@ -258,4 +247,35 @@ public class AnalyzeActionTests extends ElasticsearchIntegrationTest {
}
}
@Test
public void analyzerWithMultiValues() throws Exception {
assertAcked(prepareCreate("test").addAlias(new Alias("alias")));
ensureGreen();
client().admin().indices().preparePutMapping("test")
.setType("document").setSource("simple", "type=string,analyzer=simple,position_offset_gap=100").get();
String[] texts = new String[]{"THIS IS A TEST", "THE SECOND TEXT"};
final AnalyzeRequestBuilder requestBuilder = client().admin().indices().prepareAnalyze();
requestBuilder.setText(texts);
requestBuilder.setIndex(indexOrAlias());
requestBuilder.setField("simple");
AnalyzeResponse analyzeResponse = requestBuilder.get();
assertThat(analyzeResponse.getTokens().size(), equalTo(7));
AnalyzeResponse.AnalyzeToken token = analyzeResponse.getTokens().get(3);
assertThat(token.getTerm(), equalTo("test"));
assertThat(token.getPosition(), equalTo(3));
assertThat(token.getStartOffset(), equalTo(10));
assertThat(token.getEndOffset(), equalTo(14));
token = analyzeResponse.getTokens().get(5);
assertThat(token.getTerm(), equalTo("second"));
assertThat(token.getPosition(), equalTo(105));
assertThat(token.getStartOffset(), equalTo(19));
assertThat(token.getEndOffset(), equalTo(25));
}
}