From d08e9c7080d955409162b3f078e500d83dc773f5 Mon Sep 17 00:00:00 2001 From: David Pilato Date: Mon, 27 Oct 2014 22:01:22 +0100 Subject: [PATCH] Test: add a standalone tool which process content This tool is a simple main class which can be used to test what is extracted from a given binary file or from its base64 equivalent. You can give as first argument the BASE64 content Available options: -u file:/URL/TO/YOUR/DOC (in place of BASE64 content) -s set extracted size (default to mapper attachment size) Examples: ``` StandaloneTest BASE64Text StandaloneTest BASE64Text -s 1000000 StandaloneTest -u /tmp/mydoc.pdf StandaloneTest -u /tmp/mydoc.pdf -s 1000000 ``` Closes #89. --- .../attachments/test/StandaloneTest.java | 178 ++++++++++++++++++ 1 file changed, 178 insertions(+) create mode 100644 src/test/java/org/elasticsearch/plugin/mapper/attachments/test/StandaloneTest.java diff --git a/src/test/java/org/elasticsearch/plugin/mapper/attachments/test/StandaloneTest.java b/src/test/java/org/elasticsearch/plugin/mapper/attachments/test/StandaloneTest.java new file mode 100644 index 00000000000..1197afafcb3 --- /dev/null +++ b/src/test/java/org/elasticsearch/plugin/mapper/attachments/test/StandaloneTest.java @@ -0,0 +1,178 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.plugin.mapper.attachments.test; + +import org.elasticsearch.common.bytes.BytesReference; +import org.elasticsearch.common.cli.CliTool; +import org.elasticsearch.common.cli.CliToolConfig; +import org.elasticsearch.common.cli.Terminal; +import org.elasticsearch.common.cli.commons.CommandLine; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.xcontent.XContentBuilder; +import org.elasticsearch.env.Environment; +import org.elasticsearch.index.mapper.DocumentMapper; +import org.elasticsearch.index.mapper.DocumentMapperParser; +import org.elasticsearch.index.mapper.ParseContext; +import org.elasticsearch.index.mapper.attachment.AttachmentMapper; +import org.elasticsearch.index.mapper.xcontent.MapperTestUtils; + +import java.io.File; +import java.io.IOException; +import java.net.URL; +import java.util.Locale; + +import static org.elasticsearch.common.cli.CliToolConfig.Builder.cmd; +import static org.elasticsearch.common.cli.CliToolConfig.Builder.option; +import static org.elasticsearch.common.io.Streams.copyToByteArray; +import static org.elasticsearch.common.io.Streams.copyToStringFromClasspath; +import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; + +/** + * This class provides a simple main class which can be used to test what is extracted from a given binary file. + * You can run it using + * -u file://URL/TO/YOUR/DOC + * -s set extracted size (default to mapper attachment size) + * BASE64 encoded binary + * + * Example: + * StandaloneTest BASE64Text + * StandaloneTest -u /tmp/mydoc.pdf + * StandaloneTest -u /tmp/mydoc.pdf -s 1000000 + */ +public class StandaloneTest extends CliTool { + + private static final CliToolConfig CONFIG = CliToolConfig.config("tika", StandaloneTest.class) + .cmds(TikaTest.CMD) + .build(); + + static class TikaTest extends Command { + private static final String NAME = "tika"; + private final String url; + private final Integer size; + private final String base64text; + private final DocumentMapper docMapper; + + private static final CliToolConfig.Cmd CMD = cmd(NAME, TikaTest.class) + .options(option("u", "url").required(false).hasArg(false)) + .options(option("s", "size").required(false).hasArg(false)) + .build(); + + protected TikaTest(Terminal terminal, String url, Integer size, String base64text) throws IOException { + super(terminal); + this.size = size; + this.url = url; + this.base64text = base64text; + DocumentMapperParser mapperParser = MapperTestUtils.newMapperParser(); + mapperParser.putTypeParser(AttachmentMapper.CONTENT_TYPE, new AttachmentMapper.TypeParser()); + + String mapping = copyToStringFromClasspath("/org/elasticsearch/index/mapper/xcontent/test-mapping.json"); + docMapper = mapperParser.parse(mapping); + } + + @Override + public ExitStatus execute(Settings settings, Environment env) throws Exception { + XContentBuilder builder = jsonBuilder().startObject().field("_id", 1).field("file").startObject(); + + if (base64text != null) { + // If base64 is provided + builder.field("_content", base64text); + } else { + // A file is provided + File file = new File(new URL(url).getFile()); + boolean exists = file.exists(); + if (!exists) { + return ExitStatus.IO_ERROR; + } + + byte[] bytes = copyToByteArray(file); + builder.field("_content", bytes); + } + + if (size >= 0) { + builder.field("_indexed_chars", 10); + } + + BytesReference json = builder.endObject().endObject().bytes(); + + ParseContext.Document doc = docMapper.parse(json).rootDoc(); + + terminal.println("## Extracted text"); + terminal.println("--------------------- BEGIN -----------------------"); + terminal.println(doc.get(docMapper.mappers().smartName("file").mapper().names().indexName())); + terminal.println("---------------------- END ------------------------"); + terminal.println("## Metadata"); + printMetadataContent(doc, AttachmentMapper.FieldNames.AUTHOR); + printMetadataContent(doc, AttachmentMapper.FieldNames.CONTENT_LENGTH); + printMetadataContent(doc, AttachmentMapper.FieldNames.CONTENT_TYPE); + printMetadataContent(doc, AttachmentMapper.FieldNames.DATE); + printMetadataContent(doc, AttachmentMapper.FieldNames.KEYWORDS); + printMetadataContent(doc, AttachmentMapper.FieldNames.LANGUAGE); + printMetadataContent(doc, AttachmentMapper.FieldNames.NAME); + printMetadataContent(doc, AttachmentMapper.FieldNames.TITLE); + + return ExitStatus.OK; + } + + private void printMetadataContent(ParseContext.Document doc, String field) { + terminal.println("- %s: %s", field, doc.get(docMapper.mappers().smartName("file." + field).mapper().names().indexName())); + } + + public static Command parse(Terminal terminal, CommandLine cli) throws IOException { + String url = cli.getOptionValue("u"); + String base64text = null; + String sSize = cli.getOptionValue("s"); + Integer size = sSize != null ? Integer.parseInt(sSize) : -1; + if (url == null && cli.getArgs().length == 0) { + return exitCmd(ExitStatus.USAGE, terminal, "url or BASE64 content should be provided (type -h for help)"); + } + if (url == null) { + if (cli.getArgs().length == 0) { + return exitCmd(ExitStatus.USAGE, terminal, "url or BASE64 content should be provided (type -h for help)"); + } + base64text = cli.getArgs()[0]; + } else { + if (cli.getArgs().length == 1) { + return exitCmd(ExitStatus.USAGE, terminal, "url or BASE64 content should be provided. Not both. (type -h for help)"); + } + } + return new TikaTest(terminal, url, size, base64text); + } + } + + public StandaloneTest() { + super(CONFIG); + } + + + public static void main(String[] args) { + StandaloneTest pluginManager = new StandaloneTest(); + pluginManager.execute(args); + } + + @Override + protected Command parse(String cmdName, CommandLine cli) throws Exception { + switch (cmdName.toLowerCase(Locale.ROOT)) { + case TikaTest.NAME: return TikaTest.parse(terminal, cli); + default: + assert false : "can't get here as cmd name is validated before this method is called"; + return exitCmd(ExitStatus.CODE_ERROR); + } + } +}