Test: add a standalone tool which processes content

This tool is a simple main class which can be used to test what is extracted from a given binary file or from its base64 equivalent.

You can pass the BASE64 content as the first argument.

Available options:

 -u file:/URL/TO/YOUR/DOC (in place of BASE64 content)
 -s set extracted size (defaults to mapper attachment size)

Examples:

```
StandaloneTest BASE64Text
StandaloneTest BASE64Text -s 1000000
StandaloneTest -u file:///tmp/mydoc.pdf
StandaloneTest -u file:///tmp/mydoc.pdf -s 1000000
```

Closes #89.
This commit is contained in:
David Pilato 2014-10-27 22:01:22 +01:00
parent c3bf3b1ce9
commit d08e9c7080
1 changed files with 178 additions and 0 deletions

View File

@ -0,0 +1,178 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.plugin.mapper.attachments.test;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.cli.CliTool;
import org.elasticsearch.common.cli.CliToolConfig;
import org.elasticsearch.common.cli.Terminal;
import org.elasticsearch.common.cli.commons.CommandLine;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.xcontent.XContentBuilder;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.mapper.DocumentMapper;
import org.elasticsearch.index.mapper.DocumentMapperParser;
import org.elasticsearch.index.mapper.ParseContext;
import org.elasticsearch.index.mapper.attachment.AttachmentMapper;
import org.elasticsearch.index.mapper.xcontent.MapperTestUtils;
import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.util.Locale;
import static org.elasticsearch.common.cli.CliToolConfig.Builder.cmd;
import static org.elasticsearch.common.cli.CliToolConfig.Builder.option;
import static org.elasticsearch.common.io.Streams.copyToByteArray;
import static org.elasticsearch.common.io.Streams.copyToStringFromClasspath;
import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
/**
* This class provides a simple main class which can be used to test what is extracted from a given binary file.
* You can run it using
* -u file://URL/TO/YOUR/DOC
* -s set extracted size (default to mapper attachment size)
* BASE64 encoded binary
*
* Example:
* StandaloneTest BASE64Text
* StandaloneTest -u /tmp/mydoc.pdf
* StandaloneTest -u /tmp/mydoc.pdf -s 1000000
*/
public class StandaloneTest extends CliTool {
private static final CliToolConfig CONFIG = CliToolConfig.config("tika", StandaloneTest.class)
.cmds(TikaTest.CMD)
.build();
static class TikaTest extends Command {
private static final String NAME = "tika";
private final String url;
private final Integer size;
private final String base64text;
private final DocumentMapper docMapper;
private static final CliToolConfig.Cmd CMD = cmd(NAME, TikaTest.class)
.options(option("u", "url").required(false).hasArg(false))
.options(option("s", "size").required(false).hasArg(false))
.build();
protected TikaTest(Terminal terminal, String url, Integer size, String base64text) throws IOException {
super(terminal);
this.size = size;
this.url = url;
this.base64text = base64text;
DocumentMapperParser mapperParser = MapperTestUtils.newMapperParser();
mapperParser.putTypeParser(AttachmentMapper.CONTENT_TYPE, new AttachmentMapper.TypeParser());
String mapping = copyToStringFromClasspath("/org/elasticsearch/index/mapper/xcontent/test-mapping.json");
docMapper = mapperParser.parse(mapping);
}
@Override
public ExitStatus execute(Settings settings, Environment env) throws Exception {
XContentBuilder builder = jsonBuilder().startObject().field("_id", 1).field("file").startObject();
if (base64text != null) {
// If base64 is provided
builder.field("_content", base64text);
} else {
// A file is provided
File file = new File(new URL(url).getFile());
boolean exists = file.exists();
if (!exists) {
return ExitStatus.IO_ERROR;
}
byte[] bytes = copyToByteArray(file);
builder.field("_content", bytes);
}
if (size >= 0) {
builder.field("_indexed_chars", 10);
}
BytesReference json = builder.endObject().endObject().bytes();
ParseContext.Document doc = docMapper.parse(json).rootDoc();
terminal.println("## Extracted text");
terminal.println("--------------------- BEGIN -----------------------");
terminal.println(doc.get(docMapper.mappers().smartName("file").mapper().names().indexName()));
terminal.println("---------------------- END ------------------------");
terminal.println("## Metadata");
printMetadataContent(doc, AttachmentMapper.FieldNames.AUTHOR);
printMetadataContent(doc, AttachmentMapper.FieldNames.CONTENT_LENGTH);
printMetadataContent(doc, AttachmentMapper.FieldNames.CONTENT_TYPE);
printMetadataContent(doc, AttachmentMapper.FieldNames.DATE);
printMetadataContent(doc, AttachmentMapper.FieldNames.KEYWORDS);
printMetadataContent(doc, AttachmentMapper.FieldNames.LANGUAGE);
printMetadataContent(doc, AttachmentMapper.FieldNames.NAME);
printMetadataContent(doc, AttachmentMapper.FieldNames.TITLE);
return ExitStatus.OK;
}
private void printMetadataContent(ParseContext.Document doc, String field) {
terminal.println("- %s: %s", field, doc.get(docMapper.mappers().smartName("file." + field).mapper().names().indexName()));
}
public static Command parse(Terminal terminal, CommandLine cli) throws IOException {
String url = cli.getOptionValue("u");
String base64text = null;
String sSize = cli.getOptionValue("s");
Integer size = sSize != null ? Integer.parseInt(sSize) : -1;
if (url == null && cli.getArgs().length == 0) {
return exitCmd(ExitStatus.USAGE, terminal, "url or BASE64 content should be provided (type -h for help)");
}
if (url == null) {
if (cli.getArgs().length == 0) {
return exitCmd(ExitStatus.USAGE, terminal, "url or BASE64 content should be provided (type -h for help)");
}
base64text = cli.getArgs()[0];
} else {
if (cli.getArgs().length == 1) {
return exitCmd(ExitStatus.USAGE, terminal, "url or BASE64 content should be provided. Not both. (type -h for help)");
}
}
return new TikaTest(terminal, url, size, base64text);
}
}
public StandaloneTest() {
super(CONFIG);
}
public static void main(String[] args) {
StandaloneTest pluginManager = new StandaloneTest();
pluginManager.execute(args);
}
@Override
protected Command parse(String cmdName, CommandLine cli) throws Exception {
switch (cmdName.toLowerCase(Locale.ROOT)) {
case TikaTest.NAME: return TikaTest.parse(terminal, cli);
default:
assert false : "can't get here as cmd name is validated before this method is called";
return exitCmd(ExitStatus.CODE_ERROR);
}
}
}