SOLR-284: random cleanups, tests, interface changes

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@793953 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Yonik Seeley 2009-07-14 15:53:05 +00:00
parent 8bb8d6561c
commit 98b4dd2e93
8 changed files with 247 additions and 241 deletions

View File

@ -115,7 +115,7 @@ New Features
can be specified.
(Georgios Stamatis, Lars Kotthoff, Chris Harris via koji)
20. SOLR-284: Added support for extracting content from binary documents like MS Word and PDF using Apache Tika. See also contrib/extraction/CHANGES.txt (Eric Pugh, Chris Harris, gsingers)
20. SOLR-284: Added support for extracting content from binary documents like MS Word and PDF using Apache Tika. See also contrib/extraction/CHANGES.txt (Eric Pugh, Chris Harris, yonik, gsingers)
21. SOLR-819: Added factories for Arabic support (gsingers)

View File

@ -36,9 +36,11 @@ import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.sax.xpath.Matcher;
import org.apache.tika.sax.xpath.MatchingContentHandler;
import org.apache.tika.sax.xpath.XPathParser;
import org.apache.tika.exception.TikaException;
import org.apache.xml.serialize.OutputFormat;
import org.apache.xml.serialize.XMLSerializer;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import java.io.IOException;
import java.io.InputStream;
@ -187,10 +189,10 @@ public class ExtractingDocumentLoader extends ContentStreamLoader {
}
rsp.add(stream.getName() + "_metadata", metadataNL);
}
} catch (Exception e) {
//TODO: handle here with an option to not fail and just log the exception
} catch (SAXException e) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
} catch (TikaException e) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
} finally {
IOUtils.closeQuietly(inputStream);
}

View File

@ -23,7 +23,11 @@ package org.apache.solr.handler.extraction;
**/
public interface ExtractingParams {
public static final String EXTRACTING_PREFIX = "ext.";
/**
* Map all generated attribute names to field names with lowercase and underscores.
*/
public static final String LOWERNAMES = "lowernames";
/**
* The param prefix for mapping Tika metadata to Solr fields.
@ -35,7 +39,7 @@ public interface ExtractingParams {
*
*
*/
public static final String MAP_PREFIX = EXTRACTING_PREFIX + "map.";
public static final String MAP_PREFIX = "map.";
/**
* The boost value for the name of the field. The boost can be specified by a name mapping.
@ -48,7 +52,7 @@ public interface ExtractingParams {
* will boost the solr.title field for this document by 2.5
*
*/
public static final String BOOST_PREFIX = EXTRACTING_PREFIX + "boost.";
public static final String BOOST_PREFIX = "boost.";
/**
* Pass in literal values to be added to the document, as in
@ -57,7 +61,7 @@ public interface ExtractingParams {
* </pre>
*
*/
public static final String LITERALS_PREFIX = EXTRACTING_PREFIX + "literal.";
public static final String LITERALS_PREFIX = "literal.";
/**
@ -67,34 +71,21 @@ public interface ExtractingParams {
* <p/>
* See Tika's docs for what the extracted document looks like.
* <p/>
* @see #DEFAULT_FIELDNAME
* @see #CAPTURE_FIELDS
* @see #CAPTURE_ELEMENTS
*/
public static final String XPATH_EXPRESSION = EXTRACTING_PREFIX + "xpath";
public static final String XPATH_EXPRESSION = "xpath";
/**
* Only extract and return the document, do not index it.
* Only extract and return the content, do not index it.
*/
public static final String EXTRACT_ONLY = EXTRACTING_PREFIX + "extract.only";
public static final String EXTRACT_ONLY = "extractOnly";
/**
* Don't throw an exception if a field doesn't exist, just ignore it
* Capture attributes separately according to the name of the element, instead of just adding them to the string buffer
*/
public static final String IGNORE_UNDECLARED_FIELDS = EXTRACTING_PREFIX + "ignore.und.fl";
public static final String CAPTURE_ATTRIBUTES = "captureAttr";
/**
* Index attributes separately according to their name, instead of just adding them to the string buffer
*/
public static final String INDEX_ATTRIBUTES = EXTRACTING_PREFIX + "idx.attr";
/**
* The field to index the contents to by default. If you want to capture a specific piece
* of the Tika document separately, see {@link #CAPTURE_FIELDS}.
*
* @see #CAPTURE_FIELDS
*/
public static final String DEFAULT_FIELDNAME = EXTRACTING_PREFIX + "def.fl";
/**
* Capture the specified fields (and everything included below it that isn't capture by some other capture field) separately from the default. This is different
@ -116,26 +107,25 @@ public interface ExtractingParams {
* By passing in the p tag, you could capture all P tags separately from the rest of the text.
* Thus, in the example, the capture of the P tag would be: "some text here. more text"
*
* @see #DEFAULT_FIELDNAME
*/
public static final String CAPTURE_FIELDS = EXTRACTING_PREFIX + "capture";
public static final String CAPTURE_ELEMENTS = "capture";
/**
* The type of the stream. If not specified, Tika will use mime type detection.
*/
public static final String STREAM_TYPE = EXTRACTING_PREFIX + "stream.type";
public static final String STREAM_TYPE = "stream.type";
/**
* Optional. The file name. If specified, Tika can take this into account while
* guessing the MIME type.
*/
public static final String RESOURCE_NAME = EXTRACTING_PREFIX + "resource.name";
public static final String RESOURCE_NAME = "resource.name";
/**
* Optional. If specified, the prefix will be prepended to all Metadata, such that it would be possible
* to setup a dynamic field to automatically capture it
*/
public static final String METADATA_PREFIX = EXTRACTING_PREFIX + "metadata.prefix";
public static final String UNKNOWN_FIELD_PREFIX = "uprefix";
}

View File

@ -19,16 +19,11 @@ package org.apache.solr.handler.extraction;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.DateUtil;
import org.apache.solr.schema.DateField;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.schema.StrField;
import org.apache.solr.schema.TextField;
import org.apache.solr.schema.FieldType;
import org.apache.solr.schema.UUIDField;
import org.apache.tika.metadata.Metadata;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -37,14 +32,7 @@ import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import java.text.DateFormat;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Stack;
import java.util.UUID;
import java.util.*;
/**
@ -60,29 +48,22 @@ import java.util.UUID;
*/
public class SolrContentHandler extends DefaultHandler implements ExtractingParams {
private transient static Logger log = LoggerFactory.getLogger(SolrContentHandler.class);
protected SolrInputDocument document;
private SolrInputDocument document;
protected Collection<String> dateFormats = DateUtil.DEFAULT_DATE_FORMATS;
private Collection<String> dateFormats = DateUtil.DEFAULT_DATE_FORMATS;
protected Metadata metadata;
protected SolrParams params;
protected StringBuilder catchAllBuilder = new StringBuilder(2048);
//private StringBuilder currentBuilder;
protected IndexSchema schema;
//create empty so we don't have to worry about null checks
protected Map<String, StringBuilder> fieldBuilders = Collections.emptyMap();
protected Stack<StringBuilder> bldrStack = new Stack<StringBuilder>();
private Metadata metadata;
private SolrParams params;
private StringBuilder catchAllBuilder = new StringBuilder(2048);
private IndexSchema schema;
private Map<String, StringBuilder> fieldBuilders = Collections.emptyMap();
private LinkedList<StringBuilder> bldrStack = new LinkedList<StringBuilder>();
protected boolean ignoreUndeclaredFields = false;
protected boolean indexAttribs = false;
protected String defaultFieldName;
private boolean captureAttribs;
private boolean lowerNames;
private String contentFieldName = "content";
protected String metadataPrefix = "";
/**
* Only access through getNextId();
*/
private static long identifier = Long.MIN_VALUE;
private String unknownFieldPrefix = "";
public SolrContentHandler(Metadata metadata, SolrParams params, IndexSchema schema) {
@ -97,22 +78,18 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
this.params = params;
this.schema = schema;
this.dateFormats = dateFormats;
this.ignoreUndeclaredFields = params.getBool(IGNORE_UNDECLARED_FIELDS, false);
this.indexAttribs = params.getBool(INDEX_ATTRIBUTES, false);
this.defaultFieldName = params.get(DEFAULT_FIELDNAME);
this.metadataPrefix = params.get(METADATA_PREFIX, "");
//if there's no default field and we are intending to index, then throw an exception
if (defaultFieldName == null && params.getBool(EXTRACT_ONLY, false) == false) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "No default field name specified");
}
String[] captureFields = params.getParams(CAPTURE_FIELDS);
this.lowerNames = params.getBool(LOWERNAMES, false);
this.captureAttribs = params.getBool(CAPTURE_ATTRIBUTES, false);
this.unknownFieldPrefix = params.get(UNKNOWN_FIELD_PREFIX, "");
String[] captureFields = params.getParams(CAPTURE_ELEMENTS);
if (captureFields != null && captureFields.length > 0) {
fieldBuilders = new HashMap<String, StringBuilder>();
for (int i = 0; i < captureFields.length; i++) {
fieldBuilders.put(captureFields[i], new StringBuilder());
}
}
bldrStack.push(catchAllBuilder);
bldrStack.add(catchAllBuilder);
}
@ -128,73 +105,27 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
//handle the metadata extracted from the document
for (String name : metadata.names()) {
String[] vals = metadata.getValues(name);
name = findMappedMetadataName(name);
SchemaField schFld = schema.getFieldOrNull(name);
if (schFld != null) {
boost = getBoost(name);
if (schFld.multiValued()) {
for (int i = 0; i < vals.length; i++) {
String val = vals[i];
document.addField(name, transformValue(val, schFld), boost);
}
} else {
StringBuilder builder = new StringBuilder();
for (int i = 0; i < vals.length; i++) {
builder.append(vals[i]).append(' ');
}
document.addField(name, transformValue(builder.toString().trim(), schFld), boost);
}
} else {
//TODO: error or log?
if (ignoreUndeclaredFields == false) {
// Arguably we should handle this as a special case. Why? Because unlike basically
// all the other fields in metadata, this one was probably set not by Tika by in
// ExtractingDocumentLoader.load(). You shouldn't have to define a mapping for this
// field just because you specified a resource.name parameter to the handler, should
// you?
if (name != Metadata.RESOURCE_NAME_KEY) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Invalid field: " + name);
}
}
}
addField(name, null, vals);
}
//handle the literals from the params
Iterator<String> paramNames = params.getParameterNamesIterator();
while (paramNames.hasNext()) {
String name = paramNames.next();
if (name.startsWith(LITERALS_PREFIX)) {
String fieldName = name.substring(LITERALS_PREFIX.length());
//no need to map names here, since they are literals from the user
SchemaField schFld = schema.getFieldOrNull(fieldName);
if (schFld != null) {
String[] values = params.getParams(name);
if (schFld.multiValued() == false && values.length > 1) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "The Field " + fieldName + " is not multivalued");
}
boost = getBoost(fieldName);
for (int i = 0; i < values.length; i++) {
//no need to transform here, b/c we can assume the user sent it in correctly
document.addField(fieldName, values[i], boost);
String pname = paramNames.next();
if (!pname.startsWith(LITERALS_PREFIX)) continue;
String name = pname.substring(LITERALS_PREFIX.length());
addField(name, null, params.getParams(pname));
}
} else {
handleUndeclaredField(fieldName);
}
}
}
//add in the content
document.addField(defaultFieldName, catchAllBuilder.toString(), getBoost(defaultFieldName));
addField(contentFieldName, catchAllBuilder.toString(), null);
//add in the captured content
for (Map.Entry<String, StringBuilder> entry : fieldBuilders.entrySet()) {
if (entry.getValue().length() > 0) {
String fieldName = findMappedName(entry.getKey());
SchemaField schFld = schema.getFieldOrNull(fieldName);
if (schFld != null) {
document.addField(fieldName, transformValue(entry.getValue().toString(), schFld), getBoost(fieldName));
} else {
handleUndeclaredField(fieldName);
}
addField(entry.getKey(), entry.getValue().toString(), null);
}
}
if (log.isDebugEnabled()) {
@ -203,6 +134,75 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
return document;
}
// Naming rules:
// 1) optionally map names to nicenames (lowercase+underscores)
// 2) execute "map" commands
// 3) if resulting field is unknown, map it to a common prefix
private void addField(String fname, String fval, String[] vals) {
if (lowerNames) {
StringBuilder sb = new StringBuilder();
for (int i=0; i<fname.length(); i++) {
char ch = fname.charAt(i);
if (!Character.isLetterOrDigit(ch)) ch='_';
else ch=Character.toLowerCase(ch);
sb.append(ch);
}
fname = sb.toString();
}
String name = findMappedName(fname);
SchemaField sf = schema.getFieldOrNull(name);
if (sf==null && unknownFieldPrefix.length() > 0) {
name = unknownFieldPrefix + name;
sf = schema.getFieldOrNull(name);
}
// Arguably we should handle this as a special case. Why? Because unlike basically
// all the other fields in metadata, this one was probably set not by Tika by in
// ExtractingDocumentLoader.load(). You shouldn't have to define a mapping for this
// field just because you specified a resource.name parameter to the handler, should
// you?
if (sf == null && unknownFieldPrefix.length()==0 && name == Metadata.RESOURCE_NAME_KEY) {
return;
}
// normalize val params so vals.length>1
if (vals != null && vals.length==1) {
fval = vals[0];
vals = null;
}
// single valued field with multiple values... catenate them.
if (sf != null && !sf.multiValued() && vals != null) {
StringBuilder builder = new StringBuilder();
boolean first=true;
for (String val : vals) {
if (first) {
first=false;
} else {
builder.append(' ');
}
builder.append(val);
}
fval = builder.toString();
vals=null;
}
float boost = getBoost(name);
if (fval != null) {
document.addField(name, transformValue(fval, sf), boost);
}
if (vals != null) {
for (String val : vals) {
document.addField(name, transformValue(val, sf), boost);
}
}
// no value set - throw exception for debugging
// if (vals==null && fval==null) throw new RuntimeException(name + " has no non-null value ");
}
@Override
@ -213,7 +213,7 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
builder.setLength(0);
}
bldrStack.clear();
bldrStack.push(catchAllBuilder);
bldrStack.add(catchAllBuilder);
}
@ -222,33 +222,18 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
StringBuilder theBldr = fieldBuilders.get(localName);
if (theBldr != null) {
//we need to switch the currentBuilder
bldrStack.push(theBldr);
bldrStack.add(theBldr);
}
if (indexAttribs == true) {
if (captureAttribs == true) {
for (int i = 0; i < attributes.getLength(); i++) {
String fieldName = findMappedName(localName);
SchemaField schFld = schema.getFieldOrNull(fieldName);
if (schFld != null) {
document.addField(fieldName, transformValue(attributes.getValue(i), schFld), getBoost(fieldName));
} else {
handleUndeclaredField(fieldName);
}
addField(localName, attributes.getValue(i), null);
}
} else {
for (int i = 0; i < attributes.getLength(); i++) {
bldrStack.peek().append(attributes.getValue(i)).append(' ');
}
}
}
protected void handleUndeclaredField(String fieldName) {
if (ignoreUndeclaredFields == false) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Invalid field: " + fieldName);
} else {
if (log.isInfoEnabled()) {
log.info("Ignoring Field: " + fieldName);
bldrStack.getLast().append(attributes.getValue(i)).append(' ');
}
}
bldrStack.getLast().append(' ');
}
@Override
@ -256,17 +241,16 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
StringBuilder theBldr = fieldBuilders.get(localName);
if (theBldr != null) {
//pop the stack
bldrStack.pop();
bldrStack.removeLast();
assert (bldrStack.size() >= 1);
}
bldrStack.getLast().append(' ');
}
@Override
public void characters(char[] chars, int offset, int length) throws SAXException {
bldrStack.peek().append(chars, offset, length);
bldrStack.getLast().append(chars, offset, length);
}
@ -281,7 +265,7 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
*/
protected String transformValue(String val, SchemaField schFld) {
String result = val;
if (schFld.getType() instanceof DateField) {
if (schFld != null && schFld.getType() instanceof DateField) {
//try to transform the date
try {
Date date = DateUtil.parseDate(val, dateFormats);
@ -289,8 +273,8 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
result = df.format(date);
} catch (Exception e) {
//TODO: error or log?
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Invalid value: " + val + " for field: " + schFld, e);
// Let the specific fieldType handle errors
// throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Invalid value: " + val + " for field: " + schFld, e);
}
}
return result;
@ -317,20 +301,4 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
return params.get(MAP_PREFIX + name, name);
}
/**
* Get the name mapping for the metadata field. Prepends metadataPrefix onto the returned result.
*
* @param name The name to check to see if there is a mapping
* @return The new name, else <code>name</code>
*/
protected String findMappedMetadataName(String name) {
return metadataPrefix + params.get(MAP_PREFIX + name, name);
}
protected synchronized long getNextId() {
return identifier++;
}
}

View File

@ -50,36 +50,80 @@ public class ExtractingRequestHandlerTest extends AbstractSolrTestCase {
public void testExtraction() throws Exception {
ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
assertTrue("handler is null and it shouldn't be", handler != null);
loadLocal("solr-word.pdf", "ext.map.created", "extractedDate", "ext.map.producer", "extractedProducer",
"ext.map.creator", "extractedCreator", "ext.map.Keywords", "extractedKeywords",
"ext.map.Author", "extractedAuthor",
"ext.def.fl", "extractedContent",
"ext.literal.id", "one",
"ext.map.Last-Modified", "extractedDate"
loadLocal("solr-word.pdf", "map.created", "extractedDate", "map.producer", "extractedProducer",
"map.creator", "extractedCreator", "map.Keywords", "extractedKeywords",
"map.Author", "extractedAuthor",
"map.content", "extractedContent",
"literal.id", "one",
"map.Last-Modified", "extractedDate"
);
assertQ(req("title:solr-word"), "//*[@numFound='0']");
assertU(commit());
assertQ(req("title:solr-word"), "//*[@numFound='1']");
loadLocal("simple.html", "ext.map.created", "extractedDate", "ext.map.producer", "extractedProducer",
"ext.map.creator", "extractedCreator", "ext.map.Keywords", "extractedKeywords",
"ext.map.Author", "extractedAuthor",
"ext.map.language", "extractedLanguage",
"ext.literal.id", "two",
"ext.def.fl", "extractedContent",
"ext.map.Last-Modified", "extractedDate"
loadLocal("simple.html", "map.created", "extractedDate", "map.producer", "extractedProducer",
"map.creator", "extractedCreator", "map.Keywords", "extractedKeywords",
"map.Author", "extractedAuthor",
"map.language", "extractedLanguage",
"literal.id", "two",
"map.content", "extractedContent",
"map.Last-Modified", "extractedDate"
);
assertQ(req("title:Welcome"), "//*[@numFound='0']");
assertU(commit());
assertQ(req("title:Welcome"), "//*[@numFound='1']");
loadLocal("version_control.xml", "ext.map.created", "extractedDate", "ext.map.producer", "extractedProducer",
"ext.map.creator", "extractedCreator", "ext.map.Keywords", "extractedKeywords",
"ext.map.Author", "extractedAuthor",
"ext.literal.id", "three",
"ext.def.fl", "extractedContent",
"ext.map.language", "extractedLanguage",
"ext.map.Last-Modified", "extractedDate"
loadLocal("simple.html",
"literal.id","simple2",
"uprefix", "t_",
"lowernames", "true",
"captureAttr", "true", "map.a","t_href",
"map.content_language", "abcxyz", // test that lowernames is applied before mapping, and uprefix is applied after mapping
"commit", "true" // test immediate commit
);
// test that purposely causes a failure to print out the doc for test debugging
// assertQ(req("q","id:simple2","indent","true"), "//*[@numFound='0']");
// test both lowernames and unknown field mapping
assertQ(req("+id:simple2 +t_content_type:[* TO *]"), "//*[@numFound='1']");
assertQ(req("+id:simple2 +t_href:[* TO *]"), "//*[@numFound='1']");
assertQ(req("+id:simple2 +t_abcxyz:[* TO *]"), "//*[@numFound='1']");
// load again in the exact same way, but boost one field
loadLocal("simple.html",
"literal.id","simple3",
"uprefix", "t_",
"lowernames", "true",
"captureAttr", "true", "map.a","t_href",
"map.content_language", "abcxyz",
"commit", "true"
,"boost.t_href", "100.0"
);
assertQ(req("t_href:http"), "//*[@numFound='2']");
assertQ(req("t_href:http"), "//doc[1]/str[.='simple3']");
// test capture
loadLocal("simple.html",
"literal.id","simple4",
"uprefix", "t_",
"capture","p", // capture only what is in the title element
"commit", "true"
);
assertQ(req("+id:simple4 +t_content:Solr"), "//*[@numFound='1']");
assertQ(req("+id:simple4 +t_p:\"here is some text\""), "//*[@numFound='1']");
loadLocal("version_control.xml", "map.created", "extractedDate", "map.producer", "extractedProducer",
"map.creator", "extractedCreator", "map.Keywords", "extractedKeywords",
"map.Author", "extractedAuthor",
"literal.id", "three",
"map.content", "extractedContent",
"map.language", "extractedLanguage",
"map.Last-Modified", "extractedDate"
);
assertQ(req("stream_name:version_control.xml"), "//*[@numFound='0']");
assertU(commit());
@ -93,15 +137,15 @@ public class ExtractingRequestHandlerTest extends AbstractSolrTestCase {
ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
assertTrue("handler is null and it shouldn't be", handler != null);
//test literal
loadLocal("version_control.xml", "ext.map.created", "extractedDate", "ext.map.producer", "extractedProducer",
"ext.map.creator", "extractedCreator", "ext.map.Keywords", "extractedKeywords",
"ext.map.Author", "extractedAuthor",
"ext.def.fl", "extractedContent",
"ext.literal.id", "one",
"ext.map.language", "extractedLanguage",
"ext.literal.extractionLiteralMV", "one",
"ext.literal.extractionLiteralMV", "two",
"ext.map.Last-Modified", "extractedDate"
loadLocal("version_control.xml", "map.created", "extractedDate", "map.producer", "extractedProducer",
"map.creator", "extractedCreator", "map.Keywords", "extractedKeywords",
"map.Author", "extractedAuthor",
"map.content", "extractedContent",
"literal.id", "one",
"map.language", "extractedLanguage",
"literal.extractionLiteralMV", "one",
"literal.extractionLiteralMV", "two",
"map.Last-Modified", "extractedDate"
);
assertQ(req("stream_name:version_control.xml"), "//*[@numFound='0']");
@ -112,29 +156,30 @@ public class ExtractingRequestHandlerTest extends AbstractSolrTestCase {
assertQ(req("extractionLiteralMV:two"), "//*[@numFound='1']");
try {
loadLocal("version_control.xml", "ext.map.created", "extractedDate", "ext.map.producer", "extractedProducer",
"ext.map.creator", "extractedCreator", "ext.map.Keywords", "extractedKeywords",
"ext.map.Author", "extractedAuthor",
"ext.def.fl", "extractedContent",
"ext.literal.id", "two",
"ext.map.language", "extractedLanguage",
"ext.literal.extractionLiteral", "one",
"ext.literal.extractionLiteral", "two",
"ext.map.Last-Modified", "extractedDate"
loadLocal("version_control.xml", "map.created", "extractedDate", "map.producer", "extractedProducer",
"map.creator", "extractedCreator", "map.Keywords", "extractedKeywords",
"map.Author", "extractedAuthor",
"map.content", "extractedContent",
"literal.id", "two",
"map.language", "extractedLanguage",
"literal.extractionLiteral", "one",
"literal.extractionLiteral", "two",
"map.Last-Modified", "extractedDate"
);
assertTrue("Exception should have been thrown", false);
// TODO: original author did not specify why an exception should be thrown... how to fix?
// assertTrue("Exception should have been thrown", false);
} catch (SolrException e) {
//nothing to see here, move along
}
loadLocal("version_control.xml", "ext.map.created", "extractedDate", "ext.map.producer", "extractedProducer",
"ext.map.creator", "extractedCreator", "ext.map.Keywords", "extractedKeywords",
"ext.map.Author", "extractedAuthor",
"ext.def.fl", "extractedContent",
"ext.literal.id", "three",
"ext.map.language", "extractedLanguage",
"ext.literal.extractionLiteral", "one",
"ext.map.Last-Modified", "extractedDate"
loadLocal("version_control.xml", "map.created", "extractedDate", "map.producer", "extractedProducer",
"map.creator", "extractedCreator", "map.Keywords", "extractedKeywords",
"map.Author", "extractedAuthor",
"map.content", "extractedContent",
"literal.id", "three",
"map.language", "extractedLanguage",
"literal.extractionLiteral", "one",
"map.Last-Modified", "extractedDate"
);
assertU(commit());
assertQ(req("extractionLiteral:one"), "//*[@numFound='1']");
@ -147,12 +192,12 @@ public class ExtractingRequestHandlerTest extends AbstractSolrTestCase {
assertTrue("handler is null and it shouldn't be", handler != null);
// Load plain text specifying MIME type:
loadLocal("version_control.txt", "ext.map.created", "extractedDate", "ext.map.producer", "extractedProducer",
"ext.map.creator", "extractedCreator", "ext.map.Keywords", "extractedKeywords",
"ext.map.Author", "extractedAuthor",
"ext.literal.id", "one",
"ext.map.language", "extractedLanguage",
"ext.def.fl", "extractedContent",
loadLocal("version_control.txt", "map.created", "extractedDate", "map.producer", "extractedProducer",
"map.creator", "extractedCreator", "map.Keywords", "extractedKeywords",
"map.Author", "extractedAuthor",
"literal.id", "one",
"map.language", "extractedLanguage",
"map.content", "extractedContent",
ExtractingParams.STREAM_TYPE, "text/plain"
);
assertQ(req("extractedContent:Apache"), "//*[@numFound='0']");
@ -165,12 +210,12 @@ public class ExtractingRequestHandlerTest extends AbstractSolrTestCase {
assertTrue("handler is null and it shouldn't be", handler != null);
// Load plain text specifying filename
loadLocal("version_control.txt", "ext.map.created", "extractedDate", "ext.map.producer", "extractedProducer",
"ext.map.creator", "extractedCreator", "ext.map.Keywords", "extractedKeywords",
"ext.map.Author", "extractedAuthor",
"ext.literal.id", "one",
"ext.map.language", "extractedLanguage",
"ext.def.fl", "extractedContent",
loadLocal("version_control.txt", "map.created", "extractedDate", "map.producer", "extractedProducer",
"map.creator", "extractedCreator", "map.Keywords", "extractedKeywords",
"map.Author", "extractedAuthor",
"literal.id", "one",
"map.language", "extractedLanguage",
"map.content", "extractedContent",
ExtractingParams.RESOURCE_NAME, "version_control.txt"
);
assertQ(req("extractedContent:Apache"), "//*[@numFound='0']");

View File

@ -315,7 +315,7 @@
<fields>
<field name="id" type="integer" indexed="true" stored="true" multiValued="false" required="false"/>
<field name="id" type="string" indexed="true" stored="true" multiValued="false" required="false"/>
<field name="name" type="nametext" indexed="true" stored="true"/>
<field name="text" type="text" indexed="true" stored="false"/>
<field name="subject" type="text" indexed="true" stored="true"/>
@ -443,8 +443,8 @@
<dynamicField name="*aa" type="string" indexed="true" stored="true"/>
<dynamicField name="*aaa" type="integer" indexed="false" stored="true"/>
<!-- ignored becuase not stored or indexed -->
<dynamicField name="*_ignored" type="text" indexed="false" stored="false"/>
<!-- ignored because not stored or indexed -->
<dynamicField name="ignored_*" type="text" indexed="false" stored="false"/>
</fields>

View File

@ -401,6 +401,9 @@
<dynamicField name="*_d" type="sdouble" indexed="true" stored="true"/>
<dynamicField name="*_dt" type="date" indexed="true" stored="true"/>
<dynamicField name="ignored_*" type="ignored"/>
<dynamicField name="attr_*" type="text" indexed="true" stored="true" multiValued="true"/>
<dynamicField name="random*" type="random" />
<!-- uncomment the following to ignore any fields that don't already match an existing

View File

@ -640,14 +640,12 @@
</arr>
</requestHandler>
<!--
<requestHandler name="/update/extract" class="org.apache.solr.handler.extraction.ExtractingRequestHandler">
<requestHandler name="/update/extract" class="org.apache.solr.handler.extraction.ExtractingRequestHandler" startup="lazy">
<lst name="defaults">
<str name="ext.map.Last-Modified">last_modified</str>
<bool name="ext.ignore.und.fl">true</bool>
<str name="uprefix">ignored_</str>
<str name="map.content">text</str>
</lst>
</requestHandler>
-->
<!-- A component to return terms and document frequency of those terms.