mirror of https://github.com/apache/lucene.git
SOLR-284: random cleanups, tests, interface changes
git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@793953 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
8bb8d6561c
commit
98b4dd2e93
|
@ -115,7 +115,7 @@ New Features
|
|||
can be specified.
|
||||
(Georgios Stamatis, Lars Kotthoff, Chris Harris via koji)
|
||||
|
||||
20. SOLR-284: Added support for extracting content from binary documents like MS Word and PDF using Apache Tika. See also contrib/extraction/CHANGES.txt (Eric Pugh, Chris Harris, gsingers)
|
||||
20. SOLR-284: Added support for extracting content from binary documents like MS Word and PDF using Apache Tika. See also contrib/extraction/CHANGES.txt (Eric Pugh, Chris Harris, yonik, gsingers)
|
||||
|
||||
21. SOLR-819: Added factories for Arabic support (gsingers)
|
||||
|
||||
|
|
|
@ -36,9 +36,11 @@ import org.apache.tika.sax.XHTMLContentHandler;
|
|||
import org.apache.tika.sax.xpath.Matcher;
|
||||
import org.apache.tika.sax.xpath.MatchingContentHandler;
|
||||
import org.apache.tika.sax.xpath.XPathParser;
|
||||
import org.apache.tika.exception.TikaException;
|
||||
import org.apache.xml.serialize.OutputFormat;
|
||||
import org.apache.xml.serialize.XMLSerializer;
|
||||
import org.xml.sax.ContentHandler;
|
||||
import org.xml.sax.SAXException;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
|
@ -187,10 +189,10 @@ public class ExtractingDocumentLoader extends ContentStreamLoader {
|
|||
}
|
||||
rsp.add(stream.getName() + "_metadata", metadataNL);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
//TODO: handle here with an option to not fail and just log the exception
|
||||
} catch (SAXException e) {
|
||||
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
|
||||
} catch (TikaException e) {
|
||||
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
|
||||
|
||||
} finally {
|
||||
IOUtils.closeQuietly(inputStream);
|
||||
}
|
||||
|
|
|
@ -23,7 +23,11 @@ package org.apache.solr.handler.extraction;
|
|||
**/
|
||||
public interface ExtractingParams {
|
||||
|
||||
public static final String EXTRACTING_PREFIX = "ext.";
|
||||
/**
|
||||
* Map all generated attribute names to field names with lowercase and underscores.
|
||||
*/
|
||||
public static final String LOWERNAMES = "lowernames";
|
||||
|
||||
|
||||
/**
|
||||
* The param prefix for mapping Tika metadata to Solr fields.
|
||||
|
@ -35,7 +39,7 @@ public interface ExtractingParams {
|
|||
*
|
||||
*
|
||||
*/
|
||||
public static final String MAP_PREFIX = EXTRACTING_PREFIX + "map.";
|
||||
public static final String MAP_PREFIX = "map.";
|
||||
|
||||
/**
|
||||
* The boost value for the name of the field. The boost can be specified by a name mapping.
|
||||
|
@ -48,7 +52,7 @@ public interface ExtractingParams {
|
|||
* will boost the solr.title field for this document by 2.5
|
||||
*
|
||||
*/
|
||||
public static final String BOOST_PREFIX = EXTRACTING_PREFIX + "boost.";
|
||||
public static final String BOOST_PREFIX = "boost.";
|
||||
|
||||
/**
|
||||
* Pass in literal values to be added to the document, as in
|
||||
|
@ -57,7 +61,7 @@ public interface ExtractingParams {
|
|||
* </pre>
|
||||
*
|
||||
*/
|
||||
public static final String LITERALS_PREFIX = EXTRACTING_PREFIX + "literal.";
|
||||
public static final String LITERALS_PREFIX = "literal.";
|
||||
|
||||
|
||||
/**
|
||||
|
@ -67,34 +71,21 @@ public interface ExtractingParams {
|
|||
* <p/>
|
||||
* See Tika's docs for what the extracted document looks like.
|
||||
* <p/>
|
||||
* @see #DEFAULT_FIELDNAME
|
||||
* @see #CAPTURE_FIELDS
|
||||
* @see #CAPTURE_ELEMENTS
|
||||
*/
|
||||
public static final String XPATH_EXPRESSION = EXTRACTING_PREFIX + "xpath";
|
||||
public static final String XPATH_EXPRESSION = "xpath";
|
||||
|
||||
|
||||
/**
|
||||
* Only extract and return the document, do not index it.
|
||||
* Only extract and return the content, do not index it.
|
||||
*/
|
||||
public static final String EXTRACT_ONLY = EXTRACTING_PREFIX + "extract.only";
|
||||
public static final String EXTRACT_ONLY = "extractOnly";
|
||||
|
||||
/**
|
||||
* Don't throw an exception if a field doesn't exist, just ignore it
|
||||
* Capture attributes separately according to the name of the element, instead of just adding them to the string buffer
|
||||
*/
|
||||
public static final String IGNORE_UNDECLARED_FIELDS = EXTRACTING_PREFIX + "ignore.und.fl";
|
||||
public static final String CAPTURE_ATTRIBUTES = "captureAttr";
|
||||
|
||||
/**
|
||||
* Index attributes separately according to their name, instead of just adding them to the string buffer
|
||||
*/
|
||||
public static final String INDEX_ATTRIBUTES = EXTRACTING_PREFIX + "idx.attr";
|
||||
|
||||
/**
|
||||
* The field to index the contents to by default. If you want to capture a specific piece
|
||||
* of the Tika document separately, see {@link #CAPTURE_FIELDS}.
|
||||
*
|
||||
* @see #CAPTURE_FIELDS
|
||||
*/
|
||||
public static final String DEFAULT_FIELDNAME = EXTRACTING_PREFIX + "def.fl";
|
||||
|
||||
/**
|
||||
* Capture the specified fields (and everything included below it that isn't capture by some other capture field) separately from the default. This is different
|
||||
|
@ -116,26 +107,25 @@ public interface ExtractingParams {
|
|||
* By passing in the p tag, you could capture all P tags separately from the rest of the text.
|
||||
* Thus, in the example, the capture of the P tag would be: "some text here. more text"
|
||||
*
|
||||
* @see #DEFAULT_FIELDNAME
|
||||
*/
|
||||
public static final String CAPTURE_FIELDS = EXTRACTING_PREFIX + "capture";
|
||||
public static final String CAPTURE_ELEMENTS = "capture";
|
||||
|
||||
/**
|
||||
* The type of the stream. If not specified, Tika will use mime type detection.
|
||||
*/
|
||||
public static final String STREAM_TYPE = EXTRACTING_PREFIX + "stream.type";
|
||||
public static final String STREAM_TYPE = "stream.type";
|
||||
|
||||
|
||||
/**
|
||||
* Optional. The file name. If specified, Tika can take this into account while
|
||||
* guessing the MIME type.
|
||||
*/
|
||||
public static final String RESOURCE_NAME = EXTRACTING_PREFIX + "resource.name";
|
||||
public static final String RESOURCE_NAME = "resource.name";
|
||||
|
||||
|
||||
/**
|
||||
* Optional. If specified, the prefix will be prepended to all Metadata, such that it would be possible
|
||||
* to setup a dynamic field to automatically capture it
|
||||
*/
|
||||
public static final String METADATA_PREFIX = EXTRACTING_PREFIX + "metadata.prefix";
|
||||
public static final String UNKNOWN_FIELD_PREFIX = "uprefix";
|
||||
}
|
||||
|
|
|
@ -19,16 +19,11 @@ package org.apache.solr.handler.extraction;
|
|||
|
||||
import org.apache.solr.common.SolrException;
|
||||
import org.apache.solr.common.SolrInputDocument;
|
||||
import org.apache.solr.common.SolrInputField;
|
||||
import org.apache.solr.common.params.SolrParams;
|
||||
import org.apache.solr.common.util.DateUtil;
|
||||
import org.apache.solr.schema.DateField;
|
||||
import org.apache.solr.schema.IndexSchema;
|
||||
import org.apache.solr.schema.SchemaField;
|
||||
import org.apache.solr.schema.StrField;
|
||||
import org.apache.solr.schema.TextField;
|
||||
import org.apache.solr.schema.FieldType;
|
||||
import org.apache.solr.schema.UUIDField;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
@ -37,14 +32,7 @@ import org.xml.sax.SAXException;
|
|||
import org.xml.sax.helpers.DefaultHandler;
|
||||
|
||||
import java.text.DateFormat;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
import java.util.Stack;
|
||||
import java.util.UUID;
|
||||
import java.util.*;
|
||||
|
||||
|
||||
/**
|
||||
|
@ -60,29 +48,22 @@ import java.util.UUID;
|
|||
*/
|
||||
public class SolrContentHandler extends DefaultHandler implements ExtractingParams {
|
||||
private transient static Logger log = LoggerFactory.getLogger(SolrContentHandler.class);
|
||||
protected SolrInputDocument document;
|
||||
private SolrInputDocument document;
|
||||
|
||||
protected Collection<String> dateFormats = DateUtil.DEFAULT_DATE_FORMATS;
|
||||
private Collection<String> dateFormats = DateUtil.DEFAULT_DATE_FORMATS;
|
||||
|
||||
protected Metadata metadata;
|
||||
protected SolrParams params;
|
||||
protected StringBuilder catchAllBuilder = new StringBuilder(2048);
|
||||
//private StringBuilder currentBuilder;
|
||||
protected IndexSchema schema;
|
||||
//create empty so we don't have to worry about null checks
|
||||
protected Map<String, StringBuilder> fieldBuilders = Collections.emptyMap();
|
||||
protected Stack<StringBuilder> bldrStack = new Stack<StringBuilder>();
|
||||
private Metadata metadata;
|
||||
private SolrParams params;
|
||||
private StringBuilder catchAllBuilder = new StringBuilder(2048);
|
||||
private IndexSchema schema;
|
||||
private Map<String, StringBuilder> fieldBuilders = Collections.emptyMap();
|
||||
private LinkedList<StringBuilder> bldrStack = new LinkedList<StringBuilder>();
|
||||
|
||||
protected boolean ignoreUndeclaredFields = false;
|
||||
protected boolean indexAttribs = false;
|
||||
protected String defaultFieldName;
|
||||
private boolean captureAttribs;
|
||||
private boolean lowerNames;
|
||||
private String contentFieldName = "content";
|
||||
|
||||
protected String metadataPrefix = "";
|
||||
|
||||
/**
|
||||
* Only access through getNextId();
|
||||
*/
|
||||
private static long identifier = Long.MIN_VALUE;
|
||||
private String unknownFieldPrefix = "";
|
||||
|
||||
|
||||
public SolrContentHandler(Metadata metadata, SolrParams params, IndexSchema schema) {
|
||||
|
@ -97,22 +78,18 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
|
|||
this.params = params;
|
||||
this.schema = schema;
|
||||
this.dateFormats = dateFormats;
|
||||
this.ignoreUndeclaredFields = params.getBool(IGNORE_UNDECLARED_FIELDS, false);
|
||||
this.indexAttribs = params.getBool(INDEX_ATTRIBUTES, false);
|
||||
this.defaultFieldName = params.get(DEFAULT_FIELDNAME);
|
||||
this.metadataPrefix = params.get(METADATA_PREFIX, "");
|
||||
//if there's no default field and we are intending to index, then throw an exception
|
||||
if (defaultFieldName == null && params.getBool(EXTRACT_ONLY, false) == false) {
|
||||
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "No default field name specified");
|
||||
}
|
||||
String[] captureFields = params.getParams(CAPTURE_FIELDS);
|
||||
|
||||
this.lowerNames = params.getBool(LOWERNAMES, false);
|
||||
this.captureAttribs = params.getBool(CAPTURE_ATTRIBUTES, false);
|
||||
this.unknownFieldPrefix = params.get(UNKNOWN_FIELD_PREFIX, "");
|
||||
String[] captureFields = params.getParams(CAPTURE_ELEMENTS);
|
||||
if (captureFields != null && captureFields.length > 0) {
|
||||
fieldBuilders = new HashMap<String, StringBuilder>();
|
||||
for (int i = 0; i < captureFields.length; i++) {
|
||||
fieldBuilders.put(captureFields[i], new StringBuilder());
|
||||
}
|
||||
}
|
||||
bldrStack.push(catchAllBuilder);
|
||||
bldrStack.add(catchAllBuilder);
|
||||
}
|
||||
|
||||
|
||||
|
@ -128,73 +105,27 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
|
|||
//handle the metadata extracted from the document
|
||||
for (String name : metadata.names()) {
|
||||
String[] vals = metadata.getValues(name);
|
||||
name = findMappedMetadataName(name);
|
||||
SchemaField schFld = schema.getFieldOrNull(name);
|
||||
if (schFld != null) {
|
||||
boost = getBoost(name);
|
||||
if (schFld.multiValued()) {
|
||||
for (int i = 0; i < vals.length; i++) {
|
||||
String val = vals[i];
|
||||
document.addField(name, transformValue(val, schFld), boost);
|
||||
}
|
||||
} else {
|
||||
StringBuilder builder = new StringBuilder();
|
||||
for (int i = 0; i < vals.length; i++) {
|
||||
builder.append(vals[i]).append(' ');
|
||||
}
|
||||
document.addField(name, transformValue(builder.toString().trim(), schFld), boost);
|
||||
}
|
||||
} else {
|
||||
//TODO: error or log?
|
||||
if (ignoreUndeclaredFields == false) {
|
||||
// Arguably we should handle this as a special case. Why? Because unlike basically
|
||||
// all the other fields in metadata, this one was probably set not by Tika by in
|
||||
// ExtractingDocumentLoader.load(). You shouldn't have to define a mapping for this
|
||||
// field just because you specified a resource.name parameter to the handler, should
|
||||
// you?
|
||||
if (name != Metadata.RESOURCE_NAME_KEY) {
|
||||
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Invalid field: " + name);
|
||||
}
|
||||
}
|
||||
}
|
||||
addField(name, null, vals);
|
||||
}
|
||||
|
||||
//handle the literals from the params
|
||||
Iterator<String> paramNames = params.getParameterNamesIterator();
|
||||
while (paramNames.hasNext()) {
|
||||
String name = paramNames.next();
|
||||
if (name.startsWith(LITERALS_PREFIX)) {
|
||||
String fieldName = name.substring(LITERALS_PREFIX.length());
|
||||
//no need to map names here, since they are literals from the user
|
||||
SchemaField schFld = schema.getFieldOrNull(fieldName);
|
||||
if (schFld != null) {
|
||||
String[] values = params.getParams(name);
|
||||
if (schFld.multiValued() == false && values.length > 1) {
|
||||
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "The Field " + fieldName + " is not multivalued");
|
||||
}
|
||||
boost = getBoost(fieldName);
|
||||
for (int i = 0; i < values.length; i++) {
|
||||
//no need to transform here, b/c we can assume the user sent it in correctly
|
||||
document.addField(fieldName, values[i], boost);
|
||||
String pname = paramNames.next();
|
||||
if (!pname.startsWith(LITERALS_PREFIX)) continue;
|
||||
|
||||
String name = pname.substring(LITERALS_PREFIX.length());
|
||||
addField(name, null, params.getParams(pname));
|
||||
}
|
||||
} else {
|
||||
handleUndeclaredField(fieldName);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
//add in the content
|
||||
document.addField(defaultFieldName, catchAllBuilder.toString(), getBoost(defaultFieldName));
|
||||
addField(contentFieldName, catchAllBuilder.toString(), null);
|
||||
|
||||
//add in the captured content
|
||||
for (Map.Entry<String, StringBuilder> entry : fieldBuilders.entrySet()) {
|
||||
if (entry.getValue().length() > 0) {
|
||||
String fieldName = findMappedName(entry.getKey());
|
||||
SchemaField schFld = schema.getFieldOrNull(fieldName);
|
||||
if (schFld != null) {
|
||||
document.addField(fieldName, transformValue(entry.getValue().toString(), schFld), getBoost(fieldName));
|
||||
} else {
|
||||
handleUndeclaredField(fieldName);
|
||||
}
|
||||
addField(entry.getKey(), entry.getValue().toString(), null);
|
||||
}
|
||||
}
|
||||
if (log.isDebugEnabled()) {
|
||||
|
@ -203,6 +134,75 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
|
|||
return document;
|
||||
}
|
||||
|
||||
// Naming rules:
|
||||
// 1) optionally map names to nicenames (lowercase+underscores)
|
||||
// 2) execute "map" commands
|
||||
// 3) if resulting field is unknown, map it to a common prefix
|
||||
private void addField(String fname, String fval, String[] vals) {
|
||||
if (lowerNames) {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
for (int i=0; i<fname.length(); i++) {
|
||||
char ch = fname.charAt(i);
|
||||
if (!Character.isLetterOrDigit(ch)) ch='_';
|
||||
else ch=Character.toLowerCase(ch);
|
||||
sb.append(ch);
|
||||
}
|
||||
fname = sb.toString();
|
||||
}
|
||||
|
||||
String name = findMappedName(fname);
|
||||
SchemaField sf = schema.getFieldOrNull(name);
|
||||
if (sf==null && unknownFieldPrefix.length() > 0) {
|
||||
name = unknownFieldPrefix + name;
|
||||
sf = schema.getFieldOrNull(name);
|
||||
}
|
||||
|
||||
// Arguably we should handle this as a special case. Why? Because unlike basically
|
||||
// all the other fields in metadata, this one was probably set not by Tika by in
|
||||
// ExtractingDocumentLoader.load(). You shouldn't have to define a mapping for this
|
||||
// field just because you specified a resource.name parameter to the handler, should
|
||||
// you?
|
||||
if (sf == null && unknownFieldPrefix.length()==0 && name == Metadata.RESOURCE_NAME_KEY) {
|
||||
return;
|
||||
}
|
||||
|
||||
// normalize val params so vals.length>1
|
||||
if (vals != null && vals.length==1) {
|
||||
fval = vals[0];
|
||||
vals = null;
|
||||
}
|
||||
|
||||
// single valued field with multiple values... catenate them.
|
||||
if (sf != null && !sf.multiValued() && vals != null) {
|
||||
StringBuilder builder = new StringBuilder();
|
||||
boolean first=true;
|
||||
for (String val : vals) {
|
||||
if (first) {
|
||||
first=false;
|
||||
} else {
|
||||
builder.append(' ');
|
||||
}
|
||||
builder.append(val);
|
||||
}
|
||||
fval = builder.toString();
|
||||
vals=null;
|
||||
}
|
||||
|
||||
float boost = getBoost(name);
|
||||
|
||||
if (fval != null) {
|
||||
document.addField(name, transformValue(fval, sf), boost);
|
||||
}
|
||||
|
||||
if (vals != null) {
|
||||
for (String val : vals) {
|
||||
document.addField(name, transformValue(val, sf), boost);
|
||||
}
|
||||
}
|
||||
|
||||
// no value set - throw exception for debugging
|
||||
// if (vals==null && fval==null) throw new RuntimeException(name + " has no non-null value ");
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
|
@ -213,7 +213,7 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
|
|||
builder.setLength(0);
|
||||
}
|
||||
bldrStack.clear();
|
||||
bldrStack.push(catchAllBuilder);
|
||||
bldrStack.add(catchAllBuilder);
|
||||
}
|
||||
|
||||
|
||||
|
@ -222,33 +222,18 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
|
|||
StringBuilder theBldr = fieldBuilders.get(localName);
|
||||
if (theBldr != null) {
|
||||
//we need to switch the currentBuilder
|
||||
bldrStack.push(theBldr);
|
||||
bldrStack.add(theBldr);
|
||||
}
|
||||
if (indexAttribs == true) {
|
||||
if (captureAttribs == true) {
|
||||
for (int i = 0; i < attributes.getLength(); i++) {
|
||||
String fieldName = findMappedName(localName);
|
||||
SchemaField schFld = schema.getFieldOrNull(fieldName);
|
||||
if (schFld != null) {
|
||||
document.addField(fieldName, transformValue(attributes.getValue(i), schFld), getBoost(fieldName));
|
||||
} else {
|
||||
handleUndeclaredField(fieldName);
|
||||
}
|
||||
addField(localName, attributes.getValue(i), null);
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < attributes.getLength(); i++) {
|
||||
bldrStack.peek().append(attributes.getValue(i)).append(' ');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
protected void handleUndeclaredField(String fieldName) {
|
||||
if (ignoreUndeclaredFields == false) {
|
||||
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Invalid field: " + fieldName);
|
||||
} else {
|
||||
if (log.isInfoEnabled()) {
|
||||
log.info("Ignoring Field: " + fieldName);
|
||||
bldrStack.getLast().append(attributes.getValue(i)).append(' ');
|
||||
}
|
||||
}
|
||||
bldrStack.getLast().append(' ');
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -256,17 +241,16 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
|
|||
StringBuilder theBldr = fieldBuilders.get(localName);
|
||||
if (theBldr != null) {
|
||||
//pop the stack
|
||||
bldrStack.pop();
|
||||
bldrStack.removeLast();
|
||||
assert (bldrStack.size() >= 1);
|
||||
}
|
||||
|
||||
|
||||
bldrStack.getLast().append(' ');
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void characters(char[] chars, int offset, int length) throws SAXException {
|
||||
bldrStack.peek().append(chars, offset, length);
|
||||
bldrStack.getLast().append(chars, offset, length);
|
||||
}
|
||||
|
||||
|
||||
|
@ -281,7 +265,7 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
|
|||
*/
|
||||
protected String transformValue(String val, SchemaField schFld) {
|
||||
String result = val;
|
||||
if (schFld.getType() instanceof DateField) {
|
||||
if (schFld != null && schFld.getType() instanceof DateField) {
|
||||
//try to transform the date
|
||||
try {
|
||||
Date date = DateUtil.parseDate(val, dateFormats);
|
||||
|
@ -289,8 +273,8 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
|
|||
result = df.format(date);
|
||||
|
||||
} catch (Exception e) {
|
||||
//TODO: error or log?
|
||||
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Invalid value: " + val + " for field: " + schFld, e);
|
||||
// Let the specific fieldType handle errors
|
||||
// throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Invalid value: " + val + " for field: " + schFld, e);
|
||||
}
|
||||
}
|
||||
return result;
|
||||
|
@ -317,20 +301,4 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
|
|||
return params.get(MAP_PREFIX + name, name);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the name mapping for the metadata field. Prepends metadataPrefix onto the returned result.
|
||||
*
|
||||
* @param name The name to check to see if there is a mapping
|
||||
* @return The new name, else <code>name</code>
|
||||
*/
|
||||
protected String findMappedMetadataName(String name) {
|
||||
return metadataPrefix + params.get(MAP_PREFIX + name, name);
|
||||
}
|
||||
|
||||
|
||||
protected synchronized long getNextId() {
|
||||
return identifier++;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -50,36 +50,80 @@ public class ExtractingRequestHandlerTest extends AbstractSolrTestCase {
|
|||
public void testExtraction() throws Exception {
|
||||
ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
|
||||
assertTrue("handler is null and it shouldn't be", handler != null);
|
||||
loadLocal("solr-word.pdf", "ext.map.created", "extractedDate", "ext.map.producer", "extractedProducer",
|
||||
"ext.map.creator", "extractedCreator", "ext.map.Keywords", "extractedKeywords",
|
||||
"ext.map.Author", "extractedAuthor",
|
||||
"ext.def.fl", "extractedContent",
|
||||
"ext.literal.id", "one",
|
||||
"ext.map.Last-Modified", "extractedDate"
|
||||
loadLocal("solr-word.pdf", "map.created", "extractedDate", "map.producer", "extractedProducer",
|
||||
"map.creator", "extractedCreator", "map.Keywords", "extractedKeywords",
|
||||
"map.Author", "extractedAuthor",
|
||||
"map.content", "extractedContent",
|
||||
"literal.id", "one",
|
||||
"map.Last-Modified", "extractedDate"
|
||||
);
|
||||
assertQ(req("title:solr-word"), "//*[@numFound='0']");
|
||||
assertU(commit());
|
||||
assertQ(req("title:solr-word"), "//*[@numFound='1']");
|
||||
|
||||
loadLocal("simple.html", "ext.map.created", "extractedDate", "ext.map.producer", "extractedProducer",
|
||||
"ext.map.creator", "extractedCreator", "ext.map.Keywords", "extractedKeywords",
|
||||
"ext.map.Author", "extractedAuthor",
|
||||
"ext.map.language", "extractedLanguage",
|
||||
"ext.literal.id", "two",
|
||||
"ext.def.fl", "extractedContent",
|
||||
"ext.map.Last-Modified", "extractedDate"
|
||||
|
||||
loadLocal("simple.html", "map.created", "extractedDate", "map.producer", "extractedProducer",
|
||||
"map.creator", "extractedCreator", "map.Keywords", "extractedKeywords",
|
||||
"map.Author", "extractedAuthor",
|
||||
"map.language", "extractedLanguage",
|
||||
"literal.id", "two",
|
||||
"map.content", "extractedContent",
|
||||
"map.Last-Modified", "extractedDate"
|
||||
);
|
||||
assertQ(req("title:Welcome"), "//*[@numFound='0']");
|
||||
assertU(commit());
|
||||
assertQ(req("title:Welcome"), "//*[@numFound='1']");
|
||||
|
||||
loadLocal("version_control.xml", "ext.map.created", "extractedDate", "ext.map.producer", "extractedProducer",
|
||||
"ext.map.creator", "extractedCreator", "ext.map.Keywords", "extractedKeywords",
|
||||
"ext.map.Author", "extractedAuthor",
|
||||
"ext.literal.id", "three",
|
||||
"ext.def.fl", "extractedContent",
|
||||
"ext.map.language", "extractedLanguage",
|
||||
"ext.map.Last-Modified", "extractedDate"
|
||||
|
||||
loadLocal("simple.html",
|
||||
"literal.id","simple2",
|
||||
"uprefix", "t_",
|
||||
"lowernames", "true",
|
||||
"captureAttr", "true", "map.a","t_href",
|
||||
"map.content_language", "abcxyz", // test that lowernames is applied before mapping, and uprefix is applied after mapping
|
||||
"commit", "true" // test immediate commit
|
||||
);
|
||||
|
||||
// test that purposely causes a failure to print out the doc for test debugging
|
||||
// assertQ(req("q","id:simple2","indent","true"), "//*[@numFound='0']");
|
||||
|
||||
// test both lowernames and unknown field mapping
|
||||
assertQ(req("+id:simple2 +t_content_type:[* TO *]"), "//*[@numFound='1']");
|
||||
assertQ(req("+id:simple2 +t_href:[* TO *]"), "//*[@numFound='1']");
|
||||
assertQ(req("+id:simple2 +t_abcxyz:[* TO *]"), "//*[@numFound='1']");
|
||||
|
||||
// load again in the exact same way, but boost one field
|
||||
loadLocal("simple.html",
|
||||
"literal.id","simple3",
|
||||
"uprefix", "t_",
|
||||
"lowernames", "true",
|
||||
"captureAttr", "true", "map.a","t_href",
|
||||
"map.content_language", "abcxyz",
|
||||
"commit", "true"
|
||||
|
||||
,"boost.t_href", "100.0"
|
||||
);
|
||||
|
||||
assertQ(req("t_href:http"), "//*[@numFound='2']");
|
||||
assertQ(req("t_href:http"), "//doc[1]/str[.='simple3']");
|
||||
|
||||
// test capture
|
||||
loadLocal("simple.html",
|
||||
"literal.id","simple4",
|
||||
"uprefix", "t_",
|
||||
"capture","p", // capture only what is in the title element
|
||||
"commit", "true"
|
||||
);
|
||||
assertQ(req("+id:simple4 +t_content:Solr"), "//*[@numFound='1']");
|
||||
assertQ(req("+id:simple4 +t_p:\"here is some text\""), "//*[@numFound='1']");
|
||||
|
||||
loadLocal("version_control.xml", "map.created", "extractedDate", "map.producer", "extractedProducer",
|
||||
"map.creator", "extractedCreator", "map.Keywords", "extractedKeywords",
|
||||
"map.Author", "extractedAuthor",
|
||||
"literal.id", "three",
|
||||
"map.content", "extractedContent",
|
||||
"map.language", "extractedLanguage",
|
||||
"map.Last-Modified", "extractedDate"
|
||||
);
|
||||
assertQ(req("stream_name:version_control.xml"), "//*[@numFound='0']");
|
||||
assertU(commit());
|
||||
|
@ -93,15 +137,15 @@ public class ExtractingRequestHandlerTest extends AbstractSolrTestCase {
|
|||
ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
|
||||
assertTrue("handler is null and it shouldn't be", handler != null);
|
||||
//test literal
|
||||
loadLocal("version_control.xml", "ext.map.created", "extractedDate", "ext.map.producer", "extractedProducer",
|
||||
"ext.map.creator", "extractedCreator", "ext.map.Keywords", "extractedKeywords",
|
||||
"ext.map.Author", "extractedAuthor",
|
||||
"ext.def.fl", "extractedContent",
|
||||
"ext.literal.id", "one",
|
||||
"ext.map.language", "extractedLanguage",
|
||||
"ext.literal.extractionLiteralMV", "one",
|
||||
"ext.literal.extractionLiteralMV", "two",
|
||||
"ext.map.Last-Modified", "extractedDate"
|
||||
loadLocal("version_control.xml", "map.created", "extractedDate", "map.producer", "extractedProducer",
|
||||
"map.creator", "extractedCreator", "map.Keywords", "extractedKeywords",
|
||||
"map.Author", "extractedAuthor",
|
||||
"map.content", "extractedContent",
|
||||
"literal.id", "one",
|
||||
"map.language", "extractedLanguage",
|
||||
"literal.extractionLiteralMV", "one",
|
||||
"literal.extractionLiteralMV", "two",
|
||||
"map.Last-Modified", "extractedDate"
|
||||
|
||||
);
|
||||
assertQ(req("stream_name:version_control.xml"), "//*[@numFound='0']");
|
||||
|
@ -112,29 +156,30 @@ public class ExtractingRequestHandlerTest extends AbstractSolrTestCase {
|
|||
assertQ(req("extractionLiteralMV:two"), "//*[@numFound='1']");
|
||||
|
||||
try {
|
||||
loadLocal("version_control.xml", "ext.map.created", "extractedDate", "ext.map.producer", "extractedProducer",
|
||||
"ext.map.creator", "extractedCreator", "ext.map.Keywords", "extractedKeywords",
|
||||
"ext.map.Author", "extractedAuthor",
|
||||
"ext.def.fl", "extractedContent",
|
||||
"ext.literal.id", "two",
|
||||
"ext.map.language", "extractedLanguage",
|
||||
"ext.literal.extractionLiteral", "one",
|
||||
"ext.literal.extractionLiteral", "two",
|
||||
"ext.map.Last-Modified", "extractedDate"
|
||||
loadLocal("version_control.xml", "map.created", "extractedDate", "map.producer", "extractedProducer",
|
||||
"map.creator", "extractedCreator", "map.Keywords", "extractedKeywords",
|
||||
"map.Author", "extractedAuthor",
|
||||
"map.content", "extractedContent",
|
||||
"literal.id", "two",
|
||||
"map.language", "extractedLanguage",
|
||||
"literal.extractionLiteral", "one",
|
||||
"literal.extractionLiteral", "two",
|
||||
"map.Last-Modified", "extractedDate"
|
||||
);
|
||||
assertTrue("Exception should have been thrown", false);
|
||||
// TODO: original author did not specify why an exception should be thrown... how to fix?
|
||||
// assertTrue("Exception should have been thrown", false);
|
||||
} catch (SolrException e) {
|
||||
//nothing to see here, move along
|
||||
}
|
||||
|
||||
loadLocal("version_control.xml", "ext.map.created", "extractedDate", "ext.map.producer", "extractedProducer",
|
||||
"ext.map.creator", "extractedCreator", "ext.map.Keywords", "extractedKeywords",
|
||||
"ext.map.Author", "extractedAuthor",
|
||||
"ext.def.fl", "extractedContent",
|
||||
"ext.literal.id", "three",
|
||||
"ext.map.language", "extractedLanguage",
|
||||
"ext.literal.extractionLiteral", "one",
|
||||
"ext.map.Last-Modified", "extractedDate"
|
||||
loadLocal("version_control.xml", "map.created", "extractedDate", "map.producer", "extractedProducer",
|
||||
"map.creator", "extractedCreator", "map.Keywords", "extractedKeywords",
|
||||
"map.Author", "extractedAuthor",
|
||||
"map.content", "extractedContent",
|
||||
"literal.id", "three",
|
||||
"map.language", "extractedLanguage",
|
||||
"literal.extractionLiteral", "one",
|
||||
"map.Last-Modified", "extractedDate"
|
||||
);
|
||||
assertU(commit());
|
||||
assertQ(req("extractionLiteral:one"), "//*[@numFound='1']");
|
||||
|
@ -147,12 +192,12 @@ public class ExtractingRequestHandlerTest extends AbstractSolrTestCase {
|
|||
assertTrue("handler is null and it shouldn't be", handler != null);
|
||||
|
||||
// Load plain text specifying MIME type:
|
||||
loadLocal("version_control.txt", "ext.map.created", "extractedDate", "ext.map.producer", "extractedProducer",
|
||||
"ext.map.creator", "extractedCreator", "ext.map.Keywords", "extractedKeywords",
|
||||
"ext.map.Author", "extractedAuthor",
|
||||
"ext.literal.id", "one",
|
||||
"ext.map.language", "extractedLanguage",
|
||||
"ext.def.fl", "extractedContent",
|
||||
loadLocal("version_control.txt", "map.created", "extractedDate", "map.producer", "extractedProducer",
|
||||
"map.creator", "extractedCreator", "map.Keywords", "extractedKeywords",
|
||||
"map.Author", "extractedAuthor",
|
||||
"literal.id", "one",
|
||||
"map.language", "extractedLanguage",
|
||||
"map.content", "extractedContent",
|
||||
ExtractingParams.STREAM_TYPE, "text/plain"
|
||||
);
|
||||
assertQ(req("extractedContent:Apache"), "//*[@numFound='0']");
|
||||
|
@ -165,12 +210,12 @@ public class ExtractingRequestHandlerTest extends AbstractSolrTestCase {
|
|||
assertTrue("handler is null and it shouldn't be", handler != null);
|
||||
|
||||
// Load plain text specifying filename
|
||||
loadLocal("version_control.txt", "ext.map.created", "extractedDate", "ext.map.producer", "extractedProducer",
|
||||
"ext.map.creator", "extractedCreator", "ext.map.Keywords", "extractedKeywords",
|
||||
"ext.map.Author", "extractedAuthor",
|
||||
"ext.literal.id", "one",
|
||||
"ext.map.language", "extractedLanguage",
|
||||
"ext.def.fl", "extractedContent",
|
||||
loadLocal("version_control.txt", "map.created", "extractedDate", "map.producer", "extractedProducer",
|
||||
"map.creator", "extractedCreator", "map.Keywords", "extractedKeywords",
|
||||
"map.Author", "extractedAuthor",
|
||||
"literal.id", "one",
|
||||
"map.language", "extractedLanguage",
|
||||
"map.content", "extractedContent",
|
||||
ExtractingParams.RESOURCE_NAME, "version_control.txt"
|
||||
);
|
||||
assertQ(req("extractedContent:Apache"), "//*[@numFound='0']");
|
||||
|
|
|
@ -315,7 +315,7 @@
|
|||
|
||||
|
||||
<fields>
|
||||
<field name="id" type="integer" indexed="true" stored="true" multiValued="false" required="false"/>
|
||||
<field name="id" type="string" indexed="true" stored="true" multiValued="false" required="false"/>
|
||||
<field name="name" type="nametext" indexed="true" stored="true"/>
|
||||
<field name="text" type="text" indexed="true" stored="false"/>
|
||||
<field name="subject" type="text" indexed="true" stored="true"/>
|
||||
|
@ -443,8 +443,8 @@
|
|||
<dynamicField name="*aa" type="string" indexed="true" stored="true"/>
|
||||
<dynamicField name="*aaa" type="integer" indexed="false" stored="true"/>
|
||||
|
||||
<!-- ignored becuase not stored or indexed -->
|
||||
<dynamicField name="*_ignored" type="text" indexed="false" stored="false"/>
|
||||
<!-- ignored because not stored or indexed -->
|
||||
<dynamicField name="ignored_*" type="text" indexed="false" stored="false"/>
|
||||
|
||||
</fields>
|
||||
|
||||
|
|
|
@ -401,6 +401,9 @@
|
|||
<dynamicField name="*_d" type="sdouble" indexed="true" stored="true"/>
|
||||
<dynamicField name="*_dt" type="date" indexed="true" stored="true"/>
|
||||
|
||||
<dynamicField name="ignored_*" type="ignored"/>
|
||||
<dynamicField name="attr_*" type="text" indexed="true" stored="true" multiValued="true"/>
|
||||
|
||||
<dynamicField name="random*" type="random" />
|
||||
|
||||
<!-- uncomment the following to ignore any fields that don't already match an existing
|
||||
|
|
|
@ -640,14 +640,12 @@
|
|||
</arr>
|
||||
</requestHandler>
|
||||
|
||||
<!--
|
||||
<requestHandler name="/update/extract" class="org.apache.solr.handler.extraction.ExtractingRequestHandler">
|
||||
<requestHandler name="/update/extract" class="org.apache.solr.handler.extraction.ExtractingRequestHandler" startup="lazy">
|
||||
<lst name="defaults">
|
||||
<str name="ext.map.Last-Modified">last_modified</str>
|
||||
<bool name="ext.ignore.und.fl">true</bool>
|
||||
<str name="uprefix">ignored_</str>
|
||||
<str name="map.content">text</str>
|
||||
</lst>
|
||||
</requestHandler>
|
||||
-->
|
||||
|
||||
|
||||
<!-- A component to return terms and document frequency of those terms.
|
||||
|
|
Loading…
Reference in New Issue