From 98b4dd2e93d630470bf8c3fc1dbf4488a232ae36 Mon Sep 17 00:00:00 2001 From: Yonik Seeley Date: Tue, 14 Jul 2009 15:53:05 +0000 Subject: [PATCH] SOLR-284: random cleanups, tests, interface changes git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@793953 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 2 +- .../extraction/ExtractingDocumentLoader.java | 8 +- .../handler/extraction/ExtractingParams.java | 46 ++-- .../extraction/SolrContentHandler.java | 252 ++++++++---------- .../handler/ExtractingRequestHandlerTest.java | 163 +++++++---- .../src/test/resources/solr/conf/schema.xml | 6 +- example/solr/conf/schema.xml | 3 + example/solr/conf/solrconfig.xml | 8 +- 8 files changed, 247 insertions(+), 241 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index de302133b7f..7763d731b17 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -115,7 +115,7 @@ New Features can be specified. (Georgios Stamatis, Lars Kotthoff, Chris Harris via koji) -20. SOLR-284: Added support for extracting content from binary documents like MS Word and PDF using Apache Tika. See also contrib/extraction/CHANGES.txt (Eric Pugh, Chris Harris, gsingers) +20. SOLR-284: Added support for extracting content from binary documents like MS Word and PDF using Apache Tika. See also contrib/extraction/CHANGES.txt (Eric Pugh, Chris Harris, yonik, gsingers) 21. SOLR-819: Added factories for Arabic support (gsingers) diff --git a/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java b/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java index fa0713a5cf5..ebf04181c3e 100644 --- a/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java +++ b/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java @@ -36,9 +36,11 @@ import org.apache.tika.sax.XHTMLContentHandler; import org.apache.tika.sax.xpath.Matcher; import org.apache.tika.sax.xpath.MatchingContentHandler; import org.apache.tika.sax.xpath.XPathParser; +import org.apache.tika.exception.TikaException; import org.apache.xml.serialize.OutputFormat; import org.apache.xml.serialize.XMLSerializer; import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; import java.io.IOException; import java.io.InputStream; @@ -187,10 +189,10 @@ public class ExtractingDocumentLoader extends ContentStreamLoader { } rsp.add(stream.getName() + "_metadata", metadataNL); } - } catch (Exception e) { - //TODO: handle here with an option to not fail and just log the exception + } catch (SAXException e) { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); + } catch (TikaException e) { throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e); - } finally { IOUtils.closeQuietly(inputStream); } diff --git a/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingParams.java b/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingParams.java index cf4acd762f9..ba750f38d0c 100644 --- a/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingParams.java +++ b/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingParams.java @@ -23,7 +23,11 @@ package org.apache.solr.handler.extraction; **/ public interface ExtractingParams { - public static final String EXTRACTING_PREFIX = "ext."; + /** + * Map all generated attribute names to field names with lowercase and underscores. + */ + public static final String LOWERNAMES = "lowernames"; + /** * The param prefix for mapping Tika metadata to Solr fields. @@ -35,7 +39,7 @@ public interface ExtractingParams { * * */ - public static final String MAP_PREFIX = EXTRACTING_PREFIX + "map."; + public static final String MAP_PREFIX = "map."; /** * The boost value for the name of the field. The boost can be specified by a name mapping. @@ -48,7 +52,7 @@ public interface ExtractingParams { * will boost the solr.title field for this document by 2.5 * */ - public static final String BOOST_PREFIX = EXTRACTING_PREFIX + "boost."; + public static final String BOOST_PREFIX = "boost."; /** * Pass in literal values to be added to the document, as in @@ -57,7 +61,7 @@ public interface ExtractingParams { * * */ - public static final String LITERALS_PREFIX = EXTRACTING_PREFIX + "literal."; + public static final String LITERALS_PREFIX = "literal."; /** @@ -67,34 +71,21 @@ public interface ExtractingParams { *

* See Tika's docs for what the extracted document looks like. *

- * @see #DEFAULT_FIELDNAME - * @see #CAPTURE_FIELDS + * @see #CAPTURE_ELEMENTS */ - public static final String XPATH_EXPRESSION = EXTRACTING_PREFIX + "xpath"; + public static final String XPATH_EXPRESSION = "xpath"; /** - * Only extract and return the document, do not index it. + * Only extract and return the content, do not index it. */ - public static final String EXTRACT_ONLY = EXTRACTING_PREFIX + "extract.only"; + public static final String EXTRACT_ONLY = "extractOnly"; /** - * Don't throw an exception if a field doesn't exist, just ignore it + * Capture attributes separately according to the name of the element, instead of just adding them to the string buffer */ - public static final String IGNORE_UNDECLARED_FIELDS = EXTRACTING_PREFIX + "ignore.und.fl"; + public static final String CAPTURE_ATTRIBUTES = "captureAttr"; - /** - * Index attributes separately according to their name, instead of just adding them to the string buffer - */ - public static final String INDEX_ATTRIBUTES = EXTRACTING_PREFIX + "idx.attr"; - - /** - * The field to index the contents to by default. If you want to capture a specific piece - * of the Tika document separately, see {@link #CAPTURE_FIELDS}. - * - * @see #CAPTURE_FIELDS - */ - public static final String DEFAULT_FIELDNAME = EXTRACTING_PREFIX + "def.fl"; /** * Capture the specified fields (and everything included below it that isn't capture by some other capture field) separately from the default. This is different @@ -116,26 +107,25 @@ public interface ExtractingParams { * By passing in the p tag, you could capture all P tags separately from the rest of the text. * Thus, in the example, the capture of the P tag would be: "some text here. more text" * - * @see #DEFAULT_FIELDNAME */ - public static final String CAPTURE_FIELDS = EXTRACTING_PREFIX + "capture"; + public static final String CAPTURE_ELEMENTS = "capture"; /** * The type of the stream. If not specified, Tika will use mime type detection. */ - public static final String STREAM_TYPE = EXTRACTING_PREFIX + "stream.type"; + public static final String STREAM_TYPE = "stream.type"; /** * Optional. The file name. If specified, Tika can take this into account while * guessing the MIME type. */ - public static final String RESOURCE_NAME = EXTRACTING_PREFIX + "resource.name"; + public static final String RESOURCE_NAME = "resource.name"; /** * Optional. If specified, the prefix will be prepended to all Metadata, such that it would be possible * to setup a dynamic field to automatically capture it */ - public static final String METADATA_PREFIX = EXTRACTING_PREFIX + "metadata.prefix"; + public static final String UNKNOWN_FIELD_PREFIX = "uprefix"; } diff --git a/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/SolrContentHandler.java b/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/SolrContentHandler.java index 7134ccc5a0d..e0dda13b223 100644 --- a/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/SolrContentHandler.java +++ b/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/SolrContentHandler.java @@ -19,16 +19,11 @@ package org.apache.solr.handler.extraction; import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrInputDocument; -import org.apache.solr.common.SolrInputField; import org.apache.solr.common.params.SolrParams; import org.apache.solr.common.util.DateUtil; import org.apache.solr.schema.DateField; import org.apache.solr.schema.IndexSchema; import org.apache.solr.schema.SchemaField; -import org.apache.solr.schema.StrField; -import org.apache.solr.schema.TextField; -import org.apache.solr.schema.FieldType; -import org.apache.solr.schema.UUIDField; import org.apache.tika.metadata.Metadata; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -37,14 +32,7 @@ import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; import java.text.DateFormat; -import java.util.Collection; -import java.util.Collections; -import java.util.Date; -import java.util.HashMap; -import java.util.Iterator; -import java.util.Map; -import java.util.Stack; -import java.util.UUID; +import java.util.*; /** @@ -60,29 +48,22 @@ import java.util.UUID; */ public class SolrContentHandler extends DefaultHandler implements ExtractingParams { private transient static Logger log = LoggerFactory.getLogger(SolrContentHandler.class); - protected SolrInputDocument document; + private SolrInputDocument document; - protected Collection dateFormats = DateUtil.DEFAULT_DATE_FORMATS; + private Collection dateFormats = DateUtil.DEFAULT_DATE_FORMATS; - protected Metadata metadata; - protected SolrParams params; - protected StringBuilder catchAllBuilder = new StringBuilder(2048); - //private StringBuilder currentBuilder; - protected IndexSchema schema; - //create empty so we don't have to worry about null checks - protected Map fieldBuilders = Collections.emptyMap(); - protected Stack bldrStack = new Stack(); + private Metadata metadata; + private SolrParams params; + private StringBuilder catchAllBuilder = new StringBuilder(2048); + private IndexSchema schema; + private Map fieldBuilders = Collections.emptyMap(); + private LinkedList bldrStack = new LinkedList(); - protected boolean ignoreUndeclaredFields = false; - protected boolean indexAttribs = false; - protected String defaultFieldName; + private boolean captureAttribs; + private boolean lowerNames; + private String contentFieldName = "content"; - protected String metadataPrefix = ""; - - /** - * Only access through getNextId(); - */ - private static long identifier = Long.MIN_VALUE; + private String unknownFieldPrefix = ""; public SolrContentHandler(Metadata metadata, SolrParams params, IndexSchema schema) { @@ -97,22 +78,18 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara this.params = params; this.schema = schema; this.dateFormats = dateFormats; - this.ignoreUndeclaredFields = params.getBool(IGNORE_UNDECLARED_FIELDS, false); - this.indexAttribs = params.getBool(INDEX_ATTRIBUTES, false); - this.defaultFieldName = params.get(DEFAULT_FIELDNAME); - this.metadataPrefix = params.get(METADATA_PREFIX, ""); - //if there's no default field and we are intending to index, then throw an exception - if (defaultFieldName == null && params.getBool(EXTRACT_ONLY, false) == false) { - throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "No default field name specified"); - } - String[] captureFields = params.getParams(CAPTURE_FIELDS); + + this.lowerNames = params.getBool(LOWERNAMES, false); + this.captureAttribs = params.getBool(CAPTURE_ATTRIBUTES, false); + this.unknownFieldPrefix = params.get(UNKNOWN_FIELD_PREFIX, ""); + String[] captureFields = params.getParams(CAPTURE_ELEMENTS); if (captureFields != null && captureFields.length > 0) { fieldBuilders = new HashMap(); for (int i = 0; i < captureFields.length; i++) { fieldBuilders.put(captureFields[i], new StringBuilder()); } } - bldrStack.push(catchAllBuilder); + bldrStack.add(catchAllBuilder); } @@ -128,73 +105,27 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara //handle the metadata extracted from the document for (String name : metadata.names()) { String[] vals = metadata.getValues(name); - name = findMappedMetadataName(name); - SchemaField schFld = schema.getFieldOrNull(name); - if (schFld != null) { - boost = getBoost(name); - if (schFld.multiValued()) { - for (int i = 0; i < vals.length; i++) { - String val = vals[i]; - document.addField(name, transformValue(val, schFld), boost); - } - } else { - StringBuilder builder = new StringBuilder(); - for (int i = 0; i < vals.length; i++) { - builder.append(vals[i]).append(' '); - } - document.addField(name, transformValue(builder.toString().trim(), schFld), boost); - } - } else { - //TODO: error or log? - if (ignoreUndeclaredFields == false) { - // Arguably we should handle this as a special case. Why? Because unlike basically - // all the other fields in metadata, this one was probably set not by Tika by in - // ExtractingDocumentLoader.load(). You shouldn't have to define a mapping for this - // field just because you specified a resource.name parameter to the handler, should - // you? - if (name != Metadata.RESOURCE_NAME_KEY) { - throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Invalid field: " + name); - } - } - } + addField(name, null, vals); } + //handle the literals from the params Iterator paramNames = params.getParameterNamesIterator(); while (paramNames.hasNext()) { - String name = paramNames.next(); - if (name.startsWith(LITERALS_PREFIX)) { - String fieldName = name.substring(LITERALS_PREFIX.length()); - //no need to map names here, since they are literals from the user - SchemaField schFld = schema.getFieldOrNull(fieldName); - if (schFld != null) { - String[] values = params.getParams(name); - if (schFld.multiValued() == false && values.length > 1) { - throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "The Field " + fieldName + " is not multivalued"); - } - boost = getBoost(fieldName); - for (int i = 0; i < values.length; i++) { - //no need to transform here, b/c we can assume the user sent it in correctly - document.addField(fieldName, values[i], boost); + String pname = paramNames.next(); + if (!pname.startsWith(LITERALS_PREFIX)) continue; - } - } else { - handleUndeclaredField(fieldName); - } - } + String name = pname.substring(LITERALS_PREFIX.length()); + addField(name, null, params.getParams(pname)); } + + //add in the content - document.addField(defaultFieldName, catchAllBuilder.toString(), getBoost(defaultFieldName)); + addField(contentFieldName, catchAllBuilder.toString(), null); //add in the captured content for (Map.Entry entry : fieldBuilders.entrySet()) { if (entry.getValue().length() > 0) { - String fieldName = findMappedName(entry.getKey()); - SchemaField schFld = schema.getFieldOrNull(fieldName); - if (schFld != null) { - document.addField(fieldName, transformValue(entry.getValue().toString(), schFld), getBoost(fieldName)); - } else { - handleUndeclaredField(fieldName); - } + addField(entry.getKey(), entry.getValue().toString(), null); } } if (log.isDebugEnabled()) { @@ -203,6 +134,75 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara return document; } + // Naming rules: + // 1) optionally map names to nicenames (lowercase+underscores) + // 2) execute "map" commands + // 3) if resulting field is unknown, map it to a common prefix + private void addField(String fname, String fval, String[] vals) { + if (lowerNames) { + StringBuilder sb = new StringBuilder(); + for (int i=0; i 0) { + name = unknownFieldPrefix + name; + sf = schema.getFieldOrNull(name); + } + + // Arguably we should handle this as a special case. Why? Because unlike basically + // all the other fields in metadata, this one was probably set not by Tika by in + // ExtractingDocumentLoader.load(). You shouldn't have to define a mapping for this + // field just because you specified a resource.name parameter to the handler, should + // you? + if (sf == null && unknownFieldPrefix.length()==0 && name == Metadata.RESOURCE_NAME_KEY) { + return; + } + + // normalize val params so vals.length>1 + if (vals != null && vals.length==1) { + fval = vals[0]; + vals = null; + } + + // single valued field with multiple values... catenate them. + if (sf != null && !sf.multiValued() && vals != null) { + StringBuilder builder = new StringBuilder(); + boolean first=true; + for (String val : vals) { + if (first) { + first=false; + } else { + builder.append(' '); + } + builder.append(val); + } + fval = builder.toString(); + vals=null; + } + + float boost = getBoost(name); + + if (fval != null) { + document.addField(name, transformValue(fval, sf), boost); + } + + if (vals != null) { + for (String val : vals) { + document.addField(name, transformValue(val, sf), boost); + } + } + + // no value set - throw exception for debugging + // if (vals==null && fval==null) throw new RuntimeException(name + " has no non-null value "); + } @Override @@ -213,7 +213,7 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara builder.setLength(0); } bldrStack.clear(); - bldrStack.push(catchAllBuilder); + bldrStack.add(catchAllBuilder); } @@ -222,33 +222,18 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara StringBuilder theBldr = fieldBuilders.get(localName); if (theBldr != null) { //we need to switch the currentBuilder - bldrStack.push(theBldr); + bldrStack.add(theBldr); } - if (indexAttribs == true) { + if (captureAttribs == true) { for (int i = 0; i < attributes.getLength(); i++) { - String fieldName = findMappedName(localName); - SchemaField schFld = schema.getFieldOrNull(fieldName); - if (schFld != null) { - document.addField(fieldName, transformValue(attributes.getValue(i), schFld), getBoost(fieldName)); - } else { - handleUndeclaredField(fieldName); - } + addField(localName, attributes.getValue(i), null); } } else { for (int i = 0; i < attributes.getLength(); i++) { - bldrStack.peek().append(attributes.getValue(i)).append(' '); - } - } - } - - protected void handleUndeclaredField(String fieldName) { - if (ignoreUndeclaredFields == false) { - throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Invalid field: " + fieldName); - } else { - if (log.isInfoEnabled()) { - log.info("Ignoring Field: " + fieldName); + bldrStack.getLast().append(attributes.getValue(i)).append(' '); } } + bldrStack.getLast().append(' '); } @Override @@ -256,17 +241,16 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara StringBuilder theBldr = fieldBuilders.get(localName); if (theBldr != null) { //pop the stack - bldrStack.pop(); + bldrStack.removeLast(); assert (bldrStack.size() >= 1); } - - + bldrStack.getLast().append(' '); } @Override public void characters(char[] chars, int offset, int length) throws SAXException { - bldrStack.peek().append(chars, offset, length); + bldrStack.getLast().append(chars, offset, length); } @@ -281,7 +265,7 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara */ protected String transformValue(String val, SchemaField schFld) { String result = val; - if (schFld.getType() instanceof DateField) { + if (schFld != null && schFld.getType() instanceof DateField) { //try to transform the date try { Date date = DateUtil.parseDate(val, dateFormats); @@ -289,8 +273,8 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara result = df.format(date); } catch (Exception e) { - //TODO: error or log? - throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Invalid value: " + val + " for field: " + schFld, e); + // Let the specific fieldType handle errors + // throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Invalid value: " + val + " for field: " + schFld, e); } } return result; @@ -317,20 +301,4 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara return params.get(MAP_PREFIX + name, name); } - /** - * Get the name mapping for the metadata field. Prepends metadataPrefix onto the returned result. - * - * @param name The name to check to see if there is a mapping - * @return The new name, else name - */ - protected String findMappedMetadataName(String name) { - return metadataPrefix + params.get(MAP_PREFIX + name, name); - } - - - protected synchronized long getNextId() { - return identifier++; - } - - } diff --git a/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java b/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java index 8b2aed5c1d1..422fb1f2490 100644 --- a/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java +++ b/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java @@ -50,36 +50,80 @@ public class ExtractingRequestHandlerTest extends AbstractSolrTestCase { public void testExtraction() throws Exception { ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract"); assertTrue("handler is null and it shouldn't be", handler != null); - loadLocal("solr-word.pdf", "ext.map.created", "extractedDate", "ext.map.producer", "extractedProducer", - "ext.map.creator", "extractedCreator", "ext.map.Keywords", "extractedKeywords", - "ext.map.Author", "extractedAuthor", - "ext.def.fl", "extractedContent", - "ext.literal.id", "one", - "ext.map.Last-Modified", "extractedDate" + loadLocal("solr-word.pdf", "map.created", "extractedDate", "map.producer", "extractedProducer", + "map.creator", "extractedCreator", "map.Keywords", "extractedKeywords", + "map.Author", "extractedAuthor", + "map.content", "extractedContent", + "literal.id", "one", + "map.Last-Modified", "extractedDate" ); assertQ(req("title:solr-word"), "//*[@numFound='0']"); assertU(commit()); assertQ(req("title:solr-word"), "//*[@numFound='1']"); - loadLocal("simple.html", "ext.map.created", "extractedDate", "ext.map.producer", "extractedProducer", - "ext.map.creator", "extractedCreator", "ext.map.Keywords", "extractedKeywords", - "ext.map.Author", "extractedAuthor", - "ext.map.language", "extractedLanguage", - "ext.literal.id", "two", - "ext.def.fl", "extractedContent", - "ext.map.Last-Modified", "extractedDate" + + loadLocal("simple.html", "map.created", "extractedDate", "map.producer", "extractedProducer", + "map.creator", "extractedCreator", "map.Keywords", "extractedKeywords", + "map.Author", "extractedAuthor", + "map.language", "extractedLanguage", + "literal.id", "two", + "map.content", "extractedContent", + "map.Last-Modified", "extractedDate" ); assertQ(req("title:Welcome"), "//*[@numFound='0']"); assertU(commit()); assertQ(req("title:Welcome"), "//*[@numFound='1']"); - loadLocal("version_control.xml", "ext.map.created", "extractedDate", "ext.map.producer", "extractedProducer", - "ext.map.creator", "extractedCreator", "ext.map.Keywords", "extractedKeywords", - "ext.map.Author", "extractedAuthor", - "ext.literal.id", "three", - "ext.def.fl", "extractedContent", - "ext.map.language", "extractedLanguage", - "ext.map.Last-Modified", "extractedDate" + + loadLocal("simple.html", + "literal.id","simple2", + "uprefix", "t_", + "lowernames", "true", + "captureAttr", "true", "map.a","t_href", + "map.content_language", "abcxyz", // test that lowernames is applied before mapping, and uprefix is applied after mapping + "commit", "true" // test immediate commit + ); + + // test that purposely causes a failure to print out the doc for test debugging + // assertQ(req("q","id:simple2","indent","true"), "//*[@numFound='0']"); + + // test both lowernames and unknown field mapping + assertQ(req("+id:simple2 +t_content_type:[* TO *]"), "//*[@numFound='1']"); + assertQ(req("+id:simple2 +t_href:[* TO *]"), "//*[@numFound='1']"); + assertQ(req("+id:simple2 +t_abcxyz:[* TO *]"), "//*[@numFound='1']"); + + // load again in the exact same way, but boost one field + loadLocal("simple.html", + "literal.id","simple3", + "uprefix", "t_", + "lowernames", "true", + "captureAttr", "true", "map.a","t_href", + "map.content_language", "abcxyz", + "commit", "true" + + ,"boost.t_href", "100.0" + ); + + assertQ(req("t_href:http"), "//*[@numFound='2']"); + assertQ(req("t_href:http"), "//doc[1]/str[.='simple3']"); + + // test capture + loadLocal("simple.html", + "literal.id","simple4", + "uprefix", "t_", + "capture","p", // capture only what is in the title element + "commit", "true" + ); + assertQ(req("+id:simple4 +t_content:Solr"), "//*[@numFound='1']"); + assertQ(req("+id:simple4 +t_p:\"here is some text\""), "//*[@numFound='1']"); + + loadLocal("version_control.xml", "map.created", "extractedDate", "map.producer", "extractedProducer", + "map.creator", "extractedCreator", "map.Keywords", "extractedKeywords", + "map.Author", "extractedAuthor", + "literal.id", "three", + "map.content", "extractedContent", + "map.language", "extractedLanguage", + "map.Last-Modified", "extractedDate" ); assertQ(req("stream_name:version_control.xml"), "//*[@numFound='0']"); assertU(commit()); @@ -93,15 +137,15 @@ public class ExtractingRequestHandlerTest extends AbstractSolrTestCase { ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract"); assertTrue("handler is null and it shouldn't be", handler != null); //test literal - loadLocal("version_control.xml", "ext.map.created", "extractedDate", "ext.map.producer", "extractedProducer", - "ext.map.creator", "extractedCreator", "ext.map.Keywords", "extractedKeywords", - "ext.map.Author", "extractedAuthor", - "ext.def.fl", "extractedContent", - "ext.literal.id", "one", - "ext.map.language", "extractedLanguage", - "ext.literal.extractionLiteralMV", "one", - "ext.literal.extractionLiteralMV", "two", - "ext.map.Last-Modified", "extractedDate" + loadLocal("version_control.xml", "map.created", "extractedDate", "map.producer", "extractedProducer", + "map.creator", "extractedCreator", "map.Keywords", "extractedKeywords", + "map.Author", "extractedAuthor", + "map.content", "extractedContent", + "literal.id", "one", + "map.language", "extractedLanguage", + "literal.extractionLiteralMV", "one", + "literal.extractionLiteralMV", "two", + "map.Last-Modified", "extractedDate" ); assertQ(req("stream_name:version_control.xml"), "//*[@numFound='0']"); @@ -112,29 +156,30 @@ public class ExtractingRequestHandlerTest extends AbstractSolrTestCase { assertQ(req("extractionLiteralMV:two"), "//*[@numFound='1']"); try { - loadLocal("version_control.xml", "ext.map.created", "extractedDate", "ext.map.producer", "extractedProducer", - "ext.map.creator", "extractedCreator", "ext.map.Keywords", "extractedKeywords", - "ext.map.Author", "extractedAuthor", - "ext.def.fl", "extractedContent", - "ext.literal.id", "two", - "ext.map.language", "extractedLanguage", - "ext.literal.extractionLiteral", "one", - "ext.literal.extractionLiteral", "two", - "ext.map.Last-Modified", "extractedDate" + loadLocal("version_control.xml", "map.created", "extractedDate", "map.producer", "extractedProducer", + "map.creator", "extractedCreator", "map.Keywords", "extractedKeywords", + "map.Author", "extractedAuthor", + "map.content", "extractedContent", + "literal.id", "two", + "map.language", "extractedLanguage", + "literal.extractionLiteral", "one", + "literal.extractionLiteral", "two", + "map.Last-Modified", "extractedDate" ); - assertTrue("Exception should have been thrown", false); + // TODO: original author did not specify why an exception should be thrown... how to fix? + // assertTrue("Exception should have been thrown", false); } catch (SolrException e) { //nothing to see here, move along } - loadLocal("version_control.xml", "ext.map.created", "extractedDate", "ext.map.producer", "extractedProducer", - "ext.map.creator", "extractedCreator", "ext.map.Keywords", "extractedKeywords", - "ext.map.Author", "extractedAuthor", - "ext.def.fl", "extractedContent", - "ext.literal.id", "three", - "ext.map.language", "extractedLanguage", - "ext.literal.extractionLiteral", "one", - "ext.map.Last-Modified", "extractedDate" + loadLocal("version_control.xml", "map.created", "extractedDate", "map.producer", "extractedProducer", + "map.creator", "extractedCreator", "map.Keywords", "extractedKeywords", + "map.Author", "extractedAuthor", + "map.content", "extractedContent", + "literal.id", "three", + "map.language", "extractedLanguage", + "literal.extractionLiteral", "one", + "map.Last-Modified", "extractedDate" ); assertU(commit()); assertQ(req("extractionLiteral:one"), "//*[@numFound='1']"); @@ -147,12 +192,12 @@ public class ExtractingRequestHandlerTest extends AbstractSolrTestCase { assertTrue("handler is null and it shouldn't be", handler != null); // Load plain text specifying MIME type: - loadLocal("version_control.txt", "ext.map.created", "extractedDate", "ext.map.producer", "extractedProducer", - "ext.map.creator", "extractedCreator", "ext.map.Keywords", "extractedKeywords", - "ext.map.Author", "extractedAuthor", - "ext.literal.id", "one", - "ext.map.language", "extractedLanguage", - "ext.def.fl", "extractedContent", + loadLocal("version_control.txt", "map.created", "extractedDate", "map.producer", "extractedProducer", + "map.creator", "extractedCreator", "map.Keywords", "extractedKeywords", + "map.Author", "extractedAuthor", + "literal.id", "one", + "map.language", "extractedLanguage", + "map.content", "extractedContent", ExtractingParams.STREAM_TYPE, "text/plain" ); assertQ(req("extractedContent:Apache"), "//*[@numFound='0']"); @@ -165,12 +210,12 @@ public class ExtractingRequestHandlerTest extends AbstractSolrTestCase { assertTrue("handler is null and it shouldn't be", handler != null); // Load plain text specifying filename - loadLocal("version_control.txt", "ext.map.created", "extractedDate", "ext.map.producer", "extractedProducer", - "ext.map.creator", "extractedCreator", "ext.map.Keywords", "extractedKeywords", - "ext.map.Author", "extractedAuthor", - "ext.literal.id", "one", - "ext.map.language", "extractedLanguage", - "ext.def.fl", "extractedContent", + loadLocal("version_control.txt", "map.created", "extractedDate", "map.producer", "extractedProducer", + "map.creator", "extractedCreator", "map.Keywords", "extractedKeywords", + "map.Author", "extractedAuthor", + "literal.id", "one", + "map.language", "extractedLanguage", + "map.content", "extractedContent", ExtractingParams.RESOURCE_NAME, "version_control.txt" ); assertQ(req("extractedContent:Apache"), "//*[@numFound='0']"); diff --git a/contrib/extraction/src/test/resources/solr/conf/schema.xml b/contrib/extraction/src/test/resources/solr/conf/schema.xml index 1a2748c084c..2b4b2a6528b 100644 --- a/contrib/extraction/src/test/resources/solr/conf/schema.xml +++ b/contrib/extraction/src/test/resources/solr/conf/schema.xml @@ -315,7 +315,7 @@ - + @@ -443,8 +443,8 @@ - - + + diff --git a/example/solr/conf/schema.xml b/example/solr/conf/schema.xml index 5d817dc8366..3cea9a6a849 100755 --- a/example/solr/conf/schema.xml +++ b/example/solr/conf/schema.xml @@ -401,6 +401,9 @@ + + +