From 25c1d174489da3aede0e17f59487cf72a6398565 Mon Sep 17 00:00:00 2001 From: Grant Ingersoll Date: Sun, 14 Dec 2008 03:47:42 +0000 Subject: [PATCH] SOLR-284: handle multivalued literals git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@726350 13f79535-47bb-0310-9956-ffa450edef68 --- .../extraction/SolrContentHandler.java | 24 ++--- .../handler/ExtractingRequestHandlerTest.java | 87 +++++++++++++++---- .../src/test/resources/solr/conf/schema.xml | 3 + 3 files changed, 87 insertions(+), 27 deletions(-) diff --git a/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/SolrContentHandler.java b/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/SolrContentHandler.java index acaa20fd175..eb9467b769a 100644 --- a/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/SolrContentHandler.java +++ b/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/SolrContentHandler.java @@ -34,13 +34,12 @@ import java.util.UUID; * The class responsible for handling Tika events and translating them into {@link org.apache.solr.common.SolrInputDocument}s. * This class is not thread-safe. *

- * + *

* User's may wish to override this class to provide their own functionality. * * @see org.apache.solr.handler.extraction.SolrContentHandlerFactory * @see org.apache.solr.handler.extraction.ExtractingRequestHandler * @see org.apache.solr.handler.extraction.ExtractingDocumentLoader - * */ public class SolrContentHandler extends DefaultHandler implements ExtractingParams { private transient static Logger log = LoggerFactory.getLogger(SolrContentHandler.class); @@ -151,10 +150,16 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara //no need to map names here, since they are literals from the user SchemaField schFld = schema.getFieldOrNull(fieldName); if (schFld != null) { - String value = params.get(name); + String[] values = params.getParams(name); + if (schFld.multiValued() == false && values.length > 1) { + throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "The Field " + fieldName + " is not multivalued"); + } boost = getBoost(fieldName); - //no need to transform here, b/c we can assume the user sent it in correctly - document.addField(fieldName, value, boost); + for (int i = 0; i < values.length; i++) { + //no need to transform here, b/c we can assume the user sent it in correctly + document.addField(fieldName, values[i], boost); + + } } else { handleUndeclaredField(fieldName); } @@ -219,10 +224,9 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara //last chance, just create one uniqId = UUID.randomUUID().toString(); } - } else if (type instanceof UUIDField){ + } else if (type instanceof UUIDField) { uniqId = UUID.randomUUID().toString(); - } - else { + } else { uniqId = String.valueOf(getNextId()); } return uniqId; @@ -294,8 +298,6 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara } - - /** * Can be used to transform input values based on their {@link org.apache.solr.schema.SchemaField} *

@@ -354,7 +356,7 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara } - protected synchronized long getNextId(){ + protected synchronized long getNextId() { return identifier++; } diff --git a/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java b/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java index 81266a0e390..ae9e10b3d9a 100644 --- a/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java +++ b/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java @@ -6,6 +6,7 @@ import org.apache.solr.request.SolrQueryResponse; import org.apache.solr.common.util.ContentStream; import org.apache.solr.common.util.ContentStreamBase; import org.apache.solr.common.util.NamedList; +import org.apache.solr.common.SolrException; import org.apache.solr.handler.extraction.ExtractingParams; import org.apache.solr.handler.extraction.ExtractingRequestHandler; @@ -19,8 +20,15 @@ import java.io.File; * **/ public class ExtractingRequestHandlerTest extends AbstractSolrTestCase { - @Override public String getSchemaFile() { return "schema.xml"; } - @Override public String getSolrConfigFile() { return "solrconfig.xml"; } + @Override + public String getSchemaFile() { + return "schema.xml"; + } + + @Override + public String getSolrConfigFile() { + return "solrconfig.xml"; + } public void testExtraction() throws Exception { @@ -32,9 +40,9 @@ public class ExtractingRequestHandlerTest extends AbstractSolrTestCase { "ext.def.fl", "extractedContent", "ext.map.Last-Modified", "extractedDate" ); - assertQ(req("title:solr-word"),"//*[@numFound='0']"); + assertQ(req("title:solr-word"), "//*[@numFound='0']"); assertU(commit()); - assertQ(req("title:solr-word"),"//*[@numFound='1']"); + assertQ(req("title:solr-word"), "//*[@numFound='1']"); loadLocal("simple.html", "ext.map.created", "extractedDate", "ext.map.producer", "extractedProducer", "ext.map.creator", "extractedCreator", "ext.map.Keywords", "extractedKeywords", @@ -43,9 +51,9 @@ public class ExtractingRequestHandlerTest extends AbstractSolrTestCase { "ext.def.fl", "extractedContent", "ext.map.Last-Modified", "extractedDate" ); - assertQ(req("title:Welcome"),"//*[@numFound='0']"); + assertQ(req("title:Welcome"), "//*[@numFound='0']"); assertU(commit()); - assertQ(req("title:Welcome"),"//*[@numFound='1']"); + assertQ(req("title:Welcome"), "//*[@numFound='1']"); loadLocal("version_control.xml", "ext.map.created", "extractedDate", "ext.map.producer", "extractedProducer", "ext.map.creator", "extractedCreator", "ext.map.Keywords", "extractedKeywords", @@ -53,13 +61,60 @@ public class ExtractingRequestHandlerTest extends AbstractSolrTestCase { "ext.def.fl", "extractedContent", "ext.map.Last-Modified", "extractedDate" ); - assertQ(req("stream_name:version_control.xml"),"//*[@numFound='0']"); + assertQ(req("stream_name:version_control.xml"), "//*[@numFound='0']"); assertU(commit()); - assertQ(req("stream_name:version_control.xml"),"//*[@numFound='1']"); + assertQ(req("stream_name:version_control.xml"), "//*[@numFound='1']"); + + } - + public void testLiterals() throws Exception { + ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract"); + assertTrue("handler is null and it shouldn't be", handler != null); + //test literal + loadLocal("version_control.xml", "ext.map.created", "extractedDate", "ext.map.producer", "extractedProducer", + "ext.map.creator", "extractedCreator", "ext.map.Keywords", "extractedKeywords", + "ext.map.Author", "extractedAuthor", + "ext.def.fl", "extractedContent", + "ext.literal.extractionLiteralMV", "one", + "ext.literal.extractionLiteralMV", "two", + "ext.map.Last-Modified", "extractedDate" + + ); + assertQ(req("stream_name:version_control.xml"), "//*[@numFound='0']"); + assertU(commit()); + assertQ(req("stream_name:version_control.xml"), "//*[@numFound='1']"); + + assertQ(req("extractionLiteralMV:one"), "//*[@numFound='1']"); + assertQ(req("extractionLiteralMV:two"), "//*[@numFound='1']"); + + try { + loadLocal("version_control.xml", "ext.map.created", "extractedDate", "ext.map.producer", "extractedProducer", + "ext.map.creator", "extractedCreator", "ext.map.Keywords", "extractedKeywords", + "ext.map.Author", "extractedAuthor", + "ext.def.fl", "extractedContent", + "ext.literal.extractionLiteral", "one", + "ext.literal.extractionLiteral", "two", + "ext.map.Last-Modified", "extractedDate" + ); + assertTrue("Exception should have been thrown", false); + } catch (SolrException e) { + //nothing to see here, move along + } + + loadLocal("version_control.xml", "ext.map.created", "extractedDate", "ext.map.producer", "extractedProducer", + "ext.map.creator", "extractedCreator", "ext.map.Keywords", "extractedKeywords", + "ext.map.Author", "extractedAuthor", + "ext.def.fl", "extractedContent", + "ext.literal.extractionLiteral", "one", + "ext.map.Last-Modified", "extractedDate" + ); + assertU(commit()); + assertQ(req("extractionLiteral:one"), "//*[@numFound='1']"); + + } + public void testPlainTextSpecifyingMimeType() throws Exception { ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract"); @@ -71,11 +126,11 @@ public class ExtractingRequestHandlerTest extends AbstractSolrTestCase { "ext.map.Author", "extractedAuthor", "ext.map.language", "extractedLanguage", "ext.def.fl", "extractedContent", - ExtractingParams.STREAM_TYPE, "text/plain" + ExtractingParams.STREAM_TYPE, "text/plain" ); - assertQ(req("extractedContent:Apache"),"//*[@numFound='0']"); + assertQ(req("extractedContent:Apache"), "//*[@numFound='0']"); assertU(commit()); - assertQ(req("extractedContent:Apache"),"//*[@numFound='1']"); + assertQ(req("extractedContent:Apache"), "//*[@numFound='1']"); } public void testPlainTextSpecifyingResourceName() throws Exception { @@ -88,11 +143,11 @@ public class ExtractingRequestHandlerTest extends AbstractSolrTestCase { "ext.map.Author", "extractedAuthor", "ext.map.language", "extractedLanguage", "ext.def.fl", "extractedContent", - ExtractingParams.RESOURCE_NAME, "version_control.txt" + ExtractingParams.RESOURCE_NAME, "version_control.txt" ); - assertQ(req("extractedContent:Apache"),"//*[@numFound='0']"); + assertQ(req("extractedContent:Apache"), "//*[@numFound='0']"); assertU(commit()); - assertQ(req("extractedContent:Apache"),"//*[@numFound='1']"); + assertQ(req("extractedContent:Apache"), "//*[@numFound='1']"); } // Note: If you load a plain text file specifying neither MIME type nor filename, extraction will silently fail. This is because Tika's @@ -128,7 +183,7 @@ public class ExtractingRequestHandlerTest extends AbstractSolrTestCase { SolrQueryResponse loadLocal(String filename, String... args) throws Exception { - LocalSolrQueryRequest req = (LocalSolrQueryRequest)req(args); + LocalSolrQueryRequest req = (LocalSolrQueryRequest) req(args); // TODO: stop using locally defined streams once stream.file and // stream.body work everywhere diff --git a/contrib/extraction/src/test/resources/solr/conf/schema.xml b/contrib/extraction/src/test/resources/solr/conf/schema.xml index 4129ebfe78c..1a2748c084c 100644 --- a/contrib/extraction/src/test/resources/solr/conf/schema.xml +++ b/contrib/extraction/src/test/resources/solr/conf/schema.xml @@ -402,6 +402,9 @@ + + +