diff --git a/build.xml b/build.xml index 20ceb7e1c44..c33daf2d828 100644 --- a/build.xml +++ b/build.xml @@ -198,11 +198,12 @@ - + + @@ -395,6 +396,7 @@ + @@ -485,6 +487,9 @@ + + + @@ -668,6 +675,7 @@ + @@ -751,6 +759,16 @@ + + + + + + + + + @@ -796,6 +814,8 @@ + + diff --git a/contrib/extraction/build.xml b/contrib/extraction/build.xml index 2cdab0d6a3c..5aff8086504 100644 --- a/contrib/extraction/build.xml +++ b/contrib/extraction/build.xml @@ -110,7 +110,11 @@ - + + + + + diff --git a/contrib/extraction/lib/tika-0.2-SNAPSHOT.jar b/contrib/extraction/lib/tika-0.2-SNAPSHOT.jar deleted file mode 100644 index b20b524458a..00000000000 --- a/contrib/extraction/lib/tika-0.2-SNAPSHOT.jar +++ /dev/null @@ -1,2 +0,0 @@ -AnyObjectId[16b9a3ed370d5a617d72f0b8935859bf0eac7678] was removed in git history. -Apache SVN contains full history. \ No newline at end of file diff --git a/contrib/extraction/lib/tika-0.2.jar b/contrib/extraction/lib/tika-0.2.jar new file mode 100644 index 00000000000..7a5227130db --- /dev/null +++ b/contrib/extraction/lib/tika-0.2.jar @@ -0,0 +1,2 @@ +AnyObjectId[65882f20fd59a46c577fbdfd3ddb63f4d49cb71c] was removed in git history. +Apache SVN contains full history. 
\ No newline at end of file diff --git a/contrib/extraction/solr-cell-pom.xml.template b/contrib/extraction/solr-cell-pom.xml.template new file mode 100644 index 00000000000..c2d07e186ab --- /dev/null +++ b/contrib/extraction/solr-cell-pom.xml.template @@ -0,0 +1,46 @@ + + + + + 4.0.0 + + + org.apache.solr + solr-parent + @maven_version@ + + + org.apache.solr + solr-cell + Apache Solr Content Extraction Library + @maven_version@ + Apache Solr Content Extraction Library integrates Apache Tika content extraction framework into Solr + jar + + + + org.apache.tika + tika + 0.2 + + + diff --git a/contrib/extraction/src/main/java/org/apache/solr/handler/ExtractingDocumentLoader.java b/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java similarity index 97% rename from contrib/extraction/src/main/java/org/apache/solr/handler/ExtractingDocumentLoader.java rename to contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java index f58a2aa39c4..506c706921f 100644 --- a/contrib/extraction/src/main/java/org/apache/solr/handler/ExtractingDocumentLoader.java +++ b/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java @@ -1,4 +1,4 @@ -package org.apache.solr.handler; +package org.apache.solr.handler.extraction; import org.apache.commons.io.IOUtils; import org.apache.solr.common.SolrException; @@ -10,6 +10,7 @@ import org.apache.solr.request.SolrQueryResponse; import org.apache.solr.schema.IndexSchema; import org.apache.solr.update.AddUpdateCommand; import org.apache.solr.update.processor.UpdateRequestProcessor; +import org.apache.solr.handler.ContentStreamLoader; import org.apache.tika.config.TikaConfig; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.AutoDetectParser; @@ -28,7 +29,7 @@ import java.io.StringWriter; /** - * + * The class responsible for loading extracted content into Solr. 
* **/ public class ExtractingDocumentLoader extends ContentStreamLoader { diff --git a/contrib/extraction/src/main/java/org/apache/solr/handler/ExtractingMetadataConstants.java b/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingMetadataConstants.java similarity index 68% rename from contrib/extraction/src/main/java/org/apache/solr/handler/ExtractingMetadataConstants.java rename to contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingMetadataConstants.java index f845a71533b..6f780240c3c 100644 --- a/contrib/extraction/src/main/java/org/apache/solr/handler/ExtractingMetadataConstants.java +++ b/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingMetadataConstants.java @@ -1,8 +1,8 @@ -package org.apache.solr.handler; +package org.apache.solr.handler.extraction; /** - * + * Constants used internally by the {@link ExtractingRequestHandler}. * **/ public interface ExtractingMetadataConstants { diff --git a/contrib/extraction/src/main/java/org/apache/solr/handler/ExtractingParams.java b/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingParams.java similarity index 94% rename from contrib/extraction/src/main/java/org/apache/solr/handler/ExtractingParams.java rename to contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingParams.java index d88b12b4f81..bd259156801 100644 --- a/contrib/extraction/src/main/java/org/apache/solr/handler/ExtractingParams.java +++ b/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingParams.java @@ -1,8 +1,8 @@ -package org.apache.solr.handler; +package org.apache.solr.handler.extraction; /** - * The various parameters to use when extracting content. + * The various Solr parameter names to use when extracting content. 
* **/ public interface ExtractingParams { @@ -47,7 +47,7 @@ public interface ExtractingParams { /** * Restrict the extracted parts of a document to be indexed * by passing in an XPath expression. All content that satisfies the XPath expr. - * will be passed to the {@link org.apache.solr.handler.SolrContentHandler}. + * will be passed to the {@link SolrContentHandler}. *

* See Tika's docs for what the extracted document looks like. *

@@ -84,7 +84,7 @@ public interface ExtractingParams { * Capture the specified fields (and everything included below it that isn't capture by some other capture field) separately from the default. This is different * then the case of passing in an XPath expression. *

- * The Capture field is based on the localName returned to the {@link org.apache.solr.handler.SolrContentHandler} + * The Capture field is based on the localName returned to the {@link SolrContentHandler} * by Tika, not to be confused by the mapped field. The field name can then * be mapped into the index schema. *

diff --git a/contrib/extraction/src/main/java/org/apache/solr/handler/ExtractingRequestHandler.java b/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java similarity index 96% rename from contrib/extraction/src/main/java/org/apache/solr/handler/ExtractingRequestHandler.java rename to contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java index 2d00aebfa83..b5d6215a172 100644 --- a/contrib/extraction/src/main/java/org/apache/solr/handler/ExtractingRequestHandler.java +++ b/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java @@ -1,4 +1,4 @@ -package org.apache.solr.handler; +package org.apache.solr.handler.extraction; /** * Licensed to the Apache Software Foundation (ASF) under one or more @@ -25,6 +25,8 @@ import org.apache.solr.core.SolrCore; import org.apache.solr.request.SolrQueryRequest; import org.apache.solr.update.processor.UpdateRequestProcessor; import org.apache.solr.util.plugin.SolrCoreAware; +import org.apache.solr.handler.ContentStreamHandlerBase; +import org.apache.solr.handler.ContentStreamLoader; import org.apache.tika.config.TikaConfig; import org.apache.tika.exception.TikaException; import org.slf4j.Logger; diff --git a/contrib/extraction/src/main/java/org/apache/solr/handler/SolrContentHandler.java b/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/SolrContentHandler.java similarity index 91% rename from contrib/extraction/src/main/java/org/apache/solr/handler/SolrContentHandler.java rename to contrib/extraction/src/main/java/org/apache/solr/handler/extraction/SolrContentHandler.java index b25ee14185a..acaa20fd175 100644 --- a/contrib/extraction/src/main/java/org/apache/solr/handler/SolrContentHandler.java +++ b/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/SolrContentHandler.java @@ -1,4 +1,4 @@ -package org.apache.solr.handler; +package org.apache.solr.handler.extraction; 
import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrInputDocument; @@ -31,7 +31,16 @@ import java.util.UUID; /** - * This class is not thread-safe. It is responsible for responding to Tika extraction events and producing a Solr document + * The class responsible for handling Tika events and translating them into {@link org.apache.solr.common.SolrInputDocument}s. + * This class is not thread-safe. + *

+ * + * Users may wish to override this class to provide their own functionality. + * + * @see org.apache.solr.handler.extraction.SolrContentHandlerFactory + * @see org.apache.solr.handler.extraction.ExtractingRequestHandler + * @see org.apache.solr.handler.extraction.ExtractingDocumentLoader + * */ public class SolrContentHandler extends DefaultHandler implements ExtractingParams { private transient static Logger log = LoggerFactory.getLogger(SolrContentHandler.class); @@ -72,15 +81,15 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara this.params = params; this.schema = schema; this.dateFormats = dateFormats; - this.ignoreUndeclaredFields = params.getBool(ExtractingParams.IGNORE_UNDECLARED_FIELDS, false); - this.indexAttribs = params.getBool(ExtractingParams.INDEX_ATTRIBUTES, false); - this.defaultFieldName = params.get(ExtractingParams.DEFAULT_FIELDNAME); - this.metadataPrefix = params.get(ExtractingParams.METADATA_PREFIX, ""); + this.ignoreUndeclaredFields = params.getBool(IGNORE_UNDECLARED_FIELDS, false); + this.indexAttribs = params.getBool(INDEX_ATTRIBUTES, false); + this.defaultFieldName = params.get(DEFAULT_FIELDNAME); + this.metadataPrefix = params.get(METADATA_PREFIX, ""); //if there's no default field and we are intending to index, then throw an exception - if (defaultFieldName == null && params.getBool(ExtractingParams.EXTRACT_ONLY, false) == false) { + if (defaultFieldName == null && params.getBool(EXTRACT_ONLY, false) == false) { throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "No default field name specified"); } - String[] captureFields = params.getParams(ExtractingParams.CAPTURE_FIELDS); + String[] captureFields = params.getParams(CAPTURE_FIELDS); if (captureFields != null && captureFields.length > 0) { fieldBuilders = new HashMap(); for (int i = 0; i < captureFields.length; i++) { @@ -186,7 +195,7 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara /** * Generate an 
ID for the document. First try to get - * {@link org.apache.solr.handler.ExtractingMetadataConstants#STREAM_NAME} from the + * {@link ExtractingMetadataConstants#STREAM_NAME} from the * {@link org.apache.tika.metadata.Metadata}, then try {@link ExtractingMetadataConstants#STREAM_SOURCE_INFO} * then try {@link org.apache.tika.metadata.Metadata#IDENTIFIER}. * If those all are null, then generate a random UUID using {@link java.util.UUID#randomUUID()}. @@ -331,7 +340,7 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara * @return The new name, if there is one, else name */ protected String findMappedName(String name) { - return params.get(ExtractingParams.MAP_PREFIX + name, name); + return params.get(MAP_PREFIX + name, name); } /** @@ -341,7 +350,7 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara * @return The new name, else name */ protected String findMappedMetadataName(String name) { - return metadataPrefix + params.get(ExtractingParams.MAP_PREFIX + name, name); + return metadataPrefix + params.get(MAP_PREFIX + name, name); } diff --git a/contrib/extraction/src/main/java/org/apache/solr/handler/SolrContentHandlerFactory.java b/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/SolrContentHandlerFactory.java similarity index 92% rename from contrib/extraction/src/main/java/org/apache/solr/handler/SolrContentHandlerFactory.java rename to contrib/extraction/src/main/java/org/apache/solr/handler/extraction/SolrContentHandlerFactory.java index 68ef434f9c4..36b5ebdb815 100644 --- a/contrib/extraction/src/main/java/org/apache/solr/handler/SolrContentHandlerFactory.java +++ b/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/SolrContentHandlerFactory.java @@ -1,4 +1,4 @@ -package org.apache.solr.handler; +package org.apache.solr.handler.extraction; import org.apache.tika.metadata.Metadata; import org.apache.solr.common.params.SolrParams; diff --git 
a/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java b/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java index 8dea409ec62..81266a0e390 100644 --- a/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java +++ b/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java @@ -6,6 +6,8 @@ import org.apache.solr.request.SolrQueryResponse; import org.apache.solr.common.util.ContentStream; import org.apache.solr.common.util.ContentStreamBase; import org.apache.solr.common.util.NamedList; +import org.apache.solr.handler.extraction.ExtractingParams; +import org.apache.solr.handler.extraction.ExtractingRequestHandler; import java.util.List; import java.util.ArrayList; diff --git a/contrib/extraction/src/test/resources/solr/conf/solrconfig.xml b/contrib/extraction/src/test/resources/solr/conf/solrconfig.xml index 7842824e80a..f7495d646c7 100644 --- a/contrib/extraction/src/test/resources/solr/conf/solrconfig.xml +++ b/contrib/extraction/src/test/resources/solr/conf/solrconfig.xml @@ -308,7 +308,7 @@ false - + diff --git a/example/solr/conf/solrconfig.xml b/example/solr/conf/solrconfig.xml index 095cb99b50b..297f828137b 100755 --- a/example/solr/conf/solrconfig.xml +++ b/example/solr/conf/solrconfig.xml @@ -627,6 +627,16 @@ + + +