diff --git a/build.xml b/build.xml
index 20ceb7e1c44..c33daf2d828 100644
--- a/build.xml
+++ b/build.xml
@@ -198,11 +198,12 @@
-
+
+
@@ -395,6 +396,7 @@
+
@@ -485,6 +487,9 @@
+
+
+
@@ -668,6 +675,7 @@
+
@@ -751,6 +759,16 @@
+
+
+
+
+
+
+
+
+
@@ -796,6 +814,8 @@
+
+
diff --git a/contrib/extraction/build.xml b/contrib/extraction/build.xml
index 2cdab0d6a3c..5aff8086504 100644
--- a/contrib/extraction/build.xml
+++ b/contrib/extraction/build.xml
@@ -110,7 +110,11 @@
-
+
+
+
+
+
diff --git a/contrib/extraction/lib/tika-0.2-SNAPSHOT.jar b/contrib/extraction/lib/tika-0.2-SNAPSHOT.jar
deleted file mode 100644
index b20b524458a..00000000000
--- a/contrib/extraction/lib/tika-0.2-SNAPSHOT.jar
+++ /dev/null
@@ -1,2 +0,0 @@
-AnyObjectId[16b9a3ed370d5a617d72f0b8935859bf0eac7678] was removed in git history.
-Apache SVN contains full history.
\ No newline at end of file
diff --git a/contrib/extraction/lib/tika-0.2.jar b/contrib/extraction/lib/tika-0.2.jar
new file mode 100644
index 00000000000..7a5227130db
--- /dev/null
+++ b/contrib/extraction/lib/tika-0.2.jar
@@ -0,0 +1,2 @@
+AnyObjectId[65882f20fd59a46c577fbdfd3ddb63f4d49cb71c] was removed in git history.
+Apache SVN contains full history.
\ No newline at end of file
diff --git a/contrib/extraction/solr-cell-pom.xml.template b/contrib/extraction/solr-cell-pom.xml.template
new file mode 100644
index 00000000000..c2d07e186ab
--- /dev/null
+++ b/contrib/extraction/solr-cell-pom.xml.template
@@ -0,0 +1,46 @@
+
+
+
+
+ 4.0.0
+
+
+ org.apache.solr
+ solr-parent
+ @maven_version@
+
+
+ org.apache.solr
+ solr-cell
+ Apache Solr Content Extraction Library
+ @maven_version@
+ Apache Solr Content Extraction Library integrates Apache Tika content extraction framework into Solr
+ jar
+
+
+
+ org.apache.tika
+ tika
+ 0.2
+
+
+
diff --git a/contrib/extraction/src/main/java/org/apache/solr/handler/ExtractingDocumentLoader.java b/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
similarity index 97%
rename from contrib/extraction/src/main/java/org/apache/solr/handler/ExtractingDocumentLoader.java
rename to contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
index f58a2aa39c4..506c706921f 100644
--- a/contrib/extraction/src/main/java/org/apache/solr/handler/ExtractingDocumentLoader.java
+++ b/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
@@ -1,4 +1,4 @@
-package org.apache.solr.handler;
+package org.apache.solr.handler.extraction;
import org.apache.commons.io.IOUtils;
import org.apache.solr.common.SolrException;
@@ -10,6 +10,7 @@ import org.apache.solr.request.SolrQueryResponse;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.update.AddUpdateCommand;
import org.apache.solr.update.processor.UpdateRequestProcessor;
+import org.apache.solr.handler.ContentStreamLoader;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
@@ -28,7 +29,7 @@ import java.io.StringWriter;
/**
- *
+ * The class responsible for loading extracted content into Solr.
*
**/
public class ExtractingDocumentLoader extends ContentStreamLoader {
diff --git a/contrib/extraction/src/main/java/org/apache/solr/handler/ExtractingMetadataConstants.java b/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingMetadataConstants.java
similarity index 68%
rename from contrib/extraction/src/main/java/org/apache/solr/handler/ExtractingMetadataConstants.java
rename to contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingMetadataConstants.java
index f845a71533b..6f780240c3c 100644
--- a/contrib/extraction/src/main/java/org/apache/solr/handler/ExtractingMetadataConstants.java
+++ b/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingMetadataConstants.java
@@ -1,8 +1,8 @@
-package org.apache.solr.handler;
+package org.apache.solr.handler.extraction;
/**
- *
+ * Constants used internally by the {@link ExtractingRequestHandler}.
*
**/
public interface ExtractingMetadataConstants {
diff --git a/contrib/extraction/src/main/java/org/apache/solr/handler/ExtractingParams.java b/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingParams.java
similarity index 94%
rename from contrib/extraction/src/main/java/org/apache/solr/handler/ExtractingParams.java
rename to contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingParams.java
index d88b12b4f81..bd259156801 100644
--- a/contrib/extraction/src/main/java/org/apache/solr/handler/ExtractingParams.java
+++ b/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingParams.java
@@ -1,8 +1,8 @@
-package org.apache.solr.handler;
+package org.apache.solr.handler.extraction;
/**
- * The various parameters to use when extracting content.
+ * The various Solr parameter names to use when extracting content.
*
**/
public interface ExtractingParams {
@@ -47,7 +47,7 @@ public interface ExtractingParams {
/**
* Restrict the extracted parts of a document to be indexed
* by passing in an XPath expression. All content that satisfies the XPath expr.
- * will be passed to the {@link org.apache.solr.handler.SolrContentHandler}.
+ * will be passed to the {@link SolrContentHandler}.
*
* See Tika's docs for what the extracted document looks like.
*
@@ -84,7 +84,7 @@ public interface ExtractingParams {
* Capture the specified fields (and everything included below it that isn't capture by some other capture field) separately from the default. This is different
* then the case of passing in an XPath expression.
*
- * The Capture field is based on the localName returned to the {@link org.apache.solr.handler.SolrContentHandler}
+ * The Capture field is based on the localName returned to the {@link SolrContentHandler}
* by Tika, not to be confused by the mapped field. The field name can then
* be mapped into the index schema.
*
diff --git a/contrib/extraction/src/main/java/org/apache/solr/handler/ExtractingRequestHandler.java b/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java
similarity index 96%
rename from contrib/extraction/src/main/java/org/apache/solr/handler/ExtractingRequestHandler.java
rename to contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java
index 2d00aebfa83..b5d6215a172 100644
--- a/contrib/extraction/src/main/java/org/apache/solr/handler/ExtractingRequestHandler.java
+++ b/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/ExtractingRequestHandler.java
@@ -1,4 +1,4 @@
-package org.apache.solr.handler;
+package org.apache.solr.handler.extraction;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -25,6 +25,8 @@ import org.apache.solr.core.SolrCore;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.update.processor.UpdateRequestProcessor;
import org.apache.solr.util.plugin.SolrCoreAware;
+import org.apache.solr.handler.ContentStreamHandlerBase;
+import org.apache.solr.handler.ContentStreamLoader;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.slf4j.Logger;
diff --git a/contrib/extraction/src/main/java/org/apache/solr/handler/SolrContentHandler.java b/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/SolrContentHandler.java
similarity index 91%
rename from contrib/extraction/src/main/java/org/apache/solr/handler/SolrContentHandler.java
rename to contrib/extraction/src/main/java/org/apache/solr/handler/extraction/SolrContentHandler.java
index b25ee14185a..acaa20fd175 100644
--- a/contrib/extraction/src/main/java/org/apache/solr/handler/SolrContentHandler.java
+++ b/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/SolrContentHandler.java
@@ -1,4 +1,4 @@
-package org.apache.solr.handler;
+package org.apache.solr.handler.extraction;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
@@ -31,7 +31,16 @@ import java.util.UUID;
/**
- * This class is not thread-safe. It is responsible for responding to Tika extraction events and producing a Solr document
+ * The class responsible for handling Tika events and translating them into {@link org.apache.solr.common.SolrInputDocument}s.
+ * This class is not thread-safe.
+ *
+ *
+ * Users may wish to override this class to provide their own functionality.
+ *
+ * @see org.apache.solr.handler.extraction.SolrContentHandlerFactory
+ * @see org.apache.solr.handler.extraction.ExtractingRequestHandler
+ * @see org.apache.solr.handler.extraction.ExtractingDocumentLoader
+ *
*/
public class SolrContentHandler extends DefaultHandler implements ExtractingParams {
private transient static Logger log = LoggerFactory.getLogger(SolrContentHandler.class);
@@ -72,15 +81,15 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
this.params = params;
this.schema = schema;
this.dateFormats = dateFormats;
- this.ignoreUndeclaredFields = params.getBool(ExtractingParams.IGNORE_UNDECLARED_FIELDS, false);
- this.indexAttribs = params.getBool(ExtractingParams.INDEX_ATTRIBUTES, false);
- this.defaultFieldName = params.get(ExtractingParams.DEFAULT_FIELDNAME);
- this.metadataPrefix = params.get(ExtractingParams.METADATA_PREFIX, "");
+ this.ignoreUndeclaredFields = params.getBool(IGNORE_UNDECLARED_FIELDS, false);
+ this.indexAttribs = params.getBool(INDEX_ATTRIBUTES, false);
+ this.defaultFieldName = params.get(DEFAULT_FIELDNAME);
+ this.metadataPrefix = params.get(METADATA_PREFIX, "");
//if there's no default field and we are intending to index, then throw an exception
- if (defaultFieldName == null && params.getBool(ExtractingParams.EXTRACT_ONLY, false) == false) {
+ if (defaultFieldName == null && params.getBool(EXTRACT_ONLY, false) == false) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "No default field name specified");
}
- String[] captureFields = params.getParams(ExtractingParams.CAPTURE_FIELDS);
+ String[] captureFields = params.getParams(CAPTURE_FIELDS);
if (captureFields != null && captureFields.length > 0) {
fieldBuilders = new HashMap();
for (int i = 0; i < captureFields.length; i++) {
@@ -186,7 +195,7 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
/**
* Generate an ID for the document. First try to get
- * {@link org.apache.solr.handler.ExtractingMetadataConstants#STREAM_NAME} from the
+ * {@link ExtractingMetadataConstants#STREAM_NAME} from the
* {@link org.apache.tika.metadata.Metadata}, then try {@link ExtractingMetadataConstants#STREAM_SOURCE_INFO}
* then try {@link org.apache.tika.metadata.Metadata#IDENTIFIER}.
* If those all are null, then generate a random UUID using {@link java.util.UUID#randomUUID()}.
@@ -331,7 +340,7 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
* @return The new name, if there is one, else name
*/
protected String findMappedName(String name) {
- return params.get(ExtractingParams.MAP_PREFIX + name, name);
+ return params.get(MAP_PREFIX + name, name);
}
/**
@@ -341,7 +350,7 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
* @return The new name, else name
*/
protected String findMappedMetadataName(String name) {
- return metadataPrefix + params.get(ExtractingParams.MAP_PREFIX + name, name);
+ return metadataPrefix + params.get(MAP_PREFIX + name, name);
}
diff --git a/contrib/extraction/src/main/java/org/apache/solr/handler/SolrContentHandlerFactory.java b/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/SolrContentHandlerFactory.java
similarity index 92%
rename from contrib/extraction/src/main/java/org/apache/solr/handler/SolrContentHandlerFactory.java
rename to contrib/extraction/src/main/java/org/apache/solr/handler/extraction/SolrContentHandlerFactory.java
index 68ef434f9c4..36b5ebdb815 100644
--- a/contrib/extraction/src/main/java/org/apache/solr/handler/SolrContentHandlerFactory.java
+++ b/contrib/extraction/src/main/java/org/apache/solr/handler/extraction/SolrContentHandlerFactory.java
@@ -1,4 +1,4 @@
-package org.apache.solr.handler;
+package org.apache.solr.handler.extraction;
import org.apache.tika.metadata.Metadata;
import org.apache.solr.common.params.SolrParams;
diff --git a/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java b/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java
index 8dea409ec62..81266a0e390 100644
--- a/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java
+++ b/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java
@@ -6,6 +6,8 @@ import org.apache.solr.request.SolrQueryResponse;
import org.apache.solr.common.util.ContentStream;
import org.apache.solr.common.util.ContentStreamBase;
import org.apache.solr.common.util.NamedList;
+import org.apache.solr.handler.extraction.ExtractingParams;
+import org.apache.solr.handler.extraction.ExtractingRequestHandler;
import java.util.List;
import java.util.ArrayList;
diff --git a/contrib/extraction/src/test/resources/solr/conf/solrconfig.xml b/contrib/extraction/src/test/resources/solr/conf/solrconfig.xml
index 7842824e80a..f7495d646c7 100644
--- a/contrib/extraction/src/test/resources/solr/conf/solrconfig.xml
+++ b/contrib/extraction/src/test/resources/solr/conf/solrconfig.xml
@@ -308,7 +308,7 @@
false
-
+
diff --git a/example/solr/conf/solrconfig.xml b/example/solr/conf/solrconfig.xml
index 095cb99b50b..297f828137b 100755
--- a/example/solr/conf/solrconfig.xml
+++ b/example/solr/conf/solrconfig.xml
@@ -627,6 +627,16 @@
+
+
+