mirror of https://github.com/apache/lucene.git
SOLR-284: Updated to Tika 0.2. Added in build, pom support
git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@726035 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
843ce957a9
commit
7ddfd04ae4
22
build.xml
22
build.xml
|
@ -198,11 +198,12 @@
|
|||
<packageset dir="${src}/java" />
|
||||
<packageset dir="${src}/webapp/src" />
|
||||
<packageset dir="contrib/dataimporthandler/src/main/java" />
|
||||
|
||||
<packageset dir="contrib/extraction/src/main/java" />
|
||||
<group title="Core" packages="org.apache.*" />
|
||||
<group title="Common" packages="org.apache.solr.common.*" />
|
||||
<group title="SolrJ" packages="org.apache.solr.client.solrj*" />
|
||||
<group title="contrib: DataImportHandler" packages="org.apache.solr.handler.dataimport*" />
|
||||
<group title="contrib: Solr Cell" packages="org.apache.solr.handler.extraction*" />
|
||||
</sources>
|
||||
</invoke-javadoc>
|
||||
</sequential>
|
||||
|
@ -395,6 +396,7 @@
|
|||
<fileset dir="src/java"/>
|
||||
<fileset dir="src/webapp/src"/>
|
||||
<fileset dir="contrib/dataimporthandler/src/main/java" />
|
||||
<fileset dir="contrib/extraction/src/main/java" />
|
||||
</clover-setup>
|
||||
</target>
|
||||
|
||||
|
@ -485,6 +487,9 @@
|
|||
|
||||
<solr-jar destfile="${dist}/apache-solr-dataimporthandler-src-${version}.jar"
|
||||
basedir="contrib/dataimporthandler/src" />
|
||||
|
||||
<solr-jar destfile="${dist}/apache-solr-cell-src-${version}.jar"
|
||||
basedir="contrib/extraction/src" />
|
||||
</target>
|
||||
|
||||
<target name="dist-javadoc" description="Creates the Solr javadoc distribution files"
|
||||
|
@ -497,6 +502,8 @@
|
|||
basedir="${build.javadoc}/solrj" />
|
||||
<solr-jar destfile="${dist}/apache-solr-dataimporthandler-docs-${version}.jar"
|
||||
basedir="${build.javadoc}/contrib-solr-dataimporthandler" />
|
||||
<solr-jar destfile="${dist}/apache-solr-cell-docs-${version}.jar"
|
||||
basedir="${build.javadoc}/contrib-solr-cell" />
|
||||
</target>
|
||||
|
||||
<!-- Creates the solr jar. -->
|
||||
|
@ -668,6 +675,7 @@
|
|||
<sign-maven-dependency-artifacts artifact.id="solr-commons-csv"/>
|
||||
<sign-maven-artifacts artifact.id="solr-core"/>
|
||||
<sign-maven-artifacts artifact.id="solr-dataimporthandler"/>
|
||||
<sign-maven-artifacts artifact.id="solr-cell"/>
|
||||
<sign-maven-dependency-artifacts artifact.id="solr-lucene-analyzers"/>
|
||||
<sign-maven-dependency-artifacts artifact.id="solr-lucene-core"/>
|
||||
<sign-maven-dependency-artifacts artifact.id="solr-lucene-highlighter"/>
|
||||
|
@ -751,6 +759,16 @@
|
|||
</artifact-attachments>
|
||||
</m2-deploy>
|
||||
|
||||
<m2-deploy pom.xml="contrib/extraction/solr-cell-pom.xml.template"
|
||||
jar.file="${dist}/apache-solr-cell-${version}.jar">
|
||||
|
||||
<artifact-attachments>
|
||||
<attach file="${dist}/apache-solr-cell-src-${version}.jar" classifier="sources"/>
|
||||
<attach file="${dist}/apache-solr-cell-docs-${version}.jar" classifier="javadoc"/>
|
||||
</artifact-attachments>
|
||||
</m2-deploy>
|
||||
|
||||
|
||||
<m2-deploy pom.xml="${src}/maven/solr-core-pom.xml.template"
|
||||
jar.file="${dist}/apache-solr-core-${version}.jar">
|
||||
|
||||
|
@ -796,6 +814,8 @@
|
|||
</fileset>
|
||||
<fileset dir="contrib/dataimporthandler/src/main/java"/>
|
||||
<fileset dir="contrib/dataimporthandler/src/test/java"/>
|
||||
<fileset dir="contrib/extraction/src/main/java"/>
|
||||
<fileset dir="contrib/extraction/src/test/java"/>
|
||||
</rat:report>
|
||||
</target>
|
||||
|
||||
|
|
|
@ -110,7 +110,11 @@
|
|||
</target>
|
||||
|
||||
<target name="dist" depends="build">
|
||||
|
||||
<mkdir dir="${solr-path}/dist/solr-cell-lib"/>
|
||||
<copy file="build/${fullnamever}.jar" todir="${solr-path}/dist"/>
|
||||
<copy todir="${solr-path}/dist/solr-cell-lib">
|
||||
<fileset dir="lib"/>
|
||||
</copy>
|
||||
</target>
|
||||
|
||||
<target name="example" depends="build">
|
||||
|
|
|
@ -1,2 +0,0 @@
|
|||
AnyObjectId[16b9a3ed370d5a617d72f0b8935859bf0eac7678] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -0,0 +1,2 @@
|
|||
AnyObjectId[65882f20fd59a46c577fbdfd3ddb63f4d49cb71c] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -0,0 +1,46 @@
|
|||
<project xmlns="http://maven.apache.org/POM/4.0.0"
|
||||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
|
||||
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one
|
||||
or more contributor license agreements. See the NOTICE file
|
||||
distributed with this work for additional information
|
||||
regarding copyright ownership. The ASF licenses this file
|
||||
to you under the Apache License, Version 2.0 (the
|
||||
"License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing,
|
||||
software distributed under the License is distributed on an
|
||||
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations
|
||||
under the License.
|
||||
-->
|
||||
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
<parent>
|
||||
<groupId>org.apache.solr</groupId>
|
||||
<artifactId>solr-parent</artifactId>
|
||||
<version>@maven_version@</version>
|
||||
</parent>
|
||||
|
||||
<groupId>org.apache.solr</groupId>
|
||||
<artifactId>solr-cell</artifactId>
|
||||
<name>Apache Solr Content Extraction Library</name>
|
||||
<version>@maven_version@</version>
|
||||
<description>Apache Solr Content Extraction Library integrates Apache Tika content extraction framework into Solr</description>
|
||||
<packaging>jar</packaging>
|
||||
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.apache.tika</groupId>
|
||||
<artifactId>tika</artifactId>
|
||||
<version>0.2</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
</project>
|
|
@ -1,4 +1,4 @@
|
|||
package org.apache.solr.handler;
|
||||
package org.apache.solr.handler.extraction;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.solr.common.SolrException;
|
||||
|
@ -10,6 +10,7 @@ import org.apache.solr.request.SolrQueryResponse;
|
|||
import org.apache.solr.schema.IndexSchema;
|
||||
import org.apache.solr.update.AddUpdateCommand;
|
||||
import org.apache.solr.update.processor.UpdateRequestProcessor;
|
||||
import org.apache.solr.handler.ContentStreamLoader;
|
||||
import org.apache.tika.config.TikaConfig;
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.tika.parser.AutoDetectParser;
|
||||
|
@ -28,7 +29,7 @@ import java.io.StringWriter;
|
|||
|
||||
|
||||
/**
|
||||
*
|
||||
* The class responsible for loading extracted content into Solr.
|
||||
*
|
||||
**/
|
||||
public class ExtractingDocumentLoader extends ContentStreamLoader {
|
|
@ -1,8 +1,8 @@
|
|||
package org.apache.solr.handler;
|
||||
package org.apache.solr.handler.extraction;
|
||||
|
||||
|
||||
/**
|
||||
*
|
||||
* Constants used internally by the {@link ExtractingRequestHandler}.
|
||||
*
|
||||
**/
|
||||
public interface ExtractingMetadataConstants {
|
|
@ -1,8 +1,8 @@
|
|||
package org.apache.solr.handler;
|
||||
package org.apache.solr.handler.extraction;
|
||||
|
||||
|
||||
/**
|
||||
* The various parameters to use when extracting content.
|
||||
* The various Solr parameter names to use when extracting content.
|
||||
*
|
||||
**/
|
||||
public interface ExtractingParams {
|
||||
|
@ -47,7 +47,7 @@ public interface ExtractingParams {
|
|||
/**
|
||||
* Restrict the extracted parts of a document to be indexed
|
||||
* by passing in an XPath expression. All content that satisfies the XPath expr.
|
||||
* will be passed to the {@link org.apache.solr.handler.SolrContentHandler}.
|
||||
* will be passed to the {@link SolrContentHandler}.
|
||||
* <p/>
|
||||
* See Tika's docs for what the extracted document looks like.
|
||||
* <p/>
|
||||
|
@ -84,7 +84,7 @@ public interface ExtractingParams {
|
|||
* Capture the specified fields (and everything included below them that isn't captured by some other capture field) separately from the default. This is different
|
||||
* than the case of passing in an XPath expression.
|
||||
* <p/>
|
||||
* The Capture field is based on the localName returned to the {@link org.apache.solr.handler.SolrContentHandler}
|
||||
* The Capture field is based on the localName returned to the {@link SolrContentHandler}
|
||||
* by Tika, not to be confused with the mapped field. The field name can then
|
||||
* be mapped into the index schema.
|
||||
* <p/>
|
|
@ -1,4 +1,4 @@
|
|||
package org.apache.solr.handler;
|
||||
package org.apache.solr.handler.extraction;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
|
@ -25,6 +25,8 @@ import org.apache.solr.core.SolrCore;
|
|||
import org.apache.solr.request.SolrQueryRequest;
|
||||
import org.apache.solr.update.processor.UpdateRequestProcessor;
|
||||
import org.apache.solr.util.plugin.SolrCoreAware;
|
||||
import org.apache.solr.handler.ContentStreamHandlerBase;
|
||||
import org.apache.solr.handler.ContentStreamLoader;
|
||||
import org.apache.tika.config.TikaConfig;
|
||||
import org.apache.tika.exception.TikaException;
|
||||
import org.slf4j.Logger;
|
|
@ -1,4 +1,4 @@
|
|||
package org.apache.solr.handler;
|
||||
package org.apache.solr.handler.extraction;
|
||||
|
||||
import org.apache.solr.common.SolrException;
|
||||
import org.apache.solr.common.SolrInputDocument;
|
||||
|
@ -31,7 +31,16 @@ import java.util.UUID;
|
|||
|
||||
|
||||
/**
|
||||
* This class is not thread-safe. It is responsible for responding to Tika extraction events and producing a Solr document
|
||||
* The class responsible for handling Tika events and translating them into {@link org.apache.solr.common.SolrInputDocument}s.
|
||||
* <B>This class is not thread-safe.</B>
|
||||
* <p/>
|
||||
*
|
||||
* Users may wish to override this class to provide their own functionality.
|
||||
*
|
||||
* @see org.apache.solr.handler.extraction.SolrContentHandlerFactory
|
||||
* @see org.apache.solr.handler.extraction.ExtractingRequestHandler
|
||||
* @see org.apache.solr.handler.extraction.ExtractingDocumentLoader
|
||||
*
|
||||
*/
|
||||
public class SolrContentHandler extends DefaultHandler implements ExtractingParams {
|
||||
private transient static Logger log = LoggerFactory.getLogger(SolrContentHandler.class);
|
||||
|
@ -72,15 +81,15 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
|
|||
this.params = params;
|
||||
this.schema = schema;
|
||||
this.dateFormats = dateFormats;
|
||||
this.ignoreUndeclaredFields = params.getBool(ExtractingParams.IGNORE_UNDECLARED_FIELDS, false);
|
||||
this.indexAttribs = params.getBool(ExtractingParams.INDEX_ATTRIBUTES, false);
|
||||
this.defaultFieldName = params.get(ExtractingParams.DEFAULT_FIELDNAME);
|
||||
this.metadataPrefix = params.get(ExtractingParams.METADATA_PREFIX, "");
|
||||
this.ignoreUndeclaredFields = params.getBool(IGNORE_UNDECLARED_FIELDS, false);
|
||||
this.indexAttribs = params.getBool(INDEX_ATTRIBUTES, false);
|
||||
this.defaultFieldName = params.get(DEFAULT_FIELDNAME);
|
||||
this.metadataPrefix = params.get(METADATA_PREFIX, "");
|
||||
//if there's no default field and we are intending to index, then throw an exception
|
||||
if (defaultFieldName == null && params.getBool(ExtractingParams.EXTRACT_ONLY, false) == false) {
|
||||
if (defaultFieldName == null && params.getBool(EXTRACT_ONLY, false) == false) {
|
||||
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "No default field name specified");
|
||||
}
|
||||
String[] captureFields = params.getParams(ExtractingParams.CAPTURE_FIELDS);
|
||||
String[] captureFields = params.getParams(CAPTURE_FIELDS);
|
||||
if (captureFields != null && captureFields.length > 0) {
|
||||
fieldBuilders = new HashMap<String, StringBuilder>();
|
||||
for (int i = 0; i < captureFields.length; i++) {
|
||||
|
@ -186,7 +195,7 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
|
|||
|
||||
/**
|
||||
* Generate an ID for the document. First try to get
|
||||
* {@link org.apache.solr.handler.ExtractingMetadataConstants#STREAM_NAME} from the
|
||||
* {@link ExtractingMetadataConstants#STREAM_NAME} from the
|
||||
* {@link org.apache.tika.metadata.Metadata}, then try {@link ExtractingMetadataConstants#STREAM_SOURCE_INFO}
|
||||
* then try {@link org.apache.tika.metadata.Metadata#IDENTIFIER}.
|
||||
* If those all are null, then generate a random UUID using {@link java.util.UUID#randomUUID()}.
|
||||
|
@ -331,7 +340,7 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
|
|||
* @return The new name, if there is one, else <code>name</code>
|
||||
*/
|
||||
protected String findMappedName(String name) {
|
||||
return params.get(ExtractingParams.MAP_PREFIX + name, name);
|
||||
return params.get(MAP_PREFIX + name, name);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -341,7 +350,7 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
|
|||
* @return The new name, else <code>name</code>
|
||||
*/
|
||||
protected String findMappedMetadataName(String name) {
|
||||
return metadataPrefix + params.get(ExtractingParams.MAP_PREFIX + name, name);
|
||||
return metadataPrefix + params.get(MAP_PREFIX + name, name);
|
||||
}
|
||||
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
package org.apache.solr.handler;
|
||||
package org.apache.solr.handler.extraction;
|
||||
|
||||
import org.apache.tika.metadata.Metadata;
|
||||
import org.apache.solr.common.params.SolrParams;
|
|
@ -6,6 +6,8 @@ import org.apache.solr.request.SolrQueryResponse;
|
|||
import org.apache.solr.common.util.ContentStream;
|
||||
import org.apache.solr.common.util.ContentStreamBase;
|
||||
import org.apache.solr.common.util.NamedList;
|
||||
import org.apache.solr.handler.extraction.ExtractingParams;
|
||||
import org.apache.solr.handler.extraction.ExtractingRequestHandler;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.ArrayList;
|
||||
|
|
|
@ -308,7 +308,7 @@
|
|||
<bool name="httpCaching">false</bool>
|
||||
</requestHandler>
|
||||
|
||||
<requestHandler name="/update/extract" class="org.apache.solr.handler.ExtractingRequestHandler"/>
|
||||
<requestHandler name="/update/extract" class="org.apache.solr.handler.extraction.ExtractingRequestHandler"/>
|
||||
|
||||
|
||||
<highlighting>
|
||||
|
|
|
@ -627,6 +627,16 @@
|
|||
</arr>
|
||||
</requestHandler>
|
||||
|
||||
<!--
|
||||
<requestHandler name="/update/extract" class="solr.ExtractingRequestHandler">
|
||||
<lst name="defaults">
|
||||
<str name="ext.map.Last-Modified">last_modified</str>
|
||||
<bool name="ext.ignore.und.fl">true</bool>
|
||||
</lst>
|
||||
</requestHandler>
|
||||
-->
|
||||
|
||||
|
||||
|
||||
<searchComponent name="termsComp" class="org.apache.solr.handler.component.TermsComponent"/>
|
||||
|
||||
|
|
Loading…
Reference in New Issue