SOLR-284: Updated to Tika 0.2. Added in build, pom support

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@726035 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Grant Ingersoll 2008-12-12 15:18:24 +00:00
parent 843ce957a9
commit 7ddfd04ae4
14 changed files with 120 additions and 26 deletions

View File

@ -198,11 +198,12 @@
<packageset dir="${src}/java" />
<packageset dir="${src}/webapp/src" />
<packageset dir="contrib/dataimporthandler/src/main/java" />
<packageset dir="contrib/extraction/src/main/java" />
<group title="Core" packages="org.apache.*" />
<group title="Common" packages="org.apache.solr.common.*" />
<group title="SolrJ" packages="org.apache.solr.client.solrj*" />
<group title="contrib: DataImportHandler" packages="org.apache.solr.handler.dataimport*" />
<group title="contrib: Solr Cell" packages="org.apache.solr.handler.extraction*" />
</sources>
</invoke-javadoc>
</sequential>
@ -395,6 +396,7 @@
<fileset dir="src/java"/>
<fileset dir="src/webapp/src"/>
<fileset dir="contrib/dataimporthandler/src/main/java" />
<fileset dir="contrib/extraction/src/main/java" />
</clover-setup>
</target>
@ -485,6 +487,9 @@
<solr-jar destfile="${dist}/apache-solr-dataimporthandler-src-${version}.jar"
basedir="contrib/dataimporthandler/src" />
<solr-jar destfile="${dist}/apache-solr-cell-src-${version}.jar"
basedir="contrib/extraction/src" />
</target>
<target name="dist-javadoc" description="Creates the Solr javadoc distribution files"
@ -497,6 +502,8 @@
basedir="${build.javadoc}/solrj" />
<solr-jar destfile="${dist}/apache-solr-dataimporthandler-docs-${version}.jar"
basedir="${build.javadoc}/contrib-solr-dataimporthandler" />
<solr-jar destfile="${dist}/apache-solr-cell-docs-${version}.jar"
basedir="${build.javadoc}/contrib-solr-cell" />
</target>
<!-- Creates the solr jar. -->
@ -668,6 +675,7 @@
<sign-maven-dependency-artifacts artifact.id="solr-commons-csv"/>
<sign-maven-artifacts artifact.id="solr-core"/>
<sign-maven-artifacts artifact.id="solr-dataimporthandler"/>
<sign-maven-artifacts artifact.id="solr-cell"/>
<sign-maven-dependency-artifacts artifact.id="solr-lucene-analyzers"/>
<sign-maven-dependency-artifacts artifact.id="solr-lucene-core"/>
<sign-maven-dependency-artifacts artifact.id="solr-lucene-highlighter"/>
@ -751,6 +759,16 @@
</artifact-attachments>
</m2-deploy>
<m2-deploy pom.xml="contrib/extraction/solr-cell-pom.xml.template"
jar.file="${dist}/apache-solr-cell-${version}.jar">
<artifact-attachments>
<attach file="${dist}/apache-solr-cell-src-${version}.jar" classifier="sources"/>
<attach file="${dist}/apache-solr-cell-docs-${version}.jar" classifier="javadoc"/>
</artifact-attachments>
</m2-deploy>
<m2-deploy pom.xml="${src}/maven/solr-core-pom.xml.template"
jar.file="${dist}/apache-solr-core-${version}.jar">
@ -796,6 +814,8 @@
</fileset>
<fileset dir="contrib/dataimporthandler/src/main/java"/>
<fileset dir="contrib/dataimporthandler/src/test/java"/>
<fileset dir="contrib/extraction/src/main/java"/>
<fileset dir="contrib/extraction/src/test/java"/>
</rat:report>
</target>

View File

@ -110,7 +110,11 @@
</target>
<target name="dist" depends="build">
<mkdir dir="${solr-path}/dist/solr-cell-lib"/>
<copy file="build/${fullnamever}.jar" todir="${solr-path}/dist"/>
<copy todir="${solr-path}/dist/solr-cell-lib">
<fileset dir="lib"/>
</copy>
</target>
<target name="example" depends="build">

View File

@ -1,2 +0,0 @@
AnyObjectId[16b9a3ed370d5a617d72f0b8935859bf0eac7678] was removed in git history.
Apache SVN contains full history.

View File

@ -0,0 +1,2 @@
AnyObjectId[65882f20fd59a46c577fbdfd3ddb63f4d49cb71c] was removed in git history.
Apache SVN contains full history.

View File

@ -0,0 +1,46 @@
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<!--
Maven POM template for the solr-cell (content extraction) artifact.
NOTE(review): @maven_version@ looks like a placeholder token that is
substituted when the template is processed at deploy time; confirm
against the m2-deploy macro in the build.
-->
<modelVersion>4.0.0</modelVersion>
<!-- Parent POM: org.apache.solr:solr-parent, at the same placeholder version as this artifact. -->
<parent>
<groupId>org.apache.solr</groupId>
<artifactId>solr-parent</artifactId>
<version>@maven_version@</version>
</parent>
<!-- Coordinates of this artifact: org.apache.solr:solr-cell, packaged as a jar. -->
<groupId>org.apache.solr</groupId>
<artifactId>solr-cell</artifactId>
<name>Apache Solr Content Extraction Library</name>
<version>@maven_version@</version>
<description>Apache Solr Content Extraction Library integrates Apache Tika content extraction framework into Solr</description>
<packaging>jar</packaging>
<!-- The only dependency declared here is Apache Tika, pinned to release 0.2. -->
<dependencies>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika</artifactId>
<version>0.2</version>
</dependency>
</dependencies>
</project>

View File

@ -1,4 +1,4 @@
package org.apache.solr.handler;
package org.apache.solr.handler.extraction;
import org.apache.commons.io.IOUtils;
import org.apache.solr.common.SolrException;
@ -10,6 +10,7 @@ import org.apache.solr.request.SolrQueryResponse;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.update.AddUpdateCommand;
import org.apache.solr.update.processor.UpdateRequestProcessor;
import org.apache.solr.handler.ContentStreamLoader;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
@ -28,7 +29,7 @@ import java.io.StringWriter;
/**
*
* The class responsible for loading extracted content into Solr.
*
**/
public class ExtractingDocumentLoader extends ContentStreamLoader {

View File

@ -1,8 +1,8 @@
package org.apache.solr.handler;
package org.apache.solr.handler.extraction;
/**
*
* Constants used internally by the {@link ExtractingRequestHandler}.
*
**/
public interface ExtractingMetadataConstants {

View File

@ -1,8 +1,8 @@
package org.apache.solr.handler;
package org.apache.solr.handler.extraction;
/**
* The various parameters to use when extracting content.
* The various Solr parameter names to use when extracting content.
*
**/
public interface ExtractingParams {
@ -47,7 +47,7 @@ public interface ExtractingParams {
/**
* Restrict the extracted parts of a document to be indexed
* by passing in an XPath expression. All content that satisfies the XPath expr.
* will be passed to the {@link org.apache.solr.handler.SolrContentHandler}.
* will be passed to the {@link SolrContentHandler}.
* <p/>
* See Tika's docs for what the extracted document looks like.
* <p/>
@ -84,7 +84,7 @@ public interface ExtractingParams {
* Capture the specified fields (and everything included below it that isn't captured by some other capture field) separately from the default. This is different
* than the case of passing in an XPath expression.
* <p/>
* The Capture field is based on the localName returned to the {@link org.apache.solr.handler.SolrContentHandler}
* The Capture field is based on the localName returned to the {@link SolrContentHandler}
* by Tika, not to be confused with the mapped field. The field name can then
* be mapped into the index schema.
* <p/>

View File

@ -1,4 +1,4 @@
package org.apache.solr.handler;
package org.apache.solr.handler.extraction;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
@ -25,6 +25,8 @@ import org.apache.solr.core.SolrCore;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.update.processor.UpdateRequestProcessor;
import org.apache.solr.util.plugin.SolrCoreAware;
import org.apache.solr.handler.ContentStreamHandlerBase;
import org.apache.solr.handler.ContentStreamLoader;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.slf4j.Logger;

View File

@ -1,4 +1,4 @@
package org.apache.solr.handler;
package org.apache.solr.handler.extraction;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
@ -31,7 +31,16 @@ import java.util.UUID;
/**
* This class is not thread-safe. It is responsible for responding to Tika extraction events and producing a Solr document
* The class responsible for handling Tika events and translating them into {@link org.apache.solr.common.SolrInputDocument}s.
* <B>This class is not thread-safe.</B>
* <p/>
*
* Users may wish to override this class to provide their own functionality.
*
* @see org.apache.solr.handler.extraction.SolrContentHandlerFactory
* @see org.apache.solr.handler.extraction.ExtractingRequestHandler
* @see org.apache.solr.handler.extraction.ExtractingDocumentLoader
*
*/
public class SolrContentHandler extends DefaultHandler implements ExtractingParams {
private transient static Logger log = LoggerFactory.getLogger(SolrContentHandler.class);
@ -72,15 +81,15 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
this.params = params;
this.schema = schema;
this.dateFormats = dateFormats;
this.ignoreUndeclaredFields = params.getBool(ExtractingParams.IGNORE_UNDECLARED_FIELDS, false);
this.indexAttribs = params.getBool(ExtractingParams.INDEX_ATTRIBUTES, false);
this.defaultFieldName = params.get(ExtractingParams.DEFAULT_FIELDNAME);
this.metadataPrefix = params.get(ExtractingParams.METADATA_PREFIX, "");
this.ignoreUndeclaredFields = params.getBool(IGNORE_UNDECLARED_FIELDS, false);
this.indexAttribs = params.getBool(INDEX_ATTRIBUTES, false);
this.defaultFieldName = params.get(DEFAULT_FIELDNAME);
this.metadataPrefix = params.get(METADATA_PREFIX, "");
//if there's no default field and we are intending to index, then throw an exception
if (defaultFieldName == null && params.getBool(ExtractingParams.EXTRACT_ONLY, false) == false) {
if (defaultFieldName == null && params.getBool(EXTRACT_ONLY, false) == false) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "No default field name specified");
}
String[] captureFields = params.getParams(ExtractingParams.CAPTURE_FIELDS);
String[] captureFields = params.getParams(CAPTURE_FIELDS);
if (captureFields != null && captureFields.length > 0) {
fieldBuilders = new HashMap<String, StringBuilder>();
for (int i = 0; i < captureFields.length; i++) {
@ -186,7 +195,7 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
/**
* Generate an ID for the document. First try to get
* {@link org.apache.solr.handler.ExtractingMetadataConstants#STREAM_NAME} from the
* {@link ExtractingMetadataConstants#STREAM_NAME} from the
* {@link org.apache.tika.metadata.Metadata}, then try {@link ExtractingMetadataConstants#STREAM_SOURCE_INFO}
* then try {@link org.apache.tika.metadata.Metadata#IDENTIFIER}.
* If those all are null, then generate a random UUID using {@link java.util.UUID#randomUUID()}.
@ -331,7 +340,7 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
* @return The new name, if there is one, else <code>name</code>
*/
protected String findMappedName(String name) {
return params.get(ExtractingParams.MAP_PREFIX + name, name);
return params.get(MAP_PREFIX + name, name);
}
/**
@ -341,7 +350,7 @@ public class SolrContentHandler extends DefaultHandler implements ExtractingPara
* @return The new name, else <code>name</code>
*/
protected String findMappedMetadataName(String name) {
return metadataPrefix + params.get(ExtractingParams.MAP_PREFIX + name, name);
return metadataPrefix + params.get(MAP_PREFIX + name, name);
}

View File

@ -1,4 +1,4 @@
package org.apache.solr.handler;
package org.apache.solr.handler.extraction;
import org.apache.tika.metadata.Metadata;
import org.apache.solr.common.params.SolrParams;

View File

@ -6,6 +6,8 @@ import org.apache.solr.request.SolrQueryResponse;
import org.apache.solr.common.util.ContentStream;
import org.apache.solr.common.util.ContentStreamBase;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.handler.extraction.ExtractingParams;
import org.apache.solr.handler.extraction.ExtractingRequestHandler;
import java.util.List;
import java.util.ArrayList;

View File

@ -308,7 +308,7 @@
<bool name="httpCaching">false</bool>
</requestHandler>
<requestHandler name="/update/extract" class="org.apache.solr.handler.ExtractingRequestHandler"/>
<requestHandler name="/update/extract" class="org.apache.solr.handler.extraction.ExtractingRequestHandler"/>
<highlighting>

View File

@ -627,6 +627,16 @@
</arr>
</requestHandler>
<!--
<requestHandler name="/update/extract" class="solr.ExtractingRequestHandler">
<lst name="defaults">
<str name="ext.map.Last-Modified">last_modified</str>
<bool name="ext.ignore.und.fl">true</bool>
</lst>
</requestHandler>
-->
<searchComponent name="termsComp" class="org.apache.solr.handler.component.TermsComponent"/>