SOLR-2129: Provide a Solr module for dynamic metadata extraction/indexing with Apache UIMA

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1062604 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2011-01-24 01:58:00 +00:00
parent 68dc071064
commit 6c05d94c93
43 changed files with 4751 additions and 1 deletions

View File

@ -73,6 +73,10 @@
<classpathentry kind="src" path="solr/contrib/extraction/src/main/java"/> <classpathentry kind="src" path="solr/contrib/extraction/src/main/java"/>
<classpathentry kind="src" path="solr/contrib/extraction/src/test/java"/> <classpathentry kind="src" path="solr/contrib/extraction/src/test/java"/>
<classpathentry kind="src" path="solr/contrib/extraction/src/test/resources"/> <classpathentry kind="src" path="solr/contrib/extraction/src/test/resources"/>
<classpathentry kind="src" path="solr/contrib/uima/src/main/java"/>
<classpathentry kind="src" path="solr/contrib/uima/src/main/resources"/>
<classpathentry kind="src" path="solr/contrib/uima/src/test/java"/>
<classpathentry kind="src" path="solr/contrib/uima/src/test/resources"/>
<classpathentry kind="lib" path="lucene/lib/ant-1.7.1.jar"/> <classpathentry kind="lib" path="lucene/lib/ant-1.7.1.jar"/>
<classpathentry kind="lib" path="lucene/lib/ant-junit-1.7.1.jar"/> <classpathentry kind="lib" path="lucene/lib/ant-junit-1.7.1.jar"/>
<classpathentry kind="lib" path="lucene/lib/junit-4.7.jar"/> <classpathentry kind="lib" path="lucene/lib/junit-4.7.jar"/>
@ -151,6 +155,12 @@
<classpathentry kind="lib" path="solr/contrib/extraction/lib/tika-core-0.8.jar"/> <classpathentry kind="lib" path="solr/contrib/extraction/lib/tika-core-0.8.jar"/>
<classpathentry kind="lib" path="solr/contrib/extraction/lib/tika-parsers-0.8.jar"/> <classpathentry kind="lib" path="solr/contrib/extraction/lib/tika-parsers-0.8.jar"/>
<classpathentry kind="lib" path="solr/contrib/extraction/lib/xmlbeans-2.3.0.jar"/> <classpathentry kind="lib" path="solr/contrib/extraction/lib/xmlbeans-2.3.0.jar"/>
<classpathentry kind="lib" path="solr/contrib/uima/lib/commons-digester-2.0.jar"/>
<classpathentry kind="lib" path="solr/contrib/uima/lib/uima-an-alchemy.jar"/>
<classpathentry kind="lib" path="solr/contrib/uima/lib/uima-an-calais.jar"/>
<classpathentry kind="lib" path="solr/contrib/uima/lib/uima-an-tagger.jar"/>
<classpathentry kind="lib" path="solr/contrib/uima/lib/uima-an-wst.jar"/>
<classpathentry kind="lib" path="solr/contrib/uima/lib/uima-core.jar"/>
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/> <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
<classpathentry kind="output" path="bin"/> <classpathentry kind="output" path="bin"/>
</classpath> </classpath>

View File

@ -411,6 +411,9 @@ New Features
* SOLR-2188: Added "maxTokenLength" argument to the factories for ClassicTokenizer, * SOLR-2188: Added "maxTokenLength" argument to the factories for ClassicTokenizer,
StandardTokenizer, and UAX29URLEmailTokenizer. (Steven Rowe) StandardTokenizer, and UAX29URLEmailTokenizer. (Steven Rowe)
* SOLR-2129: Added a Solr module for dynamic metadata extraction/indexing with Apache UIMA.
See contrib/uima/README.txt for more information. (Tommaso Teofili via rmuir)
Optimizations Optimizations
---------------------- ----------------------

View File

@ -218,6 +218,7 @@
<packageset dir="contrib/dataimporthandler/src/main/java" /> <packageset dir="contrib/dataimporthandler/src/main/java" />
<packageset dir="contrib/clustering/src/main/java" /> <packageset dir="contrib/clustering/src/main/java" />
<packageset dir="contrib/extraction/src/main/java" /> <packageset dir="contrib/extraction/src/main/java" />
<packageset dir="contrib/uima/src/main/java" />
<packageset dir="contrib/analysis-extras/src/java" /> <packageset dir="contrib/analysis-extras/src/java" />
<group title="Core" packages="org.apache.*" /> <group title="Core" packages="org.apache.*" />
<group title="Common" packages="org.apache.solr.common.*" /> <group title="Common" packages="org.apache.solr.common.*" />
@ -225,6 +226,7 @@
<group title="contrib: DataImportHandler" packages="org.apache.solr.handler.dataimport*" /> <group title="contrib: DataImportHandler" packages="org.apache.solr.handler.dataimport*" />
<group title="contrib: Clustering" packages="org.apache.solr.handler.clustering*" /> <group title="contrib: Clustering" packages="org.apache.solr.handler.clustering*" />
<group title="contrib: Solr Cell" packages="org.apache.solr.handler.extraction*" /> <group title="contrib: Solr Cell" packages="org.apache.solr.handler.extraction*" />
<group title="contrib: Solr UIMA" packages="org.apache.solr.uima*" />
</sources> </sources>
</invoke-javadoc> </invoke-javadoc>
</sequential> </sequential>
@ -514,6 +516,7 @@
<fileset dir="contrib/dataimporthandler/src/main/java" /> <fileset dir="contrib/dataimporthandler/src/main/java" />
<fileset dir="contrib/clustering/src/main/java" /> <fileset dir="contrib/clustering/src/main/java" />
<fileset dir="contrib/extraction/src/main/java" /> <fileset dir="contrib/extraction/src/main/java" />
<fileset dir="contrib/uima/src/main/java" />
<fileset dir="contrib/analysis-extras/src/java" /> <fileset dir="contrib/analysis-extras/src/java" />
</clover-setup> </clover-setup>
</target> </target>
@ -617,6 +620,10 @@
basedir="contrib/clustering/src" /> basedir="contrib/clustering/src" />
<solr-jar destfile="${dist}/apache-solr-analysis-extras-src-${version}.jar" <solr-jar destfile="${dist}/apache-solr-analysis-extras-src-${version}.jar"
basedir="contrib/analysis-extras/src" /> basedir="contrib/analysis-extras/src" />
<solr-jar destfile="${dist}/apache-solr-uima-src-${version}.jar"
basedir="contrib/uima/src/main/java" >
<fileset dir="contrib/uima/src/main/resources" />
</solr-jar>
</target> </target>
<target name="dist-javadoc" description="Creates the Solr javadoc distribution files" <target name="dist-javadoc" description="Creates the Solr javadoc distribution files"
@ -635,6 +642,8 @@
basedir="${build.javadoc}/contrib-solr-cell" /> basedir="${build.javadoc}/contrib-solr-cell" />
<solr-jar destfile="${dist}/apache-solr-analysis-extras-docs-${version}.jar" <solr-jar destfile="${dist}/apache-solr-analysis-extras-docs-${version}.jar"
basedir="${build.javadoc}/contrib-solr-analysis-extras" /> basedir="${build.javadoc}/contrib-solr-analysis-extras" />
<solr-jar destfile="${dist}/apache-solr-uima-docs-${version}.jar"
basedir="${build.javadoc}/contrib-solr-uima" />
</target> </target>
<!-- Creates the solr jar. --> <!-- Creates the solr jar. -->
@ -731,7 +740,7 @@
<tarfileset dir="." <tarfileset dir="."
prefix="${fullnamever}" prefix="${fullnamever}"
includes="LICENSE.txt NOTICE.txt *.txt *.xml lucene-libs/** lib/** src/** example/** client/** contrib/" includes="LICENSE.txt NOTICE.txt *.txt *.xml lucene-libs/** lib/** src/** example/** client/** contrib/"
excludes="lib/README.committers.txt **/data/ **/logs/* **/classes/ **/*.sh **/bin/ src/scripts/ src/site/build/ **/target/ client/ruby/flare/ client/python contrib/**/build/ **/*.iml **/*.ipr **/*.iws contrib/clustering/example/lib/** contrib/clustering/lib/downloads/** contrib/analysis-extras/lib/**" /> excludes="lib/README.committers.txt **/data/ **/logs/* **/classes/ **/*.sh **/bin/ src/scripts/ src/site/build/ **/target/ client/ruby/flare/ client/python contrib/**/build/ **/*.iml **/*.ipr **/*.iws contrib/clustering/example/lib/** contrib/clustering/lib/downloads/** contrib/analysis-extras/lib/** contrib/uima/lib/**" />
<tarfileset dir="." <tarfileset dir="."
prefix="${fullnamever}" prefix="${fullnamever}"
includes="src/test-files/solr/lib/classes/empty-file-main-lib.txt" /> includes="src/test-files/solr/lib/classes/empty-file-main-lib.txt" />
@ -903,6 +912,14 @@
</artifact-attachments> </artifact-attachments>
</m2-deploy> </m2-deploy>
<!-- Deploys the Solr UIMA contrib jar to the Maven repository, attaching the
     sources and javadoc jars built by the dist-src and dist-javadoc targets.
     The POM template is read from the UIMA contrib directory (contrib/uima),
     not from the Solr Cell (contrib/extraction) directory. -->
<m2-deploy pom.xml="contrib/uima/solr-uima-pom.xml.template"
jar.file="${dist}/apache-solr-uima-${version}.jar">
<artifact-attachments>
<attach file="${dist}/apache-solr-uima-src-${version}.jar" classifier="sources"/>
<attach file="${dist}/apache-solr-uima-docs-${version}.jar" classifier="javadoc"/>
</artifact-attachments>
</m2-deploy>
<m2-deploy pom.xml="src/pom.xml" <m2-deploy pom.xml="src/pom.xml"
jar.file="${dist}/apache-solr-core-${version}.jar"> jar.file="${dist}/apache-solr-core-${version}.jar">
<artifact-attachments> <artifact-attachments>
@ -952,6 +969,8 @@
<fileset dir="contrib/extraction/src/test/java"/> <fileset dir="contrib/extraction/src/test/java"/>
<fileset dir="contrib/analysis-extras/src/test"/> <fileset dir="contrib/analysis-extras/src/test"/>
<fileset dir="contrib/analysis-extras/src/test"/> <fileset dir="contrib/analysis-extras/src/test"/>
<fileset dir="contrib/uima/src/main/java"/>
<fileset dir="contrib/uima/src/test/java"/>
</rat:report> </rat:report>
</target> </target>

View File

@ -0,0 +1,17 @@
Apache Solr UIMA Metadata Extraction Library
Release Notes
This file describes changes to the Solr UIMA (contrib/uima) module. See SOLR-2129 for details.
Introduction
------------
This module is intended to be used while indexing documents.
Its purpose is to add automatically generated fields to the Solr index on the fly.
Such fields could be language, concepts, keywords, sentences, named entities, etc.
UIMA Dependency
---------------
uima-core, OpenCalaisAnnotator, WhitespaceTokenizer, HMMTagger, AlchemyAPIAnnotator
Current Version: 2.3.1-SNAPSHOT rev. 999276
$Id$

View File

@ -0,0 +1,60 @@
Getting Started
---------------
To start using Solr UIMA Metadata Extraction Library you should go through the following configuration steps:
1. copy generated solr-uima jar and its libs (under contrib/uima/lib) inside a Solr libraries directory.
2. modify your schema.xml, adding the fields you want to hold the metadata and specifying proper values for the type, indexed, stored and multiValued options:
3. for example you could specify the following
<field name="language" type="string" indexed="true" stored="true" required="false"/>
<field name="concept" type="string" indexed="true" stored="true" multiValued="true" required="false"/>
<field name="sentence" type="text" indexed="true" stored="true" multiValued="true" required="false" />
4. modify your solrconfig.xml adding the following snippet:
<uimaConfig>
<runtimeParameters>
<keyword_apikey>VALID_ALCHEMYAPI_KEY</keyword_apikey>
<concept_apikey>VALID_ALCHEMYAPI_KEY</concept_apikey>
<lang_apikey>VALID_ALCHEMYAPI_KEY</lang_apikey>
<cat_apikey>VALID_ALCHEMYAPI_KEY</cat_apikey>
<entities_apikey>VALID_ALCHEMYAPI_KEY</entities_apikey>
<oc_licenseID>VALID_OPENCALAIS_KEY</oc_licenseID>
</runtimeParameters>
<analysisEngine>/org/apache/uima/desc/OverridingParamsExtServicesAE.xml</analysisEngine>
<analyzeFields merge="false">text</analyzeFields>
<fieldMapping>
<type name="org.apache.uima.alchemy.ts.concept.ConceptFS">
<map feature="text" field="concept"/>
</type>
<type name="org.apache.uima.alchemy.ts.language.LanguageFS">
<map feature="language" field="language"/>
</type>
<type name="org.apache.uima.SentenceAnnotation">
<map feature="coveredText" field="sentence"/>
</type>
</fieldMapping>
</uimaConfig>
5. the analysisEngine tag must contain an AE descriptor inside the specified path in the classpath
6. the analyzeFields tag must contain the input fields that need to be analyzed by UIMA,
if merge=true then their content will be merged and analyzed only once
7. field mapping describes which features of which types should go in a field
8. define in your solrconfig.xml an UpdateRequestProcessorChain as follows:
<updateRequestProcessorChain name="uima">
<processor class="org.apache.solr.uima.processor.UIMAProcessorFactory"/>
<processor class="solr.LogUpdateProcessorFactory" />
<processor class="solr.RunUpdateProcessorFactory" />
</updateRequestProcessorChain>
9. in your solrconfig.xml replace the existing default (<requestHandler name="/update"...) or create a new UpdateRequestHandler with the following:
<requestHandler name="/update" class="solr.XmlUpdateRequestHandler">
<lst name="defaults">
<str name="update.processor">uima</str>
</lst>
</requestHandler>
Once you're done with the configuration you can index documents, which will be automatically enriched with the specified fields.

189
solr/contrib/uima/build.xml Normal file
View File

@ -0,0 +1,189 @@
<?xml version="1.0"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<project name="solr-uima" default="build">
<property name="solr-path" value="../.." />
<import file="../../common-build.xml"/>
<description>
Solr Integration with UIMA for extracting metadata from arbitrary (text) fields and enrich document with features extracted from UIMA types (language, sentences, concepts, named entities, etc.)
</description>
<!-- Compile classpath for this module: the Solr core and SolrJ build output
     directories, the jars in this module's lib directory, the jars in Solr's
     lib directory, the "lucene.classpath" path (not defined in this file;
     presumably supplied by the imported common-build.xml) and this module's
     src/main/resources directory. -->
<path id="common.classpath">
<pathelement location="${solr-path}/build/solr" />
<pathelement location="${solr-path}/build/solrj" />
<fileset dir="lib" includes="*.jar"/>
<fileset dir="${solr-path}/lib" includes="*.jar"/>
<path refid="lucene.classpath"/>
<pathelement location="${basedir}/src/main/resources" />
</path>
<!-- Test classpath: everything in common.classpath plus this module's compiled
     main and test classes, the Solr and Lucene test class directories, and the
     classpath of the JVM running Ant (java.class.path). -->
<path id="test.classpath">
<path refid="common.classpath" />
<pathelement path="${dest}/classes" />
<pathelement path="${dest}/test-classes" />
<pathelement location="${solr-path}/build/tests"/> <!-- include solr test code -->
<pathelement location="${solr-path}/../lucene/build/classes/test" /> <!-- include some lucene test code -->
<pathelement path="${java.class.path}"/>
</path>
<!-- Deletes this module's output directory (${dest}); a failed delete does not
     fail the build (failonerror="false"). -->
<target name="clean">
<delete failonerror="false" dir="${dest}"/>
</target>
<!-- Prepares the build: creates the classes and javadoc output directories,
     then runs the "compileTests" and "make-manifest" targets of the top-level
     Solr build.xml (located at ${solr-path}). -->
<target name="init">
<mkdir dir="${dest}/classes"/>
<mkdir dir="${build.javadoc}" />
<subant target="compileTests">
<fileset dir="${solr-path}" includes="build.xml"/>
</subant>
<subant target="make-manifest">
<fileset dir="${solr-path}" includes="build.xml"/>
</subant>
</target>
<!-- Compiles src/main/java into ${dest}/classes against common.classpath.
     solr-javac is not defined in this file (presumably a macro from the
     imported common-build.xml). -->
<target name="compile" depends="init">
<solr-javac destdir="${dest}/classes"
classpathref="common.classpath">
<src path="src/main/java" />
</solr-javac>
</target>
<!-- Packages the compiled classes together with the files under
     src/main/resources into ${dest}/${fullnamever}.jar, using the manifest
     found under the top-level Solr build directory (../../${dest}). -->
<target name="build" depends="compile">
<solr-jar destfile="${dest}/${fullnamever}.jar" basedir="${dest}/classes"
manifest="../../${dest}/META-INF/MANIFEST.MF">
<fileset dir="src/main/resources" />
</solr-jar>
</target>
<!-- Compiles src/test/java into ${dest}/test-classes against test.classpath and
     copies the non-Java files from src/test/resources next to the compiled
     test classes. -->
<target name="compileTests" depends="compile">
<solr-javac destdir="${dest}/test-classes"
classpathref="test.classpath">
<src path="src/test/java" />
</solr-javac>
<copy todir="${dest}/test-classes">
<fileset dir="src/test/resources" excludes="**/*.java"/>
</copy>
</target>
<!-- Scratch directory used as both the working directory and the temp
     directory of the forked test JVMs. -->
<property name="tempDir" value="${junit.output.dir}/temp" />
<!-- Runs the module's JUnit tests in forked JVMs. Which tests run is selected
     by the optional "testcase", "testpackage" and "testpackageroot" properties;
     when none of them is set, the files matching ${junit.includes} are run.
     The build fails after the run if any test failed or errored. -->
<target name="test" depends="compileTests">
<mkdir dir="${junit.output.dir}"/>
<!-- <mkdir dir="@{tempDir}/@{pattern}"/>
This is very loud and obnoxious. abuse touch instead for a "quiet" mkdir
-->
<touch file="${tempDir}/quiet.ant" verbose="false" mkdirs="true"/>
<!-- "runall" is set only when no test selection property was given. -->
<condition property="runall">
<not>
<or>
<isset property="testcase"/>
<isset property="testpackage"/>
<isset property="testpackageroot"/>
</or>
</not>
</condition>
<!-- haltonfailure="no": failures and errors are recorded in the "tests.failed"
     property instead of stopping at the first failing test. -->
<junit printsummary="no"
haltonfailure="no"
maxmemory="512M"
errorProperty="tests.failed"
failureProperty="tests.failed"
dir="${tempDir}"
tempdir="${tempDir}"
forkmode="perBatch"
>
<!-- System properties forwarded to the forked test JVMs. -->
<sysproperty key="java.util.logging.config.file" value="${common-solr.dir}/testlogging.properties"/>
<sysproperty key="tests.luceneMatchVersion" value="${tests.luceneMatchVersion}"/>
<sysproperty key="tests.codec" value="${tests.codec}"/>
<sysproperty key="tests.locale" value="${tests.locale}"/>
<sysproperty key="tests.timezone" value="${tests.timezone}"/>
<sysproperty key="tests.multiplier" value="${tests.multiplier}"/>
<sysproperty key="tests.iter" value="${tests.iter}"/>
<sysproperty key="tests.seed" value="${tests.seed}"/>
<sysproperty key="jetty.insecurerandom" value="1"/>
<sysproperty key="tempDir" file="${tempDir}"/>
<sysproperty key="testmethod" value="${testmethod}"/>
<jvmarg line="${args}"/>
<formatter classname="${junit.details.formatter}" usefile="false" if="junit.details"/>
<classpath refid="test.classpath"/>
<!-- Java assertions are enabled for the Lucene and Solr packages. -->
<assertions>
<enable package="org.apache.lucene"/>
<enable package="org.apache.solr"/>
</assertions>
<formatter type="${junit.formatter}"/>
<!-- Exactly one kind of selection applies per property that is set:
     everything, a single test class, a package tree, or a single package. -->
<batchtest fork="yes" todir="${junit.output.dir}" if="runall">
<fileset dir="src/test/java" includes="${junit.includes}"/>
</batchtest>
<batchtest fork="yes" todir="${junit.output.dir}" if="testcase">
<fileset dir="src/test/java" includes="**/${testcase}.java"/>
</batchtest>
<batchtest fork="yes" todir="${junit.output.dir}" if="testpackage">
<fileset dir="src/test/java" includes="**/${testpackage}/**/Test*.java,**/${testpackage}/**/*Test.java"/>
</batchtest>
<batchtest fork="yes" todir="${junit.output.dir}" if="testpackageroot">
<fileset dir="src/test/java" includes="**/${testpackageroot}/Test*.java,**/${testpackageroot}/*Test.java"/>
</batchtest>
</junit>
<!-- Fail the build only after all selected tests have run. -->
<fail if="tests.failed">Tests failed!</fail>
</target>
<!-- Aggregates the TEST-*.xml result files found in ${junit.output.dir} and
     renders them as a framed HTML report under ${junit.reports}. -->
<target name="test-reports"
description="Generates HTML test reports.">
<mkdir dir="${junit.reports}"/>
<junitreport todir="${junit.output.dir}">
<fileset dir="${junit.output.dir}">
<include name="TEST-*.xml"/>
</fileset>
<report format="frames" todir="${junit.reports}"/>
</junitreport>
</target>
<!-- Copies the jar produced by the "build" target into the Solr dist directory.
     The jar is read from ${dest}, the same property the "build" target uses
     for its destfile, so the copy still finds the jar when ${dest} is not the
     literal "build" directory. -->
<target name="dist" depends="build">
<copy file="${dest}/${fullnamever}.jar" todir="${solr-path}/dist"/>
</target>
<!-- Intentionally empty apart from its dependency on "build"; kept so the
     target name still exists for callers that invoke it. -->
<target name="example" depends="build">
<!-- :NOOP: this use to copy libs but now we can refer to them by path -->
</target>
<!-- Generates the javadoc for src/main/java into
     ${build.javadoc}/contrib-${name}, resolving references against
     common.classpath. invoke-javadoc, ${name} and ${Name} are not defined in
     this file (presumably supplied by the imported common-build.xml). -->
<target name="javadoc">
<sequential>
<mkdir dir="${build.javadoc}/contrib-${name}"/>
<path id="javadoc.classpath">
<path refid="common.classpath"/>
</path>
<invoke-javadoc
destdir="${build.javadoc}/contrib-${name}"
title="${Name} ${version} contrib-${fullnamever} API">
<sources>
<packageset dir="src/main/java"/>
</sources>
</invoke-javadoc>
</sequential>
</target>
</project>

View File

@ -0,0 +1,2 @@
AnyObjectId[9c8bd13a2002a9ff5b35b873b9f111d5281ad201] was removed in git history.
Apache SVN contains full history.

View File

@ -0,0 +1,2 @@
AnyObjectId[532939ecab6b77ccb77af3635c55ff9752b70ab7] was removed in git history.
Apache SVN contains full history.

View File

@ -0,0 +1,2 @@
AnyObjectId[33165678da937e03cb069449b40f1cf690beda0a] was removed in git history.
Apache SVN contains full history.

View File

@ -0,0 +1,2 @@
AnyObjectId[5dfc32bce5e444a9bb3387d664485f7bfdc438ad] was removed in git history.
Apache SVN contains full history.

View File

@ -0,0 +1,2 @@
AnyObjectId[bf90c19d2c1f77e300b94363385841ec1225b4b9] was removed in git history.
Apache SVN contains full history.

View File

@ -0,0 +1,2 @@
AnyObjectId[9518da64cdf5d378273ab40a06823a7768f18ece] was removed in git history.
Apache SVN contains full history.

View File

@ -0,0 +1,2 @@
AnyObjectId[72991424bdfe4776f66feab7ff4e8564f12d2659] was removed in git history.
Apache SVN contains full history.

View File

@ -0,0 +1,115 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<modelVersion>4.0.0</modelVersion>
<groupId>org.apache.solr</groupId>
<artifactId>solr-uima</artifactId>
<version>0.0.2-SNAPSHOT</version>
<name>Solr - UIMA integration</name>
<properties>
<uimaVersion>2.3.1-SNAPSHOT</uimaVersion>
</properties>
<!-- Module dependencies: Solr core, the UIMA core runtime and the UIMA
     annotators (AlchemyAPI, OpenCalais, WhitespaceTokenizer, Tagger), which all
     share the ${uimaVersion} property, plus JUnit (test scope) and slf4j-simple.
     NOTE(review): solr-core is pinned to the released 1.4.1 version; confirm
     this is the intended version for a module built inside the Solr tree.
     NOTE(review): slf4j-simple declares no scope, so it defaults to compile
     scope; confirm it is not meant to be test-only. -->
<dependencies>
<dependency>
<groupId>org.apache.solr</groupId>
<artifactId>solr-core</artifactId>
<version>1.4.1</version>
</dependency>
<dependency>
<groupId>org.apache.uima</groupId>
<artifactId>uimaj-core</artifactId>
<version>${uimaVersion}</version>
</dependency>
<dependency>
<groupId>org.apache.uima</groupId>
<artifactId>alchemy-annotator</artifactId>
<version>${uimaVersion}</version>
</dependency>
<dependency>
<groupId>org.apache.uima</groupId>
<artifactId>OpenCalaisAnnotator</artifactId>
<version>${uimaVersion}</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.7</version>
<type>jar</type>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-simple</artifactId>
<version>1.5.5</version>
</dependency>
<dependency>
<groupId>org.apache.uima</groupId>
<artifactId>WhitespaceTokenizer</artifactId>
<version>${uimaVersion}</version>
</dependency>
<dependency>
<groupId>org.apache.uima</groupId>
<artifactId>Tagger</artifactId>
<version>${uimaVersion}</version>
</dependency>
</dependencies>
<!-- Build configuration: compiles for Java 1.5 source/target and configures
     maven-gcu-plugin to upload the packaged artifact using the "googlecode"
     server id.
     NOTE(review): the upload plugin looks like a leftover from a Google Code
     hosted build; confirm it is still wanted in this POM. -->
<build>
<pluginManagement>
<plugins>
<plugin>
<groupId>com.googlecode.maven-gcu-plugin</groupId>
<artifactId>maven-gcu-plugin</artifactId>
<version>1.0</version>
</plugin>
</plugins>
</pluginManagement>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>2.3.1</version>
<configuration>
<source>1.5</source>
<target>1.5</target>
</configuration>
</plugin>
<plugin>
<groupId>com.googlecode.maven-gcu-plugin</groupId>
<artifactId>maven-gcu-plugin</artifactId>
<version>1.0</version>
<configuration>
<serverId>googlecode</serverId>
<failsOnError>true</failsOnError>
<projectName>${project.artifactId}</projectName>
<uploads>
<upload>
<!-- The uploaded file is the main build artifact in the target directory. -->
<file>${project.build.directory}/${project.artifactId}-${project.version}.${project.packaging}</file>
<summary>${project.name} sources bundle ${project.version}</summary>
<labels>
<label>Featured</label>
<label>Type-Archive</label>
</labels>
</upload>
</uploads>
</configuration>
</plugin>
</plugins>
</build>
</project>

View File

@ -0,0 +1,69 @@
package org.apache.solr.uima.processor;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Map;
/**
* Configuration holding all the configurable parameters for calling UIMA inside Solr
*
* @version $Id$
*/
public class SolrUIMAConfiguration {

  /* All fields are set exactly once by the constructor, so they are final. */
  private final String[] fieldsToAnalyze;

  private final boolean fieldsMerging;

  private final Map<String, Map<String, String>> typesFeaturesFieldsMapping;

  private final String aePath;

  private final Map<String, String> runtimeParameters;

  /**
   * Creates a configuration holder. The array and the maps are stored as given (no defensive
   * copies are made), so callers should not modify them afterwards.
   *
   * @param aePath
   *          value of the <code>analysisEngine</code> configuration element
   * @param fieldsToAnalyze
   *          names of the document fields listed in <code>analyzeFields</code>
   * @param fieldsMerging
   *          value of the <code>merge</code> attribute of <code>analyzeFields</code>
   * @param typesFeaturesFieldsMapping
   *          UIMA type name -&gt; (feature name -&gt; Solr field name)
   * @param runtimeParameters
   *          parameter name -&gt; value, as listed under <code>runtimeParameters</code>
   */
  public SolrUIMAConfiguration(String aePath, String[] fieldsToAnalyze, boolean fieldsMerging,
          Map<String, Map<String, String>> typesFeaturesFieldsMapping,
          Map<String, String> runtimeParameters) {
    this.aePath = aePath;
    this.fieldsToAnalyze = fieldsToAnalyze;
    this.fieldsMerging = fieldsMerging;
    this.runtimeParameters = runtimeParameters;
    this.typesFeaturesFieldsMapping = typesFeaturesFieldsMapping;
  }

  /**
   * @return the field names passed to the constructor (the same array instance)
   */
  public String[] getFieldsToAnalyze() {
    return fieldsToAnalyze;
  }

  /**
   * @return the merge flag passed to the constructor
   */
  public boolean isFieldsMerging() {
    return fieldsMerging;
  }

  /**
   * @return the type/feature/field mapping passed to the constructor (the same map instance)
   */
  public Map<String, Map<String, String>> getTypesFeaturesFieldsMapping() {
    return typesFeaturesFieldsMapping;
  }

  /**
   * @return the analysis engine path passed to the constructor
   */
  public String getAePath() {
    return aePath;
  }

  /**
   * @return the runtime parameters passed to the constructor (the same map instance)
   */
  public Map<String, String> getRuntimeParameters() {
    return runtimeParameters;
  }

}

View File

@ -0,0 +1,125 @@
package org.apache.solr.uima.processor;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.HashMap;
import java.util.Map;
import org.apache.solr.core.SolrConfig;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
/**
* Read configuration for Solr-UIMA integration
*
* @version $Id$
*
*/
public class SolrUIMAConfigurationReader {

  private static final String AE_RUNTIME_PARAMETERS_NODE_PATH = "/config/uimaConfig/runtimeParameters";

  private static final String FIELD_MAPPING_NODE_PATH = "/config/uimaConfig/fieldMapping";

  private static final String ANALYZE_FIELDS_NODE_PATH = "/config/uimaConfig/analyzeFields";

  private static final String ANALYSIS_ENGINE_NODE_PATH = "/config/uimaConfig/analysisEngine";

  private final SolrConfig solrConfig;

  /**
   * @param solrConfig
   *          the Solr configuration whose <code>/config/uimaConfig</code> section is read
   */
  public SolrUIMAConfigurationReader(SolrConfig solrConfig) {
    this.solrConfig = solrConfig;
  }

  /**
   * Reads every part of the <code>uimaConfig</code> section and bundles it in one object.
   * All nodes are looked up with <code>getNode(path, true)</code>; the flag presumably makes
   * a missing node an error (confirm against SolrConfig).
   *
   * @return the configuration read from the wrapped SolrConfig
   */
  public SolrUIMAConfiguration readSolrUIMAConfiguration() {
    return new SolrUIMAConfiguration(readAEPath(), readFieldsToAnalyze(), readFieldsMerging(),
            readTypesFeaturesFieldsMapping(), readAEOverridingParameters());
  }

  /* text content of the analysisEngine element */
  private String readAEPath() {
    return solrConfig.getNode(ANALYSIS_ENGINE_NODE_PATH, true).getTextContent();
  }

  /* comma separated field names; whitespace around each name is dropped so "a, b" yields a and b */
  private String[] readFieldsToAnalyze() {
    Node analyzeFieldsNode = solrConfig.getNode(ANALYZE_FIELDS_NODE_PATH, true);
    return analyzeFieldsNode.getTextContent().trim().split("\\s*,\\s*");
  }

  /* value of the merge attribute; a missing attribute means no merging */
  private boolean readFieldsMerging() {
    Node analyzeFieldsNode = solrConfig.getNode(ANALYZE_FIELDS_NODE_PATH, true);
    Node mergeNode = analyzeFieldsNode.getAttributes().getNamedItem("merge");
    return mergeNode != null && Boolean.parseBoolean(mergeNode.getNodeValue());
  }

  /* builds: UIMA type name -> (feature name -> Solr field name) */
  private Map<String, Map<String, String>> readTypesFeaturesFieldsMapping() {
    Map<String, Map<String, String>> map = new HashMap<String, Map<String, String>>();

    Node fieldMappingNode = solrConfig.getNode(FIELD_MAPPING_NODE_PATH, true);

    /* iterate over UIMA types */
    NodeList typeNodes = fieldMappingNode.getChildNodes();
    for (int i = 0; i < typeNodes.getLength(); i++) {
      /* <type> node; only elements carry attributes, so text and comment nodes are skipped */
      Node typeNode = typeNodes.item(i);
      if (typeNode.getNodeType() != Node.ELEMENT_NODE) {
        continue;
      }

      /* get a UIMA typename */
      String typeName = typeNode.getAttributes().getNamedItem("name").getNodeValue();

      /* create entry for UIMA type */
      Map<String, String> featureToField = new HashMap<String, String>();
      map.put(typeName, featureToField);

      /* iterate over features */
      NodeList mappingNodes = typeNode.getChildNodes();
      for (int j = 0; j < mappingNodes.getLength(); j++) {
        Node mappingNode = mappingNodes.item(j);
        if (mappingNode.getNodeType() != Node.ELEMENT_NODE) {
          continue;
        }

        /* get field name */
        String mappedFieldName = mappingNode.getAttributes().getNamedItem("field").getNodeValue();

        /* get feature name */
        String featureName = mappingNode.getAttributes().getNamedItem("feature").getNodeValue();

        /* map the feature to the field for the specified type */
        featureToField.put(featureName, mappedFieldName);
      }
    }
    return map;
  }

  /* each child element of runtimeParameters becomes one entry: element name -> text content */
  private Map<String, String> readAEOverridingParameters() {
    Map<String, String> runtimeParameters = new HashMap<String, String>();
    Node uimaConfigNode = solrConfig.getNode(AE_RUNTIME_PARAMETERS_NODE_PATH, true);

    NodeList overridingNodes = uimaConfigNode.getChildNodes();
    for (int i = 0; i < overridingNodes.getLength(); i++) {
      Node overridingNode = overridingNodes.item(i);
      /* comments would otherwise be stored under the name "#comment" */
      if (overridingNode.getNodeType() == Node.ELEMENT_NODE) {
        runtimeParameters.put(overridingNode.getNodeName(), overridingNode.getTextContent());
      }
    }

    return runtimeParameters;
  }

}

View File

@ -0,0 +1,83 @@
package org.apache.solr.uima.processor;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.Map;
import org.apache.solr.common.SolrInputDocument;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.FeatureStructure;
import org.apache.uima.cas.Type;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Map UIMA types and features over fields of a Solr document
*
* @version $Id$
*/
public class UIMAToSolrMapper {

  /* one shared logger instead of one per mapper instance */
  private static final Logger log = LoggerFactory.getLogger(UIMAToSolrMapper.class);

  private final SolrInputDocument document;

  private final JCas cas;

  /**
   * @param document
   *          the Solr document that receives the mapped values
   * @param cas
   *          the CAS whose indexed feature structures are read
   */
  public UIMAToSolrMapper(SolrInputDocument document, JCas cas) {
    this.document = document;
    this.cas = cas;
  }

  /**
   * map features of a certain UIMA type to corresponding Solr fields based on the mapping.
   * For every indexed feature structure of the type, each mapped feature value is added to the
   * mapped field with boost 1.0. The pseudo feature <code>coveredText</code> of an
   * {@link Annotation} is read through {@link Annotation#getCoveredText()}.
   * <p>
   * Mapping is best effort: any exception is logged (with its stack trace) and stops the
   * mapping of this type, but is not propagated to the caller.
   *
   * @param typeName
   *          name of UIMA type to map; it is loaded as a class with a constructor taking a JCas
   * @param featureFieldsmapping
   *          feature name -&gt; Solr field name
   */
  public void map(String typeName, Map<String, String> featureFieldsmapping) {
    try {
      /* a throw-away instance is created only to obtain the UIMA Type object */
      FeatureStructure fsMock = (FeatureStructure) Class.forName(typeName).getConstructor(
              JCas.class).newInstance(cas);
      Type type = fsMock.getType();
      for (FSIterator<FeatureStructure> iterator = cas.getFSIndexRepository().getAllIndexedFS(type); iterator
              .hasNext();) {
        FeatureStructure fs = iterator.next();
        for (Map.Entry<String, String> featureToField : featureFieldsmapping.entrySet()) {
          String featureName = featureToField.getKey();
          String fieldName = featureToField.getValue();
          log.info("mapping {}@{} to {}", new Object[] { typeName, featureName, fieldName });
          String featureValue;
          if (fs instanceof Annotation && "coveredText".equals(featureName)) {
            featureValue = ((Annotation) fs).getCoveredText();
          } else {
            featureValue = fs.getFeatureValueAsString(type.getFeatureByBaseName(featureName));
          }
          log.info("writing {} in {}", featureValue, fieldName);
          document.addField(fieldName, featureValue, 1.0f);
        }
      }
    } catch (Exception e) {
      /* keep the exception itself: its message alone may be null and hides the cause */
      log.error("error while mapping UIMA type " + typeName + " to Solr fields", e);
    }
  }

}

View File

@ -0,0 +1,126 @@
package org.apache.solr.uima.processor;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.Map;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.core.SolrCore;
import org.apache.solr.uima.processor.ae.AEProvider;
import org.apache.solr.uima.processor.ae.AEProviderFactory;
import org.apache.solr.update.AddUpdateCommand;
import org.apache.solr.update.processor.UpdateRequestProcessor;
import org.apache.uima.UIMAException;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
/**
 * Update document(s) to be indexed with UIMA extracted information.
 * <p>
 * For every document added, the configured fields are analyzed by a UIMA analysis engine and the
 * resulting feature values are written back into the document before it is handed to the next
 * processor in the chain.
 *
 * @version $Id$
 */
public class UIMAUpdateRequestProcessor extends UpdateRequestProcessor {

  private SolrUIMAConfiguration solrUIMAConfiguration;

  private AEProvider aeProvider;

  /**
   * @param next
   *          the next processor of the update chain
   * @param solrCore
   *          the core whose configuration holds the UIMA settings
   */
  public UIMAUpdateRequestProcessor(UpdateRequestProcessor next, SolrCore solrCore) {
    super(next);
    initialize(solrCore);
  }

  /* read the UIMA configuration of the core and look up the matching AE provider */
  private void initialize(SolrCore solrCore) {
    SolrUIMAConfigurationReader uimaConfigurationReader = new SolrUIMAConfigurationReader(
            solrCore.getSolrConfig());
    solrUIMAConfiguration = uimaConfigurationReader.readSolrUIMAConfiguration();
    aeProvider = AEProviderFactory.getInstance().getAEProvider(solrCore.getName(),
            solrUIMAConfiguration.getAePath(), solrUIMAConfiguration.getRuntimeParameters());
  }

  /**
   * Analyzes the configured text fields of the document being added, maps the extracted feature
   * values onto document fields and then delegates to the next processor.
   *
   * @throws IOException
   *           declared by the overridden method
   * @throws RuntimeException
   *           wrapping any {@link UIMAException} raised while analyzing the text
   */
  public void processAdd(AddUpdateCommand cmd) throws IOException {
    try {
      /* get Solr document */
      SolrInputDocument solrInputDocument = cmd.getSolrInputDocument();

      /* get field mapping from config (does not depend on the text being processed) */
      Map<String, Map<String, String>> typesAndFeaturesFieldsMap = solrUIMAConfiguration
              .getTypesFeaturesFieldsMapping();

      /* analyze each non empty text */
      for (String text : getTextsToAnalyze(solrInputDocument)) {
        if (text == null || "".equals(text)) {
          continue;
        }

        /* process the text value */
        JCas jcas = processText(text);
        UIMAToSolrMapper uimaToSolrMapper = new UIMAToSolrMapper(solrInputDocument, jcas);

        /* map type features on fields */
        for (Map.Entry<String, Map<String, String>> typeMapping : typesAndFeaturesFieldsMap
                .entrySet()) {
          uimaToSolrMapper.map(typeMapping.getKey(), typeMapping.getValue());
        }
      }
    } catch (UIMAException e) {
      throw new RuntimeException(e);
    }
    super.processAdd(cmd);
  }

  /*
   * Get the texts to analyze from the corresponding fields.
   *
   * With fields merging enabled a single text, the concatenation of all present field values, is
   * returned; otherwise one entry per configured field. A field missing from the document
   * contributes nothing when merging and yields a null entry otherwise (previously it was turned
   * into the literal text "null", which was then analyzed).
   */
  private String[] getTextsToAnalyze(SolrInputDocument solrInputDocument) {
    String[] fieldsToAnalyze = solrUIMAConfiguration.getFieldsToAnalyze();

    if (solrUIMAConfiguration.isFieldsMerging()) {
      StringBuilder unifiedText = new StringBuilder();
      for (String fieldName : fieldsToAnalyze) {
        Object fieldValue = solrInputDocument.getFieldValue(fieldName);
        if (fieldValue != null) {
          unifiedText.append(String.valueOf(fieldValue));
        }
      }
      return new String[] { unifiedText.toString() };
    }

    String[] textVals = new String[fieldsToAnalyze.length];
    for (int i = 0; i < fieldsToAnalyze.length; i++) {
      Object fieldValue = solrInputDocument.getFieldValue(fieldsToAnalyze[i]);
      /* null entries are skipped by processAdd */
      textVals[i] = fieldValue == null ? null : String.valueOf(fieldValue);
    }
    return textVals;
  }

  /*
   * Run the UIMA analysis engine over a field value: the value becomes the document text of a
   * newly created CAS, which is returned once processed.
   *
   * NOTE(review): 'log' is not declared in this class; presumably it is inherited from
   * UpdateRequestProcessor - confirm.
   */
  private JCas processText(String textFieldValue) throws ResourceInitializationException,
          AnalysisEngineProcessException {
    log.info("Analyzing text");

    /* get the UIMA analysis engine */
    AnalysisEngine ae = aeProvider.getAE();

    /* create a JCas which contains the text to analyze */
    JCas jcas = ae.newJCas();
    jcas.setDocumentText(textFieldValue);

    /* perform analysis on text field */
    ae.process(jcas);
    log.info("Text processing completed");
    return jcas;
  }
}

View File

@ -0,0 +1,37 @@
package org.apache.solr.uima.processor;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.update.processor.UpdateRequestProcessor;
import org.apache.solr.update.processor.UpdateRequestProcessorFactory;
/**
 * {@link UpdateRequestProcessorFactory} producing {@link UIMAUpdateRequestProcessor} instances.
 *
 * @version $Id$
 */
public class UIMAUpdateRequestProcessorFactory extends UpdateRequestProcessorFactory {

  /**
   * Creates a UIMA processor bound to the core of the given request.
   *
   * @param req
   *          the update request; only its core is used
   * @param rsp
   *          the response, not used here
   * @param next
   *          the processor the new instance hands documents over to
   * @return a new {@link UIMAUpdateRequestProcessor}
   */
  public UpdateRequestProcessor getInstance(SolrQueryRequest req, SolrQueryResponse rsp,
          UpdateRequestProcessor next) {
    UpdateRequestProcessor uimaProcessor = new UIMAUpdateRequestProcessor(next, req.getCore());
    return uimaProcessor;
  }
}

View File

@ -0,0 +1,32 @@
package org.apache.solr.uima.processor.ae;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.resource.ResourceInitializationException;
/**
 * Supplier of an Apache UIMA {@link AnalysisEngine}.
 *
 * @version $Id$
 */
public interface AEProvider {

  /**
   * @return the analysis engine to use for processing
   * @throws ResourceInitializationException
   *           if the analysis engine cannot be made available
   */
  AnalysisEngine getAE() throws ResourceInitializationException;

}

View File

@ -0,0 +1,53 @@
package org.apache.solr.uima.processor.ae;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.util.HashMap;
import java.util.Map;
/**
 * Singleton factory class responsible for the creation of {@link AEProvider}s.
 * <p>
 * Providers are cached per core name and descriptor path, so repeated requests for the same pair
 * return the same provider instance.
 *
 * @version $Id$
 */
public class AEProviderFactory {

  /*
   * Eagerly created: class initialization guarantees a single instance, whereas the former
   * unsynchronized lazy check could hand out several factories, each with its own cache.
   */
  private static final AEProviderFactory instance = new AEProviderFactory();

  /* core name + AE path -> provider; only accessed from the synchronized getAEProvider */
  private final Map<String, AEProvider> providerCache = new HashMap<String, AEProvider>();

  private AEProviderFactory() {
    // Singleton
  }

  /**
   * @return the only instance of this factory
   */
  public static AEProviderFactory getInstance() {
    return instance;
  }

  /**
   * Returns the provider cached for the given core and descriptor path, creating it on first
   * request.
   *
   * @param core
   *          name of the Solr core; may be null
   * @param aePath
   *          path of the analysis engine descriptor
   * @param runtimeParameters
   *          parameters handed to a newly created provider; ignored when a provider is already
   *          cached for the same core and path
   * @return the cached or newly created provider
   */
  public synchronized AEProvider getAEProvider(String core, String aePath,
          Map<String, String> runtimeParameters) {
    /* plain concatenation yields the same key as before but tolerates a null core name */
    String key = core + aePath;
    AEProvider provider = providerCache.get(key);
    if (provider == null) {
      provider = new OverridingParamsAEProvider(aePath, runtimeParameters);
      providerCache.put(key, provider);
    }
    return provider;
  }
}

View File

@ -0,0 +1,89 @@
package org.apache.solr.uima.processor.ae;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.net.URL;
import java.util.Map;
import org.apache.uima.UIMAFramework;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.XMLInputSource;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * {@link AEProvider} implementation that creates an Aggregate AE from the given path, also
 * injecting runtime parameters defined in the solrconfig.xml Solr configuration file and assigning
 * them as overriding parameters in the aggregate AE.
 * <p>
 * The analysis engine is created on the first call to {@link #getAE()} and cached; later calls
 * reconfigure and return the cached engine.
 *
 * @version $Id$
 */
public class OverridingParamsAEProvider implements AEProvider {

  private static final Logger log = LoggerFactory.getLogger(OverridingParamsAEProvider.class);

  private String aeFilePath;

  private AnalysisEngine cachedAE;

  private Map<String, String> runtimeParameters;

  /**
   * @param aeFilePath
   *          classpath resource path of the analysis engine descriptor
   * @param runtimeParameters
   *          parameter name -> value, set on the descriptor before the engine is produced
   */
  public OverridingParamsAEProvider(String aeFilePath, Map<String, String> runtimeParameters) {
    this.aeFilePath = aeFilePath;
    this.runtimeParameters = runtimeParameters;
  }

  /**
   * Returns the analysis engine, producing it from the descriptor on first use.
   *
   * @return the cached (and reconfigured) or newly produced analysis engine
   * @throws ResourceInitializationException
   *           wrapping any failure, including a descriptor that cannot be found; the cached
   *           engine is discarded in that case
   */
  public synchronized AnalysisEngine getAE() throws ResourceInitializationException {
    try {
      if (cachedAE == null) {
        // get Resource Specifier from XML file
        URL url = this.getClass().getResource(aeFilePath);
        if (url == null) {
          // fail with the offending path instead of an anonymous NullPointerException
          throw new IllegalArgumentException("UIMA AE descriptor not found on classpath: "
                  + aeFilePath);
        }

        // get AE description, releasing the descriptor stream once parsed
        XMLInputSource in = new XMLInputSource(url);
        AnalysisEngineDescription desc;
        try {
          desc = UIMAFramework.getXMLParser().parseAnalysisEngineDescription(in);
        } finally {
          in.close();
        }

        /* set each runtime parameter on the aggregate descriptor */
        if (runtimeParameters != null) {
          for (Map.Entry<String, String> parameter : runtimeParameters.entrySet()) {
            desc.getAnalysisEngineMetaData().getConfigurationParameterSettings()
                    .setParameterValue(parameter.getKey(), parameter.getValue());
            /* values may be API keys / license ids: only the name goes to the log */
            log.info("setting {}", parameter.getKey());
          }
        }

        // create AE here
        cachedAE = UIMAFramework.produceAnalysisEngine(desc);
        if (log.isDebugEnabled()) {
          log.debug(new StringBuilder("AE ").append(cachedAE.getAnalysisEngineMetaData().getName())
                  .append(" created from descriptor ").append(aeFilePath).toString());
        }
      } else {
        cachedAE.reconfigure();
        if (log.isDebugEnabled()) {
          log.debug(new StringBuilder("AE ").append(cachedAE.getAnalysisEngineMetaData().getName())
                  .append(" at path ").append(aeFilePath).append(" reconfigured ").toString());
        }
      }
    } catch (Exception e) {
      /* drop the engine so that the next call starts from the descriptor again */
      cachedAE = null;
      throw new ResourceInitializationException(e);
    }
    return cachedAE;
  }
}

View File

@ -0,0 +1,41 @@
<?xml version="1.0" encoding="UTF-8"?>
<analysisEngineDescription xmlns="http://uima.apache.org/resourceSpecifier">
<!-- Aggregate analysis engine (primitive=false) chaining the WhitespaceTokenizer and HmmTagger delegates. -->
<frameworkImplementation>org.apache.uima.java</frameworkImplementation>
<primitive>false</primitive>
<!-- Both delegates are imported by name rather than by location. -->
<delegateAnalysisEngineSpecifiers>
<delegateAnalysisEngine key="HmmTagger">
<import name="HmmTagger"/>
</delegateAnalysisEngine>
<delegateAnalysisEngine key="WhitespaceTokenizer">
<import name="WhitespaceTokenizer"/>
</delegateAnalysisEngine>
</delegateAnalysisEngineSpecifiers>
<analysisEngineMetaData>
<name>AggregateSentenceAE</name>
<description/>
<version>1.0</version>
<vendor/>
<!-- This aggregate declares no configuration parameters of its own. -->
<configurationParameters/>
<configurationParameterSettings/>
<!-- Execution order is fixed and differs from the declaration order above: the tokenizer runs before the tagger. -->
<flowConstraints>
<fixedFlow>
<node>WhitespaceTokenizer</node>
<node>HmmTagger</node>
</fixedFlow>
</flowConstraints>
<fsIndexCollection/>
<capabilities>
<capability>
<inputs/>
<outputs/>
<languagesSupported/>
</capability>
</capabilities>
<operationalProperties>
<modifiesCas>true</modifiesCas>
<multipleDeploymentAllowed>true</multipleDeploymentAllowed>
<outputsNewCASes>false</outputsNewCASes>
</operationalProperties>
</analysisEngineMetaData>
<resourceManagerConfiguration/>
</analysisEngineDescription>

View File

@ -0,0 +1,57 @@
<?xml version="1.0" encoding="UTF-8"?>
<analysisEngineDescription xmlns="http://uima.apache.org/resourceSpecifier">
<!-- Aggregate analysis engine (primitive=false) combining the OpenCalais annotator with five Text*AEDescriptor delegates. -->
<frameworkImplementation>org.apache.uima.java</frameworkImplementation>
<primitive>false</primitive>
<!-- All delegates are imported by name rather than by location. -->
<delegateAnalysisEngineSpecifiers>
<delegateAnalysisEngine key="TextConceptTaggingAEDescriptor">
<import name="TextConceptTaggingAEDescriptor"/>
</delegateAnalysisEngine>
<delegateAnalysisEngine key="TextKeywordExtractionAEDescriptor">
<import name="TextKeywordExtractionAEDescriptor"/>
</delegateAnalysisEngine>
<delegateAnalysisEngine key="OpenCalaisAnnotator">
<import name="OpenCalaisAnnotator"/>
</delegateAnalysisEngine>
<delegateAnalysisEngine key="TextCategorizationAEDescriptor">
<import name="TextCategorizationAEDescriptor"/>
</delegateAnalysisEngine>
<delegateAnalysisEngine key="TextLanguageDetectionAEDescriptor">
<import name="TextLanguageDetectionAEDescriptor"/>
</delegateAnalysisEngine>
<delegateAnalysisEngine key="TextRankedEntityExtractionAEDescriptor">
<import name="TextRankedEntityExtractionAEDescriptor"/>
</delegateAnalysisEngine>
</delegateAnalysisEngineSpecifiers>
<analysisEngineMetaData>
<name>ExtServicesAE</name>
<description/>
<version>1.0</version>
<vendor/>
<!-- No parameters are declared here, so the delegates run with the settings of their own descriptors. -->
<configurationParameters/>
<configurationParameterSettings/>
<!-- Execution order is given by this fixed flow, not by the declaration order of the delegates above. -->
<flowConstraints>
<fixedFlow>
<node>OpenCalaisAnnotator</node>
<node>TextKeywordExtractionAEDescriptor</node>
<node>TextLanguageDetectionAEDescriptor</node>
<node>TextCategorizationAEDescriptor</node>
<node>TextConceptTaggingAEDescriptor</node>
<node>TextRankedEntityExtractionAEDescriptor</node>
</fixedFlow>
</flowConstraints>
<fsIndexCollection/>
<capabilities>
<capability>
<inputs/>
<outputs/>
<languagesSupported/>
</capability>
</capabilities>
<operationalProperties>
<modifiesCas>true</modifiesCas>
<multipleDeploymentAllowed>true</multipleDeploymentAllowed>
<outputsNewCASes>false</outputsNewCASes>
</operationalProperties>
</analysisEngineMetaData>
<resourceManagerConfiguration/>
</analysisEngineDescription>

View File

@ -0,0 +1,121 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<analysisEngineDescription xmlns="http://uima.apache.org/resourceSpecifier">
<frameworkImplementation>org.apache.uima.java</frameworkImplementation>
<primitive>true</primitive>
<annotatorImplementationName>org.apache.uima.examples.tagger.HMMTagger</annotatorImplementationName>
<analysisEngineMetaData>
<name>Hidden Markov Model - Part of Speech Tagger</name>
<description>A configuration of the HmmTaggerAnnotator that looks for
parts of speech of identified tokens within existing
Sentence and Token annotations. See also
WhitespaceTokenizer.xml.</description>
<version>1.0</version>
<vendor>The Apache Software Foundation</vendor>
<configurationParameters>
<configurationParameter>
<name>NGRAM_SIZE</name>
<type>Integer</type>
<multiValued>false</multiValued>
<mandatory>true</mandatory>
</configurationParameter>
</configurationParameters>
<configurationParameterSettings>
<nameValuePair>
<name>NGRAM_SIZE</name>
<value>
<integer>3</integer>
</value>
</nameValuePair>
</configurationParameterSettings>
<typeSystemDescription>
<types>
<typeDescription>
<name>org.apache.uima.TokenAnnotation</name>
<description>Single token annotation</description>
<supertypeName>uima.tcas.Annotation</supertypeName>
<features>
<featureDescription>
<name>posTag</name>
<description>contains part-of-speech of a
corresponding token</description>
<rangeTypeName>uima.cas.String</rangeTypeName>
</featureDescription>
</features>
</typeDescription>
<typeDescription>
<name>org.apache.uima.SentenceAnnotation</name>
<description>sentence annotation</description>
<supertypeName>uima.tcas.Annotation</supertypeName>
</typeDescription>
</types>
</typeSystemDescription>
<typePriorities/>
<fsIndexCollection/>
<capabilities>
<capability>
<inputs>
<type>org.apache.uima.TokenAnnotation</type>
<type allAnnotatorFeatures="true">org.apache.uima.SentenceAnnotation</type>
<feature>org.apache.uima.TokenAnnotation:end</feature>
<feature>org.apache.uima.TokenAnnotation:begin</feature>
</inputs>
<outputs>
<type>org.apache.uima.TokenAnnotation</type>
<feature>org.apache.uima.TokenAnnotation:posTag</feature>
<feature>org.apache.uima.TokenAnnotation:end</feature>
<feature>org.apache.uima.TokenAnnotation:begin</feature>
</outputs>
<languagesSupported/>
</capability>
</capabilities>
<operationalProperties>
<modifiesCas>true</modifiesCas>
<multipleDeploymentAllowed>true</multipleDeploymentAllowed>
<outputsNewCASes>false</outputsNewCASes>
</operationalProperties>
</analysisEngineMetaData>
<externalResourceDependencies>
<externalResourceDependency>
<key>Model</key>
<description>HMM Tagger model file</description>
<interfaceName>org.apache.uima.examples.tagger.IModelResource</interfaceName>
<optional>false</optional>
</externalResourceDependency>
</externalResourceDependencies>
<resourceManagerConfiguration>
<!-- Declares the tagger model as the external resource "ModelFile" and binds it to the "Model" dependency key. -->
<externalResources>
<externalResource>
<name>ModelFile</name>
<description>HMM Tagger model file</description>
<fileResourceSpecifier>
<!-- NOTE(review): relative file: URL; where it is resolved from is not visible in this descriptor. Confirm that BrownModel.dat is shipped and reachable at runtime. -->
<fileUrl>file:english/BrownModel.dat</fileUrl>
</fileResourceSpecifier>
<implementationName>org.apache.uima.examples.tagger.ModelResource</implementationName>
</externalResource>
</externalResources>
<externalResourceBindings>
<externalResourceBinding>
<key>Model</key>
<resourceName>ModelFile</resourceName>
</externalResourceBinding>
</externalResourceBindings>
</resourceManagerConfiguration>
</analysisEngineDescription>

View File

@ -0,0 +1,194 @@
<?xml version="1.0" encoding="UTF-8"?>
<analysisEngineDescription xmlns="http://uima.apache.org/resourceSpecifier">
<frameworkImplementation>org.apache.uima.java</frameworkImplementation>
<primitive>true</primitive>
<annotatorImplementationName>org.apache.uima.annotator.calais.OpenCalaisAnnotator</annotatorImplementationName>
<analysisEngineMetaData>
<name>OpenCalaisAnnotator</name>
<description/>
<configurationParameters>
<configurationParameter>
<name>allowDistribution</name>
<description/>
<type>Boolean</type>
<multiValued>false</multiValued>
<mandatory>true</mandatory>
</configurationParameter>
<configurationParameter>
<name>allowSearch</name>
<description/>
<type>Boolean</type>
<multiValued>false</multiValued>
<mandatory>true</mandatory>
</configurationParameter>
<configurationParameter>
<name>submitter</name>
<description/>
<type>String</type>
<multiValued>false</multiValued>
<mandatory>true</mandatory>
</configurationParameter>
<configurationParameter>
<name>licenseID</name>
<description/>
<type>String</type>
<multiValued>false</multiValued>
<mandatory>true</mandatory>
</configurationParameter>
</configurationParameters>
<configurationParameterSettings>
<!-- Default values for the four mandatory parameters declared above. -->
<nameValuePair>
<name>allowDistribution</name>
<value>
<boolean>false</boolean>
</value>
</nameValuePair>
<nameValuePair>
<name>allowSearch</name>
<value>
<boolean>false</boolean>
</value>
</nameValuePair>
<nameValuePair>
<name>submitter</name>
<value>
<string/>
</value>
</nameValuePair>
<!-- NOTE(review): OC_LICENSE_ID looks like a placeholder rather than a usable license id; presumably it has to be replaced or overridden by an enclosing aggregate. Confirm. -->
<nameValuePair>
<name>licenseID</name>
<value>
<string>OC_LICENSE_ID</string>
</value>
</nameValuePair>
</configurationParameterSettings>
<typeSystemDescription>
<types>
<typeDescription>
<name>org.apache.uima.calais.Person</name>
<description/>
<supertypeName>org.apache.uima.calais.BaseType</supertypeName>
</typeDescription>
<typeDescription>
<name>org.apache.uima.calais.Anniversary</name>
<description/>
<supertypeName>org.apache.uima.calais.BaseType</supertypeName>
</typeDescription>
<typeDescription>
<name>org.apache.uima.calais.City</name>
<description/>
<supertypeName>org.apache.uima.calais.BaseType</supertypeName>
</typeDescription>
<typeDescription>
<name>org.apache.uima.calais.Company</name>
<description/>
<supertypeName>org.apache.uima.calais.BaseType</supertypeName>
</typeDescription>
<typeDescription>
<name>org.apache.uima.calais.Continent</name>
<description/>
<supertypeName>org.apache.uima.calais.BaseType</supertypeName>
</typeDescription>
<typeDescription>
<name>org.apache.uima.calais.Country</name>
<description/>
<supertypeName>org.apache.uima.calais.BaseType</supertypeName>
</typeDescription>
<typeDescription>
<name>org.apache.uima.calais.Currency</name>
<description/>
<supertypeName>org.apache.uima.calais.BaseType</supertypeName>
</typeDescription>
<typeDescription>
<name>org.apache.uima.calais.EmailAddress</name>
<description/>
<supertypeName>org.apache.uima.calais.BaseType</supertypeName>
</typeDescription>
<typeDescription>
<name>org.apache.uima.calais.Facility</name>
<description/>
<supertypeName>org.apache.uima.calais.BaseType</supertypeName>
</typeDescription>
<typeDescription>
<name>org.apache.uima.calais.FaxNumber</name>
<description/>
<supertypeName>org.apache.uima.calais.BaseType</supertypeName>
</typeDescription>
<typeDescription>
<name>org.apache.uima.calais.Holiday</name>
<description/>
<supertypeName>org.apache.uima.calais.BaseType</supertypeName>
</typeDescription>
<typeDescription>
<name>org.apache.uima.calais.IndustryTerm</name>
<description/>
<supertypeName>org.apache.uima.calais.BaseType</supertypeName>
</typeDescription>
<typeDescription>
<name>org.apache.uima.calais.NaturalDisaster</name>
<description/>
<supertypeName>org.apache.uima.calais.BaseType</supertypeName>
</typeDescription>
<typeDescription>
<name>org.apache.uima.calais.NaturalFeature</name>
<description/>
<supertypeName>org.apache.uima.calais.BaseType</supertypeName>
</typeDescription>
<typeDescription>
<name>org.apache.uima.calais.Organization</name>
<description/>
<supertypeName>org.apache.uima.calais.BaseType</supertypeName>
</typeDescription>
<typeDescription>
<name>org.apache.uima.calais.PhoneNumber</name>
<description/>
<supertypeName>org.apache.uima.calais.BaseType</supertypeName>
</typeDescription>
<typeDescription>
<name>org.apache.uima.calais.ProviceOrState</name>
<description/>
<supertypeName>org.apache.uima.calais.BaseType</supertypeName>
</typeDescription>
<typeDescription>
<name>org.apache.uima.calais.Region</name>
<description/>
<supertypeName>org.apache.uima.calais.BaseType</supertypeName>
</typeDescription>
<typeDescription>
<name>org.apache.uima.calais.Technology</name>
<description/>
<supertypeName>org.apache.uima.calais.BaseType</supertypeName>
</typeDescription>
<typeDescription>
<name>org.apache.uima.calais.URL</name>
<description/>
<supertypeName>org.apache.uima.calais.BaseType</supertypeName>
</typeDescription>
<typeDescription>
<name>org.apache.uima.calais.BaseType</name>
<description/>
<supertypeName>uima.tcas.Annotation</supertypeName>
<features>
<featureDescription>
<name>calaisType</name>
<description>OpenCalais type</description>
<rangeTypeName>uima.cas.String</rangeTypeName>
</featureDescription>
</features>
</typeDescription>
</types>
</typeSystemDescription>
<capabilities>
<capability>
<inputs/>
<outputs/>
<languagesSupported/>
</capability>
</capabilities>
<operationalProperties>
<modifiesCas>true</modifiesCas>
<multipleDeploymentAllowed>true</multipleDeploymentAllowed>
<outputsNewCASes>false</outputsNewCASes>
</operationalProperties>
</analysisEngineMetaData>
</analysisEngineDescription>

View File

@ -0,0 +1,147 @@
<?xml version="1.0" encoding="UTF-8"?>
<analysisEngineDescription xmlns="http://uima.apache.org/resourceSpecifier">
<frameworkImplementation>org.apache.uima.java</frameworkImplementation>
<primitive>false</primitive>
<delegateAnalysisEngineSpecifiers>
<delegateAnalysisEngine key="TextKeywordExtractionAEDescriptor">
<import name="TextKeywordExtractionAEDescriptor"/>
</delegateAnalysisEngine>
<delegateAnalysisEngine key="TextConceptTaggingAEDescriptor">
<import name="TextConceptTaggingAEDescriptor"/>
</delegateAnalysisEngine>
<delegateAnalysisEngine key="OpenCalaisAnnotator">
<import name="OpenCalaisAnnotator"/>
</delegateAnalysisEngine>
<delegateAnalysisEngine key="TextLanguageDetectionAEDescriptor">
<import name="TextLanguageDetectionAEDescriptor"/>
</delegateAnalysisEngine>
<delegateAnalysisEngine key="TextCategorizationAEDescriptor">
<import name="TextCategorizationAEDescriptor"/>
</delegateAnalysisEngine>
<delegateAnalysisEngine key="AggregateSentenceAE">
<import location="AggregateSentenceAE.xml"/>
</delegateAnalysisEngine>
<delegateAnalysisEngine key="TextRankedEntityExtractionAEDescriptor">
<import name="TextRankedEntityExtractionAEDescriptor"/>
</delegateAnalysisEngine>
</delegateAnalysisEngineSpecifiers>
<analysisEngineMetaData>
<name>ExtServicesAE</name>
<description/>
<version>1.0</version>
<vendor/>
<configurationParameters searchStrategy="language_fallback">
<configurationParameter>
<name>oc_licenseID</name>
<type>String</type>
<multiValued>false</multiValued>
<mandatory>true</mandatory>
<overrides>
<parameter>OpenCalaisAnnotator/licenseID</parameter>
</overrides>
</configurationParameter>
<configurationParameter>
<name>keyword_apikey</name>
<type>String</type>
<multiValued>false</multiValued>
<mandatory>true</mandatory>
<overrides>
<parameter>TextKeywordExtractionAEDescriptor/apikey</parameter>
</overrides>
</configurationParameter>
<configurationParameter>
<name>concept_apikey</name>
<type>String</type>
<multiValued>false</multiValued>
<mandatory>true</mandatory>
<overrides>
<parameter>TextConceptTaggingAEDescriptor/apikey</parameter>
</overrides>
</configurationParameter>
<configurationParameter>
<name>lang_apikey</name>
<type>String</type>
<multiValued>false</multiValued>
<mandatory>true</mandatory>
<overrides>
<parameter>TextLanguageDetectionAEDescriptor/apikey</parameter>
</overrides>
</configurationParameter>
<configurationParameter>
<name>cat_apikey</name>
<type>String</type>
<multiValued>false</multiValued>
<mandatory>true</mandatory>
<overrides>
<parameter>TextCategorizationAEDescriptor/apikey</parameter>
</overrides>
</configurationParameter>
<configurationParameter>
<name>entities_apikey</name>
<type>String</type>
<multiValued>false</multiValued>
<mandatory>true</mandatory>
<overrides>
<parameter>TextRankedEntityExtractionAEDescriptor/apikey</parameter>
</overrides>
</configurationParameter>
</configurationParameters>
<configurationParameterSettings>
<!-- Placeholder defaults for the mandatory aggregate parameters declared above; each one overrides a delegate parameter and is meant to be replaced with a real key at runtime. -->
<nameValuePair>
<name>oc_licenseID</name>
<value>
<string>licenseid</string>
</value>
</nameValuePair>
<nameValuePair>
<name>keyword_apikey</name>
<value>
<string>apikey</string>
</value>
</nameValuePair>
<nameValuePair>
<name>concept_apikey</name>
<value>
<string>apikey</string>
</value>
</nameValuePair>
<nameValuePair>
<name>lang_apikey</name>
<value>
<string>apikey</string>
</value>
</nameValuePair>
<nameValuePair>
<name>cat_apikey</name>
<value>
<string>apikey</string>
</value>
</nameValuePair>
<!-- entities_apikey is declared mandatory like the other keys, so it gets the same placeholder default. -->
<nameValuePair>
<name>entities_apikey</name>
<value>
<string>apikey</string>
</value>
</nameValuePair>
</configurationParameterSettings>
<flowConstraints>
<fixedFlow>
<node>AggregateSentenceAE</node>
<node>OpenCalaisAnnotator</node>
<node>TextKeywordExtractionAEDescriptor</node>
<node>TextLanguageDetectionAEDescriptor</node>
<node>TextCategorizationAEDescriptor</node>
<node>TextConceptTaggingAEDescriptor</node>
<node>TextRankedEntityExtractionAEDescriptor</node>
</fixedFlow>
</flowConstraints>
<fsIndexCollection/>
<capabilities>
<capability>
<inputs/>
<outputs/>
<languagesSupported/>
</capability>
</capabilities>
<operationalProperties>
<modifiesCas>true</modifiesCas>
<multipleDeploymentAllowed>true</multipleDeploymentAllowed>
<outputsNewCASes>false</outputsNewCASes>
</operationalProperties>
</analysisEngineMetaData>
<resourceManagerConfiguration/>
</analysisEngineDescription>

View File

@ -0,0 +1,102 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<analysisEngineDescription xmlns="http://uima.apache.org/resourceSpecifier">
<frameworkImplementation>org.apache.uima.java</frameworkImplementation>
<primitive>true</primitive>
<annotatorImplementationName>org.apache.uima.alchemy.annotator.TextCategorizationAnnotator</annotatorImplementationName>
<analysisEngineMetaData>
<name>TextCategorizationAEDescriptor</name>
<description/>
<version>1.0</version>
<vendor/>
<configurationParameters>
<configurationParameter>
<name>apikey</name>
<type>String</type>
<multiValued>false</multiValued>
<mandatory>true</mandatory>
</configurationParameter>
<configurationParameter>
<name>outputMode</name>
<type>String</type>
<multiValued>false</multiValued>
<mandatory>true</mandatory>
</configurationParameter>
<configurationParameter>
<name>baseUrl</name>
<type>String</type>
<multiValued>false</multiValued>
<mandatory>false</mandatory>
</configurationParameter>
</configurationParameters>
<configurationParameterSettings>
<!-- Values for the two mandatory parameters; the optional baseUrl parameter is left unset. -->
<nameValuePair>
<name>outputMode</name>
<value>
<string>xml</string>
</value>
</nameValuePair>
<!-- NOTE(review): AA_API_KEY looks like a placeholder rather than a usable key; presumably it has to be replaced or overridden by an enclosing aggregate. Confirm. -->
<nameValuePair>
<name>apikey</name>
<value>
<string>AA_API_KEY</string>
</value>
</nameValuePair>
</configurationParameterSettings>
<typeSystemDescription>
<types>
<typeDescription>
<name>org.apache.uima.alchemy.ts.categorization.Category</name>
<description/>
<supertypeName>uima.cas.TOP</supertypeName>
<features>
<featureDescription>
<name>score</name>
<description/>
<rangeTypeName>uima.cas.String</rangeTypeName>
</featureDescription>
<featureDescription>
<name>text</name>
<description/>
<rangeTypeName>uima.cas.String</rangeTypeName>
</featureDescription>
</features>
</typeDescription>
</types>
</typeSystemDescription>
<typePriorities/>
<fsIndexCollection/>
<capabilities>
<capability>
<inputs/>
<outputs/>
<languagesSupported/>
</capability>
</capabilities>
<operationalProperties>
<modifiesCas>true</modifiesCas>
<multipleDeploymentAllowed>true</multipleDeploymentAllowed>
<outputsNewCASes>false</outputsNewCASes>
</operationalProperties>
</analysisEngineMetaData>
<resourceManagerConfiguration/>
</analysisEngineDescription>

View File

@ -0,0 +1,196 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<analysisEngineDescription xmlns="http://uima.apache.org/resourceSpecifier">
<frameworkImplementation>org.apache.uima.java</frameworkImplementation>
<primitive>true</primitive>
<annotatorImplementationName>org.apache.uima.alchemy.annotator.TextConceptTaggingAnnotator</annotatorImplementationName>
<analysisEngineMetaData>
<name>TextConceptTaggingAEDescriptor</name>
<description/>
<version>1.0</version>
<vendor/>
<configurationParameters>
<configurationParameter>
<name>apikey</name>
<type>String</type>
<multiValued>false</multiValued>
<mandatory>true</mandatory>
</configurationParameter>
<configurationParameter>
<name>outputMode</name>
<type>String</type>
<multiValued>false</multiValued>
<mandatory>true</mandatory>
</configurationParameter>
<configurationParameter>
<name>linkedData</name>
<type>String</type>
<multiValued>false</multiValued>
<mandatory>false</mandatory>
</configurationParameter>
<configurationParameter>
<name>showSourceText</name>
<type>Integer</type>
<multiValued>false</multiValued>
<mandatory>true</mandatory>
</configurationParameter>
<configurationParameter>
<name>maxRetrieve</name>
<type>String</type>
<multiValued>false</multiValued>
<mandatory>false</mandatory>
</configurationParameter>
<configurationParameter>
<name>url</name>
<type>String</type>
<multiValued>false</multiValued>
<mandatory>false</mandatory>
</configurationParameter>
</configurationParameters>
<configurationParameterSettings>
<!-- Default values for the parameters declared in the configurationParameters section above. -->
<!-- apikey is declared mandatory but is deliberately left empty here: no key ships with
this descriptor. Presumably a real key has to be supplied or overridden by the
deployment before the engine is used (TODO confirm where the override happens). -->
<nameValuePair>
<name>apikey</name>
<value>
<string/>
</value>
</nameValuePair>
<nameValuePair>
<name>outputMode</name>
<value>
<string>xml</string>
</value>
</nameValuePair>
<nameValuePair>
<name>linkedData</name>
<value>
<string>1</string>
</value>
</nameValuePair>
<nameValuePair>
<name>showSourceText</name>
<value>
<integer>0</integer>
</value>
</nameValuePair>
<!-- NOTE(review): maxRetrieve is declared as a String in this descriptor; confirm the
annotator really expects a String before changing the declared type or this value. -->
<nameValuePair>
<name>maxRetrieve</name>
<value>
<string>8</string>
</value>
</nameValuePair>
</configurationParameterSettings>
<typeSystemDescription>
<types>
<typeDescription>
<name>org.apache.uima.alchemy.ts.concept.ConceptFS</name>
<description>a concept tag</description>
<supertypeName>uima.cas.TOP</supertypeName>
<features>
<featureDescription>
<name>text</name>
<description/>
<rangeTypeName>uima.cas.String</rangeTypeName>
</featureDescription>
<featureDescription>
<name>relevance</name>
<description/>
<rangeTypeName>uima.cas.String</rangeTypeName>
</featureDescription>
<featureDescription>
<name>website</name>
<description/>
<rangeTypeName>uima.cas.String</rangeTypeName>
</featureDescription>
<featureDescription>
<name>geo</name>
<description/>
<rangeTypeName>uima.cas.String</rangeTypeName>
</featureDescription>
<featureDescription>
<name>dbpedia</name>
<description/>
<rangeTypeName>uima.cas.String</rangeTypeName>
</featureDescription>
<featureDescription>
<name>yago</name>
<description/>
<rangeTypeName>uima.cas.String</rangeTypeName>
</featureDescription>
<featureDescription>
<name>opencyc</name>
<description/>
<rangeTypeName>uima.cas.String</rangeTypeName>
</featureDescription>
<featureDescription>
<name>freebase</name>
<description/>
<rangeTypeName>uima.cas.String</rangeTypeName>
</featureDescription>
<featureDescription>
<name>ciaFactbook</name>
<description/>
<rangeTypeName>uima.cas.String</rangeTypeName>
</featureDescription>
<featureDescription>
<name>census</name>
<description/>
<rangeTypeName>uima.cas.String</rangeTypeName>
</featureDescription>
<featureDescription>
<name>geonames</name>
<description/>
<rangeTypeName>uima.cas.String</rangeTypeName>
</featureDescription>
<featureDescription>
<name>musicBrainz</name>
<description/>
<rangeTypeName>uima.cas.String</rangeTypeName>
</featureDescription>
<featureDescription>
<name>crunchbase</name>
<description/>
<rangeTypeName>uima.cas.String</rangeTypeName>
</featureDescription>
<featureDescription>
<name>semanticCrunchbase</name>
<description/>
<rangeTypeName>uima.cas.String</rangeTypeName>
</featureDescription>
</features>
</typeDescription>
</types>
</typeSystemDescription>
<typePriorities/>
<fsIndexCollection/>
<capabilities>
<capability>
<inputs/>
<outputs/>
<languagesSupported/>
</capability>
</capabilities>
<operationalProperties>
<modifiesCas>true</modifiesCas>
<multipleDeploymentAllowed>true</multipleDeploymentAllowed>
<outputsNewCASes>false</outputsNewCASes>
</operationalProperties>
</analysisEngineMetaData>
<resourceManagerConfiguration/>
</analysisEngineDescription>

View File

@ -0,0 +1,107 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<analysisEngineDescription xmlns="http://uima.apache.org/resourceSpecifier">
<frameworkImplementation>org.apache.uima.java</frameworkImplementation>
<primitive>true</primitive>
<annotatorImplementationName>org.apache.uima.alchemy.annotator.TextKeywordExtractionAnnotator</annotatorImplementationName>
<analysisEngineMetaData>
<name>TextKeywordExtractionAEDescriptor</name>
<description/>
<version>1.0</version>
<vendor/>
<configurationParameters>
<configurationParameter>
<name>apikey</name>
<type>String</type>
<multiValued>false</multiValued>
<mandatory>true</mandatory>
</configurationParameter>
<configurationParameter>
<name>outputMode</name>
<type>String</type>
<multiValued>false</multiValued>
<mandatory>true</mandatory>
</configurationParameter>
<configurationParameter>
<name>baseUrl</name>
<type>String</type>
<multiValued>false</multiValued>
<mandatory>false</mandatory>
</configurationParameter>
<configurationParameter>
<name>url</name>
<type>String</type>
<multiValued>false</multiValued>
<mandatory>false</mandatory>
</configurationParameter>
<configurationParameter>
<name>maxRetrieve</name>
<type>Integer</type>
<multiValued>false</multiValued>
<mandatory>false</mandatory>
</configurationParameter>
<configurationParameter>
<name>showSourceText</name>
<type>Integer</type>
<multiValued>false</multiValued>
<mandatory>false</mandatory>
</configurationParameter>
</configurationParameters>
<configurationParameterSettings>
<!-- Default values for the parameters declared in the configurationParameters section above. -->
<nameValuePair>
<name>outputMode</name>
<value>
<string>xml</string>
</value>
</nameValuePair>
<!-- Placeholder only: never commit a real AlchemyAPI key to this descriptor.
Replace AA_API_KEY (or override the parameter at deployment time) with a valid key. -->
<nameValuePair>
<name>apikey</name>
<value>
<string>AA_API_KEY</string>
</value>
</nameValuePair>
<nameValuePair>
<name>maxRetrieve</name>
<value>
<integer>10</integer>
</value>
</nameValuePair>
<nameValuePair>
<name>showSourceText</name>
<value>
<integer>0</integer>
</value>
</nameValuePair>
</configurationParameterSettings>
<typeSystemDescription>
<types>
<typeDescription>
<name>org.apache.uima.alchemy.ts.keywords.KeywordFS</name>
<description/>
<supertypeName>uima.cas.TOP</supertypeName>
<features>
<featureDescription>
<name>text</name>
<description/>
<rangeTypeName>uima.cas.String</rangeTypeName>
</featureDescription>
</features>
</typeDescription>
</types>
</typeSystemDescription>
<typePriorities/>
<fsIndexCollection/>
<capabilities>
<capability>
<inputs/>
<outputs/>
<languagesSupported/>
</capability>
</capabilities>
<operationalProperties>
<modifiesCas>true</modifiesCas>
<multipleDeploymentAllowed>true</multipleDeploymentAllowed>
<outputsNewCASes>false</outputsNewCASes>
</operationalProperties>
</analysisEngineMetaData>
<resourceManagerConfiguration/>
</analysisEngineDescription>

View File

@ -0,0 +1,107 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<analysisEngineDescription xmlns="http://uima.apache.org/resourceSpecifier">
<frameworkImplementation>org.apache.uima.java</frameworkImplementation>
<primitive>true</primitive>
<annotatorImplementationName>org.apache.uima.alchemy.annotator.TextLanguageDetectionAnnotator</annotatorImplementationName>
<analysisEngineMetaData>
<name>TextLanguageDetectionAEDescriptor</name>
<description/>
<version>1.0</version>
<vendor/>
<configurationParameters>
<configurationParameter>
<name>apikey</name>
<type>String</type>
<multiValued>false</multiValued>
<mandatory>true</mandatory>
</configurationParameter>
<configurationParameter>
<name>outputMode</name>
<type>String</type>
<multiValued>false</multiValued>
<mandatory>true</mandatory>
</configurationParameter>
<configurationParameter>
<name>url</name>
<type>String</type>
<multiValued>false</multiValued>
<mandatory>false</mandatory>
</configurationParameter>
</configurationParameters>
<configurationParameterSettings>
<nameValuePair>
<name>outputMode</name>
<value>
<string>xml</string>
</value>
</nameValuePair>
<nameValuePair>
<name>apikey</name>
<value>
<string>AA_API_KEY</string>
</value>
</nameValuePair>
</configurationParameterSettings>
<typeSystemDescription>
<types>
<typeDescription>
<name>org.apache.uima.alchemy.ts.language.LanguageFS</name>
<description/>
<supertypeName>uima.cas.TOP</supertypeName>
<features>
<featureDescription>
<name>language</name>
<description/>
<rangeTypeName>uima.cas.String</rangeTypeName>
</featureDescription>
<featureDescription>
<name>iso6391</name>
<description/>
<rangeTypeName>uima.cas.String</rangeTypeName>
</featureDescription>
<featureDescription>
<name>iso6392</name>
<description/>
<rangeTypeName>uima.cas.String</rangeTypeName>
</featureDescription>
<featureDescription>
<name>iso6393</name>
<description/>
<rangeTypeName>uima.cas.String</rangeTypeName>
</featureDescription>
<featureDescription>
<name>ethnologue</name>
<description/>
<rangeTypeName>uima.cas.String</rangeTypeName>
</featureDescription>
<featureDescription>
<name>nativeSpeakers</name>
<description/>
<rangeTypeName>uima.cas.String</rangeTypeName>
</featureDescription>
<featureDescription>
<name>wikipedia</name>
<description/>
<rangeTypeName>uima.cas.String</rangeTypeName>
</featureDescription>
</features>
</typeDescription>
</types>
</typeSystemDescription>
<typePriorities/>
<fsIndexCollection/>
<capabilities>
<capability>
<inputs/>
<outputs/>
<languagesSupported/>
</capability>
</capabilities>
<operationalProperties>
<modifiesCas>true</modifiesCas>
<multipleDeploymentAllowed>true</multipleDeploymentAllowed>
<outputsNewCASes>false</outputsNewCASes>
</operationalProperties>
</analysisEngineMetaData>
<resourceManagerConfiguration/>
</analysisEngineDescription>

View File

@ -0,0 +1,403 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<analysisEngineDescription xmlns="http://uima.apache.org/resourceSpecifier">
<frameworkImplementation>org.apache.uima.java</frameworkImplementation>
<primitive>true</primitive>
<annotatorImplementationName>org.apache.uima.alchemy.annotator.TextRankedNamedEntityExtractionAnnotator</annotatorImplementationName>
<analysisEngineMetaData>
<name>TextRankedEntityExtractionAEDescriptor</name>
<description/>
<version>1.0</version>
<vendor/>
<configurationParameters>
<configurationParameter>
<name>apikey</name>
<type>String</type>
<multiValued>false</multiValued>
<mandatory>true</mandatory>
</configurationParameter>
<configurationParameter>
<name>outputMode</name>
<type>String</type>
<multiValued>false</multiValued>
<mandatory>true</mandatory>
</configurationParameter>
<configurationParameter>
<name>disambiguate</name>
<type>Integer</type>
<multiValued>false</multiValued>
<mandatory>true</mandatory>
</configurationParameter>
<configurationParameter>
<name>linkedData</name>
<type>String</type>
<multiValued>false</multiValued>
<mandatory>false</mandatory>
</configurationParameter>
<configurationParameter>
<name>showSourceText</name>
<type>Integer</type>
<multiValued>false</multiValued>
<mandatory>true</mandatory>
</configurationParameter>
<configurationParameter>
<name>baseUrl</name>
<type>String</type>
<multiValued>false</multiValued>
<mandatory>false</mandatory>
</configurationParameter>
<configurationParameter>
<name>url</name>
<type>String</type>
<multiValued>false</multiValued>
<mandatory>false</mandatory>
</configurationParameter>
<configurationParameter>
<name>coreference</name>
<type>String</type>
<multiValued>false</multiValued>
<mandatory>false</mandatory>
</configurationParameter>
<configurationParameter>
<name>quotations</name>
<type>String</type>
<multiValued>false</multiValued>
<mandatory>false</mandatory>
</configurationParameter>
</configurationParameters>
<configurationParameterSettings>
<nameValuePair>
<name>apikey</name>
<value>
<string/>
</value>
</nameValuePair>
<nameValuePair>
<name>outputMode</name>
<value>
<string>xml</string>
</value>
</nameValuePair>
<nameValuePair>
<name>disambiguate</name>
<value>
<integer>1</integer>
</value>
</nameValuePair>
<nameValuePair>
<name>linkedData</name>
<value>
<string>1</string>
</value>
</nameValuePair>
<nameValuePair>
<name>coreference</name>
<value>
<string>1</string>
</value>
</nameValuePair>
<nameValuePair>
<name>showSourceText</name>
<value>
<integer>0</integer>
</value>
</nameValuePair>
<nameValuePair>
<name>quotations</name>
<value>
<string>1</string>
</value>
</nameValuePair>
</configurationParameterSettings>
<typeSystemDescription>
<!-- Imported by location: a relative location is resolved against the URL of this
descriptor, so baseAlchemyTypeSystemDescriptor.xml has to be reachable next to it. -->
<imports>
<import location="baseAlchemyTypeSystemDescriptor.xml"/>
</imports>
<types>
<typeDescription>
<name>org.apache.uima.alchemy.ts.entity.Anniversary</name>
<description/>
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
</typeDescription>
<typeDescription>
<name>org.apache.uima.alchemy.ts.entity.Automobile</name>
<description/>
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
</typeDescription>
<typeDescription>
<name>org.apache.uima.alchemy.ts.entity.City</name>
<description/>
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
</typeDescription>
<typeDescription>
<name>org.apache.uima.alchemy.ts.entity.Company</name>
<description/>
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
</typeDescription>
<typeDescription>
<name>org.apache.uima.alchemy.ts.entity.Continent</name>
<description/>
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
</typeDescription>
<typeDescription>
<name>org.apache.uima.alchemy.ts.entity.Country</name>
<description/>
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
</typeDescription>
<typeDescription>
<name>org.apache.uima.alchemy.ts.entity.EntertainmentAward</name>
<description/>
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
</typeDescription>
<typeDescription>
<name>org.apache.uima.alchemy.ts.entity.Facility</name>
<description/>
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
</typeDescription>
<typeDescription>
<name>org.apache.uima.alchemy.ts.entity.FieldTerminology</name>
<description/>
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
</typeDescription>
<typeDescription>
<name>org.apache.uima.alchemy.ts.entity.FinancialMarketIndex</name>
<description/>
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
</typeDescription>
<typeDescription>
<name>org.apache.uima.alchemy.ts.entity.GeographicFeature</name>
<description/>
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
</typeDescription>
<typeDescription>
<name>org.apache.uima.alchemy.ts.entity.HealthCondition</name>
<description/>
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
</typeDescription>
<typeDescription>
<name>org.apache.uima.alchemy.ts.entity.Holiday</name>
<description/>
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
</typeDescription>
<typeDescription>
<name>org.apache.uima.alchemy.ts.entity.Movie</name>
<description/>
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
</typeDescription>
<typeDescription>
<name>org.apache.uima.alchemy.ts.entity.MusicGroup</name>
<description/>
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
</typeDescription>
<typeDescription>
<name>org.apache.uima.alchemy.ts.entity.NaturalDisaster</name>
<description/>
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
</typeDescription>
<typeDescription>
<name>org.apache.uima.alchemy.ts.entity.Organization</name>
<description/>
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
</typeDescription>
<typeDescription>
<name>org.apache.uima.alchemy.ts.entity.Person</name>
<description/>
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
</typeDescription>
<typeDescription>
<name>org.apache.uima.alchemy.ts.entity.PrintMedia</name>
<description/>
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
</typeDescription>
<typeDescription>
<name>org.apache.uima.alchemy.ts.entity.RadioProgram</name>
<description/>
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
</typeDescription>
<typeDescription>
<name>org.apache.uima.alchemy.ts.entity.RadioStation</name>
<description/>
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
</typeDescription>
<typeDescription>
<name>org.apache.uima.alchemy.ts.entity.Region</name>
<description/>
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
</typeDescription>
<typeDescription>
<name>org.apache.uima.alchemy.ts.entity.Sport</name>
<description/>
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
</typeDescription>
<typeDescription>
<name>org.apache.uima.alchemy.ts.entity.StateOrCounty</name>
<description/>
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
</typeDescription>
<typeDescription>
<name>org.apache.uima.alchemy.ts.entity.Technology</name>
<description/>
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
</typeDescription>
<typeDescription>
<name>org.apache.uima.alchemy.ts.entity.TelevisionShow</name>
<description/>
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
</typeDescription>
<typeDescription>
<name>org.apache.uima.alchemy.ts.entity.TelevisionStation</name>
<description/>
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
</typeDescription>
<typeDescription>
<name>org.apache.uima.alchemy.ts.entity.OperatingSystem</name>
<description/>
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
</typeDescription>
<typeDescription>
<name>org.apache.uima.alchemy.ts.entity.SportingEvent</name>
<description/>
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
</typeDescription>
<typeDescription>
<name>org.apache.uima.alchemy.ts.entity.Drug</name>
<description/>
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
</typeDescription>
<typeDescription>
<name>org.apache.uima.alchemy.ts.entity.BaseEntity</name>
<description/>
<supertypeName>uima.cas.TOP</supertypeName>
<features>
<featureDescription>
<name>text</name>
<description/>
<rangeTypeName>uima.cas.String</rangeTypeName>
</featureDescription>
<featureDescription>
<name>count</name>
<description/>
<rangeTypeName>uima.cas.String</rangeTypeName>
</featureDescription>
<featureDescription>
<name>relevance</name>
<description/>
<rangeTypeName>uima.cas.String</rangeTypeName>
</featureDescription>
<featureDescription>
<name>disambiguation</name>
<description/>
<rangeTypeName>uima.cas.String</rangeTypeName>
</featureDescription>
<featureDescription>
<name>subType</name>
<description/>
<rangeTypeName>uima.cas.String</rangeTypeName>
</featureDescription>
<featureDescription>
<name>website</name>
<description/>
<rangeTypeName>uima.cas.String</rangeTypeName>
</featureDescription>
<featureDescription>
<name>geo</name>
<description/>
<rangeTypeName>uima.cas.String</rangeTypeName>
</featureDescription>
<featureDescription>
<name>dbpedia</name>
<description/>
<rangeTypeName>uima.cas.String</rangeTypeName>
</featureDescription>
<featureDescription>
<name>yago</name>
<description/>
<rangeTypeName>uima.cas.String</rangeTypeName>
</featureDescription>
<featureDescription>
<name>opencyc</name>
<description/>
<rangeTypeName>uima.cas.String</rangeTypeName>
</featureDescription>
<featureDescription>
<name>umbel</name>
<description/>
<rangeTypeName>uima.cas.String</rangeTypeName>
</featureDescription>
<featureDescription>
<name>freebase</name>
<description/>
<rangeTypeName>uima.cas.String</rangeTypeName>
</featureDescription>
<featureDescription>
<name>ciaFactbook</name>
<description/>
<rangeTypeName>uima.cas.String</rangeTypeName>
</featureDescription>
<featureDescription>
<name>census</name>
<description/>
<rangeTypeName>uima.cas.String</rangeTypeName>
</featureDescription>
<featureDescription>
<name>geonames</name>
<description/>
<rangeTypeName>uima.cas.String</rangeTypeName>
</featureDescription>
<featureDescription>
<name>musicBrainz</name>
<description/>
<rangeTypeName>uima.cas.String</rangeTypeName>
</featureDescription>
<featureDescription>
<name>quotations</name>
<description/>
<rangeTypeName>uima.cas.StringArray</rangeTypeName>
<multipleReferencesAllowed>true</multipleReferencesAllowed>
</featureDescription>
<featureDescription>
<name>occurrences</name>
<description>A list of annotations annotating this entity</description>
<rangeTypeName>uima.cas.FSList</rangeTypeName>
<elementType>uima.tcas.Annotation</elementType>
</featureDescription>
</features>
</typeDescription>
</types>
</typeSystemDescription>
<typePriorities/>
<fsIndexCollection/>
<capabilities>
<capability>
<inputs/>
<outputs/>
<languagesSupported/>
</capability>
</capabilities>
<operationalProperties>
<modifiesCas>true</modifiesCas>
<multipleDeploymentAllowed>true</multipleDeploymentAllowed>
<outputsNewCASes>false</outputsNewCASes>
</operationalProperties>
</analysisEngineMetaData>
<resourceManagerConfiguration/>
</analysisEngineDescription>

View File

@ -0,0 +1,115 @@
<?xml version="1.0" encoding="UTF-8" ?>
<!--
***************************************************************
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
***************************************************************
-->
<analysisEngineDescription
xmlns="http://uima.apache.org/resourceSpecifier">
<frameworkImplementation>
org.apache.uima.java
</frameworkImplementation>
<primitive>true</primitive>
<annotatorImplementationName>
org.apache.uima.annotator.WhitespaceTokenizer
</annotatorImplementationName>
<analysisEngineMetaData>
<name>WhitespaceTokenizer</name>
<description>
creates token and sentence annotations for whitespace
separated languages
</description>
<version>1.0</version>
<vendor>The Apache Software Foundation</vendor>
<configurationParameters>
<configurationParameter>
<name>SofaNames</name>
<description>
The Sofa names the annotator should work on. If no
names are specified, the annotator works on the
default sofa.
</description>
<type>String</type>
<multiValued>true</multiValued>
<mandatory>false</mandatory>
</configurationParameter>
</configurationParameters>
<configurationParameterSettings>
<!--
<nameValuePair>
<name>SofaNames</name>
<value>
<array>
<string>sofaName</string>
</array>
</value>
</nameValuePair>
-->
</configurationParameterSettings>
<typeSystemDescription>
<typeDescription>
<name>org.apache.uima.TokenAnnotation</name>
<description>Single token annotation</description>
<supertypeName>uima.tcas.Annotation</supertypeName>
<features>
<featureDescription>
<name>tokenType</name>
<description>token type</description>
<rangeTypeName>uima.cas.String</rangeTypeName>
</featureDescription>
</features>
</typeDescription>
<typeDescription>
<name>org.apache.uima.SentenceAnnotation</name>
<description>sentence annotation</description>
<supertypeName>uima.tcas.Annotation</supertypeName>
<features>
</features>
</typeDescription>
</typeSystemDescription>
<fsIndexes />
<capabilities>
<capability>
<inputs />
<!-- Declares what this engine produces: token annotations (including their
tokenType feature) and sentence annotations. Feature names are case-sensitive,
so the feature reference must match the "tokenType" feature declared on
org.apache.uima.TokenAnnotation in the type system of this descriptor. -->
<outputs>
<type>org.apache.uima.TokenAnnotation</type>
<feature>org.apache.uima.TokenAnnotation:tokenType</feature>
<type>org.apache.uima.SentenceAnnotation</type>
</outputs>
<languagesSupported>
<language>x-unspecified</language>
</languagesSupported>
</capability>
</capabilities>
</analysisEngineMetaData>
</analysisEngineDescription>

View File

@ -0,0 +1,41 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<!-- Stand-alone type system declaring a single annotation type, AlchemyAnnotation.
Presumably this is the file that the entity extraction descriptor imports as
baseAlchemyTypeSystemDescriptor.xml (confirm against the file name on disk). -->
<typeSystemDescription xmlns="http://uima.apache.org/resourceSpecifier">
<name>baseAlchemyTypeSystemDescriptor</name>
<description/>
<version>1.0</version>
<vendor/>
<types>
<!-- A text annotation (subtype of uima.tcas.Annotation) carrying one extra
String feature, alchemyType. -->
<typeDescription>
<name>org.apache.uima.alchemy.ts.entity.AlchemyAnnotation</name>
<description/>
<supertypeName>uima.tcas.Annotation</supertypeName>
<features>
<featureDescription>
<name>alchemyType</name>
<description>alchemyAPI type</description>
<rangeTypeName>uima.cas.String</rangeTypeName>
</featureDescription>
</features>
</typeDescription>
</types>
</typeSystemDescription>

View File

@ -0,0 +1,33 @@
<?xml version="1.0" encoding="UTF-8" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version
2.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 Unless required by
applicable law or agreed to in writing, software distributed under
the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
OR CONDITIONS OF ANY KIND, either express or implied. See the
License for the specific language governing permissions and
limitations under the License.
-->
<uimaConfig>
<!-- Credentials for the external services. The values below are placeholders and must
be replaced with valid keys. Presumably each entry overrides the analysis engine
parameter of the same name at runtime (TODO confirm against the processor code). -->
<runtimeParameters>
<keyword_apikey>VALID_ALCHEMYAPI_KEY</keyword_apikey>
<concept_apikey>VALID_ALCHEMYAPI_KEY</concept_apikey>
<lang_apikey>VALID_ALCHEMYAPI_KEY</lang_apikey>
<cat_apikey>VALID_ALCHEMYAPI_KEY</cat_apikey>
<oc_licenseID>VALID_OPENCALAIS_KEY</oc_licenseID>
</runtimeParameters>
<!-- Path of the analysis engine descriptor to run. -->
<analysisEngine>/org/apache/uima/desc/OverridingParamsExtServicesAE.xml</analysisEngine>
<!-- Comma-separated list of document fields whose content is analyzed.
NOTE(review): merge="false" presumably means each field is analyzed separately
rather than concatenated first; confirm against the processor code. -->
<analyzeFields merge="false">text,title</analyzeFields>
<!-- Maps features of UIMA types to index fields: here the coveredText feature of
every Annotation is mapped to the "tag" field. -->
<fieldMapping>
<type name="org.apache.uima.jcas.tcas.Annotation">
<map feature="coveredText" field="tag"/>
</type>
</fieldMapping>
</uimaConfig>

View File

@ -0,0 +1,9 @@
<?xml version="1.0" encoding="UTF-8" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version
2.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 Unless required by
applicable law or agreed to in writing, software distributed under
the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
OR CONDITIONS OF ANY KIND, either express or implied. See the
License for the specific language governing permissions and
limitations under the License.
-->
<!-- Field definitions for the extracted metadata (language, concepts, keywords,
suggested category, sentences and entities). Presumably meant to be merged into
the fields section of a Solr schema.xml. -->
<fields>
<field name="language" type="string" indexed="true" stored="true" required="false"/>
<field name="concept" type="string" indexed="true" stored="true" multiValued="true" required="false"/>
<field name="keyword" type="string" indexed="true" stored="true" multiValued="true" required="false"/>
<field name="suggested_category" type="string" indexed="true" stored="true" multiValued="false" required="false"/>
<field name="sentence" type="text" indexed="true" stored="true" multiValued="true" required="false" />
<dynamicField name="entity*" type="text" indexed="true" stored="true" multiValued="true"/>
</fields>

View File

@ -0,0 +1,137 @@
package org.apache.solr.uima.processor;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.common.params.MultiMapSolrParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.params.UpdateParams;
import org.apache.solr.common.util.ContentStream;
import org.apache.solr.common.util.ContentStreamBase;
import org.apache.solr.core.SolrCore;
import org.apache.solr.handler.XmlUpdateRequestHandler;
import org.apache.solr.request.SolrQueryRequestBase;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.update.processor.UpdateRequestProcessorChain;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
/**
* TestCase for {@link UIMAUpdateRequestProcessor}
*
* @version $Id$
*/
public class UIMAUpdateRequestProcessorTest extends SolrTestCaseJ4 {
/**
 * Boots the test core once for the whole class, using the solrconfig.xml and
 * schema.xml found under the "solr-uima" Solr home.
 */
@BeforeClass
public static void beforeClass() throws Exception {
  final String configFile = "solrconfig.xml";
  final String schemaFile = "schema.xml";
  final String solrHome = "solr-uima";
  initCore(configFile, schemaFile, solrHome);
}
/**
 * Empties the index and commits before each test, so that the numFound
 * assertions only ever count documents added by the test that is running.
 *
 * @throws Exception if the superclass set-up or the index reset fails
 */
@Override
@Before
public void setUp() throws Exception {
  super.setUp();
  clearIndex();
  assertU(commit());
}
/**
 * Checks that an update processor chain named "uima" is registered on the core
 * and that its first factory is a {@link UIMAUpdateRequestProcessorFactory}
 * (the cast fails otherwise).
 */
@Test
public void testProcessorConfiguration() {
  final SolrCore solrCore = h.getCore();
  final UpdateRequestProcessorChain uimaChain = solrCore.getUpdateProcessingChain("uima");
  assertNotNull(uimaChain);

  final Object firstFactory = uimaChain.getFactories()[0];
  final UIMAUpdateRequestProcessorFactory uimaFactory =
      (UIMAUpdateRequestProcessorFactory) firstFactory;
  assertNotNull(uimaFactory);
}
/**
 * Indexes a single English document through the "uima" chain and expects it
 * to be retrievable by the detected language.
 */
@Test
public void testProcessing() throws Exception {
  // the analysis engines call remote services (e.g. opencalais api), so a
  // working internet connection is needed
  checkInternetConnection();

  final String text =
      "SpellCheckComponent got improvement related to recent Lucene changes. \n "
      + "Add support for specifying Spelling SuggestWord Comparator to Lucene spell "
      + "checkers for SpellCheckComponent. Issue SOLR-2053 is already fixed, patch is"
      + " attached if you need it, but it is also committed to trunk and 3_x branch."
      + " Last Lucene European Conference has been held in Prague.";
  final String xmlDoc = adoc("id", "2312312321312", "text", text);

  addDoc(xmlDoc);
  assertU(commit());
  assertQ(req("language:english"), "//*[@numFound='1']");
}
@Test
public void testTwoUpdates() {
// this test requires an internet connection (e.g. opencalais api)
checkInternetConnection();
try {
addDoc(adoc("id", "1", "text", "The Apache Software Foundation is happy to announce "
+ "BarCampApache Sydney, Australia, the first ASF-backed event in the Southern "
+ "Hemisphere!"));
assertU(commit());
assertQ(req("language:english"), "//*[@numFound='1']");
addDoc(adoc("id", "2", "text", "Taking place 11th December 2010 at the University "
+ "of Sydney's Darlington Centre, the BarCampApache \"unconference\" will be"
+ " attendee-driven, facilitated by members of the Apache community and will "
+ "focus on the Apache..."));
assertU(commit());
assertQ(req("language:english"), "//*[@numFound='2']");
} catch (Exception e) {
assumeNoException("Multiple updates on same instance didn't work", e);
}
}
private void addDoc(String doc) throws Exception {
Map<String, String[]> params = new HashMap<String, String[]>();
params.put(UpdateParams.UPDATE_PROCESSOR, new String[] { "uima" });
MultiMapSolrParams mmparams = new MultiMapSolrParams(params);
SolrQueryRequestBase req = new SolrQueryRequestBase(h.getCore(), (SolrParams) mmparams) {
};
XmlUpdateRequestHandler handler = new XmlUpdateRequestHandler();
handler.init(null);
ArrayList<ContentStream> streams = new ArrayList<ContentStream>(2);
streams.add(new ContentStreamBase.StringStream(doc));
req.setContentStreams(streams);
handler.handleRequestBody(req, new SolrQueryResponse());
}
private void checkInternetConnection() {
try {
URLConnection conn = new URL("http://www.apache.org/").openConnection();
conn.setConnectTimeout(5000);
conn.setReadTimeout(5000);
conn.connect();
} catch (Exception ex) {
assumeNoException("This test requires an internet connection", ex);
}
}
}

View File

@ -0,0 +1,21 @@
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#-----------------------------------------------------------------------
# Use a protected word file to protect against the stemmer reducing two
# unrelated words to the same base word.
# Some non-words that normally won't be encountered,
# just to test that they won't be stemmed.
dontstems
zwhacky

View File

@ -0,0 +1,679 @@
<?xml version="1.0" encoding="UTF-8" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version
2.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 Unless required by
applicable law or agreed to in writing, software distributed under
the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
OR CONDITIONS OF ANY KIND, either express or implied. See the
License for the specific language governing permissions and
limitations under the License.
-->
<!--
This is the Solr schema file. This file should be named "schema.xml"
and should be in the conf directory under the solr home (i.e.
./solr/conf/schema.xml by default) or located where the classloader
for the Solr webapp can find it. This example schema is the
recommended starting point for users. It should be kept correct and
concise, usable out-of-the-box. For more information on how to
customize this file, please see
http://wiki.apache.org/solr/SchemaXml PERFORMANCE NOTE: this schema
includes many optional features and should not be used for
benchmarking. To improve performance one could - set stored="false"
for all fields possible (esp large fields) when you only need to
search on the field but don't need to return the original value. -
set indexed="false" if you don't need to search on the field, but
only return the field as a result of searching on other indexed
fields. - remove all unneeded copyField statements - for best index
size and searching performance, set "index" to false for all general
text fields, use copyField to copy them to the catchall "text"
field, and use that for searching. - For maximum indexing
performance, use the StreamingUpdateSolrServer java client. -
Remember to run the JVM in server mode, and use a higher logging
level that avoids logging every request
-->
<schema name="sample" version="1.2">
<!--
attribute "name" is the name of this schema and is only used for
display purposes. Applications should change this to reflect the
nature of the search collection. version="1.2" is Solr's version
number for the schema syntax and semantics. It should not normally
be changed by applications. 1.0: multiValued attribute did not
exist, all fields are multiValued by nature 1.1: multiValued
attribute introduced, false by default 1.2: omitTermFreqAndPositions
attribute introduced, true by default except for text fields.
-->
<types>
<!--
field type definitions. The "name" attribute is just a label to be
used by field definitions. The "class" attribute and any other
attributes determine the real behavior of the fieldType. Class
names starting with "solr" refer to java classes in the
org.apache.solr.analysis package.
-->
<!--
The StrField type is not analyzed, but indexed/stored verbatim. -
StrField and TextField support an optional compressThreshold which
limits compression (if enabled in the derived fields) to values
which exceed a certain size (in characters).
-->
<fieldType name="string" class="solr.StrField"
sortMissingLast="true" omitNorms="true" />
<!-- boolean type: "true" or "false" -->
<fieldType name="boolean" class="solr.BoolField"
sortMissingLast="true" omitNorms="true" />
<!--
Binary data type. The data should be sent/retrieved as Base64
encoded Strings
-->
<fieldtype name="binary" class="solr.BinaryField" />
<!--
The optional sortMissingLast and sortMissingFirst attributes are
currently supported on types that are sorted internally as
strings. This includes
"string","boolean","sint","slong","sfloat","sdouble","pdate" - If
sortMissingLast="true", then a sort on this field will cause
documents without the field to come after documents with the
field, regardless of the requested sort order (asc or desc). - If
sortMissingFirst="true", then a sort on this field will cause
documents without the field to come before documents with the
field, regardless of the requested sort order. - If
sortMissingLast="false" and sortMissingFirst="false" (the
default), then default lucene sorting will be used which places
docs without the field first in an ascending sort and last in a
descending sort.
-->
<!--
Default numeric field types. For faster range queries, consider
the tint/tfloat/tlong/tdouble types.
-->
<fieldType name="int" class="solr.TrieIntField"
precisionStep="0" omitNorms="true" positionIncrementGap="0" />
<fieldType name="float" class="solr.TrieFloatField"
precisionStep="0" omitNorms="true" positionIncrementGap="0" />
<fieldType name="long" class="solr.TrieLongField"
precisionStep="0" omitNorms="true" positionIncrementGap="0" />
<fieldType name="double" class="solr.TrieDoubleField"
precisionStep="0" omitNorms="true" positionIncrementGap="0" />
<!--
Numeric field types that index each value at various levels of
precision to accelerate range queries when the number of values
between the range endpoints is large. See the javadoc for
NumericRangeQuery for internal implementation details. Smaller
precisionStep values (specified in bits) will lead to more tokens
indexed per value, slightly larger index size, and faster range
queries. A precisionStep of 0 disables indexing at different
precision levels.
-->
<fieldType name="tint" class="solr.TrieIntField"
precisionStep="8" omitNorms="true" positionIncrementGap="0" />
<fieldType name="tfloat" class="solr.TrieFloatField"
precisionStep="8" omitNorms="true" positionIncrementGap="0" />
<fieldType name="tlong" class="solr.TrieLongField"
precisionStep="8" omitNorms="true" positionIncrementGap="0" />
<fieldType name="tdouble" class="solr.TrieDoubleField"
precisionStep="8" omitNorms="true" positionIncrementGap="0" />
<!--
The format for this date field is of the form
1995-12-31T23:59:59Z, and is a more restricted form of the
canonical representation of dateTime
http://www.w3.org/TR/xmlschema-2/#dateTime The trailing "Z"
designates UTC time and is mandatory. Optional fractional seconds
are allowed: 1995-12-31T23:59:59.999Z All other components are
mandatory. Expressions can also be used to denote calculations
that should be performed relative to "NOW" to determine the value,
ie... NOW/HOUR ... Round to the start of the current hour NOW-1DAY
... Exactly 1 day prior to now NOW/DAY+6MONTHS+3DAYS ... 6 months
and 3 days in the future from the start of the current day Consult
the DateField javadocs for more information. Note: For faster
range queries, consider the tdate type
-->
<fieldType name="date" class="solr.TrieDateField"
omitNorms="true" precisionStep="0" positionIncrementGap="0" />
<!--
A Trie based date field for faster date range queries and date
faceting.
-->
<fieldType name="tdate" class="solr.TrieDateField"
omitNorms="true" precisionStep="6" positionIncrementGap="0" />
<!--
Note: These should only be used for compatibility with existing
indexes (created with older Solr versions) or if
"sortMissingFirst" or "sortMissingLast" functionality is needed.
Use Trie based fields instead. Plain numeric field types that
store and index the text value verbatim (and hence don't support
range queries, since the lexicographic ordering isn't equal to the
numeric ordering)
-->
<fieldType name="pint" class="solr.IntField" omitNorms="true" />
<fieldType name="plong" class="solr.LongField" omitNorms="true" />
<fieldType name="pfloat" class="solr.FloatField"
omitNorms="true" />
<fieldType name="pdouble" class="solr.DoubleField"
omitNorms="true" />
<fieldType name="pdate" class="solr.DateField"
sortMissingLast="true" omitNorms="true" />
<!--
Note: These should only be used for compatibility with existing
indexes (created with older Solr versions) or if
"sortMissingFirst" or "sortMissingLast" functionality is needed.
Use Trie based fields instead. Numeric field types that manipulate
the value into a string value that isn't human-readable in its
internal form, but with a lexicographic ordering the same as the
numeric ordering, so that range queries work correctly.
-->
<fieldType name="sint" class="solr.SortableIntField"
sortMissingLast="true" omitNorms="true" />
<fieldType name="slong" class="solr.SortableLongField"
sortMissingLast="true" omitNorms="true" />
<fieldType name="sfloat" class="solr.SortableFloatField"
sortMissingLast="true" omitNorms="true" />
<fieldType name="sdouble" class="solr.SortableDoubleField"
sortMissingLast="true" omitNorms="true" />
<!--
The "RandomSortField" is not used to store or search any data. You
can declare fields of this type in your schema to generate
pseudo-random orderings of your docs for sorting purposes. The
ordering is generated based on the field name and the version of
the index. As long as the index version remains unchanged, and the
same field name is reused, the ordering of the docs will be
consistent. If you want different pseudo-random orderings of
documents, for the same version of the index, use a dynamicField
and change the name
-->
<fieldType name="random" class="solr.RandomSortField"
indexed="true" />
<!--
solr.TextField allows the specification of custom text analyzers
specified as a tokenizer and a list of token filters. Different
analyzers may be specified for indexing and querying. The optional
positionIncrementGap puts space between multiple fields of this
type on the same document, with the purpose of preventing false
phrase matching across fields. For more info on customizing your
analyzer chain, please see
http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters
-->
<!--
One can also specify an existing Analyzer class that has a default
constructor via the class attribute on the analyzer element
<fieldType name="text_greek" class="solr.TextField"> <analyzer
class="org.apache.lucene.analysis.el.GreekAnalyzer"/> </fieldType>
-->
<!--
A text field that only splits on whitespace for exact matching of
words
-->
<fieldType name="text_ws" class="solr.TextField"
positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory" />
</analyzer>
</fieldType>
<!--
A text field that uses WordDelimiterFilter to enable splitting and
matching of words on case-change, alpha numeric boundaries, and
non-alphanumeric chars, so that a query of "wifi" or "wi fi" could
match a document containing "Wi-Fi". Synonyms and stopwords are
customized by external files, and stemming is enabled.
-->
<fieldType name="text" class="solr.TextField"
positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.WhitespaceTokenizerFactory" />
<!--
in this example, we will only use synonyms at query time
<filter class="solr.SynonymFilterFactory"
synonyms="index_synonyms.txt" ignoreCase="true"
expand="false"/>
-->
<!--
Case insensitive stop word removal. add
enablePositionIncrements=true in both the index and query
analyzers to leave a 'gap' for more accurate phrase queries.
-->
<filter class="solr.StopFilterFactory" ignoreCase="true"
words="stopwords.txt" enablePositionIncrements="true" />
<filter class="solr.WordDelimiterFilterFactory"
generateWordParts="1" generateNumberParts="1" catenateWords="1"
catenateNumbers="1" catenateAll="0" splitOnCaseChange="1" />
<filter class="solr.LowerCaseFilterFactory" />
</analyzer>
<analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory" />
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt"
ignoreCase="true" expand="true" />
<filter class="solr.StopFilterFactory" ignoreCase="true"
words="stopwords.txt" enablePositionIncrements="true" />
<filter class="solr.WordDelimiterFilterFactory"
generateWordParts="1" generateNumberParts="1" catenateWords="0"
catenateNumbers="0" catenateAll="0" splitOnCaseChange="1" />
<filter class="solr.LowerCaseFilterFactory" />
</analyzer>
</fieldType>
<!--
Less flexible matching, but fewer false matches. Probably not ideal
for product names, but may be good for SKUs. Can insert dashes in
the wrong place and still match.
-->
<fieldType name="textTight" class="solr.TextField"
positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory" />
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt"
ignoreCase="true" expand="false" />
<filter class="solr.StopFilterFactory" ignoreCase="true"
words="stopwords.txt" />
<filter class="solr.WordDelimiterFilterFactory"
generateWordParts="0" generateNumberParts="0" catenateWords="1"
catenateNumbers="1" catenateAll="0" />
<filter class="solr.LowerCaseFilterFactory" />
<!--
this filter can remove any duplicate tokens that appear at the
same position - sometimes possible with WordDelimiterFilter in
conjunction with stemming.
-->
<filter class="solr.RemoveDuplicatesTokenFilterFactory" />
</analyzer>
</fieldType>
<!--
A general unstemmed text field - good if one does not know the
language of the field
-->
<fieldType name="textgen" class="solr.TextField"
positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.WhitespaceTokenizerFactory" />
<filter class="solr.StopFilterFactory" ignoreCase="true"
words="stopwords.txt" enablePositionIncrements="true" />
<filter class="solr.WordDelimiterFilterFactory"
generateWordParts="1" generateNumberParts="1" catenateWords="1"
catenateNumbers="1" catenateAll="0" splitOnCaseChange="0" />
<filter class="solr.LowerCaseFilterFactory" />
</analyzer>
<analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory" />
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt"
ignoreCase="true" expand="true" />
<filter class="solr.StopFilterFactory" ignoreCase="true"
words="stopwords.txt" enablePositionIncrements="true" />
<filter class="solr.WordDelimiterFilterFactory"
generateWordParts="1" generateNumberParts="1" catenateWords="0"
catenateNumbers="0" catenateAll="0" splitOnCaseChange="0" />
<filter class="solr.LowerCaseFilterFactory" />
</analyzer>
</fieldType>
<!--
A general unstemmed text field that indexes tokens normally and
also reversed (via ReversedWildcardFilterFactory), to enable more
efficient leading wildcard queries.
-->
<fieldType name="text_rev" class="solr.TextField"
positionIncrementGap="100">
<analyzer type="index">
<tokenizer class="solr.WhitespaceTokenizerFactory" />
<filter class="solr.StopFilterFactory" ignoreCase="true"
words="stopwords.txt" enablePositionIncrements="true" />
<filter class="solr.WordDelimiterFilterFactory"
generateWordParts="1" generateNumberParts="1" catenateWords="1"
catenateNumbers="1" catenateAll="0" splitOnCaseChange="0" />
<filter class="solr.LowerCaseFilterFactory" />
<filter class="solr.ReversedWildcardFilterFactory"
withOriginal="true" maxPosAsterisk="3" maxPosQuestion="2"
maxFractionAsterisk="0.33" />
</analyzer>
<analyzer type="query">
<tokenizer class="solr.WhitespaceTokenizerFactory" />
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt"
ignoreCase="true" expand="true" />
<filter class="solr.StopFilterFactory" ignoreCase="true"
words="stopwords.txt" enablePositionIncrements="true" />
<filter class="solr.WordDelimiterFilterFactory"
generateWordParts="1" generateNumberParts="1" catenateWords="0"
catenateNumbers="0" catenateAll="0" splitOnCaseChange="0" />
<filter class="solr.LowerCaseFilterFactory" />
</analyzer>
</fieldType>
<!-- charFilter + WhitespaceTokenizer -->
<!--
<fieldType name="textCharNorm" class="solr.TextField"
positionIncrementGap="100" > <analyzer> <charFilter
class="solr.MappingCharFilterFactory"
mapping="mapping-ISOLatin1Accent.txt"/> <tokenizer
class="solr.WhitespaceTokenizerFactory"/> </analyzer> </fieldType>
-->
<!--
This is an example of using the KeywordTokenizer along with
various TokenFilterFactories to produce a sortable field that does
not include some properties of the source text
-->
<fieldType name="alphaOnlySort" class="solr.TextField"
sortMissingLast="true" omitNorms="true">
<analyzer>
<!--
KeywordTokenizer does no actual tokenizing, so the entire
input string is preserved as a single token
-->
<tokenizer class="solr.KeywordTokenizerFactory" />
<!--
The LowerCase TokenFilter does what you expect, which can be useful
when you want your sorting to be case insensitive
-->
<filter class="solr.LowerCaseFilterFactory" />
<!-- The TrimFilter removes any leading or trailing whitespace -->
<filter class="solr.TrimFilterFactory" />
<!--
The PatternReplaceFilter gives you the flexibility to use Java
Regular expression to replace any sequence of characters
matching a pattern with an arbitrary replacement string, which
may include back references to portions of the original string
matched by the pattern. See the Java Regular Expression
documentation for more information on pattern and replacement
string syntax.
http://java.sun.com/j2se/1.5.0/docs/api/java/util/regex/package-summary.html
-->
<filter class="solr.PatternReplaceFilterFactory" pattern="([^a-z])"
replacement="" replace="all" />
</analyzer>
</fieldType>
<fieldtype name="phonetic" stored="false" indexed="true"
class="solr.TextField">
<analyzer>
<tokenizer class="solr.StandardTokenizerFactory" />
<filter class="solr.DoubleMetaphoneFilterFactory" inject="false" />
</analyzer>
</fieldtype>
<fieldtype name="payloads" stored="false" indexed="true"
class="solr.TextField">
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory" />
<!--
The DelimitedPayloadTokenFilter can put payloads on tokens...
for example, a token of "foo|1.4" would be indexed as "foo"
with a payload of 1.4f Attributes of the
DelimitedPayloadTokenFilterFactory : "delimiter" - a one
character delimiter. Default is | (pipe) "encoder" - how to
encode the following value into a payload float ->
org.apache.lucene.analysis.payloads.FloatEncoder, integer ->
o.a.l.a.p.IntegerEncoder identity -> o.a.l.a.p.IdentityEncoder
Fully Qualified class name implementing PayloadEncoder,
Encoder must have a no arg constructor.
-->
<filter class="solr.DelimitedPayloadTokenFilterFactory"
encoder="float" />
</analyzer>
</fieldtype>
<!--
lowercases the entire field value, keeping it as a single token.
-->
<fieldType name="lowercase" class="solr.TextField"
positionIncrementGap="100">
<analyzer>
<tokenizer class="solr.KeywordTokenizerFactory" />
<filter class="solr.LowerCaseFilterFactory" />
</analyzer>
</fieldType>
<!--
since fields of this type are by default not stored or indexed,
any data added to them will be ignored outright.
-->
<fieldtype name="ignored" stored="false" indexed="false"
multiValued="true" class="solr.StrField" />
</types>
<fields>
<!--
Valid attributes for fields: name: mandatory - the name for the
field type: mandatory - the name of a previously defined type from
the <types> section indexed: true if this field should be indexed
(searchable or sortable) stored: true if this field should be
retrievable compressed: [false] if this field should be stored
using gzip compression (this will only apply if the field type is
compressible; among the standard field types, only TextField and
StrField are) multiValued: true if this field may contain multiple
values per document omitNorms: (expert) set to true to omit the
norms associated with this field (this disables length
normalization and index-time boosting for the field, and saves
some memory). Only full-text fields or fields that need an
index-time boost need norms. termVectors: [false] set to true to
store the term vector for a given field. When using MoreLikeThis,
fields used for similarity should be stored for best performance.
termPositions: Store position information with the term vector.
This will increase storage costs. termOffsets: Store offset
information with the term vector. This will increase storage
costs. default: a value that should be used if no value is
specified when adding a document.
-->
<field name="id" type="string" indexed="true" stored="true"
required="true" />
<field name="sku" type="textTight" indexed="true" stored="true"
omitNorms="true" />
<field name="name" type="textgen" indexed="true" stored="true" />
<field name="alphaNameSort" type="alphaOnlySort" indexed="true"
stored="false" />
<field name="manu" type="textgen" indexed="true" stored="true"
omitNorms="true" />
<field name="cat" type="text_ws" indexed="true" stored="true"
multiValued="true" omitNorms="true" />
<field name="features" type="text" indexed="true" stored="true"
multiValued="true" />
<field name="includes" type="text" indexed="true" stored="true"
termVectors="true" termPositions="true" termOffsets="true" />
<field name="weight" type="float" indexed="true" stored="true" />
<field name="price" type="float" indexed="true" stored="true" />
<field name="popularity" type="int" indexed="true" stored="true" />
<field name="inStock" type="boolean" indexed="true" stored="true" />
<!--
Common metadata fields, named specifically to match up with
SolrCell metadata when parsing rich documents such as Word, PDF.
Some fields are multiValued only because Tika currently may return
multiple values for them.
-->
<field name="title" type="text" indexed="true" stored="true"
multiValued="true" />
<field name="subject" type="text" indexed="true" stored="true" />
<field name="description" type="text" indexed="true" stored="true" />
<field name="comments" type="text" indexed="true" stored="true" />
<field name="author" type="textgen" indexed="true" stored="true" />
<field name="keywords" type="textgen" indexed="true" stored="true" />
<field name="category" type="textgen" indexed="true" stored="true" />
<field name="content_type" type="string" indexed="true"
stored="true" multiValued="true" />
<field name="last_modified" type="date" indexed="true" stored="true" />
<field name="links" type="string" indexed="true" stored="true"
multiValued="true" />
<!--
catchall field, containing all other searchable text fields
(implemented via copyField further on in this schema)
-->
<field name="text" type="text" indexed="true" stored="false"
multiValued="true" />
<!--
catchall text field that indexes tokens both normally and in
reverse for efficient leading wildcard queries.
-->
<field name="text_rev" type="text_rev" indexed="true" stored="false"
multiValued="true" />
<!--
non-tokenized version of manufacturer to make it easier to sort or
group results by manufacturer. copied from "manu" via copyField
-->
<field name="manu_exact" type="string" indexed="true" stored="false" />
<field name="payloads" type="payloads" indexed="true" stored="true" />
<!--
Uncommenting the following will create a "timestamp" field using a
default value of "NOW" to indicate when each document was indexed.
-->
<!--
<field name="timestamp" type="date" indexed="true" stored="true"
default="NOW" multiValued="false"/>
-->
<field name="language" type="string" indexed="true" stored="true" required="false"/>
<field name="concept" type="string" indexed="true" stored="true" multiValued="true" required="false"/>
<field name="keyword" type="string" indexed="true" stored="true" multiValued="true" required="false"/>
<field name="suggested_category" type="string" indexed="true" stored="true" multiValued="false" required="false"/>
<field name="sentence" type="text" indexed="true" stored="true" multiValued="true" required="false" />
<dynamicField name="entity*" type="text" indexed="true" stored="true" multiValued="true"/>
<!--
Dynamic field definitions. If a field name is not found,
dynamicFields will be used if the name matches any of the
patterns. RESTRICTION: the glob-like pattern in the name attribute
must have a "*" only at the start or the end. EXAMPLE: name="*_i"
will match any field ending in _i (like myid_i, z_i) Longer
patterns will be matched first. If equal size patterns both match,
the first appearing in the schema will be used. <dynamicField
name="*_i" type="int" indexed="true" stored="true"/> <dynamicField
name="*_s" type="string" indexed="true" stored="true"/>
<dynamicField name="*_l" type="long" indexed="true"
stored="true"/> <dynamicField name="*_t" type="text"
indexed="true" stored="true"/> <dynamicField name="*_b"
type="boolean" indexed="true" stored="true"/> <dynamicField
name="*_f" type="float" indexed="true" stored="true"/>
<dynamicField name="*_d" type="double" indexed="true"
stored="true"/> <dynamicField name="*_dt" type="date"
indexed="true" stored="true"/> <dynamicField name="*_ti"
type="tint" indexed="true" stored="true"/> <dynamicField
name="*_tl" type="tlong" indexed="true" stored="true"/>
<dynamicField name="*_tf" type="tfloat" indexed="true"
stored="true"/> <dynamicField name="*_td" type="tdouble"
indexed="true" stored="true"/> <dynamicField name="*_tdt"
type="tdate" indexed="true" stored="true"/> <dynamicField
name="*_pi" type="pint" indexed="true" stored="true"/>
<dynamicField name="ignored_*" type="ignored" multiValued="true"/>
<dynamicField name="attr_*" type="textgen" indexed="true"
stored="true" multiValued="true"/> <dynamicField name="random_*"
type="random" />
-->
<!--
uncomment the following to ignore any fields that don't already
match an existing field name or dynamic field, rather than
reporting them as an error. alternately, change the type="ignored"
to some other type e.g. "text" if you want unknown fields indexed
and/or stored by default
-->
<!--dynamicField name="*" type="ignored" multiValued="true" /-->
</fields>
<!--
Field to use to determine and enforce document uniqueness. Unless
this field is marked with required="false", it will be a required
field
-->
<uniqueKey>id</uniqueKey>
<!--
field for the QueryParser to use when an explicit fieldname is
absent
-->
<defaultSearchField>text</defaultSearchField>
<!-- SolrQueryParser configuration: defaultOperator="AND|OR" -->
<solrQueryParser defaultOperator="OR" />
<!--
copyField commands copy one field to another at the time a document
is added to the index. It's used either to index the same field
differently, or to add multiple fields to the same field for
easier/faster searching.
-->
<copyField source="cat" dest="text" />
<copyField source="name" dest="text" />
<copyField source="manu" dest="text" />
<copyField source="features" dest="text" />
<copyField source="includes" dest="text" />
<copyField source="manu" dest="manu_exact" />
<!--copyField source="Titolo" dest="text"/-->
<!--
Above, multiple source fields are copied to the [text] field.
Another way to map multiple source fields to the same destination
field is to use the dynamic field syntax. copyField also supports a
maxChars to copy setting.
-->
<!-- <copyField source="*_t" dest="text" maxChars="3000"/> -->
<!--
copy name to alphaNameSort, a field designed for sorting by name
-->
<!-- <copyField source="name" dest="alphaNameSort"/> -->
<!--
Similarity is the scoring routine for each document vs. a query. A
custom similarity may be specified here, but the default is fine for
most applications.
-->
<!--
<similarity class="org.apache.lucene.search.DefaultSimilarity"/>
-->
<!--
... OR ... Specify a SimilarityFactory class name implementation
allowing parameters to be used.
-->
<!--
<similarity class="com.example.solr.CustomSimilarityFactory"> <str
name="paramkey">param value</str> </similarity>
-->
</schema>

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,2 @@
pizza
history

View File

@ -0,0 +1,58 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#-----------------------------------------------------------------------
# a couple of test stopwords to test that the words are really being
# configured from this file:
stopworda
stopwordb
#Standard english stop words taken from Lucene's StopAnalyzer
a
an
and
are
as
at
be
but
by
for
if
in
into
is
it
no
not
of
on
or
s
such
t
that
the
their
then
there
these
they
this
to
was
will
with

View File

@ -0,0 +1,31 @@
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#-----------------------------------------------------------------------
#some test synonym mappings unlikely to appear in real input text
aaa => aaaa
bbb => bbbb1 bbbb2
ccc => cccc1,cccc2
a\=>a => b\=>b
a\,a => b\,b
fooaaa,baraaa,bazaaa
# Some synonym groups specific to this example
GB,gib,gigabyte,gigabytes
MB,mib,megabyte,megabytes
Television, Televisions, TV, TVs
#notice we use "gib" instead of "GiB" so any WordDelimiterFilter coming
#after us won't split it into two words.
# Synonym mappings can be used for spelling correction too
pixima => pixma