mirror of https://github.com/apache/lucene.git
SOLR-2129: Provide a Solr module for dynamic metadata extraction/indexing with Apache UIMA
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1062604 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
68dc071064
commit
6c05d94c93
|
@ -73,6 +73,10 @@
|
|||
<classpathentry kind="src" path="solr/contrib/extraction/src/main/java"/>
|
||||
<classpathentry kind="src" path="solr/contrib/extraction/src/test/java"/>
|
||||
<classpathentry kind="src" path="solr/contrib/extraction/src/test/resources"/>
|
||||
<classpathentry kind="src" path="solr/contrib/uima/src/main/java"/>
|
||||
<classpathentry kind="src" path="solr/contrib/uima/src/main/resources"/>
|
||||
<classpathentry kind="src" path="solr/contrib/uima/src/test/java"/>
|
||||
<classpathentry kind="src" path="solr/contrib/uima/src/test/resources"/>
|
||||
<classpathentry kind="lib" path="lucene/lib/ant-1.7.1.jar"/>
|
||||
<classpathentry kind="lib" path="lucene/lib/ant-junit-1.7.1.jar"/>
|
||||
<classpathentry kind="lib" path="lucene/lib/junit-4.7.jar"/>
|
||||
|
@ -151,6 +155,12 @@
|
|||
<classpathentry kind="lib" path="solr/contrib/extraction/lib/tika-core-0.8.jar"/>
|
||||
<classpathentry kind="lib" path="solr/contrib/extraction/lib/tika-parsers-0.8.jar"/>
|
||||
<classpathentry kind="lib" path="solr/contrib/extraction/lib/xmlbeans-2.3.0.jar"/>
|
||||
<classpathentry kind="lib" path="solr/contrib/uima/lib/commons-digester-2.0.jar"/>
|
||||
<classpathentry kind="lib" path="solr/contrib/uima/lib/uima-an-alchemy.jar"/>
|
||||
<classpathentry kind="lib" path="solr/contrib/uima/lib/uima-an-calais.jar"/>
|
||||
<classpathentry kind="lib" path="solr/contrib/uima/lib/uima-an-tagger.jar"/>
|
||||
<classpathentry kind="lib" path="solr/contrib/uima/lib/uima-an-wst.jar"/>
|
||||
<classpathentry kind="lib" path="solr/contrib/uima/lib/uima-core.jar"/>
|
||||
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
|
||||
<classpathentry kind="output" path="bin"/>
|
||||
</classpath>
|
||||
|
|
|
@ -411,6 +411,9 @@ New Features
|
|||
* SOLR-2188: Added "maxTokenLength" argument to the factories for ClassicTokenizer,
|
||||
StandardTokenizer, and UAX29URLEmailTokenizer. (Steven Rowe)
|
||||
|
||||
* SOLR-2129: Added a Solr module for dynamic metadata extraction/indexing with Apache UIMA.
|
||||
See contrib/uima/README.txt for more information. (Tommaso Teofili via rmuir)
|
||||
|
||||
Optimizations
|
||||
----------------------
|
||||
|
||||
|
|
|
@ -218,6 +218,7 @@
|
|||
<packageset dir="contrib/dataimporthandler/src/main/java" />
|
||||
<packageset dir="contrib/clustering/src/main/java" />
|
||||
<packageset dir="contrib/extraction/src/main/java" />
|
||||
<packageset dir="contrib/uima/src/main/java" />
|
||||
<packageset dir="contrib/analysis-extras/src/java" />
|
||||
<group title="Core" packages="org.apache.*" />
|
||||
<group title="Common" packages="org.apache.solr.common.*" />
|
||||
|
@ -225,6 +226,7 @@
|
|||
<group title="contrib: DataImportHandler" packages="org.apache.solr.handler.dataimport*" />
|
||||
<group title="contrib: Clustering" packages="org.apache.solr.handler.clustering*" />
|
||||
<group title="contrib: Solr Cell" packages="org.apache.solr.handler.extraction*" />
|
||||
<group title="contrib: Solr UIMA" packages="org.apache.solr.uima*" />
|
||||
</sources>
|
||||
</invoke-javadoc>
|
||||
</sequential>
|
||||
|
@ -514,6 +516,7 @@
|
|||
<fileset dir="contrib/dataimporthandler/src/main/java" />
|
||||
<fileset dir="contrib/clustering/src/main/java" />
|
||||
<fileset dir="contrib/extraction/src/main/java" />
|
||||
<fileset dir="contrib/uima/src/main/java" />
|
||||
<fileset dir="contrib/analysis-extras/src/java" />
|
||||
</clover-setup>
|
||||
</target>
|
||||
|
@ -617,6 +620,10 @@
|
|||
basedir="contrib/clustering/src" />
|
||||
<solr-jar destfile="${dist}/apache-solr-analysis-extras-src-${version}.jar"
|
||||
basedir="contrib/analysis-extras/src" />
|
||||
<solr-jar destfile="${dist}/apache-solr-uima-src-${version}.jar"
|
||||
basedir="contrib/uima/src/main/java" >
|
||||
<fileset dir="contrib/uima/src/main/resources" />
|
||||
</solr-jar>
|
||||
</target>
|
||||
|
||||
<target name="dist-javadoc" description="Creates the Solr javadoc distribution files"
|
||||
|
@ -635,6 +642,8 @@
|
|||
basedir="${build.javadoc}/contrib-solr-cell" />
|
||||
<solr-jar destfile="${dist}/apache-solr-analysis-extras-docs-${version}.jar"
|
||||
basedir="${build.javadoc}/contrib-solr-analysis-extras" />
|
||||
<solr-jar destfile="${dist}/apache-solr-uima-docs-${version}.jar"
|
||||
basedir="${build.javadoc}/contrib-solr-uima" />
|
||||
</target>
|
||||
|
||||
<!-- Creates the solr jar. -->
|
||||
|
@ -731,7 +740,7 @@
|
|||
<tarfileset dir="."
|
||||
prefix="${fullnamever}"
|
||||
includes="LICENSE.txt NOTICE.txt *.txt *.xml lucene-libs/** lib/** src/** example/** client/** contrib/"
|
||||
excludes="lib/README.committers.txt **/data/ **/logs/* **/classes/ **/*.sh **/bin/ src/scripts/ src/site/build/ **/target/ client/ruby/flare/ client/python contrib/**/build/ **/*.iml **/*.ipr **/*.iws contrib/clustering/example/lib/** contrib/clustering/lib/downloads/** contrib/analysis-extras/lib/**" />
|
||||
excludes="lib/README.committers.txt **/data/ **/logs/* **/classes/ **/*.sh **/bin/ src/scripts/ src/site/build/ **/target/ client/ruby/flare/ client/python contrib/**/build/ **/*.iml **/*.ipr **/*.iws contrib/clustering/example/lib/** contrib/clustering/lib/downloads/** contrib/analysis-extras/lib/** contrib/uima/lib/**" />
|
||||
<tarfileset dir="."
|
||||
prefix="${fullnamever}"
|
||||
includes="src/test-files/solr/lib/classes/empty-file-main-lib.txt" />
|
||||
|
@ -903,6 +912,14 @@
|
|||
</artifact-attachments>
|
||||
</m2-deploy>
|
||||
|
||||
<!-- Deploys the Solr UIMA contrib jar together with its sources and javadoc jars.
     The POM template is referenced under contrib/uima (the module's own directory)
     instead of contrib/extraction, which belongs to the Solr Cell module.
     NOTE(review): confirm the template file is checked in at exactly this path. -->
<m2-deploy pom.xml="contrib/uima/solr-uima-pom.xml.template"
           jar.file="${dist}/apache-solr-uima-${version}.jar">
  <artifact-attachments>
    <attach file="${dist}/apache-solr-uima-src-${version}.jar" classifier="sources"/>
    <attach file="${dist}/apache-solr-uima-docs-${version}.jar" classifier="javadoc"/>
  </artifact-attachments>
</m2-deploy>
|
||||
|
||||
<m2-deploy pom.xml="src/pom.xml"
|
||||
jar.file="${dist}/apache-solr-core-${version}.jar">
|
||||
<artifact-attachments>
|
||||
|
@ -952,6 +969,8 @@
|
|||
<fileset dir="contrib/extraction/src/test/java"/>
|
||||
<fileset dir="contrib/analysis-extras/src/test"/>
|
||||
<fileset dir="contrib/analysis-extras/src/test"/>
|
||||
<fileset dir="contrib/uima/src/main/java"/>
|
||||
<fileset dir="contrib/uima/src/test/java"/>
|
||||
</rat:report>
|
||||
</target>
|
||||
|
||||
|
|
|
@ -0,0 +1,17 @@
|
|||
Apache Solr UIMA Metadata Extraction Library
|
||||
Release Notes
|
||||
|
||||
This file describes changes to the Solr UIMA (contrib/uima) module. See SOLR-2129 for details.
|
||||
|
||||
Introduction
|
||||
------------
|
||||
This module is intended to be used while indexing documents.
|
||||
Its purpose is to add automatically generated fields to the Solr index on the fly.
|
||||
Such fields could be language, concepts, keywords, sentences, named entities, etc.
|
||||
|
||||
UIMA Dependency
|
||||
---------------
|
||||
uima-core, OpenCalaisAnnotator, WhitespaceTokenizer, HMMTagger, AlchemyAPIAnnotator
|
||||
Current Version: 2.3.1-SNAPSHOT rev. 999276
|
||||
|
||||
$Id$
|
|
@ -0,0 +1,60 @@
|
|||
Getting Started
|
||||
---------------
|
||||
To start using Solr UIMA Metadata Extraction Library you should go through the following configuration steps:
|
||||
|
||||
1. copy generated solr-uima jar and its libs (under contrib/uima/lib) inside a Solr libraries directory.
|
||||
|
||||
2. modify your schema.xml adding the fields you want to hold metadata, specifying proper values for the type, indexed, stored and multiValued options:
|
||||
|
||||
3. for example you could specify the following
|
||||
<field name="language" type="string" indexed="true" stored="true" required="false"/>
|
||||
<field name="concept" type="string" indexed="true" stored="true" multiValued="true" required="false"/>
|
||||
<field name="sentence" type="text" indexed="true" stored="true" multiValued="true" required="false" />
|
||||
|
||||
4. modify your solrconfig.xml adding the following snippet:
|
||||
<uimaConfig>
|
||||
<runtimeParameters>
|
||||
<keyword_apikey>VALID_ALCHEMYAPI_KEY</keyword_apikey>
|
||||
<concept_apikey>VALID_ALCHEMYAPI_KEY</concept_apikey>
|
||||
<lang_apikey>VALID_ALCHEMYAPI_KEY</lang_apikey>
|
||||
<cat_apikey>VALID_ALCHEMYAPI_KEY</cat_apikey>
|
||||
<entities_apikey>VALID_ALCHEMYAPI_KEY</entities_apikey>
|
||||
<oc_licenseID>VALID_OPENCALAIS_KEY</oc_licenseID>
|
||||
</runtimeParameters>
|
||||
<analysisEngine>/org/apache/uima/desc/OverridingParamsExtServicesAE.xml</analysisEngine>
|
||||
<analyzeFields merge="false">text</analyzeFields>
|
||||
<fieldMapping>
|
||||
<type name="org.apache.uima.alchemy.ts.concept.ConceptFS">
|
||||
<map feature="text" field="concept"/>
|
||||
</type>
|
||||
<type name="org.apache.uima.alchemy.ts.language.LanguageFS">
|
||||
<map feature="language" field="language"/>
|
||||
</type>
|
||||
<type name="org.apache.uima.SentenceAnnotation">
|
||||
<map feature="coveredText" field="sentence"/>
|
||||
</type>
|
||||
</fieldMapping>
|
||||
</uimaConfig>
|
||||
|
||||
5. the analysisEngine tag must contain an AE descriptor inside the specified path in the classpath
|
||||
|
||||
6. the analyzeFields tag must contain the input fields that need to be analyzed by UIMA,
|
||||
if merge=true then their content will be merged and analyzed only once
|
||||
|
||||
7. field mapping describes which features of which types should go in a field
|
||||
|
||||
8. define in your solrconfig.xml an UpdateRequestProcessorChain as follows:
|
||||
<updateRequestProcessorChain name="uima">
|
||||
<processor class="org.apache.solr.uima.processor.UIMAProcessorFactory"/>
|
||||
<processor class="solr.LogUpdateProcessorFactory" />
|
||||
<processor class="solr.RunUpdateProcessorFactory" />
|
||||
</updateRequestProcessorChain>
|
||||
|
||||
9. in your solrconfig.xml replace the existing default (<requestHandler name="/update"...) or create a new UpdateRequestHandler with the following:
|
||||
<requestHandler name="/update" class="solr.XmlUpdateRequestHandler">
|
||||
<lst name="defaults">
|
||||
<str name="update.processor">uima</str>
|
||||
</lst>
|
||||
</requestHandler>
|
||||
|
||||
Once you're done with the configuration you can index documents which will be automatically enriched with the specified fields
|
|
@ -0,0 +1,189 @@
|
|||
<?xml version="1.0"?>
|
||||
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<project name="solr-uima" default="build">
|
||||
|
||||
<property name="solr-path" value="../.." />
|
||||
|
||||
<import file="../../common-build.xml"/>
|
||||
|
||||
<description>
|
||||
Solr Integration with UIMA for extracting metadata from arbitrary (text) fields and enrich document with features extracted from UIMA types (language, sentences, concepts, named entities, etc.)
|
||||
</description>
|
||||
|
||||
<path id="common.classpath">
|
||||
<pathelement location="${solr-path}/build/solr" />
|
||||
<pathelement location="${solr-path}/build/solrj" />
|
||||
<fileset dir="lib" includes="*.jar"/>
|
||||
<fileset dir="${solr-path}/lib" includes="*.jar"/>
|
||||
<path refid="lucene.classpath"/>
|
||||
<pathelement location="${basedir}/src/main/resources" />
|
||||
</path>
|
||||
|
||||
<path id="test.classpath">
|
||||
<path refid="common.classpath" />
|
||||
<pathelement path="${dest}/classes" />
|
||||
<pathelement path="${dest}/test-classes" />
|
||||
<pathelement location="${solr-path}/build/tests"/> <!-- include solr test code -->
|
||||
<pathelement location="${solr-path}/../lucene/build/classes/test" /> <!-- include some lucene test code -->
|
||||
<pathelement path="${java.class.path}"/>
|
||||
</path>
|
||||
|
||||
<target name="clean">
|
||||
<delete failonerror="false" dir="${dest}"/>
|
||||
</target>
|
||||
|
||||
<target name="init">
|
||||
<mkdir dir="${dest}/classes"/>
|
||||
<mkdir dir="${build.javadoc}" />
|
||||
<subant target="compileTests">
|
||||
<fileset dir="${solr-path}" includes="build.xml"/>
|
||||
</subant>
|
||||
<subant target="make-manifest">
|
||||
<fileset dir="${solr-path}" includes="build.xml"/>
|
||||
</subant>
|
||||
</target>
|
||||
|
||||
<target name="compile" depends="init">
|
||||
<solr-javac destdir="${dest}/classes"
|
||||
classpathref="common.classpath">
|
||||
<src path="src/main/java" />
|
||||
</solr-javac>
|
||||
</target>
|
||||
|
||||
<target name="build" depends="compile">
|
||||
<solr-jar destfile="${dest}/${fullnamever}.jar" basedir="${dest}/classes"
|
||||
manifest="../../${dest}/META-INF/MANIFEST.MF">
|
||||
<fileset dir="src/main/resources" />
|
||||
</solr-jar>
|
||||
</target>
|
||||
|
||||
<target name="compileTests" depends="compile">
|
||||
<solr-javac destdir="${dest}/test-classes"
|
||||
classpathref="test.classpath">
|
||||
<src path="src/test/java" />
|
||||
</solr-javac>
|
||||
<copy todir="${dest}/test-classes">
|
||||
<fileset dir="src/test/resources" excludes="**/*.java"/>
|
||||
</copy>
|
||||
</target>
|
||||
|
||||
<property name="tempDir" value="${junit.output.dir}/temp" />
|
||||
|
||||
<target name="test" depends="compileTests">
|
||||
<mkdir dir="${junit.output.dir}"/>
|
||||
<!-- <mkdir dir="@{tempDir}/@{pattern}"/>
|
||||
This is very loud and obnoxious. abuse touch instead for a "quiet" mkdir
|
||||
-->
|
||||
<touch file="${tempDir}/quiet.ant" verbose="false" mkdirs="true"/>
|
||||
|
||||
<condition property="runall">
|
||||
<not>
|
||||
<or>
|
||||
<isset property="testcase"/>
|
||||
<isset property="testpackage"/>
|
||||
<isset property="testpackageroot"/>
|
||||
</or>
|
||||
</not>
|
||||
</condition>
|
||||
|
||||
<junit printsummary="no"
|
||||
haltonfailure="no"
|
||||
maxmemory="512M"
|
||||
errorProperty="tests.failed"
|
||||
failureProperty="tests.failed"
|
||||
dir="${tempDir}"
|
||||
tempdir="${tempDir}"
|
||||
forkmode="perBatch"
|
||||
>
|
||||
<sysproperty key="java.util.logging.config.file" value="${common-solr.dir}/testlogging.properties"/>
|
||||
<sysproperty key="tests.luceneMatchVersion" value="${tests.luceneMatchVersion}"/>
|
||||
<sysproperty key="tests.codec" value="${tests.codec}"/>
|
||||
<sysproperty key="tests.locale" value="${tests.locale}"/>
|
||||
<sysproperty key="tests.timezone" value="${tests.timezone}"/>
|
||||
<sysproperty key="tests.multiplier" value="${tests.multiplier}"/>
|
||||
<sysproperty key="tests.iter" value="${tests.iter}"/>
|
||||
<sysproperty key="tests.seed" value="${tests.seed}"/>
|
||||
<sysproperty key="jetty.insecurerandom" value="1"/>
|
||||
<sysproperty key="tempDir" file="${tempDir}"/>
|
||||
<sysproperty key="testmethod" value="${testmethod}"/>
|
||||
<jvmarg line="${args}"/>
|
||||
<formatter classname="${junit.details.formatter}" usefile="false" if="junit.details"/>
|
||||
<classpath refid="test.classpath"/>
|
||||
<assertions>
|
||||
<enable package="org.apache.lucene"/>
|
||||
<enable package="org.apache.solr"/>
|
||||
</assertions>
|
||||
<formatter type="${junit.formatter}"/>
|
||||
<batchtest fork="yes" todir="${junit.output.dir}" if="runall">
|
||||
<fileset dir="src/test/java" includes="${junit.includes}"/>
|
||||
</batchtest>
|
||||
<batchtest fork="yes" todir="${junit.output.dir}" if="testcase">
|
||||
<fileset dir="src/test/java" includes="**/${testcase}.java"/>
|
||||
</batchtest>
|
||||
<batchtest fork="yes" todir="${junit.output.dir}" if="testpackage">
|
||||
<fileset dir="src/test/java" includes="**/${testpackage}/**/Test*.java,**/${testpackage}/**/*Test.java"/>
|
||||
</batchtest>
|
||||
<batchtest fork="yes" todir="${junit.output.dir}" if="testpackageroot">
|
||||
<fileset dir="src/test/java" includes="**/${testpackageroot}/Test*.java,**/${testpackageroot}/*Test.java"/>
|
||||
</batchtest>
|
||||
</junit>
|
||||
|
||||
<fail if="tests.failed">Tests failed!</fail>
|
||||
</target>
|
||||
|
||||
<target name="test-reports"
|
||||
description="Generates HTML test reports.">
|
||||
<mkdir dir="${junit.reports}"/>
|
||||
<junitreport todir="${junit.output.dir}">
|
||||
<fileset dir="${junit.output.dir}">
|
||||
<include name="TEST-*.xml"/>
|
||||
</fileset>
|
||||
<report format="frames" todir="${junit.reports}"/>
|
||||
</junitreport>
|
||||
</target>
|
||||
|
||||
<!-- Copies the module jar produced by the "build" target into the Solr dist directory. -->
<target name="dist" depends="build">
  <!-- Read the jar from ${dest}, the same location the "build" target writes it to,
       rather than from a hard-coded "build/" directory, so both targets stay in
       agreement if ${dest} is ever overridden. -->
  <copy file="${dest}/${fullnamever}.jar" todir="${solr-path}/dist"/>
</target>
|
||||
|
||||
<target name="example" depends="build">
|
||||
<!-- :NOOP: this used to copy libs but now we can refer to them by path -->
|
||||
</target>
|
||||
|
||||
<target name="javadoc">
|
||||
<sequential>
|
||||
<mkdir dir="${build.javadoc}/contrib-${name}"/>
|
||||
|
||||
<path id="javadoc.classpath">
|
||||
<path refid="common.classpath"/>
|
||||
</path>
|
||||
|
||||
<invoke-javadoc
|
||||
destdir="${build.javadoc}/contrib-${name}"
|
||||
title="${Name} ${version} contrib-${fullnamever} API">
|
||||
<sources>
|
||||
<packageset dir="src/main/java"/>
|
||||
</sources>
|
||||
</invoke-javadoc>
|
||||
</sequential>
|
||||
</target>
|
||||
|
||||
|
||||
</project>
|
|
@ -0,0 +1,2 @@
|
|||
AnyObjectId[9c8bd13a2002a9ff5b35b873b9f111d5281ad201] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -0,0 +1,2 @@
|
|||
AnyObjectId[532939ecab6b77ccb77af3635c55ff9752b70ab7] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -0,0 +1,2 @@
|
|||
AnyObjectId[33165678da937e03cb069449b40f1cf690beda0a] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -0,0 +1,2 @@
|
|||
AnyObjectId[5dfc32bce5e444a9bb3387d664485f7bfdc438ad] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -0,0 +1,2 @@
|
|||
AnyObjectId[bf90c19d2c1f77e300b94363385841ec1225b4b9] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -0,0 +1,2 @@
|
|||
AnyObjectId[9518da64cdf5d378273ab40a06823a7768f18ece] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -0,0 +1,2 @@
|
|||
AnyObjectId[72991424bdfe4776f66feab7ff4e8564f12d2659] was removed in git history.
|
||||
Apache SVN contains full history.
|
|
@ -0,0 +1,115 @@
|
|||
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
|
||||
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one
|
||||
or more contributor license agreements. See the NOTICE file
|
||||
distributed with this work for additional information
|
||||
regarding copyright ownership. The ASF licenses this file
|
||||
to you under the Apache License, Version 2.0 (the
|
||||
"License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing,
|
||||
software distributed under the License is distributed on an
|
||||
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations
|
||||
under the License.
|
||||
-->
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
<groupId>org.apache.solr</groupId>
|
||||
<artifactId>solr-uima</artifactId>
|
||||
<version>0.0.2-SNAPSHOT</version>
|
||||
<name>Solr - UIMA integration</name>
|
||||
<properties>
|
||||
<uimaVersion>2.3.1-SNAPSHOT</uimaVersion>
|
||||
</properties>
|
||||
<dependencies>
|
||||
<dependency>
|
||||
<groupId>org.apache.solr</groupId>
|
||||
<artifactId>solr-core</artifactId>
|
||||
<version>1.4.1</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.uima</groupId>
|
||||
<artifactId>uimaj-core</artifactId>
|
||||
<version>${uimaVersion}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.uima</groupId>
|
||||
<artifactId>alchemy-annotator</artifactId>
|
||||
<version>${uimaVersion}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.uima</groupId>
|
||||
<artifactId>OpenCalaisAnnotator</artifactId>
|
||||
<version>${uimaVersion}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>junit</groupId>
|
||||
<artifactId>junit</artifactId>
|
||||
<version>4.7</version>
|
||||
<type>jar</type>
|
||||
<scope>test</scope>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.slf4j</groupId>
|
||||
<artifactId>slf4j-simple</artifactId>
|
||||
<version>1.5.5</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.uima</groupId>
|
||||
<artifactId>WhitespaceTokenizer</artifactId>
|
||||
<version>${uimaVersion}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.uima</groupId>
|
||||
<artifactId>Tagger</artifactId>
|
||||
<version>${uimaVersion}</version>
|
||||
</dependency>
|
||||
</dependencies>
|
||||
<build>
|
||||
<pluginManagement>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>com.googlecode.maven-gcu-plugin</groupId>
|
||||
<artifactId>maven-gcu-plugin</artifactId>
|
||||
<version>1.0</version>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</pluginManagement>
|
||||
<plugins>
|
||||
<plugin>
|
||||
<groupId>org.apache.maven.plugins</groupId>
|
||||
<artifactId>maven-compiler-plugin</artifactId>
|
||||
<version>2.3.1</version>
|
||||
<configuration>
|
||||
<source>1.5</source>
|
||||
<target>1.5</target>
|
||||
</configuration>
|
||||
</plugin>
|
||||
<plugin>
|
||||
<groupId>com.googlecode.maven-gcu-plugin</groupId>
|
||||
<artifactId>maven-gcu-plugin</artifactId>
|
||||
<version>1.0</version>
|
||||
<configuration>
|
||||
<serverId>googlecode</serverId>
|
||||
<failsOnError>true</failsOnError>
|
||||
<projectName>${project.artifactId}</projectName>
|
||||
<uploads>
|
||||
<upload>
|
||||
<file>${project.build.directory}/${project.artifactId}-${project.version}.${project.packaging}</file>
|
||||
<summary>${project.name} sources bundle ${project.version}</summary>
|
||||
<labels>
|
||||
<label>Featured</label>
|
||||
<label>Type-Archive</label>
|
||||
</labels>
|
||||
</upload>
|
||||
</uploads>
|
||||
</configuration>
|
||||
</plugin>
|
||||
</plugins>
|
||||
</build>
|
||||
</project>
|
|
@ -0,0 +1,69 @@
|
|||
package org.apache.solr.uima.processor;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Configuration holding all the configurable parameters for calling UIMA inside Solr
|
||||
*
|
||||
* @version $Id$
|
||||
*/
|
||||
public class SolrUIMAConfiguration {
|
||||
|
||||
private String[] fieldsToAnalyze;
|
||||
|
||||
private boolean fieldsMerging;
|
||||
|
||||
private Map<String, Map<String, String>> typesFeaturesFieldsMapping;
|
||||
|
||||
private String aePath;
|
||||
|
||||
private Map<String, String> runtimeParameters;
|
||||
|
||||
public SolrUIMAConfiguration(String aePath, String[] fieldsToAnalyze, boolean fieldsMerging,
|
||||
Map<String, Map<String, String>> typesFeaturesFieldsMapping,
|
||||
Map<String, String> runtimeParameters) {
|
||||
this.aePath = aePath;
|
||||
this.fieldsToAnalyze = fieldsToAnalyze;
|
||||
this.fieldsMerging = fieldsMerging;
|
||||
this.runtimeParameters = runtimeParameters;
|
||||
this.typesFeaturesFieldsMapping = typesFeaturesFieldsMapping;
|
||||
}
|
||||
|
||||
public String[] getFieldsToAnalyze() {
|
||||
return fieldsToAnalyze;
|
||||
}
|
||||
|
||||
public boolean isFieldsMerging() {
|
||||
return fieldsMerging;
|
||||
}
|
||||
|
||||
public Map<String, Map<String, String>> getTypesFeaturesFieldsMapping() {
|
||||
return typesFeaturesFieldsMapping;
|
||||
}
|
||||
|
||||
public String getAePath() {
|
||||
return aePath;
|
||||
}
|
||||
|
||||
public Map<String, String> getRuntimeParameters() {
|
||||
return runtimeParameters;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,125 @@
|
|||
package org.apache.solr.uima.processor;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.solr.core.SolrConfig;
|
||||
import org.w3c.dom.Node;
|
||||
import org.w3c.dom.NodeList;
|
||||
|
||||
/**
|
||||
* Read configuration for Solr-UIMA integration
|
||||
*
|
||||
* @version $Id$
|
||||
*
|
||||
*/
|
||||
public class SolrUIMAConfigurationReader {
|
||||
|
||||
private static final String AE_RUNTIME_PARAMETERS_NODE_PATH = "/config/uimaConfig/runtimeParameters";
|
||||
|
||||
private static final String FIELD_MAPPING_NODE_PATH = "/config/uimaConfig/fieldMapping";
|
||||
|
||||
private static final String ANALYZE_FIELDS_NODE_PATH = "/config/uimaConfig/analyzeFields";
|
||||
|
||||
private static final String ANALYSIS_ENGINE_NODE_PATH = "/config/uimaConfig/analysisEngine";
|
||||
|
||||
private SolrConfig solrConfig;
|
||||
|
||||
public SolrUIMAConfigurationReader(SolrConfig solrConfig) {
|
||||
this.solrConfig = solrConfig;
|
||||
}
|
||||
|
||||
public SolrUIMAConfiguration readSolrUIMAConfiguration() {
|
||||
return new SolrUIMAConfiguration(readAEPath(), readFieldsToAnalyze(), readFieldsMerging(),
|
||||
readTypesFeaturesFieldsMapping(), readAEOverridingParameters());
|
||||
}
|
||||
|
||||
private String readAEPath() {
|
||||
return solrConfig.getNode(ANALYSIS_ENGINE_NODE_PATH, true).getTextContent();
|
||||
}
|
||||
|
||||
private String[] readFieldsToAnalyze() {
|
||||
Node analyzeFieldsNode = solrConfig.getNode(ANALYZE_FIELDS_NODE_PATH, true);
|
||||
return analyzeFieldsNode.getTextContent().split(",");
|
||||
}
|
||||
|
||||
private boolean readFieldsMerging() {
|
||||
Node analyzeFieldsNode = solrConfig.getNode(ANALYZE_FIELDS_NODE_PATH, true);
|
||||
Node mergeNode = analyzeFieldsNode.getAttributes().getNamedItem("merge");
|
||||
return Boolean.valueOf(mergeNode.getNodeValue());
|
||||
}
|
||||
|
||||
private Map<String, Map<String, String>> readTypesFeaturesFieldsMapping() {
|
||||
Map<String, Map<String, String>> map = new HashMap<String, Map<String, String>>();
|
||||
|
||||
Node fieldMappingNode = solrConfig.getNode(FIELD_MAPPING_NODE_PATH, true);
|
||||
/* iterate over UIMA types */
|
||||
if (fieldMappingNode.hasChildNodes()) {
|
||||
NodeList typeNodes = fieldMappingNode.getChildNodes();
|
||||
for (int i = 0; i < typeNodes.getLength(); i++) {
|
||||
/* <type> node */
|
||||
Node typeNode = typeNodes.item(i);
|
||||
if (typeNode.getNodeType() != Node.TEXT_NODE) {
|
||||
Node typeNameAttribute = typeNode.getAttributes().getNamedItem("name");
|
||||
/* get a UIMA typename */
|
||||
String typeName = typeNameAttribute.getNodeValue();
|
||||
/* create entry for UIMA type */
|
||||
map.put(typeName, new HashMap<String, String>());
|
||||
if (typeNode.hasChildNodes()) {
|
||||
/* iterate over features */
|
||||
NodeList featuresNodeList = typeNode.getChildNodes();
|
||||
for (int j = 0; j < featuresNodeList.getLength(); j++) {
|
||||
Node mappingNode = featuresNodeList.item(j);
|
||||
if (mappingNode.getNodeType() != Node.TEXT_NODE) {
|
||||
/* get field name */
|
||||
Node fieldNameNode = mappingNode.getAttributes().getNamedItem("field");
|
||||
String mappedFieldName = fieldNameNode.getNodeValue();
|
||||
/* get feature name */
|
||||
Node featureNameNode = mappingNode.getAttributes().getNamedItem("feature");
|
||||
String featureName = featureNameNode.getNodeValue();
|
||||
/* map the feature to the field for the specified type */
|
||||
map.get(typeName).put(featureName, mappedFieldName);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return map;
|
||||
}
|
||||
|
||||
private Map<String, String> readAEOverridingParameters() {
|
||||
Map<String, String> runtimeParameters = new HashMap<String, String>();
|
||||
Node uimaConfigNode = solrConfig.getNode(AE_RUNTIME_PARAMETERS_NODE_PATH, true);
|
||||
|
||||
if (uimaConfigNode.hasChildNodes()) {
|
||||
NodeList overridingNodes = uimaConfigNode.getChildNodes();
|
||||
for (int i = 0; i < overridingNodes.getLength(); i++) {
|
||||
Node overridingNode = overridingNodes.item(i);
|
||||
if (overridingNode.getNodeType() != Node.TEXT_NODE) {
|
||||
runtimeParameters.put(overridingNode.getNodeName(), overridingNode.getTextContent());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return runtimeParameters;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,83 @@
|
|||
package org.apache.solr.uima.processor;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.solr.common.SolrInputDocument;
|
||||
import org.apache.uima.cas.FSIterator;
|
||||
import org.apache.uima.cas.FeatureStructure;
|
||||
import org.apache.uima.cas.Type;
|
||||
import org.apache.uima.jcas.JCas;
|
||||
import org.apache.uima.jcas.tcas.Annotation;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
/**
|
||||
* Map UIMA types and features over fields of a Solr document
|
||||
*
|
||||
* @version $Id$
|
||||
*/
|
||||
public class UIMAToSolrMapper {
|
||||
|
||||
private final Logger log = LoggerFactory.getLogger(UIMAToSolrMapper.class);
|
||||
|
||||
private SolrInputDocument document;
|
||||
|
||||
private JCas cas;
|
||||
|
||||
public UIMAToSolrMapper(SolrInputDocument document, JCas cas) {
|
||||
this.document = document;
|
||||
this.cas = cas;
|
||||
}
|
||||
|
||||
/**
|
||||
* map features of a certain UIMA type to corresponding Solr fields based on the mapping
|
||||
*
|
||||
* @param typeName
|
||||
* name of UIMA type to map
|
||||
* @param featureFieldsmapping
|
||||
*/
|
||||
public void map(String typeName, Map<String, String> featureFieldsmapping) {
|
||||
try {
|
||||
FeatureStructure fsMock = (FeatureStructure) Class.forName(typeName).getConstructor(
|
||||
JCas.class).newInstance(cas);
|
||||
Type type = fsMock.getType();
|
||||
for (FSIterator<FeatureStructure> iterator = cas.getFSIndexRepository().getAllIndexedFS(type); iterator
|
||||
.hasNext();) {
|
||||
FeatureStructure fs = iterator.next();
|
||||
for (String featureName : featureFieldsmapping.keySet()) {
|
||||
String fieldName = featureFieldsmapping.get(featureName);
|
||||
log.info(new StringBuffer("mapping ").append(typeName).append("@").append(featureName)
|
||||
.append(" to ").append(fieldName).toString());
|
||||
String featureValue = null;
|
||||
if (fs instanceof Annotation && "coveredText".equals(featureName)) {
|
||||
featureValue = ((Annotation) fs).getCoveredText();
|
||||
} else {
|
||||
featureValue = fs.getFeatureValueAsString(type.getFeatureByBaseName(featureName));
|
||||
}
|
||||
log.info(new StringBuffer("writing ").append(featureValue).append(" in ").append(
|
||||
fieldName).toString());
|
||||
document.addField(fieldName, featureValue, 1.0f);
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
log.error(e.getLocalizedMessage());
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,126 @@
|
|||
package org.apache.solr.uima.processor;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.solr.common.SolrInputDocument;
|
||||
import org.apache.solr.core.SolrCore;
|
||||
import org.apache.solr.uima.processor.ae.AEProvider;
|
||||
import org.apache.solr.uima.processor.ae.AEProviderFactory;
|
||||
import org.apache.solr.update.AddUpdateCommand;
|
||||
import org.apache.solr.update.processor.UpdateRequestProcessor;
|
||||
import org.apache.uima.UIMAException;
|
||||
import org.apache.uima.analysis_engine.AnalysisEngine;
|
||||
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
|
||||
import org.apache.uima.jcas.JCas;
|
||||
import org.apache.uima.resource.ResourceInitializationException;
|
||||
|
||||
/**
|
||||
* Update document(s) to be indexed with UIMA extracted information
|
||||
*
|
||||
* @version $Id$
|
||||
*/
|
||||
public class UIMAUpdateRequestProcessor extends UpdateRequestProcessor {
|
||||
|
||||
private SolrUIMAConfiguration solrUIMAConfiguration;
|
||||
|
||||
private AEProvider aeProvider;
|
||||
|
||||
public UIMAUpdateRequestProcessor(UpdateRequestProcessor next, SolrCore solrCore) {
|
||||
super(next);
|
||||
initialize(solrCore);
|
||||
}
|
||||
|
||||
private void initialize(SolrCore solrCore) {
|
||||
SolrUIMAConfigurationReader uimaConfigurationReader = new SolrUIMAConfigurationReader(solrCore
|
||||
.getSolrConfig());
|
||||
solrUIMAConfiguration = uimaConfigurationReader.readSolrUIMAConfiguration();
|
||||
aeProvider = AEProviderFactory.getInstance().getAEProvider(solrCore.getName(),
|
||||
solrUIMAConfiguration.getAePath(), solrUIMAConfiguration.getRuntimeParameters());
|
||||
}
|
||||
|
||||
public void processAdd(AddUpdateCommand cmd) throws IOException {
|
||||
try {
|
||||
/* get Solr document */
|
||||
SolrInputDocument solrInputDocument = cmd.getSolrInputDocument();
|
||||
|
||||
/* get the fields to analyze */
|
||||
for (String text : getTextsToAnalyze(solrInputDocument)) {
|
||||
if (text != null && !"".equals(text)) {
|
||||
/* process the text value */
|
||||
JCas jcas = processText(text);
|
||||
|
||||
UIMAToSolrMapper uimaToSolrMapper = new UIMAToSolrMapper(solrInputDocument, jcas);
|
||||
/* get field mapping from config */
|
||||
Map<String, Map<String, String>> typesAndFeaturesFieldsMap = solrUIMAConfiguration
|
||||
.getTypesFeaturesFieldsMapping();
|
||||
/* map type features on fields */
|
||||
for (String typeFQN : typesAndFeaturesFieldsMap.keySet()) {
|
||||
uimaToSolrMapper.map(typeFQN, typesAndFeaturesFieldsMap.get(typeFQN));
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (UIMAException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
super.processAdd(cmd);
|
||||
}
|
||||
|
||||
/*
|
||||
* get the texts to analyze from the corresponding fields
|
||||
*/
|
||||
private String[] getTextsToAnalyze(SolrInputDocument solrInputDocument) {
|
||||
String[] fieldsToAnalyze = solrUIMAConfiguration.getFieldsToAnalyze();
|
||||
boolean merge = solrUIMAConfiguration.isFieldsMerging();
|
||||
String[] textVals = null;
|
||||
if (merge) {
|
||||
StringBuilder unifiedText = new StringBuilder("");
|
||||
for (int i = 0; i < fieldsToAnalyze.length; i++) {
|
||||
unifiedText.append(String.valueOf(solrInputDocument.getFieldValue(fieldsToAnalyze[i])));
|
||||
}
|
||||
textVals = new String[1];
|
||||
textVals[0] = unifiedText.toString();
|
||||
} else {
|
||||
textVals = new String[fieldsToAnalyze.length];
|
||||
for (int i = 0; i < fieldsToAnalyze.length; i++) {
|
||||
textVals[i] = String.valueOf(solrInputDocument.getFieldValue(fieldsToAnalyze[i]));
|
||||
}
|
||||
}
|
||||
return textVals;
|
||||
}
|
||||
|
||||
/* process a field value executing UIMA the CAS containing it as document text */
|
||||
private JCas processText(String textFieldValue) throws ResourceInitializationException,
|
||||
AnalysisEngineProcessException {
|
||||
log.info(new StringBuffer("Analazying text").toString());
|
||||
/* get the UIMA analysis engine */
|
||||
AnalysisEngine ae = aeProvider.getAE();
|
||||
|
||||
/* create a JCas which contain the text to analyze */
|
||||
JCas jcas = ae.newJCas();
|
||||
jcas.setDocumentText(textFieldValue);
|
||||
|
||||
/* perform analysis on text field */
|
||||
ae.process(jcas);
|
||||
log.info(new StringBuilder("Text processing completed").toString());
|
||||
return jcas;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,37 @@
|
|||
package org.apache.solr.uima.processor;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.solr.request.SolrQueryRequest;
|
||||
import org.apache.solr.response.SolrQueryResponse;
|
||||
import org.apache.solr.update.processor.UpdateRequestProcessor;
|
||||
import org.apache.solr.update.processor.UpdateRequestProcessorFactory;
|
||||
|
||||
/**
|
||||
* Factory for {@link UIMAUpdateRequestProcessor}
|
||||
*
|
||||
* @version $Id$
|
||||
*/
|
||||
public class UIMAUpdateRequestProcessorFactory extends UpdateRequestProcessorFactory {
|
||||
|
||||
public UpdateRequestProcessor getInstance(SolrQueryRequest req, SolrQueryResponse rsp,
|
||||
UpdateRequestProcessor next) {
|
||||
return new UIMAUpdateRequestProcessor(next, req.getCore());
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,32 @@
|
|||
package org.apache.solr.uima.processor.ae;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.uima.analysis_engine.AnalysisEngine;
|
||||
import org.apache.uima.resource.ResourceInitializationException;
|
||||
|
||||
/**
|
||||
* provide an Apache UIMA {@link AnalysisEngine}
|
||||
*
|
||||
* @version $Id$
|
||||
*/
|
||||
public interface AEProvider {
|
||||
|
||||
public AnalysisEngine getAE() throws ResourceInitializationException;
|
||||
|
||||
}
|
|
@ -0,0 +1,53 @@
|
|||
package org.apache.solr.uima.processor.ae;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Singleton factory class responsible for the creation of {@link AEProvider}s
|
||||
*
|
||||
* @version $Id$
|
||||
*/
|
||||
public class AEProviderFactory {
|
||||
|
||||
private static AEProviderFactory instance;
|
||||
|
||||
private Map<String, AEProvider> providerCache = new HashMap<String, AEProvider>();
|
||||
|
||||
private AEProviderFactory() {
|
||||
// Singleton
|
||||
}
|
||||
|
||||
public static AEProviderFactory getInstance() {
|
||||
if (instance == null) {
|
||||
instance = new AEProviderFactory();
|
||||
}
|
||||
return instance;
|
||||
}
|
||||
|
||||
public synchronized AEProvider getAEProvider(String core, String aePath,
|
||||
Map<String, String> runtimeParameters) {
|
||||
String key = new StringBuilder(core).append(aePath).toString();
|
||||
if (providerCache.get(key) == null) {
|
||||
providerCache.put(key, new OverridingParamsAEProvider(aePath, runtimeParameters));
|
||||
}
|
||||
return providerCache.get(key);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,89 @@
|
|||
package org.apache.solr.uima.processor.ae;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.net.URL;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.uima.UIMAFramework;
|
||||
import org.apache.uima.analysis_engine.AnalysisEngine;
|
||||
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
|
||||
import org.apache.uima.resource.ResourceInitializationException;
|
||||
import org.apache.uima.util.XMLInputSource;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
/**
|
||||
* {@link AEProvider} implementation that creates an Aggregate AE from the given path, also
|
||||
* injecting runtime parameters defined in the solrconfig.xml Solr configuration file and assigning
|
||||
* them as overriding parameters in the aggregate AE
|
||||
*
|
||||
* @version $Id$
|
||||
*/
|
||||
public class OverridingParamsAEProvider implements AEProvider {
|
||||
|
||||
private static Logger log = LoggerFactory.getLogger(OverridingParamsAEProvider.class);
|
||||
|
||||
private String aeFilePath;
|
||||
|
||||
private AnalysisEngine cachedAE;
|
||||
|
||||
private Map<String, String> runtimeParameters;
|
||||
|
||||
public OverridingParamsAEProvider(String aeFilePath, Map<String, String> runtimeParameters) {
|
||||
this.aeFilePath = aeFilePath;
|
||||
this.runtimeParameters = runtimeParameters;
|
||||
}
|
||||
|
||||
public synchronized AnalysisEngine getAE() throws ResourceInitializationException {
|
||||
try {
|
||||
if (cachedAE == null) {
|
||||
// get Resource Specifier from XML file
|
||||
URL url = this.getClass().getResource(aeFilePath);
|
||||
XMLInputSource in = new XMLInputSource(url);
|
||||
|
||||
// get AE description
|
||||
AnalysisEngineDescription desc = UIMAFramework.getXMLParser()
|
||||
.parseAnalysisEngineDescription(in);
|
||||
|
||||
/* iterate over each AE (to set runtime parameters) */
|
||||
for (String attributeName : runtimeParameters.keySet()) {
|
||||
desc.getAnalysisEngineMetaData().getConfigurationParameterSettings().setParameterValue(
|
||||
attributeName, runtimeParameters.get(attributeName));
|
||||
log.info(new StringBuilder("setting ").append(attributeName).append(" : ").append(
|
||||
runtimeParameters.get(attributeName)).toString());
|
||||
}
|
||||
// create AE here
|
||||
cachedAE = UIMAFramework.produceAnalysisEngine(desc);
|
||||
if (log.isDebugEnabled())
|
||||
log.debug(new StringBuilder("AE ").append(cachedAE.getAnalysisEngineMetaData().getName())
|
||||
.append(" created from descriptor ").append(aeFilePath).toString());
|
||||
} else {
|
||||
cachedAE.reconfigure();
|
||||
if (log.isDebugEnabled())
|
||||
log.debug(new StringBuilder("AE ").append(cachedAE.getAnalysisEngineMetaData().getName())
|
||||
.append(" at path ").append(aeFilePath).append(" reconfigured ").toString());
|
||||
}
|
||||
} catch (Exception e) {
|
||||
cachedAE = null;
|
||||
throw new ResourceInitializationException(e);
|
||||
}
|
||||
return cachedAE;
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,41 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
<!-- Aggregate analysis engine: a fixed flow running WhitespaceTokenizer first, then HmmTagger. -->
<analysisEngineDescription xmlns="http://uima.apache.org/resourceSpecifier">
  <frameworkImplementation>org.apache.uima.java</frameworkImplementation>
  <primitive>false</primitive>
  <delegateAnalysisEngineSpecifiers>
    <delegateAnalysisEngine key="HmmTagger">
      <import name="HmmTagger"/>
    </delegateAnalysisEngine>
    <delegateAnalysisEngine key="WhitespaceTokenizer">
      <import name="WhitespaceTokenizer"/>
    </delegateAnalysisEngine>
  </delegateAnalysisEngineSpecifiers>
  <analysisEngineMetaData>
    <name>AggregateSentenceAE</name>
    <description/>
    <version>1.0</version>
    <vendor/>
    <configurationParameters/>
    <configurationParameterSettings/>
    <!-- delegates are referenced by the keys declared above -->
    <flowConstraints>
      <fixedFlow>
        <node>WhitespaceTokenizer</node>
        <node>HmmTagger</node>
      </fixedFlow>
    </flowConstraints>
    <fsIndexCollection/>
    <capabilities>
      <capability>
        <inputs/>
        <outputs/>
        <languagesSupported/>
      </capability>
    </capabilities>
    <operationalProperties>
      <modifiesCas>true</modifiesCas>
      <multipleDeploymentAllowed>true</multipleDeploymentAllowed>
      <outputsNewCASes>false</outputsNewCASes>
    </operationalProperties>
  </analysisEngineMetaData>
  <resourceManagerConfiguration/>
</analysisEngineDescription>
|
|
@ -0,0 +1,57 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
<!-- Aggregate analysis engine: a fixed flow over the six delegate annotators imported by name below. -->
<analysisEngineDescription xmlns="http://uima.apache.org/resourceSpecifier">
  <frameworkImplementation>org.apache.uima.java</frameworkImplementation>
  <primitive>false</primitive>
  <delegateAnalysisEngineSpecifiers>
    <delegateAnalysisEngine key="TextConceptTaggingAEDescriptor">
      <import name="TextConceptTaggingAEDescriptor"/>
    </delegateAnalysisEngine>
    <delegateAnalysisEngine key="TextKeywordExtractionAEDescriptor">
      <import name="TextKeywordExtractionAEDescriptor"/>
    </delegateAnalysisEngine>
    <delegateAnalysisEngine key="OpenCalaisAnnotator">
      <import name="OpenCalaisAnnotator"/>
    </delegateAnalysisEngine>
    <delegateAnalysisEngine key="TextCategorizationAEDescriptor">
      <import name="TextCategorizationAEDescriptor"/>
    </delegateAnalysisEngine>
    <delegateAnalysisEngine key="TextLanguageDetectionAEDescriptor">
      <import name="TextLanguageDetectionAEDescriptor"/>
    </delegateAnalysisEngine>
    <delegateAnalysisEngine key="TextRankedEntityExtractionAEDescriptor">
      <import name="TextRankedEntityExtractionAEDescriptor"/>
    </delegateAnalysisEngine>
  </delegateAnalysisEngineSpecifiers>
  <analysisEngineMetaData>
    <name>ExtServicesAE</name>
    <description/>
    <version>1.0</version>
    <vendor/>
    <configurationParameters/>
    <configurationParameterSettings/>
    <!-- execution order of the delegates, referenced by the keys declared above -->
    <flowConstraints>
      <fixedFlow>
        <node>OpenCalaisAnnotator</node>
        <node>TextKeywordExtractionAEDescriptor</node>
        <node>TextLanguageDetectionAEDescriptor</node>
        <node>TextCategorizationAEDescriptor</node>
        <node>TextConceptTaggingAEDescriptor</node>
        <node>TextRankedEntityExtractionAEDescriptor</node>
      </fixedFlow>
    </flowConstraints>
    <fsIndexCollection/>
    <capabilities>
      <capability>
        <inputs/>
        <outputs/>
        <languagesSupported/>
      </capability>
    </capabilities>
    <operationalProperties>
      <modifiesCas>true</modifiesCas>
      <multipleDeploymentAllowed>true</multipleDeploymentAllowed>
      <outputsNewCASes>false</outputsNewCASes>
    </operationalProperties>
  </analysisEngineMetaData>
  <resourceManagerConfiguration/>
</analysisEngineDescription>
|
|
@ -0,0 +1,121 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Licensed to the Apache Software Foundation (ASF) under one
  or more contributor license agreements. See the NOTICE file
  distributed with this work for additional information
  regarding copyright ownership. The ASF licenses this file
  to you under the Apache License, Version 2.0 (the
  "License"); you may not use this file except in compliance
  with the License. You may obtain a copy of the License at

  http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing,
  software distributed under the License is distributed on an
  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  KIND, either express or implied. See the License for the
  specific language governing permissions and limitations
  under the License.
-->
<!--
  Primitive analysis engine descriptor for org.apache.uima.examples.tagger.HMMTagger.
  Declares the mandatory NGRAM_SIZE parameter (set to 3) and binds the "Model" resource
  dependency to the "ModelFile" external resource.
-->
<analysisEngineDescription xmlns="http://uima.apache.org/resourceSpecifier">
  <frameworkImplementation>org.apache.uima.java</frameworkImplementation>
  <primitive>true</primitive>
  <annotatorImplementationName>org.apache.uima.examples.tagger.HMMTagger</annotatorImplementationName>
  <analysisEngineMetaData>
    <name>Hidden Markov Model - Part of Speech Tagger</name>
    <description>A configuration of the HmmTaggerAnnotator that looks for
parts of speech of identified tokens within existing
Sentence and Token annotations. See also
WhitespaceTokenizer.xml.</description>
    <version>1.0</version>
    <vendor>The Apache Software Foundation</vendor>
    <configurationParameters>
      <configurationParameter>
        <name>NGRAM_SIZE</name>
        <type>Integer</type>
        <multiValued>false</multiValued>
        <mandatory>true</mandatory>
      </configurationParameter>
    </configurationParameters>
    <configurationParameterSettings>
      <nameValuePair>
        <name>NGRAM_SIZE</name>
        <value>
          <integer>3</integer>
        </value>
      </nameValuePair>
    </configurationParameterSettings>
    <typeSystemDescription>
      <types>
        <typeDescription>
          <name>org.apache.uima.TokenAnnotation</name>
          <description>Single token annotation</description>
          <supertypeName>uima.tcas.Annotation</supertypeName>
          <features>
            <featureDescription>
              <name>posTag</name>
              <description>contains part-of-speech of a
corresponding token</description>
              <rangeTypeName>uima.cas.String</rangeTypeName>
            </featureDescription>
          </features>
        </typeDescription>
        <typeDescription>
          <name>org.apache.uima.SentenceAnnotation</name>
          <description>sentence annotation</description>
          <supertypeName>uima.tcas.Annotation</supertypeName>
        </typeDescription>
      </types>
    </typeSystemDescription>
    <typePriorities/>
    <fsIndexCollection/>
    <capabilities>
      <capability>
        <inputs>
          <type>org.apache.uima.TokenAnnotation</type>
          <type allAnnotatorFeatures="true">org.apache.uima.SentenceAnnotation</type>
          <feature>org.apache.uima.TokenAnnotation:end</feature>
          <feature>org.apache.uima.TokenAnnotation:begin</feature>
        </inputs>
        <outputs>
          <type>org.apache.uima.TokenAnnotation</type>
          <feature>org.apache.uima.TokenAnnotation:posTag</feature>
          <feature>org.apache.uima.TokenAnnotation:end</feature>
          <feature>org.apache.uima.TokenAnnotation:begin</feature>
        </outputs>
        <languagesSupported/>
      </capability>
    </capabilities>
    <operationalProperties>
      <modifiesCas>true</modifiesCas>
      <multipleDeploymentAllowed>true</multipleDeploymentAllowed>
      <outputsNewCASes>false</outputsNewCASes>
    </operationalProperties>
  </analysisEngineMetaData>
  <externalResourceDependencies>
    <externalResourceDependency>
      <key>Model</key>
      <description>HMM Tagger model file</description>
      <interfaceName>org.apache.uima.examples.tagger.IModelResource</interfaceName>
      <optional>false</optional>
    </externalResourceDependency>
  </externalResourceDependencies>
  <resourceManagerConfiguration>
    <externalResources>
      <externalResource>
        <name>ModelFile</name>
        <description>HMM Tagger model file</description>
        <fileResourceSpecifier>
          <fileUrl>file:english/BrownModel.dat</fileUrl>
        </fileResourceSpecifier>
        <implementationName>org.apache.uima.examples.tagger.ModelResource</implementationName>
      </externalResource>
    </externalResources>
    <externalResourceBindings>
      <externalResourceBinding>
        <key>Model</key>
        <resourceName>ModelFile</resourceName>
      </externalResourceBinding>
    </externalResourceBindings>
  </resourceManagerConfiguration>
</analysisEngineDescription>
|
|
@ -0,0 +1,194 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Primitive analysis engine descriptor for org.apache.uima.annotator.calais.OpenCalaisAnnotator.
  Declares four mandatory parameters (allowDistribution, allowSearch, submitter, licenseID)
  and the org.apache.uima.calais type system, whose types all extend BaseType.
-->
<analysisEngineDescription xmlns="http://uima.apache.org/resourceSpecifier">
  <frameworkImplementation>org.apache.uima.java</frameworkImplementation>
  <primitive>true</primitive>
  <annotatorImplementationName>org.apache.uima.annotator.calais.OpenCalaisAnnotator</annotatorImplementationName>
  <analysisEngineMetaData>
    <name>OpenCalaisAnnotator</name>
    <description/>
    <configurationParameters>
      <configurationParameter>
        <name>allowDistribution</name>
        <description/>
        <type>Boolean</type>
        <multiValued>false</multiValued>
        <mandatory>true</mandatory>
      </configurationParameter>
      <configurationParameter>
        <name>allowSearch</name>
        <description/>
        <type>Boolean</type>
        <multiValued>false</multiValued>
        <mandatory>true</mandatory>
      </configurationParameter>
      <configurationParameter>
        <name>submitter</name>
        <description/>
        <type>String</type>
        <multiValued>false</multiValued>
        <mandatory>true</mandatory>
      </configurationParameter>
      <configurationParameter>
        <name>licenseID</name>
        <description/>
        <type>String</type>
        <multiValued>false</multiValued>
        <mandatory>true</mandatory>
      </configurationParameter>
    </configurationParameters>
    <!-- default values of the parameters declared above -->
    <configurationParameterSettings>
      <nameValuePair>
        <name>allowDistribution</name>
        <value>
          <boolean>false</boolean>
        </value>
      </nameValuePair>
      <nameValuePair>
        <name>allowSearch</name>
        <value>
          <boolean>false</boolean>
        </value>
      </nameValuePair>
      <nameValuePair>
        <name>submitter</name>
        <value>
          <string/>
        </value>
      </nameValuePair>
      <nameValuePair>
        <name>licenseID</name>
        <value>
          <string>OC_LICENSE_ID</string>
        </value>
      </nameValuePair>
    </configurationParameterSettings>
    <typeSystemDescription>
      <types>
        <typeDescription>
          <name>org.apache.uima.calais.Person</name>
          <description/>
          <supertypeName>org.apache.uima.calais.BaseType</supertypeName>
        </typeDescription>
        <typeDescription>
          <name>org.apache.uima.calais.Anniversary</name>
          <description/>
          <supertypeName>org.apache.uima.calais.BaseType</supertypeName>
        </typeDescription>
        <typeDescription>
          <name>org.apache.uima.calais.City</name>
          <description/>
          <supertypeName>org.apache.uima.calais.BaseType</supertypeName>
        </typeDescription>
        <typeDescription>
          <name>org.apache.uima.calais.Company</name>
          <description/>
          <supertypeName>org.apache.uima.calais.BaseType</supertypeName>
        </typeDescription>
        <typeDescription>
          <name>org.apache.uima.calais.Continent</name>
          <description/>
          <supertypeName>org.apache.uima.calais.BaseType</supertypeName>
        </typeDescription>
        <typeDescription>
          <name>org.apache.uima.calais.Country</name>
          <description/>
          <supertypeName>org.apache.uima.calais.BaseType</supertypeName>
        </typeDescription>
        <typeDescription>
          <name>org.apache.uima.calais.Currency</name>
          <description/>
          <supertypeName>org.apache.uima.calais.BaseType</supertypeName>
        </typeDescription>
        <typeDescription>
          <name>org.apache.uima.calais.EmailAddress</name>
          <description/>
          <supertypeName>org.apache.uima.calais.BaseType</supertypeName>
        </typeDescription>
        <typeDescription>
          <name>org.apache.uima.calais.Facility</name>
          <description/>
          <supertypeName>org.apache.uima.calais.BaseType</supertypeName>
        </typeDescription>
        <typeDescription>
          <name>org.apache.uima.calais.FaxNumber</name>
          <description/>
          <supertypeName>org.apache.uima.calais.BaseType</supertypeName>
        </typeDescription>
        <typeDescription>
          <name>org.apache.uima.calais.Holiday</name>
          <description/>
          <supertypeName>org.apache.uima.calais.BaseType</supertypeName>
        </typeDescription>
        <typeDescription>
          <name>org.apache.uima.calais.IndustryTerm</name>
          <description/>
          <supertypeName>org.apache.uima.calais.BaseType</supertypeName>
        </typeDescription>
        <typeDescription>
          <name>org.apache.uima.calais.NaturalDisaster</name>
          <description/>
          <supertypeName>org.apache.uima.calais.BaseType</supertypeName>
        </typeDescription>
        <typeDescription>
          <name>org.apache.uima.calais.NaturalFeature</name>
          <description/>
          <supertypeName>org.apache.uima.calais.BaseType</supertypeName>
        </typeDescription>
        <typeDescription>
          <name>org.apache.uima.calais.Organization</name>
          <description/>
          <supertypeName>org.apache.uima.calais.BaseType</supertypeName>
        </typeDescription>
        <typeDescription>
          <name>org.apache.uima.calais.PhoneNumber</name>
          <description/>
          <supertypeName>org.apache.uima.calais.BaseType</supertypeName>
        </typeDescription>
        <typeDescription>
          <name>org.apache.uima.calais.ProviceOrState</name>
          <description/>
          <supertypeName>org.apache.uima.calais.BaseType</supertypeName>
        </typeDescription>
        <typeDescription>
          <name>org.apache.uima.calais.Region</name>
          <description/>
          <supertypeName>org.apache.uima.calais.BaseType</supertypeName>
        </typeDescription>
        <typeDescription>
          <name>org.apache.uima.calais.Technology</name>
          <description/>
          <supertypeName>org.apache.uima.calais.BaseType</supertypeName>
        </typeDescription>
        <typeDescription>
          <name>org.apache.uima.calais.URL</name>
          <description/>
          <supertypeName>org.apache.uima.calais.BaseType</supertypeName>
        </typeDescription>
        <!-- common supertype of all the types above -->
        <typeDescription>
          <name>org.apache.uima.calais.BaseType</name>
          <description/>
          <supertypeName>uima.tcas.Annotation</supertypeName>
          <features>
            <featureDescription>
              <name>calaisType</name>
              <description>OpenCalais type</description>
              <rangeTypeName>uima.cas.String</rangeTypeName>
            </featureDescription>
          </features>
        </typeDescription>
      </types>
    </typeSystemDescription>
    <capabilities>
      <capability>
        <inputs/>
        <outputs/>
        <languagesSupported/>
      </capability>
    </capabilities>
    <operationalProperties>
      <modifiesCas>true</modifiesCas>
      <multipleDeploymentAllowed>true</multipleDeploymentAllowed>
      <outputsNewCASes>false</outputsNewCASes>
    </operationalProperties>
  </analysisEngineMetaData>
</analysisEngineDescription>
|
|
@ -0,0 +1,147 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<analysisEngineDescription xmlns="http://uima.apache.org/resourceSpecifier">
|
||||
<frameworkImplementation>org.apache.uima.java</frameworkImplementation>
|
||||
<primitive>false</primitive>
|
||||
<!-- Delegate engines of this aggregate descriptor. Each key declared here is the identifier used by the fixedFlow nodes and as the prefix of the override paths (key/parameter) in configurationParameters. -->
<delegateAnalysisEngineSpecifiers>
|
||||
<delegateAnalysisEngine key="TextKeywordExtractionAEDescriptor">
|
||||
<import name="TextKeywordExtractionAEDescriptor"/>
|
||||
</delegateAnalysisEngine>
|
||||
<delegateAnalysisEngine key="TextConceptTaggingAEDescriptor">
|
||||
<import name="TextConceptTaggingAEDescriptor"/>
|
||||
</delegateAnalysisEngine>
|
||||
<delegateAnalysisEngine key="OpenCalaisAnnotator">
|
||||
<import name="OpenCalaisAnnotator"/>
|
||||
</delegateAnalysisEngine>
|
||||
<delegateAnalysisEngine key="TextLanguageDetectionAEDescriptor">
|
||||
<import name="TextLanguageDetectionAEDescriptor"/>
|
||||
</delegateAnalysisEngine>
|
||||
<delegateAnalysisEngine key="TextCategorizationAEDescriptor">
|
||||
<import name="TextCategorizationAEDescriptor"/>
|
||||
</delegateAnalysisEngine>
<!-- Unlike the other delegates, which are imported by name, this one is imported by location (a path relative to this descriptor). NOTE(review): confirm the mix of import styles is intentional. -->
<delegateAnalysisEngine key="AggregateSentenceAE">
|
||||
<import location="AggregateSentenceAE.xml"/>
|
||||
</delegateAnalysisEngine>
|
||||
<delegateAnalysisEngine key="TextRankedEntityExtractionAEDescriptor">
|
||||
<import name="TextRankedEntityExtractionAEDescriptor"/>
|
||||
</delegateAnalysisEngine>
|
||||
</delegateAnalysisEngineSpecifiers>
|
||||
<analysisEngineMetaData>
|
||||
<name>ExtServicesAE</name>
|
||||
<description/>
|
||||
<version>1.0</version>
|
||||
<vendor/>
|
||||
<configurationParameters searchStrategy="language_fallback">
|
||||
<configurationParameter>
|
||||
<name>oc_licenseID</name>
|
||||
<type>String</type>
|
||||
<multiValued>false</multiValued>
|
||||
<mandatory>true</mandatory>
|
||||
<overrides>
|
||||
<parameter>OpenCalaisAnnotator/licenseID</parameter>
|
||||
</overrides>
|
||||
</configurationParameter>
|
||||
<configurationParameter>
|
||||
<name>keyword_apikey</name>
|
||||
<type>String</type>
|
||||
<multiValued>false</multiValued>
|
||||
<mandatory>true</mandatory>
|
||||
<overrides>
|
||||
<parameter>TextKeywordExtractionAEDescriptor/apikey</parameter>
|
||||
</overrides>
|
||||
</configurationParameter>
|
||||
<configurationParameter>
|
||||
<name>concept_apikey</name>
|
||||
<type>String</type>
|
||||
<multiValued>false</multiValued>
|
||||
<mandatory>true</mandatory>
|
||||
<overrides>
|
||||
<parameter>TextConceptTaggingAEDescriptor/apikey</parameter>
|
||||
</overrides>
|
||||
</configurationParameter>
|
||||
<configurationParameter>
|
||||
<name>lang_apikey</name>
|
||||
<type>String</type>
|
||||
<multiValued>false</multiValued>
|
||||
<mandatory>true</mandatory>
|
||||
<overrides>
|
||||
<parameter>TextLanguageDetectionAEDescriptor/apikey</parameter>
|
||||
</overrides>
|
||||
</configurationParameter>
|
||||
<configurationParameter>
|
||||
<name>cat_apikey</name>
|
||||
<type>String</type>
|
||||
<multiValued>false</multiValued>
|
||||
<mandatory>true</mandatory>
|
||||
<overrides>
|
||||
<parameter>TextCategorizationAEDescriptor/apikey</parameter>
|
||||
</overrides>
|
||||
</configurationParameter>
|
||||
<configurationParameter>
|
||||
<name>entities_apikey</name>
|
||||
<type>String</type>
|
||||
<multiValued>false</multiValued>
|
||||
<mandatory>true</mandatory>
|
||||
<overrides>
|
||||
<parameter>TextRankedEntityExtractionAEDescriptor/apikey</parameter>
|
||||
</overrides>
|
||||
</configurationParameter>
|
||||
</configurationParameters>
|
||||
<!-- Values for the aggregate-level parameters declared in configurationParameters. All values are placeholders ("licenseid" / "apikey"); presumably real credentials are injected at deployment time - confirm against the Solr UIMA configuration. Never commit real keys here. -->
<configurationParameterSettings>
|
||||
<nameValuePair>
|
||||
<name>oc_licenseID</name>
|
||||
<value>
|
||||
<string>licenseid</string>
|
||||
</value>
|
||||
</nameValuePair>
|
||||
<nameValuePair>
|
||||
<name>keyword_apikey</name>
|
||||
<value>
|
||||
<string>apikey</string>
|
||||
</value>
|
||||
</nameValuePair>
|
||||
<nameValuePair>
|
||||
<name>concept_apikey</name>
|
||||
<value>
|
||||
<string>apikey</string>
|
||||
</value>
|
||||
</nameValuePair>
|
||||
<nameValuePair>
|
||||
<name>lang_apikey</name>
|
||||
<value>
|
||||
<string>apikey</string>
|
||||
</value>
|
||||
</nameValuePair>
|
||||
<nameValuePair>
|
||||
<name>cat_apikey</name>
|
||||
<value>
|
||||
<string>apikey</string>
|
||||
</value>
|
||||
</nameValuePair>
|
||||
<!-- entities_apikey is declared as a mandatory parameter of this aggregate but previously had no value in this section; a placeholder is supplied so that every declared mandatory parameter has a setting, matching the other *_apikey entries. -->
<nameValuePair>
|
||||
<name>entities_apikey</name>
|
||||
<value>
|
||||
<string>apikey</string>
|
||||
</value>
|
||||
</nameValuePair>
|
||||
</configurationParameterSettings>
|
||||
<!-- Fixed flow: the delegates are listed in the order in which they are invoked. Every node value is a key declared in delegateAnalysisEngineSpecifiers; AggregateSentenceAE is first and TextRankedEntityExtractionAEDescriptor is last. -->
<flowConstraints>
|
||||
<fixedFlow>
|
||||
<node>AggregateSentenceAE</node>
|
||||
<node>OpenCalaisAnnotator</node>
|
||||
<node>TextKeywordExtractionAEDescriptor</node>
|
||||
<node>TextLanguageDetectionAEDescriptor</node>
|
||||
<node>TextCategorizationAEDescriptor</node>
|
||||
<node>TextConceptTaggingAEDescriptor</node>
|
||||
<node>TextRankedEntityExtractionAEDescriptor</node>
|
||||
</fixedFlow>
|
||||
</flowConstraints>
|
||||
<fsIndexCollection/>
|
||||
<capabilities>
|
||||
<capability>
|
||||
<inputs/>
|
||||
<outputs/>
|
||||
<languagesSupported/>
|
||||
</capability>
|
||||
</capabilities>
|
||||
<operationalProperties>
|
||||
<modifiesCas>true</modifiesCas>
|
||||
<multipleDeploymentAllowed>true</multipleDeploymentAllowed>
|
||||
<outputsNewCASes>false</outputsNewCASes>
|
||||
</operationalProperties>
|
||||
</analysisEngineMetaData>
|
||||
<resourceManagerConfiguration/>
|
||||
</analysisEngineDescription>
|
|
@ -0,0 +1,102 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!--
|
||||
|
||||
Licensed to the Apache Software Foundation (ASF) under one
|
||||
or more contributor license agreements. See the NOTICE file
|
||||
distributed with this work for additional information
|
||||
regarding copyright ownership. The ASF licenses this file
|
||||
to you under the Apache License, Version 2.0 (the
|
||||
"License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing,
|
||||
software distributed under the License is distributed on an
|
||||
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations
|
||||
under the License.
|
||||
|
||||
-->
|
||||
<analysisEngineDescription xmlns="http://uima.apache.org/resourceSpecifier">
|
||||
<frameworkImplementation>org.apache.uima.java</frameworkImplementation>
|
||||
<primitive>true</primitive>
|
||||
<annotatorImplementationName>org.apache.uima.alchemy.annotator.TextCategorizationAnnotator</annotatorImplementationName>
|
||||
<analysisEngineMetaData>
|
||||
<name>TextCategorizationAEDescriptor</name>
|
||||
<description/>
|
||||
<version>1.0</version>
|
||||
<vendor/>
|
||||
<configurationParameters>
|
||||
<configurationParameter>
|
||||
<name>apikey</name>
|
||||
<type>String</type>
|
||||
<multiValued>false</multiValued>
|
||||
<mandatory>true</mandatory>
|
||||
</configurationParameter>
|
||||
<configurationParameter>
|
||||
<name>outputMode</name>
|
||||
<type>String</type>
|
||||
<multiValued>false</multiValued>
|
||||
<mandatory>true</mandatory>
|
||||
</configurationParameter>
|
||||
<configurationParameter>
|
||||
<name>baseUrl</name>
|
||||
<type>String</type>
|
||||
<multiValued>false</multiValued>
|
||||
<mandatory>false</mandatory>
|
||||
</configurationParameter>
|
||||
</configurationParameters>
|
||||
<configurationParameterSettings>
|
||||
<nameValuePair>
|
||||
<name>outputMode</name>
|
||||
<value>
|
||||
<string>xml</string>
|
||||
</value>
|
||||
</nameValuePair>
|
||||
<nameValuePair>
|
||||
<name>apikey</name>
|
||||
<value>
|
||||
<string>AA_API_KEY</string>
|
||||
</value>
|
||||
</nameValuePair>
|
||||
</configurationParameterSettings>
|
||||
<typeSystemDescription>
|
||||
<types>
|
||||
<typeDescription>
|
||||
<name>org.apache.uima.alchemy.ts.categorization.Category</name>
|
||||
<description/>
|
||||
<supertypeName>uima.cas.TOP</supertypeName>
|
||||
<features>
|
||||
<featureDescription>
|
||||
<name>score</name>
|
||||
<description/>
|
||||
<rangeTypeName>uima.cas.String</rangeTypeName>
|
||||
</featureDescription>
|
||||
<featureDescription>
|
||||
<name>text</name>
|
||||
<description/>
|
||||
<rangeTypeName>uima.cas.String</rangeTypeName>
|
||||
</featureDescription>
|
||||
</features>
|
||||
</typeDescription>
|
||||
</types>
|
||||
</typeSystemDescription>
|
||||
<typePriorities/>
|
||||
<fsIndexCollection/>
|
||||
<capabilities>
|
||||
<capability>
|
||||
<inputs/>
|
||||
<outputs/>
|
||||
<languagesSupported/>
|
||||
</capability>
|
||||
</capabilities>
|
||||
<operationalProperties>
|
||||
<modifiesCas>true</modifiesCas>
|
||||
<multipleDeploymentAllowed>true</multipleDeploymentAllowed>
|
||||
<outputsNewCASes>false</outputsNewCASes>
|
||||
</operationalProperties>
|
||||
</analysisEngineMetaData>
|
||||
<resourceManagerConfiguration/>
|
||||
</analysisEngineDescription>
|
|
@ -0,0 +1,196 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one
|
||||
or more contributor license agreements. See the NOTICE file
|
||||
distributed with this work for additional information
|
||||
regarding copyright ownership. The ASF licenses this file
|
||||
to you under the Apache License, Version 2.0 (the
|
||||
"License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing,
|
||||
software distributed under the License is distributed on an
|
||||
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations
|
||||
under the License.
|
||||
-->
|
||||
<analysisEngineDescription xmlns="http://uima.apache.org/resourceSpecifier">
|
||||
<frameworkImplementation>org.apache.uima.java</frameworkImplementation>
|
||||
<primitive>true</primitive>
|
||||
<annotatorImplementationName>org.apache.uima.alchemy.annotator.TextConceptTaggingAnnotator</annotatorImplementationName>
|
||||
<analysisEngineMetaData>
|
||||
<name>TextConceptTaggingAEDescriptor</name>
|
||||
<description/>
|
||||
<version>1.0</version>
|
||||
<vendor/>
|
||||
<configurationParameters>
|
||||
<configurationParameter>
|
||||
<name>apikey</name>
|
||||
<type>String</type>
|
||||
<multiValued>false</multiValued>
|
||||
<mandatory>true</mandatory>
|
||||
</configurationParameter>
|
||||
<configurationParameter>
|
||||
<name>outputMode</name>
|
||||
<type>String</type>
|
||||
<multiValued>false</multiValued>
|
||||
<mandatory>true</mandatory>
|
||||
</configurationParameter>
|
||||
<configurationParameter>
|
||||
<name>linkedData</name>
|
||||
<type>String</type>
|
||||
<multiValued>false</multiValued>
|
||||
<mandatory>false</mandatory>
|
||||
</configurationParameter>
|
||||
<configurationParameter>
|
||||
<name>showSourceText</name>
|
||||
<type>Integer</type>
|
||||
<multiValued>false</multiValued>
|
||||
<mandatory>true</mandatory>
|
||||
</configurationParameter>
|
||||
<!-- Optional, single-valued parameter; this descriptor sets it to the string "8" in configurationParameterSettings. -->
<!-- NOTE(review): declared as String here, whereas TextKeywordExtractionAEDescriptor declares its maxRetrieve parameter as Integer. Confirm which type the concept tagging annotator actually reads before aligning the two. -->
<configurationParameter>
|
||||
<name>maxRetrieve</name>
|
||||
<type>String</type>
|
||||
<multiValued>false</multiValued>
|
||||
<mandatory>false</mandatory>
|
||||
</configurationParameter>
|
||||
<configurationParameter>
|
||||
<name>url</name>
|
||||
<type>String</type>
|
||||
<multiValued>false</multiValued>
|
||||
<mandatory>false</mandatory>
|
||||
</configurationParameter>
|
||||
</configurationParameters>
|
||||
<configurationParameterSettings>
|
||||
<nameValuePair>
|
||||
<name>apikey</name>
|
||||
<value>
|
||||
<string/>
|
||||
</value>
|
||||
</nameValuePair>
|
||||
<nameValuePair>
|
||||
<name>outputMode</name>
|
||||
<value>
|
||||
<string>xml</string>
|
||||
</value>
|
||||
</nameValuePair>
|
||||
<nameValuePair>
|
||||
<name>linkedData</name>
|
||||
<value>
|
||||
<string>1</string>
|
||||
</value>
|
||||
</nameValuePair>
|
||||
<nameValuePair>
|
||||
<name>showSourceText</name>
|
||||
<value>
|
||||
<integer>0</integer>
|
||||
</value>
|
||||
</nameValuePair>
|
||||
<nameValuePair>
|
||||
<name>maxRetrieve</name>
|
||||
<value>
|
||||
<string>8</string>
|
||||
</value>
|
||||
</nameValuePair>
|
||||
</configurationParameterSettings>
|
||||
<typeSystemDescription>
|
||||
<types>
|
||||
<typeDescription>
|
||||
<name>org.apache.uima.alchemy.ts.concept.ConceptFS</name>
|
||||
<description>a concept tag</description>
|
||||
<supertypeName>uima.cas.TOP</supertypeName>
|
||||
<features>
|
||||
<featureDescription>
|
||||
<name>text</name>
|
||||
<description/>
|
||||
<rangeTypeName>uima.cas.String</rangeTypeName>
|
||||
</featureDescription>
|
||||
<featureDescription>
|
||||
<name>relevance</name>
|
||||
<description/>
|
||||
<rangeTypeName>uima.cas.String</rangeTypeName>
|
||||
</featureDescription>
|
||||
<featureDescription>
|
||||
<name>website</name>
|
||||
<description/>
|
||||
<rangeTypeName>uima.cas.String</rangeTypeName>
|
||||
</featureDescription>
|
||||
<featureDescription>
|
||||
<name>geo</name>
|
||||
<description/>
|
||||
<rangeTypeName>uima.cas.String</rangeTypeName>
|
||||
</featureDescription>
|
||||
<featureDescription>
|
||||
<name>dbpedia</name>
|
||||
<description/>
|
||||
<rangeTypeName>uima.cas.String</rangeTypeName>
|
||||
</featureDescription>
|
||||
<featureDescription>
|
||||
<name>yago</name>
|
||||
<description/>
|
||||
<rangeTypeName>uima.cas.String</rangeTypeName>
|
||||
</featureDescription>
|
||||
<featureDescription>
|
||||
<name>opencyc</name>
|
||||
<description/>
|
||||
<rangeTypeName>uima.cas.String</rangeTypeName>
|
||||
</featureDescription>
|
||||
<featureDescription>
|
||||
<name>freebase</name>
|
||||
<description/>
|
||||
<rangeTypeName>uima.cas.String</rangeTypeName>
|
||||
</featureDescription>
|
||||
<featureDescription>
|
||||
<name>ciaFactbook</name>
|
||||
<description/>
|
||||
<rangeTypeName>uima.cas.String</rangeTypeName>
|
||||
</featureDescription>
|
||||
<featureDescription>
|
||||
<name>census</name>
|
||||
<description/>
|
||||
<rangeTypeName>uima.cas.String</rangeTypeName>
|
||||
</featureDescription>
|
||||
<featureDescription>
|
||||
<name>geonames</name>
|
||||
<description/>
|
||||
<rangeTypeName>uima.cas.String</rangeTypeName>
|
||||
</featureDescription>
|
||||
<featureDescription>
|
||||
<name>musicBrainz</name>
|
||||
<description/>
|
||||
<rangeTypeName>uima.cas.String</rangeTypeName>
|
||||
</featureDescription>
|
||||
<featureDescription>
|
||||
<name>crunchbase</name>
|
||||
<description/>
|
||||
<rangeTypeName>uima.cas.String</rangeTypeName>
|
||||
</featureDescription>
|
||||
<featureDescription>
|
||||
<name>semanticCrunchbase</name>
|
||||
<description/>
|
||||
<rangeTypeName>uima.cas.String</rangeTypeName>
|
||||
</featureDescription>
|
||||
</features>
|
||||
</typeDescription>
|
||||
</types>
|
||||
</typeSystemDescription>
|
||||
<typePriorities/>
|
||||
<fsIndexCollection/>
|
||||
<capabilities>
|
||||
<capability>
|
||||
<inputs/>
|
||||
<outputs/>
|
||||
<languagesSupported/>
|
||||
</capability>
|
||||
</capabilities>
|
||||
<operationalProperties>
|
||||
<modifiesCas>true</modifiesCas>
|
||||
<multipleDeploymentAllowed>true</multipleDeploymentAllowed>
|
||||
<outputsNewCASes>false</outputsNewCASes>
|
||||
</operationalProperties>
|
||||
</analysisEngineMetaData>
|
||||
<resourceManagerConfiguration/>
|
||||
</analysisEngineDescription>
|
|
@ -0,0 +1,107 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<analysisEngineDescription xmlns="http://uima.apache.org/resourceSpecifier">
|
||||
<frameworkImplementation>org.apache.uima.java</frameworkImplementation>
|
||||
<primitive>true</primitive>
|
||||
<annotatorImplementationName>org.apache.uima.alchemy.annotator.TextKeywordExtractionAnnotator</annotatorImplementationName>
|
||||
<analysisEngineMetaData>
|
||||
<name>TextKeywordExtractionAEDescriptor</name>
|
||||
<description/>
|
||||
<version>1.0</version>
|
||||
<vendor/>
|
||||
<configurationParameters>
|
||||
<configurationParameter>
|
||||
<name>apikey</name>
|
||||
<type>String</type>
|
||||
<multiValued>false</multiValued>
|
||||
<mandatory>true</mandatory>
|
||||
</configurationParameter>
|
||||
<configurationParameter>
|
||||
<name>outputMode</name>
|
||||
<type>String</type>
|
||||
<multiValued>false</multiValued>
|
||||
<mandatory>true</mandatory>
|
||||
</configurationParameter>
|
||||
<configurationParameter>
|
||||
<name>baseUrl</name>
|
||||
<type>String</type>
|
||||
<multiValued>false</multiValued>
|
||||
<mandatory>false</mandatory>
|
||||
</configurationParameter>
|
||||
<configurationParameter>
|
||||
<name>url</name>
|
||||
<type>String</type>
|
||||
<multiValued>false</multiValued>
|
||||
<mandatory>false</mandatory>
|
||||
</configurationParameter>
|
||||
<configurationParameter>
|
||||
<name>maxRetrieve</name>
|
||||
<type>Integer</type>
|
||||
<multiValued>false</multiValued>
|
||||
<mandatory>false</mandatory>
|
||||
</configurationParameter>
|
||||
<configurationParameter>
|
||||
<name>showSourceText</name>
|
||||
<type>Integer</type>
|
||||
<multiValued>false</multiValued>
|
||||
<mandatory>false</mandatory>
|
||||
</configurationParameter>
|
||||
</configurationParameters>
|
||||
<configurationParameterSettings>
|
||||
<nameValuePair>
|
||||
<name>outputMode</name>
|
||||
<value>
|
||||
<string>xml</string>
|
||||
</value>
|
||||
</nameValuePair>
|
||||
<!-- Placeholder for the mandatory apikey parameter; do not commit a real service credential in this descriptor. AA_API_KEY is the same placeholder used by the TextCategorization and TextLanguageDetection descriptors. The ExtServicesAE aggregate overrides this value through its keyword_apikey parameter (TextKeywordExtractionAEDescriptor/apikey). -->
<nameValuePair>
|
||||
<name>apikey</name>
|
||||
<value>
|
||||
<string>AA_API_KEY</string>
|
||||
</value>
|
||||
</nameValuePair>
|
||||
<nameValuePair>
|
||||
<name>maxRetrieve</name>
|
||||
<value>
|
||||
<integer>10</integer>
|
||||
</value>
|
||||
</nameValuePair>
|
||||
<nameValuePair>
|
||||
<name>showSourceText</name>
|
||||
<value>
|
||||
<integer>0</integer>
|
||||
</value>
|
||||
</nameValuePair>
|
||||
</configurationParameterSettings>
|
||||
<typeSystemDescription>
|
||||
<types>
|
||||
<typeDescription>
|
||||
<name>org.apache.uima.alchemy.ts.keywords.KeywordFS</name>
|
||||
<description/>
|
||||
<supertypeName>uima.cas.TOP</supertypeName>
|
||||
<features>
|
||||
<featureDescription>
|
||||
<name>text</name>
|
||||
<description/>
|
||||
<rangeTypeName>uima.cas.String</rangeTypeName>
|
||||
</featureDescription>
|
||||
</features>
|
||||
</typeDescription>
|
||||
</types>
|
||||
</typeSystemDescription>
|
||||
<typePriorities/>
|
||||
<fsIndexCollection/>
|
||||
<capabilities>
|
||||
<capability>
|
||||
<inputs/>
|
||||
<outputs/>
|
||||
<languagesSupported/>
|
||||
</capability>
|
||||
</capabilities>
|
||||
<operationalProperties>
|
||||
<modifiesCas>true</modifiesCas>
|
||||
<multipleDeploymentAllowed>true</multipleDeploymentAllowed>
|
||||
<outputsNewCASes>false</outputsNewCASes>
|
||||
</operationalProperties>
|
||||
</analysisEngineMetaData>
|
||||
<resourceManagerConfiguration/>
|
||||
</analysisEngineDescription>
|
|
@ -0,0 +1,107 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<analysisEngineDescription xmlns="http://uima.apache.org/resourceSpecifier">
|
||||
<frameworkImplementation>org.apache.uima.java</frameworkImplementation>
|
||||
<primitive>true</primitive>
|
||||
<annotatorImplementationName>org.apache.uima.alchemy.annotator.TextLanguageDetectionAnnotator</annotatorImplementationName>
|
||||
<analysisEngineMetaData>
|
||||
<name>TextLanguageDetectionAEDescriptor</name>
|
||||
<description/>
|
||||
<version>1.0</version>
|
||||
<vendor/>
|
||||
<configurationParameters>
|
||||
<configurationParameter>
|
||||
<name>apikey</name>
|
||||
<type>String</type>
|
||||
<multiValued>false</multiValued>
|
||||
<mandatory>true</mandatory>
|
||||
</configurationParameter>
|
||||
<configurationParameter>
|
||||
<name>outputMode</name>
|
||||
<type>String</type>
|
||||
<multiValued>false</multiValued>
|
||||
<mandatory>true</mandatory>
|
||||
</configurationParameter>
|
||||
<configurationParameter>
|
||||
<name>url</name>
|
||||
<type>String</type>
|
||||
<multiValued>false</multiValued>
|
||||
<mandatory>false</mandatory>
|
||||
</configurationParameter>
|
||||
</configurationParameters>
|
||||
<configurationParameterSettings>
|
||||
<nameValuePair>
|
||||
<name>outputMode</name>
|
||||
<value>
|
||||
<string>xml</string>
|
||||
</value>
|
||||
</nameValuePair>
|
||||
<nameValuePair>
|
||||
<name>apikey</name>
|
||||
<value>
|
||||
<string>AA_API_KEY</string>
|
||||
</value>
|
||||
</nameValuePair>
|
||||
</configurationParameterSettings>
|
||||
<typeSystemDescription>
|
||||
<types>
|
||||
<typeDescription>
|
||||
<name>org.apache.uima.alchemy.ts.language.LanguageFS</name>
|
||||
<description/>
|
||||
<supertypeName>uima.cas.TOP</supertypeName>
|
||||
<features>
|
||||
<featureDescription>
|
||||
<name>language</name>
|
||||
<description/>
|
||||
<rangeTypeName>uima.cas.String</rangeTypeName>
|
||||
</featureDescription>
|
||||
<featureDescription>
|
||||
<name>iso6391</name>
|
||||
<description/>
|
||||
<rangeTypeName>uima.cas.String</rangeTypeName>
|
||||
</featureDescription>
|
||||
<featureDescription>
|
||||
<name>iso6392</name>
|
||||
<description/>
|
||||
<rangeTypeName>uima.cas.String</rangeTypeName>
|
||||
</featureDescription>
|
||||
<featureDescription>
|
||||
<name>iso6393</name>
|
||||
<description/>
|
||||
<rangeTypeName>uima.cas.String</rangeTypeName>
|
||||
</featureDescription>
|
||||
<featureDescription>
|
||||
<name>ethnologue</name>
|
||||
<description/>
|
||||
<rangeTypeName>uima.cas.String</rangeTypeName>
|
||||
</featureDescription>
|
||||
<featureDescription>
|
||||
<name>nativeSpeakers</name>
|
||||
<description/>
|
||||
<rangeTypeName>uima.cas.String</rangeTypeName>
|
||||
</featureDescription>
|
||||
<featureDescription>
|
||||
<name>wikipedia</name>
|
||||
<description/>
|
||||
<rangeTypeName>uima.cas.String</rangeTypeName>
|
||||
</featureDescription>
|
||||
</features>
|
||||
</typeDescription>
|
||||
</types>
|
||||
</typeSystemDescription>
|
||||
<typePriorities/>
|
||||
<fsIndexCollection/>
|
||||
<capabilities>
|
||||
<capability>
|
||||
<inputs/>
|
||||
<outputs/>
|
||||
<languagesSupported/>
|
||||
</capability>
|
||||
</capabilities>
|
||||
<operationalProperties>
|
||||
<modifiesCas>true</modifiesCas>
|
||||
<multipleDeploymentAllowed>true</multipleDeploymentAllowed>
|
||||
<outputsNewCASes>false</outputsNewCASes>
|
||||
</operationalProperties>
|
||||
</analysisEngineMetaData>
|
||||
<resourceManagerConfiguration/>
|
||||
</analysisEngineDescription>
|
|
@ -0,0 +1,403 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!--
|
||||
|
||||
Licensed to the Apache Software Foundation (ASF) under one
|
||||
or more contributor license agreements. See the NOTICE file
|
||||
distributed with this work for additional information
|
||||
regarding copyright ownership. The ASF licenses this file
|
||||
to you under the Apache License, Version 2.0 (the
|
||||
"License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing,
|
||||
software distributed under the License is distributed on an
|
||||
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations
|
||||
under the License.
|
||||
|
||||
-->
|
||||
<analysisEngineDescription xmlns="http://uima.apache.org/resourceSpecifier">
|
||||
<frameworkImplementation>org.apache.uima.java</frameworkImplementation>
|
||||
<primitive>true</primitive>
|
||||
<annotatorImplementationName>org.apache.uima.alchemy.annotator.TextRankedNamedEntityExtractionAnnotator</annotatorImplementationName>
|
||||
<analysisEngineMetaData>
|
||||
<name>TextRankedEntityExtractionAEDescriptor</name>
|
||||
<description/>
|
||||
<version>1.0</version>
|
||||
<vendor/>
|
||||
<configurationParameters>
|
||||
<configurationParameter>
|
||||
<name>apikey</name>
|
||||
<type>String</type>
|
||||
<multiValued>false</multiValued>
|
||||
<mandatory>true</mandatory>
|
||||
</configurationParameter>
|
||||
<configurationParameter>
|
||||
<name>outputMode</name>
|
||||
<type>String</type>
|
||||
<multiValued>false</multiValued>
|
||||
<mandatory>true</mandatory>
|
||||
</configurationParameter>
|
||||
<configurationParameter>
|
||||
<name>disambiguate</name>
|
||||
<type>Integer</type>
|
||||
<multiValued>false</multiValued>
|
||||
<mandatory>true</mandatory>
|
||||
</configurationParameter>
|
||||
<configurationParameter>
|
||||
<name>linkedData</name>
|
||||
<type>String</type>
|
||||
<multiValued>false</multiValued>
|
||||
<mandatory>false</mandatory>
|
||||
</configurationParameter>
|
||||
<configurationParameter>
|
||||
<name>showSourceText</name>
|
||||
<type>Integer</type>
|
||||
<multiValued>false</multiValued>
|
||||
<mandatory>true</mandatory>
|
||||
</configurationParameter>
|
||||
<configurationParameter>
|
||||
<name>baseUrl</name>
|
||||
<type>String</type>
|
||||
<multiValued>false</multiValued>
|
||||
<mandatory>false</mandatory>
|
||||
</configurationParameter>
|
||||
<configurationParameter>
|
||||
<name>url</name>
|
||||
<type>String</type>
|
||||
<multiValued>false</multiValued>
|
||||
<mandatory>false</mandatory>
|
||||
</configurationParameter>
|
||||
<configurationParameter>
|
||||
<name>coreference</name>
|
||||
<type>String</type>
|
||||
<multiValued>false</multiValued>
|
||||
<mandatory>false</mandatory>
|
||||
</configurationParameter>
|
||||
<configurationParameter>
|
||||
<name>quotations</name>
|
||||
<type>String</type>
|
||||
<multiValued>false</multiValued>
|
||||
<mandatory>false</mandatory>
|
||||
</configurationParameter>
|
||||
</configurationParameters>
|
||||
<configurationParameterSettings>
|
||||
<nameValuePair>
|
||||
<name>apikey</name>
|
||||
<value>
|
||||
<string/>
|
||||
</value>
|
||||
</nameValuePair>
|
||||
<nameValuePair>
|
||||
<name>outputMode</name>
|
||||
<value>
|
||||
<string>xml</string>
|
||||
</value>
|
||||
</nameValuePair>
|
||||
<nameValuePair>
|
||||
<name>disambiguate</name>
|
||||
<value>
|
||||
<integer>1</integer>
|
||||
</value>
|
||||
</nameValuePair>
|
||||
<nameValuePair>
|
||||
<name>linkedData</name>
|
||||
<value>
|
||||
<string>1</string>
|
||||
</value>
|
||||
</nameValuePair>
|
||||
<nameValuePair>
|
||||
<name>coreference</name>
|
||||
<value>
|
||||
<string>1</string>
|
||||
</value>
|
||||
</nameValuePair>
|
||||
<nameValuePair>
|
||||
<name>showSourceText</name>
|
||||
<value>
|
||||
<integer>0</integer>
|
||||
</value>
|
||||
</nameValuePair>
|
||||
<nameValuePair>
|
||||
<name>quotations</name>
|
||||
<value>
|
||||
<string>1</string>
|
||||
</value>
|
||||
</nameValuePair>
|
||||
</configurationParameterSettings>
|
||||
<typeSystemDescription>
|
||||
<imports>
|
||||
<import location="baseAlchemyTypeSystemDescriptor.xml"/>
|
||||
</imports>
|
||||
<types>
|
||||
<typeDescription>
|
||||
<name>org.apache.uima.alchemy.ts.entity.Anniversary</name>
|
||||
<description/>
|
||||
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
|
||||
</typeDescription>
|
||||
<typeDescription>
|
||||
<name>org.apache.uima.alchemy.ts.entity.Automobile</name>
|
||||
<description/>
|
||||
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
|
||||
</typeDescription>
|
||||
<typeDescription>
|
||||
<name>org.apache.uima.alchemy.ts.entity.City</name>
|
||||
<description/>
|
||||
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
|
||||
</typeDescription>
|
||||
<typeDescription>
|
||||
<name>org.apache.uima.alchemy.ts.entity.Company</name>
|
||||
<description/>
|
||||
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
|
||||
</typeDescription>
|
||||
<typeDescription>
|
||||
<name>org.apache.uima.alchemy.ts.entity.Continent</name>
|
||||
<description/>
|
||||
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
|
||||
</typeDescription>
|
||||
<typeDescription>
|
||||
<name>org.apache.uima.alchemy.ts.entity.Country</name>
|
||||
<description/>
|
||||
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
|
||||
</typeDescription>
|
||||
<typeDescription>
|
||||
<name>org.apache.uima.alchemy.ts.entity.EntertainmentAward</name>
|
||||
<description/>
|
||||
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
|
||||
</typeDescription>
|
||||
<typeDescription>
|
||||
<name>org.apache.uima.alchemy.ts.entity.Facility</name>
|
||||
<description/>
|
||||
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
|
||||
</typeDescription>
|
||||
<typeDescription>
|
||||
<name>org.apache.uima.alchemy.ts.entity.FieldTerminology</name>
|
||||
<description/>
|
||||
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
|
||||
</typeDescription>
|
||||
<typeDescription>
|
||||
<name>org.apache.uima.alchemy.ts.entity.FinancialMarketIndex</name>
|
||||
<description/>
|
||||
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
|
||||
</typeDescription>
|
||||
<typeDescription>
|
||||
<name>org.apache.uima.alchemy.ts.entity.GeographicFeature</name>
|
||||
<description/>
|
||||
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
|
||||
</typeDescription>
|
||||
<typeDescription>
|
||||
<name>org.apache.uima.alchemy.ts.entity.HealthCondition</name>
|
||||
<description/>
|
||||
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
|
||||
</typeDescription>
|
||||
<typeDescription>
|
||||
<name>org.apache.uima.alchemy.ts.entity.Holiday</name>
|
||||
<description/>
|
||||
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
|
||||
</typeDescription>
|
||||
<typeDescription>
|
||||
<name>org.apache.uima.alchemy.ts.entity.Movie</name>
|
||||
<description/>
|
||||
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
|
||||
</typeDescription>
|
||||
<typeDescription>
|
||||
<name>org.apache.uima.alchemy.ts.entity.MusicGroup</name>
|
||||
<description/>
|
||||
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
|
||||
</typeDescription>
|
||||
<typeDescription>
|
||||
<name>org.apache.uima.alchemy.ts.entity.NaturalDisaster</name>
|
||||
<description/>
|
||||
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
|
||||
</typeDescription>
|
||||
<typeDescription>
|
||||
<name>org.apache.uima.alchemy.ts.entity.Organization</name>
|
||||
<description/>
|
||||
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
|
||||
</typeDescription>
|
||||
<typeDescription>
|
||||
<name>org.apache.uima.alchemy.ts.entity.Person</name>
|
||||
<description/>
|
||||
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
|
||||
</typeDescription>
|
||||
<typeDescription>
|
||||
<name>org.apache.uima.alchemy.ts.entity.PrintMedia</name>
|
||||
<description/>
|
||||
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
|
||||
</typeDescription>
|
||||
<typeDescription>
|
||||
<name>org.apache.uima.alchemy.ts.entity.RadioProgram</name>
|
||||
<description/>
|
||||
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
|
||||
</typeDescription>
|
||||
<typeDescription>
|
||||
<name>org.apache.uima.alchemy.ts.entity.RadioStation</name>
|
||||
<description/>
|
||||
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
|
||||
</typeDescription>
|
||||
<typeDescription>
|
||||
<name>org.apache.uima.alchemy.ts.entity.Region</name>
|
||||
<description/>
|
||||
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
|
||||
</typeDescription>
|
||||
<typeDescription>
|
||||
<name>org.apache.uima.alchemy.ts.entity.Sport</name>
|
||||
<description/>
|
||||
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
|
||||
</typeDescription>
|
||||
<typeDescription>
|
||||
<name>org.apache.uima.alchemy.ts.entity.StateOrCounty</name>
|
||||
<description/>
|
||||
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
|
||||
</typeDescription>
|
||||
<typeDescription>
|
||||
<name>org.apache.uima.alchemy.ts.entity.Technology</name>
|
||||
<description/>
|
||||
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
|
||||
</typeDescription>
|
||||
<typeDescription>
|
||||
<name>org.apache.uima.alchemy.ts.entity.TelevisionShow</name>
|
||||
<description/>
|
||||
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
|
||||
</typeDescription>
|
||||
<typeDescription>
|
||||
<name>org.apache.uima.alchemy.ts.entity.TelevisionStation</name>
|
||||
<description/>
|
||||
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
|
||||
</typeDescription>
|
||||
<typeDescription>
|
||||
<name>org.apache.uima.alchemy.ts.entity.OperatingSystem</name>
|
||||
<description/>
|
||||
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
|
||||
</typeDescription>
|
||||
<typeDescription>
|
||||
<name>org.apache.uima.alchemy.ts.entity.SportingEvent</name>
|
||||
<description/>
|
||||
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
|
||||
</typeDescription>
|
||||
<typeDescription>
|
||||
<name>org.apache.uima.alchemy.ts.entity.Drug</name>
|
||||
<description/>
|
||||
<supertypeName>org.apache.uima.alchemy.ts.entity.BaseEntity</supertypeName>
|
||||
</typeDescription>
|
||||
<typeDescription>
|
||||
<name>org.apache.uima.alchemy.ts.entity.BaseEntity</name>
|
||||
<description/>
|
||||
<supertypeName>uima.cas.TOP</supertypeName>
|
||||
<features>
|
||||
<featureDescription>
|
||||
<name>text</name>
|
||||
<description/>
|
||||
<rangeTypeName>uima.cas.String</rangeTypeName>
|
||||
</featureDescription>
|
||||
<featureDescription>
|
||||
<name>count</name>
|
||||
<description/>
|
||||
<rangeTypeName>uima.cas.String</rangeTypeName>
|
||||
</featureDescription>
|
||||
<featureDescription>
|
||||
<name>relevance</name>
|
||||
<description/>
|
||||
<rangeTypeName>uima.cas.String</rangeTypeName>
|
||||
</featureDescription>
|
||||
<featureDescription>
|
||||
<name>disambiguation</name>
|
||||
<description/>
|
||||
<rangeTypeName>uima.cas.String</rangeTypeName>
|
||||
</featureDescription>
|
||||
<featureDescription>
|
||||
<name>subType</name>
|
||||
<description/>
|
||||
<rangeTypeName>uima.cas.String</rangeTypeName>
|
||||
</featureDescription>
|
||||
<featureDescription>
|
||||
<name>website</name>
|
||||
<description/>
|
||||
<rangeTypeName>uima.cas.String</rangeTypeName>
|
||||
</featureDescription>
|
||||
<featureDescription>
|
||||
<name>geo</name>
|
||||
<description/>
|
||||
<rangeTypeName>uima.cas.String</rangeTypeName>
|
||||
</featureDescription>
|
||||
<featureDescription>
|
||||
<name>dbpedia</name>
|
||||
<description/>
|
||||
<rangeTypeName>uima.cas.String</rangeTypeName>
|
||||
</featureDescription>
|
||||
<featureDescription>
|
||||
<name>yago</name>
|
||||
<description/>
|
||||
<rangeTypeName>uima.cas.String</rangeTypeName>
|
||||
</featureDescription>
|
||||
<featureDescription>
|
||||
<name>opencyc</name>
|
||||
<description/>
|
||||
<rangeTypeName>uima.cas.String</rangeTypeName>
|
||||
</featureDescription>
|
||||
<featureDescription>
|
||||
<name>umbel</name>
|
||||
<description/>
|
||||
<rangeTypeName>uima.cas.String</rangeTypeName>
|
||||
</featureDescription>
|
||||
<featureDescription>
|
||||
<name>freebase</name>
|
||||
<description/>
|
||||
<rangeTypeName>uima.cas.String</rangeTypeName>
|
||||
</featureDescription>
|
||||
<featureDescription>
|
||||
<name>ciaFactbook</name>
|
||||
<description/>
|
||||
<rangeTypeName>uima.cas.String</rangeTypeName>
|
||||
</featureDescription>
|
||||
<featureDescription>
|
||||
<name>census</name>
|
||||
<description/>
|
||||
<rangeTypeName>uima.cas.String</rangeTypeName>
|
||||
</featureDescription>
|
||||
<featureDescription>
|
||||
<name>geonames</name>
|
||||
<description/>
|
||||
<rangeTypeName>uima.cas.String</rangeTypeName>
|
||||
</featureDescription>
|
||||
<featureDescription>
|
||||
<name>musicBrainz</name>
|
||||
<description/>
|
||||
<rangeTypeName>uima.cas.String</rangeTypeName>
|
||||
</featureDescription>
|
||||
<featureDescription>
|
||||
<name>quotations</name>
|
||||
<description/>
|
||||
<rangeTypeName>uima.cas.StringArray</rangeTypeName>
|
||||
<multipleReferencesAllowed>true</multipleReferencesAllowed>
|
||||
</featureDescription>
|
||||
<featureDescription>
|
||||
<name>occurrences</name>
|
||||
<description>A list of annotations annotating this entity</description>
|
||||
<rangeTypeName>uima.cas.FSList</rangeTypeName>
|
||||
<elementType>uima.tcas.Annotation</elementType>
|
||||
</featureDescription>
|
||||
</features>
|
||||
</typeDescription>
|
||||
</types>
|
||||
</typeSystemDescription>
|
||||
<typePriorities/>
|
||||
<fsIndexCollection/>
|
||||
<capabilities>
|
||||
<capability>
|
||||
<inputs/>
|
||||
<outputs/>
|
||||
<languagesSupported/>
|
||||
</capability>
|
||||
</capabilities>
|
||||
<operationalProperties>
|
||||
<modifiesCas>true</modifiesCas>
|
||||
<multipleDeploymentAllowed>true</multipleDeploymentAllowed>
|
||||
<outputsNewCASes>false</outputsNewCASes>
|
||||
</operationalProperties>
|
||||
</analysisEngineMetaData>
|
||||
<resourceManagerConfiguration/>
|
||||
</analysisEngineDescription>
|
|
@ -0,0 +1,115 @@
|
|||
<?xml version="1.0" encoding="UTF-8" ?>
|
||||
|
||||
<!--
|
||||
***************************************************************
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing,
|
||||
* software distributed under the License is distributed on an
|
||||
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
* KIND, either express or implied. See the License for the
|
||||
* specific language governing permissions and limitations
|
||||
* under the License.
|
||||
***************************************************************
|
||||
-->
|
||||
|
||||
<analysisEngineDescription
|
||||
xmlns="http://uima.apache.org/resourceSpecifier">
|
||||
<frameworkImplementation>
|
||||
org.apache.uima.java
|
||||
</frameworkImplementation>
|
||||
<primitive>true</primitive>
|
||||
<annotatorImplementationName>
|
||||
org.apache.uima.annotator.WhitespaceTokenizer
|
||||
</annotatorImplementationName>
|
||||
|
||||
<analysisEngineMetaData>
|
||||
<name>WhitespaceTokenizer</name>
|
||||
<description>
|
||||
creates token and sentence annotations for whitespace
|
||||
separated languages
|
||||
</description>
|
||||
<version>1.0</version>
|
||||
<vendor>The Apache Software Foundation</vendor>
|
||||
|
||||
<configurationParameters>
|
||||
<configurationParameter>
|
||||
<name>SofaNames</name>
|
||||
<description>
|
||||
The Sofa names the annotator should work on. If no
|
||||
names are specified, the annotator works on the
|
||||
default sofa.
|
||||
</description>
|
||||
<type>String</type>
|
||||
<multiValued>true</multiValued>
|
||||
<mandatory>false</mandatory>
|
||||
</configurationParameter>
|
||||
|
||||
</configurationParameters>
|
||||
|
||||
<configurationParameterSettings>
|
||||
<!--
|
||||
<nameValuePair>
|
||||
<name>SofaNames</name>
|
||||
<value>
|
||||
<array>
|
||||
<string>sofaName</string>
|
||||
</array>
|
||||
</value>
|
||||
</nameValuePair>
|
||||
-->
|
||||
</configurationParameterSettings>
|
||||
|
||||
<typeSystemDescription>
|
||||
<typeDescription>
|
||||
<name>org.apache.uima.TokenAnnotation</name>
|
||||
<description>Single token annotation</description>
|
||||
<supertypeName>uima.tcas.Annotation</supertypeName>
|
||||
<features>
|
||||
<featureDescription>
|
||||
<name>tokenType</name>
|
||||
<description>token type</description>
|
||||
<rangeTypeName>uima.cas.String</rangeTypeName>
|
||||
</featureDescription>
|
||||
</features>
|
||||
</typeDescription>
|
||||
|
||||
<typeDescription>
|
||||
<name>org.apache.uima.SentenceAnnotation</name>
|
||||
<description>sentence annotation</description>
|
||||
<supertypeName>uima.tcas.Annotation</supertypeName>
|
||||
<features>
|
||||
|
||||
</features>
|
||||
</typeDescription>
|
||||
</typeSystemDescription>
|
||||
|
||||
<fsIndexes />
|
||||
|
||||
<capabilities>
|
||||
<capability>
|
||||
<inputs />
|
||||
<outputs>
|
||||
<type>org.apache.uima.TokenAnnotation</type>
|
||||
<!-- Feature names are case-sensitive: this must match the "tokenType" feature declared on org.apache.uima.TokenAnnotation in the type system above. -->
<feature>
|
||||
org.apache.uima.TokenAnnotation:tokenType
|
||||
</feature>
|
||||
<type>org.apache.uima.SentenceAnnotation</type>
|
||||
</outputs>
|
||||
<languagesSupported>
|
||||
<language>x-unspecified</language>
|
||||
</languagesSupported>
|
||||
</capability>
|
||||
</capabilities>
|
||||
|
||||
</analysisEngineMetaData>
|
||||
</analysisEngineDescription>
|
||||
|
|
@ -0,0 +1,41 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!--
|
||||
|
||||
Licensed to the Apache Software Foundation (ASF) under one
|
||||
or more contributor license agreements. See the NOTICE file
|
||||
distributed with this work for additional information
|
||||
regarding copyright ownership. The ASF licenses this file
|
||||
to you under the Apache License, Version 2.0 (the
|
||||
"License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing,
|
||||
software distributed under the License is distributed on an
|
||||
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations
|
||||
under the License.
|
||||
|
||||
-->
|
||||
<typeSystemDescription xmlns="http://uima.apache.org/resourceSpecifier">
<!--
Base type system for the Alchemy annotators. It declares a single type,
org.apache.uima.alchemy.ts.entity.AlchemyAnnotation, which extends
uima.tcas.Annotation and adds one String feature named "alchemyType".
-->
|
||||
<name>baseAlchemyTypeSystemDescriptor</name>
|
||||
<description/>
|
||||
<version>1.0</version>
|
||||
<vendor/>
|
||||
<types>
|
||||
<typeDescription>
|
||||
<name>org.apache.uima.alchemy.ts.entity.AlchemyAnnotation</name>
|
||||
<description/>
|
||||
<supertypeName>uima.tcas.Annotation</supertypeName>
|
||||
<features>
|
||||
<featureDescription>
|
||||
<!-- String-valued feature; described only as "alchemyAPI type". -->
<name>alchemyType</name>
|
||||
<description>alchemyAPI type</description>
|
||||
<rangeTypeName>uima.cas.String</rangeTypeName>
|
||||
</featureDescription>
|
||||
</features>
|
||||
</typeDescription>
|
||||
</types>
|
||||
</typeSystemDescription>
|
|
@ -0,0 +1,33 @@
|
|||
<?xml version="1.0" encoding="UTF-8" ?>
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version
|
||||
2.0 (the "License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0 Unless required by
|
||||
applicable law or agreed to in writing, software distributed under
|
||||
the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
|
||||
OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<uimaConfig>
<!--
Configuration for the UIMA update processor: runtime parameters, the
analysis engine descriptor to load, the document fields to analyze and the
mapping from UIMA annotation features to document fields.
-->
|
||||
<!--
The values below (VALID_ALCHEMYAPI_KEY / VALID_OPENCALAIS_KEY) look like
placeholders that must be replaced with real service keys.
NOTE(review): confirm how these names are bound to the analysis engine parameters.
-->
<runtimeParameters>
|
||||
<keyword_apikey>VALID_ALCHEMYAPI_KEY</keyword_apikey>
|
||||
<concept_apikey>VALID_ALCHEMYAPI_KEY</concept_apikey>
|
||||
<lang_apikey>VALID_ALCHEMYAPI_KEY</lang_apikey>
|
||||
<cat_apikey>VALID_ALCHEMYAPI_KEY</cat_apikey>
|
||||
<oc_licenseID>VALID_OPENCALAIS_KEY</oc_licenseID>
|
||||
</runtimeParameters>
|
||||
<!-- Path of the analysis engine descriptor; presumably resolved as a classpath resource (TODO confirm). -->
<analysisEngine>/org/apache/uima/desc/OverridingParamsExtServicesAE.xml</analysisEngine>
|
||||
<!-- Comma-separated list of fields to analyze. NOTE(review): merge="false" presumably means each field is analyzed separately; confirm. -->
<analyzeFields merge="false">text,title</analyzeFields>
|
||||
<!-- For each org.apache.uima.jcas.tcas.Annotation, maps its "coveredText" feature to the "tag" field. -->
<fieldMapping>
|
||||
<type name="org.apache.uima.jcas.tcas.Annotation">
|
||||
<map feature="coveredText" field="tag"/>
|
||||
</type>
|
||||
</fieldMapping>
|
||||
</uimaConfig>
|
|
@ -0,0 +1,9 @@
|
|||
<?xml version="1.0" encoding="UTF-8" ?>
|
||||
<fields>
<!--
Field declarations for the metadata produced by the UIMA module: single-valued
"language" and "suggested_category", multi-valued "concept", "keyword" and
"sentence", plus a dynamic "entity*" field. None of the fields is required.
NOTE(review): presumably meant to be merged into the <fields> section of a schema.xml; confirm.
-->
|
||||
<field name="language" type="string" indexed="true" stored="true" required="false"/>
|
||||
<field name="concept" type="string" indexed="true" stored="true" multiValued="true" required="false"/>
|
||||
<field name="keyword" type="string" indexed="true" stored="true" multiValued="true" required="false"/>
|
||||
<field name="suggested_category" type="string" indexed="true" stored="true" multiValued="false" required="false"/>
|
||||
<field name="sentence" type="text" indexed="true" stored="true" multiValued="true" required="false" />
|
||||
<!-- Catch-all for any field whose name starts with "entity". -->
<dynamicField name="entity*" type="text" indexed="true" stored="true" multiValued="true"/>
|
||||
</fields>
|
|
@ -0,0 +1,137 @@
|
|||
package org.apache.solr.uima.processor;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.net.URL;
|
||||
import java.net.URLConnection;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.solr.SolrTestCaseJ4;
|
||||
import org.apache.solr.common.params.MultiMapSolrParams;
|
||||
import org.apache.solr.common.params.SolrParams;
|
||||
import org.apache.solr.common.params.UpdateParams;
|
||||
import org.apache.solr.common.util.ContentStream;
|
||||
import org.apache.solr.common.util.ContentStreamBase;
|
||||
import org.apache.solr.core.SolrCore;
|
||||
import org.apache.solr.handler.XmlUpdateRequestHandler;
|
||||
import org.apache.solr.request.SolrQueryRequestBase;
|
||||
import org.apache.solr.response.SolrQueryResponse;
|
||||
import org.apache.solr.update.processor.UpdateRequestProcessorChain;
|
||||
import org.junit.Before;
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.Test;
|
||||
|
||||
/**
|
||||
* TestCase for {@link UIMAUpdateRequestProcessor}
|
||||
*
|
||||
* @version $Id$
|
||||
*/
|
||||
public class UIMAUpdateRequestProcessorTest extends SolrTestCaseJ4 {
|
||||
|
||||
@BeforeClass
|
||||
public static void beforeClass() throws Exception {
|
||||
initCore("solrconfig.xml", "schema.xml", "solr-uima");
|
||||
}
|
||||
|
||||
@Before
|
||||
public void setUp() throws Exception {
|
||||
super.setUp();
|
||||
clearIndex();
|
||||
assertU(commit());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testProcessorConfiguration() {
|
||||
SolrCore core = h.getCore();
|
||||
UpdateRequestProcessorChain chained = core.getUpdateProcessingChain("uima");
|
||||
assertNotNull(chained);
|
||||
UIMAUpdateRequestProcessorFactory factory = (UIMAUpdateRequestProcessorFactory) chained
|
||||
.getFactories()[0];
|
||||
assertNotNull(factory);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testProcessing() throws Exception {
|
||||
// this test requires an internet connection (e.g. opencalais api)
|
||||
checkInternetConnection();
|
||||
|
||||
addDoc(adoc(
|
||||
"id",
|
||||
"2312312321312",
|
||||
"text",
|
||||
"SpellCheckComponent got improvement related to recent Lucene changes. \n "
|
||||
+ "Add support for specifying Spelling SuggestWord Comparator to Lucene spell "
|
||||
+ "checkers for SpellCheckComponent. Issue SOLR-2053 is already fixed, patch is"
|
||||
+ " attached if you need it, but it is also committed to trunk and 3_x branch."
|
||||
+ " Last Lucene European Conference has been held in Prague."));
|
||||
assertU(commit());
|
||||
assertQ(req("language:english"), "//*[@numFound='1']");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testTwoUpdates() {
|
||||
// this test requires an internet connection (e.g. opencalais api)
|
||||
checkInternetConnection();
|
||||
|
||||
try {
|
||||
addDoc(adoc("id", "1", "text", "The Apache Software Foundation is happy to announce "
|
||||
+ "BarCampApache Sydney, Australia, the first ASF-backed event in the Southern "
|
||||
+ "Hemisphere!"));
|
||||
assertU(commit());
|
||||
assertQ(req("language:english"), "//*[@numFound='1']");
|
||||
|
||||
addDoc(adoc("id", "2", "text", "Taking place 11th December 2010 at the University "
|
||||
+ "of Sydney's Darlington Centre, the BarCampApache \"unconference\" will be"
|
||||
+ " attendee-driven, facilitated by members of the Apache community and will "
|
||||
+ "focus on the Apache..."));
|
||||
assertU(commit());
|
||||
assertQ(req("language:english"), "//*[@numFound='2']");
|
||||
|
||||
} catch (Exception e) {
|
||||
assumeNoException("Multiple updates on same instance didn't work", e);
|
||||
}
|
||||
}
|
||||
|
||||
private void addDoc(String doc) throws Exception {
|
||||
Map<String, String[]> params = new HashMap<String, String[]>();
|
||||
params.put(UpdateParams.UPDATE_PROCESSOR, new String[] { "uima" });
|
||||
MultiMapSolrParams mmparams = new MultiMapSolrParams(params);
|
||||
SolrQueryRequestBase req = new SolrQueryRequestBase(h.getCore(), (SolrParams) mmparams) {
|
||||
};
|
||||
|
||||
XmlUpdateRequestHandler handler = new XmlUpdateRequestHandler();
|
||||
handler.init(null);
|
||||
ArrayList<ContentStream> streams = new ArrayList<ContentStream>(2);
|
||||
streams.add(new ContentStreamBase.StringStream(doc));
|
||||
req.setContentStreams(streams);
|
||||
handler.handleRequestBody(req, new SolrQueryResponse());
|
||||
}
|
||||
|
||||
private void checkInternetConnection() {
|
||||
try {
|
||||
URLConnection conn = new URL("http://www.apache.org/").openConnection();
|
||||
conn.setConnectTimeout(5000);
|
||||
conn.setReadTimeout(5000);
|
||||
conn.connect();
|
||||
} catch (Exception ex) {
|
||||
assumeNoException("This test requires an internet connection", ex);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,21 @@
|
|||
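# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0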
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
#-----------------------------------------------------------------------
|
||||
# Use a protected word file to protect against the stemmer reducing two
|
||||
# unrelated words to the same base word.
|
||||
|
||||
# Some non-words that normally won't be encountered,
|
||||
# just to test that they won't be stemmed.
|
||||
dontstems
|
||||
zwhacky
|
||||
|
|
@ -0,0 +1,679 @@
|
|||
<?xml version="1.0" encoding="UTF-8" ?>
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version
|
||||
2.0 (the "License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0 Unless required by
|
||||
applicable law or agreed to in writing, software distributed under
|
||||
the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
|
||||
OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<!--
|
||||
This is the Solr schema file. This file should be named "schema.xml"
|
||||
and should be in the conf directory under the solr home (i.e.
|
||||
./solr/conf/schema.xml by default) or located where the classloader
|
||||
for the Solr webapp can find it. This example schema is the
|
||||
recommended starting point for users. It should be kept correct and
|
||||
concise, usable out-of-the-box. For more information, on how to
|
||||
customize this file, please see
|
||||
http://wiki.apache.org/solr/SchemaXml PERFORMANCE NOTE: this schema
|
||||
includes many optional features and should not be used for
|
||||
benchmarking. To improve performance one could - set stored="false"
|
||||
for all fields possible (esp large fields) when you only need to
|
||||
search on the field but don't need to return the original value. -
|
||||
set indexed="false" if you don't need to search on the field, but
|
||||
only return the field as a result of searching on other indexed
|
||||
fields. - remove all unneeded copyField statements - for best index
|
||||
size and searching performance, set "indexed" to false for all general
|
||||
text fields, use copyField to copy them to the catchall "text"
|
||||
field, and use that for searching. - For maximum indexing
|
||||
performance, use the StreamingUpdateSolrServer java client. -
|
||||
Remember to run the JVM in server mode, and use a higher logging
|
||||
level that avoids logging every request
|
||||
-->
|
||||
|
||||
<schema name="sample" version="1.2">
|
||||
<!--
|
||||
attribute "name" is the name of this schema and is only used for
|
||||
display purposes. Applications should change this to reflect the
|
||||
nature of the search collection. version="1.2" is Solr's version
|
||||
number for the schema syntax and semantics. It should not normally
|
||||
be changed by applications. 1.0: multiValued attribute did not
|
||||
exist, all fields are multiValued by nature 1.1: multiValued
|
||||
attribute introduced, false by default 1.2: omitTermFreqAndPositions
|
||||
attribute introduced, true by default except for text fields.
|
||||
-->
|
||||
|
||||
<types>
|
||||
<!--
|
||||
field type definitions. The "name" attribute is just a label to be
|
||||
used by field definitions. The "class" attribute and any other
|
||||
attributes determine the real behavior of the fieldType. Class
|
||||
names starting with "solr" refer to java classes in the
|
||||
org.apache.solr.analysis package.
|
||||
-->
|
||||
|
||||
<!--
|
||||
The StrField type is not analyzed, but indexed/stored verbatim. -
|
||||
StrField and TextField support an optional compressThreshold which
|
||||
limits compression (if enabled in the derived fields) to values
|
||||
which exceed a certain size (in characters).
|
||||
-->
|
||||
<fieldType name="string" class="solr.StrField"
|
||||
sortMissingLast="true" omitNorms="true" />
|
||||
|
||||
<!-- boolean type: "true" or "false" -->
|
||||
<fieldType name="boolean" class="solr.BoolField"
|
||||
sortMissingLast="true" omitNorms="true" />
|
||||
<!--
|
||||
Binary data type. The data should be sent/retrieved as Base64
|
||||
encoded Strings
|
||||
-->
|
||||
<fieldType name="binary" class="solr.BinaryField" />
|
||||
|
||||
<!--
|
||||
The optional sortMissingLast and sortMissingFirst attributes are
|
||||
currently supported on types that are sorted internally as
|
||||
strings. This includes
|
||||
"string","boolean","sint","slong","sfloat","sdouble","pdate" - If
|
||||
sortMissingLast="true", then a sort on this field will cause
|
||||
documents without the field to come after documents with the
|
||||
field, regardless of the requested sort order (asc or desc). - If
|
||||
sortMissingFirst="true", then a sort on this field will cause
|
||||
documents without the field to come before documents with the
|
||||
field, regardless of the requested sort order. - If
|
||||
sortMissingLast="false" and sortMissingFirst="false" (the
|
||||
default), then default lucene sorting will be used which places
|
||||
docs without the field first in an ascending sort and last in a
|
||||
descending sort.
|
||||
-->
|
||||
|
||||
<!--
|
||||
Default numeric field types. For faster range queries, consider
|
||||
the tint/tfloat/tlong/tdouble types.
|
||||
-->
|
||||
<fieldType name="int" class="solr.TrieIntField"
|
||||
precisionStep="0" omitNorms="true" positionIncrementGap="0" />
|
||||
<fieldType name="float" class="solr.TrieFloatField"
|
||||
precisionStep="0" omitNorms="true" positionIncrementGap="0" />
|
||||
<fieldType name="long" class="solr.TrieLongField"
|
||||
precisionStep="0" omitNorms="true" positionIncrementGap="0" />
|
||||
<fieldType name="double" class="solr.TrieDoubleField"
|
||||
precisionStep="0" omitNorms="true" positionIncrementGap="0" />
|
||||
|
||||
<!--
|
||||
Numeric field types that index each value at various levels of
|
||||
precision to accelerate range queries when the number of values
|
||||
between the range endpoints is large. See the javadoc for
|
||||
NumericRangeQuery for internal implementation details. Smaller
|
||||
precisionStep values (specified in bits) will lead to more tokens
|
||||
indexed per value, slightly larger index size, and faster range
|
||||
queries. A precisionStep of 0 disables indexing at different
|
||||
precision levels.
|
||||
-->
|
||||
<fieldType name="tint" class="solr.TrieIntField"
|
||||
precisionStep="8" omitNorms="true" positionIncrementGap="0" />
|
||||
<fieldType name="tfloat" class="solr.TrieFloatField"
|
||||
precisionStep="8" omitNorms="true" positionIncrementGap="0" />
|
||||
<fieldType name="tlong" class="solr.TrieLongField"
|
||||
precisionStep="8" omitNorms="true" positionIncrementGap="0" />
|
||||
<fieldType name="tdouble" class="solr.TrieDoubleField"
|
||||
precisionStep="8" omitNorms="true" positionIncrementGap="0" />
|
||||
|
||||
<!--
|
||||
The format for this date field is of the form
|
||||
1995-12-31T23:59:59Z, and is a more restricted form of the
|
||||
canonical representation of dateTime
|
||||
http://www.w3.org/TR/xmlschema-2/#dateTime The trailing "Z"
|
||||
designates UTC time and is mandatory. Optional fractional seconds
|
||||
are allowed: 1995-12-31T23:59:59.999Z All other components are
|
||||
mandatory. Expressions can also be used to denote calculations
|
||||
that should be performed relative to "NOW" to determine the value,
|
||||
ie... NOW/HOUR ... Round to the start of the current hour NOW-1DAY
|
||||
... Exactly 1 day prior to now NOW/DAY+6MONTHS+3DAYS ... 6 months
|
||||
and 3 days in the future from the start of the current day Consult
|
||||
the DateField javadocs for more information. Note: For faster
|
||||
range queries, consider the tdate type
|
||||
-->
|
||||
<fieldType name="date" class="solr.TrieDateField"
|
||||
omitNorms="true" precisionStep="0" positionIncrementGap="0" />
|
||||
|
||||
<!--
|
||||
A Trie based date field for faster date range queries and date
|
||||
faceting.
|
||||
-->
|
||||
<fieldType name="tdate" class="solr.TrieDateField"
|
||||
omitNorms="true" precisionStep="6" positionIncrementGap="0" />
|
||||
|
||||
|
||||
<!--
|
||||
Note: These should only be used for compatibility with existing
|
||||
indexes (created with older Solr versions) or if
|
||||
"sortMissingFirst" or "sortMissingLast" functionality is needed.
|
||||
Use Trie based fields instead. Plain numeric field types that
|
||||
store and index the text value verbatim (and hence don't support
|
||||
range queries, since the lexicographic ordering isn't equal to the
|
||||
numeric ordering)
|
||||
-->
|
||||
<fieldType name="pint" class="solr.IntField" omitNorms="true" />
|
||||
<fieldType name="plong" class="solr.LongField" omitNorms="true" />
|
||||
<fieldType name="pfloat" class="solr.FloatField"
|
||||
omitNorms="true" />
|
||||
<fieldType name="pdouble" class="solr.DoubleField"
|
||||
omitNorms="true" />
|
||||
<fieldType name="pdate" class="solr.DateField"
|
||||
sortMissingLast="true" omitNorms="true" />
|
||||
|
||||
|
||||
<!--
|
||||
Note: These should only be used for compatibility with existing
|
||||
indexes (created with older Solr versions) or if
|
||||
"sortMissingFirst" or "sortMissingLast" functionality is needed.
|
||||
Use Trie based fields instead. Numeric field types that manipulate
|
||||
the value into a string value that isn't human-readable in its
|
||||
internal form, but with a lexicographic ordering the same as the
|
||||
numeric ordering, so that range queries work correctly.
|
||||
-->
|
||||
<fieldType name="sint" class="solr.SortableIntField"
|
||||
sortMissingLast="true" omitNorms="true" />
|
||||
<fieldType name="slong" class="solr.SortableLongField"
|
||||
sortMissingLast="true" omitNorms="true" />
|
||||
<fieldType name="sfloat" class="solr.SortableFloatField"
|
||||
sortMissingLast="true" omitNorms="true" />
|
||||
<fieldType name="sdouble" class="solr.SortableDoubleField"
|
||||
sortMissingLast="true" omitNorms="true" />
|
||||
|
||||
|
||||
<!--
|
||||
The "RandomSortField" is not used to store or search any data. You
|
||||
can declare fields of this type in your schema to generate
|
||||
pseudo-random orderings of your docs for sorting purposes. The
|
||||
ordering is generated based on the field name and the version of
|
||||
the index. As long as the index version remains unchanged, and the
|
||||
same field name is reused, the ordering of the docs will be
|
||||
consistent. If you want different pseudo-random orderings of
|
||||
documents, for the same version of the index, use a dynamicField
|
||||
and change the name
|
||||
-->
|
||||
<fieldType name="random" class="solr.RandomSortField"
|
||||
indexed="true" />
|
||||
|
||||
<!--
|
||||
solr.TextField allows the specification of custom text analyzers
|
||||
specified as a tokenizer and a list of token filters. Different
|
||||
analyzers may be specified for indexing and querying. The optional
|
||||
positionIncrementGap puts space between multiple fields of this
|
||||
type on the same document, with the purpose of preventing false
|
||||
phrase matching across fields. For more info on customizing your
|
||||
analyzer chain, please see
|
||||
http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters
|
||||
-->
|
||||
|
||||
<!--
|
||||
One can also specify an existing Analyzer class that has a default
|
||||
constructor via the class attribute on the analyzer element
|
||||
<fieldType name="text_greek" class="solr.TextField"> <analyzer
|
||||
class="org.apache.lucene.analysis.el.GreekAnalyzer"/> </fieldType>
|
||||
-->
|
||||
|
||||
<!--
|
||||
A text field that only splits on whitespace for exact matching of
|
||||
words
|
||||
-->
|
||||
<fieldType name="text_ws" class="solr.TextField"
|
||||
positionIncrementGap="100">
|
||||
<analyzer>
|
||||
<tokenizer class="solr.WhitespaceTokenizerFactory" />
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
<!--
|
||||
A text field that uses WordDelimiterFilter to enable splitting and
|
||||
matching of words on case-change, alpha numeric boundaries, and
|
||||
non-alphanumeric chars, so that a query of "wifi" or "wi fi" could
|
||||
match a document containing "Wi-Fi". Synonyms and stopwords are
|
||||
customized by external files, and stemming is enabled.
|
||||
-->
|
||||
<fieldType name="text" class="solr.TextField"
|
||||
positionIncrementGap="100">
|
||||
<analyzer type="index">
|
||||
<tokenizer class="solr.WhitespaceTokenizerFactory" />
|
||||
<!--
|
||||
in this example, we will only use synonyms at query time
|
||||
<filter class="solr.SynonymFilterFactory"
|
||||
synonyms="index_synonyms.txt" ignoreCase="true"
|
||||
expand="false"/>
|
||||
-->
|
||||
<!--
|
||||
Case insensitive stop word removal. add
|
||||
enablePositionIncrements=true in both the index and query
|
||||
analyzers to leave a 'gap' for more accurate phrase queries.
|
||||
-->
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true"
|
||||
words="stopwords.txt" enablePositionIncrements="true" />
|
||||
<filter class="solr.WordDelimiterFilterFactory"
|
||||
generateWordParts="1" generateNumberParts="1" catenateWords="1"
|
||||
catenateNumbers="1" catenateAll="0" splitOnCaseChange="1" />
|
||||
<filter class="solr.LowerCaseFilterFactory" />
|
||||
|
||||
</analyzer>
|
||||
<analyzer type="query">
|
||||
<tokenizer class="solr.WhitespaceTokenizerFactory" />
|
||||
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt"
|
||||
ignoreCase="true" expand="true" />
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true"
|
||||
words="stopwords.txt" enablePositionIncrements="true" />
|
||||
<filter class="solr.WordDelimiterFilterFactory"
|
||||
generateWordParts="1" generateNumberParts="1" catenateWords="0"
|
||||
catenateNumbers="0" catenateAll="0" splitOnCaseChange="1" />
|
||||
<filter class="solr.LowerCaseFilterFactory" />
|
||||
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
|
||||
<!--
|
||||
Less flexible matching, but fewer false matches. Probably not ideal
|
||||
for product names, but may be good for SKUs. Can insert dashes in
|
||||
the wrong place and still match.
|
||||
-->
|
||||
<fieldType name="textTight" class="solr.TextField"
|
||||
positionIncrementGap="100">
|
||||
<analyzer>
|
||||
<tokenizer class="solr.WhitespaceTokenizerFactory" />
|
||||
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt"
|
||||
ignoreCase="true" expand="false" />
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true"
|
||||
words="stopwords.txt" />
|
||||
<filter class="solr.WordDelimiterFilterFactory"
|
||||
generateWordParts="0" generateNumberParts="0" catenateWords="1"
|
||||
catenateNumbers="1" catenateAll="0" />
|
||||
<filter class="solr.LowerCaseFilterFactory" />
|
||||
|
||||
<!--
|
||||
this filter can remove any duplicate tokens that appear at the
|
||||
same position - sometimes possible with WordDelimiterFilter in
|
||||
conjunction with stemming.
|
||||
-->
|
||||
<filter class="solr.RemoveDuplicatesTokenFilterFactory" />
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
|
||||
<!--
|
||||
A general unstemmed text field - good if one does not know the
|
||||
language of the field
|
||||
-->
|
||||
<fieldType name="textgen" class="solr.TextField"
|
||||
positionIncrementGap="100">
|
||||
<analyzer type="index">
|
||||
<tokenizer class="solr.WhitespaceTokenizerFactory" />
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true"
|
||||
words="stopwords.txt" enablePositionIncrements="true" />
|
||||
<filter class="solr.WordDelimiterFilterFactory"
|
||||
generateWordParts="1" generateNumberParts="1" catenateWords="1"
|
||||
catenateNumbers="1" catenateAll="0" splitOnCaseChange="0" />
|
||||
<filter class="solr.LowerCaseFilterFactory" />
|
||||
</analyzer>
|
||||
<analyzer type="query">
|
||||
<tokenizer class="solr.WhitespaceTokenizerFactory" />
|
||||
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt"
|
||||
ignoreCase="true" expand="true" />
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true"
|
||||
words="stopwords.txt" enablePositionIncrements="true" />
|
||||
<filter class="solr.WordDelimiterFilterFactory"
|
||||
generateWordParts="1" generateNumberParts="1" catenateWords="0"
|
||||
catenateNumbers="0" catenateAll="0" splitOnCaseChange="0" />
|
||||
<filter class="solr.LowerCaseFilterFactory" />
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
|
||||
<!--
|
||||
A general unstemmed text field that indexes tokens normally and
|
||||
also reversed (via ReversedWildcardFilterFactory), to enable more
|
||||
efficient leading wildcard queries.
|
||||
-->
|
||||
<fieldType name="text_rev" class="solr.TextField"
|
||||
positionIncrementGap="100">
|
||||
<analyzer type="index">
|
||||
<tokenizer class="solr.WhitespaceTokenizerFactory" />
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true"
|
||||
words="stopwords.txt" enablePositionIncrements="true" />
|
||||
<filter class="solr.WordDelimiterFilterFactory"
|
||||
generateWordParts="1" generateNumberParts="1" catenateWords="1"
|
||||
catenateNumbers="1" catenateAll="0" splitOnCaseChange="0" />
|
||||
<filter class="solr.LowerCaseFilterFactory" />
|
||||
<filter class="solr.ReversedWildcardFilterFactory"
|
||||
withOriginal="true" maxPosAsterisk="3" maxPosQuestion="2"
|
||||
maxFractionAsterisk="0.33" />
|
||||
</analyzer>
|
||||
<analyzer type="query">
|
||||
<tokenizer class="solr.WhitespaceTokenizerFactory" />
|
||||
<filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt"
|
||||
ignoreCase="true" expand="true" />
|
||||
<filter class="solr.StopFilterFactory" ignoreCase="true"
|
||||
words="stopwords.txt" enablePositionIncrements="true" />
|
||||
<filter class="solr.WordDelimiterFilterFactory"
|
||||
generateWordParts="1" generateNumberParts="1" catenateWords="0"
|
||||
catenateNumbers="0" catenateAll="0" splitOnCaseChange="0" />
|
||||
<filter class="solr.LowerCaseFilterFactory" />
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
<!-- charFilter + WhitespaceTokenizer -->
|
||||
<!--
|
||||
<fieldType name="textCharNorm" class="solr.TextField"
|
||||
positionIncrementGap="100" > <analyzer> <charFilter
|
||||
class="solr.MappingCharFilterFactory"
|
||||
mapping="mapping-ISOLatin1Accent.txt"/> <tokenizer
|
||||
class="solr.WhitespaceTokenizerFactory"/> </analyzer> </fieldType>
|
||||
-->
|
||||
|
||||
<!--
|
||||
This is an example of using the KeywordTokenizer along with
|
||||
various TokenFilterFactories to produce a sortable field that does
|
||||
not include some properties of the source text
|
||||
-->
|
||||
<fieldType name="alphaOnlySort" class="solr.TextField"
|
||||
sortMissingLast="true" omitNorms="true">
|
||||
<analyzer>
|
||||
<!--
|
||||
KeywordTokenizer does no actual tokenizing, so the entire
|
||||
input string is preserved as a single token
|
||||
-->
|
||||
<tokenizer class="solr.KeywordTokenizerFactory" />
|
||||
<!--
|
||||
The LowerCase TokenFilter does what you expect, which can be
|
||||
useful when you want your sorting to be case insensitive
|
||||
-->
|
||||
<filter class="solr.LowerCaseFilterFactory" />
|
||||
<!-- The TrimFilter removes any leading or trailing whitespace -->
|
||||
<filter class="solr.TrimFilterFactory" />
|
||||
<!--
|
||||
The PatternReplaceFilter gives you the flexibility to use Java
|
||||
Regular expression to replace any sequence of characters
|
||||
matching a pattern with an arbitrary replacement string, which
|
||||
may include back references to portions of the original string
|
||||
matched by the pattern. See the Java Regular Expression
|
||||
documentation for more information on pattern and replacement
|
||||
string syntax.
|
||||
|
||||
http://java.sun.com/j2se/1.5.0/docs/api/java/util/regex/package-summary.html
|
||||
-->
|
||||
<filter class="solr.PatternReplaceFilterFactory" pattern="([^a-z])"
|
||||
replacement="" replace="all" />
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
<fieldtype name="phonetic" stored="false" indexed="true"
|
||||
class="solr.TextField">
|
||||
<analyzer>
|
||||
<tokenizer class="solr.StandardTokenizerFactory" />
|
||||
<filter class="solr.DoubleMetaphoneFilterFactory" inject="false" />
|
||||
</analyzer>
|
||||
</fieldtype>
|
||||
|
||||
<fieldtype name="payloads" stored="false" indexed="true"
|
||||
class="solr.TextField">
|
||||
<analyzer>
|
||||
<tokenizer class="solr.WhitespaceTokenizerFactory" />
|
||||
<!--
|
||||
The DelimitedPayloadTokenFilter can put payloads on tokens...
|
||||
for example, a token of "foo|1.4" would be indexed as "foo"
|
||||
with a payload of 1.4f Attributes of the
|
||||
DelimitedPayloadTokenFilterFactory : "delimiter" - a one
|
||||
character delimiter. Default is | (pipe) "encoder" - how to
|
||||
encode the following value into a payload float ->
|
||||
org.apache.lucene.analysis.payloads.FloatEncoder, integer ->
|
||||
o.a.l.a.p.IntegerEncoder identity -> o.a.l.a.p.IdentityEncoder
|
||||
Fully Qualified class name implementing PayloadEncoder,
|
||||
Encoder must have a no arg constructor.
|
||||
-->
|
||||
<filter class="solr.DelimitedPayloadTokenFilterFactory"
|
||||
encoder="float" />
|
||||
</analyzer>
|
||||
</fieldtype>
|
||||
|
||||
<!--
|
||||
lowercases the entire field value, keeping it as a single token.
|
||||
-->
|
||||
<fieldType name="lowercase" class="solr.TextField"
|
||||
positionIncrementGap="100">
|
||||
<analyzer>
|
||||
<tokenizer class="solr.KeywordTokenizerFactory" />
|
||||
<filter class="solr.LowerCaseFilterFactory" />
|
||||
</analyzer>
|
||||
</fieldType>
|
||||
|
||||
|
||||
<!--
|
||||
since fields of this type are by default not stored or indexed,
|
||||
any data added to them will be ignored outright.
|
||||
-->
|
||||
<fieldtype name="ignored" stored="false" indexed="false"
|
||||
multiValued="true" class="solr.StrField" />
|
||||
|
||||
</types>
|
||||
|
||||
|
||||
<fields>
|
||||
<!--
|
||||
Valid attributes for fields: name: mandatory - the name for the
|
||||
field type: mandatory - the name of a previously defined type from
|
||||
the <types> section indexed: true if this field should be indexed
|
||||
(searchable or sortable) stored: true if this field should be
|
||||
retrievable compressed: [false] if this field should be stored
|
||||
using gzip compression (this will only apply if the field type is
|
||||
compressible; among the standard field types, only TextField and
|
||||
StrField are) multiValued: true if this field may contain multiple
|
||||
values per document omitNorms: (expert) set to true to omit the
|
||||
norms associated with this field (this disables length
|
||||
normalization and index-time boosting for the field, and saves
|
||||
some memory). Only full-text fields or fields that need an
|
||||
index-time boost need norms. termVectors: [false] set to true to
|
||||
store the term vector for a given field. When using MoreLikeThis,
|
||||
fields used for similarity should be stored for best performance.
|
||||
termPositions: Store position information with the term vector.
|
||||
This will increase storage costs. termOffsets: Store offset
|
||||
information with the term vector. This will increase storage
|
||||
costs. default: a value that should be used if no value is
|
||||
specified when adding a document.
|
||||
-->
|
||||
<field name="id" type="string" indexed="true" stored="true"
|
||||
required="true" />
|
||||
<field name="sku" type="textTight" indexed="true" stored="true"
|
||||
omitNorms="true" />
|
||||
<field name="name" type="textgen" indexed="true" stored="true" />
|
||||
<field name="alphaNameSort" type="alphaOnlySort" indexed="true"
|
||||
stored="false" />
|
||||
<field name="manu" type="textgen" indexed="true" stored="true"
|
||||
omitNorms="true" />
|
||||
<field name="cat" type="text_ws" indexed="true" stored="true"
|
||||
multiValued="true" omitNorms="true" />
|
||||
<field name="features" type="text" indexed="true" stored="true"
|
||||
multiValued="true" />
|
||||
<field name="includes" type="text" indexed="true" stored="true"
|
||||
termVectors="true" termPositions="true" termOffsets="true" />
|
||||
|
||||
<field name="weight" type="float" indexed="true" stored="true" />
|
||||
<field name="price" type="float" indexed="true" stored="true" />
|
||||
<field name="popularity" type="int" indexed="true" stored="true" />
|
||||
<field name="inStock" type="boolean" indexed="true" stored="true" />
|
||||
|
||||
|
||||
<!--
|
||||
Common metadata fields, named specifically to match up with
|
||||
SolrCell metadata when parsing rich documents such as Word, PDF.
|
||||
Some fields are multiValued only because Tika currently may return
|
||||
multiple values for them.
|
||||
-->
|
||||
<field name="title" type="text" indexed="true" stored="true"
|
||||
multiValued="true" />
|
||||
<field name="subject" type="text" indexed="true" stored="true" />
|
||||
<field name="description" type="text" indexed="true" stored="true" />
|
||||
<field name="comments" type="text" indexed="true" stored="true" />
|
||||
<field name="author" type="textgen" indexed="true" stored="true" />
|
||||
<field name="keywords" type="textgen" indexed="true" stored="true" />
|
||||
<field name="category" type="textgen" indexed="true" stored="true" />
|
||||
<field name="content_type" type="string" indexed="true"
|
||||
stored="true" multiValued="true" />
|
||||
<field name="last_modified" type="date" indexed="true" stored="true" />
|
||||
<field name="links" type="string" indexed="true" stored="true"
|
||||
multiValued="true" />
|
||||
|
||||
|
||||
<!--
|
||||
catchall field, containing all other searchable text fields
|
||||
(implemented via copyField further on in this schema)
|
||||
-->
|
||||
<field name="text" type="text" indexed="true" stored="false"
|
||||
multiValued="true" />
|
||||
|
||||
<!--
|
||||
catchall text field that indexes tokens both normally and in
|
||||
reverse for efficient leading wildcard queries.
|
||||
-->
|
||||
<field name="text_rev" type="text_rev" indexed="true" stored="false"
|
||||
multiValued="true" />
|
||||
|
||||
<!--
|
||||
non-tokenized version of manufacturer to make it easier to sort or
|
||||
group results by manufacturer. copied from "manu" via copyField
|
||||
-->
|
||||
<field name="manu_exact" type="string" indexed="true" stored="false" />
|
||||
|
||||
<field name="payloads" type="payloads" indexed="true" stored="true" />
|
||||
|
||||
<!--
|
||||
Uncommenting the following will create a "timestamp" field using a
|
||||
default value of "NOW" to indicate when each document was indexed.
|
||||
-->
|
||||
<!--
|
||||
<field name="timestamp" type="date" indexed="true" stored="true"
|
||||
default="NOW" multiValued="false"/>
|
||||
-->
|
||||
|
||||
<field name="language" type="string" indexed="true" stored="true" required="false"/>
|
||||
<field name="concept" type="string" indexed="true" stored="true" multiValued="true" required="false"/>
|
||||
<field name="keyword" type="string" indexed="true" stored="true" multiValued="true" required="false"/>
|
||||
<field name="suggested_category" type="string" indexed="true" stored="true" multiValued="false" required="false"/>
|
||||
<field name="sentence" type="text" indexed="true" stored="true" multiValued="true" required="false" />
|
||||
<dynamicField name="entity*" type="text" indexed="true" stored="true" multiValued="true"/>
|
||||
|
||||
<!--
|
||||
Dynamic field definitions. If a field name is not found,
|
||||
dynamicFields will be used if the name matches any of the
|
||||
patterns. RESTRICTION: the glob-like pattern in the name attribute
|
||||
must have a "*" only at the start or the end. EXAMPLE: name="*_i"
|
||||
will match any field ending in _i (like myid_i, z_i) Longer
|
||||
patterns will be matched first. if equal size patterns both match,
|
||||
the first appearing in the schema will be used. <dynamicField
|
||||
name="*_i" type="int" indexed="true" stored="true"/> <dynamicField
|
||||
name="*_s" type="string" indexed="true" stored="true"/>
|
||||
<dynamicField name="*_l" type="long" indexed="true"
|
||||
stored="true"/> <dynamicField name="*_t" type="text"
|
||||
indexed="true" stored="true"/> <dynamicField name="*_b"
|
||||
type="boolean" indexed="true" stored="true"/> <dynamicField
|
||||
name="*_f" type="float" indexed="true" stored="true"/>
|
||||
<dynamicField name="*_d" type="double" indexed="true"
|
||||
stored="true"/> <dynamicField name="*_dt" type="date"
|
||||
indexed="true" stored="true"/> <dynamicField name="*_ti"
|
||||
type="tint" indexed="true" stored="true"/> <dynamicField
|
||||
name="*_tl" type="tlong" indexed="true" stored="true"/>
|
||||
<dynamicField name="*_tf" type="tfloat" indexed="true"
|
||||
stored="true"/> <dynamicField name="*_td" type="tdouble"
|
||||
indexed="true" stored="true"/> <dynamicField name="*_tdt"
|
||||
type="tdate" indexed="true" stored="true"/> <dynamicField
|
||||
name="*_pi" type="pint" indexed="true" stored="true"/>
|
||||
|
||||
<dynamicField name="ignored_*" type="ignored" multiValued="true"/>
|
||||
<dynamicField name="attr_*" type="textgen" indexed="true"
|
||||
stored="true" multiValued="true"/> <dynamicField name="random_*"
|
||||
type="random" />
|
||||
-->
|
||||
<!--
|
||||
uncomment the following to ignore any fields that don't already
|
||||
match an existing field name or dynamic field, rather than
|
||||
reporting them as an error. alternately, change the type="ignored"
|
||||
to some other type e.g. "text" if you want unknown fields indexed
|
||||
and/or stored by default
|
||||
-->
|
||||
<!--dynamicField name="*" type="ignored" multiValued="true" /-->
|
||||
|
||||
</fields>
|
||||
|
||||
<!--
|
||||
Field to use to determine and enforce document uniqueness. Unless
|
||||
this field is marked with required="false", it will be a required
|
||||
field
|
||||
-->
|
||||
<uniqueKey>id</uniqueKey>
|
||||
|
||||
<!--
|
||||
field for the QueryParser to use when an explicit fieldname is
|
||||
absent
|
||||
-->
|
||||
<defaultSearchField>text</defaultSearchField>
|
||||
|
||||
<!-- SolrQueryParser configuration: defaultOperator="AND|OR" -->
|
||||
<solrQueryParser defaultOperator="OR" />
|
||||
|
||||
<!--
|
||||
copyField commands copy one field to another at the time a document
|
||||
is added to the index. It's used either to index the same field
|
||||
differently, or to add multiple fields to the same field for
|
||||
easier/faster searching.
|
||||
-->
|
||||
|
||||
<copyField source="cat" dest="text" />
|
||||
<copyField source="name" dest="text" />
|
||||
<copyField source="manu" dest="text" />
|
||||
<copyField source="features" dest="text" />
|
||||
<copyField source="includes" dest="text" />
|
||||
<copyField source="manu" dest="manu_exact" />
|
||||
|
||||
|
||||
<!--copyField source="Titolo" dest="text"/-->
|
||||
|
||||
<!--
|
||||
Above, multiple source fields are copied to the [text] field.
|
||||
Another way to map multiple source fields to the same destination
|
||||
field is to use the dynamic field syntax. copyField also supports a
|
||||
maxChars to copy setting.
|
||||
-->
|
||||
|
||||
<!-- <copyField source="*_t" dest="text" maxChars="3000"/> -->
|
||||
|
||||
<!--
|
||||
copy name to alphaNameSort, a field designed for sorting by name
|
||||
-->
|
||||
<!-- <copyField source="name" dest="alphaNameSort"/> -->
|
||||
|
||||
|
||||
<!--
|
||||
Similarity is the scoring routine for each document vs. a query. A
|
||||
custom similarity may be specified here, but the default is fine for
|
||||
most applications.
|
||||
-->
|
||||
<!--
|
||||
<similarity class="org.apache.lucene.search.DefaultSimilarity"/>
|
||||
-->
|
||||
<!--
|
||||
... OR ... Specify a SimilarityFactory class name implementation
|
||||
allowing parameters to be used.
|
||||
-->
|
||||
<!--
|
||||
<similarity class="com.example.solr.CustomSimilarityFactory"> <str
|
||||
name="paramkey">param value</str> </similarity>
|
||||
-->
|
||||
|
||||
|
||||
</schema>
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,2 @@
|
|||
pizza
|
||||
history
|
|
@ -0,0 +1,58 @@
|
|||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
#-----------------------------------------------------------------------
|
||||
# a couple of test stopwords to test that the words are really being
|
||||
# configured from this file:
|
||||
stopworda
|
||||
stopwordb
|
||||
|
||||
#Standard english stop words taken from Lucene's StopAnalyzer
|
||||
a
|
||||
an
|
||||
and
|
||||
are
|
||||
as
|
||||
at
|
||||
be
|
||||
but
|
||||
by
|
||||
for
|
||||
if
|
||||
in
|
||||
into
|
||||
is
|
||||
it
|
||||
no
|
||||
not
|
||||
of
|
||||
on
|
||||
or
|
||||
s
|
||||
such
|
||||
t
|
||||
that
|
||||
the
|
||||
their
|
||||
then
|
||||
there
|
||||
these
|
||||
they
|
||||
this
|
||||
to
|
||||
was
|
||||
will
|
||||
with
|
||||
|
|
@ -0,0 +1,31 @@
|
|||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
#-----------------------------------------------------------------------
|
||||
#some test synonym mappings unlikely to appear in real input text
|
||||
aaa => aaaa
|
||||
bbb => bbbb1 bbbb2
|
||||
ccc => cccc1,cccc2
|
||||
a\=>a => b\=>b
|
||||
a\,a => b\,b
|
||||
fooaaa,baraaa,bazaaa
|
||||
|
||||
# Some synonym groups specific to this example
|
||||
GB,gib,gigabyte,gigabytes
|
||||
MB,mib,megabyte,megabytes
|
||||
Television, Televisions, TV, TVs
|
||||
#notice we use "gib" instead of "GiB" so any WordDelimiterFilter coming
|
||||
#after us won't split it into two words.
|
||||
|
||||
# Synonym mappings can be used for spelling correction too
|
||||
pixima => pixma
|
||||
|
Loading…
Reference in New Issue