mirror of
https://github.com/apache/lucene.git
synced 2025-02-14 14:05:41 +00:00
127 lines
5.7 KiB
Plaintext
127 lines
5.7 KiB
Plaintext
= UIMA Integration
|
|
:page-shortname: uima-integration
|
|
:page-permalink: uima-integration.html
|
|
// Licensed to the Apache Software Foundation (ASF) under one
|
|
// or more contributor license agreements. See the NOTICE file
|
|
// distributed with this work for additional information
|
|
// regarding copyright ownership. The ASF licenses this file
|
|
// to you under the Apache License, Version 2.0 (the
|
|
// "License"); you may not use this file except in compliance
|
|
// with the License. You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing,
|
|
// software distributed under the License is distributed on an
|
|
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
// KIND, either express or implied. See the License for the
|
|
// specific language governing permissions and limitations
|
|
// under the License.
|
|
|
|
You can integrate the Apache Unstructured Information Management Architecture (https://uima.apache.org/[UIMA]) with Solr. UIMA lets you define custom pipelines of Analysis Engines that incrementally add metadata to your documents as annotations.
|
|
|
|
[[UIMAIntegration-ConfiguringUIMA]]
|
|
== Configuring UIMA
|
|
|
|
The SolrUIMA UpdateRequestProcessor is a custom update request processor that takes documents being indexed, sends them to a UIMA pipeline, and then returns the documents enriched with the specified metadata. To configure UIMA for Solr, follow these steps:
|
|
|
|
1. Copy `solr-uima-VERSION.jar` (under `/solr-VERSION/dist/`) and its libraries (under `contrib/uima/lib`) to a Solr libraries directory, or set `<lib/>` tags in `solrconfig.xml` appropriately to point to those jar files:
|
|
+
|
|
[source,xml]
|
|
----
|
|
<lib dir="../../contrib/uima/lib" />
|
|
<lib dir="../../dist/" regex="solr-uima-\d.*\.jar" />
|
|
----
|
|
2. Modify `schema.xml`, adding your desired metadata fields and specifying proper values for the `type`, `indexed`, `stored`, and `multiValued` options. For example:
|
|
+
|
|
[source,xml]
|
|
----
|
|
<field name="language" type="string" indexed="true" stored="true" required="false"/>
|
|
<field name="concept" type="string" indexed="true" stored="true" multiValued="true" required="false"/>
|
|
<field name="sentence" type="text" indexed="true" stored="true" multiValued="true" required="false" />
|
|
----
|
|
3. Add the following snippet to `solrconfig.xml`:
|
|
+
|
|
[source,xml]
|
|
----
|
|
<updateRequestProcessorChain name="uima">
|
|
<processor class="org.apache.solr.uima.processor.UIMAUpdateRequestProcessorFactory">
|
|
<lst name="uimaConfig">
|
|
<lst name="runtimeParameters">
|
|
<str name="keyword_apikey">VALID_ALCHEMYAPI_KEY</str>
|
|
<str name="concept_apikey">VALID_ALCHEMYAPI_KEY</str>
|
|
<str name="lang_apikey">VALID_ALCHEMYAPI_KEY</str>
|
|
<str name="cat_apikey">VALID_ALCHEMYAPI_KEY</str>
|
|
<str name="entities_apikey">VALID_ALCHEMYAPI_KEY</str>
|
|
<str name="oc_licenseID">VALID_OPENCALAIS_KEY</str>
|
|
</lst>
|
|
<str name="analysisEngine">/org/apache/uima/desc/OverridingParamsExtServicesAE.xml</str>
|
|
<!-- Set to true if you want to continue indexing even if text processing fails.
|
|
Default is false. That is, Solr throws RuntimeException and
|
|
never indexes any of the documents in your session. -->
|
|
<bool name="ignoreErrors">true</bool>
|
|
<!-- This is optional. It is used for logging when text processing fails.
|
|
If logField is not specified, uniqueKey will be used as logField.
|
|
<str name="logField">id</str>
|
|
-->
|
|
<lst name="analyzeFields">
|
|
<bool name="merge">false</bool>
|
|
<arr name="fields">
|
|
<str>text</str>
|
|
</arr>
|
|
</lst>
|
|
<lst name="fieldMappings">
|
|
<lst name="type">
|
|
<str name="name">org.apache.uima.alchemy.ts.concept.ConceptFS</str>
|
|
<lst name="mapping">
|
|
<str name="feature">text</str>
|
|
<str name="field">concept</str>
|
|
</lst>
|
|
</lst>
|
|
<lst name="type">
|
|
<str name="name">org.apache.uima.alchemy.ts.language.LanguageFS</str>
|
|
<lst name="mapping">
|
|
<str name="feature">language</str>
|
|
<str name="field">language</str>
|
|
</lst>
|
|
</lst>
|
|
<lst name="type">
|
|
<str name="name">org.apache.uima.SentenceAnnotation</str>
|
|
<lst name="mapping">
|
|
<str name="feature">coveredText</str>
|
|
<str name="field">sentence</str>
|
|
</lst>
|
|
</lst>
|
|
</lst>
|
|
</lst>
|
|
</processor>
|
|
<processor class="solr.LogUpdateProcessorFactory" />
|
|
<processor class="solr.RunUpdateProcessorFactory" />
|
|
</updateRequestProcessorChain>
|
|
----
|
|
+
|
|
[IMPORTANT]
|
|
====
|
|
* `VALID_ALCHEMYAPI_KEY` is your AlchemyAPI Access Key. You need to register for an AlchemyAPI Access Key to use AlchemyAPI services: http://www.alchemyapi.com/api/register.html.
|
|
* `VALID_OPENCALAIS_KEY` is your Calais Service Key. You need to register for a Calais Service Key to use the Calais services: http://www.opencalais.com/apikey.
|
|
* `analysisEngine` must specify the path to an Analysis Engine (AE) descriptor that is available at that path in the classpath.
|
|
* `analyzeFields` must contain the input fields that need to be analyzed by UIMA. If `merge=true` then their content will be merged and analyzed only once.
|
|
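* `fieldMappings` describes which features of which UIMA types should be stored in which Solr fields: each `type` entry names a UIMA type, and its `mapping` pairs a `feature` of that type with the target `field`.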
|
|
====
|
|
|
|
4. In your `solrconfig.xml` replace the existing default UpdateRequestHandler or create a new UpdateRequestHandler:
|
|
+
|
|
[source,xml]
|
|
----
|
|
<requestHandler name="/update" class="solr.XmlUpdateRequestHandler">
|
|
<lst name="defaults">
|
|
<str name="update.chain">uima</str>
|
|
</lst>
|
|
</requestHandler>
|
|
----
|
|
|
|
Once you are done with the configuration, your documents will be automatically enriched with the specified fields when you index them.
|
|
|
|
For more information about Solr UIMA integration, see https://wiki.apache.org/solr/SolrUIMA.
|
|
|