From 6c05d94c93c2f657637f4431fa8cc058a31fd7a2 Mon Sep 17 00:00:00 2001 From: Robert Muir Date: Mon, 24 Jan 2011 01:58:00 +0000 Subject: [PATCH] SOLR-2129: Provide a Solr module for dynamic metadata extraction/indexing with Apache UIMA git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1062604 13f79535-47bb-0310-9956-ffa450edef68 --- dev-tools/eclipse/dot.classpath | 10 + solr/CHANGES.txt | 3 + solr/build.xml | 21 +- solr/contrib/uima/CHANGES.txt | 17 + solr/contrib/uima/README.txt | 60 + solr/contrib/uima/build.xml | 189 +++ .../contrib/uima/lib/commons-digester-2.0.jar | 2 + solr/contrib/uima/lib/commons-lang-2.4.jar | 2 + solr/contrib/uima/lib/uima-an-alchemy.jar | 2 + solr/contrib/uima/lib/uima-an-calais.jar | 2 + solr/contrib/uima/lib/uima-an-tagger.jar | 2 + solr/contrib/uima/lib/uima-an-wst.jar | 2 + solr/contrib/uima/lib/uima-core.jar | 2 + solr/contrib/uima/solr-uima-pom.xml.template | 115 ++ .../uima/processor/SolrUIMAConfiguration.java | 69 + .../SolrUIMAConfigurationReader.java | 125 ++ .../solr/uima/processor/UIMAToSolrMapper.java | 83 ++ .../processor/UIMAUpdateRequestProcessor.java | 126 ++ .../UIMAUpdateRequestProcessorFactory.java | 37 + .../solr/uima/processor/ae/AEProvider.java | 32 + .../uima/processor/ae/AEProviderFactory.java | 53 + .../ae/OverridingParamsAEProvider.java | 89 ++ .../apache/uima/desc/AggregateSentenceAE.xml | 41 + .../org/apache/uima/desc/ExtServicesAE.xml | 57 + .../org/apache/uima/desc/HmmTagger.xml | 121 ++ .../apache/uima/desc/OpenCalaisAnnotator.xml | 194 +++ .../desc/OverridingParamsExtServicesAE.xml | 147 +++ .../desc/TextCategorizationAEDescriptor.xml | 102 ++ .../desc/TextConceptTaggingAEDescriptor.xml | 196 +++ .../TextKeywordExtractionAEDescriptor.xml | 107 ++ .../TextLanguageDetectionAEDescriptor.xml | 107 ++ ...TextRankedEntityExtractionAEDescriptor.xml | 403 ++++++ .../apache/uima/desc/WhitespaceTokenizer.xml | 115 ++ .../desc/baseAlchemyTypeSystemDescriptor.xml | 41 + 
.../solr/conf/aggregate-uima-config.xml | 33 + .../main/resources/solr/conf/uima-fields.xml | 9 + .../UIMAUpdateRequestProcessorTest.java | 137 ++ .../resources/solr-uima/conf/protwords.txt | 21 + .../test/resources/solr-uima/conf/schema.xml | 679 ++++++++++ .../resources/solr-uima/conf/solrconfig.xml | 1108 +++++++++++++++++ .../resources/solr-uima/conf/spellings.txt | 2 + .../resources/solr-uima/conf/stopwords.txt | 58 + .../resources/solr-uima/conf/synonyms.txt | 31 + 43 files changed, 4751 insertions(+), 1 deletion(-) create mode 100644 solr/contrib/uima/CHANGES.txt create mode 100644 solr/contrib/uima/README.txt create mode 100644 solr/contrib/uima/build.xml create mode 100644 solr/contrib/uima/lib/commons-digester-2.0.jar create mode 100644 solr/contrib/uima/lib/commons-lang-2.4.jar create mode 100644 solr/contrib/uima/lib/uima-an-alchemy.jar create mode 100644 solr/contrib/uima/lib/uima-an-calais.jar create mode 100644 solr/contrib/uima/lib/uima-an-tagger.jar create mode 100644 solr/contrib/uima/lib/uima-an-wst.jar create mode 100644 solr/contrib/uima/lib/uima-core.jar create mode 100644 solr/contrib/uima/solr-uima-pom.xml.template create mode 100644 solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/SolrUIMAConfiguration.java create mode 100644 solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/SolrUIMAConfigurationReader.java create mode 100644 solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/UIMAToSolrMapper.java create mode 100644 solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/UIMAUpdateRequestProcessor.java create mode 100644 solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/UIMAUpdateRequestProcessorFactory.java create mode 100644 solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/ae/AEProvider.java create mode 100644 solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/ae/AEProviderFactory.java create mode 100644 
solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/ae/OverridingParamsAEProvider.java create mode 100644 solr/contrib/uima/src/main/resources/org/apache/uima/desc/AggregateSentenceAE.xml create mode 100644 solr/contrib/uima/src/main/resources/org/apache/uima/desc/ExtServicesAE.xml create mode 100644 solr/contrib/uima/src/main/resources/org/apache/uima/desc/HmmTagger.xml create mode 100644 solr/contrib/uima/src/main/resources/org/apache/uima/desc/OpenCalaisAnnotator.xml create mode 100644 solr/contrib/uima/src/main/resources/org/apache/uima/desc/OverridingParamsExtServicesAE.xml create mode 100644 solr/contrib/uima/src/main/resources/org/apache/uima/desc/TextCategorizationAEDescriptor.xml create mode 100644 solr/contrib/uima/src/main/resources/org/apache/uima/desc/TextConceptTaggingAEDescriptor.xml create mode 100644 solr/contrib/uima/src/main/resources/org/apache/uima/desc/TextKeywordExtractionAEDescriptor.xml create mode 100644 solr/contrib/uima/src/main/resources/org/apache/uima/desc/TextLanguageDetectionAEDescriptor.xml create mode 100644 solr/contrib/uima/src/main/resources/org/apache/uima/desc/TextRankedEntityExtractionAEDescriptor.xml create mode 100644 solr/contrib/uima/src/main/resources/org/apache/uima/desc/WhitespaceTokenizer.xml create mode 100644 solr/contrib/uima/src/main/resources/org/apache/uima/desc/baseAlchemyTypeSystemDescriptor.xml create mode 100644 solr/contrib/uima/src/main/resources/solr/conf/aggregate-uima-config.xml create mode 100644 solr/contrib/uima/src/main/resources/solr/conf/uima-fields.xml create mode 100644 solr/contrib/uima/src/test/java/org/apache/solr/uima/processor/UIMAUpdateRequestProcessorTest.java create mode 100644 solr/contrib/uima/src/test/resources/solr-uima/conf/protwords.txt create mode 100644 solr/contrib/uima/src/test/resources/solr-uima/conf/schema.xml create mode 100644 solr/contrib/uima/src/test/resources/solr-uima/conf/solrconfig.xml create mode 100644 
solr/contrib/uima/src/test/resources/solr-uima/conf/spellings.txt create mode 100644 solr/contrib/uima/src/test/resources/solr-uima/conf/stopwords.txt create mode 100644 solr/contrib/uima/src/test/resources/solr-uima/conf/synonyms.txt diff --git a/dev-tools/eclipse/dot.classpath b/dev-tools/eclipse/dot.classpath index 244d5b440fd..b3b5c76a5cc 100644 --- a/dev-tools/eclipse/dot.classpath +++ b/dev-tools/eclipse/dot.classpath @@ -73,6 +73,10 @@ + + + + @@ -151,6 +155,12 @@ + + + + + + diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt index 9b1e42a5ec4..4aa8fac6833 100644 --- a/solr/CHANGES.txt +++ b/solr/CHANGES.txt @@ -411,6 +411,9 @@ New Features * SOLR-2188: Added "maxTokenLength" argument to the factories for ClassicTokenizer, StandardTokenizer, and UAX29URLEmailTokenizer. (Steven Rowe) +* SOLR-2129: Added a Solr module for dynamic metadata extraction/indexing with Apache UIMA. + See contrib/uima/README.txt for more information. (Tommaso Teofili via rmuir) + Optimizations ---------------------- diff --git a/solr/build.xml b/solr/build.xml index 4b0f0352a1e..cf6c7f655e9 100644 --- a/solr/build.xml +++ b/solr/build.xml @@ -218,6 +218,7 @@ + @@ -225,6 +226,7 @@ + @@ -514,6 +516,7 @@ + @@ -617,6 +620,10 @@ basedir="contrib/clustering/src" /> + + + + @@ -731,7 +740,7 @@ + excludes="lib/README.committers.txt **/data/ **/logs/* **/classes/ **/*.sh **/bin/ src/scripts/ src/site/build/ **/target/ client/ruby/flare/ client/python contrib/**/build/ **/*.iml **/*.ipr **/*.iws contrib/clustering/example/lib/** contrib/clustering/lib/downloads/** contrib/analysis-extras/lib/** contrib/uima/lib/**" /> @@ -903,6 +912,14 @@ + + + + + + + @@ -952,6 +969,8 @@ + + diff --git a/solr/contrib/uima/CHANGES.txt b/solr/contrib/uima/CHANGES.txt new file mode 100644 index 00000000000..c0ca1a0960d --- /dev/null +++ b/solr/contrib/uima/CHANGES.txt @@ -0,0 +1,17 @@ +Apache Solr UIMA Metadata Extraction Library + Release Notes + +This file describes changes to the Solr UIMA (contrib/uima) 
module. See SOLR-2129 for details. + +Introduction +------------ +This module is intended to be used while indexing documents. +Its purpose is to provide the Solr index with additional fields that are generated automatically on the fly. +Such fields could be language, concepts, keywords, sentences, named entities, etc. + + UIMA Dependency + --------------- +uima-core, OpenCalaisAnnotator, WhitespaceTokenizer, HMMTagger, AlchemyAPIAnnotator +Current Version: 2.3.1-SNAPSHOT rev. 999276 + +$Id$ diff --git a/solr/contrib/uima/README.txt b/solr/contrib/uima/README.txt new file mode 100644 index 00000000000..b2b97293dac --- /dev/null +++ b/solr/contrib/uima/README.txt @@ -0,0 +1,60 @@ +Getting Started +--------------- +To start using Solr UIMA Metadata Extraction Library you should go through the following configuration steps: + +1. copy generated solr-uima jar and its libs (under contrib/uima/lib) inside a Solr libraries directory. + +2. modify your schema.xml, adding the fields you want to hold metadata and specifying proper values for the type, indexed, stored and multiValued options: + +3. for example you could specify the following + + + + +4. modify your solrconfig.xml adding the following snippet: + + + VALID_ALCHEMYAPI_KEY + VALID_ALCHEMYAPI_KEY + VALID_ALCHEMYAPI_KEY + VALID_ALCHEMYAPI_KEY + VALID_ALCHEMYAPI_KEY + VALID_OPENCALAIS_KEY + + /org/apache/uima/desc/OverridingParamsExtServicesAE.xml + text + + + + + + + + + + + + + +5. the analysisEngine tag must contain an AE descriptor inside the specified path in the classpath + +6. the analyzeFields tag must contain the input fields that need to be analyzed by UIMA, + if merge=true then their content will be merged and analyzed only once + +7. field mapping describes which features of which types should go in a field + +8. define in your solrconfig.xml an UpdateRequestProcessorChain as follows: + + + + + + +9. 
in your solrconfig.xml replace the existing default ( + + uima + + + +Once you're done with the configuration you can index documents which will be automatically enriched with the specified fields diff --git a/solr/contrib/uima/build.xml b/solr/contrib/uima/build.xml new file mode 100644 index 00000000000..34b190b7727 --- /dev/null +++ b/solr/contrib/uima/build.xml @@ -0,0 +1,189 @@ + + + + + + + + + + + + Solr Integration with UIMA for extracting metadata from arbitrary (text) fields and enrich document with features extracted from UIMA types (language, sentences, concepts, named entities, etc.) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Tests failed! + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/solr/contrib/uima/lib/commons-digester-2.0.jar b/solr/contrib/uima/lib/commons-digester-2.0.jar new file mode 100644 index 00000000000..bd9b6332cd4 --- /dev/null +++ b/solr/contrib/uima/lib/commons-digester-2.0.jar @@ -0,0 +1,2 @@ +AnyObjectId[9c8bd13a2002a9ff5b35b873b9f111d5281ad201] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/solr/contrib/uima/lib/commons-lang-2.4.jar b/solr/contrib/uima/lib/commons-lang-2.4.jar new file mode 100644 index 00000000000..2ef0c625eb9 --- /dev/null +++ b/solr/contrib/uima/lib/commons-lang-2.4.jar @@ -0,0 +1,2 @@ +AnyObjectId[532939ecab6b77ccb77af3635c55ff9752b70ab7] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/solr/contrib/uima/lib/uima-an-alchemy.jar b/solr/contrib/uima/lib/uima-an-alchemy.jar new file mode 100644 index 00000000000..5ef77543e3e --- /dev/null +++ b/solr/contrib/uima/lib/uima-an-alchemy.jar @@ -0,0 +1,2 @@ +AnyObjectId[33165678da937e03cb069449b40f1cf690beda0a] was removed in git history. 
+Apache SVN contains full history. \ No newline at end of file diff --git a/solr/contrib/uima/lib/uima-an-calais.jar b/solr/contrib/uima/lib/uima-an-calais.jar new file mode 100644 index 00000000000..bebd55ac233 --- /dev/null +++ b/solr/contrib/uima/lib/uima-an-calais.jar @@ -0,0 +1,2 @@ +AnyObjectId[5dfc32bce5e444a9bb3387d664485f7bfdc438ad] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/solr/contrib/uima/lib/uima-an-tagger.jar b/solr/contrib/uima/lib/uima-an-tagger.jar new file mode 100644 index 00000000000..6e879bd9553 --- /dev/null +++ b/solr/contrib/uima/lib/uima-an-tagger.jar @@ -0,0 +1,2 @@ +AnyObjectId[bf90c19d2c1f77e300b94363385841ec1225b4b9] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/solr/contrib/uima/lib/uima-an-wst.jar b/solr/contrib/uima/lib/uima-an-wst.jar new file mode 100644 index 00000000000..d0ce4c499c6 --- /dev/null +++ b/solr/contrib/uima/lib/uima-an-wst.jar @@ -0,0 +1,2 @@ +AnyObjectId[9518da64cdf5d378273ab40a06823a7768f18ece] was removed in git history. +Apache SVN contains full history. \ No newline at end of file diff --git a/solr/contrib/uima/lib/uima-core.jar b/solr/contrib/uima/lib/uima-core.jar new file mode 100644 index 00000000000..37d22bb6b65 --- /dev/null +++ b/solr/contrib/uima/lib/uima-core.jar @@ -0,0 +1,2 @@ +AnyObjectId[72991424bdfe4776f66feab7ff4e8564f12d2659] was removed in git history. +Apache SVN contains full history. 
\ No newline at end of file diff --git a/solr/contrib/uima/solr-uima-pom.xml.template b/solr/contrib/uima/solr-uima-pom.xml.template new file mode 100644 index 00000000000..cff8375af72 --- /dev/null +++ b/solr/contrib/uima/solr-uima-pom.xml.template @@ -0,0 +1,115 @@ + + + 4.0.0 + org.apache.solr + solr-uima + 0.0.2-SNAPSHOT + Solr - UIMA integration + + 2.3.1-SNAPSHOT + + + + org.apache.solr + solr-core + 1.4.1 + + + org.apache.uima + uimaj-core + ${uimaVersion} + + + org.apache.uima + alchemy-annotator + ${uimaVersion} + + + org.apache.uima + OpenCalaisAnnotator + ${uimaVersion} + + + junit + junit + 4.7 + jar + test + + + org.slf4j + slf4j-simple + 1.5.5 + + + org.apache.uima + WhitespaceTokenizer + ${uimaVersion} + + + org.apache.uima + Tagger + ${uimaVersion} + + + + + + + com.googlecode.maven-gcu-plugin + maven-gcu-plugin + 1.0 + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 2.3.1 + + 1.5 + 1.5 + + + + com.googlecode.maven-gcu-plugin + maven-gcu-plugin + 1.0 + + googlecode + true + ${project.artifactId} + + + ${project.build.directory}/${project.artifactId}-${project.version}.${project.packaging} + ${project.name} sources bundle ${project.version} + + + + + + + + + + + \ No newline at end of file diff --git a/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/SolrUIMAConfiguration.java b/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/SolrUIMAConfiguration.java new file mode 100644 index 00000000000..2ba2d7f4fc5 --- /dev/null +++ b/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/SolrUIMAConfiguration.java @@ -0,0 +1,69 @@ +package org.apache.solr.uima.processor; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.Map; + +/** + * Configuration holding all the configurable parameters for calling UIMA inside Solr + * + * @version $Id$ + */ +public class SolrUIMAConfiguration { + + private String[] fieldsToAnalyze; + + private boolean fieldsMerging; + + private Map> typesFeaturesFieldsMapping; + + private String aePath; + + private Map runtimeParameters; + + public SolrUIMAConfiguration(String aePath, String[] fieldsToAnalyze, boolean fieldsMerging, + Map> typesFeaturesFieldsMapping, + Map runtimeParameters) { + this.aePath = aePath; + this.fieldsToAnalyze = fieldsToAnalyze; + this.fieldsMerging = fieldsMerging; + this.runtimeParameters = runtimeParameters; + this.typesFeaturesFieldsMapping = typesFeaturesFieldsMapping; + } + + public String[] getFieldsToAnalyze() { + return fieldsToAnalyze; + } + + public boolean isFieldsMerging() { + return fieldsMerging; + } + + public Map> getTypesFeaturesFieldsMapping() { + return typesFeaturesFieldsMapping; + } + + public String getAePath() { + return aePath; + } + + public Map getRuntimeParameters() { + return runtimeParameters; + } + +} diff --git a/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/SolrUIMAConfigurationReader.java b/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/SolrUIMAConfigurationReader.java new file mode 100644 index 00000000000..4ffeb83fa56 --- /dev/null +++ 
b/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/SolrUIMAConfigurationReader.java @@ -0,0 +1,125 @@ +package org.apache.solr.uima.processor; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.HashMap; +import java.util.Map; + +import org.apache.solr.core.SolrConfig; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; + +/** + * Read configuration for Solr-UIMA integration + * + * @version $Id$ + * + */ +public class SolrUIMAConfigurationReader { + + private static final String AE_RUNTIME_PARAMETERS_NODE_PATH = "/config/uimaConfig/runtimeParameters"; + + private static final String FIELD_MAPPING_NODE_PATH = "/config/uimaConfig/fieldMapping"; + + private static final String ANALYZE_FIELDS_NODE_PATH = "/config/uimaConfig/analyzeFields"; + + private static final String ANALYSIS_ENGINE_NODE_PATH = "/config/uimaConfig/analysisEngine"; + + private SolrConfig solrConfig; + + public SolrUIMAConfigurationReader(SolrConfig solrConfig) { + this.solrConfig = solrConfig; + } + + public SolrUIMAConfiguration readSolrUIMAConfiguration() { + return new SolrUIMAConfiguration(readAEPath(), readFieldsToAnalyze(), readFieldsMerging(), + readTypesFeaturesFieldsMapping(), 
readAEOverridingParameters()); + } + + private String readAEPath() { + return solrConfig.getNode(ANALYSIS_ENGINE_NODE_PATH, true).getTextContent(); + } + + private String[] readFieldsToAnalyze() { + Node analyzeFieldsNode = solrConfig.getNode(ANALYZE_FIELDS_NODE_PATH, true); + return analyzeFieldsNode.getTextContent().split(","); + } + + private boolean readFieldsMerging() { + Node analyzeFieldsNode = solrConfig.getNode(ANALYZE_FIELDS_NODE_PATH, true); + Node mergeNode = analyzeFieldsNode.getAttributes().getNamedItem("merge"); + return Boolean.valueOf(mergeNode.getNodeValue()); + } + + private Map> readTypesFeaturesFieldsMapping() { + Map> map = new HashMap>(); + + Node fieldMappingNode = solrConfig.getNode(FIELD_MAPPING_NODE_PATH, true); + /* iterate over UIMA types */ + if (fieldMappingNode.hasChildNodes()) { + NodeList typeNodes = fieldMappingNode.getChildNodes(); + for (int i = 0; i < typeNodes.getLength(); i++) { + /* node */ + Node typeNode = typeNodes.item(i); + if (typeNode.getNodeType() != Node.TEXT_NODE) { + Node typeNameAttribute = typeNode.getAttributes().getNamedItem("name"); + /* get a UIMA typename */ + String typeName = typeNameAttribute.getNodeValue(); + /* create entry for UIMA type */ + map.put(typeName, new HashMap()); + if (typeNode.hasChildNodes()) { + /* iterate over features */ + NodeList featuresNodeList = typeNode.getChildNodes(); + for (int j = 0; j < featuresNodeList.getLength(); j++) { + Node mappingNode = featuresNodeList.item(j); + if (mappingNode.getNodeType() != Node.TEXT_NODE) { + /* get field name */ + Node fieldNameNode = mappingNode.getAttributes().getNamedItem("field"); + String mappedFieldName = fieldNameNode.getNodeValue(); + /* get feature name */ + Node featureNameNode = mappingNode.getAttributes().getNamedItem("feature"); + String featureName = featureNameNode.getNodeValue(); + /* map the feature to the field for the specified type */ + map.get(typeName).put(featureName, mappedFieldName); + } + } + } + } + } + } + return 
map; + } + + private Map readAEOverridingParameters() { + Map runtimeParameters = new HashMap(); + Node uimaConfigNode = solrConfig.getNode(AE_RUNTIME_PARAMETERS_NODE_PATH, true); + + if (uimaConfigNode.hasChildNodes()) { + NodeList overridingNodes = uimaConfigNode.getChildNodes(); + for (int i = 0; i < overridingNodes.getLength(); i++) { + Node overridingNode = overridingNodes.item(i); + if (overridingNode.getNodeType() != Node.TEXT_NODE) { + runtimeParameters.put(overridingNode.getNodeName(), overridingNode.getTextContent()); + } + } + } + + return runtimeParameters; + } + +} diff --git a/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/UIMAToSolrMapper.java b/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/UIMAToSolrMapper.java new file mode 100644 index 00000000000..29e7b5c2926 --- /dev/null +++ b/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/UIMAToSolrMapper.java @@ -0,0 +1,83 @@ +package org.apache.solr.uima.processor; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.util.Map; + +import org.apache.solr.common.SolrInputDocument; +import org.apache.uima.cas.FSIterator; +import org.apache.uima.cas.FeatureStructure; +import org.apache.uima.cas.Type; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.tcas.Annotation; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Map UIMA types and features over fields of a Solr document + * + * @version $Id$ + */ +public class UIMAToSolrMapper { + + private final Logger log = LoggerFactory.getLogger(UIMAToSolrMapper.class); + + private SolrInputDocument document; + + private JCas cas; + + public UIMAToSolrMapper(SolrInputDocument document, JCas cas) { + this.document = document; + this.cas = cas; + } + + /** + * map features of a certain UIMA type to corresponding Solr fields based on the mapping + * + * @param typeName + * name of UIMA type to map + * @param featureFieldsmapping + */ + public void map(String typeName, Map featureFieldsmapping) { + try { + FeatureStructure fsMock = (FeatureStructure) Class.forName(typeName).getConstructor( + JCas.class).newInstance(cas); + Type type = fsMock.getType(); + for (FSIterator iterator = cas.getFSIndexRepository().getAllIndexedFS(type); iterator + .hasNext();) { + FeatureStructure fs = iterator.next(); + for (String featureName : featureFieldsmapping.keySet()) { + String fieldName = featureFieldsmapping.get(featureName); + log.info(new StringBuffer("mapping ").append(typeName).append("@").append(featureName) + .append(" to ").append(fieldName).toString()); + String featureValue = null; + if (fs instanceof Annotation && "coveredText".equals(featureName)) { + featureValue = ((Annotation) fs).getCoveredText(); + } else { + featureValue = fs.getFeatureValueAsString(type.getFeatureByBaseName(featureName)); + } + log.info(new StringBuffer("writing ").append(featureValue).append(" in ").append( + fieldName).toString()); + document.addField(fieldName, featureValue, 1.0f); + } + } + } catch (Exception e) { + 
log.error(e.getLocalizedMessage()); + } + } +} diff --git a/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/UIMAUpdateRequestProcessor.java b/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/UIMAUpdateRequestProcessor.java new file mode 100644 index 00000000000..f042f2a3531 --- /dev/null +++ b/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/UIMAUpdateRequestProcessor.java @@ -0,0 +1,126 @@ +package org.apache.solr.uima.processor; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.io.IOException; +import java.util.Map; + +import org.apache.solr.common.SolrInputDocument; +import org.apache.solr.core.SolrCore; +import org.apache.solr.uima.processor.ae.AEProvider; +import org.apache.solr.uima.processor.ae.AEProviderFactory; +import org.apache.solr.update.AddUpdateCommand; +import org.apache.solr.update.processor.UpdateRequestProcessor; +import org.apache.uima.UIMAException; +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; + +/** + * Update document(s) to be indexed with UIMA extracted information + * + * @version $Id$ + */ +public class UIMAUpdateRequestProcessor extends UpdateRequestProcessor { + + private SolrUIMAConfiguration solrUIMAConfiguration; + + private AEProvider aeProvider; + + public UIMAUpdateRequestProcessor(UpdateRequestProcessor next, SolrCore solrCore) { + super(next); + initialize(solrCore); + } + + private void initialize(SolrCore solrCore) { + SolrUIMAConfigurationReader uimaConfigurationReader = new SolrUIMAConfigurationReader(solrCore + .getSolrConfig()); + solrUIMAConfiguration = uimaConfigurationReader.readSolrUIMAConfiguration(); + aeProvider = AEProviderFactory.getInstance().getAEProvider(solrCore.getName(), + solrUIMAConfiguration.getAePath(), solrUIMAConfiguration.getRuntimeParameters()); + } + + public void processAdd(AddUpdateCommand cmd) throws IOException { + try { + /* get Solr document */ + SolrInputDocument solrInputDocument = cmd.getSolrInputDocument(); + + /* get the fields to analyze */ + for (String text : getTextsToAnalyze(solrInputDocument)) { + if (text != null && !"".equals(text)) { + /* process the text value */ + JCas jcas = processText(text); + + UIMAToSolrMapper uimaToSolrMapper = new UIMAToSolrMapper(solrInputDocument, jcas); + /* get field mapping from config */ + Map> typesAndFeaturesFieldsMap = 
solrUIMAConfiguration + .getTypesFeaturesFieldsMapping(); + /* map type features on fields */ + for (String typeFQN : typesAndFeaturesFieldsMap.keySet()) { + uimaToSolrMapper.map(typeFQN, typesAndFeaturesFieldsMap.get(typeFQN)); + } + } + } + } catch (UIMAException e) { + throw new RuntimeException(e); + } + super.processAdd(cmd); + } + + /* + * get the texts to analyze from the corresponding fields + */ + private String[] getTextsToAnalyze(SolrInputDocument solrInputDocument) { + String[] fieldsToAnalyze = solrUIMAConfiguration.getFieldsToAnalyze(); + boolean merge = solrUIMAConfiguration.isFieldsMerging(); + String[] textVals = null; + if (merge) { + StringBuilder unifiedText = new StringBuilder(""); + for (int i = 0; i < fieldsToAnalyze.length; i++) { + unifiedText.append(String.valueOf(solrInputDocument.getFieldValue(fieldsToAnalyze[i]))); + } + textVals = new String[1]; + textVals[0] = unifiedText.toString(); + } else { + textVals = new String[fieldsToAnalyze.length]; + for (int i = 0; i < fieldsToAnalyze.length; i++) { + textVals[i] = String.valueOf(solrInputDocument.getFieldValue(fieldsToAnalyze[i])); + } + } + return textVals; + } + + /* process a field value executing UIMA the CAS containing it as document text */ + private JCas processText(String textFieldValue) throws ResourceInitializationException, + AnalysisEngineProcessException { + log.info(new StringBuffer("Analazying text").toString()); + /* get the UIMA analysis engine */ + AnalysisEngine ae = aeProvider.getAE(); + + /* create a JCas which contain the text to analyze */ + JCas jcas = ae.newJCas(); + jcas.setDocumentText(textFieldValue); + + /* perform analysis on text field */ + ae.process(jcas); + log.info(new StringBuilder("Text processing completed").toString()); + return jcas; + } + +} diff --git a/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/UIMAUpdateRequestProcessorFactory.java 
b/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/UIMAUpdateRequestProcessorFactory.java new file mode 100644 index 00000000000..5078f9d3a19 --- /dev/null +++ b/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/UIMAUpdateRequestProcessorFactory.java @@ -0,0 +1,37 @@ +package org.apache.solr.uima.processor; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.solr.request.SolrQueryRequest; +import org.apache.solr.response.SolrQueryResponse; +import org.apache.solr.update.processor.UpdateRequestProcessor; +import org.apache.solr.update.processor.UpdateRequestProcessorFactory; + +/** + * Factory for {@link UIMAUpdateRequestProcessor} + * + * @version $Id$ + */ +public class UIMAUpdateRequestProcessorFactory extends UpdateRequestProcessorFactory { + + public UpdateRequestProcessor getInstance(SolrQueryRequest req, SolrQueryResponse rsp, + UpdateRequestProcessor next) { + return new UIMAUpdateRequestProcessor(next, req.getCore()); + } + +} diff --git a/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/ae/AEProvider.java b/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/ae/AEProvider.java new file mode 100644 index 00000000000..89c981ab6e3 --- /dev/null +++ b/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/ae/AEProvider.java @@ -0,0 +1,32 @@ +package org.apache.solr.uima.processor.ae; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.resource.ResourceInitializationException; + +/** + * provide an Apache UIMA {@link AnalysisEngine} + * + * @version $Id$ + */ +public interface AEProvider { + + public AnalysisEngine getAE() throws ResourceInitializationException; + +} diff --git a/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/ae/AEProviderFactory.java b/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/ae/AEProviderFactory.java new file mode 100644 index 00000000000..2104e753353 --- /dev/null +++ b/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/ae/AEProviderFactory.java @@ -0,0 +1,53 @@ +package org.apache.solr.uima.processor.ae; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.util.HashMap; +import java.util.Map; + +/** + * Singleton factory class responsible of {@link AEProvider}s' creation + * + * @version $Id$ + */ +public class AEProviderFactory { + + private static AEProviderFactory instance; + + private Map providerCache = new HashMap(); + + private AEProviderFactory() { + // Singleton + } + + public static AEProviderFactory getInstance() { + if (instance == null) { + instance = new AEProviderFactory(); + } + return instance; + } + + public synchronized AEProvider getAEProvider(String core, String aePath, + Map runtimeParameters) { + String key = new StringBuilder(core).append(aePath).toString(); + if (providerCache.get(key) == null) { + providerCache.put(key, new OverridingParamsAEProvider(aePath, runtimeParameters)); + } + return providerCache.get(key); + } +} diff --git a/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/ae/OverridingParamsAEProvider.java b/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/ae/OverridingParamsAEProvider.java new file mode 100644 index 00000000000..d4d74910379 --- /dev/null +++ b/solr/contrib/uima/src/main/java/org/apache/solr/uima/processor/ae/OverridingParamsAEProvider.java @@ -0,0 +1,89 @@ +package org.apache.solr.uima.processor.ae; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.net.URL; +import java.util.Map; + +import org.apache.uima.UIMAFramework; +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.util.XMLInputSource; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * {@link AEProvider} implementation that creates an Aggregate AE from the given path, also + * injecting runtime parameters defined in the solrconfig.xml Solr configuration file and assigning + * them as overriding parameters in the aggregate AE + * + * @version $Id$ + */ +public class OverridingParamsAEProvider implements AEProvider { + + private static Logger log = LoggerFactory.getLogger(OverridingParamsAEProvider.class); + + private String aeFilePath; + + private AnalysisEngine cachedAE; + + private Map runtimeParameters; + + public OverridingParamsAEProvider(String aeFilePath, Map runtimeParameters) { + this.aeFilePath = aeFilePath; + this.runtimeParameters = runtimeParameters; + } + + public synchronized AnalysisEngine getAE() throws ResourceInitializationException { + try { + if (cachedAE == null) { + // get Resource Specifier from XML file + URL url = this.getClass().getResource(aeFilePath); + XMLInputSource in = new XMLInputSource(url); + + // get AE description + AnalysisEngineDescription desc = UIMAFramework.getXMLParser() + .parseAnalysisEngineDescription(in); + + /* iterate over each AE (to set runtime parameters) */ + for (String attributeName : runtimeParameters.keySet()) { + desc.getAnalysisEngineMetaData().getConfigurationParameterSettings().setParameterValue( + attributeName, runtimeParameters.get(attributeName)); + log.info(new StringBuilder("setting ").append(attributeName).append(" : ").append( + runtimeParameters.get(attributeName)).toString()); + } + 
// create AE here + cachedAE = UIMAFramework.produceAnalysisEngine(desc); + if (log.isDebugEnabled()) + log.debug(new StringBuilder("AE ").append(cachedAE.getAnalysisEngineMetaData().getName()) + .append(" created from descriptor ").append(aeFilePath).toString()); + } else { + cachedAE.reconfigure(); + if (log.isDebugEnabled()) + log.debug(new StringBuilder("AE ").append(cachedAE.getAnalysisEngineMetaData().getName()) + .append(" at path ").append(aeFilePath).append(" reconfigured ").toString()); + } + } catch (Exception e) { + cachedAE = null; + throw new ResourceInitializationException(e); + } + return cachedAE; + } + +} \ No newline at end of file diff --git a/solr/contrib/uima/src/main/resources/org/apache/uima/desc/AggregateSentenceAE.xml b/solr/contrib/uima/src/main/resources/org/apache/uima/desc/AggregateSentenceAE.xml new file mode 100644 index 00000000000..75ae50e500e --- /dev/null +++ b/solr/contrib/uima/src/main/resources/org/apache/uima/desc/AggregateSentenceAE.xml @@ -0,0 +1,41 @@ + + + org.apache.uima.java + false + + + + + + + + + + AggregateSentenceAE + + 1.0 + + + + + + WhitespaceTokenizer + HmmTagger + + + + + + + + + + + + true + true + false + + + + diff --git a/solr/contrib/uima/src/main/resources/org/apache/uima/desc/ExtServicesAE.xml b/solr/contrib/uima/src/main/resources/org/apache/uima/desc/ExtServicesAE.xml new file mode 100644 index 00000000000..ef5268fd592 --- /dev/null +++ b/solr/contrib/uima/src/main/resources/org/apache/uima/desc/ExtServicesAE.xml @@ -0,0 +1,57 @@ + + + org.apache.uima.java + false + + + + + + + + + + + + + + + + + + + + + + ExtServicesAE + + 1.0 + + + + + + OpenCalaisAnnotator + TextKeywordExtractionAEDescriptor + TextLanguageDetectionAEDescriptor + TextCategorizationAEDescriptor + TextConceptTaggingAEDescriptor + TextRankedEntityExtractionAEDescriptor + + + + + + + + + + + + true + true + false + + + + diff --git a/solr/contrib/uima/src/main/resources/org/apache/uima/desc/HmmTagger.xml 
b/solr/contrib/uima/src/main/resources/org/apache/uima/desc/HmmTagger.xml new file mode 100644 index 00000000000..8fe4216d91a --- /dev/null +++ b/solr/contrib/uima/src/main/resources/org/apache/uima/desc/HmmTagger.xml @@ -0,0 +1,121 @@ + + + + org.apache.uima.java + true + org.apache.uima.examples.tagger.HMMTagger + + Hidden Markov Model - Part of Speech Tagger + A configuration of the HmmTaggerAnnotator that looks for + parts of speech of identified tokens within existing + Sentence and Token annotations. See also + WhitespaceTokenizer.xml. + 1.0 + The Apache Software Foundation + + + NGRAM_SIZE + Integer + false + true + + + + + NGRAM_SIZE + + 3 + + + + + + + org.apache.uima.TokenAnnotation + Single token annotation + uima.tcas.Annotation + + + posTag + contains part-of-speech of a + corresponding token + uima.cas.String + + + + + org.apache.uima.SentenceAnnotation + sentence annotation + uima.tcas.Annotation + + + + + + + + + org.apache.uima.TokenAnnotation + org.apache.uima.SentenceAnnotation + org.apache.uima.TokenAnnotation:end + org.apache.uima.TokenAnnotation:begin + + + org.apache.uima.TokenAnnotation + org.apache.uima.TokenAnnotation:posTag + org.apache.uima.TokenAnnotation:end + org.apache.uima.TokenAnnotation:begin + + + + + + true + true + false + + + + + Model + HMM Tagger model file + org.apache.uima.examples.tagger.IModelResource + false + + + + + + ModelFile + HMM Tagger model file + + file:english/BrownModel.dat + + org.apache.uima.examples.tagger.ModelResource + + + + + Model + ModelFile + + + + diff --git a/solr/contrib/uima/src/main/resources/org/apache/uima/desc/OpenCalaisAnnotator.xml b/solr/contrib/uima/src/main/resources/org/apache/uima/desc/OpenCalaisAnnotator.xml new file mode 100644 index 00000000000..e7b0c07dcd4 --- /dev/null +++ b/solr/contrib/uima/src/main/resources/org/apache/uima/desc/OpenCalaisAnnotator.xml @@ -0,0 +1,194 @@ + + + org.apache.uima.java + true + org.apache.uima.annotator.calais.OpenCalaisAnnotator + + 
OpenCalaisAnnotator + + + + allowDistribution + + Boolean + false + true + + + allowSearch + + Boolean + false + true + + + submitter + + String + false + true + + + licenseID + + String + false + true + + + + + allowDistribution + + false + + + + allowSearch + + false + + + + submitter + + + + + + licenseID + + OC_LICENSE_ID + + + + + + + org.apache.uima.calais.Person + + org.apache.uima.calais.BaseType + + + org.apache.uima.calais.Anniversary + + org.apache.uima.calais.BaseType + + + org.apache.uima.calais.City + + org.apache.uima.calais.BaseType + + + org.apache.uima.calais.Company + + org.apache.uima.calais.BaseType + + + org.apache.uima.calais.Continent + + org.apache.uima.calais.BaseType + + + org.apache.uima.calais.Country + + org.apache.uima.calais.BaseType + + + org.apache.uima.calais.Currency + + org.apache.uima.calais.BaseType + + + org.apache.uima.calais.EmailAddress + + org.apache.uima.calais.BaseType + + + org.apache.uima.calais.Facility + + org.apache.uima.calais.BaseType + + + org.apache.uima.calais.FaxNumber + + org.apache.uima.calais.BaseType + + + org.apache.uima.calais.Holiday + + org.apache.uima.calais.BaseType + + + org.apache.uima.calais.IndustryTerm + + org.apache.uima.calais.BaseType + + + org.apache.uima.calais.NaturalDisaster + + org.apache.uima.calais.BaseType + + + org.apache.uima.calais.NaturalFeature + + org.apache.uima.calais.BaseType + + + org.apache.uima.calais.Organization + + org.apache.uima.calais.BaseType + + + org.apache.uima.calais.PhoneNumber + + org.apache.uima.calais.BaseType + + + org.apache.uima.calais.ProviceOrState + + org.apache.uima.calais.BaseType + + + org.apache.uima.calais.Region + + org.apache.uima.calais.BaseType + + + org.apache.uima.calais.Technology + + org.apache.uima.calais.BaseType + + + org.apache.uima.calais.URL + + org.apache.uima.calais.BaseType + + + org.apache.uima.calais.BaseType + + uima.tcas.Annotation + + + calaisType + OpenCalais type + uima.cas.String + + + + + + + + + + + + + + true + true + 
false + + + diff --git a/solr/contrib/uima/src/main/resources/org/apache/uima/desc/OverridingParamsExtServicesAE.xml b/solr/contrib/uima/src/main/resources/org/apache/uima/desc/OverridingParamsExtServicesAE.xml new file mode 100644 index 00000000000..81bd4029016 --- /dev/null +++ b/solr/contrib/uima/src/main/resources/org/apache/uima/desc/OverridingParamsExtServicesAE.xml @@ -0,0 +1,147 @@ + + + org.apache.uima.java + false + + + + + + + + + + + + + + + + + + + + + + + + + ExtServicesAE + + 1.0 + + + + oc_licenseID + String + false + true + + OpenCalaisAnnotator/licenseID + + + + keyword_apikey + String + false + true + + TextKeywordExtractionAEDescriptor/apikey + + + + concept_apikey + String + false + true + + TextConceptTaggingAEDescriptor/apikey + + + + lang_apikey + String + false + true + + TextLanguageDetectionAEDescriptor/apikey + + + + cat_apikey + String + false + true + + TextCategorizationAEDescriptor/apikey + + + + entities_apikey + String + false + true + + TextRankedEntityExtractionAEDescriptor/apikey + + + + + + oc_licenseID + + licenseid + + + + keyword_apikey + + apikey + + + + concept_apikey + + apikey + + + + lang_apikey + + apikey + + + + cat_apikey + + apikey + + + + + + AggregateSentenceAE + OpenCalaisAnnotator + TextKeywordExtractionAEDescriptor + TextLanguageDetectionAEDescriptor + TextCategorizationAEDescriptor + TextConceptTaggingAEDescriptor + TextRankedEntityExtractionAEDescriptor + + + + + + + + + + + + true + true + false + + + + diff --git a/solr/contrib/uima/src/main/resources/org/apache/uima/desc/TextCategorizationAEDescriptor.xml b/solr/contrib/uima/src/main/resources/org/apache/uima/desc/TextCategorizationAEDescriptor.xml new file mode 100644 index 00000000000..16aff2b7775 --- /dev/null +++ b/solr/contrib/uima/src/main/resources/org/apache/uima/desc/TextCategorizationAEDescriptor.xml @@ -0,0 +1,102 @@ + + + + org.apache.uima.java + true + org.apache.uima.alchemy.annotator.TextCategorizationAnnotator + + 
TextCategorizationAEDescriptor + + 1.0 + + + + apikey + String + false + true + + + outputMode + String + false + true + + + baseUrl + String + false + false + + + + + outputMode + + xml + + + + apikey + + AA_API_KEY + + + + + + + org.apache.uima.alchemy.ts.categorization.Category + + uima.cas.TOP + + + score + + uima.cas.String + + + text + + uima.cas.String + + + + + + + + + + + + + + + + true + true + false + + + + diff --git a/solr/contrib/uima/src/main/resources/org/apache/uima/desc/TextConceptTaggingAEDescriptor.xml b/solr/contrib/uima/src/main/resources/org/apache/uima/desc/TextConceptTaggingAEDescriptor.xml new file mode 100644 index 00000000000..ee9166c3a2e --- /dev/null +++ b/solr/contrib/uima/src/main/resources/org/apache/uima/desc/TextConceptTaggingAEDescriptor.xml @@ -0,0 +1,196 @@ + + + + org.apache.uima.java + true + org.apache.uima.alchemy.annotator.TextConceptTaggingAnnotator + + TextConceptTaggingAEDescriptor + + 1.0 + + + + apikey + String + false + true + + + outputMode + String + false + true + + + linkedData + String + false + false + + + showSourceText + Integer + false + true + + + maxRetrieve + String + false + false + + + url + String + false + false + + + + + apikey + + + + + + outputMode + + xml + + + + linkedData + + 1 + + + + showSourceText + + 0 + + + + maxRetrieve + + 8 + + + + + + + org.apache.uima.alchemy.ts.concept.ConceptFS + a concept tag + uima.cas.TOP + + + text + + uima.cas.String + + + relevance + + uima.cas.String + + + website + + uima.cas.String + + + geo + + uima.cas.String + + + dbpedia + + uima.cas.String + + + yago + + uima.cas.String + + + opencyc + + uima.cas.String + + + freebase + + uima.cas.String + + + ciaFactbook + + uima.cas.String + + + census + + uima.cas.String + + + geonames + + uima.cas.String + + + musicBrainz + + uima.cas.String + + + crunchbase + + uima.cas.String + + + semanticCrunchbase + + uima.cas.String + + + + + + + + + + + + + + + + true + true + false + + + + diff --git 
a/solr/contrib/uima/src/main/resources/org/apache/uima/desc/TextKeywordExtractionAEDescriptor.xml b/solr/contrib/uima/src/main/resources/org/apache/uima/desc/TextKeywordExtractionAEDescriptor.xml new file mode 100644 index 00000000000..af6a5127a75 --- /dev/null +++ b/solr/contrib/uima/src/main/resources/org/apache/uima/desc/TextKeywordExtractionAEDescriptor.xml @@ -0,0 +1,107 @@ + + + org.apache.uima.java + true + org.apache.uima.alchemy.annotator.TextKeywordExtractionAnnotator + + TextKeywordExtractionAEDescriptor + + 1.0 + + + + apikey + String + false + true + + + outputMode + String + false + true + + + baseUrl + String + false + false + + + url + String + false + false + + + maxRetrieve + Integer + false + false + + + showSourceText + Integer + false + false + + + + + outputMode + + xml + + + + apikey + + 04490000a72fe7ec5cb3497f14e77f338c86f2fe + + + + maxRetrieve + + 10 + + + + showSourceText + + 0 + + + + + + + org.apache.uima.alchemy.ts.keywords.KeywordFS + + uima.cas.TOP + + + text + + uima.cas.String + + + + + + + + + + + + + + + + true + true + false + + + + diff --git a/solr/contrib/uima/src/main/resources/org/apache/uima/desc/TextLanguageDetectionAEDescriptor.xml b/solr/contrib/uima/src/main/resources/org/apache/uima/desc/TextLanguageDetectionAEDescriptor.xml new file mode 100644 index 00000000000..6f9fb982ab8 --- /dev/null +++ b/solr/contrib/uima/src/main/resources/org/apache/uima/desc/TextLanguageDetectionAEDescriptor.xml @@ -0,0 +1,107 @@ + + + org.apache.uima.java + true + org.apache.uima.alchemy.annotator.TextLanguageDetectionAnnotator + + TextLanguageDetectionAEDescriptor + + 1.0 + + + + apikey + String + false + true + + + outputMode + String + false + true + + + url + String + false + false + + + + + outputMode + + xml + + + + apikey + + AA_API_KEY + + + + + + + org.apache.uima.alchemy.ts.language.LanguageFS + + uima.cas.TOP + + + language + + uima.cas.String + + + iso6391 + + uima.cas.String + + + iso6392 + + uima.cas.String + + + iso6393 + + 
uima.cas.String + + + ethnologue + + uima.cas.String + + + nativeSpeakers + + uima.cas.String + + + wikipedia + + uima.cas.String + + + + + + + + + + + + + + + + true + true + false + + + + diff --git a/solr/contrib/uima/src/main/resources/org/apache/uima/desc/TextRankedEntityExtractionAEDescriptor.xml b/solr/contrib/uima/src/main/resources/org/apache/uima/desc/TextRankedEntityExtractionAEDescriptor.xml new file mode 100644 index 00000000000..410d6c9f825 --- /dev/null +++ b/solr/contrib/uima/src/main/resources/org/apache/uima/desc/TextRankedEntityExtractionAEDescriptor.xml @@ -0,0 +1,403 @@ + + + + org.apache.uima.java + true + org.apache.uima.alchemy.annotator.TextRankedNamedEntityExtractionAnnotator + + TextRankedEntityExtractionAEDescriptor + + 1.0 + + + + apikey + String + false + true + + + outputMode + String + false + true + + + disambiguate + Integer + false + true + + + linkedData + String + false + false + + + showSourceText + Integer + false + true + + + baseUrl + String + false + false + + + url + String + false + false + + + coreference + String + false + false + + + quotations + String + false + false + + + + + apikey + + + + + + outputMode + + xml + + + + disambiguate + + 1 + + + + linkedData + + 1 + + + + coreference + + 1 + + + + showSourceText + + 0 + + + + quotations + + 1 + + + + + + + + + + org.apache.uima.alchemy.ts.entity.Anniversary + + org.apache.uima.alchemy.ts.entity.BaseEntity + + + org.apache.uima.alchemy.ts.entity.Automobile + + org.apache.uima.alchemy.ts.entity.BaseEntity + + + org.apache.uima.alchemy.ts.entity.City + + org.apache.uima.alchemy.ts.entity.BaseEntity + + + org.apache.uima.alchemy.ts.entity.Company + + org.apache.uima.alchemy.ts.entity.BaseEntity + + + org.apache.uima.alchemy.ts.entity.Continent + + org.apache.uima.alchemy.ts.entity.BaseEntity + + + org.apache.uima.alchemy.ts.entity.Country + + org.apache.uima.alchemy.ts.entity.BaseEntity + + + org.apache.uima.alchemy.ts.entity.EntertainmentAward + + 
org.apache.uima.alchemy.ts.entity.BaseEntity + + + org.apache.uima.alchemy.ts.entity.Facility + + org.apache.uima.alchemy.ts.entity.BaseEntity + + + org.apache.uima.alchemy.ts.entity.FieldTerminology + + org.apache.uima.alchemy.ts.entity.BaseEntity + + + org.apache.uima.alchemy.ts.entity.FinancialMarketIndex + + org.apache.uima.alchemy.ts.entity.BaseEntity + + + org.apache.uima.alchemy.ts.entity.GeographicFeature + + org.apache.uima.alchemy.ts.entity.BaseEntity + + + org.apache.uima.alchemy.ts.entity.HealthCondition + + org.apache.uima.alchemy.ts.entity.BaseEntity + + + org.apache.uima.alchemy.ts.entity.Holiday + + org.apache.uima.alchemy.ts.entity.BaseEntity + + + org.apache.uima.alchemy.ts.entity.Movie + + org.apache.uima.alchemy.ts.entity.BaseEntity + + + org.apache.uima.alchemy.ts.entity.MusicGroup + + org.apache.uima.alchemy.ts.entity.BaseEntity + + + org.apache.uima.alchemy.ts.entity.NaturalDisaster + + org.apache.uima.alchemy.ts.entity.BaseEntity + + + org.apache.uima.alchemy.ts.entity.Organization + + org.apache.uima.alchemy.ts.entity.BaseEntity + + + org.apache.uima.alchemy.ts.entity.Person + + org.apache.uima.alchemy.ts.entity.BaseEntity + + + org.apache.uima.alchemy.ts.entity.PrintMedia + + org.apache.uima.alchemy.ts.entity.BaseEntity + + + org.apache.uima.alchemy.ts.entity.RadioProgram + + org.apache.uima.alchemy.ts.entity.BaseEntity + + + org.apache.uima.alchemy.ts.entity.RadioStation + + org.apache.uima.alchemy.ts.entity.BaseEntity + + + org.apache.uima.alchemy.ts.entity.Region + + org.apache.uima.alchemy.ts.entity.BaseEntity + + + org.apache.uima.alchemy.ts.entity.Sport + + org.apache.uima.alchemy.ts.entity.BaseEntity + + + org.apache.uima.alchemy.ts.entity.StateOrCounty + + org.apache.uima.alchemy.ts.entity.BaseEntity + + + org.apache.uima.alchemy.ts.entity.Technology + + org.apache.uima.alchemy.ts.entity.BaseEntity + + + org.apache.uima.alchemy.ts.entity.TelevisionShow + + org.apache.uima.alchemy.ts.entity.BaseEntity + + + 
org.apache.uima.alchemy.ts.entity.TelevisionStation + + org.apache.uima.alchemy.ts.entity.BaseEntity + + + org.apache.uima.alchemy.ts.entity.OperatingSystem + + org.apache.uima.alchemy.ts.entity.BaseEntity + + + org.apache.uima.alchemy.ts.entity.SportingEvent + + org.apache.uima.alchemy.ts.entity.BaseEntity + + + org.apache.uima.alchemy.ts.entity.Drug + + org.apache.uima.alchemy.ts.entity.BaseEntity + + + org.apache.uima.alchemy.ts.entity.BaseEntity + + uima.cas.TOP + + + text + + uima.cas.String + + + count + + uima.cas.String + + + relevance + + uima.cas.String + + + disambiguation + + uima.cas.String + + + subType + + uima.cas.String + + + website + + uima.cas.String + + + geo + + uima.cas.String + + + dbpedia + + uima.cas.String + + + yago + + uima.cas.String + + + opencyc + + uima.cas.String + + + umbel + + uima.cas.String + + + freebase + + uima.cas.String + + + ciaFactbook + + uima.cas.String + + + census + + uima.cas.String + + + geonames + + uima.cas.String + + + musicBrainz + + uima.cas.String + + + quotations + + uima.cas.StringArray + true + + + occurrences + A list of annotations annotating this entity + uima.cas.FSList + uima.tcas.Annotation + + + + + + + + + + + + + + + + true + true + false + + + + diff --git a/solr/contrib/uima/src/main/resources/org/apache/uima/desc/WhitespaceTokenizer.xml b/solr/contrib/uima/src/main/resources/org/apache/uima/desc/WhitespaceTokenizer.xml new file mode 100644 index 00000000000..686dbefc5d3 --- /dev/null +++ b/solr/contrib/uima/src/main/resources/org/apache/uima/desc/WhitespaceTokenizer.xml @@ -0,0 +1,115 @@ + + + + + + + org.apache.uima.java + + true + + org.apache.uima.annotator.WhitespaceTokenizer + + + + WhitespaceTokenizer + + creates token and sentence annotations for whitespace + separated languages + + 1.0 + The Apache Software Foundation + + + + SofaNames + + The Sofa names the annotator should work on. If no + names are specified, the annotator works on the + default sofa. 
+ + String + true + false + + + + + + + + + + + org.apache.uima.TokenAnnotation + Single token annotation + uima.tcas.Annotation + + + tokenType + token type + uima.cas.String + + + + + + org.apache.uima.SentenceAnnotation + sentence annotation + uima.tcas.Annotation + + + + + + + + + + + + + org.apache.uima.TokenAnnotation + + org.apache.uima.TokenAnnotation:tokentype + + org.apache.uima.SentenceAnnotation + + + x-unspecified + + + + + + + diff --git a/solr/contrib/uima/src/main/resources/org/apache/uima/desc/baseAlchemyTypeSystemDescriptor.xml b/solr/contrib/uima/src/main/resources/org/apache/uima/desc/baseAlchemyTypeSystemDescriptor.xml new file mode 100644 index 00000000000..32d5d843a48 --- /dev/null +++ b/solr/contrib/uima/src/main/resources/org/apache/uima/desc/baseAlchemyTypeSystemDescriptor.xml @@ -0,0 +1,41 @@ + + + + baseAlchemyTypeSystemDescriptor + + 1.0 + + + + org.apache.uima.alchemy.ts.entity.AlchemyAnnotation + + uima.tcas.Annotation + + + alchemyType + alchemyAPI type + uima.cas.String + + + + + diff --git a/solr/contrib/uima/src/main/resources/solr/conf/aggregate-uima-config.xml b/solr/contrib/uima/src/main/resources/solr/conf/aggregate-uima-config.xml new file mode 100644 index 00000000000..0e66585bf80 --- /dev/null +++ b/solr/contrib/uima/src/main/resources/solr/conf/aggregate-uima-config.xml @@ -0,0 +1,33 @@ + + + + + + VALID_ALCHEMYAPI_KEY + VALID_ALCHEMYAPI_KEY + VALID_ALCHEMYAPI_KEY + VALID_ALCHEMYAPI_KEY + VALID_OPENCALAIS_KEY + + /org/apache/uima/desc/OverridingParamsExtServicesAE.xml + text,title + + + + + + \ No newline at end of file diff --git a/solr/contrib/uima/src/main/resources/solr/conf/uima-fields.xml b/solr/contrib/uima/src/main/resources/solr/conf/uima-fields.xml new file mode 100644 index 00000000000..270aaa6f62e --- /dev/null +++ b/solr/contrib/uima/src/main/resources/solr/conf/uima-fields.xml @@ -0,0 +1,9 @@ + + + + + + + + + \ No newline at end of file diff --git 
a/solr/contrib/uima/src/test/java/org/apache/solr/uima/processor/UIMAUpdateRequestProcessorTest.java b/solr/contrib/uima/src/test/java/org/apache/solr/uima/processor/UIMAUpdateRequestProcessorTest.java new file mode 100644 index 00000000000..3d2058bae6b --- /dev/null +++ b/solr/contrib/uima/src/test/java/org/apache/solr/uima/processor/UIMAUpdateRequestProcessorTest.java @@ -0,0 +1,137 @@ +package org.apache.solr.uima.processor; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +import java.net.URL; +import java.net.URLConnection; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Map; + +import org.apache.solr.SolrTestCaseJ4; +import org.apache.solr.common.params.MultiMapSolrParams; +import org.apache.solr.common.params.SolrParams; +import org.apache.solr.common.params.UpdateParams; +import org.apache.solr.common.util.ContentStream; +import org.apache.solr.common.util.ContentStreamBase; +import org.apache.solr.core.SolrCore; +import org.apache.solr.handler.XmlUpdateRequestHandler; +import org.apache.solr.request.SolrQueryRequestBase; +import org.apache.solr.response.SolrQueryResponse; +import org.apache.solr.update.processor.UpdateRequestProcessorChain; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; + +/** + * TestCase for {@link UIMAUpdateRequestProcessor} + * + * @version $Id$ + */ +public class UIMAUpdateRequestProcessorTest extends SolrTestCaseJ4 { + + @BeforeClass + public static void beforeClass() throws Exception { + initCore("solrconfig.xml", "schema.xml", "solr-uima"); + } + + @Before + public void setUp() throws Exception { + super.setUp(); + clearIndex(); + assertU(commit()); + } + + @Test + public void testProcessorConfiguration() { + SolrCore core = h.getCore(); + UpdateRequestProcessorChain chained = core.getUpdateProcessingChain("uima"); + assertNotNull(chained); + UIMAUpdateRequestProcessorFactory factory = (UIMAUpdateRequestProcessorFactory) chained + .getFactories()[0]; + assertNotNull(factory); + } + + @Test + public void testProcessing() throws Exception { + // this test requires an internet connection (e.g. opencalais api) + checkInternetConnection(); + + addDoc(adoc( + "id", + "2312312321312", + "text", + "SpellCheckComponent got improvement related to recent Lucene changes. \n " + + "Add support for specifying Spelling SuggestWord Comparator to Lucene spell " + + "checkers for SpellCheckComponent. 
Issue SOLR-2053 is already fixed, patch is" + + " attached if you need it, but it is also committed to trunk and 3_x branch." + + " Last Lucene European Conference has been held in Prague.")); + assertU(commit()); + assertQ(req("language:english"), "//*[@numFound='1']"); + } + + @Test + public void testTwoUpdates() { + // this test requires an internet connection (e.g. opencalais api) + checkInternetConnection(); + + try { + addDoc(adoc("id", "1", "text", "The Apache Software Foundation is happy to announce " + + "BarCampApache Sydney, Australia, the first ASF-backed event in the Southern " + + "Hemisphere!")); + assertU(commit()); + assertQ(req("language:english"), "//*[@numFound='1']"); + + addDoc(adoc("id", "2", "text", "Taking place 11th December 2010 at the University " + + "of Sydney's Darlington Centre, the BarCampApache \"unconference\" will be" + + " attendee-driven, facilitated by members of the Apache community and will " + + "focus on the Apache...")); + assertU(commit()); + assertQ(req("language:english"), "//*[@numFound='2']"); + + } catch (Exception e) { + assumeNoException("Multiple updates on same instance didn't work", e); + } + } + + private void addDoc(String doc) throws Exception { + Map params = new HashMap(); + params.put(UpdateParams.UPDATE_PROCESSOR, new String[] { "uima" }); + MultiMapSolrParams mmparams = new MultiMapSolrParams(params); + SolrQueryRequestBase req = new SolrQueryRequestBase(h.getCore(), (SolrParams) mmparams) { + }; + + XmlUpdateRequestHandler handler = new XmlUpdateRequestHandler(); + handler.init(null); + ArrayList streams = new ArrayList(2); + streams.add(new ContentStreamBase.StringStream(doc)); + req.setContentStreams(streams); + handler.handleRequestBody(req, new SolrQueryResponse()); + } + + private void checkInternetConnection() { + try { + URLConnection conn = new URL("http://www.apache.org/").openConnection(); + conn.setConnectTimeout(5000); + conn.setReadTimeout(5000); + conn.connect(); + } catch (Exception ex) 
{ + assumeNoException("This test requires an internet connection", ex); + } + } +} diff --git a/solr/contrib/uima/src/test/resources/solr-uima/conf/protwords.txt b/solr/contrib/uima/src/test/resources/solr-uima/conf/protwords.txt new file mode 100644 index 00000000000..1dfc0abecbf --- /dev/null +++ b/solr/contrib/uima/src/test/resources/solr-uima/conf/protwords.txt @@ -0,0 +1,21 @@ +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#----------------------------------------------------------------------- +# Use a protected word file to protect against the stemmer reducing two +# unrelated words to the same base word. + +# Some non-words that normally won't be encountered, +# just to test that they won't be stemmed. 
+dontstems +zwhacky + diff --git a/solr/contrib/uima/src/test/resources/solr-uima/conf/schema.xml b/solr/contrib/uima/src/test/resources/solr-uima/conf/schema.xml new file mode 100644 index 00000000000..ff447a97f2b --- /dev/null +++ b/solr/contrib/uima/src/test/resources/solr-uima/conf/schema.xml @@ -0,0 +1,679 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + id + + + text + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/solr/contrib/uima/src/test/resources/solr-uima/conf/solrconfig.xml b/solr/contrib/uima/src/test/resources/solr-uima/conf/solrconfig.xml new file mode 100644 index 00000000000..173505fa67f --- /dev/null +++ b/solr/contrib/uima/src/test/resources/solr-uima/conf/solrconfig.xml @@ -0,0 +1,1108 @@ + + + + + + + LUCENE_40 + ${solr.abortOnConfigurationError:true} + + + + + + + + + + + + + + + + ${solr.data.dir:./solr/data} + + + + + + false + + 10 + + + + + 32 + + 10000 + 1000 + 10000 + + + + + + + + + + + + + native + + + + + + + false + 32 + 10 + + + + + + + + false + + + true + + + + + + + + 1 + + 0 + + + + + false + + + + + + + + + + + + + + + + + + + + + + + + + + + 1024 + + + + + + + + + + + + + + + + true + + + + + + + + 20 + + + 200 + + + + + + + + + + + + + + solr rocks + 0 + 10 + + + static firstSearcher warming query from + solrconfig.xml + + + + + + false + + + 2 + + + + + + + + + + + + + + + + + + + + + + + explicit + + + + + + + + + + + + + dismax + explicit + 0.01 + + text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 + manu^1.1 cat^1.4 + + + text^0.2 features^1.1 name^1.5 
manu^1.4 + manu_exact^1.9 + + + popularity^0.5 recip(price,1,1000,1000)^0.3 + + + id,name,price,score + + + 2<-1 5<-2 6<90% + 100 + *:* + + text features name + + 0 + + name + regex + + + + + + + dismax + explicit + text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 + 2<-1 5<-2 6<90% + + incubationdate_dt:[* TO NOW/DAY-1MONTH]^2.2 + + + + inStock:true + + + + cat + manu_exact + price:[* TO 500] + price:[500 TO *] + + + + + + + + + + textSpell + + + default + name + ./spellchecker + + + + + + + + + + + + false + + false + + 1 + + + spellcheck + + + + + + + + true + + + tvComponent + + + + + + + + + default + + org.carrot2.clustering.lingo.LingoClusteringAlgorithm + + 20 + + + stc + org.carrot2.clustering.stc.STCClusteringAlgorithm + + + + + true + default + true + + name + id + + features + + true + + + + false + + + clusteringComponent + + + + + + + + text + true + ignored_ + + + true + links + ignored_ + + + + + + + + + + true + + + termsComponent + + + + + + + + + + uima + + + + + + + + + + + + + + + + + + + + + + + + + + standard + solrpingquery + all + + + + + + + explicit + true + + + + + + + + + 100 + + + + + + + + 70 + + 0.5 + + [-\w ,/\n\"']{20,200} + + + + + + + ]]> + ]]> + + + + + + + + + + + + + + + + + + + 5 + + + + + + + + + + * + + + + + + + 04490000a72fe7ec5cb3497f14e77f338c86f2fe + 04490000a72fe7ec5cb3497f14e77f338c86f2fe + 04490000a72fe7ec5cb3497f14e77f338c86f2fe + 04490000a72fe7ec5cb3497f14e77f338c86f2fe + 04490000a72fe7ec5cb3497f14e77f338c86f2fe + g6h9zamsdtwhb93nc247ecrs + + /org/apache/uima/desc/OverridingParamsExtServicesAE.xml + text + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/solr/contrib/uima/src/test/resources/solr-uima/conf/spellings.txt b/solr/contrib/uima/src/test/resources/solr-uima/conf/spellings.txt new file mode 100644 index 00000000000..162a044d561 --- /dev/null +++ b/solr/contrib/uima/src/test/resources/solr-uima/conf/spellings.txt @@ -0,0 +1,2 @@ +pizza +history diff --git 
a/solr/contrib/uima/src/test/resources/solr-uima/conf/stopwords.txt b/solr/contrib/uima/src/test/resources/solr-uima/conf/stopwords.txt new file mode 100644 index 00000000000..b5824da3263 --- /dev/null +++ b/solr/contrib/uima/src/test/resources/solr-uima/conf/stopwords.txt @@ -0,0 +1,58 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#----------------------------------------------------------------------- +# a couple of test stopwords to test that the words are really being +# configured from this file: +stopworda +stopwordb + +#Standard english stop words taken from Lucene's StopAnalyzer +a +an +and +are +as +at +be +but +by +for +if +in +into +is +it +no +not +of +on +or +s +such +t +that +the +their +then +there +these +they +this +to +was +will +with + diff --git a/solr/contrib/uima/src/test/resources/solr-uima/conf/synonyms.txt b/solr/contrib/uima/src/test/resources/solr-uima/conf/synonyms.txt new file mode 100644 index 00000000000..b0e31cb7ec8 --- /dev/null +++ b/solr/contrib/uima/src/test/resources/solr-uima/conf/synonyms.txt @@ -0,0 +1,31 @@ +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#----------------------------------------------------------------------- +#some test synonym mappings unlikely to appear in real input text +aaa => aaaa +bbb => bbbb1 bbbb2 +ccc => cccc1,cccc2 +a\=>a => b\=>b +a\,a => b\,b +fooaaa,baraaa,bazaaa + +# Some synonym groups specific to this example +GB,gib,gigabyte,gigabytes +MB,mib,megabyte,megabytes +Television, Televisions, TV, TVs +#notice we use "gib" instead of "GiB" so any WordDelimiterFilter coming +#after us won't split it into two words. + +# Synonym mappings can be used for spelling correction too +pixima => pixma +