SOLR-2512: add ignoreErrors flag so that users can ignore exceptions in AE.

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1102785 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Koji Sekiguchi 2011-05-13 15:12:53 +00:00
parent ed573a067b
commit 0ec6d7a81b
10 changed files with 260 additions and 10 deletions

View File

@ -33,6 +33,9 @@ New Features
* SOLR-2503: extend mapping function to map feature value to dynamicField. (koji)
* SOLR-2512: add ignoreErrors flag so that users can ignore exceptions in AE.
(Tommaso Teofili, koji)
Test Cases:
----------------------

View File

@ -30,6 +30,13 @@ To start using Solr UIMA Metadata Extraction Library you should go through the f
<str name="oc_licenseID">VALID_OPENCALAIS_KEY</str>
</lst>
<str name="analysisEngine">/org/apache/uima/desc/OverridingParamsExtServicesAE.xml</str>
<!-- Set to true if you want to continue indexing even if text processing fails.
Default is false, in which case Solr throws a RuntimeException and
none of the documents in your session are indexed. -->
<bool name="ignoreErrors">true</bool>
<!-- Optional. Names the field whose value is reported when text processing
fails. Usually set to the uniqueKey field name. -->
<str name="logField">id</str>
<lst name="analyzeFields">
<bool name="merge">false</bool>
<arr name="fields">

View File

@ -36,14 +36,20 @@ public class SolrUIMAConfiguration {
private Map<String, Object> runtimeParameters;
private boolean ignoreErrors;
private String logField;
public SolrUIMAConfiguration(String aePath, String[] fieldsToAnalyze, boolean fieldsMerging,
Map<String, Map<String, MapField>> typesFeaturesFieldsMapping,
Map<String, Object> runtimeParameters) {
Map<String, Object> runtimeParameters, boolean ignoreErrors, String logField) {
this.aePath = aePath;
this.fieldsToAnalyze = fieldsToAnalyze;
this.fieldsMerging = fieldsMerging;
this.runtimeParameters = runtimeParameters;
this.typesFeaturesFieldsMapping = typesFeaturesFieldsMapping;
this.ignoreErrors = ignoreErrors;
this.logField = logField;
}
public String[] getFieldsToAnalyze() {
@ -66,6 +72,14 @@ public class SolrUIMAConfiguration {
return runtimeParameters;
}
/**
 * Tells whether the ignoreErrors flag supplied to the constructor is set.
 *
 * @return the stored ignoreErrors value
 */
public boolean isIgnoreErrors() {
  return this.ignoreErrors;
}
/**
 * Returns the logField value supplied to the constructor.
 *
 * @return the stored log field name; may be {@code null} if none was given
 */
public String getLogField() {
  return this.logField;
}
static final class MapField {
private String fieldName, fieldNameFeature;

View File

@ -40,7 +40,8 @@ public class SolrUIMAConfigurationReader {
public SolrUIMAConfiguration readSolrUIMAConfiguration() {
return new SolrUIMAConfiguration(readAEPath(), readFieldsToAnalyze(), readFieldsMerging(),
readTypesFeaturesFieldsMapping(), readAEOverridingParameters());
readTypesFeaturesFieldsMapping(), readAEOverridingParameters(), readIgnoreErrors(),
readLogField());
}
private String readAEPath() {
@ -105,4 +106,12 @@ public class SolrUIMAConfigurationReader {
return runtimeParameters;
}
/**
 * Looks up the "ignoreErrors" entry in the processor arguments.
 *
 * @return {@code false} when the entry is absent, otherwise the entry cast to Boolean
 */
private boolean readIgnoreErrors() {
  final Object flag = args.get("ignoreErrors");
  if (flag == null) {
    return false;
  }
  return (Boolean) flag;
}
/**
 * Looks up the "logField" entry in the processor arguments.
 *
 * @return the entry cast to String, or {@code null} when it is absent
 */
private String readLogField() {
  final Object fieldName = args.get("logField");
  return (String) fieldName;
}
}

View File

@ -20,7 +20,9 @@ package org.apache.solr.uima.processor;
import java.io.IOException;
import java.util.Map;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.core.SolrCore;
import org.apache.solr.uima.processor.SolrUIMAConfiguration.MapField;
import org.apache.solr.uima.processor.ae.AEProvider;
@ -58,12 +60,15 @@ public class UIMAUpdateRequestProcessor extends UpdateRequestProcessor {
@Override
public void processAdd(AddUpdateCommand cmd) throws IOException {
String text = null;
try {
/* get Solr document */
SolrInputDocument solrInputDocument = cmd.getSolrInputDocument();
/* get the fields to analyze */
for (String text : getTextsToAnalyze(solrInputDocument)) {
String[] texts = getTextsToAnalyze(solrInputDocument);
for (int i = 0; i < texts.length; i++) {
text = texts[i];
if (text != null && !"".equals(text)) {
/* process the text value */
JCas jcas = processText(text);
@ -79,7 +84,21 @@ public class UIMAUpdateRequestProcessor extends UpdateRequestProcessor {
}
}
} catch (UIMAException e) {
throw new RuntimeException(e);
// Describe the failing document through the optional logField. The value is
// fetched with getFieldValue() and appended as an Object, so a document that
// lacks the field or stores a non-String value no longer raises a secondary
// NullPointerException/ClassCastException that would hide the UIMA failure.
String logField = solrUIMAConfiguration.getLogField();
String optionalFieldInfo = ".";
if (logField != null) {
  Object logFieldValue = cmd.getSolrInputDocument().getFieldValue(logField);
  optionalFieldInfo = new StringBuilder(". ").append(logField).append("=")
      .append(logFieldValue).append(", ").toString();
}
// Quote at most the first 100 characters of the text. Text shorter than that
// is quoted in full (substring(0, 100) would throw), and text may still be
// null if the failure occurred before a value was assigned.
String textExcerpt;
if (text == null) {
  textExcerpt = "";
} else if (text.length() > 100) {
  textExcerpt = text.substring(0, 100) + "...";
} else {
  textExcerpt = text;
}
String errorDetail = new StringBuilder()
    .append(e.getLocalizedMessage()).append(optionalFieldInfo)
    .append(" text=\"").append(textExcerpt).append("\"").toString();
if (solrUIMAConfiguration.isIgnoreErrors()) {
  // Best effort: report the failure and let the document continue down the chain.
  log.warn("skip the text processing due to " + errorDetail);
} else {
  throw new SolrException(ErrorCode.SERVER_ERROR, "processing error: " + errorDetail, e);
}
}
super.processAdd(cmd);
}

View File

@ -93,7 +93,7 @@ public class UIMAUpdateRequestProcessorTest extends SolrTestCaseJ4 {
@Test
public void testProcessing() throws Exception {
addDoc(adoc(
addDoc("uima", adoc(
"id",
"2312312321312",
"text",
@ -111,13 +111,13 @@ public class UIMAUpdateRequestProcessorTest extends SolrTestCaseJ4 {
@Test
public void testTwoUpdates() throws Exception {
addDoc(adoc("id", "1", "text", "The Apache Software Foundation is happy to announce "
addDoc("uima", adoc("id", "1", "text", "The Apache Software Foundation is happy to announce "
+ "BarCampApache Sydney, Australia, the first ASF-backed event in the Southern "
+ "Hemisphere!"));
assertU(commit());
assertQ(req("sentence:*"), "//*[@numFound='1']");
addDoc(adoc("id", "2", "text", "Taking place 11th December 2010 at the University "
addDoc("uima", adoc("id", "2", "text", "Taking place 11th December 2010 at the University "
+ "of Sydney's Darlington Centre, the BarCampApache \"unconference\" will be"
+ " attendee-driven, facilitated by members of the Apache community and will "
+ "focus on the Apache..."));
@ -128,9 +128,41 @@ public class UIMAUpdateRequestProcessorTest extends SolrTestCaseJ4 {
assertQ(req("ORGANIZATION_sm:Apache"), "//*[@numFound='2']");
}
private void addDoc(String doc) throws Exception {
/**
 * Sends the same document through both error-handling chains: the
 * "uima-not-ignoreErrors" chain must raise a RuntimeException and leave the
 * index empty, while the "uima-ignoreErrors" chain must index the document.
 */
@Test
public void testErrorHandling() throws Exception {
  final String docId = "2312312321312";
  final String docText =
      "SpellCheckComponent got improvement related to recent Lucene changes. \n "
      + "Add support for specifying Spelling SuggestWord Comparator to Lucene spell "
      + "checkers for SpellCheckComponent. Issue SOLR-2053 is already fixed, patch is"
      + " attached if you need it, but it is also committed to trunk and 3_x branch."
      + " Last Lucene European Conference has been held in Prague.";

  // ignoreErrors=false: the failure has to reach the caller.
  try {
    addDoc("uima-not-ignoreErrors", adoc("id", docId, "text", docText));
    fail("exception shouldn't be ignored");
  } catch (RuntimeException expected) {
    // expected: the chain is configured to propagate analysis failures
  }
  assertU(commit());
  assertQ(req("*:*"), "//*[@numFound='0']");

  // ignoreErrors=true: the failure is skipped and the document is indexed.
  addDoc("uima-ignoreErrors", adoc("id", docId, "text", docText));
  assertU(commit());
  assertQ(req("*:*"), "//*[@numFound='1']");
}
private void addDoc(String chain, String doc) throws Exception {
Map<String, String[]> params = new HashMap<String, String[]>();
params.put(UpdateParams.UPDATE_CHAIN, new String[] { "uima" });
params.put(UpdateParams.UPDATE_CHAIN, new String[] { chain });
MultiMapSolrParams mmparams = new MultiMapSolrParams(params);
SolrQueryRequestBase req = new SolrQueryRequestBase(h.getCore(), (SolrParams) mmparams) {
};

View File

@ -0,0 +1,31 @@
package org.apache.solr.uima.processor.an;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.jcas.JCas;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Annotator whose {@link #process(JCas)} always fails with an
 * {@link AnalysisEngineProcessException}.
 */
public class DummyExceptionAnnotator extends JCasAnnotator_ImplBase {

  /**
   * Throws unconditionally; the given CAS is never read or modified.
   *
   * @param jcas the CAS handed over by the framework (unused)
   * @throws AnalysisEngineProcessException always
   */
  @Override
  public void process(JCas jcas) throws AnalysisEngineProcessException {
    AnalysisEngineProcessException failure = new AnalysisEngineProcessException();
    throw failure;
  }
}

View File

@ -0,0 +1,40 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<analysisEngineDescription xmlns="http://uima.apache.org/resourceSpecifier">
<frameworkImplementation>org.apache.uima.java</frameworkImplementation>
<primitive>true</primitive>
<annotatorImplementationName>org.apache.solr.uima.processor.an.DummyExceptionAnnotator</annotatorImplementationName>
<analysisEngineMetaData>
<name>DummyExceptionAEDescriptor</name>
<description/>
<version>1.0</version>
<vendor>ASF</vendor>
<configurationParameters/>
<configurationParameterSettings/>
<typeSystemDescription/>
<typePriorities/>
<fsIndexCollection/>
<capabilities/>
<operationalProperties>
<modifiesCas>true</modifiesCas>
<multipleDeploymentAllowed>true</multipleDeploymentAllowed>
<outputsNewCASes>false</outputsNewCASes>
</operationalProperties>
</analysisEngineMetaData>
<resourceManagerConfiguration/>
</analysisEngineDescription>

View File

@ -0,0 +1,54 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<analysisEngineDescription xmlns="http://uima.apache.org/resourceSpecifier">
<frameworkImplementation>org.apache.uima.java</frameworkImplementation>
<primitive>false</primitive>
<delegateAnalysisEngineSpecifiers>
<delegateAnalysisEngine key="DummyExceptionAEDescriptor">
<import location="DummyExceptionAEDescriptor.xml"/>
</delegateAnalysisEngine>
</delegateAnalysisEngineSpecifiers>
<analysisEngineMetaData>
<name>TestExceptionAE</name>
<description/>
<version>1.0</version>
<vendor/>
<configurationParameters/>
<configurationParameterSettings/>
<flowConstraints>
<fixedFlow>
<node>DummyExceptionAEDescriptor</node>
</fixedFlow>
</flowConstraints>
<fsIndexCollection/>
<capabilities>
<capability>
<inputs/>
<outputs/>
<languagesSupported/>
</capability>
</capabilities>
<operationalProperties>
<modifiesCas>true</modifiesCas>
<multipleDeploymentAllowed>true</multipleDeploymentAllowed>
<outputsNewCASes>false</outputsNewCASes>
</operationalProperties>
</analysisEngineMetaData>
<resourceManagerConfiguration/>
</analysisEngineDescription>

View File

@ -1003,7 +1003,6 @@
</lst>
</lst>
</processor>
<processor class="solr.LogUpdateProcessorFactory" />
<processor class="solr.RunUpdateProcessorFactory" />
</updateRequestProcessorChain>
@ -1037,6 +1036,48 @@
</processor>
</updateRequestProcessorChain>
<updateRequestProcessorChain name="uima-not-ignoreErrors">
<processor class="org.apache.solr.uima.processor.UIMAUpdateRequestProcessorFactory">
<lst name="uimaConfig">
<lst name="runtimeParameters">
<int name="ngramsize">3</int>
</lst>
<str name="analysisEngine">/TestExceptionAE.xml</str>
<bool name="ignoreErrors">false</bool>
<lst name="analyzeFields">
<bool name="merge">false</bool>
<arr name="fields">
<str>text</str>
</arr>
</lst>
<lst name="fieldMappings"/>
</lst>
</processor>
<processor class="solr.RunUpdateProcessorFactory" />
</updateRequestProcessorChain>
<updateRequestProcessorChain name="uima-ignoreErrors">
<processor class="org.apache.solr.uima.processor.UIMAUpdateRequestProcessorFactory">
<lst name="uimaConfig">
<lst name="runtimeParameters">
<int name="ngramsize">3</int>
</lst>
<str name="analysisEngine">/TestExceptionAE.xml</str>
<bool name="ignoreErrors">true</bool>
<!-- Optional. Names the field whose value is reported when text processing fails. Usually set to the uniqueKey field name. -->
<str name="logField">id</str>
<lst name="analyzeFields">
<bool name="merge">false</bool>
<arr name="fields">
<str>text</str>
</arr>
</lst>
<lst name="fieldMappings"/>
</lst>
</processor>
<processor class="solr.RunUpdateProcessorFactory" />
</updateRequestProcessorChain>
<!--
queryResponseWriter plugins... query responses will be written using
the writer specified by the 'wt' request parameter matching the name