LUCENE-3731: performance improvements and thread safety fixes to UIMA tokenizers

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1244688 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Robert Muir 2012-02-15 20:29:20 +00:00
parent 264eb5690a
commit c97e3edbb9
5 changed files with 77 additions and 104 deletions

View File

@ -18,6 +18,7 @@ package org.apache.lucene.analysis.uima;
*/
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.uima.ae.AEProviderFactory;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
@ -35,28 +36,31 @@ import java.io.Reader;
public abstract class BaseUIMATokenizer extends Tokenizer {
protected FSIterator<AnnotationFS> iterator;
protected final AnalysisEngine ae;
protected final CAS cas;
protected BaseUIMATokenizer(Reader reader) {
protected BaseUIMATokenizer(Reader reader, String descriptorPath) {
super(reader);
try {
ae = AEProviderFactory.getInstance().getAEProvider("", descriptorPath).getAE();
cas = ae.newCAS();
} catch (ResourceInitializationException e) {
throw new RuntimeException(e);
}
}
/**
* analyzes the tokenizer input using the given analysis engine
*
* @param analysisEngine the AE to use for analyzing the tokenizer input
* @return CAS with extracted metadata (UIMA annotations, feature structures)
* @throws ResourceInitializationException
*
* {@link #cas} will be filled with extracted metadata (UIMA annotations, feature structures)
*
* @throws AnalysisEngineProcessException
* @throws IOException
*/
protected CAS analyzeInput(AnalysisEngine analysisEngine) throws ResourceInitializationException,
AnalysisEngineProcessException, IOException {
CAS cas = analysisEngine.newCAS();
protected void analyzeInput() throws AnalysisEngineProcessException,IOException {
cas.reset();
cas.setDocumentText(toString(input));
analysisEngine.process(cas);
analysisEngine.destroy();
return cas;
ae.process(cas);
}
private String toString(Reader reader) throws IOException {
@ -78,4 +82,6 @@ public abstract class BaseUIMATokenizer extends Tokenizer {
public void end() throws IOException {
iterator = null;
}
}

View File

@ -20,14 +20,9 @@ package org.apache.lucene.analysis.uima;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.uima.ae.AEProviderFactory;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.InvalidXMLException;
import java.io.IOException;
import java.io.Reader;
@ -42,23 +37,18 @@ public final class UIMAAnnotationsTokenizer extends BaseUIMATokenizer {
private final OffsetAttribute offsetAttr;
private final String tokenTypeString;
private final String descriptorPath;
private int finalOffset = 0;
public UIMAAnnotationsTokenizer(String descriptorPath, String tokenType, Reader input) {
super(input);
super(input, descriptorPath);
this.tokenTypeString = tokenType;
this.termAttr = addAttribute(CharTermAttribute.class);
this.offsetAttr = addAttribute(OffsetAttribute.class);
this.descriptorPath = descriptorPath;
}
private void analyzeText(String descriptorPath) throws IOException, ResourceInitializationException,
AnalysisEngineProcessException {
AnalysisEngine ae = AEProviderFactory.getInstance().getAEProvider("", descriptorPath).getAE();
CAS cas = analyzeInput(ae);
private void analyzeText() throws IOException, AnalysisEngineProcessException {
analyzeInput();
finalOffset = correctOffset(cas.getDocumentText().length());
Type tokenType = cas.getTypeSystem().getType(tokenTypeString);
iterator = cas.getAnnotationIndex(tokenType).iterator();
@ -68,7 +58,7 @@ public final class UIMAAnnotationsTokenizer extends BaseUIMATokenizer {
public boolean incrementToken() throws IOException {
if (iterator == null) {
try {
analyzeText(descriptorPath);
analyzeText();
} catch (Exception e) {
throw new IOException(e);
}

View File

@ -21,16 +21,11 @@ import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.uima.ae.AEProviderFactory;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.cas.FeaturePath;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.InvalidXMLException;
import java.io.IOException;
import java.io.Reader;
@ -49,28 +44,23 @@ public final class UIMATypeAwareAnnotationsTokenizer extends BaseUIMATokenizer {
private final String tokenTypeString;
private final String descriptorPath;
private final String typeAttributeFeaturePath;
private FeaturePath featurePath;
private int finalOffset = 0;
public UIMATypeAwareAnnotationsTokenizer(String descriptorPath, String tokenType, String typeAttributeFeaturePath, Reader input) {
super(input);
super(input, descriptorPath);
this.tokenTypeString = tokenType;
this.termAttr = addAttribute(CharTermAttribute.class);
this.typeAttr = addAttribute(TypeAttribute.class);
this.offsetAttr = addAttribute(OffsetAttribute.class);
this.typeAttributeFeaturePath = typeAttributeFeaturePath;
this.descriptorPath = descriptorPath;
}
private void analyzeText() throws IOException, ResourceInitializationException, AnalysisEngineProcessException,
CASException {
AnalysisEngine ae = AEProviderFactory.getInstance().getAEProvider("", descriptorPath).getAE();
CAS cas = analyzeInput(ae);
private void analyzeText() throws IOException, AnalysisEngineProcessException, CASException {
analyzeInput();
finalOffset = correctOffset(cas.getDocumentText().length());
Type tokenType = cas.getTypeSystem().getType(tokenTypeString);
iterator = cas.getAnnotationIndex(tokenType).iterator();

View File

@ -17,6 +17,9 @@ package org.apache.lucene.analysis.uima.ae;
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.util.IOUtils;
import org.apache.uima.UIMAFramework;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
@ -30,38 +33,55 @@ import org.apache.uima.util.XMLInputSource;
public class BasicAEProvider implements AEProvider {
private final String aePath;
private AnalysisEngine cachedAE;
private AnalysisEngineDescription cachedDescription;
public BasicAEProvider(String aePath) {
this.aePath = aePath;
}
@Override
public synchronized AnalysisEngine getAE() throws ResourceInitializationException {
try {
if (cachedAE == null) {
// get Resource Specifier from XML file
XMLInputSource in;
public AnalysisEngine getAE() throws ResourceInitializationException {
synchronized(this) {
if (cachedDescription == null) {
XMLInputSource in = null;
boolean success = false;
try {
in = new XMLInputSource(aePath);
// get Resource Specifier from XML file
in = getInputSource();
// get AE description
cachedDescription = UIMAFramework.getXMLParser()
.parseAnalysisEngineDescription(in);
configureDescription(cachedDescription);
success = true;
} catch (Exception e) {
in = new XMLInputSource(getClass().getResource(aePath));
throw new ResourceInitializationException(e);
} finally {
if (success) {
try {
IOUtils.close(in.getInputStream());
} catch (IOException e) {
throw new ResourceInitializationException(e);
}
} else if (in != null) {
IOUtils.closeWhileHandlingException(in.getInputStream());
}
}
// get AE description
AnalysisEngineDescription desc = UIMAFramework.getXMLParser()
.parseAnalysisEngineDescription(in);
// create AE here
cachedAE = UIMAFramework.produceAnalysisEngine(desc);
} else {
cachedAE.reconfigure();
}
} catch (Exception e) {
cachedAE = null;
throw new ResourceInitializationException(e);
}
}
return UIMAFramework.produceAnalysisEngine(cachedDescription);
}
protected void configureDescription(AnalysisEngineDescription description) {
// no configuration
}
private XMLInputSource getInputSource() throws IOException {
try {
return new XMLInputSource(aePath);
} catch (IOException e) {
return new XMLInputSource(getClass().getResource(aePath));
}
return cachedAE;
}
}

View File

@ -17,11 +17,7 @@ package org.apache.lucene.analysis.uima.ae;
* limitations under the License.
*/
import org.apache.uima.UIMAFramework;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.XMLInputSource;
import java.util.Map;
@ -30,51 +26,22 @@ import java.util.Map;
* injecting runtime parameters defined in the solrconfig.xml Solr configuration file and assigning
* them as overriding parameters in the aggregate AE
*/
public class OverridingParamsAEProvider implements AEProvider {
private final String aePath;
private AnalysisEngine cachedAE;
public class OverridingParamsAEProvider extends BasicAEProvider {
private final Map<String, Object> runtimeParameters;
public OverridingParamsAEProvider(String aePath, Map<String, Object> runtimeParameters) {
this.aePath = aePath;
super(aePath);
this.runtimeParameters = runtimeParameters;
}
@Override
public synchronized AnalysisEngine getAE() throws ResourceInitializationException {
try {
if (cachedAE == null) {
// get Resource Specifier from XML file
XMLInputSource in;
try {
in = new XMLInputSource(aePath);
} catch (Exception e) {
in = new XMLInputSource(getClass().getResource(aePath));
}
// get AE description
AnalysisEngineDescription desc = UIMAFramework.getXMLParser()
.parseAnalysisEngineDescription(in);
/* iterate over each AE (to set runtime parameters) */
for (String attributeName : runtimeParameters.keySet()) {
Object val = getRuntimeValue(desc, attributeName);
desc.getAnalysisEngineMetaData().getConfigurationParameterSettings().setParameterValue(
attributeName, val);
}
// create AE here
cachedAE = UIMAFramework.produceAnalysisEngine(desc);
} else {
cachedAE.reconfigure();
}
} catch (Exception e) {
cachedAE = null;
throw new ResourceInitializationException(e);
protected void configureDescription(AnalysisEngineDescription description) {
for (String attributeName : runtimeParameters.keySet()) {
Object val = getRuntimeValue(description, attributeName);
description.getAnalysisEngineMetaData().getConfigurationParameterSettings().setParameterValue(
attributeName, val);
}
return cachedAE;
}
/* create the value to inject in the runtime parameter depending on its declared type */