mirror of https://github.com/apache/lucene.git
LUCENE-3731: performance improvements and thread safety fixes to UIMA tokenizers
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1244688 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
264eb5690a
commit
c97e3edbb9
|
@ -18,6 +18,7 @@ package org.apache.lucene.analysis.uima;
|
|||
*/
|
||||
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.uima.ae.AEProviderFactory;
|
||||
import org.apache.uima.analysis_engine.AnalysisEngine;
|
||||
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
|
||||
import org.apache.uima.cas.CAS;
|
||||
|
@ -35,28 +36,31 @@ import java.io.Reader;
|
|||
public abstract class BaseUIMATokenizer extends Tokenizer {
|
||||
|
||||
protected FSIterator<AnnotationFS> iterator;
|
||||
protected final AnalysisEngine ae;
|
||||
protected final CAS cas;
|
||||
|
||||
protected BaseUIMATokenizer(Reader reader) {
|
||||
protected BaseUIMATokenizer(Reader reader, String descriptorPath) {
|
||||
super(reader);
|
||||
try {
|
||||
ae = AEProviderFactory.getInstance().getAEProvider("", descriptorPath).getAE();
|
||||
cas = ae.newCAS();
|
||||
} catch (ResourceInitializationException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* analyzes the tokenizer input using the given analysis engine
|
||||
*
|
||||
* @param analysisEngine the AE to use for analyzing the tokenizer input
|
||||
* @return CAS with extracted metadata (UIMA annotations, feature structures)
|
||||
* @throws ResourceInitializationException
|
||||
*
|
||||
* {@link #cas} will be filled with extracted metadata (UIMA annotations, feature structures)
|
||||
*
|
||||
* @throws AnalysisEngineProcessException
|
||||
* @throws IOException
|
||||
*/
|
||||
protected CAS analyzeInput(AnalysisEngine analysisEngine) throws ResourceInitializationException,
|
||||
AnalysisEngineProcessException, IOException {
|
||||
CAS cas = analysisEngine.newCAS();
|
||||
protected void analyzeInput() throws AnalysisEngineProcessException,IOException {
|
||||
cas.reset();
|
||||
cas.setDocumentText(toString(input));
|
||||
analysisEngine.process(cas);
|
||||
analysisEngine.destroy();
|
||||
return cas;
|
||||
ae.process(cas);
|
||||
}
|
||||
|
||||
private String toString(Reader reader) throws IOException {
|
||||
|
@ -78,4 +82,6 @@ public abstract class BaseUIMATokenizer extends Tokenizer {
|
|||
public void end() throws IOException {
|
||||
iterator = null;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -20,14 +20,9 @@ package org.apache.lucene.analysis.uima;
|
|||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.uima.ae.AEProviderFactory;
|
||||
import org.apache.uima.analysis_engine.AnalysisEngine;
|
||||
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
|
||||
import org.apache.uima.cas.CAS;
|
||||
import org.apache.uima.cas.Type;
|
||||
import org.apache.uima.cas.text.AnnotationFS;
|
||||
import org.apache.uima.resource.ResourceInitializationException;
|
||||
import org.apache.uima.util.InvalidXMLException;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
|
@ -42,23 +37,18 @@ public final class UIMAAnnotationsTokenizer extends BaseUIMATokenizer {
|
|||
private final OffsetAttribute offsetAttr;
|
||||
|
||||
private final String tokenTypeString;
|
||||
|
||||
private final String descriptorPath;
|
||||
|
||||
|
||||
private int finalOffset = 0;
|
||||
|
||||
public UIMAAnnotationsTokenizer(String descriptorPath, String tokenType, Reader input) {
|
||||
super(input);
|
||||
super(input, descriptorPath);
|
||||
this.tokenTypeString = tokenType;
|
||||
this.termAttr = addAttribute(CharTermAttribute.class);
|
||||
this.offsetAttr = addAttribute(OffsetAttribute.class);
|
||||
this.descriptorPath = descriptorPath;
|
||||
}
|
||||
|
||||
private void analyzeText(String descriptorPath) throws IOException, ResourceInitializationException,
|
||||
AnalysisEngineProcessException {
|
||||
AnalysisEngine ae = AEProviderFactory.getInstance().getAEProvider("", descriptorPath).getAE();
|
||||
CAS cas = analyzeInput(ae);
|
||||
private void analyzeText() throws IOException, AnalysisEngineProcessException {
|
||||
analyzeInput();
|
||||
finalOffset = correctOffset(cas.getDocumentText().length());
|
||||
Type tokenType = cas.getTypeSystem().getType(tokenTypeString);
|
||||
iterator = cas.getAnnotationIndex(tokenType).iterator();
|
||||
|
@ -68,7 +58,7 @@ public final class UIMAAnnotationsTokenizer extends BaseUIMATokenizer {
|
|||
public boolean incrementToken() throws IOException {
|
||||
if (iterator == null) {
|
||||
try {
|
||||
analyzeText(descriptorPath);
|
||||
analyzeText();
|
||||
} catch (Exception e) {
|
||||
throw new IOException(e);
|
||||
}
|
||||
|
|
|
@ -21,16 +21,11 @@ import org.apache.lucene.analysis.Tokenizer;
|
|||
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
import org.apache.lucene.analysis.uima.ae.AEProviderFactory;
|
||||
import org.apache.uima.analysis_engine.AnalysisEngine;
|
||||
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
|
||||
import org.apache.uima.cas.CAS;
|
||||
import org.apache.uima.cas.CASException;
|
||||
import org.apache.uima.cas.FeaturePath;
|
||||
import org.apache.uima.cas.Type;
|
||||
import org.apache.uima.cas.text.AnnotationFS;
|
||||
import org.apache.uima.resource.ResourceInitializationException;
|
||||
import org.apache.uima.util.InvalidXMLException;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
|
@ -49,28 +44,23 @@ public final class UIMATypeAwareAnnotationsTokenizer extends BaseUIMATokenizer {
|
|||
|
||||
private final String tokenTypeString;
|
||||
|
||||
private final String descriptorPath;
|
||||
|
||||
private final String typeAttributeFeaturePath;
|
||||
|
||||
private FeaturePath featurePath;
|
||||
|
||||
|
||||
private int finalOffset = 0;
|
||||
|
||||
public UIMATypeAwareAnnotationsTokenizer(String descriptorPath, String tokenType, String typeAttributeFeaturePath, Reader input) {
|
||||
super(input);
|
||||
super(input, descriptorPath);
|
||||
this.tokenTypeString = tokenType;
|
||||
this.termAttr = addAttribute(CharTermAttribute.class);
|
||||
this.typeAttr = addAttribute(TypeAttribute.class);
|
||||
this.offsetAttr = addAttribute(OffsetAttribute.class);
|
||||
this.typeAttributeFeaturePath = typeAttributeFeaturePath;
|
||||
this.descriptorPath = descriptorPath;
|
||||
}
|
||||
|
||||
private void analyzeText() throws IOException, ResourceInitializationException, AnalysisEngineProcessException,
|
||||
CASException {
|
||||
AnalysisEngine ae = AEProviderFactory.getInstance().getAEProvider("", descriptorPath).getAE();
|
||||
CAS cas = analyzeInput(ae);
|
||||
private void analyzeText() throws IOException, AnalysisEngineProcessException, CASException {
|
||||
analyzeInput();
|
||||
finalOffset = correctOffset(cas.getDocumentText().length());
|
||||
Type tokenType = cas.getTypeSystem().getType(tokenTypeString);
|
||||
iterator = cas.getAnnotationIndex(tokenType).iterator();
|
||||
|
|
|
@ -17,6 +17,9 @@ package org.apache.lucene.analysis.uima.ae;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.lucene.util.IOUtils;
|
||||
import org.apache.uima.UIMAFramework;
|
||||
import org.apache.uima.analysis_engine.AnalysisEngine;
|
||||
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
|
||||
|
@ -30,38 +33,55 @@ import org.apache.uima.util.XMLInputSource;
|
|||
public class BasicAEProvider implements AEProvider {
|
||||
|
||||
private final String aePath;
|
||||
private AnalysisEngine cachedAE;
|
||||
private AnalysisEngineDescription cachedDescription;
|
||||
|
||||
public BasicAEProvider(String aePath) {
|
||||
this.aePath = aePath;
|
||||
}
|
||||
|
||||
@Override
|
||||
public synchronized AnalysisEngine getAE() throws ResourceInitializationException {
|
||||
try {
|
||||
if (cachedAE == null) {
|
||||
// get Resource Specifier from XML file
|
||||
|
||||
XMLInputSource in;
|
||||
public AnalysisEngine getAE() throws ResourceInitializationException {
|
||||
synchronized(this) {
|
||||
if (cachedDescription == null) {
|
||||
XMLInputSource in = null;
|
||||
boolean success = false;
|
||||
try {
|
||||
in = new XMLInputSource(aePath);
|
||||
// get Resource Specifier from XML file
|
||||
in = getInputSource();
|
||||
|
||||
// get AE description
|
||||
cachedDescription = UIMAFramework.getXMLParser()
|
||||
.parseAnalysisEngineDescription(in);
|
||||
configureDescription(cachedDescription);
|
||||
success = true;
|
||||
} catch (Exception e) {
|
||||
in = new XMLInputSource(getClass().getResource(aePath));
|
||||
throw new ResourceInitializationException(e);
|
||||
} finally {
|
||||
if (success) {
|
||||
try {
|
||||
IOUtils.close(in.getInputStream());
|
||||
} catch (IOException e) {
|
||||
throw new ResourceInitializationException(e);
|
||||
}
|
||||
} else if (in != null) {
|
||||
IOUtils.closeWhileHandlingException(in.getInputStream());
|
||||
}
|
||||
}
|
||||
|
||||
// get AE description
|
||||
AnalysisEngineDescription desc = UIMAFramework.getXMLParser()
|
||||
.parseAnalysisEngineDescription(in);
|
||||
|
||||
// create AE here
|
||||
cachedAE = UIMAFramework.produceAnalysisEngine(desc);
|
||||
} else {
|
||||
cachedAE.reconfigure();
|
||||
}
|
||||
} catch (Exception e) {
|
||||
cachedAE = null;
|
||||
throw new ResourceInitializationException(e);
|
||||
}
|
||||
}
|
||||
|
||||
return UIMAFramework.produceAnalysisEngine(cachedDescription);
|
||||
}
|
||||
|
||||
protected void configureDescription(AnalysisEngineDescription description) {
|
||||
// no configuration
|
||||
}
|
||||
|
||||
private XMLInputSource getInputSource() throws IOException {
|
||||
try {
|
||||
return new XMLInputSource(aePath);
|
||||
} catch (IOException e) {
|
||||
return new XMLInputSource(getClass().getResource(aePath));
|
||||
}
|
||||
return cachedAE;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,11 +17,7 @@ package org.apache.lucene.analysis.uima.ae;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.uima.UIMAFramework;
|
||||
import org.apache.uima.analysis_engine.AnalysisEngine;
|
||||
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
|
||||
import org.apache.uima.resource.ResourceInitializationException;
|
||||
import org.apache.uima.util.XMLInputSource;
|
||||
|
||||
import java.util.Map;
|
||||
|
||||
|
@ -30,51 +26,22 @@ import java.util.Map;
|
|||
* injecting runtime parameters defined in the solrconfig.xml Solr configuration file and assigning
|
||||
* them as overriding parameters in the aggregate AE
|
||||
*/
|
||||
public class OverridingParamsAEProvider implements AEProvider {
|
||||
|
||||
private final String aePath;
|
||||
|
||||
private AnalysisEngine cachedAE;
|
||||
public class OverridingParamsAEProvider extends BasicAEProvider {
|
||||
|
||||
private final Map<String, Object> runtimeParameters;
|
||||
|
||||
public OverridingParamsAEProvider(String aePath, Map<String, Object> runtimeParameters) {
|
||||
this.aePath = aePath;
|
||||
super(aePath);
|
||||
this.runtimeParameters = runtimeParameters;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public synchronized AnalysisEngine getAE() throws ResourceInitializationException {
|
||||
try {
|
||||
if (cachedAE == null) {
|
||||
// get Resource Specifier from XML file
|
||||
XMLInputSource in;
|
||||
try {
|
||||
in = new XMLInputSource(aePath);
|
||||
} catch (Exception e) {
|
||||
in = new XMLInputSource(getClass().getResource(aePath));
|
||||
}
|
||||
|
||||
// get AE description
|
||||
AnalysisEngineDescription desc = UIMAFramework.getXMLParser()
|
||||
.parseAnalysisEngineDescription(in);
|
||||
|
||||
/* iterate over each AE (to set runtime parameters) */
|
||||
for (String attributeName : runtimeParameters.keySet()) {
|
||||
Object val = getRuntimeValue(desc, attributeName);
|
||||
desc.getAnalysisEngineMetaData().getConfigurationParameterSettings().setParameterValue(
|
||||
attributeName, val);
|
||||
}
|
||||
// create AE here
|
||||
cachedAE = UIMAFramework.produceAnalysisEngine(desc);
|
||||
} else {
|
||||
cachedAE.reconfigure();
|
||||
}
|
||||
} catch (Exception e) {
|
||||
cachedAE = null;
|
||||
throw new ResourceInitializationException(e);
|
||||
protected void configureDescription(AnalysisEngineDescription description) {
|
||||
for (String attributeName : runtimeParameters.keySet()) {
|
||||
Object val = getRuntimeValue(description, attributeName);
|
||||
description.getAnalysisEngineMetaData().getConfigurationParameterSettings().setParameterValue(
|
||||
attributeName, val);
|
||||
}
|
||||
return cachedAE;
|
||||
}
|
||||
|
||||
/* create the value to inject in the runtime parameter depending on its declared type */
|
||||
|
|
Loading…
Reference in New Issue