LUCENE-4749 - exposed UIMA AEs config parameters in analysis/uima tools

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1442106 13f79535-47bb-0310-9956-ffa450edef68
Tommaso Teofili 2013-02-04 13:18:40 +00:00
parent da8488a2da
commit 5e556813fa
12 changed files with 96 additions and 28 deletions
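For context: the change threads a Map<String, Object> of UIMA AnalysisEngine configuration parameters through the analyzers, tokenizers, and tokenizer factories, so callers can override the defaults declared in an AE descriptor. A minimal usage sketch, not part of the commit — the descriptor path, annotation type, and "line-end" value are borrowed from the test changes further down, and the token-printing loop is ordinary Lucene 4.x TokenStream boilerplate:

import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.uima.UIMABaseAnalyzer;

public class UIMAConfigParamsExample {
  public static void main(String[] args) throws IOException {
    // Entries in this map override the matching <configurationParameterSettings>
    // defaults declared in the analysis engine descriptor.
    Map<String, Object> configurationParameters = new HashMap<String, Object>();
    configurationParameters.put("line-end", "\r");

    UIMABaseAnalyzer analyzer = new UIMABaseAnalyzer("/uima/TestWSTokenizerAE.xml",
        "org.apache.lucene.uima.ts.TokenAnnotation", configurationParameters);

    // Standard Lucene token consumption loop: print each token produced by the UIMA AE.
    TokenStream stream = analyzer.tokenStream("text", new StringReader("one two\rthree four"));
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      System.out.println(term.toString());
    }
    stream.end();
    stream.close();
  }
}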


@@ -28,6 +28,8 @@ import org.apache.uima.resource.ResourceInitializationException;
 import java.io.IOException;
 import java.io.Reader;
+import java.util.HashMap;
+import java.util.Map;

 /**
  * Abstract base implementation of a {@link Tokenizer} which is able to analyze the given input with a
@@ -39,10 +41,10 @@ public abstract class BaseUIMATokenizer extends Tokenizer {
   protected final AnalysisEngine ae;
   protected final CAS cas;

-  protected BaseUIMATokenizer(Reader reader, String descriptorPath) {
+  protected BaseUIMATokenizer(Reader reader, String descriptorPath, Map<String, Object> configurationParameters) {
     super(reader);
     try {
-      ae = AEProviderFactory.getInstance().getAEProvider(descriptorPath).getAE();
+      ae = AEProviderFactory.getInstance().getAEProvider(null, descriptorPath, configurationParameters).getAE();
       cas = ae.newCAS();
     } catch (ResourceInitializationException e) {
       throw new RuntimeException(e);


@@ -26,6 +26,7 @@ import org.apache.uima.cas.text.AnnotationFS;
 import java.io.IOException;
 import java.io.Reader;
+import java.util.Map;

 /**
  * a {@link Tokenizer} which creates tokens from UIMA Annotations
@@ -40,8 +41,8 @@ public final class UIMAAnnotationsTokenizer extends BaseUIMATokenizer {
   private int finalOffset = 0;

-  public UIMAAnnotationsTokenizer(String descriptorPath, String tokenType, Reader input) {
-    super(input, descriptorPath);
+  public UIMAAnnotationsTokenizer(String descriptorPath, String tokenType, Map<String, Object> configurationParameters, Reader input) {
+    super(input, descriptorPath, configurationParameters);
     this.tokenTypeString = tokenType;
     this.termAttr = addAttribute(CharTermAttribute.class);
     this.offsetAttr = addAttribute(OffsetAttribute.class);


@@ -22,6 +22,7 @@ import org.apache.lucene.analysis.util.TokenizerFactory;
 import org.apache.lucene.analysis.uima.UIMAAnnotationsTokenizer;

 import java.io.Reader;
+import java.util.HashMap;
 import java.util.Map;

 /**
@@ -31,19 +32,29 @@ public class UIMAAnnotationsTokenizerFactory extends TokenizerFactory {
   private String descriptorPath;
   private String tokenType;
+  private Map<String, Object> configurationParameters;

   @Override
   public void init(Map<String, String> args) {
     super.init(args);
-    descriptorPath = args.get("descriptorPath");
-    tokenType = args.get("tokenType");
-    if (descriptorPath == null || tokenType == null) {
-      throw new IllegalArgumentException("Both descriptorPath and tokenType are mandatory");
+    configurationParameters = new HashMap<String, Object>();
+    for (String k : args.keySet()) {
+      if (k.equals("tokenType")) {
+        tokenType = args.get("tokenType");
+      } else if (k.equals("descriptorPath")) {
+        descriptorPath = args.get("descriptorPath");
+      } else {
+        configurationParameters.put(k, args.get(k));
+      }
+    }
+    if (descriptorPath == null || tokenType == null) {
+      throw new IllegalArgumentException("descriptorPath and tokenType are mandatory");
     }
   }

   @Override
   public Tokenizer create(Reader input) {
-    return new UIMAAnnotationsTokenizer(descriptorPath, tokenType, input);
+    return new UIMAAnnotationsTokenizer(descriptorPath, tokenType, configurationParameters, input);
   }
 }
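Worth noting: with the init() logic above, any factory argument other than descriptorPath and tokenType is forwarded to the AnalysisEngine as a configuration parameter. A hypothetical sketch of driving the factory directly, assuming the Lucene 4.x TokenizerFactory lifecycle of init(Map) followed by create(Reader) shown in this diff, and assuming the factory lives in org.apache.lucene.analysis.uima as its sibling factory's package declaration indicates; argument values are illustrative:

import java.io.Reader;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.uima.UIMAAnnotationsTokenizerFactory;

public class FactoryConfigParamsExample {
  public static Tokenizer buildTokenizer(Reader reader) {
    Map<String, String> args = new HashMap<String, String>();
    args.put("descriptorPath", "/uima/TestWSTokenizerAE.xml");           // consumed by the factory
    args.put("tokenType", "org.apache.lucene.uima.ts.TokenAnnotation");  // consumed by the factory
    args.put("line-end", "\r");                                          // anything else becomes an AE configuration parameter

    UIMAAnnotationsTokenizerFactory factory = new UIMAAnnotationsTokenizerFactory();
    factory.init(args);
    return factory.create(reader);
  }

  public static void main(String[] args) {
    Tokenizer tokenizer = buildTokenizer(new StringReader("one two\rthree"));
    System.out.println(tokenizer.getClass().getName());
  }
}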


@@ -20,6 +20,7 @@ package org.apache.lucene.analysis.uima;
 import org.apache.lucene.analysis.Analyzer;

 import java.io.Reader;
+import java.util.Map;

 /**
  * An {@link Analyzer} which use the {@link UIMAAnnotationsTokenizer} for creating tokens
@@ -28,15 +29,17 @@ public final class UIMABaseAnalyzer extends Analyzer {
   private final String descriptorPath;
   private final String tokenType;
+  private final Map<String, Object> configurationParameters;

-  public UIMABaseAnalyzer(String descriptorPath, String tokenType) {
+  public UIMABaseAnalyzer(String descriptorPath, String tokenType, Map<String, Object> configurationParameters) {
     this.descriptorPath = descriptorPath;
     this.tokenType = tokenType;
+    this.configurationParameters = configurationParameters;
   }

   @Override
   protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-    return new TokenStreamComponents(new UIMAAnnotationsTokenizer(descriptorPath, tokenType, reader));
+    return new TokenStreamComponents(new UIMAAnnotationsTokenizer(descriptorPath, tokenType, configurationParameters, reader));
   }
 }


@@ -20,6 +20,7 @@ package org.apache.lucene.analysis.uima;
 import org.apache.lucene.analysis.Analyzer;

 import java.io.Reader;
+import java.util.Map;

 /**
  * {@link Analyzer} which uses the {@link UIMATypeAwareAnnotationsTokenizer} for the tokenization phase
@@ -28,15 +29,17 @@ public final class UIMATypeAwareAnalyzer extends Analyzer {
   private final String descriptorPath;
   private final String tokenType;
   private final String featurePath;
+  private final Map<String, Object> configurationParameters;

-  public UIMATypeAwareAnalyzer(String descriptorPath, String tokenType, String featurePath) {
+  public UIMATypeAwareAnalyzer(String descriptorPath, String tokenType, String featurePath, Map<String, Object> configurationParameters) {
     this.descriptorPath = descriptorPath;
     this.tokenType = tokenType;
     this.featurePath = featurePath;
+    this.configurationParameters = configurationParameters;
   }

   @Override
   protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-    return new TokenStreamComponents(new UIMATypeAwareAnnotationsTokenizer(descriptorPath, tokenType, featurePath, reader));
+    return new TokenStreamComponents(new UIMATypeAwareAnnotationsTokenizer(descriptorPath, tokenType, featurePath, configurationParameters, reader));
   }
 }


@@ -29,6 +29,7 @@ import org.apache.uima.cas.text.AnnotationFS;
 import java.io.IOException;
 import java.io.Reader;
+import java.util.Map;

 /**
  * A {@link Tokenizer} which creates token from UIMA Annotations filling also their {@link TypeAttribute} according to
@@ -50,8 +51,8 @@ public final class UIMATypeAwareAnnotationsTokenizer extends BaseUIMATokenizer {
   private int finalOffset = 0;

-  public UIMATypeAwareAnnotationsTokenizer(String descriptorPath, String tokenType, String typeAttributeFeaturePath, Reader input) {
-    super(input, descriptorPath);
+  public UIMATypeAwareAnnotationsTokenizer(String descriptorPath, String tokenType, String typeAttributeFeaturePath, Map<String, Object> configurationParameters, Reader input) {
+    super(input, descriptorPath, configurationParameters);
     this.tokenTypeString = tokenType;
     this.termAttr = addAttribute(CharTermAttribute.class);
     this.typeAttr = addAttribute(TypeAttribute.class);


@@ -18,10 +18,10 @@ package org.apache.lucene.analysis.uima;
  */

 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.uima.UIMATypeAwareAnnotationsTokenizer;
 import org.apache.lucene.analysis.util.TokenizerFactory;

 import java.io.Reader;
+import java.util.HashMap;
 import java.util.Map;

 /**
@@ -32,13 +32,23 @@ public class UIMATypeAwareAnnotationsTokenizerFactory extends TokenizerFactory {
   private String descriptorPath;
   private String tokenType;
   private String featurePath;
+  private Map<String, Object> configurationParameters;

   @Override
   public void init(Map<String, String> args) {
     super.init(args);
-    descriptorPath = args.get("descriptorPath");
-    tokenType = args.get("tokenType");
+    configurationParameters = new HashMap<String, Object>();
+    for (String k : args.keySet()) {
+      if (k.equals("featurePath")) {
+        featurePath = args.get("featurePath");
+      } else if (k.equals("tokenType")) {
+        tokenType = args.get("tokenType");
+      } else if (k.equals("descriptorPath")) {
+        descriptorPath = args.get("descriptorPath");
+      } else {
+        configurationParameters.put(k, args.get(k));
+      }
+    }
     if (descriptorPath == null || tokenType == null || featurePath == null) {
       throw new IllegalArgumentException("descriptorPath, tokenType, and featurePath are mandatory");
     }
@@ -46,6 +56,6 @@ public class UIMATypeAwareAnnotationsTokenizerFactory extends TokenizerFactory {
   @Override
   public Tokenizer create(Reader input) {
-    return new UIMATypeAwareAnnotationsTokenizer(descriptorPath, tokenType, featurePath, input);
+    return new UIMATypeAwareAnnotationsTokenizer(descriptorPath, tokenType, featurePath, configurationParameters, input);
   }
 }


@@ -20,7 +20,7 @@
   <primitive>true</primitive>
   <annotatorImplementationName>org.apache.lucene.analysis.uima.an.SampleEntityAnnotator</annotatorImplementationName>
   <analysisEngineMetaData>
-    <name>DummyPoSTagger</name>
+    <name>EntityAnnotator</name>
     <description/>
     <version>1.0</version>
     <vendor>ASF</vendor>


@@ -20,9 +20,28 @@
   <primitive>true</primitive>
   <annotatorImplementationName>org.apache.lucene.analysis.uima.an.SampleWSTokenizerAnnotator</annotatorImplementationName>
   <analysisEngineMetaData>
-    <name>DummyPoSTagger</name>
+    <name>WSTokenizer</name>
     <version>1.0</version>
     <vendor>ASF</vendor>
+    <configurationParameters>
+      <configurationParameter>
+        <name>line-end</name>
+        <description>
+          the string used as line end
+        </description>
+        <type>String</type>
+        <multiValued>false</multiValued>
+        <mandatory>false</mandatory>
+      </configurationParameter>
+    </configurationParameters>
+    <configurationParameterSettings>
+      <nameValuePair>
+        <name>line-end</name>
+        <value>
+          <string>\n</string>
+        </value>
+      </nameValuePair>
+    </configurationParameterSettings>
     <typeSystemDescription>
       <types>
         <typeDescription>
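The descriptor above declares a single optional String parameter, line-end, with a default of \n; the maps passed through the new constructors override that default at run time. A hypothetical sketch of the same override done directly against the AE provider call used in BaseUIMATokenizer — only the three-argument getAEProvider() call is taken verbatim from this diff; the ae subpackage import and the UIMA CAS/AnalysisEngine calls are assumed standard APIs:

import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.uima.ae.AEProviderFactory;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.cas.CAS;

public class DescriptorOverrideExample {
  public static void main(String[] args) throws Exception {
    Map<String, Object> params = new HashMap<String, Object>();
    params.put("line-end", "\r");  // supersedes the descriptor's \n default

    // Same call pattern as the new BaseUIMATokenizer constructor: a null name,
    // the descriptor path, and the runtime parameter overrides.
    AnalysisEngine ae = AEProviderFactory.getInstance()
        .getAEProvider(null, "/uima/TestWSTokenizerAE.xml", params).getAE();

    CAS cas = ae.newCAS();
    cas.setDocumentText("one two\rthree four");
    ae.process(cas);
    System.out.println(cas.getAnnotationIndex().size() + " annotations");
  }
}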


@@ -36,6 +36,8 @@ import org.junit.Before;
 import org.junit.Test;

 import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Map;

 /**
  * Testcase for {@link UIMABaseAnalyzer}
@@ -48,7 +50,7 @@ public class UIMABaseAnalyzerTest extends BaseTokenStreamTestCase {
   @Before
   public void setUp() throws Exception {
     super.setUp();
-    analyzer = new UIMABaseAnalyzer("/uima/AggregateSentenceAE.xml", "org.apache.uima.TokenAnnotation");
+    analyzer = new UIMABaseAnalyzer("/uima/AggregateSentenceAE.xml", "org.apache.uima.TokenAnnotation", null);
   }

   @Override
@@ -120,7 +122,7 @@ public class UIMABaseAnalyzerTest extends BaseTokenStreamTestCase {
   @Test
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), new UIMABaseAnalyzer("/uima/TestAggregateSentenceAE.xml", "org.apache.lucene.uima.ts.TokenAnnotation"),
+    checkRandomData(random(), new UIMABaseAnalyzer("/uima/TestAggregateSentenceAE.xml", "org.apache.lucene.uima.ts.TokenAnnotation", null),
         100 * RANDOM_MULTIPLIER);
   }

+  @Test
+  public void testRandomStringsWithConfigurationParameters() throws Exception {
+    Map<String, Object> cp = new HashMap<String, Object>();
+    cp.put("line-end", "\r");
+    checkRandomData(random(), new UIMABaseAnalyzer("/uima/TestWSTokenizerAE.xml", "org.apache.lucene.uima.ts.TokenAnnotation", cp),
+        100 * RANDOM_MULTIPLIER);
+  }


@@ -37,7 +37,7 @@ public class UIMATypeAwareAnalyzerTest extends BaseTokenStreamTestCase {
   public void setUp() throws Exception {
     super.setUp();
     analyzer = new UIMATypeAwareAnalyzer("/uima/AggregateSentenceAE.xml",
-        "org.apache.uima.TokenAnnotation", "posTag");
+        "org.apache.uima.TokenAnnotation", "posTag", null);
   }

   @Override
@@ -63,7 +63,7 @@ public class UIMATypeAwareAnalyzerTest extends BaseTokenStreamTestCase {
   @Test
   public void testRandomStrings() throws Exception {
     checkRandomData(random(), new UIMATypeAwareAnalyzer("/uima/TestAggregateSentenceAE.xml",
-        "org.apache.lucene.uima.ts.TokenAnnotation", "pos"), 100 * RANDOM_MULTIPLIER);
+        "org.apache.lucene.uima.ts.TokenAnnotation", "pos", null), 100 * RANDOM_MULTIPLIER);
   }
 }


@@ -17,11 +17,13 @@ package org.apache.lucene.analysis.uima.an;
  * limitations under the License.
  */

+import org.apache.uima.UimaContext;
 import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
 import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
 import org.apache.uima.cas.Type;
 import org.apache.uima.cas.text.AnnotationFS;
 import org.apache.uima.jcas.JCas;
+import org.apache.uima.resource.ResourceInitializationException;

 /**
  * Dummy implementation of a UIMA based whitespace tokenizer
@@ -30,15 +32,21 @@ public class SampleWSTokenizerAnnotator extends JCasAnnotator_ImplBase {
   private final static String TOKEN_TYPE = "org.apache.lucene.uima.ts.TokenAnnotation";
   private final static String SENTENCE_TYPE = "org.apache.lucene.uima.ts.SentenceAnnotation";
-  private static final String CR = "\n";
+  private String lineEnd;
   private static final String WHITESPACE = " ";

+  @Override
+  public void initialize(UimaContext aContext) throws ResourceInitializationException {
+    super.initialize(aContext);
+    lineEnd = String.valueOf(aContext.getConfigParameterValue("line-end"));
+  }
+
   @Override
   public void process(JCas jCas) throws AnalysisEngineProcessException {
     Type sentenceType = jCas.getCas().getTypeSystem().getType(SENTENCE_TYPE);
     Type tokenType = jCas.getCas().getTypeSystem().getType(TOKEN_TYPE);
     int i = 0;
-    for (String sentenceString : jCas.getDocumentText().split(CR)) {
+    for (String sentenceString : jCas.getDocumentText().split(lineEnd)) {
       // add the sentence
       AnnotationFS sentenceAnnotation = jCas.getCas().createAnnotation(sentenceType, i, sentenceString.length());
       jCas.addFsToIndexes(sentenceAnnotation);