LUCENE-4749 - exposed UIMA AEs config parameters in analysis/uima tools

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1442106 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Tommaso Teofili 2013-02-04 13:18:40 +00:00
parent da8488a2da
commit 5e556813fa
12 changed files with 96 additions and 28 deletions

View File

@ -28,6 +28,8 @@ import org.apache.uima.resource.ResourceInitializationException;
import java.io.IOException; import java.io.IOException;
import java.io.Reader; import java.io.Reader;
import java.util.HashMap;
import java.util.Map;
/** /**
* Abstract base implementation of a {@link Tokenizer} which is able to analyze the given input with a * Abstract base implementation of a {@link Tokenizer} which is able to analyze the given input with a
@ -39,10 +41,10 @@ public abstract class BaseUIMATokenizer extends Tokenizer {
protected final AnalysisEngine ae; protected final AnalysisEngine ae;
protected final CAS cas; protected final CAS cas;
protected BaseUIMATokenizer(Reader reader, String descriptorPath) { protected BaseUIMATokenizer(Reader reader, String descriptorPath, Map<String, Object> configurationParameters) {
super(reader); super(reader);
try { try {
ae = AEProviderFactory.getInstance().getAEProvider(descriptorPath).getAE(); ae = AEProviderFactory.getInstance().getAEProvider(null, descriptorPath, configurationParameters).getAE();
cas = ae.newCAS(); cas = ae.newCAS();
} catch (ResourceInitializationException e) { } catch (ResourceInitializationException e) {
throw new RuntimeException(e); throw new RuntimeException(e);

View File

@ -26,6 +26,7 @@ import org.apache.uima.cas.text.AnnotationFS;
import java.io.IOException; import java.io.IOException;
import java.io.Reader; import java.io.Reader;
import java.util.Map;
/** /**
* A {@link Tokenizer} which creates tokens from UIMA Annotations * A {@link Tokenizer} which creates tokens from UIMA Annotations
@ -40,8 +41,8 @@ public final class UIMAAnnotationsTokenizer extends BaseUIMATokenizer {
private int finalOffset = 0; private int finalOffset = 0;
public UIMAAnnotationsTokenizer(String descriptorPath, String tokenType, Reader input) { public UIMAAnnotationsTokenizer(String descriptorPath, String tokenType, Map<String, Object> configurationParameters, Reader input) {
super(input, descriptorPath); super(input, descriptorPath, configurationParameters);
this.tokenTypeString = tokenType; this.tokenTypeString = tokenType;
this.termAttr = addAttribute(CharTermAttribute.class); this.termAttr = addAttribute(CharTermAttribute.class);
this.offsetAttr = addAttribute(OffsetAttribute.class); this.offsetAttr = addAttribute(OffsetAttribute.class);

View File

@ -22,6 +22,7 @@ import org.apache.lucene.analysis.util.TokenizerFactory;
import org.apache.lucene.analysis.uima.UIMAAnnotationsTokenizer; import org.apache.lucene.analysis.uima.UIMAAnnotationsTokenizer;
import java.io.Reader; import java.io.Reader;
import java.util.HashMap;
import java.util.Map; import java.util.Map;
/** /**
@ -31,19 +32,29 @@ public class UIMAAnnotationsTokenizerFactory extends TokenizerFactory {
private String descriptorPath; private String descriptorPath;
private String tokenType; private String tokenType;
private Map<String, Object> configurationParameters;
@Override @Override
public void init(Map<String, String> args) { public void init(Map<String, String> args) {
super.init(args); super.init(args);
descriptorPath = args.get("descriptorPath"); configurationParameters = new HashMap<String, Object>();
for (String k : args.keySet()) {
if (k.equals("tokenType")) {
tokenType = args.get("tokenType"); tokenType = args.get("tokenType");
if (descriptorPath == null || tokenType == null) { } else if (k.equals("descriptorPath")) {
throw new IllegalArgumentException("Both descriptorPath and tokenType are mandatory"); descriptorPath = args.get("descriptorPath");
} else {
configurationParameters.put(k, args.get(k));
} }
} }
if (descriptorPath == null || tokenType == null ) {
throw new IllegalArgumentException("descriptorPath and tokenType are mandatory");
}
}
@Override @Override
public Tokenizer create(Reader input) { public Tokenizer create(Reader input) {
return new UIMAAnnotationsTokenizer(descriptorPath, tokenType, input); return new UIMAAnnotationsTokenizer(descriptorPath, tokenType, configurationParameters, input);
} }
} }

View File

@ -20,6 +20,7 @@ package org.apache.lucene.analysis.uima;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import java.io.Reader; import java.io.Reader;
import java.util.Map;
/** /**
* An {@link Analyzer} which uses the {@link UIMAAnnotationsTokenizer} for creating tokens * An {@link Analyzer} which uses the {@link UIMAAnnotationsTokenizer} for creating tokens
@ -28,15 +29,17 @@ public final class UIMABaseAnalyzer extends Analyzer {
private final String descriptorPath; private final String descriptorPath;
private final String tokenType; private final String tokenType;
private final Map<String, Object> configurationParameters;
public UIMABaseAnalyzer(String descriptorPath, String tokenType) { public UIMABaseAnalyzer(String descriptorPath, String tokenType, Map<String, Object> configurationParameters) {
this.descriptorPath = descriptorPath; this.descriptorPath = descriptorPath;
this.tokenType = tokenType; this.tokenType = tokenType;
this.configurationParameters = configurationParameters;
} }
@Override @Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) { protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
return new TokenStreamComponents(new UIMAAnnotationsTokenizer(descriptorPath, tokenType, reader)); return new TokenStreamComponents(new UIMAAnnotationsTokenizer(descriptorPath, tokenType, configurationParameters, reader));
} }
} }

View File

@ -20,6 +20,7 @@ package org.apache.lucene.analysis.uima;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import java.io.Reader; import java.io.Reader;
import java.util.Map;
/** /**
* {@link Analyzer} which uses the {@link UIMATypeAwareAnnotationsTokenizer} for the tokenization phase * {@link Analyzer} which uses the {@link UIMATypeAwareAnnotationsTokenizer} for the tokenization phase
@ -28,15 +29,17 @@ public final class UIMATypeAwareAnalyzer extends Analyzer {
private final String descriptorPath; private final String descriptorPath;
private final String tokenType; private final String tokenType;
private final String featurePath; private final String featurePath;
private final Map<String, Object> configurationParameters;
public UIMATypeAwareAnalyzer(String descriptorPath, String tokenType, String featurePath) { public UIMATypeAwareAnalyzer(String descriptorPath, String tokenType, String featurePath, Map<String, Object> configurationParameters) {
this.descriptorPath = descriptorPath; this.descriptorPath = descriptorPath;
this.tokenType = tokenType; this.tokenType = tokenType;
this.featurePath = featurePath; this.featurePath = featurePath;
this.configurationParameters = configurationParameters;
} }
@Override @Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) { protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
return new TokenStreamComponents(new UIMATypeAwareAnnotationsTokenizer(descriptorPath, tokenType, featurePath, reader)); return new TokenStreamComponents(new UIMATypeAwareAnnotationsTokenizer(descriptorPath, tokenType, featurePath, configurationParameters, reader));
} }
} }

View File

@ -29,6 +29,7 @@ import org.apache.uima.cas.text.AnnotationFS;
import java.io.IOException; import java.io.IOException;
import java.io.Reader; import java.io.Reader;
import java.util.Map;
/** /**
* A {@link Tokenizer} which creates tokens from UIMA Annotations, also filling their {@link TypeAttribute} according to * A {@link Tokenizer} which creates tokens from UIMA Annotations, also filling their {@link TypeAttribute} according to
@ -50,8 +51,8 @@ public final class UIMATypeAwareAnnotationsTokenizer extends BaseUIMATokenizer {
private int finalOffset = 0; private int finalOffset = 0;
public UIMATypeAwareAnnotationsTokenizer(String descriptorPath, String tokenType, String typeAttributeFeaturePath, Reader input) { public UIMATypeAwareAnnotationsTokenizer(String descriptorPath, String tokenType, String typeAttributeFeaturePath, Map<String, Object> configurationParameters, Reader input) {
super(input, descriptorPath); super(input, descriptorPath, configurationParameters);
this.tokenTypeString = tokenType; this.tokenTypeString = tokenType;
this.termAttr = addAttribute(CharTermAttribute.class); this.termAttr = addAttribute(CharTermAttribute.class);
this.typeAttr = addAttribute(TypeAttribute.class); this.typeAttr = addAttribute(TypeAttribute.class);

View File

@ -18,10 +18,10 @@ package org.apache.lucene.analysis.uima;
*/ */
import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.uima.UIMATypeAwareAnnotationsTokenizer;
import org.apache.lucene.analysis.util.TokenizerFactory; import org.apache.lucene.analysis.util.TokenizerFactory;
import java.io.Reader; import java.io.Reader;
import java.util.HashMap;
import java.util.Map; import java.util.Map;
/** /**
@ -32,13 +32,23 @@ public class UIMATypeAwareAnnotationsTokenizerFactory extends TokenizerFactory {
private String descriptorPath; private String descriptorPath;
private String tokenType; private String tokenType;
private String featurePath; private String featurePath;
private Map<String, Object> configurationParameters;
@Override @Override
public void init(Map<String, String> args) { public void init(Map<String, String> args) {
super.init(args); super.init(args);
descriptorPath = args.get("descriptorPath"); configurationParameters = new HashMap<String, Object>();
tokenType = args.get("tokenType"); for (String k : args.keySet()) {
if (k.equals("featurePath")) {
featurePath = args.get("featurePath"); featurePath = args.get("featurePath");
} else if (k.equals("tokenType")) {
tokenType = args.get("tokenType");
} else if (k.equals("descriptorPath")) {
descriptorPath = args.get("descriptorPath");
} else {
configurationParameters.put(k, args.get(k));
}
}
if (descriptorPath == null || tokenType == null || featurePath == null) { if (descriptorPath == null || tokenType == null || featurePath == null) {
throw new IllegalArgumentException("descriptorPath, tokenType, and featurePath are mandatory"); throw new IllegalArgumentException("descriptorPath, tokenType, and featurePath are mandatory");
} }
@ -46,6 +56,6 @@ public class UIMATypeAwareAnnotationsTokenizerFactory extends TokenizerFactory {
@Override @Override
public Tokenizer create(Reader input) { public Tokenizer create(Reader input) {
return new UIMATypeAwareAnnotationsTokenizer(descriptorPath, tokenType, featurePath, input); return new UIMATypeAwareAnnotationsTokenizer(descriptorPath, tokenType, featurePath, configurationParameters, input);
} }
} }

View File

@ -20,7 +20,7 @@
<primitive>true</primitive> <primitive>true</primitive>
<annotatorImplementationName>org.apache.lucene.analysis.uima.an.SampleEntityAnnotator</annotatorImplementationName> <annotatorImplementationName>org.apache.lucene.analysis.uima.an.SampleEntityAnnotator</annotatorImplementationName>
<analysisEngineMetaData> <analysisEngineMetaData>
<name>DummyPoSTagger</name> <name>EntityAnnotator</name>
<description/> <description/>
<version>1.0</version> <version>1.0</version>
<vendor>ASF</vendor> <vendor>ASF</vendor>

View File

@ -20,9 +20,28 @@
<primitive>true</primitive> <primitive>true</primitive>
<annotatorImplementationName>org.apache.lucene.analysis.uima.an.SampleWSTokenizerAnnotator</annotatorImplementationName> <annotatorImplementationName>org.apache.lucene.analysis.uima.an.SampleWSTokenizerAnnotator</annotatorImplementationName>
<analysisEngineMetaData> <analysisEngineMetaData>
<name>DummyPoSTagger</name> <name>WSTokenizer</name>
<version>1.0</version> <version>1.0</version>
<vendor>ASF</vendor> <vendor>ASF</vendor>
<configurationParameters>
<configurationParameter>
<name>line-end</name>
<description>
the string used as line end
</description>
<type>String</type>
<multiValued>false</multiValued>
<mandatory>false</mandatory>
</configurationParameter>
</configurationParameters>
<configurationParameterSettings>
<nameValuePair>
<name>line-end</name>
<value>
<string>\n</string>
</value>
</nameValuePair>
</configurationParameterSettings>
<typeSystemDescription> <typeSystemDescription>
<types> <types>
<typeDescription> <typeDescription>

View File

@ -36,6 +36,8 @@ import org.junit.Before;
import org.junit.Test; import org.junit.Test;
import java.io.StringReader; import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
/** /**
* Testcase for {@link UIMABaseAnalyzer} * Testcase for {@link UIMABaseAnalyzer}
@ -48,7 +50,7 @@ public class UIMABaseAnalyzerTest extends BaseTokenStreamTestCase {
@Before @Before
public void setUp() throws Exception { public void setUp() throws Exception {
super.setUp(); super.setUp();
analyzer = new UIMABaseAnalyzer("/uima/AggregateSentenceAE.xml", "org.apache.uima.TokenAnnotation"); analyzer = new UIMABaseAnalyzer("/uima/AggregateSentenceAE.xml", "org.apache.uima.TokenAnnotation", null);
} }
@Override @Override
@ -120,7 +122,15 @@ public class UIMABaseAnalyzerTest extends BaseTokenStreamTestCase {
@Test @Test
public void testRandomStrings() throws Exception { public void testRandomStrings() throws Exception {
checkRandomData(random(), new UIMABaseAnalyzer("/uima/TestAggregateSentenceAE.xml", "org.apache.lucene.uima.ts.TokenAnnotation"), checkRandomData(random(), new UIMABaseAnalyzer("/uima/TestAggregateSentenceAE.xml", "org.apache.lucene.uima.ts.TokenAnnotation", null),
100 * RANDOM_MULTIPLIER);
}
@Test
public void testRandomStringsWithConfigurationParameters() throws Exception {
Map<String, Object> cp = new HashMap<String, Object>();
cp.put("line-end", "\r");
checkRandomData(random(), new UIMABaseAnalyzer("/uima/TestWSTokenizerAE.xml", "org.apache.lucene.uima.ts.TokenAnnotation", cp),
100 * RANDOM_MULTIPLIER); 100 * RANDOM_MULTIPLIER);
} }

View File

@ -37,7 +37,7 @@ public class UIMATypeAwareAnalyzerTest extends BaseTokenStreamTestCase {
public void setUp() throws Exception { public void setUp() throws Exception {
super.setUp(); super.setUp();
analyzer = new UIMATypeAwareAnalyzer("/uima/AggregateSentenceAE.xml", analyzer = new UIMATypeAwareAnalyzer("/uima/AggregateSentenceAE.xml",
"org.apache.uima.TokenAnnotation", "posTag"); "org.apache.uima.TokenAnnotation", "posTag", null);
} }
@Override @Override
@ -63,7 +63,7 @@ public class UIMATypeAwareAnalyzerTest extends BaseTokenStreamTestCase {
@Test @Test
public void testRandomStrings() throws Exception { public void testRandomStrings() throws Exception {
checkRandomData(random(), new UIMATypeAwareAnalyzer("/uima/TestAggregateSentenceAE.xml", checkRandomData(random(), new UIMATypeAwareAnalyzer("/uima/TestAggregateSentenceAE.xml",
"org.apache.lucene.uima.ts.TokenAnnotation", "pos"), 100 * RANDOM_MULTIPLIER); "org.apache.lucene.uima.ts.TokenAnnotation", "pos", null), 100 * RANDOM_MULTIPLIER);
} }
} }

View File

@ -17,11 +17,13 @@ package org.apache.lucene.analysis.uima.an;
* limitations under the License. * limitations under the License.
*/ */
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase; import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.Type; import org.apache.uima.cas.Type;
import org.apache.uima.cas.text.AnnotationFS; import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
/** /**
* Dummy implementation of a UIMA based whitespace tokenizer * Dummy implementation of a UIMA based whitespace tokenizer
@ -30,15 +32,21 @@ public class SampleWSTokenizerAnnotator extends JCasAnnotator_ImplBase {
private final static String TOKEN_TYPE = "org.apache.lucene.uima.ts.TokenAnnotation"; private final static String TOKEN_TYPE = "org.apache.lucene.uima.ts.TokenAnnotation";
private final static String SENTENCE_TYPE = "org.apache.lucene.uima.ts.SentenceAnnotation"; private final static String SENTENCE_TYPE = "org.apache.lucene.uima.ts.SentenceAnnotation";
private static final String CR = "\n"; private String lineEnd;
private static final String WHITESPACE = " "; private static final String WHITESPACE = " ";
@Override
public void initialize(UimaContext aContext) throws ResourceInitializationException {
super.initialize(aContext);
lineEnd = String.valueOf(aContext.getConfigParameterValue("line-end"));
}
@Override @Override
public void process(JCas jCas) throws AnalysisEngineProcessException { public void process(JCas jCas) throws AnalysisEngineProcessException {
Type sentenceType = jCas.getCas().getTypeSystem().getType(SENTENCE_TYPE); Type sentenceType = jCas.getCas().getTypeSystem().getType(SENTENCE_TYPE);
Type tokenType = jCas.getCas().getTypeSystem().getType(TOKEN_TYPE); Type tokenType = jCas.getCas().getTypeSystem().getType(TOKEN_TYPE);
int i = 0; int i = 0;
for (String sentenceString : jCas.getDocumentText().split(CR)) { for (String sentenceString : jCas.getDocumentText().split(lineEnd)) {
// add the sentence // add the sentence
AnnotationFS sentenceAnnotation = jCas.getCas().createAnnotation(sentenceType, i, sentenceString.length()); AnnotationFS sentenceAnnotation = jCas.getCas().createAnnotation(sentenceType, i, sentenceString.length());
jCas.addFsToIndexes(sentenceAnnotation); jCas.addFsToIndexes(sentenceAnnotation);