mirror of https://github.com/apache/lucene.git
LUCENE-4749 - exposed UIMA AEs config parameters in analysis/uima tools
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1442106 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
da8488a2da
commit
5e556813fa
|
@ -28,6 +28,8 @@ import org.apache.uima.resource.ResourceInitializationException;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Abstract base implementation of a {@link Tokenizer} which is able to analyze the given input with a
|
||||
|
@ -39,10 +41,10 @@ public abstract class BaseUIMATokenizer extends Tokenizer {
|
|||
protected final AnalysisEngine ae;
|
||||
protected final CAS cas;
|
||||
|
||||
protected BaseUIMATokenizer(Reader reader, String descriptorPath) {
|
||||
protected BaseUIMATokenizer(Reader reader, String descriptorPath, Map<String, Object> configurationParameters) {
|
||||
super(reader);
|
||||
try {
|
||||
ae = AEProviderFactory.getInstance().getAEProvider(descriptorPath).getAE();
|
||||
ae = AEProviderFactory.getInstance().getAEProvider(null, descriptorPath, configurationParameters).getAE();
|
||||
cas = ae.newCAS();
|
||||
} catch (ResourceInitializationException e) {
|
||||
throw new RuntimeException(e);
|
||||
|
|
|
@ -26,6 +26,7 @@ import org.apache.uima.cas.text.AnnotationFS;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* A {@link Tokenizer} which creates tokens from UIMA Annotations
|
||||
|
@ -40,8 +41,8 @@ public final class UIMAAnnotationsTokenizer extends BaseUIMATokenizer {
|
|||
|
||||
private int finalOffset = 0;
|
||||
|
||||
public UIMAAnnotationsTokenizer(String descriptorPath, String tokenType, Reader input) {
|
||||
super(input, descriptorPath);
|
||||
public UIMAAnnotationsTokenizer(String descriptorPath, String tokenType, Map<String, Object> configurationParameters, Reader input) {
|
||||
super(input, descriptorPath, configurationParameters);
|
||||
this.tokenTypeString = tokenType;
|
||||
this.termAttr = addAttribute(CharTermAttribute.class);
|
||||
this.offsetAttr = addAttribute(OffsetAttribute.class);
|
||||
|
|
|
@ -22,6 +22,7 @@ import org.apache.lucene.analysis.util.TokenizerFactory;
|
|||
import org.apache.lucene.analysis.uima.UIMAAnnotationsTokenizer;
|
||||
|
||||
import java.io.Reader;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
|
@ -31,19 +32,29 @@ public class UIMAAnnotationsTokenizerFactory extends TokenizerFactory {
|
|||
|
||||
private String descriptorPath;
|
||||
private String tokenType;
|
||||
private Map<String, Object> configurationParameters;
|
||||
|
||||
@Override
|
||||
public void init(Map<String, String> args) {
|
||||
super.init(args);
|
||||
descriptorPath = args.get("descriptorPath");
|
||||
configurationParameters = new HashMap<String, Object>();
|
||||
for (String k : args.keySet()) {
|
||||
if (k.equals("tokenType")) {
|
||||
tokenType = args.get("tokenType");
|
||||
if (descriptorPath == null || tokenType == null) {
|
||||
throw new IllegalArgumentException("Both descriptorPath and tokenType are mandatory");
|
||||
} else if (k.equals("descriptorPath")) {
|
||||
descriptorPath = args.get("descriptorPath");
|
||||
} else {
|
||||
configurationParameters.put(k, args.get(k));
|
||||
}
|
||||
}
|
||||
if (descriptorPath == null || tokenType == null ) {
|
||||
throw new IllegalArgumentException("descriptorPath and tokenType are mandatory");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public Tokenizer create(Reader input) {
|
||||
return new UIMAAnnotationsTokenizer(descriptorPath, tokenType, input);
|
||||
return new UIMAAnnotationsTokenizer(descriptorPath, tokenType, configurationParameters, input);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -20,6 +20,7 @@ package org.apache.lucene.analysis.uima;
|
|||
import org.apache.lucene.analysis.Analyzer;
|
||||
|
||||
import java.io.Reader;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* An {@link Analyzer} which uses the {@link UIMAAnnotationsTokenizer} for creating tokens
|
||||
|
@ -28,15 +29,17 @@ public final class UIMABaseAnalyzer extends Analyzer {
|
|||
|
||||
private final String descriptorPath;
|
||||
private final String tokenType;
|
||||
private final Map<String, Object> configurationParameters;
|
||||
|
||||
public UIMABaseAnalyzer(String descriptorPath, String tokenType) {
|
||||
public UIMABaseAnalyzer(String descriptorPath, String tokenType, Map<String, Object> configurationParameters) {
|
||||
this.descriptorPath = descriptorPath;
|
||||
this.tokenType = tokenType;
|
||||
this.configurationParameters = configurationParameters;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
return new TokenStreamComponents(new UIMAAnnotationsTokenizer(descriptorPath, tokenType, reader));
|
||||
return new TokenStreamComponents(new UIMAAnnotationsTokenizer(descriptorPath, tokenType, configurationParameters, reader));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -20,6 +20,7 @@ package org.apache.lucene.analysis.uima;
|
|||
import org.apache.lucene.analysis.Analyzer;
|
||||
|
||||
import java.io.Reader;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* {@link Analyzer} which uses the {@link UIMATypeAwareAnnotationsTokenizer} for the tokenization phase
|
||||
|
@ -28,15 +29,17 @@ public final class UIMATypeAwareAnalyzer extends Analyzer {
|
|||
private final String descriptorPath;
|
||||
private final String tokenType;
|
||||
private final String featurePath;
|
||||
private final Map<String, Object> configurationParameters;
|
||||
|
||||
public UIMATypeAwareAnalyzer(String descriptorPath, String tokenType, String featurePath) {
|
||||
public UIMATypeAwareAnalyzer(String descriptorPath, String tokenType, String featurePath, Map<String, Object> configurationParameters) {
|
||||
this.descriptorPath = descriptorPath;
|
||||
this.tokenType = tokenType;
|
||||
this.featurePath = featurePath;
|
||||
this.configurationParameters = configurationParameters;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
|
||||
return new TokenStreamComponents(new UIMATypeAwareAnnotationsTokenizer(descriptorPath, tokenType, featurePath, reader));
|
||||
return new TokenStreamComponents(new UIMATypeAwareAnnotationsTokenizer(descriptorPath, tokenType, featurePath, configurationParameters, reader));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -29,6 +29,7 @@ import org.apache.uima.cas.text.AnnotationFS;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* A {@link Tokenizer} which creates tokens from UIMA Annotations, also filling their {@link TypeAttribute} according to
|
||||
|
@ -50,8 +51,8 @@ public final class UIMATypeAwareAnnotationsTokenizer extends BaseUIMATokenizer {
|
|||
|
||||
private int finalOffset = 0;
|
||||
|
||||
public UIMATypeAwareAnnotationsTokenizer(String descriptorPath, String tokenType, String typeAttributeFeaturePath, Reader input) {
|
||||
super(input, descriptorPath);
|
||||
public UIMATypeAwareAnnotationsTokenizer(String descriptorPath, String tokenType, String typeAttributeFeaturePath, Map<String, Object> configurationParameters, Reader input) {
|
||||
super(input, descriptorPath, configurationParameters);
|
||||
this.tokenTypeString = tokenType;
|
||||
this.termAttr = addAttribute(CharTermAttribute.class);
|
||||
this.typeAttr = addAttribute(TypeAttribute.class);
|
||||
|
|
|
@ -18,10 +18,10 @@ package org.apache.lucene.analysis.uima;
|
|||
*/
|
||||
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.uima.UIMATypeAwareAnnotationsTokenizer;
|
||||
import org.apache.lucene.analysis.util.TokenizerFactory;
|
||||
|
||||
import java.io.Reader;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
|
@ -32,13 +32,23 @@ public class UIMATypeAwareAnnotationsTokenizerFactory extends TokenizerFactory {
|
|||
private String descriptorPath;
|
||||
private String tokenType;
|
||||
private String featurePath;
|
||||
private Map<String, Object> configurationParameters;
|
||||
|
||||
@Override
|
||||
public void init(Map<String, String> args) {
|
||||
super.init(args);
|
||||
descriptorPath = args.get("descriptorPath");
|
||||
tokenType = args.get("tokenType");
|
||||
configurationParameters = new HashMap<String, Object>();
|
||||
for (String k : args.keySet()) {
|
||||
if (k.equals("featurePath")) {
|
||||
featurePath = args.get("featurePath");
|
||||
} else if (k.equals("tokenType")) {
|
||||
tokenType = args.get("tokenType");
|
||||
} else if (k.equals("descriptorPath")) {
|
||||
descriptorPath = args.get("descriptorPath");
|
||||
} else {
|
||||
configurationParameters.put(k, args.get(k));
|
||||
}
|
||||
}
|
||||
if (descriptorPath == null || tokenType == null || featurePath == null) {
|
||||
throw new IllegalArgumentException("descriptorPath, tokenType, and featurePath are mandatory");
|
||||
}
|
||||
|
@ -46,6 +56,6 @@ public class UIMATypeAwareAnnotationsTokenizerFactory extends TokenizerFactory {
|
|||
|
||||
@Override
|
||||
public Tokenizer create(Reader input) {
|
||||
return new UIMATypeAwareAnnotationsTokenizer(descriptorPath, tokenType, featurePath, input);
|
||||
return new UIMATypeAwareAnnotationsTokenizer(descriptorPath, tokenType, featurePath, configurationParameters, input);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -20,7 +20,7 @@
|
|||
<primitive>true</primitive>
|
||||
<annotatorImplementationName>org.apache.lucene.analysis.uima.an.SampleEntityAnnotator</annotatorImplementationName>
|
||||
<analysisEngineMetaData>
|
||||
<name>DummyPoSTagger</name>
|
||||
<name>EntityAnnotator</name>
|
||||
<description/>
|
||||
<version>1.0</version>
|
||||
<vendor>ASF</vendor>
|
||||
|
|
|
@ -20,9 +20,28 @@
|
|||
<primitive>true</primitive>
|
||||
<annotatorImplementationName>org.apache.lucene.analysis.uima.an.SampleWSTokenizerAnnotator</annotatorImplementationName>
|
||||
<analysisEngineMetaData>
|
||||
<name>DummyPoSTagger</name>
|
||||
<name>WSTokenizer</name>
|
||||
<version>1.0</version>
|
||||
<vendor>ASF</vendor>
|
||||
<configurationParameters>
|
||||
<configurationParameter>
|
||||
<name>line-end</name>
|
||||
<description>
|
||||
the string used as line end
|
||||
</description>
|
||||
<type>String</type>
|
||||
<multiValued>false</multiValued>
|
||||
<mandatory>false</mandatory>
|
||||
</configurationParameter>
|
||||
</configurationParameters>
|
||||
<configurationParameterSettings>
|
||||
<nameValuePair>
|
||||
<name>line-end</name>
|
||||
<value>
|
||||
<string>\n</string>
|
||||
</value>
|
||||
</nameValuePair>
|
||||
</configurationParameterSettings>
|
||||
<typeSystemDescription>
|
||||
<types>
|
||||
<typeDescription>
|
||||
|
|
|
@ -36,6 +36,8 @@ import org.junit.Before;
|
|||
import org.junit.Test;
|
||||
|
||||
import java.io.StringReader;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Testcase for {@link UIMABaseAnalyzer}
|
||||
|
@ -48,7 +50,7 @@ public class UIMABaseAnalyzerTest extends BaseTokenStreamTestCase {
|
|||
@Before
|
||||
public void setUp() throws Exception {
|
||||
super.setUp();
|
||||
analyzer = new UIMABaseAnalyzer("/uima/AggregateSentenceAE.xml", "org.apache.uima.TokenAnnotation");
|
||||
analyzer = new UIMABaseAnalyzer("/uima/AggregateSentenceAE.xml", "org.apache.uima.TokenAnnotation", null);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -120,7 +122,15 @@ public class UIMABaseAnalyzerTest extends BaseTokenStreamTestCase {
|
|||
|
||||
@Test
|
||||
public void testRandomStrings() throws Exception {
|
||||
checkRandomData(random(), new UIMABaseAnalyzer("/uima/TestAggregateSentenceAE.xml", "org.apache.lucene.uima.ts.TokenAnnotation"),
|
||||
checkRandomData(random(), new UIMABaseAnalyzer("/uima/TestAggregateSentenceAE.xml", "org.apache.lucene.uima.ts.TokenAnnotation", null),
|
||||
100 * RANDOM_MULTIPLIER);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRandomStringsWithConfigurationParameters() throws Exception {
|
||||
Map<String, Object> cp = new HashMap<String, Object>();
|
||||
cp.put("line-end", "\r");
|
||||
checkRandomData(random(), new UIMABaseAnalyzer("/uima/TestWSTokenizerAE.xml", "org.apache.lucene.uima.ts.TokenAnnotation", cp),
|
||||
100 * RANDOM_MULTIPLIER);
|
||||
}
|
||||
|
||||
|
|
|
@ -37,7 +37,7 @@ public class UIMATypeAwareAnalyzerTest extends BaseTokenStreamTestCase {
|
|||
public void setUp() throws Exception {
|
||||
super.setUp();
|
||||
analyzer = new UIMATypeAwareAnalyzer("/uima/AggregateSentenceAE.xml",
|
||||
"org.apache.uima.TokenAnnotation", "posTag");
|
||||
"org.apache.uima.TokenAnnotation", "posTag", null);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -63,7 +63,7 @@ public class UIMATypeAwareAnalyzerTest extends BaseTokenStreamTestCase {
|
|||
@Test
|
||||
public void testRandomStrings() throws Exception {
|
||||
checkRandomData(random(), new UIMATypeAwareAnalyzer("/uima/TestAggregateSentenceAE.xml",
|
||||
"org.apache.lucene.uima.ts.TokenAnnotation", "pos"), 100 * RANDOM_MULTIPLIER);
|
||||
"org.apache.lucene.uima.ts.TokenAnnotation", "pos", null), 100 * RANDOM_MULTIPLIER);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -17,11 +17,13 @@ package org.apache.lucene.analysis.uima.an;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.uima.UimaContext;
|
||||
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
|
||||
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
|
||||
import org.apache.uima.cas.Type;
|
||||
import org.apache.uima.cas.text.AnnotationFS;
|
||||
import org.apache.uima.jcas.JCas;
|
||||
import org.apache.uima.resource.ResourceInitializationException;
|
||||
|
||||
/**
|
||||
* Dummy implementation of a UIMA based whitespace tokenizer
|
||||
|
@ -30,15 +32,21 @@ public class SampleWSTokenizerAnnotator extends JCasAnnotator_ImplBase {
|
|||
|
||||
private final static String TOKEN_TYPE = "org.apache.lucene.uima.ts.TokenAnnotation";
|
||||
private final static String SENTENCE_TYPE = "org.apache.lucene.uima.ts.SentenceAnnotation";
|
||||
private static final String CR = "\n";
|
||||
private String lineEnd;
|
||||
private static final String WHITESPACE = " ";
|
||||
|
||||
@Override
|
||||
public void initialize(UimaContext aContext) throws ResourceInitializationException {
|
||||
super.initialize(aContext);
|
||||
lineEnd = String.valueOf(aContext.getConfigParameterValue("line-end"));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void process(JCas jCas) throws AnalysisEngineProcessException {
|
||||
Type sentenceType = jCas.getCas().getTypeSystem().getType(SENTENCE_TYPE);
|
||||
Type tokenType = jCas.getCas().getTypeSystem().getType(TOKEN_TYPE);
|
||||
int i = 0;
|
||||
for (String sentenceString : jCas.getDocumentText().split(CR)) {
|
||||
for (String sentenceString : jCas.getDocumentText().split(lineEnd)) {
|
||||
// add the sentence
|
||||
AnnotationFS sentenceAnnotation = jCas.getCas().createAnnotation(sentenceType, i, sentenceString.length());
|
||||
jCas.addFsToIndexes(sentenceAnnotation);
|
||||
|
|
Loading…
Reference in New Issue