LUCENE-4749 - exposed UIMA AEs config parameters in analysis/uima tools

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1442106 13f79535-47bb-0310-9956-ffa450edef68
Tommaso Teofili 2013-02-04 13:18:40 +00:00
parent da8488a2da
commit 5e556813fa
12 changed files with 96 additions and 28 deletions
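For context: the change threads a Map<String, Object> of UIMA AnalysisEngine configuration parameters through the analyzers, tokenizers, and tokenizer factories, so callers can override the defaults declared in an AE descriptor. A minimal usage sketch, not part of the commit — the descriptor path, annotation type, and "line-end" value are borrowed from the test changes further down, and the token-printing loop is ordinary Lucene 4.x TokenStream boilerplate:

import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.uima.UIMABaseAnalyzer;

public class UIMAConfigParamsExample {
  public static void main(String[] args) throws IOException {
    // Entries in this map override the matching <configurationParameterSettings>
    // defaults declared in the analysis engine descriptor.
    Map<String, Object> configurationParameters = new HashMap<String, Object>();
    configurationParameters.put("line-end", "\r");

    UIMABaseAnalyzer analyzer = new UIMABaseAnalyzer("/uima/TestWSTokenizerAE.xml",
        "org.apache.lucene.uima.ts.TokenAnnotation", configurationParameters);

    // Standard Lucene token consumption loop: print each token produced by the UIMA AE.
    TokenStream stream = analyzer.tokenStream("text", new StringReader("one two\rthree four"));
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      System.out.println(term.toString());
    }
    stream.end();
    stream.close();
  }
}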


@@ -28,6 +28,8 @@ import org.apache.uima.resource.ResourceInitializationException;
 import java.io.IOException;
 import java.io.Reader;
+import java.util.HashMap;
+import java.util.Map;

 /**
  * Abstract base implementation of a {@link Tokenizer} which is able to analyze the given input with a
@@ -39,10 +41,10 @@ public abstract class BaseUIMATokenizer extends Tokenizer {
   protected final AnalysisEngine ae;
   protected final CAS cas;

-  protected BaseUIMATokenizer(Reader reader, String descriptorPath) {
+  protected BaseUIMATokenizer(Reader reader, String descriptorPath, Map<String, Object> configurationParameters) {
     super(reader);
     try {
-      ae = AEProviderFactory.getInstance().getAEProvider(descriptorPath).getAE();
+      ae = AEProviderFactory.getInstance().getAEProvider(null, descriptorPath, configurationParameters).getAE();
       cas = ae.newCAS();
     } catch (ResourceInitializationException e) {
       throw new RuntimeException(e);


@@ -26,6 +26,7 @@ import org.apache.uima.cas.text.AnnotationFS;
 import java.io.IOException;
 import java.io.Reader;
+import java.util.Map;

 /**
  * a {@link Tokenizer} which creates tokens from UIMA Annotations
@@ -40,8 +41,8 @@ public final class UIMAAnnotationsTokenizer extends BaseUIMATokenizer {
   private int finalOffset = 0;

-  public UIMAAnnotationsTokenizer(String descriptorPath, String tokenType, Reader input) {
-    super(input, descriptorPath);
+  public UIMAAnnotationsTokenizer(String descriptorPath, String tokenType, Map<String, Object> configurationParameters, Reader input) {
+    super(input, descriptorPath, configurationParameters);
     this.tokenTypeString = tokenType;
     this.termAttr = addAttribute(CharTermAttribute.class);
     this.offsetAttr = addAttribute(OffsetAttribute.class);


@@ -22,6 +22,7 @@ import org.apache.lucene.analysis.util.TokenizerFactory;
 import org.apache.lucene.analysis.uima.UIMAAnnotationsTokenizer;

 import java.io.Reader;
+import java.util.HashMap;
 import java.util.Map;

 /**
@@ -31,19 +32,29 @@ public class UIMAAnnotationsTokenizerFactory extends TokenizerFactory {
   private String descriptorPath;
   private String tokenType;
+  private Map<String, Object> configurationParameters;

   @Override
   public void init(Map<String, String> args) {
     super.init(args);
-    descriptorPath = args.get("descriptorPath");
-    tokenType = args.get("tokenType");
-    if (descriptorPath == null || tokenType == null) {
-      throw new IllegalArgumentException("Both descriptorPath and tokenType are mandatory");
+    configurationParameters = new HashMap<String, Object>();
+    for (String k : args.keySet()) {
+      if (k.equals("tokenType")) {
+        tokenType = args.get("tokenType");
+      } else if (k.equals("descriptorPath")) {
+        descriptorPath = args.get("descriptorPath");
+      } else {
+        configurationParameters.put(k, args.get(k));
+      }
+    }
+    if (descriptorPath == null || tokenType == null) {
+      throw new IllegalArgumentException("descriptorPath and tokenType are mandatory");
     }
   }

   @Override
   public Tokenizer create(Reader input) {
-    return new UIMAAnnotationsTokenizer(descriptorPath, tokenType, input);
+    return new UIMAAnnotationsTokenizer(descriptorPath, tokenType, configurationParameters, input);
   }
 }
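Worth noting: with the init() logic above, any factory argument other than descriptorPath and tokenType is forwarded to the AnalysisEngine as a configuration parameter. A hypothetical sketch of driving the factory directly, assuming the Lucene 4.x TokenizerFactory lifecycle of init(Map) followed by create(Reader) shown in this diff, and assuming the factory lives in org.apache.lucene.analysis.uima as its sibling factory's package declaration indicates; argument values are illustrative:

import java.io.Reader;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.uima.UIMAAnnotationsTokenizerFactory;

public class FactoryConfigParamsExample {
  public static Tokenizer buildTokenizer(Reader reader) {
    Map<String, String> args = new HashMap<String, String>();
    args.put("descriptorPath", "/uima/TestWSTokenizerAE.xml");           // consumed by the factory
    args.put("tokenType", "org.apache.lucene.uima.ts.TokenAnnotation");  // consumed by the factory
    args.put("line-end", "\r");                                          // anything else becomes an AE configuration parameter

    UIMAAnnotationsTokenizerFactory factory = new UIMAAnnotationsTokenizerFactory();
    factory.init(args);
    return factory.create(reader);
  }

  public static void main(String[] args) {
    Tokenizer tokenizer = buildTokenizer(new StringReader("one two\rthree"));
    System.out.println(tokenizer.getClass().getName());
  }
}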


@@ -20,6 +20,7 @@ package org.apache.lucene.analysis.uima;
 import org.apache.lucene.analysis.Analyzer;

 import java.io.Reader;
+import java.util.Map;

 /**
  * An {@link Analyzer} which use the {@link UIMAAnnotationsTokenizer} for creating tokens
@@ -28,15 +29,17 @@ public final class UIMABaseAnalyzer extends Analyzer {
   private final String descriptorPath;
   private final String tokenType;
+  private final Map<String, Object> configurationParameters;

-  public UIMABaseAnalyzer(String descriptorPath, String tokenType) {
+  public UIMABaseAnalyzer(String descriptorPath, String tokenType, Map<String, Object> configurationParameters) {
     this.descriptorPath = descriptorPath;
     this.tokenType = tokenType;
+    this.configurationParameters = configurationParameters;
   }

   @Override
   protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-    return new TokenStreamComponents(new UIMAAnnotationsTokenizer(descriptorPath, tokenType, reader));
+    return new TokenStreamComponents(new UIMAAnnotationsTokenizer(descriptorPath, tokenType, configurationParameters, reader));
   }
 }


@@ -20,6 +20,7 @@ package org.apache.lucene.analysis.uima;
 import org.apache.lucene.analysis.Analyzer;

 import java.io.Reader;
+import java.util.Map;

 /**
  * {@link Analyzer} which uses the {@link UIMATypeAwareAnnotationsTokenizer} for the tokenization phase
@@ -28,15 +29,17 @@ public final class UIMATypeAwareAnalyzer extends Analyzer {
   private final String descriptorPath;
   private final String tokenType;
   private final String featurePath;
+  private final Map<String, Object> configurationParameters;

-  public UIMATypeAwareAnalyzer(String descriptorPath, String tokenType, String featurePath) {
+  public UIMATypeAwareAnalyzer(String descriptorPath, String tokenType, String featurePath, Map<String, Object> configurationParameters) {
     this.descriptorPath = descriptorPath;
     this.tokenType = tokenType;
     this.featurePath = featurePath;
+    this.configurationParameters = configurationParameters;
   }

   @Override
   protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
-    return new TokenStreamComponents(new UIMATypeAwareAnnotationsTokenizer(descriptorPath, tokenType, featurePath, reader));
+    return new TokenStreamComponents(new UIMATypeAwareAnnotationsTokenizer(descriptorPath, tokenType, featurePath, configurationParameters, reader));
   }
 }


@@ -29,6 +29,7 @@ import org.apache.uima.cas.text.AnnotationFS;
 import java.io.IOException;
 import java.io.Reader;
+import java.util.Map;

 /**
  * A {@link Tokenizer} which creates token from UIMA Annotations filling also their {@link TypeAttribute} according to
@@ -50,8 +51,8 @@ public final class UIMATypeAwareAnnotationsTokenizer extends BaseUIMATokenizer {
   private int finalOffset = 0;

-  public UIMATypeAwareAnnotationsTokenizer(String descriptorPath, String tokenType, String typeAttributeFeaturePath, Reader input) {
-    super(input, descriptorPath);
+  public UIMATypeAwareAnnotationsTokenizer(String descriptorPath, String tokenType, String typeAttributeFeaturePath, Map<String, Object> configurationParameters, Reader input) {
+    super(input, descriptorPath, configurationParameters);
     this.tokenTypeString = tokenType;
     this.termAttr = addAttribute(CharTermAttribute.class);
     this.typeAttr = addAttribute(TypeAttribute.class);


@@ -18,10 +18,10 @@ package org.apache.lucene.analysis.uima;
  */

 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.uima.UIMATypeAwareAnnotationsTokenizer;
 import org.apache.lucene.analysis.util.TokenizerFactory;

 import java.io.Reader;
+import java.util.HashMap;
 import java.util.Map;

 /**
@@ -32,13 +32,23 @@ public class UIMATypeAwareAnnotationsTokenizerFactory extends TokenizerFactory {
   private String descriptorPath;
   private String tokenType;
   private String featurePath;
+  private Map<String, Object> configurationParameters;

   @Override
   public void init(Map<String, String> args) {
     super.init(args);
-    descriptorPath = args.get("descriptorPath");
-    tokenType = args.get("tokenType");
+    configurationParameters = new HashMap<String, Object>();
+    for (String k : args.keySet()) {
+      if (k.equals("featurePath")) {
+        featurePath = args.get("featurePath");
+      } else if (k.equals("tokenType")) {
+        tokenType = args.get("tokenType");
+      } else if (k.equals("descriptorPath")) {
+        descriptorPath = args.get("descriptorPath");
+      } else {
+        configurationParameters.put(k, args.get(k));
+      }
+    }
     if (descriptorPath == null || tokenType == null || featurePath == null) {
       throw new IllegalArgumentException("descriptorPath, tokenType, and featurePath are mandatory");
     }
@@ -46,6 +56,6 @@ public class UIMATypeAwareAnnotationsTokenizerFactory extends TokenizerFactory {
   @Override
   public Tokenizer create(Reader input) {
-    return new UIMATypeAwareAnnotationsTokenizer(descriptorPath, tokenType, featurePath, input);
+    return new UIMATypeAwareAnnotationsTokenizer(descriptorPath, tokenType, featurePath, configurationParameters, input);
   }
 }


@@ -20,7 +20,7 @@
   <primitive>true</primitive>
   <annotatorImplementationName>org.apache.lucene.analysis.uima.an.SampleEntityAnnotator</annotatorImplementationName>
   <analysisEngineMetaData>
-    <name>DummyPoSTagger</name>
+    <name>EntityAnnotator</name>
     <description/>
     <version>1.0</version>
     <vendor>ASF</vendor>


@@ -20,9 +20,28 @@
   <primitive>true</primitive>
   <annotatorImplementationName>org.apache.lucene.analysis.uima.an.SampleWSTokenizerAnnotator</annotatorImplementationName>
   <analysisEngineMetaData>
-    <name>DummyPoSTagger</name>
+    <name>WSTokenizer</name>
     <version>1.0</version>
     <vendor>ASF</vendor>
+    <configurationParameters>
+      <configurationParameter>
+        <name>line-end</name>
+        <description>
+          the string used as line end
+        </description>
+        <type>String</type>
+        <multiValued>false</multiValued>
+        <mandatory>false</mandatory>
+      </configurationParameter>
+    </configurationParameters>
+    <configurationParameterSettings>
+      <nameValuePair>
+        <name>line-end</name>
+        <value>
+          <string>\n</string>
+        </value>
+      </nameValuePair>
+    </configurationParameterSettings>
     <typeSystemDescription>
       <types>
         <typeDescription>
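The descriptor above declares a single optional String parameter, line-end, with a default of \n; the maps passed through the new constructors override that default at run time. A hypothetical sketch of the same override done directly against the AE provider call used in BaseUIMATokenizer — only the three-argument getAEProvider() call is taken verbatim from this diff; the ae subpackage import and the UIMA CAS/AnalysisEngine calls are assumed standard APIs:

import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.uima.ae.AEProviderFactory;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.cas.CAS;

public class DescriptorOverrideExample {
  public static void main(String[] args) throws Exception {
    Map<String, Object> params = new HashMap<String, Object>();
    params.put("line-end", "\r");  // supersedes the descriptor's \n default

    // Same call pattern as the new BaseUIMATokenizer constructor: a null name,
    // the descriptor path, and the runtime parameter overrides.
    AnalysisEngine ae = AEProviderFactory.getInstance()
        .getAEProvider(null, "/uima/TestWSTokenizerAE.xml", params).getAE();

    CAS cas = ae.newCAS();
    cas.setDocumentText("one two\rthree four");
    ae.process(cas);
    System.out.println(cas.getAnnotationIndex().size() + " annotations");
  }
}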


@@ -36,6 +36,8 @@ import org.junit.Before;
 import org.junit.Test;

 import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Map;

 /**
  * Testcase for {@link UIMABaseAnalyzer}
@@ -48,7 +50,7 @@ public class UIMABaseAnalyzerTest extends BaseTokenStreamTestCase {
   @Before
   public void setUp() throws Exception {
     super.setUp();
-    analyzer = new UIMABaseAnalyzer("/uima/AggregateSentenceAE.xml", "org.apache.uima.TokenAnnotation");
+    analyzer = new UIMABaseAnalyzer("/uima/AggregateSentenceAE.xml", "org.apache.uima.TokenAnnotation", null);
   }

   @Override
@@ -120,7 +122,7 @@ public class UIMABaseAnalyzerTest extends BaseTokenStreamTestCase {
   @Test
   public void testRandomStrings() throws Exception {
-    checkRandomData(random(), new UIMABaseAnalyzer("/uima/TestAggregateSentenceAE.xml", "org.apache.lucene.uima.ts.TokenAnnotation"),
+    checkRandomData(random(), new UIMABaseAnalyzer("/uima/TestAggregateSentenceAE.xml", "org.apache.lucene.uima.ts.TokenAnnotation", null),
         100 * RANDOM_MULTIPLIER);
   }

+  @Test
+  public void testRandomStringsWithConfigurationParameters() throws Exception {
+    Map<String, Object> cp = new HashMap<String, Object>();
+    cp.put("line-end", "\r");
+    checkRandomData(random(), new UIMABaseAnalyzer("/uima/TestWSTokenizerAE.xml", "org.apache.lucene.uima.ts.TokenAnnotation", cp),
+        100 * RANDOM_MULTIPLIER);
+  }


@@ -37,7 +37,7 @@ public class UIMATypeAwareAnalyzerTest extends BaseTokenStreamTestCase {
   public void setUp() throws Exception {
     super.setUp();
     analyzer = new UIMATypeAwareAnalyzer("/uima/AggregateSentenceAE.xml",
-        "org.apache.uima.TokenAnnotation", "posTag");
+        "org.apache.uima.TokenAnnotation", "posTag", null);
   }

   @Override
@@ -63,7 +63,7 @@ public class UIMATypeAwareAnalyzerTest extends BaseTokenStreamTestCase {
   @Test
   public void testRandomStrings() throws Exception {
     checkRandomData(random(), new UIMATypeAwareAnalyzer("/uima/TestAggregateSentenceAE.xml",
-        "org.apache.lucene.uima.ts.TokenAnnotation", "pos"), 100 * RANDOM_MULTIPLIER);
+        "org.apache.lucene.uima.ts.TokenAnnotation", "pos", null), 100 * RANDOM_MULTIPLIER);
   }
 }


@@ -17,11 +17,13 @@ package org.apache.lucene.analysis.uima.an;
  * limitations under the License.
  */

+import org.apache.uima.UimaContext;
 import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
 import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
 import org.apache.uima.cas.Type;
 import org.apache.uima.cas.text.AnnotationFS;
 import org.apache.uima.jcas.JCas;
+import org.apache.uima.resource.ResourceInitializationException;

 /**
  * Dummy implementation of a UIMA based whitespace tokenizer
@@ -30,15 +32,21 @@ public class SampleWSTokenizerAnnotator extends JCasAnnotator_ImplBase {
   private final static String TOKEN_TYPE = "org.apache.lucene.uima.ts.TokenAnnotation";
   private final static String SENTENCE_TYPE = "org.apache.lucene.uima.ts.SentenceAnnotation";
-  private static final String CR = "\n";
+  private String lineEnd;
   private static final String WHITESPACE = " ";

+  @Override
+  public void initialize(UimaContext aContext) throws ResourceInitializationException {
+    super.initialize(aContext);
+    lineEnd = String.valueOf(aContext.getConfigParameterValue("line-end"));
+  }
+
   @Override
   public void process(JCas jCas) throws AnalysisEngineProcessException {
     Type sentenceType = jCas.getCas().getTypeSystem().getType(SENTENCE_TYPE);
     Type tokenType = jCas.getCas().getTypeSystem().getType(TOKEN_TYPE);
     int i = 0;
-    for (String sentenceString : jCas.getDocumentText().split(CR)) {
+    for (String sentenceString : jCas.getDocumentText().split(lineEnd)) {
       // add the sentence
       AnnotationFS sentenceAnnotation = jCas.getCas().createAnnotation(sentenceType, i, sentenceString.length());
       jCas.addFsToIndexes(sentenceAnnotation);