[LUCENE-3731] - creating and using simple wst and pos tagger implementations for analyzers' random string testing

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1244474 13f79535-47bb-0310-9956-ffa450edef68
2012-02-15 13:17:57 +00:00 · 2012-02-15 13:17:57 +00:00 · c454ae6a66
parent 55b5a4f4b2
commit c454ae6a66
11 changed files with 150 additions and 42 deletions
--- a/modules/analysis/uima/src/test-files/uima/AggregateDummySentenceAE.xml
+++ b/modules/analysis/uima/src/test-files/uima/AggregateDummySentenceAE.xml
@ -20,33 +20,30 @@
  <primitive>false</primitive>
  <delegateAnalysisEngineSpecifiers>
    <delegateAnalysisEngine key="WhitespaceTokenizer">
-      <import name="WhitespaceTokenizer"/>
+      <import location="TestWSTokenizerAE.xml"/>
    </delegateAnalysisEngine>
-    <delegateAnalysisEngine key="DummyPoSTagger">
-      <import location="DummyPoSTagger.xml"/>
+    <delegateAnalysisEngine key="PoSTagger">
+      <import location="TestPoSTaggerAE.xml"/>
    </delegateAnalysisEngine>
  </delegateAnalysisEngineSpecifiers>
  <analysisEngineMetaData>
-    <name>AggregateSentenceAE</name>
+    <name>TestAggregateSentenceAE</name>
    <description/>
    <version>1.0</version>
-    <vendor/>
-    <configurationParameterSettings/>
+    <vendor>ASF</vendor>
    <flowConstraints>
      <fixedFlow>
        <node>WhitespaceTokenizer</node>
-        <node>DummyPoSTagger</node>
+        <node>PoSTagger</node>
      </fixedFlow>
    </flowConstraints>
-    <fsIndexCollection/>
    <capabilities>
      <capability>
        <inputs/>
        <outputs>
-          <type allAnnotatorFeatures="true">org.apache.uima.SentenceAnnotation</type>
-          <type allAnnotatorFeatures="true">org.apache.uima.TokenAnnotation</type>
+          <type allAnnotatorFeatures="true">org.apache.lucene.uima.ts.SentenceAnnotation</type>
+          <type allAnnotatorFeatures="true">org.apache.lucene.uima.ts.TokenAnnotation</type>
        </outputs>
-        <languagesSupported/>
      </capability>
    </capabilities>
    <operationalProperties>
@ -55,5 +52,4 @@
      <outputsNewCASes>false</outputsNewCASes>
    </operationalProperties>
  </analysisEngineMetaData>
-  <resourceManagerConfiguration/>
 </analysisEngineDescription>
--- a/modules/analysis/uima/src/test-files/uima/TestEntityAnnotatorAE.xml
+++ b/modules/analysis/uima/src/test-files/uima/TestEntityAnnotatorAE.xml
@ -18,18 +18,16 @@
 <analysisEngineDescription xmlns="http://uima.apache.org/resourceSpecifier">
  <frameworkImplementation>org.apache.uima.java</frameworkImplementation>
  <primitive>true</primitive>
-  <annotatorImplementationName>org.apache.lucene.analysis.uima.an.DummyEntityAnnotator</annotatorImplementationName>
+  <annotatorImplementationName>org.apache.lucene.analysis.uima.an.SampleEntityAnnotator</annotatorImplementationName>
  <analysisEngineMetaData>
    <name>DummyPoSTagger</name>
    <description/>
    <version>1.0</version>
    <vendor>ASF</vendor>
-    <configurationParameters/>
-    <configurationParameterSettings/>
    <typeSystemDescription>
      <types>
        <typeDescription>
-          <name>org.apache.solr.uima.ts.EntityAnnotation</name>
+          <name>org.apache.lucene.uima.ts.EntityAnnotation</name>
          <description/>
          <supertypeName>uima.tcas.Annotation</supertypeName>
          <features>
@ -47,13 +45,13 @@
        </typeDescription>
      </types>
    </typeSystemDescription>
-    <typePriorities/>
-    <fsIndexCollection/>
    <capabilities>
      <capability>
-        <inputs/>
+        <inputs>
+          <type allAnnotatorFeatures="true">org.apache.lucene.uima.ts.TokenAnnotation</type>
+        </inputs>
        <outputs>
-          <type allAnnotatorFeatures="true">org.apache.solr.uima.ts.EntityAnnotation</type>
+          <type allAnnotatorFeatures="true">org.apache.lucene.uima.ts.EntityAnnotation</type>
        </outputs>
        <languagesSupported/>
      </capability>
--- a/modules/analysis/uima/src/test-files/uima/TestPoSTaggerAE.xml
+++ b/modules/analysis/uima/src/test-files/uima/TestPoSTaggerAE.xml
@ -18,24 +18,19 @@
 <analysisEngineDescription xmlns="http://uima.apache.org/resourceSpecifier">
  <frameworkImplementation>org.apache.uima.java</frameworkImplementation>
  <primitive>true</primitive>
-  <annotatorImplementationName>org.apache.lucene.analysis.uima.an.DummyPoSTagger</annotatorImplementationName>
+  <annotatorImplementationName>org.apache.lucene.analysis.uima.an.SamplePoSTagger</annotatorImplementationName>
  <analysisEngineMetaData>
    <name>DummyPoSTagger</name>
    <description/>
    <version>1.0</version>
    <vendor>ASF</vendor>
-    <configurationParameters/>
-    <configurationParameterSettings/>
-    <typeSystemDescription/>
-    <typePriorities/>
-    <fsIndexCollection/>
    <capabilities>
      <capability>
        <inputs>
-          <type allAnnotatorFeatures="true">org.apache.uima.TokenAnnotation</type>
+          <type allAnnotatorFeatures="true">org.apache.lucene.uima.ts.TokenAnnotation</type>
        </inputs>
        <outputs>
-          <type allAnnotatorFeatures="true">org.apache.uima.TokenAnnotation</type>
+          <type allAnnotatorFeatures="true">org.apache.lucene.uima.ts.TokenAnnotation</type>
        </outputs>
        <languagesSupported/>
      </capability>
@ -46,5 +41,4 @@
      <outputsNewCASes>false</outputsNewCASes>
    </operationalProperties>
  </analysisEngineMetaData>
-  <resourceManagerConfiguration/>
 </analysisEngineDescription>
--- a/modules/analysis/uima/src/test-files/uima/TestWSTokenizerAE.xml
+++ b/modules/analysis/uima/src/test-files/uima/TestWSTokenizerAE.xml
@ -0,0 +1,59 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<analysisEngineDescription xmlns="http://uima.apache.org/resourceSpecifier">
+  <frameworkImplementation>org.apache.uima.java</frameworkImplementation>
+  <primitive>true</primitive>
+  <annotatorImplementationName>org.apache.lucene.analysis.uima.an.SampleWSTokenizerAnnotator</annotatorImplementationName>
+  <analysisEngineMetaData>
+    <name>DummyPoSTagger</name>
+    <version>1.0</version>
+    <vendor>ASF</vendor>
+    <typeSystemDescription>
+      <types>
+        <typeDescription>
+          <name>org.apache.lucene.uima.ts.TokenAnnotation</name>
+          <supertypeName>uima.tcas.Annotation</supertypeName>
+          <features>
+            <featureDescription>
+              <name>pos</name>
+              <rangeTypeName>uima.cas.String</rangeTypeName>
+            </featureDescription>
+          </features>
+        </typeDescription>
+        <typeDescription>
+          <name>org.apache.lucene.uima.ts.SentenceAnnotation</name>
+          <supertypeName>uima.tcas.Annotation</supertypeName>
+        </typeDescription>
+      </types>
+    </typeSystemDescription>
+    <capabilities>
+      <capability>
+        <inputs/>
+        <outputs>
+          <type allAnnotatorFeatures="true">org.apache.lucene.uima.ts.TokenAnnotation</type>
+          <type allAnnotatorFeatures="true">org.apache.lucene.uima.ts.SentenceAnnotation</type>
+        </outputs>
+      </capability>
+    </capabilities>
+    <operationalProperties>
+      <modifiesCas>true</modifiesCas>
+      <multipleDeploymentAllowed>true</multipleDeploymentAllowed>
+      <outputsNewCASes>false</outputsNewCASes>
+    </operationalProperties>
+  </analysisEngineMetaData>
+</analysisEngineDescription>
--- a/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMABaseAnalyzerTest.java
+++ b/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMABaseAnalyzerTest.java
@ -118,7 +118,7 @@ public class UIMABaseAnalyzerTest extends BaseTokenStreamTestCase {

  @Test
  public void testRandomStrings() throws Exception {
-    checkRandomData(random, new UIMABaseAnalyzer("/uima/AggregateSentenceAE.xml", "org.apache.uima.TokenAnnotation"),
+    checkRandomData(random, new UIMABaseAnalyzer("/uima/TestAggregateSentenceAE.xml", "org.apache.lucene.uima.ts.TokenAnnotation"),
        1000 * RANDOM_MULTIPLIER);
  }

--- a/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMATypeAwareAnalyzerTest.java
+++ b/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMATypeAwareAnalyzerTest.java
@ -60,8 +60,8 @@ public class UIMATypeAwareAnalyzerTest extends BaseTokenStreamTestCase {

  @Test
  public void testRandomStrings() throws Exception {
-    checkRandomData(random, new UIMATypeAwareAnalyzer("/uima/AggregateDummySentenceAE.xml",
-        "org.apache.uima.TokenAnnotation", "tokenType"), 1000 * RANDOM_MULTIPLIER);
+    checkRandomData(random, new UIMATypeAwareAnalyzer("/uima/TestAggregateSentenceAE.xml",
+        "org.apache.lucene.uima.ts.TokenAnnotation", "pos"), 1000 * RANDOM_MULTIPLIER);
  }

 }
--- a/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/ae/BasicAEProviderTest.java
+++ b/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/ae/BasicAEProviderTest.java
@ -28,8 +28,8 @@ import static org.junit.Assert.assertNotNull;
 public class BasicAEProviderTest {

  @Test
-  public void testBasicInititalization() throws Exception {
-    AEProvider basicAEProvider = new BasicAEProvider("/uima/DummyEntityAE.xml");
+  public void testBasicInitialization() throws Exception {
+    AEProvider basicAEProvider = new BasicAEProvider("/uima/TestEntityAnnotatorAE.xml");
    AnalysisEngine analysisEngine = basicAEProvider.getAE();
    assertNotNull(analysisEngine);
  }
--- a/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/ae/OverridingParamsAEProviderTest.java
+++ b/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/ae/OverridingParamsAEProviderTest.java
@ -34,7 +34,7 @@ public class OverridingParamsAEProviderTest {
  @Test
  public void testNullMapInitialization() throws Exception {
    try {
-      AEProvider aeProvider = new OverridingParamsAEProvider("/uima/DummyEntityAE.xml", null);
+      AEProvider aeProvider = new OverridingParamsAEProvider("/uima/TestEntityAnnotatorAE.xml", null);
      aeProvider.getAE();
      fail("should fail due to null Map passed");
    } catch (ResourceInitializationException e) {
@ -44,7 +44,7 @@ public class OverridingParamsAEProviderTest {

  @Test
  public void testEmptyMapInitialization() throws Exception {
-    AEProvider aeProvider = new OverridingParamsAEProvider("/uima/DummyEntityAE.xml", new HashMap<String, Object>());
+    AEProvider aeProvider = new OverridingParamsAEProvider("/uima/TestEntityAnnotatorAE.xml", new HashMap<String, Object>());
    AnalysisEngine analysisEngine = aeProvider.getAE();
    assertNotNull(analysisEngine);
  }
--- a/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/an/SampleEntityAnnotator.java
+++ b/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/an/SampleEntityAnnotator.java
@ -26,7 +26,10 @@ import org.apache.uima.cas.text.AnnotationFS;
 import org.apache.uima.jcas.JCas;
 import org.apache.uima.jcas.tcas.Annotation;

-public class DummyEntityAnnotator extends JCasAnnotator_ImplBase {
+/**
+ * Dummy implementation of an entity annotator to tag tokens as certain types of entities
+ */
+public class SampleEntityAnnotator extends JCasAnnotator_ImplBase {

  private static final String NP = "np";
  private static final String NPS = "nps";
--- a/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/an/SamplePoSTagger.java
+++ b/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/an/SamplePoSTagger.java
@ -17,7 +17,6 @@ package org.apache.lucene.analysis.uima.an;
 * limitations under the License.
 */

-import org.apache.uima.TokenAnnotation;
 import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
 import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
 import org.apache.uima.cas.Feature;
@ -26,20 +25,21 @@ import org.apache.uima.jcas.JCas;
 import org.apache.uima.jcas.tcas.Annotation;

 /**
+ * Dummy implementation of a PoS tagger to add part of speech as token types
 */
-public class DummyPoSTagger extends JCasAnnotator_ImplBase {
+public class SamplePoSTagger extends JCasAnnotator_ImplBase {

  private static final String NUM = "NUM";
  private static final String WORD = "WORD";
-  private static final String TYPE_NAME = "org.apache.uima.TokenAnnotation";
-  private static final String FEATURE_NAME = "tokenType";
+  private static final String TYPE_NAME = "org.apache.lucene.uima.ts.TokenAnnotation";
+  private static final String FEATURE_NAME = "pos";

  @Override
  public void process(JCas jcas) throws AnalysisEngineProcessException {
    Type type = jcas.getCas().getTypeSystem().getType(TYPE_NAME);
    Feature posFeature = type.getFeatureByBaseName(FEATURE_NAME);

-    for (Annotation annotation : jcas.getAnnotationIndex(TokenAnnotation.type)) {
+    for (Annotation annotation : jcas.getAnnotationIndex(type)) {
      String text = annotation.getCoveredText();
      String pos = extractPoS(text);
      annotation.setStringValue(posFeature, pos);
--- a/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/an/SampleWSTokenizerAnnotator.java
+++ b/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/an/SampleWSTokenizerAnnotator.java
@ -0,0 +1,58 @@
+package org.apache.lucene.analysis.uima.an;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.Type;
+import org.apache.uima.cas.text.AnnotationFS;
+import org.apache.uima.jcas.JCas;
+
+/**
+ * Dummy implementation of a UIMA based whitespace tokenizer
+ */
+public class SampleWSTokenizerAnnotator extends JCasAnnotator_ImplBase {
+
+  private final static String TOKEN_TYPE = "org.apache.lucene.uima.ts.TokenAnnotation";
+  private final static String SENTENCE_TYPE = "org.apache.lucene.uima.ts.SentenceAnnotation";
+  private static final String CR = "\n";
+  private static final String WHITESPACE = " ";
+
+  @Override
+  public void process(JCas jCas) throws AnalysisEngineProcessException {
+    Type sentenceType = jCas.getCas().getTypeSystem().getType(SENTENCE_TYPE);
+    Type tokenType = jCas.getCas().getTypeSystem().getType(TOKEN_TYPE);
+    int i = 0;
+    for (String sentenceString : jCas.getDocumentText().split(CR)) {
+      // add the sentence
+      AnnotationFS sentenceAnnotation = jCas.getCas().createAnnotation(sentenceType, i, sentenceString.length());
+      jCas.addFsToIndexes(sentenceAnnotation);
+      i += sentenceString.length();
+    }
+
+    // get tokens
+    int j = 0;
+    for (String tokenString : jCas.getDocumentText().split(WHITESPACE)) {
+      int tokenLength = tokenString.length();
+      AnnotationFS tokenAnnotation = jCas.getCas().createAnnotation(tokenType, j, j + tokenLength);
+      jCas.addFsToIndexes(tokenAnnotation);
+      j += tokenLength;
+    }
+  }
+
+}