[LUCENE-3731] - create and use simple whitespace tokenizer (WST) and PoS tagger implementations for the analyzers' random string testing

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1244474 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Tommaso Teofili 2012-02-15 13:17:57 +00:00
parent 55b5a4f4b2
commit c454ae6a66
11 changed files with 150 additions and 42 deletions

View File

@ -20,33 +20,30 @@
<primitive>false</primitive>
<delegateAnalysisEngineSpecifiers>
<delegateAnalysisEngine key="WhitespaceTokenizer">
<import name="WhitespaceTokenizer"/>
<import location="TestWSTokenizerAE.xml"/>
</delegateAnalysisEngine>
<delegateAnalysisEngine key="DummyPoSTagger">
<import location="DummyPoSTagger.xml"/>
<delegateAnalysisEngine key="PoSTagger">
<import location="TestPoSTaggerAE.xml"/>
</delegateAnalysisEngine>
</delegateAnalysisEngineSpecifiers>
<analysisEngineMetaData>
<name>AggregateSentenceAE</name>
<name>TestAggregateSentenceAE</name>
<description/>
<version>1.0</version>
<vendor/>
<configurationParameterSettings/>
<vendor>ASF</vendor>
<flowConstraints>
<fixedFlow>
<node>WhitespaceTokenizer</node>
<node>DummyPoSTagger</node>
<node>PoSTagger</node>
</fixedFlow>
</flowConstraints>
<fsIndexCollection/>
<capabilities>
<capability>
<inputs/>
<outputs>
<type allAnnotatorFeatures="true">org.apache.uima.SentenceAnnotation</type>
<type allAnnotatorFeatures="true">org.apache.uima.TokenAnnotation</type>
<type allAnnotatorFeatures="true">org.apache.lucene.uima.ts.SentenceAnnotation</type>
<type allAnnotatorFeatures="true">org.apache.lucene.uima.ts.TokenAnnotation</type>
</outputs>
<languagesSupported/>
</capability>
</capabilities>
<operationalProperties>
@ -55,5 +52,4 @@
<outputsNewCASes>false</outputsNewCASes>
</operationalProperties>
</analysisEngineMetaData>
<resourceManagerConfiguration/>
</analysisEngineDescription>

View File

@ -18,18 +18,16 @@
<analysisEngineDescription xmlns="http://uima.apache.org/resourceSpecifier">
<frameworkImplementation>org.apache.uima.java</frameworkImplementation>
<primitive>true</primitive>
<annotatorImplementationName>org.apache.lucene.analysis.uima.an.DummyEntityAnnotator</annotatorImplementationName>
<annotatorImplementationName>org.apache.lucene.analysis.uima.an.SampleEntityAnnotator</annotatorImplementationName>
<analysisEngineMetaData>
<name>DummyPoSTagger</name>
<description/>
<version>1.0</version>
<vendor>ASF</vendor>
<configurationParameters/>
<configurationParameterSettings/>
<typeSystemDescription>
<types>
<typeDescription>
<name>org.apache.solr.uima.ts.EntityAnnotation</name>
<name>org.apache.lucene.uima.ts.EntityAnnotation</name>
<description/>
<supertypeName>uima.tcas.Annotation</supertypeName>
<features>
@ -47,13 +45,13 @@
</typeDescription>
</types>
</typeSystemDescription>
<typePriorities/>
<fsIndexCollection/>
<capabilities>
<capability>
<inputs/>
<inputs>
<type allAnnotatorFeatures="true">org.apache.lucene.uima.ts.TokenAnnotation</type>
</inputs>
<outputs>
<type allAnnotatorFeatures="true">org.apache.solr.uima.ts.EntityAnnotation</type>
<type allAnnotatorFeatures="true">org.apache.lucene.uima.ts.EntityAnnotation</type>
</outputs>
<languagesSupported/>
</capability>

View File

@ -18,24 +18,19 @@
<analysisEngineDescription xmlns="http://uima.apache.org/resourceSpecifier">
<frameworkImplementation>org.apache.uima.java</frameworkImplementation>
<primitive>true</primitive>
<annotatorImplementationName>org.apache.lucene.analysis.uima.an.DummyPoSTagger</annotatorImplementationName>
<annotatorImplementationName>org.apache.lucene.analysis.uima.an.SamplePoSTagger</annotatorImplementationName>
<analysisEngineMetaData>
<name>DummyPoSTagger</name>
<description/>
<version>1.0</version>
<vendor>ASF</vendor>
<configurationParameters/>
<configurationParameterSettings/>
<typeSystemDescription/>
<typePriorities/>
<fsIndexCollection/>
<capabilities>
<capability>
<inputs>
<type allAnnotatorFeatures="true">org.apache.uima.TokenAnnotation</type>
<type allAnnotatorFeatures="true">org.apache.lucene.uima.ts.TokenAnnotation</type>
</inputs>
<outputs>
<type allAnnotatorFeatures="true">org.apache.uima.TokenAnnotation</type>
<type allAnnotatorFeatures="true">org.apache.lucene.uima.ts.TokenAnnotation</type>
</outputs>
<languagesSupported/>
</capability>
@ -46,5 +41,4 @@
<outputsNewCASes>false</outputsNewCASes>
</operationalProperties>
</analysisEngineMetaData>
<resourceManagerConfiguration/>
</analysisEngineDescription>

View File

@ -0,0 +1,59 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<analysisEngineDescription xmlns="http://uima.apache.org/resourceSpecifier">
  <frameworkImplementation>org.apache.uima.java</frameworkImplementation>
  <primitive>true</primitive>
  <annotatorImplementationName>org.apache.lucene.analysis.uima.an.SampleWSTokenizerAnnotator</annotatorImplementationName>
  <analysisEngineMetaData>
    <!-- Descriptive name only; it previously read "DummyPoSTagger", copied from the PoS tagger descriptor. -->
    <name>TestWSTokenizerAE</name>
    <description>Primitive test analysis engine wrapping SampleWSTokenizerAnnotator; declares the token and sentence annotation types it outputs.</description>
    <version>1.0</version>
    <vendor>ASF</vendor>
    <typeSystemDescription>
      <types>
        <!-- Token type; the "pos" string feature is declared here as part of the token type. -->
        <typeDescription>
          <name>org.apache.lucene.uima.ts.TokenAnnotation</name>
          <supertypeName>uima.tcas.Annotation</supertypeName>
          <features>
            <featureDescription>
              <name>pos</name>
              <rangeTypeName>uima.cas.String</rangeTypeName>
            </featureDescription>
          </features>
        </typeDescription>
        <!-- Sentence type; carries no features beyond those inherited from uima.tcas.Annotation. -->
        <typeDescription>
          <name>org.apache.lucene.uima.ts.SentenceAnnotation</name>
          <supertypeName>uima.tcas.Annotation</supertypeName>
        </typeDescription>
      </types>
    </typeSystemDescription>
    <capabilities>
      <capability>
        <inputs/>
        <outputs>
          <type allAnnotatorFeatures="true">org.apache.lucene.uima.ts.TokenAnnotation</type>
          <type allAnnotatorFeatures="true">org.apache.lucene.uima.ts.SentenceAnnotation</type>
        </outputs>
      </capability>
    </capabilities>
    <operationalProperties>
      <modifiesCas>true</modifiesCas>
      <multipleDeploymentAllowed>true</multipleDeploymentAllowed>
      <outputsNewCASes>false</outputsNewCASes>
    </operationalProperties>
  </analysisEngineMetaData>
</analysisEngineDescription>

View File

@ -118,7 +118,7 @@ public class UIMABaseAnalyzerTest extends BaseTokenStreamTestCase {
@Test
public void testRandomStrings() throws Exception {
checkRandomData(random, new UIMABaseAnalyzer("/uima/AggregateSentenceAE.xml", "org.apache.uima.TokenAnnotation"),
checkRandomData(random, new UIMABaseAnalyzer("/uima/TestAggregateSentenceAE.xml", "org.apache.lucene.uima.ts.TokenAnnotation"),
1000 * RANDOM_MULTIPLIER);
}

View File

@ -60,8 +60,8 @@ public class UIMATypeAwareAnalyzerTest extends BaseTokenStreamTestCase {
@Test
public void testRandomStrings() throws Exception {
checkRandomData(random, new UIMATypeAwareAnalyzer("/uima/AggregateDummySentenceAE.xml",
"org.apache.uima.TokenAnnotation", "tokenType"), 1000 * RANDOM_MULTIPLIER);
checkRandomData(random, new UIMATypeAwareAnalyzer("/uima/TestAggregateSentenceAE.xml",
"org.apache.lucene.uima.ts.TokenAnnotation", "pos"), 1000 * RANDOM_MULTIPLIER);
}
}

View File

@ -28,8 +28,8 @@ import static org.junit.Assert.assertNotNull;
public class BasicAEProviderTest {
@Test
public void testBasicInititalization() throws Exception {
AEProvider basicAEProvider = new BasicAEProvider("/uima/DummyEntityAE.xml");
public void testBasicInitialization() throws Exception {
AEProvider basicAEProvider = new BasicAEProvider("/uima/TestEntityAnnotatorAE.xml");
AnalysisEngine analysisEngine = basicAEProvider.getAE();
assertNotNull(analysisEngine);
}

View File

@ -34,7 +34,7 @@ public class OverridingParamsAEProviderTest {
@Test
public void testNullMapInitialization() throws Exception {
try {
AEProvider aeProvider = new OverridingParamsAEProvider("/uima/DummyEntityAE.xml", null);
AEProvider aeProvider = new OverridingParamsAEProvider("/uima/TestEntityAnnotatorAE.xml", null);
aeProvider.getAE();
fail("should fail due to null Map passed");
} catch (ResourceInitializationException e) {
@ -44,7 +44,7 @@ public class OverridingParamsAEProviderTest {
@Test
public void testEmptyMapInitialization() throws Exception {
AEProvider aeProvider = new OverridingParamsAEProvider("/uima/DummyEntityAE.xml", new HashMap<String, Object>());
AEProvider aeProvider = new OverridingParamsAEProvider("/uima/TestEntityAnnotatorAE.xml", new HashMap<String, Object>());
AnalysisEngine analysisEngine = aeProvider.getAE();
assertNotNull(analysisEngine);
}

View File

@ -26,7 +26,10 @@ import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
public class DummyEntityAnnotator extends JCasAnnotator_ImplBase {
/**
* Dummy implementation of an entity annotator to tag tokens as certain types of entities
*/
public class SampleEntityAnnotator extends JCasAnnotator_ImplBase {
private static final String NP = "np";
private static final String NPS = "nps";

View File

@ -17,7 +17,6 @@ package org.apache.lucene.analysis.uima.an;
* limitations under the License.
*/
import org.apache.uima.TokenAnnotation;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.Feature;
@ -26,20 +25,21 @@ import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
/**
* Dummy implementation of a PoS tagger to add part of speech as token types
*/
public class DummyPoSTagger extends JCasAnnotator_ImplBase {
public class SamplePoSTagger extends JCasAnnotator_ImplBase {
private static final String NUM = "NUM";
private static final String WORD = "WORD";
private static final String TYPE_NAME = "org.apache.uima.TokenAnnotation";
private static final String FEATURE_NAME = "tokenType";
private static final String TYPE_NAME = "org.apache.lucene.uima.ts.TokenAnnotation";
private static final String FEATURE_NAME = "pos";
@Override
public void process(JCas jcas) throws AnalysisEngineProcessException {
Type type = jcas.getCas().getTypeSystem().getType(TYPE_NAME);
Feature posFeature = type.getFeatureByBaseName(FEATURE_NAME);
for (Annotation annotation : jcas.getAnnotationIndex(TokenAnnotation.type)) {
for (Annotation annotation : jcas.getAnnotationIndex(type)) {
String text = annotation.getCoveredText();
String pos = extractPoS(text);
annotation.setStringValue(posFeature, pos);

View File

@ -0,0 +1,58 @@
package org.apache.lucene.analysis.uima.an;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.jcas.JCas;
/**
 * Sample UIMA annotator for tests: splits the document text on line feeds to create
 * sentence annotations and on single spaces to create token annotations.
 *
 * <p>Each annotation spans exactly the characters of the segment it represents; the
 * delimiter itself is not covered. Consecutive delimiters yield zero-length annotations,
 * because {@link String#split(String)} returns empty strings for them.
 */
public class SampleWSTokenizerAnnotator extends JCasAnnotator_ImplBase {

  private static final String TOKEN_TYPE = "org.apache.lucene.uima.ts.TokenAnnotation";
  private static final String SENTENCE_TYPE = "org.apache.lucene.uima.ts.SentenceAnnotation";
  private static final String CR = "\n";
  private static final String WHITESPACE = " ";

  /**
   * Adds one sentence annotation per line and one token annotation per space-separated
   * segment of the document text to the CAS indexes.
   *
   * @param jCas the CAS view to annotate; nothing is added when it has no document text
   * @throws AnalysisEngineProcessException declared by the overridden method
   */
  @Override
  public void process(JCas jCas) throws AnalysisEngineProcessException {
    Type sentenceType = jCas.getCas().getTypeSystem().getType(SENTENCE_TYPE);
    Type tokenType = jCas.getCas().getTypeSystem().getType(TOKEN_TYPE);

    String documentText = jCas.getDocumentText();
    if (documentText == null) {
      // No text has been set on this view, so there is nothing to segment.
      return;
    }

    // Sentences first, then tokens, matching the original indexing order.
    annotateSegments(jCas, sentenceType, documentText, CR);
    annotateSegments(jCas, tokenType, documentText, WHITESPACE);
  }

  /**
   * Splits {@code text} on {@code delimiter} and indexes one annotation of {@code type}
   * per resulting segment, using absolute character offsets into {@code text}.
   *
   * <p>{@code delimiter} is handed to {@link String#split(String)} and is therefore
   * interpreted as a regular expression; the offset arithmetic assumes it matches
   * exactly {@code delimiter.length()} characters, which holds for the literal
   * one-character delimiters used by this class.
   */
  private static void annotateSegments(JCas jCas, Type type, String text, String delimiter) {
    int begin = 0;
    for (String segment : text.split(delimiter)) {
      // The end offset is absolute (begin + length), not the bare segment length.
      int end = begin + segment.length();
      AnnotationFS annotation = jCas.getCas().createAnnotation(type, begin, end);
      jCas.addFsToIndexes(annotation);
      // Skip the delimiter that split() removed so the next segment starts on its own text.
      begin = end + delimiter.length();
    }
  }
}