mirror of https://github.com/apache/lucene.git
[LUCENE-3731] - creating and using simple wst and pos tagger implementations for analyzers' random string testing
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1244474 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
55b5a4f4b2
commit
c454ae6a66
|
@ -20,33 +20,30 @@
|
|||
<primitive>false</primitive>
|
||||
<delegateAnalysisEngineSpecifiers>
|
||||
<delegateAnalysisEngine key="WhitespaceTokenizer">
|
||||
<import name="WhitespaceTokenizer"/>
|
||||
<import location="TestWSTokenizerAE.xml"/>
|
||||
</delegateAnalysisEngine>
|
||||
<delegateAnalysisEngine key="DummyPoSTagger">
|
||||
<import location="DummyPoSTagger.xml"/>
|
||||
<delegateAnalysisEngine key="PoSTagger">
|
||||
<import location="TestPoSTaggerAE.xml"/>
|
||||
</delegateAnalysisEngine>
|
||||
</delegateAnalysisEngineSpecifiers>
|
||||
<analysisEngineMetaData>
|
||||
<name>AggregateSentenceAE</name>
|
||||
<name>TestAggregateSentenceAE</name>
|
||||
<description/>
|
||||
<version>1.0</version>
|
||||
<vendor/>
|
||||
<configurationParameterSettings/>
|
||||
<vendor>ASF</vendor>
|
||||
<flowConstraints>
|
||||
<fixedFlow>
|
||||
<node>WhitespaceTokenizer</node>
|
||||
<node>DummyPoSTagger</node>
|
||||
<node>PoSTagger</node>
|
||||
</fixedFlow>
|
||||
</flowConstraints>
|
||||
<fsIndexCollection/>
|
||||
<capabilities>
|
||||
<capability>
|
||||
<inputs/>
|
||||
<outputs>
|
||||
<type allAnnotatorFeatures="true">org.apache.uima.SentenceAnnotation</type>
|
||||
<type allAnnotatorFeatures="true">org.apache.uima.TokenAnnotation</type>
|
||||
<type allAnnotatorFeatures="true">org.apache.lucene.uima.ts.SentenceAnnotation</type>
|
||||
<type allAnnotatorFeatures="true">org.apache.lucene.uima.ts.TokenAnnotation</type>
|
||||
</outputs>
|
||||
<languagesSupported/>
|
||||
</capability>
|
||||
</capabilities>
|
||||
<operationalProperties>
|
||||
|
@ -55,5 +52,4 @@
|
|||
<outputsNewCASes>false</outputsNewCASes>
|
||||
</operationalProperties>
|
||||
</analysisEngineMetaData>
|
||||
<resourceManagerConfiguration/>
|
||||
</analysisEngineDescription>
|
|
@ -18,18 +18,16 @@
|
|||
<analysisEngineDescription xmlns="http://uima.apache.org/resourceSpecifier">
|
||||
<frameworkImplementation>org.apache.uima.java</frameworkImplementation>
|
||||
<primitive>true</primitive>
|
||||
<annotatorImplementationName>org.apache.lucene.analysis.uima.an.DummyEntityAnnotator</annotatorImplementationName>
|
||||
<annotatorImplementationName>org.apache.lucene.analysis.uima.an.SampleEntityAnnotator</annotatorImplementationName>
|
||||
<analysisEngineMetaData>
|
||||
<name>DummyPoSTagger</name>
|
||||
<description/>
|
||||
<version>1.0</version>
|
||||
<vendor>ASF</vendor>
|
||||
<configurationParameters/>
|
||||
<configurationParameterSettings/>
|
||||
<typeSystemDescription>
|
||||
<types>
|
||||
<typeDescription>
|
||||
<name>org.apache.solr.uima.ts.EntityAnnotation</name>
|
||||
<name>org.apache.lucene.uima.ts.EntityAnnotation</name>
|
||||
<description/>
|
||||
<supertypeName>uima.tcas.Annotation</supertypeName>
|
||||
<features>
|
||||
|
@ -47,13 +45,13 @@
|
|||
</typeDescription>
|
||||
</types>
|
||||
</typeSystemDescription>
|
||||
<typePriorities/>
|
||||
<fsIndexCollection/>
|
||||
<capabilities>
|
||||
<capability>
|
||||
<inputs/>
|
||||
<inputs>
|
||||
<type allAnnotatorFeatures="true">org.apache.lucene.uima.ts.TokenAnnotation</type>
|
||||
</inputs>
|
||||
<outputs>
|
||||
<type allAnnotatorFeatures="true">org.apache.solr.uima.ts.EntityAnnotation</type>
|
||||
<type allAnnotatorFeatures="true">org.apache.lucene.uima.ts.EntityAnnotation</type>
|
||||
</outputs>
|
||||
<languagesSupported/>
|
||||
</capability>
|
|
@ -18,24 +18,19 @@
|
|||
<analysisEngineDescription xmlns="http://uima.apache.org/resourceSpecifier">
|
||||
<frameworkImplementation>org.apache.uima.java</frameworkImplementation>
|
||||
<primitive>true</primitive>
|
||||
<annotatorImplementationName>org.apache.lucene.analysis.uima.an.DummyPoSTagger</annotatorImplementationName>
|
||||
<annotatorImplementationName>org.apache.lucene.analysis.uima.an.SamplePoSTagger</annotatorImplementationName>
|
||||
<analysisEngineMetaData>
|
||||
<name>DummyPoSTagger</name>
|
||||
<description/>
|
||||
<version>1.0</version>
|
||||
<vendor>ASF</vendor>
|
||||
<configurationParameters/>
|
||||
<configurationParameterSettings/>
|
||||
<typeSystemDescription/>
|
||||
<typePriorities/>
|
||||
<fsIndexCollection/>
|
||||
<capabilities>
|
||||
<capability>
|
||||
<inputs>
|
||||
<type allAnnotatorFeatures="true">org.apache.uima.TokenAnnotation</type>
|
||||
<type allAnnotatorFeatures="true">org.apache.lucene.uima.ts.TokenAnnotation</type>
|
||||
</inputs>
|
||||
<outputs>
|
||||
<type allAnnotatorFeatures="true">org.apache.uima.TokenAnnotation</type>
|
||||
<type allAnnotatorFeatures="true">org.apache.lucene.uima.ts.TokenAnnotation</type>
|
||||
</outputs>
|
||||
<languagesSupported/>
|
||||
</capability>
|
||||
|
@ -46,5 +41,4 @@
|
|||
<outputsNewCASes>false</outputsNewCASes>
|
||||
</operationalProperties>
|
||||
</analysisEngineMetaData>
|
||||
<resourceManagerConfiguration/>
|
||||
</analysisEngineDescription>
|
|
@ -0,0 +1,59 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
contributor license agreements. See the NOTICE file distributed with
|
||||
this work for additional information regarding copyright ownership.
|
||||
The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
(the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
<analysisEngineDescription xmlns="http://uima.apache.org/resourceSpecifier">
  <!--
    Primitive analysis engine descriptor for the sample whitespace tokenizer
    (SampleWSTokenizerAnnotator). It declares the token and sentence
    annotation types that the annotator produces.
  -->
  <frameworkImplementation>org.apache.uima.java</frameworkImplementation>
  <primitive>true</primitive>
  <annotatorImplementationName>org.apache.lucene.analysis.uima.an.SampleWSTokenizerAnnotator</annotatorImplementationName>
  <analysisEngineMetaData>
    <!-- Named after what this engine does; it was previously a copy-paste of the PoS tagger's name. -->
    <name>WhitespaceTokenizer</name>
    <version>1.0</version>
    <vendor>ASF</vendor>
    <typeSystemDescription>
      <types>
        <!-- Token annotation; carries a String "pos" feature. -->
        <typeDescription>
          <name>org.apache.lucene.uima.ts.TokenAnnotation</name>
          <supertypeName>uima.tcas.Annotation</supertypeName>
          <features>
            <featureDescription>
              <name>pos</name>
              <rangeTypeName>uima.cas.String</rangeTypeName>
            </featureDescription>
          </features>
        </typeDescription>
        <!-- Sentence annotation; no additional features. -->
        <typeDescription>
          <name>org.apache.lucene.uima.ts.SentenceAnnotation</name>
          <supertypeName>uima.tcas.Annotation</supertypeName>
        </typeDescription>
      </types>
    </typeSystemDescription>
    <capabilities>
      <capability>
        <!-- No input annotations are declared; both declared types are outputs. -->
        <inputs/>
        <outputs>
          <type allAnnotatorFeatures="true">org.apache.lucene.uima.ts.TokenAnnotation</type>
          <type allAnnotatorFeatures="true">org.apache.lucene.uima.ts.SentenceAnnotation</type>
        </outputs>
      </capability>
    </capabilities>
    <operationalProperties>
      <modifiesCas>true</modifiesCas>
      <multipleDeploymentAllowed>true</multipleDeploymentAllowed>
      <outputsNewCASes>false</outputsNewCASes>
    </operationalProperties>
  </analysisEngineMetaData>
</analysisEngineDescription>
|
|
@ -118,7 +118,7 @@ public class UIMABaseAnalyzerTest extends BaseTokenStreamTestCase {
|
|||
|
||||
@Test
|
||||
public void testRandomStrings() throws Exception {
|
||||
checkRandomData(random, new UIMABaseAnalyzer("/uima/AggregateSentenceAE.xml", "org.apache.uima.TokenAnnotation"),
|
||||
checkRandomData(random, new UIMABaseAnalyzer("/uima/TestAggregateSentenceAE.xml", "org.apache.lucene.uima.ts.TokenAnnotation"),
|
||||
1000 * RANDOM_MULTIPLIER);
|
||||
}
|
||||
|
||||
|
|
|
@ -60,8 +60,8 @@ public class UIMATypeAwareAnalyzerTest extends BaseTokenStreamTestCase {
|
|||
|
||||
@Test
|
||||
public void testRandomStrings() throws Exception {
|
||||
checkRandomData(random, new UIMATypeAwareAnalyzer("/uima/AggregateDummySentenceAE.xml",
|
||||
"org.apache.uima.TokenAnnotation", "tokenType"), 1000 * RANDOM_MULTIPLIER);
|
||||
checkRandomData(random, new UIMATypeAwareAnalyzer("/uima/TestAggregateSentenceAE.xml",
|
||||
"org.apache.lucene.uima.ts.TokenAnnotation", "pos"), 1000 * RANDOM_MULTIPLIER);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -28,8 +28,8 @@ import static org.junit.Assert.assertNotNull;
|
|||
public class BasicAEProviderTest {
|
||||
|
||||
@Test
|
||||
public void testBasicInititalization() throws Exception {
|
||||
AEProvider basicAEProvider = new BasicAEProvider("/uima/DummyEntityAE.xml");
|
||||
public void testBasicInitialization() throws Exception {
|
||||
AEProvider basicAEProvider = new BasicAEProvider("/uima/TestEntityAnnotatorAE.xml");
|
||||
AnalysisEngine analysisEngine = basicAEProvider.getAE();
|
||||
assertNotNull(analysisEngine);
|
||||
}
|
||||
|
|
|
@ -34,7 +34,7 @@ public class OverridingParamsAEProviderTest {
|
|||
@Test
|
||||
public void testNullMapInitialization() throws Exception {
|
||||
try {
|
||||
AEProvider aeProvider = new OverridingParamsAEProvider("/uima/DummyEntityAE.xml", null);
|
||||
AEProvider aeProvider = new OverridingParamsAEProvider("/uima/TestEntityAnnotatorAE.xml", null);
|
||||
aeProvider.getAE();
|
||||
fail("should fail due to null Map passed");
|
||||
} catch (ResourceInitializationException e) {
|
||||
|
@ -44,7 +44,7 @@ public class OverridingParamsAEProviderTest {
|
|||
|
||||
@Test
|
||||
public void testEmptyMapInitialization() throws Exception {
|
||||
AEProvider aeProvider = new OverridingParamsAEProvider("/uima/DummyEntityAE.xml", new HashMap<String, Object>());
|
||||
AEProvider aeProvider = new OverridingParamsAEProvider("/uima/TestEntityAnnotatorAE.xml", new HashMap<String, Object>());
|
||||
AnalysisEngine analysisEngine = aeProvider.getAE();
|
||||
assertNotNull(analysisEngine);
|
||||
}
|
||||
|
|
|
@ -26,7 +26,10 @@ import org.apache.uima.cas.text.AnnotationFS;
|
|||
import org.apache.uima.jcas.JCas;
|
||||
import org.apache.uima.jcas.tcas.Annotation;
|
||||
|
||||
public class DummyEntityAnnotator extends JCasAnnotator_ImplBase {
|
||||
/**
|
||||
* Dummy implementation of an entity annotator to tag tokens as certain types of entities
|
||||
*/
|
||||
public class SampleEntityAnnotator extends JCasAnnotator_ImplBase {
|
||||
|
||||
private static final String NP = "np";
|
||||
private static final String NPS = "nps";
|
|
@ -17,7 +17,6 @@ package org.apache.lucene.analysis.uima.an;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.uima.TokenAnnotation;
|
||||
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
|
||||
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
|
||||
import org.apache.uima.cas.Feature;
|
||||
|
@ -26,20 +25,21 @@ import org.apache.uima.jcas.JCas;
|
|||
import org.apache.uima.jcas.tcas.Annotation;
|
||||
|
||||
/**
|
||||
* Dummy implementation of a PoS tagger to add part of speech as token types
|
||||
*/
|
||||
public class DummyPoSTagger extends JCasAnnotator_ImplBase {
|
||||
public class SamplePoSTagger extends JCasAnnotator_ImplBase {
|
||||
|
||||
private static final String NUM = "NUM";
|
||||
private static final String WORD = "WORD";
|
||||
private static final String TYPE_NAME = "org.apache.uima.TokenAnnotation";
|
||||
private static final String FEATURE_NAME = "tokenType";
|
||||
private static final String TYPE_NAME = "org.apache.lucene.uima.ts.TokenAnnotation";
|
||||
private static final String FEATURE_NAME = "pos";
|
||||
|
||||
@Override
|
||||
public void process(JCas jcas) throws AnalysisEngineProcessException {
|
||||
Type type = jcas.getCas().getTypeSystem().getType(TYPE_NAME);
|
||||
Feature posFeature = type.getFeatureByBaseName(FEATURE_NAME);
|
||||
|
||||
for (Annotation annotation : jcas.getAnnotationIndex(TokenAnnotation.type)) {
|
||||
for (Annotation annotation : jcas.getAnnotationIndex(type)) {
|
||||
String text = annotation.getCoveredText();
|
||||
String pos = extractPoS(text);
|
||||
annotation.setStringValue(posFeature, pos);
|
|
@ -0,0 +1,58 @@
|
|||
package org.apache.lucene.analysis.uima.an;
|
||||
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
|
||||
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
|
||||
import org.apache.uima.cas.Type;
|
||||
import org.apache.uima.cas.text.AnnotationFS;
|
||||
import org.apache.uima.jcas.JCas;
|
||||
|
||||
/**
|
||||
* Dummy implementation of a UIMA based whitespace tokenizer
|
||||
*/
|
||||
public class SampleWSTokenizerAnnotator extends JCasAnnotator_ImplBase {
|
||||
|
||||
private final static String TOKEN_TYPE = "org.apache.lucene.uima.ts.TokenAnnotation";
|
||||
private final static String SENTENCE_TYPE = "org.apache.lucene.uima.ts.SentenceAnnotation";
|
||||
private static final String CR = "\n";
|
||||
private static final String WHITESPACE = " ";
|
||||
|
||||
@Override
|
||||
public void process(JCas jCas) throws AnalysisEngineProcessException {
|
||||
Type sentenceType = jCas.getCas().getTypeSystem().getType(SENTENCE_TYPE);
|
||||
Type tokenType = jCas.getCas().getTypeSystem().getType(TOKEN_TYPE);
|
||||
int i = 0;
|
||||
for (String sentenceString : jCas.getDocumentText().split(CR)) {
|
||||
// add the sentence
|
||||
AnnotationFS sentenceAnnotation = jCas.getCas().createAnnotation(sentenceType, i, sentenceString.length());
|
||||
jCas.addFsToIndexes(sentenceAnnotation);
|
||||
i += sentenceString.length();
|
||||
}
|
||||
|
||||
// get tokens
|
||||
int j = 0;
|
||||
for (String tokenString : jCas.getDocumentText().split(WHITESPACE)) {
|
||||
int tokenLength = tokenString.length();
|
||||
AnnotationFS tokenAnnotation = jCas.getCas().createAnnotation(tokenType, j, j + tokenLength);
|
||||
jCas.addFsToIndexes(tokenAnnotation);
|
||||
j += tokenLength;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue