From c454ae6a66c838d8a47aa2051e54a5694ca6b918 Mon Sep 17 00:00:00 2001 From: Tommaso Teofili Date: Wed, 15 Feb 2012 13:17:57 +0000 Subject: [PATCH] [LUCENE-3731] - creating and using simple wst and pos tagger implementations for analyzers' random string testing git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1244474 13f79535-47bb-0310-9956-ffa450edef68 --- ...enceAE.xml => TestAggregateSentenceAE.xml} | 20 +++---- ...EntityAE.xml => TestEntityAnnotatorAE.xml} | 14 ++--- ...DummyPoSTagger.xml => TestPoSTaggerAE.xml} | 12 +--- .../src/test-files/uima/TestWSTokenizerAE.xml | 59 +++++++++++++++++++ .../analysis/uima/UIMABaseAnalyzerTest.java | 2 +- .../uima/UIMATypeAwareAnalyzerTest.java | 4 +- .../analysis/uima/ae/BasicAEProviderTest.java | 4 +- .../ae/OverridingParamsAEProviderTest.java | 4 +- ...otator.java => SampleEntityAnnotator.java} | 5 +- ...mmyPoSTagger.java => SamplePoSTagger.java} | 10 ++-- .../uima/an/SampleWSTokenizerAnnotator.java | 58 ++++++++++++++++++ 11 files changed, 150 insertions(+), 42 deletions(-) rename modules/analysis/uima/src/test-files/uima/{AggregateDummySentenceAE.xml => TestAggregateSentenceAE.xml} (77%) rename modules/analysis/uima/src/test-files/uima/{DummyEntityAE.xml => TestEntityAnnotatorAE.xml} (86%) rename modules/analysis/uima/src/test-files/uima/{DummyPoSTagger.xml => TestPoSTaggerAE.xml} (80%) create mode 100644 modules/analysis/uima/src/test-files/uima/TestWSTokenizerAE.xml rename modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/an/{DummyEntityAnnotator.java => SampleEntityAnnotator.java} (93%) rename modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/an/{DummyPoSTagger.java => SamplePoSTagger.java} (83%) create mode 100644 modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/an/SampleWSTokenizerAnnotator.java diff --git a/modules/analysis/uima/src/test-files/uima/AggregateDummySentenceAE.xml b/modules/analysis/uima/src/test-files/uima/TestAggregateSentenceAE.xml similarity 
index 77% rename from modules/analysis/uima/src/test-files/uima/AggregateDummySentenceAE.xml rename to modules/analysis/uima/src/test-files/uima/TestAggregateSentenceAE.xml index 8769b189404..51a778b133f 100644 --- a/modules/analysis/uima/src/test-files/uima/AggregateDummySentenceAE.xml +++ b/modules/analysis/uima/src/test-files/uima/TestAggregateSentenceAE.xml @@ -20,33 +20,30 @@ false - + - - + + - AggregateSentenceAE + TestAggregateSentenceAE 1.0 - - + ASF WhitespaceTokenizer - DummyPoSTagger + PoSTagger - - org.apache.uima.SentenceAnnotation - org.apache.uima.TokenAnnotation + org.apache.lucene.uima.ts.SentenceAnnotation + org.apache.lucene.uima.ts.TokenAnnotation - @@ -55,5 +52,4 @@ false - diff --git a/modules/analysis/uima/src/test-files/uima/DummyEntityAE.xml b/modules/analysis/uima/src/test-files/uima/TestEntityAnnotatorAE.xml similarity index 86% rename from modules/analysis/uima/src/test-files/uima/DummyEntityAE.xml rename to modules/analysis/uima/src/test-files/uima/TestEntityAnnotatorAE.xml index 8827562a569..d7ec826fd50 100644 --- a/modules/analysis/uima/src/test-files/uima/DummyEntityAE.xml +++ b/modules/analysis/uima/src/test-files/uima/TestEntityAnnotatorAE.xml @@ -18,18 +18,16 @@ org.apache.uima.java true - org.apache.lucene.analysis.uima.an.DummyEntityAnnotator + org.apache.lucene.analysis.uima.an.SampleEntityAnnotator DummyPoSTagger 1.0 ASF - - - org.apache.solr.uima.ts.EntityAnnotation + org.apache.lucene.uima.ts.EntityAnnotation uima.tcas.Annotation @@ -47,13 +45,13 @@ - - - + + org.apache.lucene.uima.ts.TokenAnnotation + - org.apache.solr.uima.ts.EntityAnnotation + org.apache.lucene.uima.ts.EntityAnnotation diff --git a/modules/analysis/uima/src/test-files/uima/DummyPoSTagger.xml b/modules/analysis/uima/src/test-files/uima/TestPoSTaggerAE.xml similarity index 80% rename from modules/analysis/uima/src/test-files/uima/DummyPoSTagger.xml rename to modules/analysis/uima/src/test-files/uima/TestPoSTaggerAE.xml index 7677502b959..eede632eadf 100644 
--- a/modules/analysis/uima/src/test-files/uima/DummyPoSTagger.xml +++ b/modules/analysis/uima/src/test-files/uima/TestPoSTaggerAE.xml @@ -18,24 +18,19 @@ org.apache.uima.java true - org.apache.lucene.analysis.uima.an.DummyPoSTagger + org.apache.lucene.analysis.uima.an.SamplePoSTagger DummyPoSTagger 1.0 ASF - - - - - - org.apache.uima.TokenAnnotation + org.apache.lucene.uima.ts.TokenAnnotation - org.apache.uima.TokenAnnotation + org.apache.lucene.uima.ts.TokenAnnotation @@ -46,5 +41,4 @@ false - diff --git a/modules/analysis/uima/src/test-files/uima/TestWSTokenizerAE.xml b/modules/analysis/uima/src/test-files/uima/TestWSTokenizerAE.xml new file mode 100644 index 00000000000..596a830eb21 --- /dev/null +++ b/modules/analysis/uima/src/test-files/uima/TestWSTokenizerAE.xml @@ -0,0 +1,59 @@ + + + + org.apache.uima.java + true + org.apache.lucene.analysis.uima.an.SampleWSTokenizerAnnotator + + DummyPoSTagger + 1.0 + ASF + + + + org.apache.lucene.uima.ts.TokenAnnotation + uima.tcas.Annotation + + + pos + uima.cas.String + + + + + org.apache.lucene.uima.ts.SentenceAnnotation + uima.tcas.Annotation + + + + + + + + org.apache.lucene.uima.ts.TokenAnnotation + org.apache.lucene.uima.ts.SentenceAnnotation + + + + + true + true + false + + + diff --git a/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMABaseAnalyzerTest.java b/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMABaseAnalyzerTest.java index 7b828527201..0fee64a0823 100644 --- a/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMABaseAnalyzerTest.java +++ b/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMABaseAnalyzerTest.java @@ -118,7 +118,7 @@ public class UIMABaseAnalyzerTest extends BaseTokenStreamTestCase { @Test public void testRandomStrings() throws Exception { - checkRandomData(random, new UIMABaseAnalyzer("/uima/AggregateSentenceAE.xml", "org.apache.uima.TokenAnnotation"), + checkRandomData(random, new 
UIMABaseAnalyzer("/uima/TestAggregateSentenceAE.xml", "org.apache.lucene.uima.ts.TokenAnnotation"), 1000 * RANDOM_MULTIPLIER); } diff --git a/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMATypeAwareAnalyzerTest.java b/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMATypeAwareAnalyzerTest.java index e7b4de9ea67..85035cc5d77 100644 --- a/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMATypeAwareAnalyzerTest.java +++ b/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/UIMATypeAwareAnalyzerTest.java @@ -60,8 +60,8 @@ public class UIMATypeAwareAnalyzerTest extends BaseTokenStreamTestCase { @Test public void testRandomStrings() throws Exception { - checkRandomData(random, new UIMATypeAwareAnalyzer("/uima/AggregateDummySentenceAE.xml", - "org.apache.uima.TokenAnnotation", "tokenType"), 1000 * RANDOM_MULTIPLIER); + checkRandomData(random, new UIMATypeAwareAnalyzer("/uima/TestAggregateSentenceAE.xml", + "org.apache.lucene.uima.ts.TokenAnnotation", "pos"), 1000 * RANDOM_MULTIPLIER); } } diff --git a/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/ae/BasicAEProviderTest.java b/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/ae/BasicAEProviderTest.java index 08735d9d713..3fe46addd8a 100644 --- a/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/ae/BasicAEProviderTest.java +++ b/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/ae/BasicAEProviderTest.java @@ -28,8 +28,8 @@ import static org.junit.Assert.assertNotNull; public class BasicAEProviderTest { @Test - public void testBasicInititalization() throws Exception { - AEProvider basicAEProvider = new BasicAEProvider("/uima/DummyEntityAE.xml"); + public void testBasicInitialization() throws Exception { + AEProvider basicAEProvider = new BasicAEProvider("/uima/TestEntityAnnotatorAE.xml"); AnalysisEngine analysisEngine = basicAEProvider.getAE(); assertNotNull(analysisEngine); } diff --git 
a/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/ae/OverridingParamsAEProviderTest.java b/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/ae/OverridingParamsAEProviderTest.java index f8325fe5968..c3419253a2a 100644 --- a/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/ae/OverridingParamsAEProviderTest.java +++ b/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/ae/OverridingParamsAEProviderTest.java @@ -34,7 +34,7 @@ public class OverridingParamsAEProviderTest { @Test public void testNullMapInitialization() throws Exception { try { - AEProvider aeProvider = new OverridingParamsAEProvider("/uima/DummyEntityAE.xml", null); + AEProvider aeProvider = new OverridingParamsAEProvider("/uima/TestEntityAnnotatorAE.xml", null); aeProvider.getAE(); fail("should fail due to null Map passed"); } catch (ResourceInitializationException e) { @@ -44,7 +44,7 @@ public class OverridingParamsAEProviderTest { @Test public void testEmptyMapInitialization() throws Exception { - AEProvider aeProvider = new OverridingParamsAEProvider("/uima/DummyEntityAE.xml", new HashMap()); + AEProvider aeProvider = new OverridingParamsAEProvider("/uima/TestEntityAnnotatorAE.xml", new HashMap()); AnalysisEngine analysisEngine = aeProvider.getAE(); assertNotNull(analysisEngine); } diff --git a/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/an/DummyEntityAnnotator.java b/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/an/SampleEntityAnnotator.java similarity index 93% rename from modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/an/DummyEntityAnnotator.java rename to modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/an/SampleEntityAnnotator.java index bd6cc9c9e67..319380b0f0d 100644 --- a/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/an/DummyEntityAnnotator.java +++ b/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/an/SampleEntityAnnotator.java @@ -26,7 +26,10 
@@ import org.apache.uima.cas.text.AnnotationFS; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; -public class DummyEntityAnnotator extends JCasAnnotator_ImplBase { +/** + * Dummy implementation of an entity annotator to tag tokens as certain types of entities + */ +public class SampleEntityAnnotator extends JCasAnnotator_ImplBase { private static final String NP = "np"; private static final String NPS = "nps"; diff --git a/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/an/DummyPoSTagger.java b/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/an/SamplePoSTagger.java similarity index 83% rename from modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/an/DummyPoSTagger.java rename to modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/an/SamplePoSTagger.java index a120bce3997..6ff1468dc98 100644 --- a/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/an/DummyPoSTagger.java +++ b/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/an/SamplePoSTagger.java @@ -17,7 +17,6 @@ package org.apache.lucene.analysis.uima.an; * limitations under the License. 
*/ -import org.apache.uima.TokenAnnotation; import org.apache.uima.analysis_component.JCasAnnotator_ImplBase; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.cas.Feature; @@ -26,20 +25,21 @@ import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; /** + * Dummy implementation of a PoS tagger to add part of speech as token types */ -public class DummyPoSTagger extends JCasAnnotator_ImplBase { +public class SamplePoSTagger extends JCasAnnotator_ImplBase { private static final String NUM = "NUM"; private static final String WORD = "WORD"; - private static final String TYPE_NAME = "org.apache.uima.TokenAnnotation"; - private static final String FEATURE_NAME = "tokenType"; + private static final String TYPE_NAME = "org.apache.lucene.uima.ts.TokenAnnotation"; + private static final String FEATURE_NAME = "pos"; @Override public void process(JCas jcas) throws AnalysisEngineProcessException { Type type = jcas.getCas().getTypeSystem().getType(TYPE_NAME); Feature posFeature = type.getFeatureByBaseName(FEATURE_NAME); - for (Annotation annotation : jcas.getAnnotationIndex(TokenAnnotation.type)) { + for (Annotation annotation : jcas.getAnnotationIndex(type)) { String text = annotation.getCoveredText(); String pos = extractPoS(text); annotation.setStringValue(posFeature, pos); diff --git a/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/an/SampleWSTokenizerAnnotator.java b/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/an/SampleWSTokenizerAnnotator.java new file mode 100644 index 00000000000..b33666b0321 --- /dev/null +++ b/modules/analysis/uima/src/test/org/apache/lucene/analysis/uima/an/SampleWSTokenizerAnnotator.java @@ -0,0 +1,58 @@ +package org.apache.lucene.analysis.uima.an; + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.Type;
+import org.apache.uima.cas.text.AnnotationFS;
+import org.apache.uima.jcas.JCas;
+
+/**
+ * Dummy implementation of a UIMA based whitespace tokenizer.
+ * <p>
+ * The document text is cut on line feeds to create one sentence annotation per
+ * line, and on single space characters to create one token annotation per piece.
+ * Every annotation carries absolute begin/end offsets into the document text, so
+ * its covered text is exactly the piece it was created for.
+ */
+public class SampleWSTokenizerAnnotator extends JCasAnnotator_ImplBase {
+
+  private static final String TOKEN_TYPE = "org.apache.lucene.uima.ts.TokenAnnotation";
+  private static final String SENTENCE_TYPE = "org.apache.lucene.uima.ts.SentenceAnnotation";
+  private static final String CR = "\n";
+  private static final String WHITESPACE = " ";
+
+  /**
+   * Adds sentence and token annotations for the document text of the given CAS.
+   *
+   * @param jCas the CAS to annotate; nothing is added when it has no document text
+   * @throws AnalysisEngineProcessException declared by the overridden method
+   */
+  @Override
+  public void process(JCas jCas) throws AnalysisEngineProcessException {
+    String text = jCas.getDocumentText();
+    if (text == null) {
+      // no document text has been set on this CAS, so there is nothing to cut
+      return;
+    }
+    Type sentenceType = jCas.getCas().getTypeSystem().getType(SENTENCE_TYPE);
+    Type tokenType = jCas.getCas().getTypeSystem().getType(TOKEN_TYPE);
+
+    annotateSegments(jCas, text, sentenceType, CR);
+    annotateSegments(jCas, text, tokenType, WHITESPACE);
+  }
+
+  /**
+   * Creates and indexes one annotation of the given type for each piece obtained
+   * by splitting the text on the separator.
+   *
+   * @param jCas      the CAS receiving the annotations
+   * @param text      the document text the offsets refer to
+   * @param type      the annotation type to create
+   * @param separator the literal separator; both callers pass a one character
+   *                  string with no regex meta characters, so its length equals
+   *                  the length of every match
+   */
+  private static void annotateSegments(JCas jCas, String text, Type type, String separator) {
+    int begin = 0;
+    for (String segment : text.split(separator)) {
+      // the end offset is absolute, not the length of the piece
+      int end = begin + segment.length();
+      AnnotationFS annotation = jCas.getCas().createAnnotation(type, begin, end);
+      jCas.addFsToIndexes(annotation);
+      // split() swallowed the separator: step over it to reach the next piece
+      begin = end + separator.length();
+    }
+  }
+
+}