diff --git a/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/BaseUIMATokenizer.java b/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/BaseUIMATokenizer.java index 9e637c7428d..09d57ed4c52 100644 --- a/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/BaseUIMATokenizer.java +++ b/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/BaseUIMATokenizer.java @@ -51,18 +51,25 @@ public abstract class BaseUIMATokenizer extends Tokenizer { /** * analyzes the tokenizer input using the given analysis engine - * + *
* {@link #cas} will be filled with extracted metadata (UIMA annotations, feature structures) * * @throws AnalysisEngineProcessException * @throws IOException */ - protected void analyzeInput() throws AnalysisEngineProcessException,IOException { + protected void analyzeInput() throws AnalysisEngineProcessException, IOException { cas.reset(); cas.setDocumentText(toString(input)); ae.process(cas); } + /** + * initialize the FSIterator which is used to build tokens at each incrementToken() method call + * + * @throws IOException + */ + protected abstract void initializeIterator() throws IOException; + private String toString(Reader reader) throws IOException { StringBuilder stringBuilder = new StringBuilder(); int ch; @@ -82,6 +89,6 @@ public abstract class BaseUIMATokenizer extends Tokenizer { public void end() throws IOException { iterator = null; } - - + + } diff --git a/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMAAnnotationsTokenizer.java b/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMAAnnotationsTokenizer.java index 159d7fabfdb..45255317c6c 100644 --- a/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMAAnnotationsTokenizer.java +++ b/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMAAnnotationsTokenizer.java @@ -37,7 +37,7 @@ public final class UIMAAnnotationsTokenizer extends BaseUIMATokenizer { private final OffsetAttribute offsetAttr; private final String tokenTypeString; - + private int finalOffset = 0; public UIMAAnnotationsTokenizer(String descriptorPath, String tokenType, Reader input) { @@ -47,8 +47,12 @@ public final class UIMAAnnotationsTokenizer extends BaseUIMATokenizer { this.offsetAttr = addAttribute(OffsetAttribute.class); } - private void analyzeText() throws IOException, AnalysisEngineProcessException { - analyzeInput(); + protected void initializeIterator() throws IOException { + try { + analyzeInput(); + } catch (AnalysisEngineProcessException e) { + throw new IOException(e); + } finalOffset = correctOffset(cas.getDocumentText().length()); Type tokenType = cas.getTypeSystem().getType(tokenTypeString); iterator = cas.getAnnotationIndex(tokenType).iterator(); @@ -57,11 +61,7 @@ public final class UIMAAnnotationsTokenizer extends BaseUIMATokenizer { @Override public boolean incrementToken() throws IOException { if (iterator == null) { - try { - analyzeText(); - } catch (Exception e) { - throw new IOException(e); - } + initializeIterator(); } if (iterator.hasNext()) { clearAttributes(); diff --git a/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMATypeAwareAnnotationsTokenizer.java b/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMATypeAwareAnnotationsTokenizer.java index e2b0bba2158..1246274397f 100644 --- a/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMATypeAwareAnnotationsTokenizer.java +++ b/modules/analysis/uima/src/java/org/apache/lucene/analysis/uima/UIMATypeAwareAnnotationsTokenizer.java @@ -47,7 +47,7 @@ public final class UIMATypeAwareAnnotationsTokenizer extends BaseUIMATokenizer { private final String typeAttributeFeaturePath; private FeaturePath featurePath; - + private int finalOffset = 0; public UIMATypeAwareAnnotationsTokenizer(String descriptorPath, String tokenType, String typeAttributeFeaturePath, Reader input) { @@ -59,23 +59,29 @@ public final class UIMATypeAwareAnnotationsTokenizer extends BaseUIMATokenizer { this.typeAttributeFeaturePath = typeAttributeFeaturePath; } - private void analyzeText() throws IOException, AnalysisEngineProcessException, CASException { - analyzeInput(); + protected void initializeIterator() throws IOException { + try { + analyzeInput(); + } catch (AnalysisEngineProcessException e) { + throw new IOException(e); + } + featurePath = cas.createFeaturePath(); + try { + featurePath.initialize(typeAttributeFeaturePath); + } catch (CASException e) { + featurePath = null; + throw new IOException(e); + } finalOffset = correctOffset(cas.getDocumentText().length()); Type tokenType = cas.getTypeSystem().getType(tokenTypeString); iterator = cas.getAnnotationIndex(tokenType).iterator(); - featurePath = cas.createFeaturePath(); - featurePath.initialize(typeAttributeFeaturePath); + } @Override public boolean incrementToken() throws IOException { if (iterator == null) { - try { - analyzeText(); - } catch (Exception e) { - throw new IOException(e); - } + initializeIterator(); } if (iterator.hasNext()) { clearAttributes();