[LUCENE-3731] - refactored analyzeText method to initializeIterator and made it abstract inside BaseUIMATokenizer

git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1293614 13f79535-47bb-0310-9956-ffa450edef68
Tommaso Teofili 2012-02-25 14:14:00 +00:00
parent 2e015271c5
commit 482c0610fd
3 changed files with 35 additions and 22 deletions
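In short, the commit turns the per-subclass analyzeText() methods into a template-method contract on the base class: analyzeInput() still runs the UIMA AnalysisEngine and fills the CAS, the new abstract initializeIterator() lets each tokenizer set up the FSIterator it will consume, and the subclasses call it lazily from the first incrementToken() invocation, wrapping AnalysisEngineProcessException into IOException there. The sketch below only shows that shape; the class names, field types, the untyped getAnnotationIndex() call and other details not visible in the diff are illustrative assumptions, not the module's actual code.

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.text.AnnotationFS;

// sketch of the base-class contract after this commit (not the real BaseUIMATokenizer,
// which also builds the AnalysisEngine and CAS from a descriptor path)
abstract class SketchBaseUIMATokenizer extends Tokenizer {

  protected FSIterator<AnnotationFS> iterator; // assumed field type
  protected AnalysisEngine ae;
  protected CAS cas;

  protected SketchBaseUIMATokenizer(Reader input) {
    super(input);
  }

  // runs the AnalysisEngine over the whole input and fills the CAS
  protected void analyzeInput() throws AnalysisEngineProcessException, IOException {
    cas.reset();
    cas.setDocumentText(toString(input));
    ae.process(cas);
  }

  // new abstract hook: each subclass decides which annotation index it iterates over
  protected abstract void initializeIterator() throws IOException;

  private static String toString(Reader reader) throws IOException {
    StringBuilder stringBuilder = new StringBuilder();
    int ch;
    while ((ch = reader.read()) != -1) {
      stringBuilder.append((char) ch);
    }
    return stringBuilder.toString();
  }
}

// shape of a subclass after the refactoring: lazy initialization on the first
// incrementToken() call, with AnalysisEngineProcessException wrapped into IOException
final class SketchAnnotationsTokenizer extends SketchBaseUIMATokenizer {

  SketchAnnotationsTokenizer(Reader input) {
    super(input);
  }

  @Override
  protected void initializeIterator() throws IOException {
    try {
      analyzeInput();
    } catch (AnalysisEngineProcessException e) {
      throw new IOException(e);
    }
    iterator = cas.getAnnotationIndex().iterator(); // real subclasses pick a specific annotation Type
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (iterator == null) {
      initializeIterator(); // the analysis cost is paid once, on the first call
    }
    if (iterator.hasNext()) {
      clearAttributes();
      iterator.next(); // real subclasses copy the annotation's text and offsets into attributes here
      return true;
    }
    return false;
  }
}

Note how the checked AnalysisEngineProcessException no longer leaks into incrementToken(): each subclass translates it once inside initializeIterator(), which is why the old try/catch around analyzeText() disappears from incrementToken() in both tokenizers below.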

BaseUIMATokenizer.java

@@ -51,18 +51,25 @@ public abstract class BaseUIMATokenizer extends Tokenizer {
   /**
    * analyzes the tokenizer input using the given analysis engine
    *
    * <p/>
    * {@link #cas} will be filled with extracted metadata (UIMA annotations, feature structures)
    *
    * @throws AnalysisEngineProcessException
    * @throws IOException
    */
-  protected void analyzeInput() throws AnalysisEngineProcessException,IOException {
+  protected void analyzeInput() throws AnalysisEngineProcessException, IOException {
     cas.reset();
     cas.setDocumentText(toString(input));
     ae.process(cas);
   }
+
+  /**
+   * initialize the FSIterator which is used to build tokens at each incrementToken() method call
+   *
+   * @throws IOException
+   */
+  protected abstract void initializeIterator() throws IOException;
   private String toString(Reader reader) throws IOException {
     StringBuilder stringBuilder = new StringBuilder();
     int ch;
@@ -82,6 +89,6 @@ public abstract class BaseUIMATokenizer extends Tokenizer {
   public void end() throws IOException {
     iterator = null;
   }
 }

UIMAAnnotationsTokenizer.java

@@ -37,7 +37,7 @@ public final class UIMAAnnotationsTokenizer extends BaseUIMATokenizer {
   private final OffsetAttribute offsetAttr;
   private final String tokenTypeString;
   private int finalOffset = 0;
   public UIMAAnnotationsTokenizer(String descriptorPath, String tokenType, Reader input) {
@@ -47,8 +47,12 @@ public final class UIMAAnnotationsTokenizer extends BaseUIMATokenizer {
     this.offsetAttr = addAttribute(OffsetAttribute.class);
   }
-  private void analyzeText() throws IOException, AnalysisEngineProcessException {
-    analyzeInput();
+  protected void initializeIterator() throws IOException {
+    try {
+      analyzeInput();
+    } catch (AnalysisEngineProcessException e) {
+      throw new IOException(e);
+    }
     finalOffset = correctOffset(cas.getDocumentText().length());
     Type tokenType = cas.getTypeSystem().getType(tokenTypeString);
     iterator = cas.getAnnotationIndex(tokenType).iterator();
@@ -57,11 +61,7 @@ public final class UIMAAnnotationsTokenizer extends BaseUIMATokenizer {
   @Override
   public boolean incrementToken() throws IOException {
     if (iterator == null) {
-      try {
-        analyzeText();
-      } catch (Exception e) {
-        throw new IOException(e);
-      }
+      initializeIterator();
     }
     if (iterator.hasNext()) {
       clearAttributes();
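For reference, a UIMAAnnotationsTokenizer built by this code is consumed like any other TokenStream; the first incrementToken() call is where initializeIterator() runs the AnalysisEngine. A minimal usage sketch, assuming the class lives in org.apache.lucene.analysis.uima and using placeholder values for the descriptor path and annotation type name:

import java.io.StringReader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.uima.UIMAAnnotationsTokenizer;

public class UIMAAnnotationsTokenizerUsage {
  public static void main(String[] args) throws Exception {
    // descriptor path and annotation type are placeholders; point them at a real
    // UIMA AnalysisEngine descriptor on the classpath and the annotation type it produces
    Tokenizer tokenizer = new UIMAAnnotationsTokenizer(
        "/SomeAnalysisEngineDescriptor.xml",
        "org.apache.uima.SomeTokenAnnotation",
        new StringReader("the big brown fox jumped on the wood"));

    CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = tokenizer.addAttribute(OffsetAttribute.class);

    // standard TokenStream consumer workflow; analysis happens lazily on the first incrementToken()
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
      System.out.println(termAtt.toString()
          + " [" + offsetAtt.startOffset() + "," + offsetAtt.endOffset() + "]");
    }
    tokenizer.end();
    tokenizer.close();
  }
}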

UIMATypeAwareAnnotationsTokenizer.java

@@ -47,7 +47,7 @@ public final class UIMATypeAwareAnnotationsTokenizer extends BaseUIMATokenizer {
   private final String typeAttributeFeaturePath;
   private FeaturePath featurePath;
   private int finalOffset = 0;
   public UIMATypeAwareAnnotationsTokenizer(String descriptorPath, String tokenType, String typeAttributeFeaturePath, Reader input) {
@@ -59,23 +59,29 @@ public final class UIMATypeAwareAnnotationsTokenizer extends BaseUIMATokenizer {
     this.typeAttributeFeaturePath = typeAttributeFeaturePath;
   }
-  private void analyzeText() throws IOException, AnalysisEngineProcessException, CASException {
-    analyzeInput();
+  protected void initializeIterator() throws IOException {
+    try {
+      analyzeInput();
+    } catch (AnalysisEngineProcessException e) {
+      throw new IOException(e);
+    }
+    featurePath = cas.createFeaturePath();
+    try {
+      featurePath.initialize(typeAttributeFeaturePath);
+    } catch (CASException e) {
+      featurePath = null;
+      throw new IOException(e);
+    }
     finalOffset = correctOffset(cas.getDocumentText().length());
     Type tokenType = cas.getTypeSystem().getType(tokenTypeString);
     iterator = cas.getAnnotationIndex(tokenType).iterator();
-    featurePath = cas.createFeaturePath();
-    featurePath.initialize(typeAttributeFeaturePath);
   }
   @Override
   public boolean incrementToken() throws IOException {
     if (iterator == null) {
-      try {
-        analyzeText();
-      } catch (Exception e) {
-        throw new IOException(e);
-      }
+      initializeIterator();
     }
     if (iterator.hasNext()) {
       clearAttributes();
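The type-aware variant is consumed the same way, with one extra constructor argument: the UIMA feature path whose value is read for each matched annotation (presumably surfaced through the token's TypeAttribute, given the class name). A hedged usage sketch with placeholder descriptor, annotation type and feature path:

import java.io.StringReader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.analysis.uima.UIMATypeAwareAnnotationsTokenizer;

public class UIMATypeAwareAnnotationsTokenizerUsage {
  public static void main(String[] args) throws Exception {
    // all three string arguments are placeholders: a UIMA descriptor on the classpath,
    // the annotation type to tokenize on, and the feature path whose value becomes the token type
    Tokenizer tokenizer = new UIMATypeAwareAnnotationsTokenizer(
        "/SomeAnalysisEngineDescriptor.xml",
        "org.apache.uima.SomeTokenAnnotation",
        "posTag",                                   // hypothetical feature path
        new StringReader("the big brown fox jumped on the wood"));

    CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
    TypeAttribute typeAtt = tokenizer.addAttribute(TypeAttribute.class);

    tokenizer.reset();
    while (tokenizer.incrementToken()) {
      // the feature path value is assumed to be exposed as the token type
      System.out.println(termAtt.toString() + " / " + typeAtt.type());
    }
    tokenizer.end();
    tokenizer.close();
  }
}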