diff --git a/CHANGES.txt b/CHANGES.txt
index 667cc632080..849f45e3504 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -25,6 +25,15 @@ API Changes
and deprecate FSDirectory.getDirectory(). FSDirectory instances
are not required to be singletons per path. (yonik)
+4. LUCENE-1422: New TokenStream API that uses a new class called
+ AttributeSource instead of the now deprecated Token class. All attributes
+ that the Token class had have been moved into separate classes:
+ TermAttribute, OffsetAttribute, PositionIncrementAttribute,
+ PayloadAttribute, TypeAttribute and FlagsAttribute. The new API
+ is much more flexible; it allows combining Attributes arbitrarily and
+ also defining custom Attributes. The new API has the same performance
+ as the old next(Token) approach. (Michael Busch)
+
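
For orientation, a minimal sketch of the two consumption styles this entry refers to (illustrative only; `stream` stands for any TokenStream, and a given chain uses one style or the other, not both):

    // Old, now deprecated style: the stream fills a reusable Token.
    final Token reusableToken = new Token();
    for (Token t = stream.next(reusableToken); t != null; t = stream.next(reusableToken)) {
      System.out.println(t.term());
    }

    // New style: Attributes are registered once and updated in place by incrementToken().
    TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
    while (stream.incrementToken()) {
      System.out.println(termAtt.term());
    }
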
Bug fixes
1. LUCENE-1415: MultiPhraseQuery has incorrect hashCode() and equals()
diff --git a/src/java/org/apache/lucene/analysis/CachingTokenFilter.java b/src/java/org/apache/lucene/analysis/CachingTokenFilter.java
index d91074a2653..3a4ab989fa5 100644
--- a/src/java/org/apache/lucene/analysis/CachingTokenFilter.java
+++ b/src/java/org/apache/lucene/analysis/CachingTokenFilter.java
@@ -22,6 +22,8 @@ import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
+import org.apache.lucene.util.AttributeSource;
+
/**
* This class can be used if the Tokens of a TokenStream
* are intended to be consumed more than once. It caches
@@ -34,12 +36,31 @@ import java.util.List;
*/
public class CachingTokenFilter extends TokenFilter {
private List cache;
- private Iterator iterator;
+ private Iterator iterator;
public CachingTokenFilter(TokenStream input) {
super(input);
}
+ public boolean incrementToken() throws IOException {
+ if (cache == null) {
+ // fill cache lazily
+ cache = new LinkedList();
+ fillCache();
+ iterator = cache.iterator();
+ }
+
+ if (!iterator.hasNext()) {
+ // the cache is exhausted, return false
+ return false;
+ }
+ // Since the TokenFilter can be reset, the tokens need to be preserved as immutable.
+ AttributeSource state = (AttributeSource) iterator.next();
+ state.restoreState(this);
+ return true;
+ }
+
+ /** @deprecated */
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
if (cache == null) {
@@ -60,10 +81,17 @@ public class CachingTokenFilter extends TokenFilter {
public void reset() throws IOException {
if(cache != null) {
- iterator = cache.iterator();
+ iterator = cache.iterator();
}
}
+ private void fillCache() throws IOException {
+ while(input.incrementToken()) {
+ cache.add(captureState());
+ }
+ }
+
+ /** @deprecated */
private void fillCache(final Token reusableToken) throws IOException {
for (Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken)) {
cache.add(nextToken.clone());
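
A hypothetical usage sketch of the new-API path above (not part of the patch; `analyzer` stands for any Analyzer whose chain uses the new API): the first pass fills the cache lazily, reset() rewinds, and the second pass replays the captured states.

    TokenStream input = analyzer.tokenStream("body", new StringReader("some text goes here"));
    CachingTokenFilter cached = new CachingTokenFilter(input);
    TermAttribute termAtt = (TermAttribute) cached.addAttribute(TermAttribute.class);

    while (cached.incrementToken()) {     // first pass: consumes the wrapped stream and caches states
      System.out.println(termAtt.term());
    }
    cached.reset();                       // rewind to the beginning of the cache
    while (cached.incrementToken()) {     // second pass: served entirely from the cache
      System.out.println(termAtt.term());
    }
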
diff --git a/src/java/org/apache/lucene/analysis/CharTokenizer.java b/src/java/org/apache/lucene/analysis/CharTokenizer.java
index d4356651fdf..5d090e71640 100644
--- a/src/java/org/apache/lucene/analysis/CharTokenizer.java
+++ b/src/java/org/apache/lucene/analysis/CharTokenizer.java
@@ -20,16 +20,24 @@ package org.apache.lucene.analysis;
import java.io.IOException;
import java.io.Reader;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
/** An abstract base class for simple, character-oriented tokenizers.*/
public abstract class CharTokenizer extends Tokenizer {
public CharTokenizer(Reader input) {
super(input);
+ offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
private int offset = 0, bufferIndex = 0, dataLen = 0;
private static final int MAX_WORD_LEN = 255;
private static final int IO_BUFFER_SIZE = 4096;
private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
+
+ private TermAttribute termAtt;
+ private OffsetAttribute offsetAtt;
/** Returns true iff a character should be included in a token. This
* tokenizer generates as tokens adjacent sequences of characters which
@@ -44,6 +52,50 @@ public abstract class CharTokenizer extends Tokenizer {
return c;
}
+ public final boolean incrementToken() throws IOException {
+ clearAttributes();
+ int length = 0;
+ int start = bufferIndex;
+ char[] buffer = termAtt.termBuffer();
+ while (true) {
+
+ if (bufferIndex >= dataLen) {
+ offset += dataLen;
+ dataLen = input.read(ioBuffer);
+ if (dataLen == -1) {
+ if (length > 0)
+ break;
+ else
+ return false;
+ }
+ bufferIndex = 0;
+ }
+
+ final char c = ioBuffer[bufferIndex++];
+
+ if (isTokenChar(c)) { // if it's a token char
+
+ if (length == 0) // start of token
+ start = offset + bufferIndex - 1;
+ else if (length == buffer.length)
+ buffer = termAtt.resizeTermBuffer(1+length);
+
+ buffer[length++] = normalize(c); // buffer it, normalized
+
+ if (length == MAX_WORD_LEN) // buffer overflow!
+ break;
+
+ } else if (length > 0) // at non-Letter w/ chars
+ break; // return 'em
+ }
+
+ termAtt.setTermLength(length);
+ offsetAtt.setStartOffset(start);
+ offsetAtt.setEndOffset(start+length);
+ return true;
+ }
+
+ /** @deprecated */
public final Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
reusableToken.clear();
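
For comparison with the loop above, a sketch of how a concrete subclass plugs into CharTokenizer (illustrative; the class name is made up, but it mirrors Lucene's LetterTokenizer/LowerCaseTokenizer): only isTokenChar() and normalize() need to be supplied, the incrementToken() machinery is inherited.

    import java.io.Reader;

    public class LowerCaseLetterTokenizer extends CharTokenizer {
      public LowerCaseLetterTokenizer(Reader input) {
        super(input);
      }

      // a character belongs to a token iff it is a letter
      protected boolean isTokenChar(char c) {
        return Character.isLetter(c);
      }

      // lower-case each character before it is buffered into the TermAttribute
      protected char normalize(char c) {
        return Character.toLowerCase(c);
      }
    }
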
diff --git a/src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java b/src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java
index 8f10e984702..3a5a1170ece 100644
--- a/src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java
+++ b/src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java
@@ -1,5 +1,7 @@
package org.apache.lucene.analysis;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -27,11 +29,33 @@ package org.apache.lucene.analysis;
public class ISOLatin1AccentFilter extends TokenFilter {
public ISOLatin1AccentFilter(TokenStream input) {
super(input);
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
private char[] output = new char[256];
private int outputPos;
-
+ private TermAttribute termAtt;
+
+ public final boolean incrementToken() throws java.io.IOException {
+ if (input.incrementToken()) {
+ final char[] buffer = termAtt.termBuffer();
+ final int length = termAtt.termLength();
+ // If no characters actually require rewriting then we
+ // just return token as-is:
+    for(int i=0;i<length;i++) {
+ A Token is an occurrence of a term from the text of a field. It consists of
a term's text, the start and end offset of the term in the text of the field,
and a type string.
@@ -114,6 +118,8 @@ import org.apache.lucene.util.ArrayUtil;
This is an abstract class.
- NOTE: subclasses must override {@link #next(Token)}. It's
- also OK to instead override {@link #next()} but that
- method is now deprecated in favor of {@link #next(Token)}.
+ NOTE: subclasses must override
+ {@link #incrementToken()} if the new TokenStream API is used
+ and {@link #next(Token)} or {@link #next()} if the old
+ TokenStream API is used.
+ *
+ * WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental.
+ * The APIs introduced in these classes with Lucene 2.9 might change in the future.
+ * We will make our best efforts to keep the APIs backwards-compatible.
+
+ See {@link TokenStream}
*/
public abstract class TokenFilter extends TokenStream {
/** The source of tokens for this filter. */
@@ -32,9 +39,10 @@ public abstract class TokenFilter extends TokenStream {
/** Construct a token stream filtering the given input. */
protected TokenFilter(TokenStream input) {
+ super(input);
this.input = input;
}
-
+
/** Close the input TokenStream. */
public void close() throws IOException {
input.close();
@@ -45,4 +53,17 @@ public abstract class TokenFilter extends TokenStream {
super.reset();
input.reset();
}
+
+ public boolean useNewAPI() {
+ return input.useNewAPI();
+ }
+
+ /**
+ * Sets whether or not to use the new TokenStream API. Setting this
+ * will apply to this Filter and all TokenStreams/Filters upstream.
+ */
+ public void setUseNewAPI(boolean use) {
+ input.setUseNewAPI(use);
+ }
+
}
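
A short sketch of what the delegation above implies (illustrative, not part of the patch; `reader` stands for any java.io.Reader): calling setUseNewAPI() on the outermost filter switches the whole chain, because each filter forwards the call to its input.

    TokenStream chain = new LengthFilter(new WhitespaceTokenizer(reader), 3, Integer.MAX_VALUE);

    chain.setUseNewAPI(true);           // forwarded upstream until it reaches the WhitespaceTokenizer
    boolean newApi = chain.useNewAPI(); // likewise answered by the underlying tokenizer
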
diff --git a/src/java/org/apache/lucene/analysis/TokenStream.java b/src/java/org/apache/lucene/analysis/TokenStream.java
index 604f4a27cd5..6a9161e8ae3 100644
--- a/src/java/org/apache/lucene/analysis/TokenStream.java
+++ b/src/java/org/apache/lucene/analysis/TokenStream.java
@@ -17,9 +17,12 @@ package org.apache.lucene.analysis;
* limitations under the License.
*/
-import org.apache.lucene.index.Payload;
-
import java.io.IOException;
+import java.util.Iterator;
+
+import org.apache.lucene.index.Payload;
+import org.apache.lucene.util.Attribute;
+import org.apache.lucene.util.AttributeSource;
/** A TokenStream enumerates the sequence of tokens, either from
fields of a document or from query text.
@@ -31,13 +34,140 @@ import java.io.IOException;
+ For that reason TokenStream extends {@link AttributeSource}
+ now. Note that only one instance per {@link Attribute} is
+ created and reused for every token. This approach reduces
+ object creations and allows local caching of references to
+ the {@link Attribute}s. See {@link #incrementToken()} for further details.
+
+ The workflow of the new TokenStream API is as follows:
+
+ Sometimes it is desirable to capture a current state of a
+ TokenStream, e. g. for buffering purposes (see {@link CachingTokenFilter},
+ {@link TeeTokenFilter}/{@link SinkTokenizer}). For this use case
+ {@link AttributeSource#captureState()} and {@link AttributeSource#restoreState(AttributeSource)} can be used.
+
+ NOTE: In order to enable the new API the method
+ {@link #useNewAPI()} has to be called with useNewAPI=true.
+ Otherwise the deprecated method {@link #next(Token)} will
+ be used by Lucene consumers (indexer and queryparser) to
+ consume the tokens. {@link #next(Token)} will be removed
+ in Lucene 3.0.
+
+ NOTE: To use the old API subclasses must override {@link #next(Token)}.
+ It's also OK to instead override {@link #next()} but that
+ method is slower compared to {@link #next(Token)}.
+ *
+ * WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental.
+ * The APIs introduced in these classes with Lucene 2.9 might change in the future.
+ * We will make our best efforts to keep the APIs backwards-compatible.
*/
-public abstract class TokenStream {
+public abstract class TokenStream extends AttributeSource {
+ private static boolean useNewAPIDefault = false;
+ private boolean useNewAPI = useNewAPIDefault;
+
+ protected TokenStream() {
+ super();
+ }
+
+ protected TokenStream(AttributeSource input) {
+ super(input);
+ }
+ /**
+ * Returns whether or not the new TokenStream APIs are used
+ * by default.
+ * (see {@link #incrementToken()}, {@link AttributeSource}).
+ */
+ public static boolean useNewAPIDefault() {
+ return useNewAPIDefault;
+ }
+
+ /**
+ * Use this API to enable or disable the new TokenStream API by
+ * default. Can be overridden by calling {@link #setUseNewAPI(boolean)}.
+ * (see {@link #incrementToken()}, {@link AttributeSource}).
+ *
+ * If set to true, the indexer will call {@link #incrementToken()}
+ * to consume Tokens from this stream.
+ *
+ * If set to false, the indexer will call {@link #next(Token)}
+ * instead.
+ */
+ public static void setUseNewAPIDefault(boolean use) {
+ useNewAPIDefault = use;
+ }
+
+ /**
+ * Returns whether or not the new TokenStream APIs are used
+ * for this stream.
+ * (see {@link #incrementToken()}, {@link AttributeSource}).
+ */
+ public boolean useNewAPI() {
+ return useNewAPI;
+ }
+
+ /**
+ * Use this API to enable or disable the new TokenStream API
+ * for this stream. Overrides {@link #setUseNewAPIDefault(boolean)}.
+ * (see {@link #incrementToken()}, {@link AttributeSource}).
+ *
+ * If set to true, the indexer will call {@link #incrementToken()}
+ * to consume Tokens from this stream.
+ *
+ * If set to false, the indexer will call {@link #next(Token)}
+ * instead.
+ *
+ * NOTE: All streams and filters in one chain must use the
+ * same API.
+ */
+ public void setUseNewAPI(boolean use) {
+ useNewAPI = use;
+ }
+
+ /**
+ * Consumers (e. g. the indexer) use this method to advance the stream
+ * to the next token. Subclasses must implement this method
+ * and update the appropriate {@link Attribute}s with the content of the
+ * next token.
+ *
+ * This method is called for every token of a document, so an efficient
+ * implementation is crucial for good performance. To avoid calls to
+ * {@link #addAttribute(Class)} and {@link #getAttribute(Class)} and
+ * downcasts, references to all {@link Attribute}s that this stream uses
+ * should be retrieved during instantiation.
+ *
+ * To make sure that filters and consumers know which attributes are available
+ * the attributes must be added during instantiation. Filters and
+ * consumers are not required to check for availability of attributes in {@link #incrementToken()}.
+ *
+ * @return false for end of stream; true otherwise
+ *
+ *
+ * Note that this method will be defined abstract in Lucene 3.0.
+ */
+ public boolean incrementToken() throws IOException {
+ // subclasses must implement this method; will be made abstract in Lucene 3.0
+ return false;
+ }
+
/** Returns the next token in the stream, or null at EOS.
* @deprecated The returned Token is a "full private copy" (not
* re-used across calls to next()) but will be slower
@@ -84,6 +214,8 @@ public abstract class TokenStream {
* is not required to check for null before using it, but it is a
* good idea to assert that it is not null.)
* @return next token in the stream or null if end-of-stream was hit
+ * @deprecated The new {@link #incrementToken()} and {@link AttributeSource}
+ * APIs should be used instead. See also {@link #useNewAPI()}.
*/
public Token next(final Token reusableToken) throws IOException {
// We don't actually use inputToken, but still add this assert
@@ -107,4 +239,25 @@ public abstract class TokenStream {
/** Releases resources associated with this stream. */
public void close() throws IOException {}
+
+ public String toString() {
+ StringBuffer sb = new StringBuffer();
+ sb.append('(');
+
+ if (hasAttributes()) {
+ // TODO Java 1.5
+ //Iterator
This is an abstract class.
- NOTE: subclasses must override {@link #next(Token)}. It's
- also OK to instead override {@link #next()} but that
- method is now deprecated in favor of {@link #next(Token)}.
+ NOTE: In order to enable the new API the method
+ {@link #useNewAPI()} has to be called with useNewAPI=true.
+ Otherwise the deprecated method {@link #next(Token)} will
+ be used by Lucene consumers (indexer and queryparser) to
+ consume the tokens. {@link #next(Token)} will be removed
+ in Lucene 3.0.
+ NOTE: To use the old API subclasses must override {@link #next(Token)}.
+ It's also OK to instead override {@link #next()} but that
+ method is slower compared to {@link #next(Token)}.
+
NOTE: subclasses overriding {@link #next(Token)} must
call {@link Token#clear()}.
+ *
+ * WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental.
+ * The APIs introduced in these classes with Lucene 2.9 might change in the future.
+ * We will make our best efforts to keep the APIs backwards-compatible.
*/
public abstract class Tokenizer extends TokenStream {
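
As a sketch of the new contract for tokenizers (illustrative, not part of the patch; the class name is made up), a minimal Tokenizer that emits the entire input as one token, roughly in the spirit of Lucene's KeywordTokenizer. It only uses TermAttribute methods that already appear in this patch (termBuffer(), resizeTermBuffer(), setTermLength()).

    import java.io.IOException;
    import java.io.Reader;

    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public final class WholeInputTokenizer extends Tokenizer {
      private final TermAttribute termAtt;
      private boolean done = false;

      public WholeInputTokenizer(Reader in) {
        super(in);
        termAtt = (TermAttribute) addAttribute(TermAttribute.class);
      }

      public boolean incrementToken() throws IOException {
        if (done) {
          return false;
        }
        done = true;
        clearAttributes();
        int length = 0;
        char[] buffer = termAtt.termBuffer();
        int read;
        // read the whole input into the term buffer, growing it as needed
        while ((read = input.read(buffer, length, buffer.length - length)) != -1) {
          length += read;
          if (length == buffer.length) {
            buffer = termAtt.resizeTermBuffer(1 + length);
          }
        }
        termAtt.setTermLength(length);
        return length > 0;
      }
    }
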
diff --git a/src/java/org/apache/lucene/analysis/package.html b/src/java/org/apache/lucene/analysis/package.html
index d5fe91c0041..3ae9fb0f627 100644
--- a/src/java/org/apache/lucene/analysis/package.html
+++ b/src/java/org/apache/lucene/analysis/package.html
@@ -35,8 +35,7 @@ application using Lucene to use an appropriate Parser to convert the orig
Plain text passed to Lucene for indexing goes through a process generally called tokenization – namely breaking of the
-input text into small indexing elements –
-{@link org.apache.lucene.analysis.Token Tokens}.
+input text into small indexing elements – tokens.
The way input text is broken into tokens very
much dictates further capabilities of search upon that text.
For instance, sentences beginnings and endings can be identified to provide for more accurate phrase
@@ -72,12 +71,13 @@ providing for several functions, including (but not limited to):
@@ -140,9 +140,8 @@ providing for several functions, including (but not limited to):
The following sections discuss some aspects of implementing your own analyzer.
When {@link org.apache.lucene.document.Document#add(org.apache.lucene.document.Fieldable) document.add(field)}
is called multiple times for the same field name, we could say that each such call creates a new
@@ -208,10 +207,10 @@ the source code of any one of the many samples located in this package.
};
By default, all tokens created by Analyzers and Tokenizers have a
- {@link org.apache.lucene.analysis.Token#getPositionIncrement() position increment} of one.
+ {@link org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute#getPositionIncrement() position increment} of one.
This means that the position stored for that token in the index would be one more than
that of the previous token.
Recall that phrase and proximity searches rely on position info.
@@ -227,26 +226,29 @@ the source code of any one of the many samples located in this package.
If this behavior does not fit the application needs,
a modified analyzer can be used, that would increment further the positions of
tokens following a removed stop word, using
- {@link org.apache.lucene.analysis.Token#setPositionIncrement(int)}.
+ {@link org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute#setPositionIncrement(int)}.
This can be done with something like:
+
+ To make sure that filters and consumers know which attributes are available,
+ the attributes must be added during instantiation. Filters and
+ consumers are not required to check for availability of attributes in {@link #incrementToken()}.
+ Tokenization
Hints, Tips and Traps
Analyzer analyzer = new StandardAnalyzer(); // or any other analyzer
TokenStream ts = analyzer.tokenStream("myfield",new StringReader("some text goes here"));
- Token t = ts.next();
- while (t!=null) {
- System.out.println("token: "+t));
+ while (ts.incrementToken()) {
+      System.out.println("token: "+ts);
-      t = ts.next();
}
@@ -179,7 +178,7 @@ the source code of any one of the many samples located in this package.
 Field Section Boundaries
 Token Position Increments
public TokenStream tokenStream(final String fieldName, Reader reader) {
final TokenStream ts = someAnalyzer.tokenStream(fieldName, reader);
-    TokenStream res = new TokenStream() {
+    TokenStream res = new TokenStream(ts) {
- public Token next() throws IOException {
+ TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+
+ public boolean incrementToken() throws IOException {
int extraIncrement = 0;
while (true) {
- Token t = ts.next();
- if (t!=null) {
- if (stopWords.contains(t.termText())) {
+ boolean hasNext = ts.incrementToken();
+ if (hasNext) {
+ if (stopWords.contains(termAtt.term())) {
extraIncrement++; // filter this word
continue;
}
if (extraIncrement>0) {
- t.setPositionIncrement(t.getPositionIncrement()+extraIncrement);
+ posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement()+extraIncrement);
}
}
- return t;
+ return hasNext;
}
}
};
@@ -268,5 +270,336 @@ the source code of any one of the many samples located in this package.
same position as that token, and so would they be seen by phrase and proximity searches.
+With Lucene 2.9 we introduce a new TokenStream API. The old API used to produce Tokens. A Token
+has getter and setter methods for different properties like positionIncrement and termText.
+While this approach was sufficient for the default indexing format, it is not versatile enough for
+Flexible Indexing, a term which summarizes the effort of making the Lucene indexer pluggable and
+extensible for custom index formats.
+
+A fully customizable indexer means that users will be able to store custom data structures on disk. Therefore an API
+is necessary that can transport custom types of data from the documents to the indexer.
+
+Lucene now provides six Attributes out of the box, which replace the variables the Token class has:
+
+  TermAttribute                The term text of a token.
+  OffsetAttribute              The start and end offset of a token in characters.
+  PositionIncrementAttribute   See above for detailed information about position increment.
+  PayloadAttribute             The payload that a Token can optionally have.
+  TypeAttribute                The type of the token. Default is 'word'.
+  FlagsAttribute               Optional flags a token can have.
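
To make the mapping concrete, a small consumer sketch (illustrative; `analyzer` stands for any Analyzer built on the new API) that reads three of these Attributes side by side:

    TokenStream ts = analyzer.tokenStream("body", new StringReader("This is a demo"));
    TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
    OffsetAttribute offsetAtt = (OffsetAttribute) ts.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncrAtt =
        (PositionIncrementAttribute) ts.addAttribute(PositionIncrementAttribute.class);

    while (ts.incrementToken()) {
      System.out.println(termAtt.term()
          + " [" + offsetAtt.startOffset() + "," + offsetAtt.endOffset() + "]"
          + " +" + posIncrAtt.getPositionIncrement());
    }
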
+A TokenStream/-Filter adds the Attributes it needs with addAttribute(), which takes the Class of an Attribute
+as an argument and returns an instance. If an Attribute of the same type was previously added, then
+the already existing instance is returned; otherwise a new instance is created and returned. Therefore TokenStreams/-Filters
+can safely call addAttribute() with the same Attribute type multiple times.
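
A two-line sketch of the guarantee just described (illustrative; `stream` is any TokenStream):

    TermAttribute first = (TermAttribute) stream.addAttribute(TermAttribute.class);
    TermAttribute second = (TermAttribute) stream.addAttribute(TermAttribute.class);
    // addAttribute() is idempotent per Attribute type: both references point to
    // the same instance, so filters may re-add an Attribute without harm.
    assert first == second;
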
+The following example shows a simple Analyzer that performs whitespace tokenization and a consumer
+that uses the new API:
+
+public class MyAnalyzer extends Analyzer {
+
+  public TokenStream tokenStream(String fieldName, Reader reader) {
+    TokenStream stream = new WhitespaceTokenizer(reader);
+    return stream;
+  }
+
+  public static void main(String[] args) throws IOException {
+    // text to tokenize
+    final String text = "This is a demo of the new TokenStream API";
+
+    MyAnalyzer analyzer = new MyAnalyzer();
+    TokenStream stream = analyzer.tokenStream("field", new StringReader(text));
+
+    // get the TermAttribute from the TokenStream
+    TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class);
+
+    // print all tokens until stream is exhausted
+    while (stream.incrementToken()) {
+      System.out.println(termAtt.term());
+    }
+  }
+}
+
+In this example a simple whitespace tokenization is performed. In main() a loop consumes the stream and
+prints the term text of the tokens by accessing the TermAttribute that the WhitespaceTokenizer provides.
+Here is the output:
+
+This
+is
+a
+demo
+of
+the
+new
+TokenStream
+API
+
+Now the analyzer adds a LengthFilter to the chain; only the tokenStream() method needs to change:
+
+  public TokenStream tokenStream(String fieldName, Reader reader) {
+    TokenStream stream = new WhitespaceTokenizer(reader);
+    stream = new LengthFilter(stream, 3, Integer.MAX_VALUE);
+    return stream;
+  }
+
+Note how now only words with 3 or more characters are contained in the output:
+
+This
+demo
+the
+new
+TokenStream
+API
+
+Now let's take a look at how the LengthFilter is implemented (it is part of Lucene's core):
+
+public final class LengthFilter extends TokenFilter {
+
+  final int min;
+  final int max;
+
+  private TermAttribute termAtt;
+
+  /**
+   * Build a filter that removes words that are too long or too
+   * short from the text.
+   */
+  public LengthFilter(TokenStream in, int min, int max)
+  {
+    super(in);
+    this.min = min;
+    this.max = max;
+    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+  }
+
+  /**
+   * Returns the next input Token whose term() is the right len
+   */
+  public final boolean incrementToken() throws IOException
+  {
+    assert termAtt != null;
+    // return the first token whose length is in range
+    while (input.incrementToken()) {
+      int len = termAtt.termLength();
+      if (len >= min && len <= max) {
+        return true;
+      }
+      // note: else we ignore it but should we index each part of it?
+    }
+    // reached EOS -- return false
+    return false;
+  }
+}
+
+The TermAttribute is added in the constructor and stored in the instance variable termAtt.
+Remember that there can only be a single instance of TermAttribute in the chain, so in our example the
+addAttribute() call in LengthFilter returns the TermAttribute that the WhitespaceTokenizer already added. The tokens
+are retrieved from the input stream in the incrementToken() method. By looking at the term text
+in the TermAttribute the length of the term can be determined and too short or too long tokens are skipped.
+Note how incrementToken() can efficiently access the instance variable; no attribute lookup or downcasting
+is necessary. The same is true for the consumer, which can simply use local references to the Attributes.
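
For an Attribute that is not guaranteed to be present, for example payloads, a consumer might check availability once up front and then keep a local reference, along these lines (a sketch; it assumes an availability check such as hasAttribute(Class) on AttributeSource, as found in released Lucene versions):

    PayloadAttribute payloadAtt = null;
    if (stream.hasAttribute(PayloadAttribute.class)) {
      payloadAtt = (PayloadAttribute) stream.getAttribute(PayloadAttribute.class);
    }

    while (stream.incrementToken()) {
      Payload payload = (payloadAtt != null) ? payloadAtt.getPayload() : null;
      // ... use the term together with its optional payload ...
    }
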
+
+Now we write our own custom Attribute for part-of-speech tagging, the PartOfSpeechAttribute:
+
+  public static enum PartOfSpeech {
+    Noun, Verb, Adjective, Adverb, Pronoun, Preposition, Conjunction, Article, Unknown
+  }
+
+  public static final class PartOfSpeechAttribute extends Attribute {
+
+    private PartOfSpeech pos = PartOfSpeech.Unknown;
+
+    public void setPartOfSpeech(PartOfSpeech pos) {
+      this.pos = pos;
+    }
+
+    public PartOfSpeech getPartOfSpeech() {
+      return pos;
+    }
+
+    public void clear() {
+      pos = PartOfSpeech.Unknown;
+    }
+
+    public void copyTo(Attribute target) {
+      ((PartOfSpeechAttribute) target).pos = pos;
+    }
+
+    public boolean equals(Object other) {
+      if (other == this) {
+        return true;
+      }
+
+      if (other instanceof PartOfSpeechAttribute) {
+        return pos == ((PartOfSpeechAttribute) other).pos;
+      }
+
+      return false;
+    }
+
+    public int hashCode() {
+      return pos.ordinal();
+    }
+
+    public String toString() {
+      return "PartOfSpeech=" + pos;
+    }
+  }
+
+This is a simple Attribute that has only a single variable that stores the part-of-speech of a token. It extends the
+new Attribute class and therefore implements its abstract methods clear(), copyTo(), equals(), hashCode() and toString().
+
+Now we need a TokenFilter that can set this new PartOfSpeechAttribute for each token. In this example we show a very naive filter
+that tags every word with a leading upper-case letter as a 'Noun' and all other words as 'Unknown'.
+
+  public static class PartOfSpeechTaggingFilter extends TokenFilter {
+    PartOfSpeechAttribute posAtt;
+    TermAttribute termAtt;
+
+    protected PartOfSpeechTaggingFilter(TokenStream input) {
+      super(input);
+      posAtt = (PartOfSpeechAttribute) addAttribute(PartOfSpeechAttribute.class);
+      termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+    }
+
+    public boolean incrementToken() throws IOException {
+      if (!input.incrementToken()) {return false;}
+      posAtt.setPartOfSpeech(determinePOS(termAtt.termBuffer(), 0, termAtt.termLength()));
+      return true;
+    }
+
+    // determine the part of speech for the given term
+    protected PartOfSpeech determinePOS(char[] term, int offset, int length) {
+      // naive implementation that tags every uppercased word as noun
+      if (length > 0 && Character.isUpperCase(term[0])) {
+        return PartOfSpeech.Noun;
+      }
+      return PartOfSpeech.Unknown;
+    }
+  }
+
+Just like the LengthFilter, this new filter accesses the attributes it needs in the constructor and
+stores references in instance variables. Now we need to add the filter to the chain:
+
+  public TokenStream tokenStream(String fieldName, Reader reader) {
+    TokenStream stream = new WhitespaceTokenizer(reader);
+    stream = new LengthFilter(stream, 3, Integer.MAX_VALUE);
+    stream = new PartOfSpeechTaggingFilter(stream);
+    return stream;
+  }
+
+Now let's look at the output:
+
+This
+demo
+the
+new
+TokenStream
+API
+
+Apparently it hasn't changed, which shows that adding a custom attribute to a TokenStream/Filter chain does not
+affect any existing consumers, simply because they don't know the new Attribute. Now let's change the consumer
+to make use of the new PartOfSpeechAttribute and print it out:
+
+  public static void main(String[] args) throws IOException {
+    // text to tokenize
+    final String text = "This is a demo of the new TokenStream API";
+
+    MyAnalyzer analyzer = new MyAnalyzer();
+    TokenStream stream = analyzer.tokenStream("field", new StringReader(text));
+
+    // get the TermAttribute from the TokenStream
+    TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class);
+
+    // get the PartOfSpeechAttribute from the TokenStream
+    PartOfSpeechAttribute posAtt = (PartOfSpeechAttribute) stream.getAttribute(PartOfSpeechAttribute.class);
+
+    // print all tokens until stream is exhausted
+    while (stream.incrementToken()) {
+      System.out.println(termAtt.term() + ": " + posAtt.getPartOfSpeech());
+    }
+  }
+
+The change that was made is to get the PartOfSpeechAttribute from the TokenStream and print out its contents in
+the while loop that consumes the stream. Here is the new output:
+
+This: Noun
+demo: Unknown
+the: Unknown
+new: Unknown
+TokenStream: Noun
+API: Noun
+
+Each word is now followed by its assigned PartOfSpeech tag. Of course this is a naive
+part-of-speech tagging. The word 'This' should not even be tagged as a noun; it is only spelled capitalized because it
+is the first word of a sentence. This is a good opportunity for an exercise. To practice the usage of the new
+API the reader could now write an Attribute and TokenFilter that can specify for each word if it was the first token
+of a sentence or not. Then the PartOfSpeechTaggingFilter could make use of this knowledge and only tag capitalized words
+as nouns if they are not the first word of a sentence (we know, this is still not correct behavior, but it is a good exercise).
+As a small hint, this is how the new Attribute class could begin:
+
+  public class FirstTokenOfSentenceAttribute extends Attribute {
+
+    private boolean firstToken;
+
+    public void setFirstToken(boolean firstToken) {
+      this.firstToken = firstToken;
+    }
+
+    public boolean getFirstToken() {
+      return firstToken;
+    }
+
+    public void clear() {
+      firstToken = false;
+    }
+
+    ...
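
Purely as one possible way to complete the exercise (not prescribed by the text above), the remaining abstract methods named earlier (copyTo(), equals(), hashCode(), toString()) could be filled in following the same pattern as PartOfSpeechAttribute:

      public void copyTo(Attribute target) {
        ((FirstTokenOfSentenceAttribute) target).firstToken = firstToken;
      }

      public boolean equals(Object other) {
        if (other == this) {
          return true;
        }
        if (other instanceof FirstTokenOfSentenceAttribute) {
          return firstToken == ((FirstTokenOfSentenceAttribute) other).firstToken;
        }
        return false;
      }

      public int hashCode() {
        return firstToken ? 1 : 0;
      }

      public String toString() {
        return "firstToken=" + firstToken;
      }
    }
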