From 898cfe87cdb7dab5e754d777e5685ec31ff6527b Mon Sep 17 00:00:00 2001
From: Michael Busch
Date: Tue, 18 Nov 2008 23:41:49 +0000
Subject: [PATCH] LUCENE-1422: New TokenStream API that uses a new class called
AttributeSource instead of the now deprecated Token class. All attributes
that the Token class had have been moved into separate classes:
TermAttribute, OffsetAttribute, PositionIncrementAttribute, PayloadAttribute,
TypeAttribute and FlagsAttribute.
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@718798 13f79535-47bb-0310-9956-ffa450edef68
---
CHANGES.txt | 9 +
.../lucene/analysis/CachingTokenFilter.java | 32 +-
.../apache/lucene/analysis/CharTokenizer.java | 52 +++
.../analysis/ISOLatin1AccentFilter.java | 28 +-
.../lucene/analysis/KeywordTokenizer.java | 31 +-
.../apache/lucene/analysis/LengthFilter.java | 22 ++
.../lucene/analysis/LowerCaseFilter.java | 19 +
.../lucene/analysis/PorterStemFilter.java | 14 +
.../apache/lucene/analysis/SinkTokenizer.java | 26 +-
.../apache/lucene/analysis/StopFilter.java | 33 ++
.../lucene/analysis/TeeTokenFilter.java | 14 +
.../org/apache/lucene/analysis/Token.java | 8 +-
.../apache/lucene/analysis/TokenFilter.java | 29 +-
.../apache/lucene/analysis/TokenStream.java | 165 +++++++-
.../org/apache/lucene/analysis/Tokenizer.java | 17 +-
.../org/apache/lucene/analysis/package.html | 367 +++++++++++++++++-
.../analysis/standard/StandardFilter.java | 43 +-
.../analysis/standard/StandardTokenizer.java | 76 +++-
.../standard/StandardTokenizerImpl.java | 8 +
.../standard/StandardTokenizerImpl.jflex | 9 +
.../tokenattributes/FlagsAttribute.java | 86 ++++
.../tokenattributes/OffsetAttribute.java | 98 +++++
.../tokenattributes/PayloadAttribute.java | 109 ++++++
.../PositionIncrementAttribute.java | 106 +++++
.../tokenattributes/TermAttribute.java | 242 ++++++++++++
.../tokenattributes/TypeAttribute.java | 83 ++++
.../org/apache/lucene/index/DocInverter.java | 8 +-
.../lucene/index/DocInverterPerField.java | 45 ++-
.../lucene/index/DocInverterPerThread.java | 96 +++++
.../apache/lucene/index/FieldInvertState.java | 7 +
.../index/FreqProxTermsWriterPerField.java | 34 +-
.../index/InvertedDocConsumerPerField.java | 9 +-
src/java/org/apache/lucene/index/Payload.java | 3 +-
.../index/TermVectorsTermsWriterPerField.java | 32 +-
.../index/TermsHashConsumerPerField.java | 9 +-
.../lucene/index/TermsHashPerField.java | 35 +-
.../lucene/queryParser/QueryParser.java | 202 ++++++++--
.../apache/lucene/queryParser/QueryParser.jj | 194 +++++++--
.../queryParser/QueryParserTokenManager.java | 5 +-
.../apache/lucene/search/QueryTermVector.java | 15 +-
.../org/apache/lucene/util/Attribute.java | 95 +++++
.../apache/lucene/util/AttributeSource.java | 274 +++++++++++++
src/test/org/apache/lucene/AnalysisTest.java | 30 +-
.../lucene/analysis/TeeSinkTokenTest.java | 126 +++---
.../apache/lucene/analysis/TestAnalyzers.java | 55 +--
.../analysis/TestCachingTokenFilter.java | 22 +-
.../analysis/TestISOLatin1AccentFilter.java | 160 ++++----
.../lucene/analysis/TestKeywordAnalyzer.java | 9 +-
.../lucene/analysis/TestLengthFilter.java | 15 +-
.../analysis/TestPerFieldAnalzyerWrapper.java | 16 +-
.../lucene/analysis/TestStandardAnalyzer.java | 24 +-
.../lucene/analysis/TestStopAnalyzer.java | 29 +-
.../lucene/analysis/TestStopFilter.java | 42 +-
.../org/apache/lucene/analysis/TestToken.java | 1 +
.../lucene/index/TestDocumentWriter.java | 52 ++-
.../apache/lucene/index/TestIndexWriter.java | 90 +++--
.../lucene/index/TestMultiLevelSkipList.java | 21 +-
.../org/apache/lucene/index/TestPayloads.java | 39 +-
.../lucene/index/TestTermVectorsReader.java | 48 ++-
.../apache/lucene/index/TestTermdocPerf.java | 32 +-
.../lucene/queryParser/TestMultiAnalyzer.java | 77 ++--
.../TestMultiFieldQueryParser.java | 1 -
.../lucene/queryParser/TestQueryParser.java | 38 +-
.../lucene/search/TestPositionIncrement.java | 37 +-
.../apache/lucene/search/TestRangeQuery.java | 18 +-
.../lucene/search/payloads/PayloadHelper.java | 21 +-
.../payloads/TestBoostingTermQuery.java | 27 +-
.../lucene/search/spans/TestPayloadSpans.java | 33 +-
.../apache/lucene/util/LuceneTestCase.java | 2 +
69 files changed, 3226 insertions(+), 628 deletions(-)
create mode 100644 src/java/org/apache/lucene/analysis/tokenattributes/FlagsAttribute.java
create mode 100644 src/java/org/apache/lucene/analysis/tokenattributes/OffsetAttribute.java
create mode 100644 src/java/org/apache/lucene/analysis/tokenattributes/PayloadAttribute.java
create mode 100644 src/java/org/apache/lucene/analysis/tokenattributes/PositionIncrementAttribute.java
create mode 100644 src/java/org/apache/lucene/analysis/tokenattributes/TermAttribute.java
create mode 100644 src/java/org/apache/lucene/analysis/tokenattributes/TypeAttribute.java
create mode 100644 src/java/org/apache/lucene/util/Attribute.java
create mode 100644 src/java/org/apache/lucene/util/AttributeSource.java
diff --git a/CHANGES.txt b/CHANGES.txt
index 667cc632080..849f45e3504 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -25,6 +25,15 @@ API Changes
and deprecate FSDirectory.getDirectory(). FSDirectory instances
are not required to be singletons per path. (yonik)
+4. LUCENE-1422: New TokenStream API that uses a new class called
+ AttributeSource instead of the now deprecated Token class. All attributes
+ that the Token class had have been moved into separate classes:
+ TermAttribute, OffsetAttribute, PositionIncrementAttribute,
+ PayloadAttribute, TypeAttribute and FlagsAttribute. The new API
+ is much more flexible; it allows Attributes to be combined arbitrarily
+ and custom Attributes to be defined. The new API has the same performance
+ as the old next(Token) approach. (Michael Busch)
+
Bug fixes
1. LUCENE-1415: MultiPhraseQuery has incorrect hashCode() and equals()
diff --git a/src/java/org/apache/lucene/analysis/CachingTokenFilter.java b/src/java/org/apache/lucene/analysis/CachingTokenFilter.java
index d91074a2653..3a4ab989fa5 100644
--- a/src/java/org/apache/lucene/analysis/CachingTokenFilter.java
+++ b/src/java/org/apache/lucene/analysis/CachingTokenFilter.java
@@ -22,6 +22,8 @@ import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
+import org.apache.lucene.util.AttributeSource;
+
/**
* This class can be used if the Tokens of a TokenStream
* are intended to be consumed more than once. It caches
@@ -34,12 +36,31 @@ import java.util.List;
*/
public class CachingTokenFilter extends TokenFilter {
private List cache;
- private Iterator iterator;
+ private Iterator iterator;
public CachingTokenFilter(TokenStream input) {
super(input);
}
+ /**
+ * Advances this stream to the next cached token state. On the first call
+ * the cache is created and filled via fillCache(); every call then restores
+ * the next cached state into this stream.
+ *
+ * @return true if a cached state was restored, false once the cache is exhausted
+ */
+ public boolean incrementToken() throws IOException {
+ if (cache == null) {
+ // fill cache lazily
+ cache = new LinkedList();
+ fillCache();
+ iterator = cache.iterator();
+ }
+
+ if (!iterator.hasNext()) {
+ // the cache is exhausted, signal end of stream
+ return false;
+ }
+ // Since the TokenFilter can be reset, the cached states must not be modified;
+ // the state is restored into this stream instead of being handed out.
+ AttributeSource state = (AttributeSource) iterator.next();
+ state.restoreState(this);
+ return true;
+ }
+
+ /** @deprecated */
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
if (cache == null) {
@@ -60,10 +81,17 @@ public class CachingTokenFilter extends TokenFilter {
public void reset() throws IOException {
if(cache != null) {
- iterator = cache.iterator();
+ iterator = cache.iterator();
}
}
+ private void fillCache() throws IOException {
+ while(input.incrementToken()) {
+ cache.add(captureState());
+ }
+ }
+
+ /** @deprecated */
private void fillCache(final Token reusableToken) throws IOException {
for (Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken)) {
cache.add(nextToken.clone());
diff --git a/src/java/org/apache/lucene/analysis/CharTokenizer.java b/src/java/org/apache/lucene/analysis/CharTokenizer.java
index d4356651fdf..5d090e71640 100644
--- a/src/java/org/apache/lucene/analysis/CharTokenizer.java
+++ b/src/java/org/apache/lucene/analysis/CharTokenizer.java
@@ -20,16 +20,24 @@ package org.apache.lucene.analysis;
import java.io.IOException;
import java.io.Reader;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
/** An abstract base class for simple, character-oriented tokenizers.*/
public abstract class CharTokenizer extends Tokenizer {
public CharTokenizer(Reader input) {
super(input);
+ offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
private int offset = 0, bufferIndex = 0, dataLen = 0;
private static final int MAX_WORD_LEN = 255;
private static final int IO_BUFFER_SIZE = 4096;
private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
+
+ private TermAttribute termAtt;
+ private OffsetAttribute offsetAtt;
/** Returns true iff a character should be included in a token. This
* tokenizer generates as tokens adjacent sequences of characters which
@@ -44,6 +52,50 @@ public abstract class CharTokenizer extends Tokenizer {
return c;
}
+ public final boolean incrementToken() throws IOException {
+ clearAttributes();
+ int length = 0;
+ int start = bufferIndex;
+ char[] buffer = termAtt.termBuffer();
+ while (true) {
+
+ if (bufferIndex >= dataLen) {
+ offset += dataLen;
+ dataLen = input.read(ioBuffer);
+ if (dataLen == -1) {
+ if (length > 0)
+ break;
+ else
+ return false;
+ }
+ bufferIndex = 0;
+ }
+
+ final char c = ioBuffer[bufferIndex++];
+
+ if (isTokenChar(c)) { // if it's a token char
+
+ if (length == 0) // start of token
+ start = offset + bufferIndex - 1;
+ else if (length == buffer.length)
+ buffer = termAtt.resizeTermBuffer(1+length);
+
+ buffer[length++] = normalize(c); // buffer it, normalized
+
+ if (length == MAX_WORD_LEN) // buffer overflow!
+ break;
+
+ } else if (length > 0) // at non-Letter w/ chars
+ break; // return 'em
+ }
+
+ termAtt.setTermLength(length);
+ offsetAtt.setStartOffset(start);
+ offsetAtt.setEndOffset(start+length);
+ return true;
+ }
+
+ /** @deprecated */
public final Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
reusableToken.clear();
diff --git a/src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java b/src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java
index 8f10e984702..3a5a1170ece 100644
--- a/src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java
+++ b/src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java
@@ -1,5 +1,7 @@
package org.apache.lucene.analysis;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
@@ -27,11 +29,33 @@ package org.apache.lucene.analysis;
public class ISOLatin1AccentFilter extends TokenFilter {
public ISOLatin1AccentFilter(TokenStream input) {
super(input);
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
private char[] output = new char[256];
private int outputPos;
-
+ private TermAttribute termAtt;
+
+ /**
+ * Advances the input stream and, if the current term contains at least one
+ * character in the range U+00C0..U+FB06, replaces the term with the output
+ * of removeAccents(). Terms without such characters are passed through
+ * unchanged.
+ *
+ * @return true if the input produced a token, false at end of stream
+ */
+ public final boolean incrementToken() throws java.io.IOException {
+ if (input.incrementToken()) {
+ final char[] buffer = termAtt.termBuffer();
+ final int length = termAtt.termLength();
+ // If no characters actually require rewriting then we
+ // just return token as-is:
+ for(int i=0;i<length;i++) {
+ final char c = buffer[i];
+ if (c >= '\u00c0' && c <= '\uFB06') {
+ // first candidate found: rewrite the whole term once, then stop scanning
+ removeAccents(buffer, length);
+ termAtt.setTermBuffer(output, 0, outputPos);
+ break;
+ }
+ }
+ return true;
+ } else
+ return false;
+ }
+
+ /** @deprecated */
public final Token next(final Token reusableToken) throws java.io.IOException {
assert reusableToken != null;
Token nextToken = input.next(reusableToken);
@@ -241,7 +265,7 @@ public class ISOLatin1AccentFilter extends TokenFilter {
case '\uFB06': // st
output[outputPos++] = 's';
output[outputPos++] = 't';
- break;
+ break;
default :
output[outputPos++] = c;
break;
diff --git a/src/java/org/apache/lucene/analysis/KeywordTokenizer.java b/src/java/org/apache/lucene/analysis/KeywordTokenizer.java
index 5b1cbf5f17f..3576ac15472 100644
--- a/src/java/org/apache/lucene/analysis/KeywordTokenizer.java
+++ b/src/java/org/apache/lucene/analysis/KeywordTokenizer.java
@@ -20,6 +20,9 @@ package org.apache.lucene.analysis;
import java.io.IOException;
import java.io.Reader;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
/**
* Emits the entire input as a single token.
*/
@@ -28,7 +31,9 @@ public class KeywordTokenizer extends Tokenizer {
private static final int DEFAULT_BUFFER_SIZE = 256;
private boolean done;
-
+ private TermAttribute termAtt;
+ private OffsetAttribute offsetAtt;
+
public KeywordTokenizer(Reader input) {
this(input, DEFAULT_BUFFER_SIZE);
}
@@ -36,8 +41,32 @@ public class KeywordTokenizer extends Tokenizer {
public KeywordTokenizer(Reader input, int bufferSize) {
super(input);
this.done = false;
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+ }
+
+ /**
+ * Emits the entire input as a single token. The first call reads the input
+ * reader until it reports end of input (-1), growing the term buffer when it
+ * fills up, and sets the term length and the offsets (0 to the number of
+ * chars read). Every later call returns false.
+ *
+ * @return true on the first call, false afterwards
+ */
+ public boolean incrementToken() throws IOException {
+ if (!done) {
+ done = true;
+ int upto = 0;
+ termAtt.clear();
+ char[] buffer = termAtt.termBuffer();
+ while (true) {
+ final int length = input.read(buffer, upto, buffer.length-upto);
+ if (length == -1) break;
+ upto += length;
+ // buffer is full: ask for a larger one before the next read
+ if (upto == buffer.length)
+ buffer = termAtt.resizeTermBuffer(1+buffer.length);
+ }
+ termAtt.setTermLength(upto);
+ offsetAtt.setStartOffset(0);
+ offsetAtt.setEndOffset(upto);
+ return true;
+ }
+ return false;
}
+ /** @deprecated */
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
if (!done) {
diff --git a/src/java/org/apache/lucene/analysis/LengthFilter.java b/src/java/org/apache/lucene/analysis/LengthFilter.java
index 8176c86b182..b090cd23d9c 100644
--- a/src/java/org/apache/lucene/analysis/LengthFilter.java
+++ b/src/java/org/apache/lucene/analysis/LengthFilter.java
@@ -19,6 +19,8 @@ package org.apache.lucene.analysis;
import java.io.IOException;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
/**
* Removes words that are too long and too short from the stream.
*
@@ -29,6 +31,8 @@ public final class LengthFilter extends TokenFilter {
final int min;
final int max;
+
+ private TermAttribute termAtt;
/**
* Build a filter that removes words that are too long or too
@@ -39,10 +43,28 @@ public final class LengthFilter extends TokenFilter {
super(in);
this.min = min;
this.max = max;
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ }
+
+ /**
+ * Advances to the next input token whose term length lies between
+ * min and max (both inclusive); tokens outside that range are skipped.
+ *
+ * @return true if such a token was found, false at end of stream
+ */
+ public final boolean incrementToken() throws IOException {
+ // stop at the first token whose length is within [min, max]
+ while (input.incrementToken()) {
+ int len = termAtt.termLength();
+ if (len >= min && len <= max) {
+ return true;
+ }
+ // note: else we ignore it but should we index each part of it?
+ }
+ // reached EOS -- return false
+ return false;
}
/**
* Returns the next input Token whose term() is the right len
+ * @deprecated
*/
public final Token next(final Token reusableToken) throws IOException
{
diff --git a/src/java/org/apache/lucene/analysis/LowerCaseFilter.java b/src/java/org/apache/lucene/analysis/LowerCaseFilter.java
index 1e6316db1bd..0c146e2a64d 100644
--- a/src/java/org/apache/lucene/analysis/LowerCaseFilter.java
+++ b/src/java/org/apache/lucene/analysis/LowerCaseFilter.java
@@ -19,6 +19,8 @@ package org.apache.lucene.analysis;
import java.io.IOException;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
/**
* Normalizes token text to lower case.
*
@@ -27,8 +29,25 @@ import java.io.IOException;
public final class LowerCaseFilter extends TokenFilter {
public LowerCaseFilter(TokenStream in) {
super(in);
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
}
+ private TermAttribute termAtt;
+
+ public final boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+
+ final char[] buffer = termAtt.termBuffer();
+ final int length = termAtt.termLength();
+ for(int i=0;i*/ lst = new ArrayList/**/();
protected Iterator/**/ iter;
-
+
public SinkTokenizer(List/**/ input) {
this.lst = input;
if (this.lst == null) this.lst = new ArrayList/**/();
@@ -61,10 +63,30 @@ public class SinkTokenizer extends Tokenizer {
return lst;
}
+ /**
+ * Increments this stream to the next token out of the list of cached tokens
+ * @throws IOException
+ */
+ public boolean incrementToken() throws IOException {
+ if (iter == null) iter = lst.iterator();
+ // Since this TokenStream can be reset we have to maintain the tokens as immutable
+ if (iter.hasNext()) {
+ AttributeSource state = (AttributeSource) iter.next();
+ state.restoreState(this);
+ return true;
+ }
+ return false;
+ }
+
+ public void add(AttributeSource source) throws IOException {
+ lst.add(source);
+ }
+
/**
* Returns the next token out of the list of cached tokens
* @return The next {@link org.apache.lucene.analysis.Token} in the Sink.
* @throws IOException
+ * @deprecated
*/
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
@@ -77,8 +99,6 @@ public class SinkTokenizer extends Tokenizer {
return null;
}
-
-
/**
* Override this method to cache only certain tokens, or new tokens based
* on the old tokens.
diff --git a/src/java/org/apache/lucene/analysis/StopFilter.java b/src/java/org/apache/lucene/analysis/StopFilter.java
index 2fdd86c3036..b5fd0e9bf99 100644
--- a/src/java/org/apache/lucene/analysis/StopFilter.java
+++ b/src/java/org/apache/lucene/analysis/StopFilter.java
@@ -21,6 +21,9 @@ import java.io.IOException;
import java.util.Arrays;
import java.util.Set;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
/**
* Removes stop words from a token stream.
*/
@@ -32,6 +35,9 @@ public final class StopFilter extends TokenFilter {
private final CharArraySet stopWords;
private boolean enablePositionIncrements = ENABLE_POSITION_INCREMENTS_DEFAULT;
+ private TermAttribute termAtt;
+ private PositionIncrementAttribute posIncrAtt;
+
/**
* Construct a token stream filtering the given input.
*/
@@ -47,6 +53,7 @@ public final class StopFilter extends TokenFilter {
public StopFilter(TokenStream in, String[] stopWords, boolean ignoreCase) {
super(in);
this.stopWords = (CharArraySet)makeStopSet(stopWords, ignoreCase);
+ init();
}
@@ -74,6 +81,7 @@ public final class StopFilter extends TokenFilter {
this.stopWords = new CharArraySet(stopWords.size(), ignoreCase);
this.stopWords.addAll(stopWords);
}
+ init();
}
/**
@@ -85,6 +93,11 @@ public final class StopFilter extends TokenFilter {
public StopFilter(TokenStream in, Set stopWords) {
this(in, stopWords, false);
}
+
+ public void init() {
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+ }
/**
* Builds a Set from an array of stop words,
@@ -109,9 +122,29 @@ public final class StopFilter extends TokenFilter {
stopSet.addAll(Arrays.asList(stopWords));
return stopSet;
}
+
+ /**
+ * Advances to the next input token whose term is not a stop word.
+ * If position increments are enabled, the position increments of the
+ * skipped stop words are added to the position increment of the token
+ * that is kept.
+ *
+ * @return true if a non-stop word was found, false at end of stream
+ */
+ public final boolean incrementToken() throws IOException {
+ // return the first non-stop word found
+ int skippedPositions = 0;
+ while (input.incrementToken()) {
+ if (!stopWords.contains(termAtt.termBuffer(), 0, termAtt.termLength())) {
+ if (enablePositionIncrements) {
+ posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
+ }
+ return true;
+ }
+ // stop word: remember how many positions it occupied
+ skippedPositions += posIncrAtt.getPositionIncrement();
+ }
+ // reached EOS -- return false
+ return false;
+ }
/**
* Returns the next input Token whose term() is not a stop word.
+ * @deprecated
*/
public final Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
diff --git a/src/java/org/apache/lucene/analysis/TeeTokenFilter.java b/src/java/org/apache/lucene/analysis/TeeTokenFilter.java
index 0a3ea04ad28..ec2606c1a00 100644
--- a/src/java/org/apache/lucene/analysis/TeeTokenFilter.java
+++ b/src/java/org/apache/lucene/analysis/TeeTokenFilter.java
@@ -18,6 +18,7 @@
package org.apache.lucene.analysis;
import java.io.IOException;
+import java.util.Iterator;
/**
@@ -60,8 +61,21 @@ public class TeeTokenFilter extends TokenFilter {
public TeeTokenFilter(TokenStream input, SinkTokenizer sink) {
super(input);
this.sink = sink;
+ Iterator it = getAttributesIterator();
+ while (it.hasNext()) {
+ sink.addAttribute(it.next().getClass());
+ }
+ }
+
+ /**
+ * Advances the input stream and, for every token it produces, adds a
+ * captured copy of the current state to the sink.
+ *
+ * @return true if the input produced a token, false at end of stream
+ */
+ public boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ sink.add(captureState());
+ return true;
+ }
+ return false;
}
+ /** @deprecated */
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
Token nextToken = input.next(reusableToken);
diff --git a/src/java/org/apache/lucene/analysis/Token.java b/src/java/org/apache/lucene/analysis/Token.java
index e831f383c46..f5d48516898 100644
--- a/src/java/org/apache/lucene/analysis/Token.java
+++ b/src/java/org/apache/lucene/analysis/Token.java
@@ -21,7 +21,11 @@ import org.apache.lucene.index.Payload;
import org.apache.lucene.index.TermPositions; // for javadoc
import org.apache.lucene.util.ArrayUtil;
-/** A Token is an occurrence of a term from the text of a field. It consists of
+/**
+ This class is now deprecated and a new TokenStream API was introduced with Lucene 2.9.
+ See Javadocs in {@link TokenStream} for further details.
+
+ A Token is an occurrence of a term from the text of a field. It consists of
a term's text, the start and end offset of the term in the text of the field,
and a type string.
@@ -114,6 +118,8 @@ import org.apache.lucene.util.ArrayUtil;
@see org.apache.lucene.index.Payload
+ @deprecated A new TokenStream API was introduced with Lucene 2.9.
+ See javadocs in {@link TokenStream} for further details.
*/
public class Token implements Cloneable {
diff --git a/src/java/org/apache/lucene/analysis/TokenFilter.java b/src/java/org/apache/lucene/analysis/TokenFilter.java
index 300cb550a86..6988e332cc1 100644
--- a/src/java/org/apache/lucene/analysis/TokenFilter.java
+++ b/src/java/org/apache/lucene/analysis/TokenFilter.java
@@ -22,9 +22,16 @@ import java.io.IOException;
/** A TokenFilter is a TokenStream whose input is another token stream.
This is an abstract class.
- NOTE: subclasses must override {@link #next(Token)}. It's
- also OK to instead override {@link #next()} but that
- method is now deprecated in favor of {@link #next(Token)}.
+ NOTE: subclasses must override
+ {@link #incrementToken()} if the new TokenStream API is used
+ and {@link #next(Token)} or {@link #next()} if the old
+ TokenStream API is used.
+ *
+ * WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental.
+ * The APIs introduced in these classes with Lucene 2.9 might change in the future.
+ * We will make our best efforts to keep the APIs backwards-compatible.
+
+ See {@link TokenStream}
*/
public abstract class TokenFilter extends TokenStream {
/** The source of tokens for this filter. */
@@ -32,9 +39,10 @@ public abstract class TokenFilter extends TokenStream {
/** Construct a token stream filtering the given input. */
protected TokenFilter(TokenStream input) {
+ super(input);
this.input = input;
}
-
+
/** Close the input TokenStream. */
public void close() throws IOException {
input.close();
@@ -45,4 +53,17 @@ public abstract class TokenFilter extends TokenStream {
super.reset();
input.reset();
}
+
+ public boolean useNewAPI() {
+ return input.useNewAPI();
+ }
+
+ /**
+ * Sets whether or not to use the new TokenStream API. The call is
+ * delegated to the input stream, so setting this applies to this filter
+ * and to the TokenStream/TokenFilters upstream of it.
+ */
+ public void setUseNewAPI(boolean use) {
+ input.setUseNewAPI(use);
+ }
+
}
diff --git a/src/java/org/apache/lucene/analysis/TokenStream.java b/src/java/org/apache/lucene/analysis/TokenStream.java
index 604f4a27cd5..6a9161e8ae3 100644
--- a/src/java/org/apache/lucene/analysis/TokenStream.java
+++ b/src/java/org/apache/lucene/analysis/TokenStream.java
@@ -17,9 +17,12 @@ package org.apache.lucene.analysis;
* limitations under the License.
*/
-import org.apache.lucene.index.Payload;
-
import java.io.IOException;
+import java.util.Iterator;
+
+import org.apache.lucene.index.Payload;
+import org.apache.lucene.util.Attribute;
+import org.apache.lucene.util.AttributeSource;
/** A TokenStream enumerates the sequence of tokens, either from
fields of a document or from query text.
@@ -31,13 +34,140 @@ import java.io.IOException;
{@link TokenFilter}, a TokenStream
whose input is another TokenStream.
- NOTE: subclasses must override {@link #next(Token)}. It's
- also OK to instead override {@link #next()} but that
- method is now deprecated in favor of {@link #next(Token)}.
+ A new TokenStream API is introduced with Lucene 2.9. Since
+ 2.9 Token is deprecated and the preferred way to store
+ the information of a token is to use {@link Attribute}s.
+
+ For that reason TokenStream extends {@link AttributeSource}
+ now. Note that only one instance per {@link Attribute} is
+ created and reused for every token. This approach reduces
+ object creations and allows local caching of references to
+ the {@link Attribute}s. See {@link #incrementToken()} for further details.
+
+ The workflow of the new TokenStream API is as follows:
+ <ol>
+ <li>Instantiation of TokenStream/TokenFilters which add/get attributes
+ to/from the {@link AttributeSource}.</li>
+ <li>The consumer calls {@link TokenStream#reset()}.</li>
+ <li>The consumer retrieves attributes from the
+ stream and stores local references to all attributes it wants to access.</li>
+ <li>The consumer calls {@link #incrementToken()} until it returns false and
+ consumes the attributes after each call.</li>
+ </ol>
+
+ To make sure that filters and consumers know which attributes are available,
+ the attributes must be added during instantiation. Filters and
+ consumers are not required to check for availability of attributes in {@link #incrementToken()}.
+
+ Sometimes it is desirable to capture a current state of a
+ TokenStream, e. g. for buffering purposes (see {@link CachingTokenFilter},
+ {@link TeeTokenFilter}/{@link SinkTokenizer}). For this usecase
+ {@link AttributeSource#captureState()} and {@link AttributeSource#restoreState(AttributeSource)} can be used.
+
+ NOTE: In order to enable the new API the method
+ {@link #setUseNewAPI(boolean)} has to be called with true.
+ Otherwise the deprecated method {@link #next(Token)} will
+ be used by Lucene consumers (indexer and queryparser) to
+ consume the tokens. {@link #next(Token)} will be removed
+ in Lucene 3.0.
+
+ NOTE: To use the old API subclasses must override {@link #next(Token)}.
+ It's also OK to instead override {@link #next()} but that
+ method is slower compared to {@link #next(Token)}.
+ *
+ * WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental.
+ * The APIs introduced in these classes with Lucene 2.9 might change in the future.
+ * We will make our best efforts to keep the APIs backwards-compatible.
*/
-public abstract class TokenStream {
+public abstract class TokenStream extends AttributeSource {
+ private static boolean useNewAPIDefault = false;
+ private boolean useNewAPI = useNewAPIDefault;
+
+ protected TokenStream() {
+ super();
+ }
+
+ protected TokenStream(AttributeSource input) {
+ super(input);
+ }
+ /**
+ * Returns whether or not the new TokenStream APIs are used
+ * by default.
+ * (see {@link #incrementToken()}, {@link AttributeSource}).
+ */
+ public static boolean useNewAPIDefault() {
+ return useNewAPIDefault;
+ }
+
+ /**
+ * Use this API to enable or disable the new TokenStream API
+ * by default. Can be overridden by calling {@link #setUseNewAPI(boolean)}.
+ * (see {@link #incrementToken()}, {@link AttributeSource}).
+ *
+ * The default is copied into each stream when it is constructed, so a
+ * change only affects streams created afterwards.
+ *
+ * If set to true, the indexer will call {@link #incrementToken()}
+ * to consume Tokens from this stream.
+ *
+ * If set to false, the indexer will call {@link #next(Token)}
+ * instead.
+ */
+ public static void setUseNewAPIDefault(boolean use) {
+ useNewAPIDefault = use;
+ }
+
+ /**
+ * Returns whether or not the new TokenStream APIs are used
+ * for this stream.
+ * (see {@link #incrementToken()}, {@link AttributeSource}).
+ */
+ public boolean useNewAPI() {
+ return useNewAPI;
+ }
+
+ /**
+ * Use this API to enable or disable the new TokenStream API
+ * for this stream. Overrides {@link #setUseNewAPIDefault(boolean)}.
+ * (see {@link #incrementToken()}, {@link AttributeSource}).
+ *
+ * If set to true, the indexer will call {@link #incrementToken()}
+ * to consume Tokens from this stream.
+ *
+ * If set to false, the indexer will call {@link #next(Token)}
+ * instead.
+ *
+ * NOTE: All streams and filters in one chain must use the
+ * same API.
+ */
+ public void setUseNewAPI(boolean use) {
+ useNewAPI = use;
+ }
+
+ /**
+ * Consumers (e. g. the indexer) use this method to advance the stream
+ * to the next token. Implementing classes must implement this method
+ * and update the appropriate {@link Attribute}s with content of the
+ * next token.
+ *
+ * This method is called for every token of a document, so an efficient
+ * implementation is crucial for good performance. To avoid calls to
+ * {@link #addAttribute(Class)} and {@link #getAttribute(Class)} and
+ * downcasts, references to all {@link Attribute}s that this stream uses
+ * should be retrieved during instantiation.
+ *
+ * To make sure that filters and consumers know which attributes are available
+ * the attributes must be added during instantiation. Filters and
+ * consumers are not required to check for availability of attributes in {@link #incrementToken()}.
+ *
+ * @return false for end of stream; true otherwise
+ *
+ *
+ * Note that this method will be defined abstract in Lucene 3.0.
+ */
+ public boolean incrementToken() throws IOException {
+ // subclasses must implement this method; will be made abstract in Lucene 3.0
+ return false;
+ }
+
/** Returns the next token in the stream, or null at EOS.
* @deprecated The returned Token is a "full private copy" (not
* re-used across calls to next()) but will be slower
@@ -84,6 +214,8 @@ public abstract class TokenStream {
* is not required to check for null before using it, but it is a
* good idea to assert that it is not null.)
* @return next token in the stream or null if end-of-stream was hit
+ * @deprecated The new {@link #incrementToken()} and {@link AttributeSource}
+ * APIs should be used instead. See also {@link #useNewAPI()}.
*/
public Token next(final Token reusableToken) throws IOException {
// We don't actually use inputToken, but still add this assert
@@ -107,4 +239,25 @@ public abstract class TokenStream {
/** Releases resources associated with this stream. */
public void close() throws IOException {}
+
+ /**
+ * Returns the string representations of all attributes of this stream,
+ * separated by commas and enclosed in parentheses. A stream without
+ * attributes yields "()".
+ */
+ public String toString() {
+ StringBuffer sb = new StringBuffer();
+ sb.append('(');
+
+ if (hasAttributes()) {
+ // TODO Java 1.5
+ //Iterator it = attributes.values().iterator();
+ Iterator it = getAttributesIterator();
+ // first attribute without a leading comma, all others with one
+ if (it.hasNext()) {
+ sb.append(it.next().toString());
+ }
+ while (it.hasNext()) {
+ sb.append(',');
+ sb.append(it.next().toString());
+ }
+ }
+ sb.append(')');
+ return sb.toString();
+ }
+
}
diff --git a/src/java/org/apache/lucene/analysis/Tokenizer.java b/src/java/org/apache/lucene/analysis/Tokenizer.java
index 4c6dc4772a2..1222e73761d 100644
--- a/src/java/org/apache/lucene/analysis/Tokenizer.java
+++ b/src/java/org/apache/lucene/analysis/Tokenizer.java
@@ -24,12 +24,23 @@ import java.io.IOException;
This is an abstract class.
- NOTE: subclasses must override {@link #next(Token)}. It's
- also OK to instead override {@link #next()} but that
- method is now deprecated in favor of {@link #next(Token)}.
+ NOTE: In order to enable the new API the method
+ {@link #useNewAPI()} has to be called with useNewAPI=true.
+ Otherwise the deprecated method {@link #next(Token)} will
+ be used by Lucene consumers (indexer and queryparser) to
+ consume the tokens. {@link #next(Token)} will be removed
+ in Lucene 3.0.
+ NOTE: To use the old API subclasses must override {@link #next(Token)}.
+ It's also OK to instead override {@link #next()} but that
+ method is slower compared to {@link #next(Token)}.
+
NOTE: subclasses overriding {@link #next(Token)} must
call {@link Token#clear()}.
+ *
+ * WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental.
+ * The APIs introduced in these classes with Lucene 2.9 might change in the future.
+ * We will make our best efforts to keep the APIs backwards-compatible.
*/
public abstract class Tokenizer extends TokenStream {
diff --git a/src/java/org/apache/lucene/analysis/package.html b/src/java/org/apache/lucene/analysis/package.html
index d5fe91c0041..3ae9fb0f627 100644
--- a/src/java/org/apache/lucene/analysis/package.html
+++ b/src/java/org/apache/lucene/analysis/package.html
@@ -35,8 +35,7 @@ application using Lucene to use an appropriate Parser to convert the orig
Tokenization
Plain text passed to Lucene for indexing goes through a process generally called tokenization – namely breaking of the
-input text into small indexing elements –
-{@link org.apache.lucene.analysis.Token Tokens}.
+input text into small indexing elements – tokens.
The way input text is broken into tokens very
much dictates further capabilities of search upon that text.
For instance, sentences beginnings and endings can be identified to provide for more accurate phrase
@@ -72,12 +71,13 @@ providing for several functions, including (but not limited to):
{@link org.apache.lucene.analysis.Analyzer} – An Analyzer is responsible for building a {@link org.apache.lucene.analysis.TokenStream} which can be consumed
by the indexing and searching processes. See below for more information on implementing your own Analyzer.
{@link org.apache.lucene.analysis.Tokenizer} – A Tokenizer is a {@link org.apache.lucene.analysis.TokenStream} and is responsible for breaking
- up incoming text into {@link org.apache.lucene.analysis.Token}s. In most cases, an Analyzer will use a Tokenizer as the first step in
+ up incoming text into tokens. In most cases, an Analyzer will use a Tokenizer as the first step in
the analysis process.
{@link org.apache.lucene.analysis.TokenFilter} – A TokenFilter is also a {@link org.apache.lucene.analysis.TokenStream} and is responsible
- for modifying {@link org.apache.lucene.analysis.Token}s that have been created by the Tokenizer. Common modifications performed by a
+ for modifying tokens that have been created by the Tokenizer. Common modifications performed by a
TokenFilter are: deletion, stemming, synonym injection, and down casing. Not all Analyzers require TokenFilters
+ Since Lucene 2.9 the TokenStream API was changed. Please see section "New TokenStream API" below for details.
Hints, Tips and Traps
@@ -140,9 +140,8 @@ providing for several functions, including (but not limited to):
Analyzer analyzer = new StandardAnalyzer(); // or any other analyzer
TokenStream ts = analyzer.tokenStream("myfield",new StringReader("some text goes here"));
- Token t = ts.next();
- while (t!=null) {
- System.out.println("token: "+t));
+ while (ts.incrementToken()) {
+ System.out.println("token: "+ts);
t = ts.next();
}
@@ -179,7 +178,7 @@ the source code of any one of the many samples located in this package.
The following sections discuss some aspects of implementing your own analyzer.
-Field Section Boundaries
+Field Section Boundaries
When {@link org.apache.lucene.document.Document#add(org.apache.lucene.document.Fieldable) document.add(field)}
is called multiple times for the same field name, we could say that each such call creates a new
@@ -208,10 +207,10 @@ the source code of any one of the many samples located in this package.
};
-Token Position Increments
+Token Position Increments
By default, all tokens created by Analyzers and Tokenizers have a
- {@link org.apache.lucene.analysis.Token#getPositionIncrement() position increment} of one.
+ {@link org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute#getPositionIncrement() position increment} of one.
This means that the position stored for that token in the index would be one more than
that of the previous token.
Recall that phrase and proximity searches rely on position info.
@@ -227,26 +226,29 @@ the source code of any one of the many samples located in this package.
If this behavior does not fit the application needs,
a modified analyzer can be used, that would increment further the positions of
tokens following a removed stop word, using
- {@link org.apache.lucene.analysis.Token#setPositionIncrement(int)}.
+ {@link org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute#setPositionIncrement(int)}.
This can be done with something like:
public TokenStream tokenStream(final String fieldName, Reader reader) {
final TokenStream ts = someAnalyzer.tokenStream(fieldName, reader);
TokenStream res = new TokenStream() {
- public Token next() throws IOException {
+ TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+
+ public boolean incrementToken() throws IOException {
int extraIncrement = 0;
while (true) {
- Token t = ts.next();
- if (t!=null) {
- if (stopWords.contains(t.termText())) {
+ boolean hasNext = ts.incrementToken();
+ if (hasNext) {
+ if (stopWords.contains(termAtt.term())) {
extraIncrement++; // filter this word
continue;
}
if (extraIncrement>0) {
- t.setPositionIncrement(t.getPositionIncrement()+extraIncrement);
+ posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement()+extraIncrement);
}
}
- return t;
+ return hasNext;
}
}
};
@@ -268,5 +270,336 @@ the source code of any one of the many samples located in this package.
same position as that token, and so would they be seen by phrase and proximity searches.
+New TokenStream API
+
+ With Lucene 2.9 we introduce a new TokenStream API. The old API used to produce Tokens. A Token
+ has getter and setter methods for different properties like positionIncrement and termText.
+ While this approach was sufficient for the default indexing format, it is not versatile enough for
+ Flexible Indexing, a term which summarizes the effort of making the Lucene indexer pluggable and extensible for custom
+ index formats.
+
+
+A fully customizable indexer means that users will be able to store custom data structures on disk. Therefore an API
+is necessary that can transport custom types of data from the documents to the indexer.
+
+Attribute and AttributeSource
+Lucene 2.9 therefore introduces a new pair of classes called {@link org.apache.lucene.util.Attribute} and
+{@link org.apache.lucene.util.AttributeSource}. An Attribute serves as a
+particular piece of information about a text token. For example, {@link org.apache.lucene.analysis.tokenattributes.TermAttribute}
+ contains the term text of a token, and {@link org.apache.lucene.analysis.tokenattributes.OffsetAttribute} contains the start and end character offsets of a token.
+An AttributeSource is a collection of Attributes with a restriction: there may be only one instance of each attribute type. TokenStream now extends AttributeSource, which
+means that one can add Attributes to a TokenStream. Since TokenFilter extends TokenStream, all filters are also
+AttributeSources.
+
+ Lucene now provides six Attributes out of the box, which replace the variables the Token class has:
+
+ - {@link org.apache.lucene.analysis.tokenattributes.TermAttribute}
The term text of a token.
+ - {@link org.apache.lucene.analysis.tokenattributes.OffsetAttribute}
The start and end offset of token in characters.
+ - {@link org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute}
See above for detailed information about position increment.
+ - {@link org.apache.lucene.analysis.tokenattributes.PayloadAttribute}
The payload that a Token can optionally have.
+ - {@link org.apache.lucene.analysis.tokenattributes.TypeAttribute}
The type of the token. Default is 'word'.
+ - {@link org.apache.lucene.analysis.tokenattributes.FlagsAttribute}
Optional flags a token can have.
+
+
+Using the new TokenStream API
+There are a few important things to know in order to use the new API efficiently which are summarized here. You may want
+to walk through the example below first and come back to this section afterwards.
+-
+Please keep in mind that an AttributeSource can only have one instance of a particular Attribute. Furthermore, if
+a chain of a TokenStream and multiple TokenFilters is used, then all TokenFilters in that chain share the Attributes
+with the TokenStream.
+
+
+-
+Attribute instances are reused for all tokens of a document. Thus, a TokenStream/-Filter needs to update
+the appropriate Attribute(s) in incrementToken(). The consumer, commonly the Lucene indexer, consumes the data in the
+Attributes and then calls incrementToken() again until it returns false, which indicates that the end of the stream
+was reached. This means that in each call of incrementToken() a TokenStream/-Filter can safely overwrite the data in
+the Attribute instances.
+
+
+-
+For performance reasons a TokenStream/-Filter should add/get Attributes during instantiation; i.e., create an attribute in the
+constructor and store references to it in an instance variable. Using an instance variable instead of calling addAttribute()/getAttribute()
+in incrementToken() will avoid expensive casting and attribute lookups for every token in the document.
+
+
+-
+All methods in AttributeSource are idempotent, which means calling them multiple times always yields the same
+result. This is especially important to know for addAttribute(). The method takes the type (
Class
)
+of an Attribute as an argument and returns an instance. If an Attribute of the same type was previously added, then
+the already existing instance is returned, otherwise a new instance is created and returned. Therefore TokenStreams/-Filters
+can safely call addAttribute() with the same Attribute type multiple times.
+
+Example
+In this example we will create a WhiteSpaceTokenizer and use a LengthFilter to suppress all words that only
+have two or fewer characters. The LengthFilter is part of the Lucene core and its implementation will be explained
+here to illustrate the usage of the new TokenStream API.
+Then we will develop a custom Attribute, a PartOfSpeechAttribute, and add another filter to the chain which
+utilizes the new custom attribute, and call it PartOfSpeechTaggingFilter.
+Whitespace tokenization
+
+public class MyAnalyzer extends Analyzer {
+
+ public TokenStream tokenStream(String fieldName, Reader reader) {
+ TokenStream stream = new WhitespaceTokenizer(reader);
+ return stream;
+ }
+
+ public static void main(String[] args) throws IOException {
+ // text to tokenize
+ final String text = "This is a demo of the new TokenStream API";
+
+ MyAnalyzer analyzer = new MyAnalyzer();
+ TokenStream stream = analyzer.tokenStream("field", new StringReader(text));
+
+ // get the TermAttribute from the TokenStream
+ TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class);
+
+ // print all tokens until stream is exhausted
+ while (stream.incrementToken()) {
+ System.out.println(termAtt.term());
+ }
+ }
+}
+
+In this easy example a simple white space tokenization is performed. In main() a loop consumes the stream and
+prints the term text of the tokens by accessing the TermAttribute that the WhitespaceTokenizer provides.
+Here is the output:
+
+This
+is
+a
+demo
+of
+the
+new
+TokenStream
+API
+
+Adding a LengthFilter
+We want to suppress all tokens that have 2 or fewer characters. We can do that easily by adding a LengthFilter
+to the chain. Only the tokenStream() method in our analyzer needs to be changed:
+
+ public TokenStream tokenStream(String fieldName, Reader reader) {
+ TokenStream stream = new WhitespaceTokenizer(reader);
+ stream = new LengthFilter(stream, 3, Integer.MAX_VALUE);
+ return stream;
+ }
+
+Note how now only words with 3 or more characters are contained in the output:
+
+This
+demo
+the
+new
+TokenStream
+API
+
+Now let's take a look at how the LengthFilter is implemented (it is part of Lucene's core):
+
+public final class LengthFilter extends TokenFilter {
+
+ final int min;
+ final int max;
+
+ private TermAttribute termAtt;
+
+ /**
+ * Build a filter that removes words that are too long or too
+ * short from the text.
+ */
+ public LengthFilter(TokenStream in, int min, int max)
+ {
+ super(in);
+ this.min = min;
+ this.max = max;
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ }
+
+ /**
+ * Returns the next input Token whose term() is the right len
+ */
+ public final boolean incrementToken() throws IOException
+ {
+ assert termAtt != null;
+ // return the first token whose term length is within [min, max]
+ while (input.incrementToken()) {
+ int len = termAtt.termLength();
+ if (len >= min && len <= max) {
+ return true;
+ }
+ // note: else we ignore it but should we index each part of it?
+ }
+ // reached EOS -- return false
+ return false;
+ }
+}
+
+The TermAttribute is added in the constructor and stored in the instance variable termAtt
.
+Remember that there can only be a single instance of TermAttribute in the chain, so in our example the
+addAttribute()
call in LengthFilter returns the TermAttribute that the WhitespaceTokenizer already added. The tokens
+are retrieved from the input stream in the incrementToken()
method. By looking at the term text
+in the TermAttribute the length of the term can be determined and too short or too long tokens are skipped.
+Note how incrementToken()
can efficiently access the instance variable; no attribute lookup or downcasting
+is necessary. The same is true for the consumer, which can simply use local references to the Attributes.
+Adding a custom Attribute
+Now we're going to implement our own custom Attribute for part-of-speech tagging and call it consequently
+PartOfSpeechAttribute
:
+
+ public static enum PartOfSpeech {
+ Noun, Verb, Adjective, Adverb, Pronoun, Preposition, Conjunction, Article, Unknown
+ }
+
+ public static final class PartOfSpeechAttribute extends Attribute {
+
+ private PartOfSpeech pos = PartOfSpeech.Unknown;
+
+ public void setPartOfSpeech(PartOfSpeech pos) {
+ this.pos = pos;
+ }
+
+ public PartOfSpeech getPartOfSpeech() {
+ return pos;
+ }
+
+ public void clear() {
+ pos = PartOfSpeech.Unknown;
+ }
+
+ public void copyTo(Attribute target) {
+ ((PartOfSpeechAttribute) target).pos = pos;
+ }
+
+ public boolean equals(Object other) {
+ if (other == this) {
+ return true;
+ }
+
+ if (other instanceof PartOfSpeechAttribute) {
+ return pos == ((PartOfSpeechAttribute) other).pos;
+ }
+
+ return false;
+ }
+
+ public int hashCode() {
+ return pos.ordinal();
+ }
+
+ public String toString() {
+ return "PartOfSpeech=" + pos;
+ }
+ }
+
+This is a simple Attribute that has only a single variable that stores the part-of-speech of a token. It extends the
+new Attribute
class and therefore implements its abstract methods clear(), copyTo(), equals(), hashCode(), toString()
.
+Now we need a TokenFilter that can set this new PartOfSpeechAttribute for each token. In this example we show a very naive filter
+that tags every word with a leading upper-case letter as a 'Noun' and all other words as 'Unknown'.
+
+ public static class PartOfSpeechTaggingFilter extends TokenFilter {
+ PartOfSpeechAttribute posAtt;
+ TermAttribute termAtt;
+
+ protected PartOfSpeechTaggingFilter(TokenStream input) {
+ super(input);
+ posAtt = (PartOfSpeechAttribute) addAttribute(PartOfSpeechAttribute.class);
+ termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+ }
+
+ public boolean incrementToken() throws IOException {
+ if (!input.incrementToken()) {return false;}
+ posAtt.setPartOfSpeech(determinePOS(termAtt.termBuffer(), 0, termAtt.termLength()));
+ return true;
+ }
+
+ // determine the part of speech for the given term
+ protected PartOfSpeech determinePOS(char[] term, int offset, int length) {
+ // naive implementation that tags every uppercased word as noun
+ if (length > 0 && Character.isUpperCase(term[offset])) {
+ return PartOfSpeech.Noun;
+ }
+ return PartOfSpeech.Unknown;
+ }
+ }
+
+Just like the LengthFilter, this new filter accesses the attributes it needs in the constructor and
+stores references in instance variables. Now we need to add the filter to the chain:
+
+ public TokenStream tokenStream(String fieldName, Reader reader) {
+ TokenStream stream = new WhitespaceTokenizer(reader);
+ stream = new LengthFilter(stream, 3, Integer.MAX_VALUE);
+ stream = new PartOfSpeechTaggingFilter(stream);
+ return stream;
+ }
+
+Now let's look at the output:
+
+This
+demo
+the
+new
+TokenStream
+API
+
+Apparently it hasn't changed, which shows that adding a custom attribute to a TokenStream/Filter chain does not
+affect any existing consumers, simply because they don't know the new Attribute. Now let's change the consumer
+to make use of the new PartOfSpeechAttribute and print it out:
+
+ public static void main(String[] args) throws IOException {
+ // text to tokenize
+ final String text = "This is a demo of the new TokenStream API";
+
+ MyAnalyzer analyzer = new MyAnalyzer();
+ TokenStream stream = analyzer.tokenStream("field", new StringReader(text));
+
+ // get the TermAttribute from the TokenStream
+ TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class);
+
+ // get the PartOfSpeechAttribute from the TokenStream
+ PartOfSpeechAttribute posAtt = (PartOfSpeechAttribute) stream.getAttribute(PartOfSpeechAttribute.class);
+
+ // print all tokens until stream is exhausted
+ while (stream.incrementToken()) {
+ System.out.println(termAtt.term() + ": " + posAtt.getPartOfSpeech());
+ }
+ }
+
+The change that was made is to get the PartOfSpeechAttribute from the TokenStream and print out its contents in
+the while loop that consumes the stream. Here is the new output:
+
+This: Noun
+demo: Unknown
+the: Unknown
+new: Unknown
+TokenStream: Noun
+API: Noun
+
+Each word is now followed by its assigned PartOfSpeech tag. Of course this is a naive
+part-of-speech tagging. The word 'This' should not even be tagged as noun; it is only spelled capitalized because it
+is the first word of a sentence. Actually this is a good opportunity for an exercise. To practice the usage of the new
+API the reader could now write an Attribute and TokenFilter that can specify for each word if it was the first token
+of a sentence or not. Then the PartOfSpeechTaggingFilter can make use of this knowledge and only tag capitalized words
+as nouns if not the first word of a sentence (we know, this is still not a correct behavior, but hey, it's a good exercise).
+As a small hint, this is how the new Attribute class could begin:
+
+ public class FirstTokenOfSentenceAttribute extends Attribute {
+
+ private boolean firstToken;
+
+ public void setFirstToken(boolean firstToken) {
+ this.firstToken = firstToken;
+ }
+
+ public boolean getFirstToken() {
+ return firstToken;
+ }
+
+ public void clear() {
+ firstToken = false;
+ }
+
+ ...
+