mirror of https://github.com/apache/lucene.git

LUCENE-1460: Changed TokenStreams/TokenFilters in contrib to use the new TokenStream API.

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@799953 13f79535-47bb-0310-9956-ffa450edef68

commit 1743081b07
parent 3e869d9336
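The per-file hunks below all make the same change: the contrib TokenStreams and TokenFilters stop overriding the old next(Token)/next() methods and instead register attributes and implement incrementToken(). As a rough, hypothetical orientation (not part of the commit; the tokenizer, reader, and sample text are invented stand-ins, assuming a Lucene 2.9-era classpath), this is how a caller drives one of the converted filters under the new attribute-based API:

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.ar.ArabicNormalizationFilter;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class NewTokenStreamApiDemo {
  public static void main(String[] args) throws IOException {
    // Wrap a simple tokenizer with one of the filters converted in this commit.
    TokenStream stream = new ArabicNormalizationFilter(
        new WhitespaceTokenizer(new StringReader("sample text")));

    // With the new API the caller obtains attribute instances once up front...
    TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);

    // ...and then advances the stream with incrementToken(); the attribute object
    // is refilled in place for every token instead of a Token being returned.
    while (stream.incrementToken()) {
      System.out.println(termAtt.term());
    }
    stream.close();
  }
}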
@@ -356,6 +356,9 @@ API Changes
 33. LUCENE-1705: Added IndexWriter.deleteAllDocuments.  (Tim Smith via
     Mike McCandless)
 
+34. LUCENE-1460: Changed TokenStreams/TokenFilters in contrib to
+    use the new TokenStream API. (Robert Muir, Michael Busch)
+
 Bug fixes
 
  1. LUCENE-1415: MultiPhraseQuery has incorrect hashCode() and equals()
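Every file touched below applies the producer-side recipe this CHANGES entry summarizes: declare a field per attribute the stream produces, register it with addAttribute(...) in the constructor, and rewrite next(Token) as an incrementToken() that mutates those attributes in place and returns true/false. A schematic filter distilled from those hunks (the class name and its lower-casing body are invented for illustration only, not part of the commit) looks roughly like this:

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public final class SketchLowerCaseFilter extends TokenFilter {

  // One field per attribute this filter reads or writes.
  private TermAttribute termAtt;

  public SketchLowerCaseFilter(TokenStream input) {
    super(input);
    // Register the attribute once, in the constructor, exactly as the converted filters do.
    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
  }

  // Replaces the old next(Token): pull the next token from the wrapped stream,
  // edit the shared term buffer in place, and report whether a token was produced.
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    char[] buffer = termAtt.termBuffer();
    int length = termAtt.termLength();
    for (int i = 0; i < length; i++) {
      buffer[i] = Character.toLowerCase(buffer[i]);
    }
    return true;
  }
}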
@@ -19,35 +19,33 @@ package org.apache.lucene.analysis.ar;
 
 import java.io.IOException;
 
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
 /**
  * A TokenFilter that applies {@link ArabicNormalizer} to normalize the orthography.
  *
  */
 
-public class ArabicNormalizationFilter extends TokenFilter {
+public final class ArabicNormalizationFilter extends TokenFilter {
 
   protected ArabicNormalizer normalizer = null;
+  private TermAttribute termAtt;
 
   public ArabicNormalizationFilter(TokenStream input) {
     super(input);
     normalizer = new ArabicNormalizer();
+    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
   }
 
-  public Token next(Token reusableToken) throws IOException {
-    if ((reusableToken = input.next(reusableToken)) == null) {
-      return null;
+  public boolean incrementToken() throws IOException {
+    if (input.incrementToken()) {
+      int newlen = normalizer.normalize(termAtt.termBuffer(), termAtt.termLength());
+      termAtt.setTermLength(newlen);
+      return true;
     } else {
-      int oldlen = reusableToken.termLength();
-      int newlen = normalizer.normalize(reusableToken.termBuffer(), oldlen);
-      if (oldlen != newlen)
-        reusableToken.setTermLength(newlen);
-      return reusableToken;
+      return false;
     }
   }
 }
@@ -19,43 +19,33 @@ package org.apache.lucene.analysis.ar;
 
 import java.io.IOException;
 
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
 /**
  * A TokenFilter that applies {@link ArabicStemmer} to stem Arabic words..
  *
 */
 
-public class ArabicStemFilter extends TokenFilter {
+public final class ArabicStemFilter extends TokenFilter {
 
   protected ArabicStemmer stemmer = null;
+  private TermAttribute termAtt;
 
   public ArabicStemFilter(TokenStream input) {
     super(input);
     stemmer = new ArabicStemmer();
+    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
   }
 
-  /**
-   * @return Returns the next token in the stream, or null at EOS
-   */
-  public Token next(Token reusableToken) throws IOException {
-    /**
-     * The actual token in the input stream.
-     */
-
-    if ((reusableToken = input.next(reusableToken)) == null) {
-      return null;
+  public boolean incrementToken() throws IOException {
+    if (input.incrementToken()) {
+      int newlen = stemmer.stem(termAtt.termBuffer(), termAtt.termLength());
+      termAtt.setTermLength(newlen);
+      return true;
     } else {
-      int oldlen = reusableToken.termLength();
-      int newlen = stemmer.stem(reusableToken.termBuffer(), oldlen);
-      if (oldlen != newlen)
-        reusableToken.setTermLength(newlen);
-      return reusableToken;
+      return false;
    }
  }
 }
@@ -17,13 +17,12 @@ package org.apache.lucene.analysis.br;
  * limitations under the License.
  */
 
-import org.apache.lucene.analysis.Token;
+import java.io.IOException;
+import java.util.Set;
+
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-
-import java.io.IOException;
-import java.util.HashSet;
-import java.util.Set;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
 /**
  * Based on GermanStemFilter
@@ -36,10 +35,12 @@ public final class BrazilianStemFilter extends TokenFilter {
    */
   private BrazilianStemmer stemmer = null;
   private Set exclusions = null;
+  private TermAttribute termAtt;
 
   public BrazilianStemFilter(TokenStream in) {
     super(in);
     stemmer = new BrazilianStemmer();
+    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
   }
 
   public BrazilianStemFilter(TokenStream in, Set exclusiontable) {
@@ -47,26 +48,20 @@ public final class BrazilianStemFilter extends TokenFilter {
     this.exclusions = exclusiontable;
   }
 
-  /**
-   * @return Returns the next token in the stream, or null at EOS.
-   */
-  public final Token next(final Token reusableToken)
-    throws IOException {
-    assert reusableToken != null;
-    Token nextToken = input.next(reusableToken);
-    if (nextToken == null)
-      return null;
-
-    String term = nextToken.term();
-
-    // Check the exclusion table.
-    if (exclusions == null || !exclusions.contains(term)) {
-      String s = stemmer.stem(term);
-      // If not stemmed, don't waste the time adjusting the token.
-      if ((s != null) && !s.equals(term))
-        nextToken.setTermBuffer(s);
+  public boolean incrementToken() throws IOException {
+    if (input.incrementToken()) {
+      String term = termAtt.term();
+      // Check the exclusion table.
+      if (exclusions == null || !exclusions.contains(term)) {
+        String s = stemmer.stem(term);
+        // If not stemmed, don't waste the time adjusting the token.
+        if ((s != null) && !s.equals(term))
+          termAtt.setTermBuffer(s);
+      }
+      return true;
+    } else {
+      return false;
     }
-    return nextToken;
   }
 }
@@ -17,11 +17,14 @@ package org.apache.lucene.analysis.cjk;
 * limitations under the License.
 */
 
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.Tokenizer;
+import java.io.IOException;
 
 import java.io.Reader;
 
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 
 /**
  * CJKTokenizer was modified from StopTokenizer which does a decent job for
@@ -88,6 +91,10 @@ public final class CJKTokenizer extends Tokenizer {
      */
    private boolean preIsTokened = false;
 
+    private TermAttribute termAtt;
+    private OffsetAttribute offsetAtt;
+    private TypeAttribute typeAtt;
+
    //~ Constructors -----------------------------------------------------------
 
    /**
@@ -97,25 +104,26 @@ public final class CJKTokenizer extends Tokenizer {
     */
    public CJKTokenizer(Reader in) {
        super(in);
+        termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+        offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+        typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
    }
 
    //~ Methods ----------------------------------------------------------------
 
    /**
-     * Returns the next token in the stream, or null at EOS.
+     * Returns true for the next token in the stream, or false at EOS.
     * See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html
     * for detail.
     *
-     * @param reusableToken a reusable token
-     * @return Token
+     * @return false for end of stream, true otherwise
     *
     * @throws java.io.IOException - throw IOException when read error <br>
     *         happened in the InputStream
     *
     */
-    public final Token next(final Token reusableToken) throws java.io.IOException {
+    public boolean incrementToken() throws IOException {
        /** how many character(s) has been stored in buffer */
-        assert reusableToken != null;
 
        while(true) { // loop until we find a non-empty token
 
@@ -147,7 +155,7 @@ public final class CJKTokenizer extends Tokenizer {
 
                    break;
                } else {
-                    return null;
+                    return false;
                }
            } else {
                //get current character
@@ -252,10 +260,12 @@ public final class CJKTokenizer extends Tokenizer {
            }
 
            if (length > 0) {
-                return reusableToken.reinit
-                    (buffer, 0, length, input.correctOffset(start), input.correctOffset(start+length), TOKEN_TYPE_NAMES[tokenType]);
+                termAtt.setTermBuffer(buffer, 0, length);
+                offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(start+length));
+                typeAtt.setType(TOKEN_TYPE_NAMES[tokenType]);
+                return true;
            } else if (dataLen == -1) {
-                return null;
+                return false;
            }
 
            // Cycle back and try for the next token (don't
@@ -17,12 +17,13 @@ package org.apache.lucene.analysis.cn;
 * limitations under the License.
 */
 
+import java.io.IOException;
 import java.util.HashMap;
 import java.util.Map;
 
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
 /**
  * Title: ChineseFilter
@@ -56,19 +57,21 @@ public final class ChineseFilter extends TokenFilter {
 
    private Map stopTable;
 
+    private TermAttribute termAtt;
 
    public ChineseFilter(TokenStream in) {
        super(in);
 
        stopTable = new HashMap(STOP_WORDS.length);
        for (int i = 0; i < STOP_WORDS.length; i++)
            stopTable.put(STOP_WORDS[i], STOP_WORDS[i]);
+        termAtt = (TermAttribute) addAttribute(TermAttribute.class);
    }
 
-    public final Token next(final Token reusableToken) throws java.io.IOException {
-        assert reusableToken != null;
+    public boolean incrementToken() throws IOException {
 
-        for (Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken)) {
-            String text = nextToken.term();
+        while (input.incrementToken()) {
+            String text = termAtt.term();
 
            // why not key off token type here assuming ChineseTokenizer comes first?
            if (stopTable.get(text) == null) {
@@ -79,7 +82,7 @@ public final class ChineseFilter extends TokenFilter {
 
                // English word/token should larger than 1 character.
                if (text.length()>1) {
-                    return nextToken;
+                    return true;
                }
                break;
            case Character.OTHER_LETTER:
@@ -87,13 +90,13 @@ public final class ChineseFilter extends TokenFilter {
                // One Chinese character as one Chinese word.
                // Chinese word extraction to be added later here.
 
-                return nextToken;
+                return true;
            }
 
            }
 
        }
-        return null;
+        return false;
    }
 
 }
@@ -18,10 +18,12 @@ package org.apache.lucene.analysis.cn;
 */
 
 
+import java.io.IOException;
 import java.io.Reader;
 
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
 
 /**
@@ -56,6 +58,8 @@ public final class ChineseTokenizer extends Tokenizer {
 
    public ChineseTokenizer(Reader in) {
        super(in);
+        termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+        offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
    }
 
    private int offset = 0, bufferIndex=0, dataLen=0;
@@ -68,7 +72,9 @@ public final class ChineseTokenizer extends Tokenizer {
    private int length;
    private int start;
 
+    private TermAttribute termAtt;
+    private OffsetAttribute offsetAtt;
 
    private final void push(char c) {
 
        if (length == 0) start = offset-1;            // start of token
@@ -76,19 +82,20 @@ public final class ChineseTokenizer extends Tokenizer {
 
    }
 
-    private final Token flush(final Token token) {
+    private final boolean flush() {
 
        if (length>0) {
            //System.out.println(new String(buffer, 0,
            //length));
-            return token.reinit(buffer, 0, length, input.correctOffset(start), input.correctOffset(start+length));
+            termAtt.setTermBuffer(buffer, 0, length);
+            offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(start+length));
+            return true;
        }
        else
-            return null;
+            return false;
    }
 
-    public final Token next(final Token reusableToken) throws java.io.IOException {
-        assert reusableToken != null;
+    public boolean incrementToken() throws IOException {
 
        length = 0;
        start = offset;
@@ -104,7 +111,7 @@ public final class ChineseTokenizer extends Tokenizer {
                bufferIndex = 0;
            }
 
-            if (dataLen == -1) return flush(reusableToken);
+            if (dataLen == -1) return flush();
            else
                c = ioBuffer[bufferIndex++];
 
@@ -115,20 +122,20 @@ public final class ChineseTokenizer extends Tokenizer {
            case Character.LOWERCASE_LETTER:
            case Character.UPPERCASE_LETTER:
                push(c);
-                if (length == MAX_WORD_LEN) return flush(reusableToken);
+                if (length == MAX_WORD_LEN) return flush();
                break;
 
            case Character.OTHER_LETTER:
                if (length>0) {
                    bufferIndex--;
                    offset--;
-                    return flush(reusableToken);
+                    return flush();
                }
                push(c);
-                return flush(reusableToken);
+                return flush();
 
            default:
-                if (length>0) return flush(reusableToken);
+                if (length>0) return flush();
                break;
            }
        }
@@ -28,6 +28,12 @@ import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 
 /**
  * Base class for decomposition token filters.
@@ -54,6 +60,15 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
  protected final int minSubwordSize;
  protected final int maxSubwordSize;
  protected final boolean onlyLongestMatch;
 
+  private TermAttribute termAtt;
+  private OffsetAttribute offsetAtt;
+  private FlagsAttribute flagsAtt;
+  private PositionIncrementAttribute posIncAtt;
+  private TypeAttribute typeAtt;
+  private PayloadAttribute payloadAtt;
+
+  private final Token wrapper = new Token();
+
  protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
    this(input,makeDictionary(dictionary),minWordSize,minSubwordSize,maxSubwordSize, onlyLongestMatch);
@@ -90,6 +105,13 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
      this.dictionary = new CharArraySet(dictionary.size(), false);
      addAllLowerCase(this.dictionary, dictionary);
    }
+
+    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+    offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+    flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class);
+    posIncAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+    typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
+    payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
  }
 
  /**
@@ -105,26 +127,54 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
    return dict;
  }
 
-  public Token next(final Token reusableToken) throws IOException {
-    assert reusableToken != null;
+  private final void setToken(final Token token) throws IOException {
+    termAtt.setTermBuffer(token.termBuffer(), 0, token.termLength());
+    flagsAtt.setFlags(token.getFlags());
+    typeAtt.setType(token.type());
+    offsetAtt.setOffset(token.startOffset(), token.endOffset());
+    posIncAtt.setPositionIncrement(token.getPositionIncrement());
+    payloadAtt.setPayload(token.getPayload());
+  }
+
+  public final boolean incrementToken() throws IOException {
    if (tokens.size() > 0) {
-      return (Token)tokens.removeFirst();
+      setToken((Token)tokens.removeFirst());
+      return true;
    }
 
-    Token nextToken = input.next(reusableToken);
-    if (nextToken == null) {
-      return null;
-    }
+    if (input.incrementToken() == false)
+      return false;
+
+    wrapper.setTermBuffer(termAtt.termBuffer(), 0, termAtt.termLength());
+    wrapper.setStartOffset(offsetAtt.startOffset());
+    wrapper.setEndOffset(offsetAtt.endOffset());
+    wrapper.setFlags(flagsAtt.getFlags());
+    wrapper.setType(typeAtt.type());
+    wrapper.setPositionIncrement(posIncAtt.getPositionIncrement());
+    wrapper.setPayload(payloadAtt.getPayload());
 
-    decompose(nextToken);
+    decompose(wrapper);
 
    if (tokens.size() > 0) {
-      return (Token)tokens.removeFirst();
+      setToken((Token)tokens.removeFirst());
+      return true;
    } else {
-      return null;
+      return false;
    }
  }
 
+  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+   * not be overridden. Delegates to the backwards compatibility layer. */
+  public final Token next(final Token reusableToken) throws java.io.IOException {
+    return super.next(reusableToken);
+  }
+
+  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+   * not be overridden. Delegates to the backwards compatibility layer. */
+  public final Token next() throws java.io.IOException {
+    return super.next();
+  }
+
  protected static final void addAllLowerCase(Set target, Collection col) {
    Iterator iter=col.iterator();
 
@@ -17,13 +17,13 @@ package org.apache.lucene.analysis.de;
 * limitations under the License.
 */
 
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.TokenStream;
-
 import java.io.IOException;
 import java.util.Set;
 
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
 /**
  * A filter that stems German words. It supports a table of words that should
 * not be stemmed at all. The stemmer used can be changed at runtime after the
@@ -40,10 +40,13 @@ public final class GermanStemFilter extends TokenFilter
    private GermanStemmer stemmer = null;
    private Set exclusionSet = null;
 
+    private TermAttribute termAtt;
+
    public GermanStemFilter( TokenStream in )
    {
      super(in);
      stemmer = new GermanStemmer();
+      termAtt = (TermAttribute) addAttribute(TermAttribute.class);
    }
 
    /**
@@ -56,26 +59,22 @@ public final class GermanStemFilter extends TokenFilter
    }
 
    /**
-     * @return Returns the next token in the stream, or null at EOS
+     * @return Returns true for next token in the stream, or false at EOS
     */
-    public final Token next(final Token reusableToken)
-      throws IOException
-    {
-      assert reusableToken != null;
-      Token nextToken = input.next(reusableToken);
-
-      if (nextToken == null)
-        return null;
-
-      String term = nextToken.term();
-      // Check the exclusion table.
-      if (exclusionSet == null || !exclusionSet.contains(term)) {
-        String s = stemmer.stem(term);
-        // If not stemmed, don't waste the time adjusting the token.
-        if ((s != null) && !s.equals(term))
-          nextToken.setTermBuffer(s);
+    public boolean incrementToken() throws IOException {
+      if (input.incrementToken()) {
+        String term = termAtt.term();
+        // Check the exclusion table.
+        if (exclusionSet == null || !exclusionSet.contains(term)) {
+          String s = stemmer.stem(term);
+          // If not stemmed, don't waste the time adjusting the token.
+          if ((s != null) && !s.equals(term))
+            termAtt.setTermBuffer(s);
+        }
+        return true;
+      } else {
+        return false;
      }
-      return nextToken;
    }
 
    /**
@@ -16,9 +16,11 @@ package org.apache.lucene.analysis.el;
 * limitations under the License.
 */
 
+import java.io.IOException;
+
 import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
 /**
  * Normalizes token text to lower case, analyzing given ("greek") charset.
@@ -28,26 +30,26 @@ public final class GreekLowerCaseFilter extends TokenFilter
 {
    char[] charset;
 
+    private TermAttribute termAtt;
+
    public GreekLowerCaseFilter(TokenStream in, char[] charset)
    {
        super(in);
        this.charset = charset;
+        termAtt = (TermAttribute) addAttribute(TermAttribute.class);
    }
 
-    public final Token next(final Token reusableToken) throws java.io.IOException
-    {
-        assert reusableToken != null;
-        Token nextToken = input.next(reusableToken);
-
-        if (nextToken == null)
-            return null;
-
-        char[] chArray = nextToken.termBuffer();
-        int chLen = nextToken.termLength();
+    public boolean incrementToken() throws IOException {
+      if (input.incrementToken()) {
+        char[] chArray = termAtt.termBuffer();
+        int chLen = termAtt.termLength();
        for (int i = 0; i < chLen; i++)
        {
          chArray[i] = GreekCharsets.toLowerCase(chArray[i], charset);
        }
-        return nextToken;
+        return true;
+      } else {
+        return false;
+      }
    }
 }
@@ -25,6 +25,7 @@ import java.util.Iterator;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
 /**
  * Removes elisions from a token stream. For example, "l'avion" (the plane) will be
@@ -36,7 +37,8 @@ import org.apache.lucene.analysis.TokenFilter;
 */
 public class ElisionFilter extends TokenFilter {
  private Set articles = null;
+  private TermAttribute termAtt;
 
  private static char[] apostrophes = {'\'', '’'};
 
  public void setArticles(Set articles) {
@@ -54,6 +56,7 @@ public class ElisionFilter extends TokenFilter {
    super(input);
    this.articles = new HashSet(Arrays.asList(new String[] { "l", "m", "t",
        "qu", "n", "s", "j" }));
+    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
  }
 
  /**
@@ -62,6 +65,7 @@ public class ElisionFilter extends TokenFilter {
  public ElisionFilter(TokenStream input, Set articles) {
    super(input);
    setArticles(articles);
+    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
  }
 
  /**
@@ -70,39 +74,50 @@ public class ElisionFilter extends TokenFilter {
  public ElisionFilter(TokenStream input, String[] articles) {
    super(input);
    setArticles(new HashSet(Arrays.asList(articles)));
+    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
  }
 
  /**
   * Returns the next input Token with term() without elisioned start
   */
-  public Token next(final Token reusableToken) throws IOException {
-    assert reusableToken != null;
-    Token nextToken = input.next(reusableToken);
-    if (nextToken == null)
-      return null;
-
-    char[] termBuffer = nextToken.termBuffer();
-    int termLength = nextToken.termLength();
-
-    int minPoz = Integer.MAX_VALUE;
-    for (int i = 0; i < apostrophes.length; i++) {
-      char apos = apostrophes[i];
-      // The equivalent of String.indexOf(ch)
-      for (int poz = 0; poz < termLength ; poz++) {
-        if (termBuffer[poz] == apos) {
+  public final boolean incrementToken() throws IOException {
+    if (input.incrementToken()) {
+      char[] termBuffer = termAtt.termBuffer();
+      int termLength = termAtt.termLength();
+
+      int minPoz = Integer.MAX_VALUE;
+      for (int i = 0; i < apostrophes.length; i++) {
+        char apos = apostrophes[i];
+        // The equivalent of String.indexOf(ch)
+        for (int poz = 0; poz < termLength ; poz++) {
+          if (termBuffer[poz] == apos) {
            minPoz = Math.min(poz, minPoz);
            break;
+          }
        }
      }
-    }
 
      // An apostrophe has been found. If the prefix is an article strip it off.
      if (minPoz != Integer.MAX_VALUE
-        && articles.contains(new String(nextToken.termBuffer(), 0, minPoz).toLowerCase())) {
-      nextToken.setTermBuffer(nextToken.termBuffer(), minPoz + 1, nextToken.termLength() - (minPoz + 1));
+          && articles.contains(new String(termAtt.termBuffer(), 0, minPoz).toLowerCase())) {
+        termAtt.setTermBuffer(termAtt.termBuffer(), minPoz + 1, termAtt.termLength() - (minPoz + 1));
      }
 
-    return nextToken;
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+   * not be overridden. Delegates to the backwards compatibility layer. */
+  public final Token next(final Token reusableToken) throws java.io.IOException {
+    return super.next(reusableToken);
+  }
+
+  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+   * not be overridden. Delegates to the backwards compatibility layer. */
+  public final Token next() throws java.io.IOException {
+    return super.next();
  }
 }
@@ -20,6 +20,7 @@ package org.apache.lucene.analysis.fr;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
 import java.io.IOException;
 import java.util.HashSet;
@@ -39,10 +40,13 @@ public final class FrenchStemFilter extends TokenFilter {
   */
  private FrenchStemmer stemmer = null;
  private Set exclusions = null;
 
+  private TermAttribute termAtt;
+
  public FrenchStemFilter( TokenStream in ) {
    super(in);
    stemmer = new FrenchStemmer();
+    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
  }
 
 
@@ -52,25 +56,23 @@ public final class FrenchStemFilter extends TokenFilter {
  }
 
  /**
-   * @return Returns the next token in the stream, or null at EOS
+   * @return Returns true for the next token in the stream, or false at EOS
   */
-  public final Token next(final Token reusableToken)
-    throws IOException {
-    assert reusableToken != null;
-    Token nextToken = input.next(reusableToken);
-    if (nextToken == null)
-      return null;
-
-    String term = nextToken.term();
-
-    // Check the exclusion table
-    if ( exclusions == null || !exclusions.contains( term ) ) {
-      String s = stemmer.stem( term );
-      // If not stemmed, don't waste the time adjusting the token.
-      if ((s != null) && !s.equals( term ) )
-        nextToken.setTermBuffer(s);
-    }
-    return nextToken;
+  public boolean incrementToken() throws IOException {
+    if (input.incrementToken()) {
+      String term = termAtt.term();
+      // Check the exclusion table
+      if ( exclusions == null || !exclusions.contains( term ) ) {
+        String s = stemmer.stem( term );
+        // If not stemmed, don't waste the time adjusting the token.
+        if ((s != null) && !s.equals( term ) )
+          termAtt.setTermBuffer(s);
+      }
+      return true;
+    } else {
+      return false;
+    }
  }
  /**
   * Set a alternative/custom FrenchStemmer for this filter.
@@ -27,8 +27,19 @@ import java.io.IOException;
 */
 public class EmptyTokenStream extends TokenStream {
 
-  public Token next(final Token reusableToken) throws IOException {
-    assert reusableToken != null;
-    return null;
+  public final boolean incrementToken() throws IOException {
+    return false;
+  }
+
+  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+   * not be overridden. Delegates to the backwards compatibility layer. */
+  public final Token next(final Token reusableToken) throws java.io.IOException {
+    return super.next(reusableToken);
+  }
+
+  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+   * not be overridden. Delegates to the backwards compatibility layer. */
+  public final Token next() throws java.io.IOException {
+    return super.next();
  }
 }
@@ -24,6 +24,7 @@ import java.io.IOException;
 
 /**
  * Links two PrefixAwareTokenFilter
+ * @deprecated
 */
 public class PrefixAndSuffixAwareTokenFilter extends TokenStream {
 
@@ -29,6 +29,7 @@ import java.io.IOException;
 * to be used when updating the token values in the second stream based on that token.
 *
 * The default implementation adds last prefix token end offset to the suffix token start and end offsets.
+ * @deprecated
 */
 public class PrefixAwareTokenFilter extends TokenStream {
 
@@ -17,10 +17,16 @@ package org.apache.lucene.analysis.miscellaneous;
 * limitations under the License.
 */
 
+import java.io.IOException;
+
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
-
-import java.io.IOException;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 
 /**
  * A token stream containing a single token.
@@ -29,34 +35,66 @@ public class SingleTokenTokenStream extends TokenStream {
 
  private boolean exhausted = false;
  // The token needs to be immutable, so work with clones!
-  private Token token;
+  private Token singleToken;
+
+  private TermAttribute termAtt;
+  private OffsetAttribute offsetAtt;
+  private FlagsAttribute flagsAtt;
+  private PositionIncrementAttribute posIncAtt;
+  private TypeAttribute typeAtt;
+  private PayloadAttribute payloadAtt;
 
  public SingleTokenTokenStream(Token token) {
    assert token != null;
-    this.token = (Token) token.clone();
+    this.singleToken = (Token) token.clone();
+
+    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+    offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+    flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class);
+    posIncAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+    typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
+    payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
  }
 
 
-  public Token next(final Token reusableToken) throws IOException {
-    assert reusableToken != null;
+  public final boolean incrementToken() throws IOException {
    if (exhausted) {
-      return null;
+      return false;
    }
 
+    Token clone = (Token) singleToken.clone();
+
+    termAtt.setTermBuffer(clone.termBuffer(), 0, clone.termLength());
+    offsetAtt.setOffset(clone.startOffset(), clone.endOffset());
+    flagsAtt.setFlags(clone.getFlags());
+    typeAtt.setType(clone.type());
+    posIncAtt.setPositionIncrement(clone.getPositionIncrement());
+    payloadAtt.setPayload(clone.getPayload());
    exhausted = true;
-    return (Token) token.clone();
+    return true;
+  }
+
+  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+   * not be overridden. Delegates to the backwards compatibility layer. */
+  public final Token next(final Token reusableToken) throws java.io.IOException {
+    return super.next(reusableToken);
+  }
+
+  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+   * not be overridden. Delegates to the backwards compatibility layer. */
+  public final Token next() throws java.io.IOException {
+    return super.next();
  }
 
  public void reset() throws IOException {
    exhausted = false;
  }
 
  public Token getToken() {
-    return (Token) token.clone();
+    return (Token) singleToken.clone();
  }
 
  public void setToken(Token token) {
-    this.token = (Token) token.clone();
+    this.singleToken = (Token) token.clone();
  }
 }
@@ -20,9 +20,10 @@ package org.apache.lucene.analysis.ngram;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
 import java.io.IOException;
-import java.util.LinkedList;
 
 /**
  * Tokenizes the given token into n-grams of given size(s).
@@ -66,11 +67,18 @@ public class EdgeNGramTokenFilter extends TokenFilter {
  private int minGram;
  private int maxGram;
  private Side side;
-  private LinkedList ngrams;
+  private char[] curTermBuffer;
+  private int curTermLength;
+  private int curGramSize;
+
+  private TermAttribute termAtt;
+  private OffsetAttribute offsetAtt;
 
 
  protected EdgeNGramTokenFilter(TokenStream input) {
    super(input);
-    this.ngrams = new LinkedList();
+    this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+    this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
  }
 
  /**
@@ -99,7 +107,8 @@ public class EdgeNGramTokenFilter extends TokenFilter {
    this.minGram = minGram;
    this.maxGram = maxGram;
    this.side = side;
-    this.ngrams = new LinkedList();
+    this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+    this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
  }
 
  /**
@@ -114,54 +123,42 @@ public class EdgeNGramTokenFilter extends TokenFilter {
    this(input, Side.getSide(sideLabel), minGram, maxGram);
  }
 
-  /** Returns the next token in the stream, or null at EOS. */
-  public final Token next(final Token reusableToken) throws IOException {
-    assert reusableToken != null;
-    if (!ngrams.isEmpty()) {
-      return (Token)ngrams.removeFirst();
-    }
-
-    Token token = null;
-
-    while (ngrams.isEmpty() && (token = input.next()) != null) {
-      ngram(token);
-    }
-
-    if (token == null) {
-      return null;
-    }
-
-    if (!ngrams.isEmpty()) {
-      return (Token)ngrams.removeFirst();
-    } else {
-      return null;
+  public final boolean incrementToken() throws IOException {
+    while (true) {
+      if (curTermBuffer == null) {
+        if (!input.incrementToken()) {
+          return false;
+        } else {
+          curTermBuffer = (char[]) termAtt.termBuffer().clone();
+          curTermLength = termAtt.termLength();
+          curGramSize = minGram;
+        }
+      }
+      if (curGramSize <= maxGram) {
+        if (! (curGramSize > curTermLength         // if the remaining input is too short, we can't generate any n-grams
+            || curGramSize > maxGram)) {           // if we have hit the end of our n-gram size range, quit
+          // grab gramSize chars from front or back
+          int start = side == Side.FRONT ? 0 : curTermLength - curGramSize;
+          int end = start + curGramSize;
+          offsetAtt.setOffset(start, end);
+          termAtt.setTermBuffer(curTermBuffer, start, curGramSize);
+          curGramSize++;
+          return true;
+        }
+      }
+      curTermBuffer = null;
    }
  }
 
-  private void ngram(final Token token) {
-    int termLength = token.termLength();
-    char[] termBuffer = token.termBuffer();
-    int gramSize = minGram;
-    while (gramSize <= maxGram) {
-      // if the remaining input is too short, we can't generate any n-grams
-      if (gramSize > termLength) {
-        return;
-      }
-
-      // if we have hit the end of our n-gram size range, quit
-      if (gramSize > maxGram) {
-        return;
-      }
-
-      // grab gramSize chars from front or back
-      int start = side == Side.FRONT ? 0 : termLength - gramSize;
-      int end = start + gramSize;
-      Token tok = (Token) token.clone();
-      tok.setStartOffset(start);
-      tok.setEndOffset(end);
-      tok.setTermBuffer(termBuffer, start, gramSize);
-      ngrams.add(tok);
-      gramSize++;
-    }
-  }
+  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+   * not be overridden. Delegates to the backwards compatibility layer. */
+  public final Token next(final Token reusableToken) throws java.io.IOException {
+    return super.next(reusableToken);
+  }
+
+  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+   * not be overridden. Delegates to the backwards compatibility layer. */
+  public final Token next() throws java.io.IOException {
+    return super.next();
+  }
 }
@@ -20,6 +20,8 @@ package org.apache.lucene.analysis.ngram;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter.Side;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
 import java.io.IOException;
 import java.io.Reader;
@@ -35,6 +37,9 @@ public class EdgeNGramTokenizer extends Tokenizer {
  public static final Side DEFAULT_SIDE = Side.FRONT;
  public static final int DEFAULT_MAX_GRAM_SIZE = 1;
  public static final int DEFAULT_MIN_GRAM_SIZE = 1;
 
+  private TermAttribute termAtt;
+  private OffsetAttribute offsetAtt;
+
  // Replace this with an enum when the Java 1.5 upgrade is made, the impl will be simplified
  /** Specifies which side of the input the n-gram should be generated from */
@@ -100,6 +105,9 @@ public class EdgeNGramTokenizer extends Tokenizer {
    this.minGram = minGram;
    this.maxGram = maxGram;
    this.side = side;
+
+    this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+    this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
  }
  /**
   * Creates EdgeNGramTokenizer that can generate n-grams in the sizes of the given range
@@ -114,8 +122,7 @@ public class EdgeNGramTokenizer extends Tokenizer {
  }
 
  /** Returns the next token in the stream, or null at EOS. */
-  public final Token next(final Token reusableToken) throws IOException {
-    assert reusableToken != null;
+  public final boolean incrementToken() throws IOException {
    // if we are just starting, read the whole input
    if (!started) {
      started = true;
@@ -128,21 +135,32 @@ public class EdgeNGramTokenizer extends Tokenizer {
 
    // if the remaining input is too short, we can't generate any n-grams
    if (gramSize > inLen) {
-      return null;
+      return false;
    }
 
    // if we have hit the end of our n-gram size range, quit
    if (gramSize > maxGram) {
-      return null;
+      return false;
    }
 
    // grab gramSize chars from front or back
    int start = side == Side.FRONT ? 0 : inLen - gramSize;
    int end = start + gramSize;
-    reusableToken.setTermBuffer(inStr, start, gramSize);
-    reusableToken.setStartOffset(input.correctOffset(start));
-    reusableToken.setEndOffset(input.correctOffset(end));
+    termAtt.setTermBuffer(inStr, start, gramSize);
+    offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(end));
    gramSize++;
-    return reusableToken;
+    return true;
+  }
+
+  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+   * not be overridden. Delegates to the backwards compatibility layer. */
+  public final Token next(final Token reusableToken) throws java.io.IOException {
+    return super.next(reusableToken);
+  }
+
+  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+   * not be overridden. Delegates to the backwards compatibility layer. */
+  public final Token next() throws java.io.IOException {
+    return super.next();
  }
 }
@@ -17,12 +17,13 @@ package org.apache.lucene.analysis.ngram;
  * limitations under the License.
  */
 
+import java.io.IOException;
+
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-
-import java.io.IOException;
-import java.util.LinkedList;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
 /**
  * Tokenizes the input into n-grams of the given size(s).
@@ -32,7 +33,14 @@ public class NGramTokenFilter extends TokenFilter {
   public static final int DEFAULT_MAX_NGRAM_SIZE = 2;
 
   private int minGram, maxGram;
-  private LinkedList ngrams;
+
+  private char[] curTermBuffer;
+  private int curTermLength;
+  private int curGramSize;
+  private int curPos;
+
+  private TermAttribute termAtt;
+  private OffsetAttribute offsetAtt;
 
   /**
    * Creates NGramTokenFilter with given min and max n-grams.
@@ -50,7 +58,9 @@ public class NGramTokenFilter extends TokenFilter {
     }
     this.minGram = minGram;
     this.maxGram = maxGram;
-    this.ngrams = new LinkedList();
+
+    this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+    this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
   }
 
   /**
@@ -62,40 +72,41 @@ public class NGramTokenFilter extends TokenFilter {
   }
 
   /** Returns the next token in the stream, or null at EOS. */
-  public final Token next(final Token reusableToken) throws IOException {
-    assert reusableToken != null;
-    if (!ngrams.isEmpty()) {
-      return (Token)ngrams.removeFirst();
-    }
-
-    Token token = null;
-    while (ngrams.isEmpty() && (token = input.next()) != null) {
-      ngram(token);
-    }
-
-    if (token == null) {
-      return null;
-    }
-
-    if (!ngrams.isEmpty()) {
-      return (Token)ngrams.removeFirst();
-    } else {
-      return null;
-    }
-  }
-
-  private void ngram(Token token) {
-    char[] termBuffer = token.termBuffer();
-    int termLength = token.termLength();
-    int gramSize = minGram;
-    while (gramSize <= maxGram) {
-      int pos = 0;                        // reset to beginning of string
-      while (pos+gramSize <= termLength) {    // while there is input
-        ngrams.add(token.clone(termBuffer, pos, gramSize, pos, pos+gramSize));
-        pos++;
-      }
-      gramSize++;                         // increase n-gram size
-    }
+  public final boolean incrementToken() throws IOException {
+    while (true) {
+      if (curTermBuffer == null) {
+        if (!input.incrementToken()) {
+          return false;
+        } else {
+          curTermBuffer = (char[]) termAtt.termBuffer().clone();
+          curTermLength = termAtt.termLength();
+          curGramSize = minGram;
+          curPos = 0;
+        }
+      }
+      while (curGramSize <= maxGram) {
+        while (curPos+curGramSize <= curTermLength) {    // while there is input
+          termAtt.setTermBuffer(curTermBuffer, curPos, curGramSize);
+          offsetAtt.setOffset(curPos, curPos+curGramSize);
+          curPos++;
+          return true;
+        }
+        curGramSize++;                         // increase n-gram size
+        curPos = 0;
+      }
+      curTermBuffer = null;
+    }
+  }
+
+  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+   * not be overridden. Delegates to the backwards compatibility layer. */
+  public final Token next(final Token reusableToken) throws java.io.IOException {
+    return super.next(reusableToken);
+  }
+
+  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+   * not be overridden. Delegates to the backwards compatibility layer. */
+  public final Token next() throws java.io.IOException {
+    return super.next();
   }
 }
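For contrast, a stripped-down filter written directly against the new API, following the same pattern as the converted classes above. The class below is illustrative only and does not exist in Lucene: it upper-cases each term by editing the shared term buffer in place.

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public final class UpperCaseDemoFilter extends TokenFilter {
  private final TermAttribute termAtt;

  public UpperCaseDemoFilter(TokenStream input) {
    super(input);
    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
  }

  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;                       // end of stream: no more tokens
    }
    char[] buffer = termAtt.termBuffer();
    int length = termAtt.termLength();
    for (int i = 0; i < length; i++) {
      buffer[i] = Character.toUpperCase(buffer[i]);  // edit the term text in place
    }
    return true;
  }
}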
@@ -19,6 +19,8 @@ package org.apache.lucene.analysis.ngram;
 
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
 import java.io.IOException;
 import java.io.Reader;
@@ -36,6 +38,9 @@ public class NGramTokenizer extends Tokenizer {
   private int inLen;
   private String inStr;
   private boolean started = false;
+
+  private TermAttribute termAtt;
+  private OffsetAttribute offsetAtt;
 
   /**
    * Creates NGramTokenizer with given min and max n-grams.
@@ -53,6 +58,9 @@ public class NGramTokenizer extends Tokenizer {
     }
     this.minGram = minGram;
     this.maxGram = maxGram;
+
+    this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+    this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
   }
   /**
    * Creates NGramTokenizer with default min and max n-grams.
@@ -63,8 +71,7 @@ public class NGramTokenizer extends Tokenizer {
   }
 
   /** Returns the next token in the stream, or null at EOS. */
-  public final Token next(final Token reusableToken) throws IOException {
-    assert reusableToken != null;
+  public final boolean incrementToken() throws IOException {
     if (!started) {
       started = true;
       gramSize = minGram;
@@ -78,13 +85,27 @@ public class NGramTokenizer extends Tokenizer {
       pos = 0;                        // reset to beginning of string
       gramSize++;                     // increase n-gram size
       if (gramSize > maxGram)         // we are done
-        return null;
+        return false;
       if (pos+gramSize > inLen)
-        return null;
+        return false;
     }
 
     int oldPos = pos;
     pos++;
-    return reusableToken.reinit(inStr, oldPos, gramSize, input.correctOffset(oldPos), input.correctOffset(oldPos+gramSize));
+    termAtt.setTermBuffer(inStr, oldPos, gramSize);
+    offsetAtt.setOffset(input.correctOffset(oldPos), input.correctOffset(oldPos+gramSize));
+    return true;
+  }
+
+  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+   * not be overridden. Delegates to the backwards compatibility layer. */
+  public final Token next(final Token reusableToken) throws java.io.IOException {
+    return super.next(reusableToken);
+  }
+
+  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+   * not be overridden. Delegates to the backwards compatibility layer. */
+  public final Token next() throws java.io.IOException {
+    return super.next();
   }
 }
@@ -17,15 +17,15 @@ package org.apache.lucene.analysis.nl;
  * limitations under the License.
  */
 
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.TokenStream;
-
 import java.io.IOException;
 import java.util.HashMap;
 import java.util.HashSet;
-import java.util.Set;
 import java.util.Map;
+import java.util.Set;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
 /**
  * A filter that stems Dutch words. It supports a table of words that should
@@ -39,10 +39,13 @@ public final class DutchStemFilter extends TokenFilter {
    */
   private DutchStemmer stemmer = null;
   private Set exclusions = null;
+
+  private TermAttribute termAtt;
 
   public DutchStemFilter(TokenStream _in) {
     super(_in);
     stemmer = new DutchStemmer();
+    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
   }
 
   /**
@@ -62,24 +65,23 @@ public final class DutchStemFilter extends TokenFilter {
   }
 
   /**
-   * @return  Returns the next token in the stream, or null at EOS
+   * Returns the next token in the stream, or null at EOS
    */
-  public Token next(Token reusableToken) throws IOException {
-    assert reusableToken != null;
-    Token nextToken = input.next(reusableToken);
-    if (nextToken == null)
-      return null;
-
-    String term = nextToken.term();
-
-    // Check the exclusion table.
-    if (exclusions == null || !exclusions.contains(term)) {
-      String s = stemmer.stem(term);
-      // If not stemmed, don't waste the time adjusting the token.
-      if ((s != null) && !s.equals(term))
-        nextToken.setTermBuffer(s);
-    }
-    return nextToken;
+  public boolean incrementToken() throws IOException {
+    if (input.incrementToken()) {
+      String term = termAtt.term();
+
+      // Check the exclusion table.
+      if (exclusions == null || !exclusions.contains(term)) {
+        String s = stemmer.stem(term);
+        // If not stemmed, don't waste the time adjusting the token.
+        if ((s != null) && !s.equals(term))
+          termAtt.setTermBuffer(s);
+      }
+      return true;
+    } else {
+      return false;
+    }
   }
 
   /**
@@ -16,14 +16,13 @@ package org.apache.lucene.analysis.payloads;
  * limitations under the License.
  */
 
+import java.io.IOException;
+
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
-import java.io.IOException;
-
 
 /**
  * Characters before the delimiter are the "token", those after are the payload.
@@ -37,7 +36,7 @@ import java.io.IOException;
  *
  * @see PayloadEncoder
  */
-public class DelimitedPayloadTokenFilter extends TokenFilter {
+public final class DelimitedPayloadTokenFilter extends TokenFilter {
   public static final char DEFAULT_DELIMITER = '|';
   protected char delimiter = DEFAULT_DELIMITER;
   protected TermAttribute termAtt;
@@ -83,27 +82,4 @@ public class DelimitedPayloadTokenFilter extends TokenFilter {
     }
     return result;
   }
-
-
-  public Token next(Token reusableToken) throws IOException {
-    Token result = input.next(reusableToken);
-    if (result != null) {
-      final char[] buffer = result.termBuffer();
-      final int length = result.termLength();
-      boolean seen = false;
-      for (int i = 0; i < length; i++) {
-        if (buffer[i] == delimiter) {
-          result.setTermBuffer(buffer, 0, i);
-          result.setPayload(encoder.encode(buffer, i + 1, (length - (i + 1))));
-          seen = true;
-          break;//at this point, we know the whole piece, so we can exit.  If we don't see the delimiter, then the termAtt is the same
-        }
-      }
-      if (seen == false) {
-        //no delimiter
-        payAtt.setPayload(null);
-      }
-    }
-    return result;
-  }
 }
@@ -20,6 +20,8 @@ package org.apache.lucene.analysis.payloads;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.index.Payload;
 
 import java.io.IOException;
@@ -34,19 +36,37 @@ public class NumericPayloadTokenFilter extends TokenFilter {
   private String typeMatch;
   private Payload thePayload;
 
+  private PayloadAttribute payloadAtt;
+  private TypeAttribute typeAtt;
+
   public NumericPayloadTokenFilter(TokenStream input, float payload, String typeMatch) {
     super(input);
     //Need to encode the payload
     thePayload = new Payload(PayloadHelper.encodeFloat(payload));
     this.typeMatch = typeMatch;
+    payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
+    typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
   }
 
-  public Token next(final Token reusableToken) throws IOException {
-    assert reusableToken != null;
-    Token nextToken = input.next(reusableToken);
-    if (nextToken != null && nextToken.type().equals(typeMatch)){
-      nextToken.setPayload(thePayload);
+  public final boolean incrementToken() throws IOException {
+    if (input.incrementToken()) {
+      if (typeAtt.type().equals(typeMatch))
+        payloadAtt.setPayload(thePayload);
+      return true;
+    } else {
+      return false;
     }
-    return nextToken;
+  }
+
+  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+   * not be overridden. Delegates to the backwards compatibility layer. */
+  public final Token next(final Token reusableToken) throws java.io.IOException {
+    return super.next(reusableToken);
+  }
+
+  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+   * not be overridden. Delegates to the backwards compatibility layer. */
+  public final Token next() throws java.io.IOException {
+    return super.next();
   }
 }
@@ -17,13 +17,15 @@ package org.apache.lucene.analysis.payloads;
  */
 
+import java.io.IOException;
+
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.index.Payload;
 
-import java.io.IOException;
-
 
 /**
  * Adds the {@link org.apache.lucene.analysis.Token#setStartOffset(int)}
@@ -32,22 +34,37 @@ import java.io.IOException;
  *
  **/
 public class TokenOffsetPayloadTokenFilter extends TokenFilter {
+  protected OffsetAttribute offsetAtt;
+  protected PayloadAttribute payAtt;
+
   public TokenOffsetPayloadTokenFilter(TokenStream input) {
     super(input);
+    offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+    payAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
   }
 
-  public Token next(final Token reusableToken) throws IOException {
-    assert reusableToken != null;
-    Token nextToken = input.next(reusableToken);
-    if (nextToken != null){
+  public final boolean incrementToken() throws IOException {
+    if (input.incrementToken()) {
       byte[] data = new byte[8];
-      PayloadHelper.encodeInt(nextToken.startOffset(), data, 0);
-      PayloadHelper.encodeInt(nextToken.endOffset(), data, 4);
+      PayloadHelper.encodeInt(offsetAtt.startOffset(), data, 0);
+      PayloadHelper.encodeInt(offsetAtt.endOffset(), data, 4);
       Payload payload = new Payload(data);
-      nextToken.setPayload(payload);
+      payAtt.setPayload(payload);
+      return true;
+    } else {
+      return false;
     }
-    return nextToken;
+  }
+
+  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+   * not be overridden. Delegates to the backwards compatibility layer. */
+  public final Token next(final Token reusableToken) throws java.io.IOException {
+    return super.next(reusableToken);
+  }
+
+  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+   * not be overridden. Delegates to the backwards compatibility layer. */
+  public final Token next() throws java.io.IOException {
+    return super.next();
   }
 }
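A quick sketch of reading back the 8-byte offset payload written above. It assumes PayloadHelper.decodeInt mirrors the encodeInt call used by the filter; verify that helper exists in your version before relying on it.

import org.apache.lucene.analysis.payloads.PayloadHelper;
import org.apache.lucene.index.Payload;

public class OffsetPayloadRoundTrip {
  public static void main(String[] args) {
    byte[] data = new byte[8];
    PayloadHelper.encodeInt(5, data, 0);    // start offset, as the filter writes it
    PayloadHelper.encodeInt(11, data, 4);   // end offset
    Payload payload = new Payload(data);
    int start = PayloadHelper.decodeInt(payload.getData(), 0);  // assumed decodeInt counterpart
    int end = PayloadHelper.decodeInt(payload.getData(), 4);
    System.out.println("start=" + start + " end=" + end);       // expected: start=5 end=11
  }
}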
@@ -20,6 +20,8 @@ package org.apache.lucene.analysis.payloads;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.index.Payload;
 
 import java.io.IOException;
@@ -32,19 +34,37 @@ import java.io.IOException;
  *
  **/
 public class TypeAsPayloadTokenFilter extends TokenFilter {
+  private PayloadAttribute payloadAtt;
+  private TypeAttribute typeAtt;
+
   public TypeAsPayloadTokenFilter(TokenStream input) {
     super(input);
+    payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
+    typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
   }
 
-
-  public Token next(final Token reusableToken) throws IOException {
-    assert reusableToken != null;
-    Token nextToken = input.next(reusableToken);
-    if (nextToken != null && nextToken.type() != null && nextToken.type().equals("") == false){
-      nextToken.setPayload(new Payload(nextToken.type().getBytes("UTF-8")));
+  public final boolean incrementToken() throws IOException {
+    if (input.incrementToken()) {
+      String type = typeAtt.type();
+      if (type != null && type.equals("") == false) {
+        payloadAtt.setPayload(new Payload(type.getBytes("UTF-8")));
+      }
+      return true;
+    } else {
+      return false;
     }
-    return nextToken;
+  }
+
+  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+   * not be overridden. Delegates to the backwards compatibility layer. */
+  public final Token next(final Token reusableToken) throws java.io.IOException {
+    return super.next(reusableToken);
+  }
+
+  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+   * not be overridden. Delegates to the backwards compatibility layer. */
+  public final Token next() throws java.io.IOException {
+    return super.next();
  }
 }
@@ -22,6 +22,7 @@ import java.io.IOException;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 
 /** Set the positionIncrement of all tokens to the "positionIncrement",
  * except the first return token which retains its original positionIncrement value.
@@ -34,6 +35,8 @@ public class PositionFilter extends TokenFilter {
 
   /** The first token must have non-zero positionIncrement **/
   private boolean firstTokenPositioned = false;
+
+  private PositionIncrementAttribute posIncrAtt;
 
   /**
    * Constructs a PositionFilter that assigns a position increment of zero to
@@ -43,6 +46,7 @@ public class PositionFilter extends TokenFilter {
    */
   public PositionFilter(final TokenStream input) {
     super(input);
+    posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
   }
 
   /**
@@ -58,18 +62,29 @@ public class PositionFilter extends TokenFilter {
     this.positionIncrement = positionIncrement;
   }
 
-  public Token next(Token reusableToken) throws IOException {
-    assert reusableToken != null;
-    reusableToken = input.next(reusableToken);
-    if (null != reusableToken) {
+  public final boolean incrementToken() throws IOException {
+    if (input.incrementToken()) {
       if (firstTokenPositioned) {
-        reusableToken.setPositionIncrement(positionIncrement);
+        posIncrAtt.setPositionIncrement(positionIncrement);
      } else {
        firstTokenPositioned = true;
      }
+      return true;
+    } else {
+      return false;
    }
-    return reusableToken;
+  }
+
+  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+   * not be overridden. Delegates to the backwards compatibility layer. */
+  public final Token next(final Token reusableToken) throws java.io.IOException {
+    return super.next(reusableToken);
+  }
+
+  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+   * not be overridden. Delegates to the backwards compatibility layer. */
+  public final Token next() throws java.io.IOException {
+    return super.next();
  }
 
   public void reset() throws IOException {
@@ -19,7 +19,7 @@ package org.apache.lucene.analysis.reverse;
 
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
 import java.io.IOException;
 
@@ -30,16 +30,20 @@ import java.io.IOException;
  */
 public final class ReverseStringFilter extends TokenFilter {
 
+  private TermAttribute termAtt;
+
   public ReverseStringFilter(TokenStream in) {
     super(in);
+    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
   }
 
-  public final Token next(Token in) throws IOException {
-    assert in != null;
-    Token token=input.next(in);
-    if( token == null ) return null;
-    reverse( token.termBuffer(), token.termLength() );
-    return token;
+  public boolean incrementToken() throws IOException {
+    if (input.incrementToken()) {
+      reverse( termAtt.termBuffer(), termAtt.termLength() );
+      return true;
+    } else {
+      return false;
+    }
   }
 
   public static String reverse( final String input ){
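The static reverse(String) helper visible in the context above is untouched by this change; a one-line sanity check of it:

import org.apache.lucene.analysis.reverse.ReverseStringFilter;

public class ReverseCheck {
  public static void main(String[] args) {
    System.out.println(ReverseStringFilter.reverse("lucene"));  // prints "enecul"
  }
}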
@@ -17,9 +17,12 @@ package org.apache.lucene.analysis.ru;
  * limitations under the License.
 */
 
+import java.io.IOException;
+
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
 /**
  * Normalizes token text to lower case, analyzing given ("russian") charset.
@@ -31,26 +34,27 @@ public final class RussianLowerCaseFilter extends TokenFilter
 {
     char[] charset;
 
+    private TermAttribute termAtt;
+
     public RussianLowerCaseFilter(TokenStream in, char[] charset)
     {
         super(in);
         this.charset = charset;
+        termAtt = (TermAttribute) addAttribute(TermAttribute.class);
     }
 
-    public final Token next(final Token reusableToken) throws java.io.IOException
+    public final boolean incrementToken() throws IOException
     {
-        assert reusableToken != null;
-        Token nextToken = input.next(reusableToken);
-
-        if (nextToken == null)
-            return null;
-
-        char[] chArray = nextToken.termBuffer();
-        int chLen = nextToken.termLength();
+      if (input.incrementToken()) {
+        char[] chArray = termAtt.termBuffer();
+        int chLen = termAtt.termLength();
         for (int i = 0; i < chLen; i++)
         {
             chArray[i] = RussianCharsets.toLowerCase(chArray[i], charset);
         }
-        return nextToken;
+        return true;
+      } else {
+        return false;
+      }
     }
 }
@@ -20,6 +20,8 @@ package org.apache.lucene.analysis.ru;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
 import java.io.IOException;
 
 /**
@@ -37,29 +39,32 @@ public final class RussianStemFilter extends TokenFilter
     */
     private RussianStemmer stemmer = null;
 
+    private TermAttribute termAtt;
+
     public RussianStemFilter(TokenStream in, char[] charset)
     {
         super(in);
         stemmer = new RussianStemmer(charset);
+        termAtt = (TermAttribute) addAttribute(TermAttribute.class);
     }
 
     /**
-     * @return  Returns the next token in the stream, or null at EOS
+     * Returns the next token in the stream, or null at EOS
     */
-    public final Token next(final Token reusableToken) throws IOException
+    public final boolean incrementToken() throws IOException
     {
-        assert reusableToken != null;
-        Token nextToken = input.next(reusableToken);
-        if (nextToken == null)
-            return null;
-
-        String term = nextToken.term();
+      if (input.incrementToken()) {
+        String term = termAtt.term();
         String s = stemmer.stem(term);
         if (s != null && !s.equals(term))
-            nextToken.setTermBuffer(s);
-        return nextToken;
+          termAtt.setTermBuffer(s);
+        return true;
+      } else {
+        return false;
+      }
     }
 
 
     /**
     * Set a alternative/custom RussianStemmer for this filter.
     */
@@ -22,6 +22,9 @@ import java.lang.Character.UnicodeBlock;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
 import java.text.BreakIterator;
 
 /**
@@ -32,46 +35,62 @@ import java.text.BreakIterator;
 public class ThaiWordFilter extends TokenFilter {
 
   private BreakIterator breaker = null;
-  private Token thaiToken = null;
+
+  private TermAttribute termAtt;
+  private OffsetAttribute offsetAtt;
+
+  private State thaiState = null;
 
   public ThaiWordFilter(TokenStream input) {
     super(input);
     breaker = BreakIterator.getWordInstance(new Locale("th"));
+    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+    offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
  }
 
-  public Token next(final Token reusableToken) throws IOException {
-    assert reusableToken != null;
-    if (thaiToken != null) {
+  public final boolean incrementToken() throws IOException {
+    if (thaiState != null) {
      int start = breaker.current();
      int end = breaker.next();
      if (end != BreakIterator.DONE) {
-        reusableToken.reinit(thaiToken, thaiToken.termBuffer(), start, end - start);
-        reusableToken.setStartOffset(thaiToken.startOffset()+start);
-        reusableToken.setEndOffset(thaiToken.startOffset()+end);
-        return reusableToken;
+        restoreState(thaiState);
+        termAtt.setTermBuffer(termAtt.termBuffer(), start, end - start);
+        offsetAtt.setOffset(offsetAtt.startOffset() + start, offsetAtt.startOffset() + end);
+        return true;
      }
-      thaiToken = null;
+      thaiState = null;
    }
 
-    Token nextToken = input.next(reusableToken);
-    if (nextToken == null || nextToken.termLength() == 0) {
-      return null;
-    }
+    if (input.incrementToken() == false || termAtt.termLength() == 0)
+      return false;
 
-    String text = nextToken.term();
+    String text = termAtt.term();
    if (UnicodeBlock.of(text.charAt(0)) != UnicodeBlock.THAI) {
-      nextToken.setTermBuffer(text.toLowerCase());
-      return nextToken;
+      termAtt.setTermBuffer(text.toLowerCase());
+      return true;
    }
 
-    thaiToken = (Token) nextToken.clone();
+    thaiState = captureState();
+
    breaker.setText(text);
    int end = breaker.next();
    if (end != BreakIterator.DONE) {
-      nextToken.setTermBuffer(text, 0, end);
-      nextToken.setEndOffset(nextToken.startOffset() + end);
-      return nextToken;
+      termAtt.setTermBuffer(text, 0, end);
+      offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset() + end);
+      return true;
    }
-    return null;
+    return false;
+  }
+
+  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+   * not be overridden. Delegates to the backwards compatibility layer. */
+  public final Token next(final Token reusableToken) throws java.io.IOException {
+    return super.next(reusableToken);
+  }
+
+  /** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
+   * not be overridden. Delegates to the backwards compatibility layer. */
+  public final Token next() throws java.io.IOException {
+    return super.next();
  }
 }
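ThaiWordFilter now snapshots the incoming token with captureState() instead of cloning a Token. The same buffering idea in a stripped-down, purely illustrative filter (not part of the patch) that emits every token twice; a real filter would also adjust position increments, omitted here for brevity.

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.AttributeSource;

public final class RepeatDemoFilter extends TokenFilter {
  private AttributeSource.State pending;   // snapshot of the token still owed to the consumer

  public RepeatDemoFilter(TokenStream input) {
    super(input);
  }

  public boolean incrementToken() throws IOException {
    if (pending != null) {
      restoreState(pending);               // replay the saved attribute values
      pending = null;
      return true;
    }
    if (!input.incrementToken()) {
      return false;
    }
    pending = captureState();              // remember this token for the next call
    return true;
  }

  public void reset() throws IOException {
    super.reset();
    pending = null;
  }
}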
@@ -17,18 +17,12 @@ package org.apache.lucene.analysis.ar;
  * limitations under the License.
  */
 
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileInputStream;
 import java.io.IOException;
-import java.io.InputStreamReader;
 import java.io.StringReader;
 
 import junit.framework.TestCase;
 
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.WhitespaceTokenizer;
-import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
 /**
  * Test the Arabic Normalization Filter
@@ -95,11 +89,10 @@ public class TestArabicNormalizationFilter extends TestCase {
   private void check(final String input, final String expected) throws IOException {
     ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(new StringReader(input));
     ArabicNormalizationFilter filter = new ArabicNormalizationFilter(tokenStream);
-    final Token reusableToken = new Token();
-    Token nextToken = filter.next(reusableToken);
-    if (nextToken == null)
-      fail();
-    assertEquals(expected, nextToken.term());
+    TermAttribute termAtt = (TermAttribute) filter.getAttribute(TermAttribute.class);
+
+    assertTrue(filter.incrementToken());
+    assertEquals(expected, termAtt.term());
     filter.close();
  }
@@ -17,17 +17,12 @@ package org.apache.lucene.analysis.ar;
  * limitations under the License.
  */
 
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileInputStream;
 import java.io.IOException;
-import java.io.InputStreamReader;
 import java.io.StringReader;
 
 import junit.framework.TestCase;
 
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
 /**
  * Test the Arabic Normalization Filter
@@ -118,11 +113,10 @@ public class TestArabicStemFilter extends TestCase {
   private void check(final String input, final String expected) throws IOException {
     ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(new StringReader(input));
     ArabicStemFilter filter = new ArabicStemFilter(tokenStream);
-    final Token reusableToken = new Token();
-    Token nextToken = filter.next(reusableToken);
-    if (nextToken == null)
-      fail();
-    assertEquals(expected, nextToken.term());
+    TermAttribute termAtt = (TermAttribute) filter.getAttribute(TermAttribute.class);
+
+    assertTrue(filter.incrementToken());
+    assertEquals(expected, termAtt.term());
     filter.close();
  }
@@ -23,8 +23,8 @@ import java.io.StringReader;
 import junit.framework.TestCase;
 
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
 /**
  * Test the Brazilian Stem Filter, which only modifies the term text.
@@ -122,12 +122,10 @@ public class TestBrazilianStemmer extends TestCase {
   private void check(final String input, final String expected) throws IOException {
     Analyzer analyzer = new BrazilianAnalyzer();
     TokenStream stream = analyzer.tokenStream("dummy", new StringReader(input));
-    final Token reusableToken = new Token();
-    Token nextToken = stream.next(reusableToken);
-    if (nextToken == null)
-      fail();
-    assertEquals(expected, nextToken.term());
-    assertTrue(stream.next(nextToken) == null);
+    TermAttribute text = (TermAttribute) stream.getAttribute(TermAttribute.class);
+    assertTrue(stream.incrementToken());
+    assertEquals(expected, text.term());
+    assertFalse(stream.incrementToken());
     stream.close();
  }
@@ -21,50 +21,49 @@ import java.io.IOException;
 import java.io.StringReader;
 
 import junit.framework.TestCase;
-import org.apache.lucene.analysis.Token;
+
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 
 
 public class TestCJKTokenizer extends TestCase{
 
+  class TestToken {
+    String termText;
+    int start;
+    int end;
+    String type;
+  }
 
-  public Token newToken(String termText, int start, int end, int type) {
-    Token token = new Token(start, end);
-    token.setTermBuffer(termText);
-    token.setType(CJKTokenizer.TOKEN_TYPE_NAMES[type]);
+  public TestToken newToken(String termText, int start, int end, int type) {
+    TestToken token = new TestToken();
+    token.termText = termText;
+    token.type = CJKTokenizer.TOKEN_TYPE_NAMES[type];
+    token.start = start;
+    token.end = end;
     return token;
  }
 
-  public void checkCJKToken(final String str, final Token[] out_tokens) throws IOException {
+  public void checkCJKToken(final String str, final TestToken[] out_tokens) throws IOException {
     CJKTokenizer tokenizer = new CJKTokenizer(new StringReader(str));
-    int i = 0;
-    System.out.println("string[" + str + "]");
-    System.out.print("tokens[");
-    final Token reusableToken = new Token();
-    for (Token token = tokenizer.next(reusableToken) ;
-         token != null ;
-         token = tokenizer.next(reusableToken) ) {
-      if (token.term().equals(out_tokens[i].term())
-          && token.startOffset() == out_tokens[i].startOffset()
-          && token.endOffset() == out_tokens[i].endOffset()
-          && token.type().equals(out_tokens[i].type()) ) {
-        System.out.print( token.term() + " ");
-      }
-      else {
-        fail(token.term() + " (start: " + token.startOffset()
-             + " end: " + token.endOffset() + " type: " + token.type() + ") != "
-             + out_tokens[i].term() + " (start: " + out_tokens[i].startOffset()
-             + " end: " + out_tokens[i].endOffset()
-             + " type: " + out_tokens[i].type() + ")");
-        break;
-      }
-      ++i;
+    TermAttribute termAtt = (TermAttribute) tokenizer.getAttribute(TermAttribute.class);
+    OffsetAttribute offsetAtt = (OffsetAttribute) tokenizer.getAttribute(OffsetAttribute.class);
+    TypeAttribute typeAtt = (TypeAttribute) tokenizer.getAttribute(TypeAttribute.class);
+    for (int i = 0; i < out_tokens.length; i++) {
+      assertTrue(tokenizer.incrementToken());
+      assertEquals(termAtt.term(), out_tokens[i].termText);
+      assertEquals(offsetAtt.startOffset(), out_tokens[i].start);
+      assertEquals(offsetAtt.endOffset(), out_tokens[i].end);
+      assertEquals(typeAtt.type(), out_tokens[i].type);
    }
-    System.out.println("]" + System.getProperty("line.separator"));
+    assertFalse(tokenizer.incrementToken());
  }
 
  public void testJa1() throws IOException {
    String str = "\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341";
 
-    Token[] out_tokens = {
+    TestToken[] out_tokens = {
      newToken("\u4e00\u4e8c", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u4e8c\u4e09", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u4e09\u56db", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
@@ -81,7 +80,7 @@ public class TestCJKTokenizer extends TestCase{
  public void testJa2() throws IOException {
    String str = "\u4e00 \u4e8c\u4e09\u56db \u4e94\u516d\u4e03\u516b\u4e5d \u5341";
 
-    Token[] out_tokens = {
+    TestToken[] out_tokens = {
      newToken("\u4e00", 0, 1, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u4e8c\u4e09", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u4e09\u56db", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
@@ -97,7 +96,7 @@ public class TestCJKTokenizer extends TestCase{
  public void testC() throws IOException {
    String str = "abc defgh ijklmn opqrstu vwxy z";
 
-    Token[] out_tokens = {
+    TestToken[] out_tokens = {
      newToken("abc", 0, 3, CJKTokenizer.SINGLE_TOKEN_TYPE),
      newToken("defgh", 4, 9, CJKTokenizer.SINGLE_TOKEN_TYPE),
      newToken("ijklmn", 10, 16, CJKTokenizer.SINGLE_TOKEN_TYPE),
@@ -111,7 +110,7 @@ public class TestCJKTokenizer extends TestCase{
  public void testMix() throws IOException {
    String str = "\u3042\u3044\u3046\u3048\u304aabc\u304b\u304d\u304f\u3051\u3053";
 
-    Token[] out_tokens = {
+    TestToken[] out_tokens = {
      newToken("\u3042\u3044", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u3044\u3046", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u3046\u3048", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
@@ -128,7 +127,7 @@ public class TestCJKTokenizer extends TestCase{
  public void testMix2() throws IOException {
    String str = "\u3042\u3044\u3046\u3048\u304aab\u3093c\u304b\u304d\u304f\u3051 \u3053";
 
-    Token[] out_tokens = {
+    TestToken[] out_tokens = {
      newToken("\u3042\u3044", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u3044\u3046", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
      newToken("\u3046\u3048", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
@@ -147,7 +146,7 @@ public class TestCJKTokenizer extends TestCase{
  public void testSingleChar() throws IOException {
    String str = "\u4e00";
 
-    Token[] out_tokens = {
+    TestToken[] out_tokens = {
      newToken("\u4e00", 0, 1, CJKTokenizer.DOUBLE_TOKEN_TYPE),
    };
    checkCJKToken(str, out_tokens);
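The converted tests all follow the same shape: fetch the attributes once, assert incrementToken() per expected token, then assert exhaustion. A small reusable helper in that style (illustrative only; not an existing Lucene test utility):

import java.io.IOException;

import junit.framework.Assert;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class TokenStreamAsserts {
  /** Asserts that the stream yields exactly the expected terms and nothing more. */
  public static void assertTerms(TokenStream ts, String[] expected) throws IOException {
    TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
    for (int i = 0; i < expected.length; i++) {
      Assert.assertTrue("token " + i + " missing", ts.incrementToken());
      Assert.assertEquals(expected[i], termAtt.term());
    }
    Assert.assertFalse("unexpected extra token", ts.incrementToken());
    ts.close();
  }
}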
@@ -22,7 +22,7 @@ import java.io.StringReader;
 
 import junit.framework.TestCase;
 
-import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 
 
 public class TestChineseTokenizer extends TestCase
@@ -34,12 +34,12 @@ public class TestChineseTokenizer extends TestCase
 
        int correctStartOffset = 0;
        int correctEndOffset = 1;
-        final Token reusableToken = new Token();
-        for (Token nextToken = tokenizer.next(reusableToken); nextToken != null; nextToken = tokenizer.next(reusableToken)) {
-            assertEquals(correctStartOffset, nextToken.startOffset());
-            assertEquals(correctEndOffset, nextToken.endOffset());
+        OffsetAttribute offsetAtt = (OffsetAttribute) tokenizer.getAttribute(OffsetAttribute.class);
+        while (tokenizer.incrementToken()) {
+            assertEquals(correctStartOffset, offsetAtt.startOffset());
+            assertEquals(correctEndOffset, offsetAtt.endOffset());
            correctStartOffset++;
            correctEndOffset++;
        }
    }
 }
@@ -31,15 +31,14 @@ import java.util.List;
 import java.util.zip.ZipEntry;
 import java.util.zip.ZipInputStream;
 
-import org.apache.lucene.analysis.Token;
+import junit.framework.TestCase;
+
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
-import org.apache.lucene.analysis.compound.CompoundWordTokenFilterBase;
-import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter;
-import org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter;
 import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
-
-import junit.framework.TestCase;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
 public class TestCompoundWordTokenFilter extends TestCase {
   private static String[] locations = {
@@ -155,16 +154,18 @@ public class TestCompoundWordTokenFilter extends TestCase {
 
   private void assertFiltersTo(TokenFilter tf, String[] s, int[] startOffset,
      int[] endOffset, int[] posIncr) throws Exception {
-    final Token reusableToken = new Token();
+    TermAttribute termAtt = (TermAttribute) tf.getAttribute(TermAttribute.class);
+    OffsetAttribute offsetAtt = (OffsetAttribute) tf.getAttribute(OffsetAttribute.class);
+    PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) tf.getAttribute(PositionIncrementAttribute.class);
+
     for (int i = 0; i < s.length; ++i) {
-      Token nextToken = tf.next(reusableToken);
-      assertNotNull(nextToken);
-      assertEquals(s[i], nextToken.term());
-      assertEquals(startOffset[i], nextToken.startOffset());
-      assertEquals(endOffset[i], nextToken.endOffset());
-      assertEquals(posIncr[i], nextToken.getPositionIncrement());
+      assertTrue(tf.incrementToken());
+      assertEquals(s[i], termAtt.term());
+      assertEquals(startOffset[i], offsetAtt.startOffset());
+      assertEquals(endOffset[i], offsetAtt.endOffset());
+      assertEquals(posIncr[i], posIncAtt.getPositionIncrement());
    }
-    assertNull(tf.next(reusableToken));
+    assertFalse(tf.incrementToken());
  }
 
  private void getHyphenationPatternFileContents() {
@@ -22,8 +22,8 @@ import java.io.StringReader;
 import junit.framework.TestCase;
 
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
 /**
  * Test the CzechAnalyzer
@@ -39,13 +39,12 @@ public class TestCzechAnalyzer extends TestCase {
 
   private void assertAnalyzesTo(Analyzer a, String input, String[] output) throws Exception {
     TokenStream ts = a.tokenStream("dummy", new StringReader(input));
-    final Token reusableToken = new Token();
+    TermAttribute text = (TermAttribute) ts.getAttribute(TermAttribute.class);
     for (int i=0; i<output.length; i++) {
-      Token nextToken = ts.next(reusableToken);
-      assertNotNull(nextToken);
-      assertEquals(nextToken.term(), output[i]);
+      assertTrue(ts.incrementToken());
+      assertEquals(text.term(), output[i]);
    }
-    assertNull(ts.next(reusableToken));
+    assertFalse(ts.incrementToken());
    ts.close();
  }
 }
@@ -26,8 +26,8 @@ import java.io.StringReader;
 
 import junit.framework.TestCase;
 
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
 /**
  * Test the German stemmer. The stemming algorithm is known to work less
@@ -68,11 +68,9 @@ public class TestGermanStemFilter extends TestCase {
   private void check(final String input, final String expected) throws IOException {
     StandardTokenizer tokenStream = new StandardTokenizer(new StringReader(input));
     GermanStemFilter filter = new GermanStemFilter(tokenStream);
-    final Token reusableToken = new Token();
-    Token nextToken = filter.next(reusableToken);
-    if (nextToken == null)
-      fail();
-    assertEquals(expected, nextToken.term());
+    TermAttribute termAtt = (TermAttribute) filter.getAttribute(TermAttribute.class);
+    assertTrue(filter.incrementToken());
+    assertEquals(expected, termAtt.term());
     filter.close();
  }
@@ -18,12 +18,12 @@ package org.apache.lucene.analysis.el;
 
 import java.io.StringReader;
 
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.TokenStream;
 
 import junit.framework.TestCase;
 
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
 /**
  * A unit test class for verifying the correct operation of the GreekAnalyzer.
@@ -41,13 +41,12 @@ public class GreekAnalyzerTest extends TestCase {
    */
   private void assertAnalyzesTo(Analyzer a, String input, String[] output) throws Exception {
     TokenStream ts = a.tokenStream("dummy", new StringReader(input));
-    final Token reusableToken = new Token();
+    TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
     for (int i=0; i<output.length; i++) {
-      Token nextToken = ts.next(reusableToken);
-      assertNotNull(nextToken);
-      assertEquals(nextToken.term(), output[i]);
+      assertTrue(ts.incrementToken());
+      assertEquals(termAtt.term(), output[i]);
     }
-    assertNull(ts.next(reusableToken));
+    assertFalse(ts.incrementToken());
     ts.close();
   }
 

@@ -26,10 +26,10 @@ import java.util.Set;
 
 import junit.framework.TestCase;
 
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
 /**
  *
@@ -52,9 +52,9 @@ public class TestElision extends TestCase {
   private List filtre(TokenFilter filter) {
     List tas = new ArrayList();
     try {
-      final Token reusableToken = new Token();
-      for (Token nextToken = filter.next(reusableToken); nextToken != null; nextToken = filter.next(reusableToken)) {
-        tas.add(nextToken.term());
+      TermAttribute termAtt = (TermAttribute) filter.getAttribute(TermAttribute.class);
+      while (filter.incrementToken()) {
+        tas.add(termAtt.term());
       }
     } catch (IOException e) {
       e.printStackTrace();

@@ -59,8 +59,8 @@ import java.io.StringReader;
 import junit.framework.TestCase;
 
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
 /**
  * Test case for FrenchAnalyzer.
@@ -76,13 +76,12 @@ public class TestFrenchAnalyzer extends TestCase {
 
     TokenStream ts = a.tokenStream("dummy", new StringReader(input));
 
-    final Token reusableToken = new Token();
+    TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
     for (int i = 0; i < output.length; i++) {
-      Token nextToken = ts.next(reusableToken);
-      assertNotNull(nextToken);
-      assertEquals(nextToken.term(), output[i]);
+      assertTrue(ts.incrementToken());
+      assertEquals(termAtt.term(), output[i]);
     }
-    assertNull(ts.next(reusableToken));
+    assertFalse(ts.incrementToken());
     ts.close();
   }
 

@@ -17,20 +17,19 @@ package org.apache.lucene.analysis.miscellaneous;
  * limitations under the License.
  */
 
-import junit.framework.TestCase;
-
 import java.io.IOException;
 
+import junit.framework.TestCase;
+
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Token;
 
 public class TestEmptyTokenStream extends TestCase {
 
   public void test() throws IOException {
     TokenStream ts = new EmptyTokenStream();
-    assertNull(ts.next());
+    assertFalse(ts.incrementToken());
     ts.reset();
-    assertNull(ts.next(new Token()));
+    assertFalse(ts.incrementToken());
   }
 
 }

@@ -17,9 +17,9 @@ package org.apache.lucene.analysis.ngram;
  * limitations under the License.
  */
 
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
 import java.io.StringReader;
 
@@ -67,58 +67,57 @@ public class EdgeNGramTokenFilterTest extends TestCase {
 
   public void testFrontUnigram() throws Exception {
     EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT, 1, 1);
-    final Token reusableToken = new Token();
-    Token nextToken = tokenizer.next(reusableToken);
-    assertEquals("(a,0,1)", nextToken.toString());
-    assertNull(tokenizer.next(reusableToken));
+    TermAttribute termAtt = (TermAttribute) tokenizer.addAttribute(TermAttribute.class);
+    assertTrue(tokenizer.incrementToken());
+    assertEquals("(a,0,1)", termAtt.toString());
+    assertFalse(tokenizer.incrementToken());
   }
 
   public void testBackUnigram() throws Exception {
     EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.BACK, 1, 1);
-    final Token reusableToken = new Token();
-    Token nextToken = tokenizer.next(reusableToken);
-    assertEquals("(e,4,5)", nextToken.toString());
-    assertNull(tokenizer.next(reusableToken));
+    TermAttribute termAtt = (TermAttribute) tokenizer.addAttribute(TermAttribute.class);
+    assertTrue(tokenizer.incrementToken());
+    assertEquals("(e,4,5)", termAtt.toString());
+    assertFalse(tokenizer.incrementToken());
   }
 
   public void testOversizedNgrams() throws Exception {
     EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT, 6, 6);
-    assertNull(tokenizer.next(new Token()));
+    assertFalse(tokenizer.incrementToken());
   }
 
   public void testFrontRangeOfNgrams() throws Exception {
     EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT, 1, 3);
-    final Token reusableToken = new Token();
-    Token nextToken = tokenizer.next(reusableToken);
-    assertEquals("(a,0,1)", nextToken.toString());
-    nextToken = tokenizer.next(reusableToken);
-    assertEquals("(ab,0,2)", nextToken.toString());
-    nextToken = tokenizer.next(reusableToken);
-    assertEquals("(abc,0,3)", nextToken.toString());
-    assertNull(tokenizer.next(reusableToken));
+    TermAttribute termAtt = (TermAttribute) tokenizer.addAttribute(TermAttribute.class);
+    assertTrue(tokenizer.incrementToken());
+    assertEquals("(a,0,1)", termAtt.toString());
+    assertTrue(tokenizer.incrementToken());
+    assertEquals("(ab,0,2)", termAtt.toString());
+    assertTrue(tokenizer.incrementToken());
+    assertEquals("(abc,0,3)", termAtt.toString());
+    assertFalse(tokenizer.incrementToken());
   }
 
   public void testBackRangeOfNgrams() throws Exception {
     EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.BACK, 1, 3);
-    final Token reusableToken = new Token();
-    Token nextToken = tokenizer.next(reusableToken);
-    assertEquals("(e,4,5)", nextToken.toString());
-    nextToken = tokenizer.next(reusableToken);
-    assertEquals("(de,3,5)", nextToken.toString());
-    nextToken = tokenizer.next(reusableToken);
-    assertEquals("(cde,2,5)", nextToken.toString());
-    assertNull(tokenizer.next(reusableToken));
+    TermAttribute termAtt = (TermAttribute) tokenizer.addAttribute(TermAttribute.class);
+    assertTrue(tokenizer.incrementToken());
+    assertEquals("(e,4,5)", termAtt.toString());
+    assertTrue(tokenizer.incrementToken());
+    assertEquals("(de,3,5)", termAtt.toString());
+    assertTrue(tokenizer.incrementToken());
+    assertEquals("(cde,2,5)", termAtt.toString());
+    assertFalse(tokenizer.incrementToken());
   }
 
   public void testSmallTokenInStream() throws Exception {
     input = new WhitespaceTokenizer(new StringReader("abc de fgh"));
     EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT, 3, 3);
-    final Token reusableToken = new Token();
-    Token nextToken = tokenizer.next(reusableToken);
-    assertEquals("(abc,0,3)", nextToken.toString());
-    nextToken = tokenizer.next(reusableToken);
-    assertNotNull(nextToken);
-    assertEquals("(fgh,0,3)", nextToken.toString());
-    assertNull(tokenizer.next(reusableToken));
+    TermAttribute termAtt = (TermAttribute) tokenizer.addAttribute(TermAttribute.class);
+    assertTrue(tokenizer.incrementToken());
+    assertEquals("(abc,0,3)", termAtt.toString());
+    assertTrue(tokenizer.incrementToken());
+    assertEquals("(fgh,0,3)", termAtt.toString());
+    assertFalse(tokenizer.incrementToken());
   }
 }

@@ -17,10 +17,11 @@ package org.apache.lucene.analysis.ngram;
  * limitations under the License.
  */
 
-import org.apache.lucene.analysis.Token;
 
 import java.io.StringReader;
 
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
 import junit.framework.TestCase;
 
 /**
@@ -65,46 +66,46 @@ public class EdgeNGramTokenizerTest extends TestCase {
 
   public void testFrontUnigram() throws Exception {
     EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 1, 1);
-    final Token reusableToken = new Token();
-    Token nextToken = tokenizer.next(reusableToken);
-    assertEquals("(a,0,1)", nextToken.toString());
-    assertNull(tokenizer.next(reusableToken));
+    TermAttribute termAtt = (TermAttribute) tokenizer.addAttribute(TermAttribute.class);
+    assertTrue(tokenizer.incrementToken());
+    assertEquals("(a,0,1)", termAtt.toString());
+    assertFalse(tokenizer.incrementToken());
   }
 
   public void testBackUnigram() throws Exception {
     EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.BACK, 1, 1);
-    final Token reusableToken = new Token();
-    Token nextToken = tokenizer.next(reusableToken);
-    assertEquals("(e,4,5)", nextToken.toString());
-    assertNull(tokenizer.next(reusableToken));
+    TermAttribute termAtt = (TermAttribute) tokenizer.addAttribute(TermAttribute.class);
+    assertTrue(tokenizer.incrementToken());
+    assertEquals("(e,4,5)", termAtt.toString());
+    assertFalse(tokenizer.incrementToken());
  }
 
   public void testOversizedNgrams() throws Exception {
     EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 6, 6);
-    assertNull(tokenizer.next(new Token()));
+    assertFalse(tokenizer.incrementToken());
   }
 
   public void testFrontRangeOfNgrams() throws Exception {
     EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 1, 3);
-    final Token reusableToken = new Token();
-    Token nextToken = tokenizer.next(reusableToken);
-    assertEquals("(a,0,1)", nextToken.toString());
-    nextToken = tokenizer.next(reusableToken);
-    assertEquals("(ab,0,2)", nextToken.toString());
-    nextToken = tokenizer.next(reusableToken);
-    assertEquals("(abc,0,3)", nextToken.toString());
-    assertNull(tokenizer.next(reusableToken));
+    TermAttribute termAtt = (TermAttribute) tokenizer.addAttribute(TermAttribute.class);
+    assertTrue(tokenizer.incrementToken());
+    assertEquals("(a,0,1)", termAtt.toString());
+    assertTrue(tokenizer.incrementToken());
+    assertEquals("(ab,0,2)", termAtt.toString());
+    assertTrue(tokenizer.incrementToken());
+    assertEquals("(abc,0,3)", termAtt.toString());
+    assertFalse(tokenizer.incrementToken());
   }
 
   public void testBackRangeOfNgrams() throws Exception {
     EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.BACK, 1, 3);
-    final Token reusableToken = new Token();
-    Token nextToken = tokenizer.next(reusableToken);
-    assertEquals("(e,4,5)", nextToken.toString());
-    nextToken = tokenizer.next(reusableToken);
-    assertEquals("(de,3,5)", nextToken.toString());
-    nextToken = tokenizer.next(reusableToken);
-    assertEquals("(cde,2,5)", nextToken.toString());
-    assertNull(tokenizer.next(reusableToken));
+    TermAttribute termAtt = (TermAttribute) tokenizer.addAttribute(TermAttribute.class);
+    assertTrue(tokenizer.incrementToken());
+    assertEquals("(e,4,5)", termAtt.toString());
+    assertTrue(tokenizer.incrementToken());
+    assertEquals("(de,3,5)", termAtt.toString());
+    assertTrue(tokenizer.incrementToken());
+    assertEquals("(cde,2,5)", termAtt.toString());
+    assertFalse(tokenizer.incrementToken());
   }
 }

@@ -17,12 +17,12 @@ package org.apache.lucene.analysis.ngram;
  * limitations under the License.
  */
 
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
+import java.io.IOException;
 import java.io.StringReader;
-import java.util.ArrayList;
 
 import junit.framework.TestCase;
 
@@ -31,7 +31,6 @@ import junit.framework.TestCase;
  */
 public class NGramTokenFilterTest extends TestCase {
   private TokenStream input;
-  private ArrayList tokens = new ArrayList();
 
   public void setUp() {
     input = new WhitespaceTokenizer(new StringReader("abcde"));
@@ -57,79 +56,56 @@ public class NGramTokenFilterTest extends TestCase {
     assertTrue(gotException);
   }
 
+  private void checkStream(TokenStream stream, String[] exp) throws IOException {
+    TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
+    for (int i = 0; i < exp.length; i++) {
+      assertTrue(stream.incrementToken());
+      assertEquals(exp[i], termAtt.toString());
+    }
+    assertFalse(stream.incrementToken());
+  }
+
   public void testUnigrams() throws Exception {
     NGramTokenFilter filter = new NGramTokenFilter(input, 1, 1);
-    final Token reusableToken = new Token();
-    for (Token nextToken = filter.next(reusableToken); nextToken != null; nextToken = filter.next(reusableToken)) {
-      tokens.add(nextToken.toString());
-      // System.out.println(token.term());
-      // System.out.println(token);
-      // Thread.sleep(1000);
-    }
-
-    assertEquals(5, tokens.size());
-    ArrayList exp = new ArrayList();
-    exp.add("(a,0,1)"); exp.add("(b,1,2)"); exp.add("(c,2,3)"); exp.add("(d,3,4)"); exp.add("(e,4,5)");
-    assertEquals(exp, tokens);
+    String[] exp = new String[] {
+      "(a,0,1)", "(b,1,2)", "(c,2,3)", "(d,3,4)", "(e,4,5)"
+    };
+
+    checkStream(filter, exp);
   }
 
   public void testBigrams() throws Exception {
     NGramTokenFilter filter = new NGramTokenFilter(input, 2, 2);
-    final Token reusableToken = new Token();
-    for (Token nextToken = filter.next(reusableToken); nextToken != null; nextToken = filter.next(reusableToken)) {
-      tokens.add(nextToken.toString());
-      // System.out.println(token.term());
-      // System.out.println(token);
-      // Thread.sleep(1000);
-    }
-
-    assertEquals(4, tokens.size());
-    ArrayList exp = new ArrayList();
-    exp.add("(ab,0,2)"); exp.add("(bc,1,3)"); exp.add("(cd,2,4)"); exp.add("(de,3,5)");
-    assertEquals(exp, tokens);
+    String[] exp = new String[] {
+      "(ab,0,2)", "(bc,1,3)", "(cd,2,4)", "(de,3,5)"
+    };
+
+    checkStream(filter, exp);
   }
 
   public void testNgrams() throws Exception {
     NGramTokenFilter filter = new NGramTokenFilter(input, 1, 3);
-    final Token reusableToken = new Token();
-    for (Token nextToken = filter.next(reusableToken); nextToken != null; nextToken = filter.next(reusableToken)) {
-      tokens.add(nextToken.toString());
-      // System.out.println(token.term());
-      // System.out.println(token);
-      // Thread.sleep(1000);
-    }
-
-    assertEquals(12, tokens.size());
-    ArrayList exp = new ArrayList();
-    exp.add("(a,0,1)"); exp.add("(b,1,2)"); exp.add("(c,2,3)"); exp.add("(d,3,4)"); exp.add("(e,4,5)");
-    exp.add("(ab,0,2)"); exp.add("(bc,1,3)"); exp.add("(cd,2,4)"); exp.add("(de,3,5)");
-    exp.add("(abc,0,3)"); exp.add("(bcd,1,4)"); exp.add("(cde,2,5)");
-    assertEquals(exp, tokens);
+    String[] exp = new String[] {
+      "(a,0,1)", "(b,1,2)", "(c,2,3)", "(d,3,4)", "(e,4,5)",
+      "(ab,0,2)", "(bc,1,3)", "(cd,2,4)", "(de,3,5)",
+      "(abc,0,3)", "(bcd,1,4)", "(cde,2,5)"
+    };
+
+    checkStream(filter, exp);
   }
 
   public void testOversizedNgrams() throws Exception {
     NGramTokenFilter filter = new NGramTokenFilter(input, 6, 7);
-    final Token reusableToken = new Token();
-    for (Token nextToken = filter.next(reusableToken); nextToken != null; nextToken = filter.next(reusableToken)) {
-      tokens.add(nextToken.toString());
-      // System.out.println(token.term());
-      // System.out.println(token);
-      // Thread.sleep(1000);
-    }
-
-    assertTrue(tokens.isEmpty());
+    assertFalse(filter.incrementToken());
   }
 
   public void testSmallTokenInStream() throws Exception {
     input = new WhitespaceTokenizer(new StringReader("abc de fgh"));
     NGramTokenFilter filter = new NGramTokenFilter(input, 3, 3);
-    final Token reusableToken = new Token();
-    Token nextToken = filter.next(reusableToken);
-    assertEquals("(abc,0,3)", nextToken.toString());
-    nextToken = filter.next(reusableToken);
-    assertNotNull(nextToken);
-    assertEquals("(fgh,0,3)", nextToken.toString());
-    assertNull(filter.next(reusableToken));
+    String[] exp = new String[] {
+      "(abc,0,3)", "(fgh,0,3)"
+    };
+
+    checkStream(filter, exp);
   }
 }

@@ -17,10 +17,12 @@ package org.apache.lucene.analysis.ngram;
  * limitations under the License.
  */
 
-import org.apache.lucene.analysis.Token;
 
+import java.io.IOException;
 import java.io.StringReader;
-import java.util.ArrayList;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
 import junit.framework.TestCase;
 
@@ -29,7 +31,6 @@ import junit.framework.TestCase;
  */
 public class NGramTokenizerTest extends TestCase {
   private StringReader input;
-  private ArrayList tokens = new ArrayList();
 
   public void setUp() {
     input = new StringReader("abcde");
@@ -54,69 +55,48 @@ public class NGramTokenizerTest extends TestCase {
     }
     assertTrue(gotException);
   }
 
+  private void checkStream(TokenStream stream, String[] exp) throws IOException {
+    TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
+    for (int i = 0; i < exp.length; i++) {
+      assertTrue(stream.incrementToken());
+      assertEquals(exp[i], termAtt.toString());
+    }
+    assertFalse(stream.incrementToken());
+  }
+
   public void testUnigrams() throws Exception {
     NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 1);
 
-    final Token reusableToken = new Token();
-    for (Token nextToken = tokenizer.next(reusableToken); nextToken != null; nextToken = tokenizer.next(reusableToken)) {
-      tokens.add(nextToken.toString());
-      // System.out.println(token.term());
-      // System.out.println(token);
-      // Thread.sleep(1000);
-    }
-
-    assertEquals(5, tokens.size());
-    ArrayList exp = new ArrayList();
-    exp.add("(a,0,1)"); exp.add("(b,1,2)"); exp.add("(c,2,3)"); exp.add("(d,3,4)"); exp.add("(e,4,5)");
-    assertEquals(exp, tokens);
+    String[] exp = new String[] {
+      "(a,0,1)", "(b,1,2)", "(c,2,3)", "(d,3,4)", "(e,4,5)"
+    };
+
+    checkStream(tokenizer, exp);
   }
 
   public void testBigrams() throws Exception {
     NGramTokenizer tokenizer = new NGramTokenizer(input, 2, 2);
-    final Token reusableToken = new Token();
-    for (Token nextToken = tokenizer.next(reusableToken); nextToken != null; nextToken = tokenizer.next(reusableToken)) {
-      tokens.add(nextToken.toString());
-      // System.out.println(token.term());
-      // System.out.println(token);
-      // Thread.sleep(1000);
-    }
-
-    assertEquals(4, tokens.size());
-    ArrayList exp = new ArrayList();
-    exp.add("(ab,0,2)"); exp.add("(bc,1,3)"); exp.add("(cd,2,4)"); exp.add("(de,3,5)");
-    assertEquals(exp, tokens);
+    String[] exp = new String[] {
+      "(ab,0,2)", "(bc,1,3)", "(cd,2,4)", "(de,3,5)"
+    };
+
+    checkStream(tokenizer, exp);
   }
 
   public void testNgrams() throws Exception {
     NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 3);
-    final Token reusableToken = new Token();
-    for (Token nextToken = tokenizer.next(reusableToken); nextToken != null; nextToken = tokenizer.next(reusableToken)) {
-      tokens.add(nextToken.toString());
-      // System.out.println(token.term());
-      // System.out.println(token);
-      // Thread.sleep(1000);
-    }
-
-    assertEquals(12, tokens.size());
-    ArrayList exp = new ArrayList();
-    exp.add("(a,0,1)"); exp.add("(b,1,2)"); exp.add("(c,2,3)"); exp.add("(d,3,4)"); exp.add("(e,4,5)");
-    exp.add("(ab,0,2)"); exp.add("(bc,1,3)"); exp.add("(cd,2,4)"); exp.add("(de,3,5)");
-    exp.add("(abc,0,3)"); exp.add("(bcd,1,4)"); exp.add("(cde,2,5)");
-    assertEquals(exp, tokens);
+    String[] exp = new String[] {
+      "(a,0,1)", "(b,1,2)", "(c,2,3)", "(d,3,4)", "(e,4,5)",
+      "(ab,0,2)", "(bc,1,3)", "(cd,2,4)", "(de,3,5)",
+      "(abc,0,3)", "(bcd,1,4)", "(cde,2,5)"
+    };
+
+    checkStream(tokenizer, exp);
   }
 
   public void testOversizedNgrams() throws Exception {
     NGramTokenizer tokenizer = new NGramTokenizer(input, 6, 7);
+    assertFalse(tokenizer.incrementToken());
-
-    final Token reusableToken = new Token();
-    for (Token nextToken = tokenizer.next(reusableToken); nextToken != null; nextToken = tokenizer.next(reusableToken)) {
-      tokens.add(nextToken.toString());
-      // System.out.println(token.term());
-      // System.out.println(token);
-      // Thread.sleep(1000);
-    }
-
-    assertTrue(tokens.isEmpty());
   }
 }

@@ -23,8 +23,8 @@ import java.io.StringReader;
 import junit.framework.TestCase;
 
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
 /**
  * Test the Dutch Stem Filter, which only modifies the term text.
@@ -121,12 +121,10 @@ public class TestDutchStemmer extends TestCase {
   private void check(final String input, final String expected) throws IOException {
     Analyzer analyzer = new DutchAnalyzer();
     TokenStream stream = analyzer.tokenStream("dummy", new StringReader(input));
-    final Token reusableToken = new Token();
-    Token nextToken = stream.next(reusableToken);
-    if (nextToken == null)
-      fail();
-    assertEquals(expected, nextToken.term());
-    assertTrue(stream.next(nextToken) == null);
+    TermAttribute text = (TermAttribute) stream.getAttribute(TermAttribute.class);
+    assertTrue(stream.incrementToken());
+    assertEquals(expected, text.term());
+    assertFalse(stream.incrementToken());
     stream.close();
   }
 

@@ -18,7 +18,6 @@ package org.apache.lucene.analysis.payloads;
 
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.index.Payload;
@@ -65,7 +64,7 @@ public class DelimitedPayloadTokenFilterTest extends LuceneTestCase {
     assertTermEquals("lazy", filter, "JJ".getBytes("UTF-8"));
     assertTermEquals("brown", filter, "JJ".getBytes("UTF-8"));
     assertTermEquals("dogs", filter, "NN".getBytes("UTF-8"));
-    assertTrue(filter.next(new Token()) == null);
+    assertFalse(filter.incrementToken());
   }
 
 
@@ -106,10 +105,11 @@ public class DelimitedPayloadTokenFilterTest extends LuceneTestCase {
   }
 
   void assertTermEquals(String expected, TokenStream stream, byte[] expectPay) throws Exception {
-    Token tok = new Token();
-    assertTrue(stream.next(tok) != null);
-    assertEquals(expected, tok.term());
-    Payload payload = tok.getPayload();
+    TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class);
+    PayloadAttribute payloadAtt = (PayloadAttribute) stream.getAttribute(PayloadAttribute.class);
+    assertTrue(stream.incrementToken());
+    assertEquals(expected, termAtt.term());
+    Payload payload = payloadAtt.getPayload();
     if (payload != null) {
       assertTrue(payload.length() + " does not equal: " + expectPay.length, payload.length() == expectPay.length);
       for (int i = 0; i < expectPay.length; i++) {

@@ -17,10 +17,12 @@ package org.apache.lucene.analysis.payloads;
  */
 
 import junit.framework.TestCase;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 
 import java.io.IOException;
 import java.io.StringReader;
@@ -44,36 +46,44 @@ public class NumericPayloadTokenFilterTest extends TestCase {
 
     NumericPayloadTokenFilter nptf = new NumericPayloadTokenFilter(new WordTokenFilter(new WhitespaceTokenizer(new StringReader(test))), 3, "D");
     boolean seenDogs = false;
-    final Token reusableToken = new Token();
-    for (Token nextToken = nptf.next(reusableToken); nextToken != null; nextToken = nptf.next(reusableToken)) {
-      if (nextToken.term().equals("dogs")){
+    TermAttribute termAtt = (TermAttribute) nptf.getAttribute(TermAttribute.class);
+    TypeAttribute typeAtt = (TypeAttribute) nptf.getAttribute(TypeAttribute.class);
+    PayloadAttribute payloadAtt = (PayloadAttribute) nptf.getAttribute(PayloadAttribute.class);
+    while (nptf.incrementToken()) {
+      if (termAtt.term().equals("dogs")) {
         seenDogs = true;
-        assertTrue(nextToken.type() + " is not equal to " + "D", nextToken.type().equals("D") == true);
-        assertTrue("nextToken.getPayload() is null and it shouldn't be", nextToken.getPayload() != null);
-        byte [] bytes = nextToken.getPayload().getData();//safe here to just use the bytes, otherwise we should use offset, length
-        assertTrue(bytes.length + " does not equal: " + nextToken.getPayload().length(), bytes.length == nextToken.getPayload().length());
-        assertTrue(nextToken.getPayload().getOffset() + " does not equal: " + 0, nextToken.getPayload().getOffset() == 0);
+        assertTrue(typeAtt.type() + " is not equal to " + "D", typeAtt.type().equals("D") == true);
+        assertTrue("payloadAtt.getPayload() is null and it shouldn't be", payloadAtt.getPayload() != null);
+        byte [] bytes = payloadAtt.getPayload().getData();//safe here to just use the bytes, otherwise we should use offset, length
+        assertTrue(bytes.length + " does not equal: " + payloadAtt.getPayload().length(), bytes.length == payloadAtt.getPayload().length());
+        assertTrue(payloadAtt.getPayload().getOffset() + " does not equal: " + 0, payloadAtt.getPayload().getOffset() == 0);
         float pay = PayloadHelper.decodeFloat(bytes);
         assertTrue(pay + " does not equal: " + 3, pay == 3);
       } else {
-        assertTrue(nextToken.type() + " is not null and it should be", nextToken.type().equals("word"));
+        assertTrue(typeAtt.type() + " is not null and it should be", typeAtt.type().equals("word"));
       }
     }
     assertTrue(seenDogs + " does not equal: " + true, seenDogs == true);
   }
 
-  private class WordTokenFilter extends TokenFilter {
+  private final class WordTokenFilter extends TokenFilter {
+    private TermAttribute termAtt;
+    private TypeAttribute typeAtt;
+
     private WordTokenFilter(TokenStream input) {
       super(input);
+      termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+      typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
     }
 
-    public Token next(final Token reusableToken) throws IOException {
-      assert reusableToken != null;
-      Token nextToken = input.next(reusableToken);
-      if (nextToken != null && nextToken.term().equals("dogs")) {
-        nextToken.setType("D");
+    public boolean incrementToken() throws IOException {
+      if (input.incrementToken()) {
+        if (termAtt.term().equals("dogs"))
+          typeAtt.setType("D");
+        return true;
+      } else {
+        return false;
       }
-      return nextToken;
     }
   }
 

@@ -17,8 +17,9 @@ package org.apache.lucene.analysis.payloads;
  */
 
 import junit.framework.TestCase;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.index.Payload;
 
 import java.io.IOException;
@@ -43,16 +44,17 @@ public class TokenOffsetPayloadTokenFilterTest extends TestCase {
 
     TokenOffsetPayloadTokenFilter nptf = new TokenOffsetPayloadTokenFilter(new WhitespaceTokenizer(new StringReader(test)));
     int count = 0;
-    final Token reusableToken = new Token();
-    for (Token nextToken = nptf.next(reusableToken); nextToken != null; nextToken = nptf.next(reusableToken)) {
-      assertTrue("nextToken is null and it shouldn't be", nextToken != null);
-      Payload pay = nextToken.getPayload();
+    PayloadAttribute payloadAtt = (PayloadAttribute) nptf.getAttribute(PayloadAttribute.class);
+    OffsetAttribute offsetAtt = (OffsetAttribute) nptf.getAttribute(OffsetAttribute.class);
+
+    while (nptf.incrementToken()) {
+      Payload pay = payloadAtt.getPayload();
       assertTrue("pay is null and it shouldn't be", pay != null);
       byte [] data = pay.getData();
       int start = PayloadHelper.decodeInt(data, 0);
-      assertTrue(start + " does not equal: " + nextToken.startOffset(), start == nextToken.startOffset());
+      assertTrue(start + " does not equal: " + offsetAtt.startOffset(), start == offsetAtt.startOffset());
       int end = PayloadHelper.decodeInt(data, 4);
-      assertTrue(end + " does not equal: " + nextToken.endOffset(), end == nextToken.endOffset());
+      assertTrue(end + " does not equal: " + offsetAtt.endOffset(), end == offsetAtt.endOffset());
       count++;
     }
     assertTrue(count + " does not equal: " + 10, count == 10);

@@ -19,8 +19,10 @@ package org.apache.lucene.analysis.payloads;
 import junit.framework.TestCase;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 
 import java.io.IOException;
 import java.io.StringReader;
@@ -45,32 +47,39 @@ public class TypeAsPayloadTokenFilterTest extends TestCase {
 
     TypeAsPayloadTokenFilter nptf = new TypeAsPayloadTokenFilter(new WordTokenFilter(new WhitespaceTokenizer(new StringReader(test))));
     int count = 0;
-    final Token reusableToken = new Token();
-    for (Token nextToken = nptf.next(reusableToken); nextToken != null; nextToken = nptf.next(reusableToken)) {
-      assertTrue(nextToken.type() + " is not null and it should be", nextToken.type().equals(String.valueOf(Character.toUpperCase(nextToken.termBuffer()[0]))));
-      assertTrue("nextToken.getPayload() is null and it shouldn't be", nextToken.getPayload() != null);
-      String type = new String(nextToken.getPayload().getData(), "UTF-8");
+    TermAttribute termAtt = (TermAttribute) nptf.getAttribute(TermAttribute.class);
+    TypeAttribute typeAtt = (TypeAttribute) nptf.getAttribute(TypeAttribute.class);
+    PayloadAttribute payloadAtt = (PayloadAttribute) nptf.getAttribute(PayloadAttribute.class);
+
+    while (nptf.incrementToken()) {
+      assertTrue(typeAtt.type() + " is not null and it should be", typeAtt.type().equals(String.valueOf(Character.toUpperCase(termAtt.termBuffer()[0]))));
+      assertTrue("nextToken.getPayload() is null and it shouldn't be", payloadAtt.getPayload() != null);
+      String type = new String(payloadAtt.getPayload().getData(), "UTF-8");
       assertTrue("type is null and it shouldn't be", type != null);
-      assertTrue(type + " is not equal to " + nextToken.type(), type.equals(nextToken.type()) == true);
+      assertTrue(type + " is not equal to " + typeAtt.type(), type.equals(typeAtt.type()) == true);
       count++;
     }
 
     assertTrue(count + " does not equal: " + 10, count == 10);
   }
 
-  private class WordTokenFilter extends TokenFilter {
+  private final class WordTokenFilter extends TokenFilter {
+    private TermAttribute termAtt;
+    private TypeAttribute typeAtt;
+
     private WordTokenFilter(TokenStream input) {
       super(input);
+      termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+      typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
     }
 
-    public Token next(final Token reusableToken) throws IOException {
-      assert reusableToken != null;
-      Token nextToken = input.next(reusableToken);
-      if (nextToken != null) {
-        nextToken.setType(String.valueOf(Character.toUpperCase(nextToken.termBuffer()[0])));
+    public boolean incrementToken() throws IOException {
+      if (input.incrementToken()) {
+        typeAtt.setType(String.valueOf(Character.toUpperCase(termAtt.termBuffer()[0])));
+        return true;
+      } else {
+        return false;
       }
-      return nextToken;
     }
   }
 

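A side observation on the two lookup calls used in the hunks above (editorial, not text from the patch): the converted filters and tokenizers call addAttribute(...) in their constructors, which registers the attribute on the stream if it is not there yet, while the tests call getAttribute(...) on a fully built analyzer stream and rely on the attribute already being registered. A small sketch of the difference, with WhitespaceTokenizer chosen arbitrarily as the stream:

    import java.io.StringReader;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class AttributeLookupSketch {
      public static void main(String[] args) {
        TokenStream stream = new WhitespaceTokenizer(new StringReader("abc"));
        // addAttribute: creates and registers the attribute if absent, otherwise
        // returns the instance already attached to the stream.
        TermAttribute added = (TermAttribute) stream.addAttribute(TermAttribute.class);
        // getAttribute: returns the registered instance; it is expected to fail if the
        // attribute was never added, which is why the tests call it only on built streams.
        TermAttribute existing = (TermAttribute) stream.getAttribute(TermAttribute.class);
        System.out.println(added == existing); // prints true: both refer to the same shared instance
      }
    }
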
@@ -19,9 +19,9 @@ package org.apache.lucene.analysis.reverse;
 
 import java.io.StringReader;
 
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.util.LuceneTestCase;
 
 public class TestReverseStringFilter extends LuceneTestCase {
@@ -29,13 +29,18 @@ public class TestReverseStringFilter extends LuceneTestCase {
     TokenStream stream = new WhitespaceTokenizer(
         new StringReader("Do have a nice day")); // 1-4 length string
     ReverseStringFilter filter = new ReverseStringFilter(stream);
-    final Token reusableToken = new Token();
-    assertEquals("oD", filter.next(reusableToken).term());
-    assertEquals("evah", filter.next(reusableToken).term());
-    assertEquals("a", filter.next(reusableToken).term());
-    assertEquals("ecin", filter.next(reusableToken).term());
-    assertEquals("yad", filter.next(reusableToken).term());
-    assertNull(filter.next(reusableToken));
+    TermAttribute text = (TermAttribute) filter.getAttribute(TermAttribute.class);
+    assertTrue(filter.incrementToken());
+    assertEquals("oD", text.term());
+    assertTrue(filter.incrementToken());
+    assertEquals("evah", text.term());
+    assertTrue(filter.incrementToken());
+    assertEquals("a", text.term());
+    assertTrue(filter.incrementToken());
+    assertEquals("ecin", text.term());
+    assertTrue(filter.incrementToken());
+    assertEquals("yad", text.term());
+    assertFalse(filter.incrementToken());
   }
 
   public void testReverseString() throws Exception {

@@ -26,8 +26,8 @@ import java.io.StringReader;
 
 import junit.framework.TestCase;
 
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
 /**
  * Test case for RussianAnalyzer.
@@ -77,26 +77,21 @@ public class TestRussianAnalyzer extends TestCase
             sampleUnicode,
             RussianCharsets.UnicodeRussian);
 
-        final Token reusableToken = new Token();
-        final Token reusableSampleToken = new Token();
-        Token nextToken;
-        Token nextSampleToken;
+        TermAttribute text = (TermAttribute) in.getAttribute(TermAttribute.class);
+        TermAttribute sampleText = (TermAttribute) sample.getAttribute(TermAttribute.class);
         for (;;)
         {
-            nextToken = in.next(reusableToken);
+            if (in.incrementToken() == false)
+                break;
 
-            if (nextToken == null)
-            {
-                break;
-            }
-
-            nextSampleToken = sample.next(reusableSampleToken);
+            boolean nextSampleToken = sample.incrementToken();
             assertEquals(
                 "Unicode",
-                nextToken.term(),
-                nextSampleToken == null
+                text.term(),
+                nextSampleToken == false
                 ? null
-                : nextSampleToken.term());
+                : sampleText.term());
         }
 
         inWords.close();
@@ -118,29 +113,22 @@ public class TestRussianAnalyzer extends TestCase
             sampleKOI8,
             RussianCharsets.KOI8);
 
-        final Token reusableToken = new Token();
-        final Token reusableSampleToken = new Token();
-        Token nextToken;
-        Token nextSampleToken;
+        TermAttribute text = (TermAttribute) in.getAttribute(TermAttribute.class);
+        TermAttribute sampleText = (TermAttribute) sample.getAttribute(TermAttribute.class);
        for (;;)
         {
-            nextToken = in.next(reusableToken);
+            if (in.incrementToken() == false)
+                break;
 
-            if (nextToken == null)
-            {
-                break;
-            }
-
-            nextSampleToken = sample.next(reusableSampleToken);
+            boolean nextSampleToken = sample.incrementToken();
             assertEquals(
                 "KOI8",
-                nextToken.term(),
-                nextSampleToken == null
+                text.term(),
+                nextSampleToken == false
                 ? null
-                : nextSampleToken.term());
+                : sampleText.term());
 
         }
 
         inWordsKOI8.close();
         sampleKOI8.close();
     }
@@ -159,27 +147,21 @@ public class TestRussianAnalyzer extends TestCase
             sample1251,
             RussianCharsets.CP1251);
 
-        final Token reusableToken = new Token();
-        final Token reusableSampleToken = new Token();
-        Token nextToken;
-        Token nextSampleToken;
+        TermAttribute text = (TermAttribute) in.getAttribute(TermAttribute.class);
+        TermAttribute sampleText = (TermAttribute) sample.getAttribute(TermAttribute.class);
         for (;;)
         {
-            nextToken = in.next(reusableToken);
+            if (in.incrementToken() == false)
+                break;
 
-            if (nextToken == null)
-            {
-                break;
-            }
-
-            nextSampleToken = sample.next(reusableSampleToken);
+            boolean nextSampleToken = sample.incrementToken();
             assertEquals(
                 "1251",
-                nextToken.term(),
-                nextSampleToken == null
+                text.term(),
+                nextSampleToken == false
                 ? null
-                : nextSampleToken.term());
+                : sampleText.term());
 
         }
 
         inWords1251.close();
@@ -192,10 +174,13 @@ public class TestRussianAnalyzer extends TestCase
         RussianAnalyzer ra = new RussianAnalyzer();
         TokenStream stream = ra.tokenStream("", reader);
 
-        final Token reusableToken = new Token();
+        TermAttribute termText = (TermAttribute) stream.getAttribute(TermAttribute.class);
         try {
-            assertEquals("text", stream.next(reusableToken).term());
-            assertNotNull("RussianAnalyzer's tokenizer skips numbers from input text", stream.next(reusableToken));
+            assertTrue(stream.incrementToken());
+            assertEquals("text", termText.term());
+            assertTrue(stream.incrementToken());
+            assertEquals("RussianAnalyzer's tokenizer skips numbers from input text", "1000", termText.term());
+            assertFalse(stream.incrementToken());
         }
         catch (IOException e)
         {

@@ -18,10 +18,14 @@ package org.apache.lucene.analysis.th;
  */
 
 import java.io.StringReader;
 
 import junit.framework.TestCase;
 
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 
 /**
  * Test case for ThaiAnalyzer, modified from TestFrenchAnalyzer
@@ -70,19 +74,20 @@ public class TestThaiAnalyzer extends TestCase {
     throws Exception {
 
     TokenStream ts = a.tokenStream("dummy", new StringReader(input));
-    final Token reusableToken = new Token();
+    TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
+    OffsetAttribute offsetAtt = (OffsetAttribute) ts.getAttribute(OffsetAttribute.class);
+    TypeAttribute typeAtt = (TypeAttribute) ts.getAttribute(TypeAttribute.class);
     for (int i = 0; i < output.length; i++) {
-      Token nextToken = ts.next(reusableToken);
-      assertNotNull(nextToken);
-      assertEquals(nextToken.term(), output[i]);
+      assertTrue(ts.incrementToken());
+      assertEquals(termAtt.term(), output[i]);
       if (startOffsets != null)
-        assertEquals(nextToken.startOffset(), startOffsets[i]);
+        assertEquals(offsetAtt.startOffset(), startOffsets[i]);
      if (endOffsets != null)
-        assertEquals(nextToken.endOffset(), endOffsets[i]);
+        assertEquals(offsetAtt.endOffset(), endOffsets[i]);
      if (types != null)
-        assertEquals(nextToken.type(), types[i]);
+        assertEquals(typeAtt.type(), types[i]);
     }
-    assertNull(ts.next(reusableToken));
+    assertFalse(ts.incrementToken());
     ts.close();
   }
 

@ -20,8 +20,10 @@ package org.apache.lucene.analysis.cn.smart;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Token;
|
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Tokenizes input text into sentences.
|
* Tokenizes input text into sentences.
|
||||||
|
@ -29,7 +31,7 @@ import org.apache.lucene.analysis.Tokenizer;
|
||||||
* The output tokens can then be broken into words with {@link WordTokenFilter}
|
* The output tokens can then be broken into words with {@link WordTokenFilter}
|
||||||
* </p>
|
* </p>
|
||||||
*/
|
*/
|
||||||
public class SentenceTokenizer extends Tokenizer {
|
public final class SentenceTokenizer extends Tokenizer {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* End of sentence punctuation: 。,!?;,!?;
|
* End of sentence punctuation: 。,!?;,!?;
|
||||||
|
@ -39,12 +41,19 @@ public class SentenceTokenizer extends Tokenizer {
|
||||||
private final StringBuffer buffer = new StringBuffer();
|
private final StringBuffer buffer = new StringBuffer();
|
||||||
|
|
||||||
private int tokenStart = 0, tokenEnd = 0;
|
private int tokenStart = 0, tokenEnd = 0;
|
||||||
|
|
||||||
|
private TermAttribute termAtt;
|
||||||
|
private OffsetAttribute offsetAtt;
|
||||||
|
private TypeAttribute typeAtt;
|
||||||
|
|
||||||
public SentenceTokenizer(Reader reader) {
|
public SentenceTokenizer(Reader reader) {
|
||||||
super(reader);
|
super(reader);
|
||||||
|
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
|
||||||
|
offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
|
||||||
|
typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
|
||||||
}
|
}
|
||||||
|
|
||||||
public Token next(final Token reusableToken) throws IOException {
|
public boolean incrementToken() throws IOException {
|
||||||
buffer.setLength(0);
|
buffer.setLength(0);
|
||||||
int ci;
|
int ci;
|
||||||
char ch, pch;
|
char ch, pch;
|
||||||
|
@ -83,11 +92,12 @@ public class SentenceTokenizer extends Tokenizer {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (buffer.length() == 0)
|
if (buffer.length() == 0)
|
||||||
return null;
|
return false;
|
||||||
else {
|
else {
|
||||||
reusableToken.clear();
|
termAtt.setTermBuffer(buffer.toString());
|
||||||
reusableToken.reinit(buffer.toString(), input.correctOffset(tokenStart), input.correctOffset(tokenEnd), "sentence");
|
offsetAtt.setOffset(input.correctOffset(tokenStart), input.correctOffset(tokenEnd));
|
||||||
return reusableToken;
|
typeAtt.setType("sentence");
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
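
// A minimal sketch (not part of this commit) of the producer side of the new API seen above:
// declare the attributes once, fill them inside incrementToken(), and signal end of stream by
// returning false instead of null. The class name and the single emitted value are assumptions.
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public final class SingleValueTokenStream extends TokenStream {
  private final String value;   // hypothetical single value to emit
  private boolean done = false;
  private final TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
  private final OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);

  public SingleValueTokenStream(String value) {
    this.value = value;
  }

  public boolean incrementToken() {
    if (done) return false;                    // nothing left to emit
    done = true;
    termAtt.setTermBuffer(value);              // term text
    offsetAtt.setOffset(0, value.length());    // character offsets
    return true;
  }
}
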
@@ -20,7 +20,6 @@ package org.apache.lucene.analysis.cn.smart;
 import java.util.ArrayList;
 import java.util.List;
 
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.cn.smart.hhmm.HHMMSegmenter;
 import org.apache.lucene.analysis.cn.smart.hhmm.SegToken;
 import org.apache.lucene.analysis.cn.smart.hhmm.SegTokenFilter;
@@ -37,11 +36,11 @@ class WordSegmenter {
   /**
    * Segment a sentence into words with {@link HHMMSegmenter}
    * 
-   * @param sentenceToken sentence {@link Token}
+   * @param sentence input sentence
+   * @param startOffset start offset of sentence
    * @return {@link List} of {@link SegToken}
    */
-  public List segmentSentence(Token sentenceToken) {
-    String sentence = sentenceToken.term();
+  public List segmentSentence(String sentence, int startOffset) {
 
     List segTokenList = hhmmSegmenter.process(sentence);
 
@@ -49,25 +48,25 @@ class WordSegmenter {
 
     // tokens from sentence, excluding WordType.SENTENCE_BEGIN and WordType.SENTENCE_END
     for (int i = 1; i < segTokenList.size() - 1; i++) {
-      result.add(convertSegToken((SegToken) segTokenList.get(i), sentence,
-          sentenceToken.startOffset(), "word"));
+      result.add(convertSegToken((SegToken) segTokenList.get(i), sentence, startOffset));
     }
     return result;
 
   }
 
   /**
-   * Convert a {@link SegToken} to a Lucene {@link Token}
+   * Process a {@link SegToken} so that it is ready for indexing.
+   * 
+   * This method calculates offsets and normalizes the token with {@link SegTokenFilter}.
    * 
    * @param st input {@link SegToken}
    * @param sentence associated Sentence
    * @param sentenceStartOffset offset into sentence
-   * @param type token type, default is word
-   * @return Lucene {@link Token}
+   * @return Lucene {@link SegToken}
    */
-  public Token convertSegToken(SegToken st, String sentence,
-      int sentenceStartOffset, String type) {
-    Token result;
+  public SegToken convertSegToken(SegToken st, String sentence,
+      int sentenceStartOffset) {
     switch (st.wordType) {
       case WordType.STRING:
       case WordType.NUMBER:
@@ -81,9 +80,8 @@ class WordSegmenter {
     }
 
     st = tokenFilter.filter(st);
-    result = new Token(st.charArray, 0, st.charArray.length, st.startOffset
-        + sentenceStartOffset, st.endOffset + sentenceStartOffset);
-    return result;
+    st.startOffset += sentenceStartOffset;
+    st.endOffset += sentenceStartOffset;
+    return st;
   }
 }
@@ -21,20 +21,27 @@ import java.io.IOException;
 import java.util.Iterator;
 import java.util.List;
 
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.cn.smart.hhmm.SegToken;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 
 /**
  * A {@link TokenFilter} that breaks sentences into words.
  */
-public class WordTokenFilter extends TokenFilter {
+public final class WordTokenFilter extends TokenFilter {
 
   private WordSegmenter wordSegmenter;
 
   private Iterator tokenIter;
 
   private List tokenBuffer;
 
+  private TermAttribute termAtt;
+  private OffsetAttribute offsetAtt;
+  private TypeAttribute typeAtt;
+
   /**
    * Construct a new WordTokenizer.
@@ -44,32 +51,34 @@ public class WordTokenFilter extends TokenFilter {
   public WordTokenFilter(TokenStream in) {
     super(in);
     this.wordSegmenter = new WordSegmenter();
+    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+    offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+    typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
   }
 
-  public Token next(final Token reusableSentenceToken) throws IOException {
-    if (tokenIter != null && tokenIter.hasNext())
-      return (Token) tokenIter.next();
-    else {
-      Token nextToken = input.next(reusableSentenceToken);
-      if (processNextSentence(nextToken)) {
-        return (Token) tokenIter.next();
-      } else
-        return null;
-    }
-  }
-
-  /**
-   * Process the next input sentence, placing tokens into tokenBuffer
-   *
-   * @param reusableSentenceToken input sentence
-   * @return true if more tokens were placed into tokenBuffer.
-   * @throws IOException
-   */
-  private boolean processNextSentence(final Token reusableSentenceToken) throws IOException {
-    if (reusableSentenceToken == null)
-      return false;
-    tokenBuffer = wordSegmenter.segmentSentence(reusableSentenceToken);
-    tokenIter = tokenBuffer.iterator();
-    return tokenBuffer != null && tokenIter.hasNext();
+  public boolean incrementToken() throws IOException {
+    if (tokenIter == null || !tokenIter.hasNext()) {
+      // there are no remaining tokens from the current sentence... are there more sentences?
+      if (input.incrementToken()) {
+        // a new sentence is available: process it.
+        tokenBuffer = wordSegmenter.segmentSentence(termAtt.term(), offsetAtt.startOffset());
+        tokenIter = tokenBuffer.iterator();
+        /*
+         * it should not be possible to have a sentence with 0 words, check just in case.
+         * returning EOS isn't the best either, but its the behavior of the original code.
+         */
+        if (!tokenIter.hasNext())
+          return false;
+      } else {
+        return false; // no more sentences, end of stream!
+      }
+    }
+
+    // There are remaining tokens from the current sentence, return the next one.
+    SegToken nextWord = (SegToken) tokenIter.next();
+    termAtt.setTermBuffer(nextWord.charArray, 0, nextWord.charArray.length);
+    offsetAtt.setOffset(nextWord.startOffset, nextWord.endOffset);
+    typeAtt.setType("word");
+    return true;
   }
 }
@@ -29,6 +29,9 @@ import junit.framework.TestCase;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 
 public class TestSmartChineseAnalyzer extends TestCase {
 
@@ -108,22 +111,23 @@ public class TestSmartChineseAnalyzer extends TestCase {
   public void assertAnalyzesTo(Analyzer a, String input, String[] output, int startOffsets[], int endOffsets[], String types[])
     throws Exception {
 
     TokenStream ts = a.tokenStream("dummy", new StringReader(input));
-    final Token reusableToken = new Token();
+    TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
+    OffsetAttribute offsetAtt = (OffsetAttribute) ts.getAttribute(OffsetAttribute.class);
+    TypeAttribute typeAtt = (TypeAttribute) ts.getAttribute(TypeAttribute.class);
     for (int i = 0; i < output.length; i++) {
-      Token nextToken = ts.next(reusableToken);
-      assertNotNull(nextToken);
-      assertEquals(nextToken.term(), output[i]);
+      assertTrue(ts.incrementToken());
+      assertEquals(termAtt.term(), output[i]);
       if (startOffsets != null)
-        assertEquals(nextToken.startOffset(), startOffsets[i]);
+        assertEquals(offsetAtt.startOffset(), startOffsets[i]);
       if (endOffsets != null)
-        assertEquals(nextToken.endOffset(), endOffsets[i]);
+        assertEquals(offsetAtt.endOffset(), endOffsets[i]);
       if (types != null)
-        assertEquals(nextToken.type(), types[i]);
+        assertEquals(typeAtt.type(), types[i]);
+    }
+    assertFalse(ts.incrementToken());
+    ts.close();
     }
-    assertNull(ts.next(reusableToken));
-    ts.close();
-  }
 
   public void assertAnalyzesTo(Analyzer a, String input, String[] output) throws Exception {
     assertAnalyzesTo(a, input, output, null, null, null);
@@ -21,6 +21,7 @@ package org.apache.lucene.collation;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.util.IndexableBinaryStringTools;
 
 import java.io.IOException;
@@ -73,8 +74,9 @@ import java.text.Collator;
  * {@link ICUCollationKeyFilter} on the query side, or vice versa.
  * </p>
  */
-public class CollationKeyFilter extends TokenFilter {
+public final class CollationKeyFilter extends TokenFilter {
   private Collator collator = null;
+  private TermAttribute termAtt;
 
   /**
    * @param input Source token stream
@@ -83,25 +85,26 @@ public class CollationKeyFilter extends TokenFilter {
   public CollationKeyFilter(TokenStream input, Collator collator) {
     super(input);
     this.collator = collator;
+    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
   }
 
-  public final Token next(final Token reusableToken) throws IOException {
-    assert reusableToken != null;
-    Token nextToken = input.next(reusableToken);
-    if (nextToken != null) {
-      char[] termBuffer = nextToken.termBuffer();
-      String termText = new String(termBuffer, 0, nextToken.termLength());
+  public boolean incrementToken() throws IOException {
+    if (input.incrementToken()) {
+      char[] termBuffer = termAtt.termBuffer();
+      String termText = new String(termBuffer, 0, termAtt.termLength());
       byte[] collationKey = collator.getCollationKey(termText).toByteArray();
       ByteBuffer collationKeyBuf = ByteBuffer.wrap(collationKey);
       int encodedLength
         = IndexableBinaryStringTools.getEncodedLength(collationKeyBuf);
       if (encodedLength > termBuffer.length) {
-        nextToken.resizeTermBuffer(encodedLength);
+        termAtt.resizeTermBuffer(encodedLength);
       }
-      nextToken.setTermLength(encodedLength);
-      CharBuffer wrappedTermBuffer = CharBuffer.wrap(nextToken.termBuffer());
+      termAtt.setTermLength(encodedLength);
+      CharBuffer wrappedTermBuffer = CharBuffer.wrap(termAtt.termBuffer());
       IndexableBinaryStringTools.encode(collationKeyBuf, wrappedTermBuffer);
+      return true;
+    } else {
+      return false;
     }
-    return nextToken;
   }
 }
@@ -24,6 +24,7 @@ import com.ibm.icu.text.RawCollationKey;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.util.IndexableBinaryStringTools;
 
 import java.io.IOException;
@@ -69,9 +70,10 @@ import java.nio.CharBuffer;
  * java.text.Collator over several languages.
  * </p>
  */
-public class ICUCollationKeyFilter extends TokenFilter {
+public final class ICUCollationKeyFilter extends TokenFilter {
   private Collator collator = null;
   private RawCollationKey reusableKey = new RawCollationKey();
+  private TermAttribute termAtt;
 
   /**
    *
@@ -81,25 +83,26 @@ public class ICUCollationKeyFilter extends TokenFilter {
   public ICUCollationKeyFilter(TokenStream input, Collator collator) {
     super(input);
     this.collator = collator;
+    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
   }
 
-  public final Token next(final Token reusableToken) throws IOException {
-    assert reusableToken != null;
-    Token nextToken = input.next(reusableToken);
-    if (nextToken != null) {
-      char[] termBuffer = nextToken.termBuffer();
-      String termText = new String(termBuffer, 0, nextToken.termLength());
+  public boolean incrementToken() throws IOException {
+    if (input.incrementToken()) {
+      char[] termBuffer = termAtt.termBuffer();
+      String termText = new String(termBuffer, 0, termAtt.termLength());
       collator.getRawCollationKey(termText, reusableKey);
       ByteBuffer collationKeyBuf = ByteBuffer.wrap(reusableKey.bytes, 0, reusableKey.size);
       int encodedLength
         = IndexableBinaryStringTools.getEncodedLength(collationKeyBuf);
       if (encodedLength > termBuffer.length) {
-        nextToken.resizeTermBuffer(encodedLength);
+        termAtt.resizeTermBuffer(encodedLength);
      }
-      nextToken.setTermLength(encodedLength);
-      CharBuffer wrappedTermBuffer = CharBuffer.wrap(nextToken.termBuffer());
+      termAtt.setTermLength(encodedLength);
+      CharBuffer wrappedTermBuffer = CharBuffer.wrap(termAtt.termBuffer());
       IndexableBinaryStringTools.encode(collationKeyBuf, wrappedTermBuffer);
+      return true;
+    } else {
+      return false;
     }
-    return nextToken;
   }
 }
@@ -28,6 +28,8 @@ import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.Field.Index;
@@ -193,11 +195,15 @@ public abstract class AbstractTestCase extends TestCase {
       ch = 0;
     }
 
-    public Token next( Token reusableToken ) throws IOException {
+    TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+    OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+
+    public boolean incrementToken() throws IOException {
       if( !getNextPartialSnippet() )
-        return null;
-      reusableToken.reinit( snippet, startTerm, lenTerm, startOffset, startOffset + lenTerm );
-      return reusableToken;
+        return false;
+      termAtt.setTermBuffer(snippet, startTerm, lenTerm);
+      offsetAtt.setOffset(startOffset, startOffset + lenTerm);
+      return true;
     }
 
     public int getFinalOffset() {
@@ -295,14 +295,21 @@ public class IndexTimeSynonymTest extends AbstractTestCase {
     public TokenArrayAnalyzer( Token... tokens ){
       this.tokens = tokens;
     }
 
     public TokenStream tokenStream(String fieldName, Reader reader) {
-      return new TokenStream(){
+      final Token reusableToken = new Token();
+
+      TokenStream.setOnlyUseNewAPI(true);
+      TokenStream ts = new TokenStream(){
         int p = 0;
-        public Token next( Token reusableToken ) throws IOException {
-          if( p >= tokens.length ) return null;
-          return tokens[p++];
+        public boolean incrementToken() throws IOException {
+          if( p >= tokens.length ) return false;
+          tokens[p++].copyTo(reusableToken);
+          return true;
        }
       };
+      ts.addAttributeImpl(reusableToken);
+      return ts;
     }
   }
 }
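
// A sketch of the bridging trick the test above relies on (assuming, as the hunk suggests, that
// Token is itself an AttributeImpl implementing the standard attribute interfaces): registering
// one Token instance with addAttributeImpl() lets Token-filling code and attribute-based
// consumers share the same state. Class and method names here are illustrative only.
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class TokenAsAttributeExample {
  public static TermAttribute bridge(TokenStream ts, Token reusableToken) {
    ts.addAttributeImpl(reusableToken);  // reusableToken now backs term/offset/type/etc.
    // a subsequent addAttribute() call hands back that same instance
    return (TermAttribute) ts.addAttribute(TermAttribute.class);
  }
}
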
@@ -27,6 +27,7 @@ import junit.framework.TestCase;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.index.IndexReader;
@@ -44,6 +45,7 @@ import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.TopDocCollector;
 import org.apache.lucene.search.TopScoreDocCollector;
+import org.apache.lucene.util.AttributeSource;
 
 /**
  * Asserts equality of content and behaviour of two index readers.
@@ -175,23 +177,26 @@ public class TestIndicesEquals extends TestCase {
       t.setPayload(new Payload(new byte[]{2}));
       tokens.add(t);
       tokens.add(createToken("fin", 7, 9));
-      document.add(new Field("f", new TokenStream() {
+      final Token reusableToken = new Token();
+      TokenStream ts = new TokenStream() {
         Iterator<Token> it = tokens.iterator();
 
-        public Token next(final Token reusableToken) throws IOException {
-          assert reusableToken != null;
+        public final boolean incrementToken() throws IOException {
           if (!it.hasNext()) {
-            return null;
+            return false;
          }
-          // Resettable token streams need to return clones.
-          Token nextToken = (Token) it.next();
-          return (Token) nextToken.clone();
+          reusableToken.reinit(it.next());
+          return true;
        }
 
         public void reset() throws IOException {
           it = tokens.iterator();
         }
-      }));
+      };
+      ts.addAttributeImpl(reusableToken);
+
+      document.add(new Field("f", ts));
     }
   }
 }
@@ -75,6 +75,8 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.index.IndexReader;
@@ -317,11 +319,14 @@ class LuceneMethods {
     int position = 0;
     // Tokenize field and add to postingTable
     TokenStream stream = analyzer.tokenStream(fieldName, reader);
+    TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
+    PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) stream.addAttribute(PositionIncrementAttribute.class);
+
     try {
-      for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
-        position += (nextToken.getPositionIncrement() - 1);
+      while (stream.incrementToken()) {
+        position += (posIncrAtt.getPositionIncrement() - 1);
         position++;
-        String name = nextToken.term();
+        String name = termAtt.term();
         Integer Count = (Integer) tokenMap.get(name);
         if (Count == null) { // not in there yet
           tokenMap.put(name, new Integer(1)); //first one
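
// A sketch (not part of this commit) of the position bookkeeping used above: a token's absolute
// position is the running sum of its position increments, so an increment of 0 stacks a token
// (e.g. a synonym) on the previous position. Class and method names are assumptions.
import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class PositionWalker {
  public static void walk(TokenStream stream) throws IOException {
    TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
    PositionIncrementAttribute posIncrAtt =
        (PositionIncrementAttribute) stream.addAttribute(PositionIncrementAttribute.class);
    int position = -1;
    while (stream.incrementToken()) {
      position += posIncrAtt.getPositionIncrement();  // usually +1; 0 keeps the same position
      System.out.println(position + ": " + termAtt.term());
    }
  }
}
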
@@ -31,9 +31,13 @@ import java.util.regex.Pattern;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.PorterStemFilter;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.AttributeSource;
 
 /**
  * Various fulltext analysis utilities avoiding redundant code in several
@@ -71,21 +75,24 @@ public class AnalyzerUtil {
       public TokenStream tokenStream(final String fieldName, Reader reader) {
         return new TokenFilter(child.tokenStream(fieldName, reader)) {
           private int position = -1;
+          private TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+          private PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+          private OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+          private TypeAttribute typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
 
-          public Token next(final Token reusableToken) throws IOException {
-            assert reusableToken != null;
-            Token nextToken = input.next(reusableToken); // from filter super class
-            log.println(toString(nextToken));
-            return nextToken;
+          public boolean incrementToken() throws IOException {
+            boolean hasNext = input.incrementToken();
+            log.println(toString(hasNext));
+            return hasNext;
           }
 
-          private String toString(Token token) {
-            if (token == null) return "[" + logName + ":EOS:" + fieldName + "]\n";
+          private String toString(boolean hasNext) {
+            if (!hasNext) return "[" + logName + ":EOS:" + fieldName + "]\n";
 
-            position += token.getPositionIncrement();
+            position += posIncrAtt.getPositionIncrement();
             return "[" + logName + ":" + position + ":" + fieldName + ":"
-                + token.term() + ":" + token.startOffset()
-                + "-" + token.endOffset() + ":" + token.type()
+                + termAtt.term() + ":" + offsetAtt.startOffset()
+                + "-" + offsetAtt.endOffset() + ":" + typeAtt.type()
                 + "]";
           }
         };
@@ -121,9 +128,8 @@ public class AnalyzerUtil {
         return new TokenFilter(child.tokenStream(fieldName, reader)) {
           private int todo = maxTokens;
 
-          public Token next(final Token reusableToken) throws IOException {
-            assert reusableToken != null;
-            return --todo >= 0 ? input.next(reusableToken) : null;
+          public boolean incrementToken() throws IOException {
+            return --todo >= 0 ? input.incrementToken() : false;
           }
         };
       }
@@ -240,11 +246,10 @@ public class AnalyzerUtil {
     final ArrayList tokens2 = new ArrayList();
     TokenStream tokenStream = new TokenFilter(child.tokenStream(fieldName, reader)) {
 
-      public Token next(final Token reusableToken) throws IOException {
-        assert reusableToken != null;
-        Token nextToken = input.next(reusableToken); // from filter super class
-        if (nextToken != null) tokens2.add(nextToken.clone());
-        return nextToken;
+      public boolean incrementToken() throws IOException {
+        boolean hasNext = input.incrementToken();
+        if (hasNext) tokens2.add(captureState());
+        return hasNext;
       }
     };
 
@@ -255,10 +260,10 @@ public class AnalyzerUtil {
 
       private Iterator iter = tokens.iterator();
 
-      public Token next(Token token) {
-        assert token != null;
-        if (!iter.hasNext()) return null;
-        return (Token) iter.next();
+      public boolean incrementToken() {
+        if (!iter.hasNext()) return false;
+        restoreState((AttributeSource.State) iter.next());
+        return true;
      }
     };
   }
@@ -302,13 +307,13 @@ public class AnalyzerUtil {
     // compute frequencies of distinct terms
     HashMap map = new HashMap();
     TokenStream stream = analyzer.tokenStream("", new StringReader(text));
+    TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
     try {
-      final Token reusableToken = new Token();
-      for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
-        MutableInteger freq = (MutableInteger) map.get(nextToken.term());
+      while (stream.incrementToken()) {
+        MutableInteger freq = (MutableInteger) map.get(termAtt.term());
         if (freq == null) {
           freq = new MutableInteger(1);
-          map.put(nextToken.term(), freq);
+          map.put(termAtt.term(), freq);
        } else {
           freq.setValue(freq.intValue() + 1);
        }
@@ -28,8 +28,10 @@ import java.util.Iterator;
 import java.util.Map;
 
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.FieldSelector;
 import org.apache.lucene.index.IndexReader;
@@ -274,18 +276,21 @@ public class MemoryIndex implements Serializable {
     return new TokenStream() {
       private Iterator iter = keywords.iterator();
       private int start = 0;
-      public Token next(final Token reusableToken) {
-        assert reusableToken != null;
-        if (!iter.hasNext()) return null;
+      private TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+      private OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+
+      public boolean incrementToken() {
+        if (!iter.hasNext()) return false;
 
         Object obj = iter.next();
         if (obj == null)
           throw new IllegalArgumentException("keyword must not be null");
 
         String term = obj.toString();
-        reusableToken.reinit(term, start, start+reusableToken.termLength());
+        termAtt.setTermBuffer(term);
+        offsetAtt.setOffset(start, start+termAtt.termLength());
         start += term.length() + 1; // separate words by 1 (blank) character
-        return reusableToken;
+        return true;
       }
     };
   }
@@ -350,13 +355,17 @@ public class MemoryIndex implements Serializable {
       int numTokens = 0;
       int numOverlapTokens = 0;
       int pos = -1;
-      final Token reusableToken = new Token();
-      for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
-        String term = nextToken.term();
+      TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
+      PositionIncrementAttribute posIncrAttribute = (PositionIncrementAttribute) stream.addAttribute(PositionIncrementAttribute.class);
+      OffsetAttribute offsetAtt = (OffsetAttribute) stream.addAttribute(OffsetAttribute.class);
+
+      while (stream.incrementToken()) {
+        String term = termAtt.term();
         if (term.length() == 0) continue; // nothing to do
 //        if (DEBUG) System.err.println("token='" + term + "'");
         numTokens++;
-        final int posIncr = nextToken.getPositionIncrement();
+        final int posIncr = posIncrAttribute.getPositionIncrement();
         if (posIncr == 0)
           numOverlapTokens++;
         pos += posIncr;
@@ -369,7 +378,7 @@ public class MemoryIndex implements Serializable {
         if (stride == 1) {
           positions.add(pos);
         } else {
-          positions.add(pos, nextToken.startOffset(), nextToken.endOffset());
+          positions.add(pos, offsetAtt.startOffset(), offsetAtt.endOffset());
         }
       }
 
@@ -30,8 +30,9 @@ import java.util.regex.Pattern;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.StopAnalyzer;
 import org.apache.lucene.analysis.StopFilter;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
 /**
  * Efficient Lucene analyzer/tokenizer that preferably operates on a String rather than a
@@ -331,6 +332,8 @@ public class PatternAnalyzer extends Analyzer {
     private Matcher matcher;
     private int pos = 0;
     private static final Locale locale = Locale.getDefault();
+    private TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+    private OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
 
     public PatternTokenizer(String str, Pattern pattern, boolean toLowerCase) {
       this.str = str;
@@ -338,9 +341,8 @@ public class PatternAnalyzer extends Analyzer {
       this.toLowerCase = toLowerCase;
     }
 
-    public Token next(final Token reusableToken) {
-      assert reusableToken != null;
-      if (matcher == null) return null;
+    public final boolean incrementToken() {
+      if (matcher == null) return false;
 
       while (true) { // loop takes care of leading and trailing boundary cases
         int start = pos;
@@ -357,9 +359,11 @@ public class PatternAnalyzer extends Analyzer {
         if (start != end) { // non-empty match (header/trailer)
           String text = str.substring(start, end);
           if (toLowerCase) text = text.toLowerCase(locale);
-          return reusableToken.reinit(text, start, end);
+          termAtt.setTermBuffer(text);
+          offsetAtt.setOffset(start, end);
+          return true;
        }
-        if (!isMatch) return null;
+        if (!isMatch) return false;
       }
     }
 
@@ -381,6 +385,8 @@ public class PatternAnalyzer extends Analyzer {
     private final boolean toLowerCase;
     private final Set stopWords;
     private static final Locale locale = Locale.getDefault();
+    private TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+    private OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
 
     public FastStringTokenizer(String str, boolean isLetter, boolean toLowerCase, Set stopWords) {
       this.str = str;
@@ -389,8 +395,7 @@ public class PatternAnalyzer extends Analyzer {
       this.stopWords = stopWords;
     }
 
-    public Token next(final Token reusableToken) {
-      assert reusableToken != null;
+    public boolean incrementToken() {
       // cache loop instance vars (performance)
       String s = str;
       int len = s.length();
@@ -430,9 +435,11 @@ public class PatternAnalyzer extends Analyzer {
       pos = i;
       if (text == null)
       {
-        return null;
+        return false;
       }
-      return reusableToken.reinit(text, start, i);
+      termAtt.setTermBuffer(text);
+      offsetAtt.setOffset(start, i);
+      return true;
     }
 
     private boolean isTokenChar(char c, boolean isLetter) {
@@ -19,9 +19,12 @@ package org.apache.lucene.index.memory;
 
 import java.io.IOException;
 
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.AttributeSource;
 
 /**
  * Injects additional tokens for synonyms of token terms fetched from the
@@ -39,9 +42,13 @@ public class SynonymTokenFilter extends TokenFilter {
 
   private String[] stack = null;
   private int index = 0;
-  private Token current = null;
+  private AttributeSource.State current = null;
   private int todo = 0;
 
+  private TermAttribute termAtt;
+  private TypeAttribute typeAtt;
+  private PositionIncrementAttribute posIncrAtt;
+
   /**
    * Creates an instance for the given underlying stream and synonym table.
    *
@@ -64,28 +71,29 @@ public class SynonymTokenFilter extends TokenFilter {
 
     this.synonyms = synonyms;
     this.maxSynonyms = maxSynonyms;
+
+    this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+    this.typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
+    this.posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
   }
 
   /** Returns the next token in the stream, or null at EOS. */
-  public Token next(final Token reusableToken) throws IOException {
-    assert reusableToken != null;
+  public final boolean incrementToken() throws IOException {
     while (todo > 0 && index < stack.length) { // pop from stack
-      Token nextToken = createToken(stack[index++], current, reusableToken);
-      if (nextToken != null) {
+      if (createToken(stack[index++], current)) {
         todo--;
-        return nextToken;
+        return true;
       }
     }
 
-    Token nextToken = input.next(reusableToken);
-    if (nextToken == null) return null; // EOS; iterator exhausted
+    if (!input.incrementToken()) return false; // EOS; iterator exhausted
 
-    stack = synonyms.getSynonyms(nextToken.term()); // push onto stack
+    stack = synonyms.getSynonyms(termAtt.term()); // push onto stack
     if (stack.length > maxSynonyms) randomize(stack);
     index = 0;
-    current = (Token) nextToken.clone();
+    current = captureState();
     todo = maxSynonyms;
-    return nextToken;
+    return true;
   }
 
   /**
@@ -101,12 +109,12 @@ public class SynonymTokenFilter extends TokenFilter {
   * @return a new token, or null to indicate that the given synonym should be
   * ignored
   */
-  protected Token createToken(String synonym, Token current, final Token reusableToken) {
-    reusableToken.reinit(current, synonym);
-    reusableToken.setTermBuffer(synonym);
-    reusableToken.setType(SYNONYM_TOKEN_TYPE);
-    reusableToken.setPositionIncrement(0);
-    return reusableToken;
+  protected boolean createToken(String synonym, AttributeSource.State current) {
+    restoreState(current);
+    termAtt.setTermBuffer(synonym);
+    typeAtt.setType(SYNONYM_TOKEN_TYPE);
+    posIncrAtt.setPositionIncrement(0);
+    return true;
  }
 
   /**
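
// A minimal sketch (not part of this commit) of the captureState()/restoreState() idiom adopted
// above, applied to a simpler task: a filter that emits every input token twice, with the repeat
// stacked at the same position. The class name is an assumption.
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeSource;

public final class RepeatFilter extends TokenFilter {
  private AttributeSource.State saved;
  private final PositionIncrementAttribute posIncrAtt =
      (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);

  public RepeatFilter(TokenStream in) {
    super(in);
  }

  public boolean incrementToken() throws IOException {
    if (saved != null) {                  // replay the snapshot of the previous token
      restoreState(saved);
      posIncrAtt.setPositionIncrement(0); // stack it on the same position
      saved = null;
      return true;
    }
    if (!input.incrementToken()) return false;
    saved = captureState();               // snapshot all attribute values for the repeat
    return true;
  }
}
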
@@ -25,6 +25,7 @@ import java.util.List;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.queryParser.ParseException;
 import org.apache.lucene.search.Query;
 
@@ -105,20 +106,16 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar
 
     // get Analyzer from superclass and tokenize the term
     TokenStream source = getAnalyzer().tokenStream(field, new StringReader(termStr));
-    final Token reusableToken = new Token();
-    Token nextToken;
+    TermAttribute termAtt = (TermAttribute) source.addAttribute(TermAttribute.class);
 
     int countTokens = 0;
     while (true) {
       try {
-        nextToken = source.next(reusableToken);
+        if (!source.incrementToken()) break;
       } catch (IOException e) {
-        nextToken = null;
-      }
-      if (nextToken == null) {
         break;
       }
-      String term = nextToken.term();
+      String term = termAtt.term();
       if (!"".equals(term)) {
         try {
           tlist.set(countTokens++, term);
@@ -191,19 +188,15 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar
     // get Analyzer from superclass and tokenize the term
     TokenStream source = getAnalyzer().tokenStream(field, new StringReader(termStr));
     List tlist = new ArrayList();
-    final Token reusableToken = new Token();
-    Token nextToken;
+    TermAttribute termAtt = (TermAttribute) source.addAttribute(TermAttribute.class);
 
     while (true) {
       try {
-        nextToken = source.next(reusableToken);
+        if (!source.incrementToken()) break;
       } catch (IOException e) {
-        nextToken = null;
-      }
-      if (nextToken == null) {
         break;
       }
-      tlist.add(nextToken.term());
+      tlist.add(termAtt.term());
     }
 
     try {
@@ -241,13 +234,15 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar
       throws ParseException {
     // get Analyzer from superclass and tokenize the term
     TokenStream source = getAnalyzer().tokenStream(field, new StringReader(termStr));
-    final Token reusableToken = new Token();
-    Token nextToken;
+    TermAttribute termAtt = (TermAttribute) source.addAttribute(TermAttribute.class);
+    String nextToken = null;
     boolean multipleTokens = false;
 
     try {
-      nextToken = source.next(reusableToken);
-      multipleTokens = source.next(reusableToken) != null;
+      if (source.incrementToken()) {
+        nextToken = termAtt.term();
+      }
+      multipleTokens = source.incrementToken();
     } catch (IOException e) {
       nextToken = null;
     }
@@ -263,7 +258,7 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar
           + " - tokens were added");
     }
 
-    return (nextToken == null) ? null : super.getFuzzyQuery(field, nextToken.term(), minSimilarity);
+    return (nextToken == null) ? null : super.getFuzzyQuery(field, nextToken, minSimilarity);
   }
 
   /**
@@ -274,20 +269,17 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar
       throws ParseException {
     // get Analyzer from superclass and tokenize the terms
     TokenStream source = getAnalyzer().tokenStream(field, new StringReader(part1));
-    final Token reusableToken = new Token();
-    Token nextToken;
-    Token multipleToken;
+    TermAttribute termAtt = (TermAttribute) source.addAttribute(TermAttribute.class);
     boolean multipleTokens = false;
 
     // part1
     try {
-      nextToken = source.next(reusableToken);
-      if (nextToken != null) {
-        part1 = nextToken.term();
+      if (source.incrementToken()) {
+        part1 = termAtt.term();
       }
-      multipleTokens = source.next(reusableToken) != null;
+      multipleTokens = source.incrementToken();
     } catch (IOException e) {
-      nextToken = null;
+      // ignore
     }
     try {
       source.close();
@@ -301,14 +293,15 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar
 
     // part2
     source = getAnalyzer().tokenStream(field, new StringReader(part2));
+    termAtt = (TermAttribute) source.addAttribute(TermAttribute.class);
+
     try {
-      nextToken = source.next(reusableToken);
-      if (nextToken != null) {
-        part2 = nextToken.term();
+      if (source.incrementToken()) {
+        part2 = termAtt.term();
      }
-      multipleTokens = source.next(reusableToken) != null;
+      multipleTokens = source.incrementToken();
     } catch (IOException e) {
-      nextToken = null;
+      // ignore
     }
     try {
       source.close();
@@ -26,6 +26,7 @@ import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.WhitespaceAnalyzer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.document.DateTools;
 import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.FuzzyQuery;
@@ -57,28 +59,27 @@ public class TestPrecedenceQueryParser extends TestCase {
     boolean inPhrase = false;
     int savedStart = 0, savedEnd = 0;
 
-    public Token next(final Token reusableToken) throws IOException {
-      assert reusableToken != null;
+    TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+    OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+
+    public boolean incrementToken() throws IOException {
       if (inPhrase) {
         inPhrase = false;
-        reusableToken.setTermBuffer("phrase2");
-        reusableToken.setStartOffset(savedStart);
-        reusableToken.setEndOffset(savedEnd);
-        return reusableToken;
+        termAtt.setTermBuffer("phrase2");
+        offsetAtt.setOffset(savedStart, savedEnd);
+        return true;
       } else
-        for (Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken)) {
-          if (nextToken.term().equals("phrase")) {
+        while(input.incrementToken())
+          if (termAtt.term().equals("phrase")) {
             inPhrase = true;
-            savedStart = nextToken.startOffset();
-            savedEnd = nextToken.endOffset();
-            nextToken.setTermBuffer("phrase1");
-            nextToken.setStartOffset(savedStart);
-            nextToken.setEndOffset(savedEnd);
-            return nextToken;
-          } else if (!nextToken.term().equals("stop"))
-            return nextToken;
-        }
-      return null;
+            savedStart = offsetAtt.startOffset();
+            savedEnd = offsetAtt.endOffset();
+            termAtt.setTermBuffer("phrase1");
+            offsetAtt.setOffset(savedStart, savedEnd);
+            return true;
+          } else if (!termAtt.term().equals("stop"))
+            return true;
+      return false;
     }
   }
 
@@ -27,6 +27,7 @@ import java.util.Iterator;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.TermEnum;
@@ -181,13 +182,14 @@ public class FuzzyLikeThisQuery extends Query
     {
         if(f.queryString==null) return;
         TokenStream ts=analyzer.tokenStream(f.fieldName,new StringReader(f.queryString));
-        final Token reusableToken = new Token();
+        TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
+
         int corpusNumDocs=reader.numDocs();
         Term internSavingTemplateTerm =new Term(f.fieldName); //optimization to avoid constructing new Term() objects
         HashSet processedTerms=new HashSet();
-        for (Token nextToken = ts.next(reusableToken); nextToken!=null; nextToken = ts.next(reusableToken))
+        while (ts.incrementToken())
         {
-            String term = nextToken.term();
+            String term = termAtt.term();
             if(!processedTerms.contains(term))
             {
                 processedTerms.add(term);
@ -28,9 +28,9 @@ import org.apache.lucene.search.IndexSearcher;
|
||||||
import org.apache.lucene.search.Query;
|
import org.apache.lucene.search.Query;
|
||||||
import org.apache.lucene.search.Hits;
|
import org.apache.lucene.search.Hits;
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.Token;
|
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||||
import org.apache.lucene.document.Document;
|
import org.apache.lucene.document.Document;
|
||||||
|
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
@ -829,9 +829,10 @@ public final class MoreLikeThis {
|
||||||
TokenStream ts = analyzer.tokenStream(fieldName, r);
|
TokenStream ts = analyzer.tokenStream(fieldName, r);
|
||||||
int tokenCount=0;
|
int tokenCount=0;
|
||||||
// for every token
|
// for every token
|
||||||
final Token reusableToken = new Token();
|
TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
|
||||||
for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) {
|
|
||||||
String word = nextToken.term();
|
while (ts.incrementToken()) {
|
||||||
|
String word = termAtt.term();
|
||||||
tokenCount++;
|
tokenCount++;
|
||||||
if(tokenCount>maxNumTokensParsed)
|
if(tokenCount>maxNumTokensParsed)
|
||||||
{
|
{
|
||||||
|
|
|
@ -21,8 +21,8 @@ import java.util.HashSet;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.Token;
|
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||||
import org.apache.lucene.index.Term;
|
import org.apache.lucene.index.Term;
|
||||||
import org.apache.lucene.search.BooleanClause;
|
import org.apache.lucene.search.BooleanClause;
|
||||||
import org.apache.lucene.search.BooleanQuery;
|
import org.apache.lucene.search.BooleanQuery;
|
||||||
|
@ -86,11 +86,12 @@ public final class SimilarityQueries
|
||||||
throws IOException
|
throws IOException
|
||||||
{
|
{
|
||||||
TokenStream ts = a.tokenStream( field, new StringReader( body));
|
TokenStream ts = a.tokenStream( field, new StringReader( body));
|
||||||
|
TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
|
||||||
|
|
||||||
BooleanQuery tmp = new BooleanQuery();
|
BooleanQuery tmp = new BooleanQuery();
|
||||||
Set already = new HashSet(); // ignore dups
|
Set already = new HashSet(); // ignore dups
|
||||||
final Token reusableToken = new Token();
|
while (ts.incrementToken()) {
|
||||||
for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) {
|
String word = termAtt.term();
|
||||||
String word = nextToken.term();
|
|
||||||
// ignore opt stop words
|
// ignore opt stop words
|
||||||
if ( stop != null &&
|
if ( stop != null &&
|
||||||
stop.contains( word)) continue;
|
stop.contains( word)) continue;
|
||||||
|
|
|
@ -22,6 +22,7 @@ import java.io.IOException;
|
||||||
import org.apache.lucene.analysis.Token;
|
import org.apache.lucene.analysis.Token;
|
||||||
import org.apache.lucene.analysis.TokenFilter;
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||||
import org.tartarus.snowball.SnowballProgram;
|
import org.tartarus.snowball.SnowballProgram;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -33,9 +34,12 @@ public class SnowballFilter extends TokenFilter {
|
||||||
|
|
||||||
private SnowballProgram stemmer;
|
private SnowballProgram stemmer;
|
||||||
|
|
||||||
|
private TermAttribute termAtt;
|
||||||
|
|
||||||
public SnowballFilter(TokenStream input, SnowballProgram stemmer) {
|
public SnowballFilter(TokenStream input, SnowballProgram stemmer) {
|
||||||
super(input);
|
super(input);
|
||||||
this.stemmer = stemmer;
|
this.stemmer = stemmer;
|
||||||
|
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -56,21 +60,34 @@ public class SnowballFilter extends TokenFilter {
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
throw new RuntimeException(e.toString());
|
throw new RuntimeException(e.toString());
|
||||||
}
|
}
|
||||||
|
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Returns the next input Token, after being stemmed */
|
/** Returns the next input Token, after being stemmed */
|
||||||
public final Token next(final Token reusableToken) throws IOException {
|
public final boolean incrementToken() throws IOException {
|
||||||
assert reusableToken != null;
|
if (input.incrementToken()) {
|
||||||
Token nextToken = input.next(reusableToken);
|
String originalTerm = termAtt.term();
|
||||||
if (nextToken == null)
|
stemmer.setCurrent(originalTerm);
|
||||||
return null;
|
stemmer.stem();
|
||||||
String originalTerm = nextToken.term();
|
String finalTerm = stemmer.getCurrent();
|
||||||
stemmer.setCurrent(originalTerm);
|
// Don't bother updating, if it is unchanged.
|
||||||
stemmer.stem();
|
if (!originalTerm.equals(finalTerm))
|
||||||
String finalTerm = stemmer.getCurrent();
|
termAtt.setTermBuffer(finalTerm);
|
||||||
// Don't bother updating, if it is unchanged.
|
return true;
|
||||||
if (!originalTerm.equals(finalTerm))
|
} else {
|
||||||
nextToken.setTermBuffer(finalTerm);
|
return false;
|
||||||
return nextToken;
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
|
||||||
|
* not be overridden. Delegates to the backwards compatibility layer. */
|
||||||
|
public final Token next(final Token reusableToken) throws java.io.IOException {
|
||||||
|
return super.next(reusableToken);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
|
||||||
|
* not be overridden. Delegates to the backwards compatibility layer. */
|
||||||
|
public final Token next() throws java.io.IOException {
|
||||||
|
return super.next();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -22,9 +22,14 @@ import java.io.StringReader;
|
||||||
import junit.framework.TestCase;
|
import junit.framework.TestCase;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Analyzer;
|
import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.analysis.Token;
|
|
||||||
import org.apache.lucene.index.Payload;
|
import org.apache.lucene.index.Payload;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||||
|
|
||||||
public class TestSnowball extends TestCase {
|
public class TestSnowball extends TestCase {
|
||||||
|
|
||||||
|
@ -32,12 +37,12 @@ public class TestSnowball extends TestCase {
|
||||||
String input,
|
String input,
|
||||||
String[] output) throws Exception {
|
String[] output) throws Exception {
|
||||||
TokenStream ts = a.tokenStream("dummy", new StringReader(input));
|
TokenStream ts = a.tokenStream("dummy", new StringReader(input));
|
||||||
final Token reusableToken = new Token();
|
TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
|
||||||
for (int i = 0; i < output.length; i++) {
|
for (int i = 0; i < output.length; i++) {
|
||||||
Token nextToken = ts.next(reusableToken);
|
assertTrue(ts.incrementToken());
|
||||||
assertEquals(output[i], nextToken.term());
|
assertEquals(output[i], termAtt.term());
|
||||||
}
|
}
|
||||||
assertNull(ts.next(reusableToken));
|
assertFalse(ts.incrementToken());
|
||||||
ts.close();
|
ts.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -49,33 +54,51 @@ public class TestSnowball extends TestCase {
|
||||||
|
|
||||||
|
|
||||||
public void testFilterTokens() throws Exception {
|
public void testFilterTokens() throws Exception {
|
||||||
final Token tok = new Token(2, 7, "wrd");
|
SnowballFilter filter = new SnowballFilter(new TestTokenStream(), "English");
|
||||||
tok.setTermBuffer("accents");
|
TermAttribute termAtt = (TermAttribute) filter.getAttribute(TermAttribute.class);
|
||||||
tok.setPositionIncrement(3);
|
OffsetAttribute offsetAtt = (OffsetAttribute) filter.getAttribute(OffsetAttribute.class);
|
||||||
Payload tokPayload = new Payload(new byte[]{0,1,2,3});
|
TypeAttribute typeAtt = (TypeAttribute) filter.getAttribute(TypeAttribute.class);
|
||||||
tok.setPayload(tokPayload);
|
PayloadAttribute payloadAtt = (PayloadAttribute) filter.getAttribute(PayloadAttribute.class);
|
||||||
int tokFlags = 77;
|
PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) filter.getAttribute(PositionIncrementAttribute.class);
|
||||||
tok.setFlags(tokFlags);
|
FlagsAttribute flagsAtt = (FlagsAttribute) filter.getAttribute(FlagsAttribute.class);
|
||||||
|
|
||||||
|
filter.incrementToken();
|
||||||
|
|
||||||
SnowballFilter filter = new SnowballFilter(
|
assertEquals("accent", termAtt.term());
|
||||||
new TokenStream() {
|
assertEquals(2, offsetAtt.startOffset());
|
||||||
public Token next(final Token reusableToken) {
|
assertEquals(7, offsetAtt.endOffset());
|
||||||
assert reusableToken != null;
|
assertEquals("wrd", typeAtt.type());
|
||||||
return tok;
|
assertEquals(3, posIncAtt.getPositionIncrement());
|
||||||
}
|
assertEquals(77, flagsAtt.getFlags());
|
||||||
},
|
assertEquals(new Payload(new byte[]{0,1,2,3}), payloadAtt.getPayload());
|
||||||
"English"
|
}
|
||||||
);
|
|
||||||
|
private final class TestTokenStream extends TokenStream {
|
||||||
final Token reusableToken = new Token();
|
private TermAttribute termAtt;
|
||||||
Token nextToken = filter.next(reusableToken);
|
private OffsetAttribute offsetAtt;
|
||||||
|
private TypeAttribute typeAtt;
|
||||||
assertEquals("accent", nextToken.term());
|
private PayloadAttribute payloadAtt;
|
||||||
assertEquals(2, nextToken.startOffset());
|
private PositionIncrementAttribute posIncAtt;
|
||||||
assertEquals(7, nextToken.endOffset());
|
private FlagsAttribute flagsAtt;
|
||||||
assertEquals("wrd", nextToken.type());
|
|
||||||
assertEquals(3, nextToken.getPositionIncrement());
|
TestTokenStream() {
|
||||||
assertEquals(tokFlags, nextToken.getFlags());
|
super();
|
||||||
assertEquals(tokPayload, nextToken.getPayload());
|
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
|
||||||
|
offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
|
||||||
|
typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
|
||||||
|
payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
|
||||||
|
posIncAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
|
||||||
|
flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class);
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean incrementToken() {
|
||||||
|
termAtt.setTermBuffer("accents");
|
||||||
|
offsetAtt.setOffset(2, 7);
|
||||||
|
typeAtt.setType("wrd");
|
||||||
|
posIncAtt.setPositionIncrement(3);
|
||||||
|
payloadAtt.setPayload(new Payload(new byte[]{0,1,2,3}));
|
||||||
|
flagsAtt.setFlags(77);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -20,6 +20,12 @@ package org.apache.lucene.wikipedia.analysis;
|
||||||
import org.apache.lucene.analysis.CharReader;
|
import org.apache.lucene.analysis.CharReader;
|
||||||
import org.apache.lucene.analysis.Token;
|
import org.apache.lucene.analysis.Token;
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||||
|
import org.apache.lucene.util.AttributeSource;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.Reader;
|
import java.io.Reader;
|
||||||
|
@ -114,6 +120,12 @@ public class WikipediaTokenizer extends Tokenizer {
|
||||||
private int tokenOutput = TOKENS_ONLY;
|
private int tokenOutput = TOKENS_ONLY;
|
||||||
private Set untokenizedTypes = Collections.EMPTY_SET;
|
private Set untokenizedTypes = Collections.EMPTY_SET;
|
||||||
private Iterator tokens = null;
|
private Iterator tokens = null;
|
||||||
|
|
||||||
|
private OffsetAttribute offsetAtt;
|
||||||
|
private TypeAttribute typeAtt;
|
||||||
|
private PositionIncrementAttribute posIncrAtt;
|
||||||
|
private TermAttribute termAtt;
|
||||||
|
private FlagsAttribute flagsAtt;
|
||||||
|
|
||||||
void setInput(Reader reader) {
|
void setInput(Reader reader) {
|
||||||
this.input = CharReader.get(reader);
|
this.input = CharReader.get(reader);
|
||||||
|
@ -142,41 +154,59 @@ public class WikipediaTokenizer extends Tokenizer {
|
||||||
this.tokenOutput = tokenOutput;
|
this.tokenOutput = tokenOutput;
|
||||||
this.scanner = new WikipediaTokenizerImpl(input);
|
this.scanner = new WikipediaTokenizerImpl(input);
|
||||||
this.untokenizedTypes = untokenizedTypes;
|
this.untokenizedTypes = untokenizedTypes;
|
||||||
|
this.offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
|
||||||
|
this.typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
|
||||||
|
this.posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
|
||||||
|
this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
|
||||||
|
this.flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
|
||||||
|
* not be overridden. Delegates to the backwards compatibility layer. */
|
||||||
|
public final Token next(final Token reusableToken) throws java.io.IOException {
|
||||||
|
return super.next(reusableToken);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** @deprecated Will be removed in Lucene 3.0. This method is final, as it should
|
||||||
|
* not be overridden. Delegates to the backwards compatibility layer. */
|
||||||
|
public final Token next() throws java.io.IOException {
|
||||||
|
return super.next();
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* (non-Javadoc)
|
* (non-Javadoc)
|
||||||
*
|
*
|
||||||
* @see org.apache.lucene.analysis.TokenStream#next()
|
* @see org.apache.lucene.analysis.TokenStream#next()
|
||||||
*/
|
*/
|
||||||
public Token next(final Token reusableToken) throws IOException {
|
public final boolean incrementToken() throws IOException {
|
||||||
assert reusableToken != null;
|
|
||||||
if (tokens != null && tokens.hasNext()){
|
if (tokens != null && tokens.hasNext()){
|
||||||
return (Token)tokens.next();
|
AttributeSource.State state = (AttributeSource.State) tokens.next();
|
||||||
|
restoreState(state);
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
int tokenType = scanner.getNextToken();
|
int tokenType = scanner.getNextToken();
|
||||||
|
|
||||||
if (tokenType == WikipediaTokenizerImpl.YYEOF) {
|
if (tokenType == WikipediaTokenizerImpl.YYEOF) {
|
||||||
return null;
|
return false;
|
||||||
}
|
}
|
||||||
String type = WikipediaTokenizerImpl.TOKEN_TYPES[tokenType];
|
String type = WikipediaTokenizerImpl.TOKEN_TYPES[tokenType];
|
||||||
if (tokenOutput == TOKENS_ONLY || untokenizedTypes.contains(type) == false){
|
if (tokenOutput == TOKENS_ONLY || untokenizedTypes.contains(type) == false){
|
||||||
setupToken(reusableToken);
|
setupToken();
|
||||||
} else if (tokenOutput == UNTOKENIZED_ONLY && untokenizedTypes.contains(type) == true){
|
} else if (tokenOutput == UNTOKENIZED_ONLY && untokenizedTypes.contains(type) == true){
|
||||||
collapseTokens(reusableToken, tokenType);
|
collapseTokens(tokenType);
|
||||||
|
|
||||||
}
|
}
|
||||||
else if (tokenOutput == BOTH){
|
else if (tokenOutput == BOTH){
|
||||||
//collapse into a single token, add it to tokens AND output the individual tokens
|
//collapse into a single token, add it to tokens AND output the individual tokens
|
||||||
//output the untokenized Token first
|
//output the untokenized Token first
|
||||||
collapseAndSaveTokens(reusableToken, tokenType, type);
|
collapseAndSaveTokens(tokenType, type);
|
||||||
}
|
}
|
||||||
reusableToken.setPositionIncrement(scanner.getPositionIncrement());
|
posIncrAtt.setPositionIncrement(scanner.getPositionIncrement());
|
||||||
reusableToken.setType(type);
|
typeAtt.setType(type);
|
||||||
return reusableToken;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
private void collapseAndSaveTokens(final Token reusableToken, int tokenType, String type) throws IOException {
|
private void collapseAndSaveTokens(int tokenType, String type) throws IOException {
|
||||||
//collapse
|
//collapse
|
||||||
StringBuffer buffer = new StringBuffer(32);
|
StringBuffer buffer = new StringBuffer(32);
|
||||||
int numAdded = scanner.setText(buffer);
|
int numAdded = scanner.setText(buffer);
|
||||||
|
@ -186,9 +216,8 @@ public class WikipediaTokenizer extends Tokenizer {
|
||||||
int tmpTokType;
|
int tmpTokType;
|
||||||
int numSeen = 0;
|
int numSeen = 0;
|
||||||
List tmp = new ArrayList();
|
List tmp = new ArrayList();
|
||||||
Token saved = new Token();
|
setupSavedToken(0, type);
|
||||||
setupSavedToken(saved, 0, type);
|
tmp.add(captureState());
|
||||||
tmp.add(saved);
|
|
||||||
//while we can get a token and that token is the same type and we have not transitioned to a new wiki-item of the same type
|
//while we can get a token and that token is the same type and we have not transitioned to a new wiki-item of the same type
|
||||||
while ((tmpTokType = scanner.getNextToken()) != WikipediaTokenizerImpl.YYEOF && tmpTokType == tokenType && scanner.getNumWikiTokensSeen() > numSeen){
|
while ((tmpTokType = scanner.getNextToken()) != WikipediaTokenizerImpl.YYEOF && tmpTokType == tokenType && scanner.getNumWikiTokensSeen() > numSeen){
|
||||||
int currPos = scanner.yychar();
|
int currPos = scanner.yychar();
|
||||||
|
@ -197,18 +226,16 @@ public class WikipediaTokenizer extends Tokenizer {
|
||||||
buffer.append(' ');
|
buffer.append(' ');
|
||||||
}
|
}
|
||||||
numAdded = scanner.setText(buffer);
|
numAdded = scanner.setText(buffer);
|
||||||
saved = new Token();
|
setupSavedToken(scanner.getPositionIncrement(), type);
|
||||||
setupSavedToken(saved, scanner.getPositionIncrement(), type);
|
tmp.add(captureState());
|
||||||
tmp.add(saved);
|
|
||||||
numSeen++;
|
numSeen++;
|
||||||
lastPos = currPos + numAdded;
|
lastPos = currPos + numAdded;
|
||||||
}
|
}
|
||||||
//trim the buffer
|
//trim the buffer
|
||||||
String s = buffer.toString().trim();
|
String s = buffer.toString().trim();
|
||||||
reusableToken.setTermBuffer(s.toCharArray(), 0, s.length());
|
termAtt.setTermBuffer(s.toCharArray(), 0, s.length());
|
||||||
reusableToken.setStartOffset(input.correctOffset(theStart));
|
offsetAtt.setOffset(input.correctOffset(theStart), input.correctOffset(theStart + s.length()));
|
||||||
reusableToken.setEndOffset(input.correctOffset(theStart + s.length()));
|
flagsAtt.setFlags(UNTOKENIZED_TOKEN_FLAG);
|
||||||
reusableToken.setFlags(UNTOKENIZED_TOKEN_FLAG);
|
|
||||||
//The way the loop is written, we will have proceeded to the next token. We need to pushback the scanner to lastPos
|
//The way the loop is written, we will have proceeded to the next token. We need to pushback the scanner to lastPos
|
||||||
if (tmpTokType != WikipediaTokenizerImpl.YYEOF){
|
if (tmpTokType != WikipediaTokenizerImpl.YYEOF){
|
||||||
scanner.yypushback(scanner.yylength());
|
scanner.yypushback(scanner.yylength());
|
||||||
|
@ -216,13 +243,13 @@ public class WikipediaTokenizer extends Tokenizer {
|
||||||
tokens = tmp.iterator();
|
tokens = tmp.iterator();
|
||||||
}
|
}
|
||||||
|
|
||||||
private void setupSavedToken(Token saved, int positionInc, String type){
|
private void setupSavedToken(int positionInc, String type){
|
||||||
setupToken(saved);
|
setupToken();
|
||||||
saved.setPositionIncrement(positionInc);
|
posIncrAtt.setPositionIncrement(positionInc);
|
||||||
saved.setType(type);
|
typeAtt.setType(type);
|
||||||
}
|
}
|
||||||
|
|
||||||
private void collapseTokens(final Token reusableToken, int tokenType) throws IOException {
|
private void collapseTokens(int tokenType) throws IOException {
|
||||||
//collapse
|
//collapse
|
||||||
StringBuffer buffer = new StringBuffer(32);
|
StringBuffer buffer = new StringBuffer(32);
|
||||||
int numAdded = scanner.setText(buffer);
|
int numAdded = scanner.setText(buffer);
|
||||||
|
@ -244,10 +271,9 @@ public class WikipediaTokenizer extends Tokenizer {
|
||||||
}
|
}
|
||||||
//trim the buffer
|
//trim the buffer
|
||||||
String s = buffer.toString().trim();
|
String s = buffer.toString().trim();
|
||||||
reusableToken.setTermBuffer(s.toCharArray(), 0, s.length());
|
termAtt.setTermBuffer(s.toCharArray(), 0, s.length());
|
||||||
reusableToken.setStartOffset(input.correctOffset(theStart));
|
offsetAtt.setOffset(input.correctOffset(theStart), input.correctOffset(theStart + s.length()));
|
||||||
reusableToken.setEndOffset(input.correctOffset(theStart + s.length()));
|
flagsAtt.setFlags(UNTOKENIZED_TOKEN_FLAG);
|
||||||
reusableToken.setFlags(UNTOKENIZED_TOKEN_FLAG);
|
|
||||||
//The way the loop is written, we will have proceeded to the next token. We need to pushback the scanner to lastPos
|
//The way the loop is written, we will have proceeded to the next token. We need to pushback the scanner to lastPos
|
||||||
if (tmpTokType != WikipediaTokenizerImpl.YYEOF){
|
if (tmpTokType != WikipediaTokenizerImpl.YYEOF){
|
||||||
scanner.yypushback(scanner.yylength());
|
scanner.yypushback(scanner.yylength());
|
||||||
|
@ -256,11 +282,10 @@ public class WikipediaTokenizer extends Tokenizer {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private void setupToken(final Token reusableToken) {
|
private void setupToken() {
|
||||||
scanner.getText(reusableToken);
|
scanner.getText(termAtt);
|
||||||
final int start = scanner.yychar();
|
final int start = scanner.yychar();
|
||||||
reusableToken.setStartOffset(input.correctOffset(start));
|
offsetAtt.setOffset(input.correctOffset(start), input.correctOffset(start + termAtt.termLength()));
|
||||||
reusableToken.setEndOffset(input.correctOffset(start + reusableToken.termLength()));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
|
|
@ -19,7 +19,7 @@ package org.apache.lucene.wikipedia.analysis;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Token;
|
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -476,7 +476,7 @@ public final int getPositionIncrement(){
|
||||||
/**
|
/**
|
||||||
* Fills Lucene token with the current token text.
|
* Fills Lucene token with the current token text.
|
||||||
*/
|
*/
|
||||||
final void getText(Token t) {
|
final void getText(TermAttribute t) {
|
||||||
t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
|
t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -19,7 +19,6 @@
|
||||||
package org.apache.lucene.wikipedia.analysis;
|
package org.apache.lucene.wikipedia.analysis;
|
||||||
|
|
||||||
import junit.framework.TestCase;
|
import junit.framework.TestCase;
|
||||||
import org.apache.lucene.analysis.Token;
|
|
||||||
|
|
||||||
import java.io.StringReader;
|
import java.io.StringReader;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
@ -28,6 +27,12 @@ import java.util.Map;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
*
|
||||||
|
@ -131,23 +136,24 @@ public class WikipediaTokenizerTest extends TestCase {
|
||||||
int numBoldItalics = 0;
|
int numBoldItalics = 0;
|
||||||
int numCategory = 0;
|
int numCategory = 0;
|
||||||
int numCitation = 0;
|
int numCitation = 0;
|
||||||
final Token reusableToken = new Token();
|
TermAttribute termAtt = (TermAttribute) tf.addAttribute(TermAttribute.class);
|
||||||
for (Token nextToken = tf.next(reusableToken); nextToken != null; nextToken = tf.next(reusableToken)) {
|
TypeAttribute typeAtt = (TypeAttribute) tf.addAttribute(TypeAttribute.class);
|
||||||
String tokText = nextToken.term();
|
|
||||||
|
while (tf.incrementToken()) {
|
||||||
|
String tokText = termAtt.term();
|
||||||
//System.out.println("Text: " + tokText + " Type: " + token.type());
|
//System.out.println("Text: " + tokText + " Type: " + token.type());
|
||||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
|
||||||
String expectedType = (String) tcm.get(tokText);
|
String expectedType = (String) tcm.get(tokText);
|
||||||
assertTrue("expectedType is null and it shouldn't be for: " + nextToken, expectedType != null);
|
assertTrue("expectedType is null and it shouldn't be for: " + tf.toString(), expectedType != null);
|
||||||
assertTrue(nextToken.type() + " is not equal to " + expectedType + " for " + nextToken, nextToken.type().equals(expectedType) == true);
|
assertTrue(typeAtt.type() + " is not equal to " + expectedType + " for " + tf.toString(), typeAtt.type().equals(expectedType) == true);
|
||||||
count++;
|
count++;
|
||||||
if (nextToken.type().equals(WikipediaTokenizer.ITALICS) == true){
|
if (typeAtt.type().equals(WikipediaTokenizer.ITALICS) == true){
|
||||||
numItalics++;
|
numItalics++;
|
||||||
} else if (nextToken.type().equals(WikipediaTokenizer.BOLD_ITALICS) == true){
|
} else if (typeAtt.type().equals(WikipediaTokenizer.BOLD_ITALICS) == true){
|
||||||
numBoldItalics++;
|
numBoldItalics++;
|
||||||
} else if (nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true){
|
} else if (typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true){
|
||||||
numCategory++;
|
numCategory++;
|
||||||
}
|
}
|
||||||
else if (nextToken.type().equals(WikipediaTokenizer.CITATION) == true){
|
else if (typeAtt.type().equals(WikipediaTokenizer.CITATION) == true){
|
||||||
numCitation++;
|
numCitation++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -166,106 +172,93 @@ public class WikipediaTokenizerTest extends TestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
private void checkLinkPhrases(WikipediaTokenizer tf) throws IOException {
|
private void checkLinkPhrases(WikipediaTokenizer tf) throws IOException {
|
||||||
final Token reusableToken = new Token();
|
TermAttribute termAtt = (TermAttribute) tf.addAttribute(TermAttribute.class);
|
||||||
Token nextToken = tf.next(reusableToken);
|
PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) tf.addAttribute(PositionIncrementAttribute.class);
|
||||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
|
||||||
assertTrue(nextToken.term() + " is not equal to " + "click", nextToken.term().equals("click") == true);
|
assertTrue(tf.incrementToken());
|
||||||
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
|
assertTrue(termAtt.term() + " is not equal to " + "click", termAtt.term().equals("click") == true);
|
||||||
nextToken = tf.next(reusableToken);
|
assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
|
||||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
assertTrue(tf.incrementToken());
|
||||||
assertTrue(nextToken.term() + " is not equal to " + "link", nextToken.term().equals("link") == true);
|
assertTrue(termAtt.term() + " is not equal to " + "link", termAtt.term().equals("link") == true);
|
||||||
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
|
assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
|
||||||
nextToken = tf.next(reusableToken);
|
assertTrue(tf.incrementToken());
|
||||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
assertTrue(termAtt.term() + " is not equal to " + "here",
|
||||||
assertTrue(nextToken.term() + " is not equal to " + "here",
|
termAtt.term().equals("here") == true);
|
||||||
nextToken.term().equals("here") == true);
|
|
||||||
//The link, and here should be at the same position for phrases to work
|
//The link, and here should be at the same position for phrases to work
|
||||||
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
|
assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
|
||||||
nextToken = tf.next(reusableToken);
|
assertTrue(tf.incrementToken());
|
||||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
assertTrue(termAtt.term() + " is not equal to " + "again",
|
||||||
assertTrue(nextToken.term() + " is not equal to " + "again",
|
termAtt.term().equals("again") == true);
|
||||||
nextToken.term().equals("again") == true);
|
assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
|
||||||
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
|
|
||||||
|
|
||||||
nextToken = tf.next(reusableToken);
|
assertTrue(tf.incrementToken());
|
||||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
assertTrue(termAtt.term() + " is not equal to " + "click",
|
||||||
assertTrue(nextToken.term() + " is not equal to " + "click",
|
termAtt.term().equals("click") == true);
|
||||||
nextToken.term().equals("click") == true);
|
assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
|
||||||
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
|
|
||||||
|
|
||||||
nextToken = tf.next(reusableToken);
|
assertTrue(tf.incrementToken());
|
||||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
assertTrue(termAtt.term() + " is not equal to " + "http://lucene.apache.org",
|
||||||
assertTrue(nextToken.term() + " is not equal to " + "http://lucene.apache.org",
|
termAtt.term().equals("http://lucene.apache.org") == true);
|
||||||
nextToken.term().equals("http://lucene.apache.org") == true);
|
assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
|
||||||
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
|
|
||||||
|
|
||||||
nextToken = tf.next(reusableToken);
|
assertTrue(tf.incrementToken());
|
||||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
assertTrue(termAtt.term() + " is not equal to " + "here",
|
||||||
assertTrue(nextToken.term() + " is not equal to " + "here",
|
termAtt.term().equals("here") == true);
|
||||||
nextToken.term().equals("here") == true);
|
assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 0, posIncrAtt.getPositionIncrement() == 0);
|
||||||
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 0, nextToken.getPositionIncrement() == 0);
|
|
||||||
|
|
||||||
nextToken = tf.next(reusableToken);
|
assertTrue(tf.incrementToken());
|
||||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
assertTrue(termAtt.term() + " is not equal to " + "again",
|
||||||
assertTrue(nextToken.term() + " is not equal to " + "again",
|
termAtt.term().equals("again") == true);
|
||||||
nextToken.term().equals("again") == true);
|
assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
|
||||||
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
|
|
||||||
|
|
||||||
nextToken = tf.next(reusableToken);
|
assertTrue(tf.incrementToken());
|
||||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
assertTrue(termAtt.term() + " is not equal to " + "a",
|
||||||
assertTrue(nextToken.term() + " is not equal to " + "a",
|
termAtt.term().equals("a") == true);
|
||||||
nextToken.term().equals("a") == true);
|
assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
|
||||||
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
|
|
||||||
|
|
||||||
nextToken = tf.next(reusableToken);
|
assertTrue(tf.incrementToken());
|
||||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
assertTrue(termAtt.term() + " is not equal to " + "b",
|
||||||
assertTrue(nextToken.term() + " is not equal to " + "b",
|
termAtt.term().equals("b") == true);
|
||||||
nextToken.term().equals("b") == true);
|
assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
|
||||||
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
|
|
||||||
|
|
||||||
nextToken = tf.next(reusableToken);
|
assertTrue(tf.incrementToken());
|
||||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
assertTrue(termAtt.term() + " is not equal to " + "c",
|
||||||
assertTrue(nextToken.term() + " is not equal to " + "c",
|
termAtt.term().equals("c") == true);
|
||||||
nextToken.term().equals("c") == true);
|
assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
|
||||||
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
|
|
||||||
|
|
||||||
nextToken = tf.next(reusableToken);
|
assertTrue(tf.incrementToken());
|
||||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
assertTrue(termAtt.term() + " is not equal to " + "d",
|
||||||
assertTrue(nextToken.term() + " is not equal to " + "d",
|
termAtt.term().equals("d") == true);
|
||||||
nextToken.term().equals("d") == true);
|
assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
|
||||||
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
|
|
||||||
|
|
||||||
nextToken = tf.next(reusableToken);
|
assertFalse(tf.incrementToken());
|
||||||
assertTrue("nextToken is not null and it should be", nextToken == null);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testLinks() throws Exception {
|
public void testLinks() throws Exception {
|
||||||
String test = "[http://lucene.apache.org/java/docs/index.html#news here] [http://lucene.apache.org/java/docs/index.html?b=c here] [https://lucene.apache.org/java/docs/index.html?b=c here]";
|
String test = "[http://lucene.apache.org/java/docs/index.html#news here] [http://lucene.apache.org/java/docs/index.html?b=c here] [https://lucene.apache.org/java/docs/index.html?b=c here]";
|
||||||
WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(test));
|
WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(test));
|
||||||
final Token reusableToken = new Token();
|
TermAttribute termAtt = (TermAttribute) tf.addAttribute(TermAttribute.class);
|
||||||
Token nextToken = tf.next(reusableToken);
|
TypeAttribute typeAtt = (TypeAttribute) tf.addAttribute(TypeAttribute.class);
|
||||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
|
||||||
assertTrue(nextToken.term() + " is not equal to " + "http://lucene.apache.org/java/docs/index.html#news",
|
assertTrue(tf.incrementToken());
|
||||||
nextToken.term().equals("http://lucene.apache.org/java/docs/index.html#news") == true);
|
assertTrue(termAtt.term() + " is not equal to " + "http://lucene.apache.org/java/docs/index.html#news",
|
||||||
assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.EXTERNAL_LINK_URL, nextToken.type().equals(WikipediaTokenizer.EXTERNAL_LINK_URL) == true);
|
termAtt.term().equals("http://lucene.apache.org/java/docs/index.html#news") == true);
|
||||||
tf.next(reusableToken);//skip here
|
assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.EXTERNAL_LINK_URL, typeAtt.type().equals(WikipediaTokenizer.EXTERNAL_LINK_URL) == true);
|
||||||
nextToken = tf.next(reusableToken);
|
tf.incrementToken();//skip here
|
||||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
|
||||||
assertTrue(nextToken.term() + " is not equal to " + "http://lucene.apache.org/java/docs/index.html?b=c",
|
assertTrue(tf.incrementToken());
|
||||||
nextToken.term().equals("http://lucene.apache.org/java/docs/index.html?b=c") == true);
|
assertTrue(termAtt.term() + " is not equal to " + "http://lucene.apache.org/java/docs/index.html?b=c",
|
||||||
assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.EXTERNAL_LINK_URL, nextToken.type().equals(WikipediaTokenizer.EXTERNAL_LINK_URL) == true);
|
termAtt.term().equals("http://lucene.apache.org/java/docs/index.html?b=c") == true);
|
||||||
tf.next(reusableToken);//skip here
|
assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.EXTERNAL_LINK_URL, typeAtt.type().equals(WikipediaTokenizer.EXTERNAL_LINK_URL) == true);
|
||||||
nextToken = tf.next(reusableToken);
|
tf.incrementToken();//skip here
|
||||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
|
||||||
assertTrue(nextToken.term() + " is not equal to " + "https://lucene.apache.org/java/docs/index.html?b=c",
|
assertTrue(tf.incrementToken());
|
||||||
nextToken.term().equals("https://lucene.apache.org/java/docs/index.html?b=c") == true);
|
assertTrue(termAtt.term() + " is not equal to " + "https://lucene.apache.org/java/docs/index.html?b=c",
|
||||||
assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.EXTERNAL_LINK_URL, nextToken.type().equals(WikipediaTokenizer.EXTERNAL_LINK_URL) == true);
|
termAtt.term().equals("https://lucene.apache.org/java/docs/index.html?b=c") == true);
|
||||||
nextToken = tf.next(reusableToken);
|
assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.EXTERNAL_LINK_URL, typeAtt.type().equals(WikipediaTokenizer.EXTERNAL_LINK_URL) == true);
|
||||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
|
||||||
|
assertTrue(tf.incrementToken());
|
||||||
nextToken = tf.next(reusableToken);
|
assertFalse(tf.incrementToken());
|
||||||
assertTrue("nextToken is not null and it should be", nextToken == null);
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testLucene1133() throws Exception {
|
public void testLucene1133() throws Exception {
|
||||||
|
@ -277,72 +270,73 @@ public class WikipediaTokenizerTest extends TestCase {
|
||||||
checkLinkPhrases(tf);
|
checkLinkPhrases(tf);
|
||||||
String test = "[[Category:a b c d]] [[Category:e f g]] [[link here]] [[link there]] ''italics here'' something ''more italics'' [[Category:h i j]]";
|
String test = "[[Category:a b c d]] [[Category:e f g]] [[link here]] [[link there]] ''italics here'' something ''more italics'' [[Category:h i j]]";
|
||||||
tf = new WikipediaTokenizer(new StringReader(test), WikipediaTokenizer.UNTOKENIZED_ONLY, untoks);
|
tf = new WikipediaTokenizer(new StringReader(test), WikipediaTokenizer.UNTOKENIZED_ONLY, untoks);
|
||||||
final Token reusableToken = new Token();
|
TermAttribute termAtt = (TermAttribute) tf.addAttribute(TermAttribute.class);
|
||||||
Token nextToken = tf.next(reusableToken);
|
PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) tf.addAttribute(PositionIncrementAttribute.class);
|
||||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
OffsetAttribute offsetAtt = (OffsetAttribute) tf.addAttribute(OffsetAttribute.class);
|
||||||
assertTrue(nextToken.term() + " is not equal to " + "a b c d",
|
|
||||||
nextToken.term().equals("a b c d") == true);
|
assertTrue(tf.incrementToken());
|
||||||
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
|
assertTrue(termAtt.term() + " is not equal to " + "a b c d",
|
||||||
assertTrue(nextToken.startOffset() + " does not equal: " + 11, nextToken.startOffset() == 11);
|
termAtt.term().equals("a b c d") == true);
|
||||||
assertTrue(nextToken.endOffset() + " does not equal: " + 18, nextToken.endOffset() == 18);
|
assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
|
||||||
nextToken = tf.next(reusableToken);
|
assertTrue(offsetAtt.startOffset() + " does not equal: " + 11, offsetAtt.startOffset() == 11);
|
||||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
assertTrue(offsetAtt.endOffset() + " does not equal: " + 18, offsetAtt.endOffset() == 18);
|
||||||
assertTrue(nextToken.term() + " is not equal to " + "e f g",
|
|
||||||
nextToken.term().equals("e f g") == true);
|
assertTrue(tf.incrementToken());
|
||||||
assertTrue(nextToken.startOffset() + " does not equal: " + 32, nextToken.startOffset() == 32);
|
assertTrue(termAtt.term() + " is not equal to " + "e f g",
|
||||||
assertTrue(nextToken.endOffset() + " does not equal: " + 37, nextToken.endOffset() == 37);
|
termAtt.term().equals("e f g") == true);
|
||||||
nextToken = tf.next(reusableToken);
|
assertTrue(offsetAtt.startOffset() + " does not equal: " + 32, offsetAtt.startOffset() == 32);
|
||||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
assertTrue(offsetAtt.endOffset() + " does not equal: " + 37, offsetAtt.endOffset() == 37);
|
||||||
assertTrue(nextToken.term() + " is not equal to " + "link",
|
|
||||||
nextToken.term().equals("link") == true);
|
assertTrue(tf.incrementToken());
|
||||||
assertTrue(nextToken.startOffset() + " does not equal: " + 42, nextToken.startOffset() == 42);
|
assertTrue(termAtt.term() + " is not equal to " + "link",
|
||||||
assertTrue(nextToken.endOffset() + " does not equal: " + 46, nextToken.endOffset() == 46);
|
termAtt.term().equals("link") == true);
|
||||||
nextToken = tf.next(reusableToken);
|
assertTrue(offsetAtt.startOffset() + " does not equal: " + 42, offsetAtt.startOffset() == 42);
|
||||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
assertTrue(offsetAtt.endOffset() + " does not equal: " + 46, offsetAtt.endOffset() == 46);
|
||||||
assertTrue(nextToken.term() + " is not equal to " + "here",
|
|
||||||
nextToken.term().equals("here") == true);
|
assertTrue(tf.incrementToken());
|
||||||
assertTrue(nextToken.startOffset() + " does not equal: " + 47, nextToken.startOffset() == 47);
|
assertTrue(termAtt.term() + " is not equal to " + "here",
|
||||||
assertTrue(nextToken.endOffset() + " does not equal: " + 51, nextToken.endOffset() == 51);
|
termAtt.term().equals("here") == true);
|
||||||
nextToken = tf.next(reusableToken);
|
assertTrue(offsetAtt.startOffset() + " does not equal: " + 47, offsetAtt.startOffset() == 47);
|
||||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
assertTrue(offsetAtt.endOffset() + " does not equal: " + 51, offsetAtt.endOffset() == 51);
|
||||||
assertTrue(nextToken.term() + " is not equal to " + "link",
|
|
||||||
nextToken.term().equals("link") == true);
|
|
||||||
assertTrue(nextToken.startOffset() + " does not equal: " + 56, nextToken.startOffset() == 56);
|
|
||||||
assertTrue(nextToken.endOffset() + " does not equal: " + 60, nextToken.endOffset() == 60);
|
|
||||||
nextToken = tf.next(reusableToken);
|
|
||||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
|
||||||
assertTrue(nextToken.term() + " is not equal to " + "there",
|
|
||||||
nextToken.term().equals("there") == true);
|
|
||||||
assertTrue(nextToken.startOffset() + " does not equal: " + 61, nextToken.startOffset() == 61);
|
|
||||||
assertTrue(nextToken.endOffset() + " does not equal: " + 66, nextToken.endOffset() == 66);
|
|
||||||
nextToken = tf.next(reusableToken);
|
|
||||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
|
||||||
assertTrue(nextToken.term() + " is not equal to " + "italics here",
|
|
||||||
nextToken.term().equals("italics here") == true);
|
|
||||||
assertTrue(nextToken.startOffset() + " does not equal: " + 71, nextToken.startOffset() == 71);
|
|
||||||
assertTrue(nextToken.endOffset() + " does not equal: " + 83, nextToken.endOffset() == 83);
|
|
||||||
nextToken = tf.next(reusableToken);
|
|
||||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
|
||||||
assertTrue(nextToken.term() + " is not equal to " + "something",
|
|
||||||
nextToken.term().equals("something") == true);
|
|
||||||
assertTrue(nextToken.startOffset() + " does not equal: " + 86, nextToken.startOffset() == 86);
|
|
||||||
assertTrue(nextToken.endOffset() + " does not equal: " + 95, nextToken.endOffset() == 95);
|
|
||||||
nextToken = tf.next(reusableToken);
|
|
||||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
|
||||||
assertTrue(nextToken.term() + " is not equal to " + "more italics",
|
|
||||||
nextToken.term().equals("more italics") == true);
|
|
||||||
assertTrue(nextToken.startOffset() + " does not equal: " + 98, nextToken.startOffset() == 98);
|
|
||||||
assertTrue(nextToken.endOffset() + " does not equal: " + 110, nextToken.endOffset() == 110);
|
|
||||||
|
|
||||||
nextToken = tf.next(reusableToken);
|
assertTrue(tf.incrementToken());
|
||||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
assertTrue(termAtt.term() + " is not equal to " + "link",
|
||||||
assertTrue(nextToken.term() + " is not equal to " + "h i j",
|
termAtt.term().equals("link") == true);
|
||||||
nextToken.term().equals("h i j") == true);
|
assertTrue(offsetAtt.startOffset() + " does not equal: " + 56, offsetAtt.startOffset() == 56);
|
||||||
assertTrue(nextToken.startOffset() + " does not equal: " + 124, nextToken.startOffset() == 124);
|
assertTrue(offsetAtt.endOffset() + " does not equal: " + 60, offsetAtt.endOffset() == 60);
|
||||||
assertTrue(nextToken.endOffset() + " does not equal: " + 133, nextToken.endOffset() == 133);
|
|
||||||
|
assertTrue(tf.incrementToken());
|
||||||
|
assertTrue(termAtt.term() + " is not equal to " + "there",
|
||||||
|
termAtt.term().equals("there") == true);
|
||||||
|
|
||||||
nextToken = tf.next(reusableToken);
|
assertTrue(offsetAtt.startOffset() + " does not equal: " + 61, offsetAtt.startOffset() == 61);
|
||||||
assertTrue("nextToken is not null and it should be", nextToken == null);
|
assertTrue(offsetAtt.endOffset() + " does not equal: " + 66, offsetAtt.endOffset() == 66);
|
||||||
|
|
||||||
|
assertTrue(tf.incrementToken());
|
||||||
|
assertTrue(termAtt.term() + " is not equal to " + "italics here",
|
||||||
|
termAtt.term().equals("italics here") == true);
|
||||||
|
assertTrue(offsetAtt.startOffset() + " does not equal: " + 71, offsetAtt.startOffset() == 71);
|
||||||
|
assertTrue(offsetAtt.endOffset() + " does not equal: " + 83, offsetAtt.endOffset() == 83);
|
||||||
|
|
||||||
|
assertTrue(tf.incrementToken());
|
||||||
|
assertTrue(termAtt.term() + " is not equal to " + "something",
|
||||||
|
termAtt.term().equals("something") == true);
|
||||||
|
assertTrue(offsetAtt.startOffset() + " does not equal: " + 86, offsetAtt.startOffset() == 86);
|
||||||
|
assertTrue(offsetAtt.endOffset() + " does not equal: " + 95, offsetAtt.endOffset() == 95);
|
||||||
|
|
||||||
|
assertTrue(tf.incrementToken());
|
||||||
|
assertTrue(termAtt.term() + " is not equal to " + "more italics",
|
||||||
|
termAtt.term().equals("more italics") == true);
|
||||||
|
assertTrue(offsetAtt.startOffset() + " does not equal: " + 98, offsetAtt.startOffset() == 98);
|
||||||
|
assertTrue(offsetAtt.endOffset() + " does not equal: " + 110, offsetAtt.endOffset() == 110);
|
||||||
|
|
||||||
|
assertTrue(tf.incrementToken());
|
||||||
|
assertTrue(termAtt.term() + " is not equal to " + "h i j",
|
||||||
|
termAtt.term().equals("h i j") == true);
|
||||||
|
assertTrue(offsetAtt.startOffset() + " does not equal: " + 124, offsetAtt.startOffset() == 124);
|
||||||
|
assertTrue(offsetAtt.endOffset() + " does not equal: " + 133, offsetAtt.endOffset() == 133);
|
||||||
|
|
||||||
|
assertFalse(tf.incrementToken());
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testBoth() throws Exception {
|
public void testBoth() throws Exception {
|
||||||
|
@ -352,225 +346,211 @@ public class WikipediaTokenizerTest extends TestCase {
|
||||||
String test = "[[Category:a b c d]] [[Category:e f g]] [[link here]] [[link there]] ''italics here'' something ''more italics'' [[Category:h i j]]";
|
String test = "[[Category:a b c d]] [[Category:e f g]] [[link here]] [[link there]] ''italics here'' something ''more italics'' [[Category:h i j]]";
|
||||||
//should output all the indivual tokens plus the untokenized tokens as well. Untokenized tokens
|
//should output all the indivual tokens plus the untokenized tokens as well. Untokenized tokens
|
||||||
WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(test), WikipediaTokenizer.BOTH, untoks);
|
WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(test), WikipediaTokenizer.BOTH, untoks);
|
||||||
final Token reusableToken = new Token();
|
TermAttribute termAtt = (TermAttribute) tf.addAttribute(TermAttribute.class);
|
||||||
Token nextToken = tf.next(reusableToken);
|
TypeAttribute typeAtt = (TypeAttribute) tf.addAttribute(TypeAttribute.class);
|
||||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) tf.addAttribute(PositionIncrementAttribute.class);
|
||||||
assertTrue(nextToken.term() + " is not equal to " + "a b c d",
|
+OffsetAttribute offsetAtt = (OffsetAttribute) tf.addAttribute(OffsetAttribute.class);
+FlagsAttribute flagsAtt = (FlagsAttribute) tf.addAttribute(FlagsAttribute.class);
-    nextToken.term().equals("a b c d") == true);
-assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
-assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true);
-assertTrue(nextToken.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, nextToken.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
-assertTrue(nextToken.startOffset() + " does not equal: " + 11, nextToken.startOffset() == 11);
-assertTrue(nextToken.endOffset() + " does not equal: " + 18, nextToken.endOffset() == 18);
+assertTrue(tf.incrementToken());
+assertTrue(termAtt.term() + " is not equal to " + "a b c d",
+    termAtt.term().equals("a b c d") == true);
+assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
+assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true);
+assertTrue(flagsAtt.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, flagsAtt.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
+assertTrue(offsetAtt.startOffset() + " does not equal: " + 11, offsetAtt.startOffset() == 11);
+assertTrue(offsetAtt.endOffset() + " does not equal: " + 18, offsetAtt.endOffset() == 18);

-nextToken = tf.next(reusableToken);
-assertTrue("nextToken is null and it shouldn't be", nextToken != null);
-assertTrue(nextToken.term() + " is not equal to " + "a",
-    nextToken.term().equals("a") == true);
-assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 0, nextToken.getPositionIncrement() == 0);
-assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true);
-assertTrue(nextToken.getFlags() + " equals: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG + " and it shouldn't", nextToken.getFlags() != WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
-assertTrue(nextToken.startOffset() + " does not equal: " + 11, nextToken.startOffset() == 11);
-assertTrue(nextToken.endOffset() + " does not equal: " + 12, nextToken.endOffset() == 12);
+assertTrue(tf.incrementToken());
+assertTrue(termAtt.term() + " is not equal to " + "a",
+    termAtt.term().equals("a") == true);
+assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 0, posIncrAtt.getPositionIncrement() == 0);
+assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true);
+assertTrue(flagsAtt.getFlags() + " equals: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG + " and it shouldn't", flagsAtt.getFlags() != WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
+assertTrue(offsetAtt.startOffset() + " does not equal: " + 11, offsetAtt.startOffset() == 11);
+assertTrue(offsetAtt.endOffset() + " does not equal: " + 12, offsetAtt.endOffset() == 12);

-nextToken = tf.next(reusableToken);
-assertTrue("nextToken is null and it shouldn't be", nextToken != null);
-assertTrue(nextToken.term() + " is not equal to " + "b",
-    nextToken.term().equals("b") == true);
-assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
-assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true);
-assertTrue(nextToken.startOffset() + " does not equal: " + 13, nextToken.startOffset() == 13);
-assertTrue(nextToken.endOffset() + " does not equal: " + 14, nextToken.endOffset() == 14);
+assertTrue(tf.incrementToken());
+assertTrue(termAtt.term() + " is not equal to " + "b",
+    termAtt.term().equals("b") == true);
+assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
+assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true);
+assertTrue(offsetAtt.startOffset() + " does not equal: " + 13, offsetAtt.startOffset() == 13);
+assertTrue(offsetAtt.endOffset() + " does not equal: " + 14, offsetAtt.endOffset() == 14);

-nextToken = tf.next(reusableToken);
-assertTrue("nextToken is null and it shouldn't be", nextToken != null);
-assertTrue(nextToken.term() + " is not equal to " + "c",
-    nextToken.term().equals("c") == true);
-assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
-assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true);
-assertTrue(nextToken.startOffset() + " does not equal: " + 15, nextToken.startOffset() == 15);
-assertTrue(nextToken.endOffset() + " does not equal: " + 16, nextToken.endOffset() == 16);
+assertTrue(tf.incrementToken());
+assertTrue(termAtt.term() + " is not equal to " + "c",
+    termAtt.term().equals("c") == true);
+assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
+assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true);
+assertTrue(offsetAtt.startOffset() + " does not equal: " + 15, offsetAtt.startOffset() == 15);
+assertTrue(offsetAtt.endOffset() + " does not equal: " + 16, offsetAtt.endOffset() == 16);

-nextToken = tf.next(reusableToken);
-assertTrue("nextToken is null and it shouldn't be", nextToken != null);
-assertTrue(nextToken.term() + " is not equal to " + "d",
-    nextToken.term().equals("d") == true);
-assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
-assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true);
-assertTrue(nextToken.startOffset() + " does not equal: " + 17, nextToken.startOffset() == 17);
-assertTrue(nextToken.endOffset() + " does not equal: " + 18, nextToken.endOffset() == 18);
+assertTrue(tf.incrementToken());
+assertTrue(termAtt.term() + " is not equal to " + "d",
+    termAtt.term().equals("d") == true);
+assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
+assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true);
+assertTrue(offsetAtt.startOffset() + " does not equal: " + 17, offsetAtt.startOffset() == 17);
+assertTrue(offsetAtt.endOffset() + " does not equal: " + 18, offsetAtt.endOffset() == 18);

-nextToken = tf.next(reusableToken);
-assertTrue("nextToken is null and it shouldn't be", nextToken != null);
-assertTrue(nextToken.term() + " is not equal to " + "e f g",
-    nextToken.term().equals("e f g") == true);
-assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true);
-assertTrue(nextToken.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, nextToken.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
-assertTrue(nextToken.startOffset() + " does not equal: " + 32, nextToken.startOffset() == 32);
-assertTrue(nextToken.endOffset() + " does not equal: " + 37, nextToken.endOffset() == 37);
+assertTrue(tf.incrementToken());
+assertTrue(termAtt.term() + " is not equal to " + "e f g",
+    termAtt.term().equals("e f g") == true);
+assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true);
+assertTrue(flagsAtt.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, flagsAtt.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
+assertTrue(offsetAtt.startOffset() + " does not equal: " + 32, offsetAtt.startOffset() == 32);
+assertTrue(offsetAtt.endOffset() + " does not equal: " + 37, offsetAtt.endOffset() == 37);

-nextToken = tf.next(reusableToken);
-assertTrue("nextToken is null and it shouldn't be", nextToken != null);
-assertTrue(nextToken.term() + " is not equal to " + "e",
-    nextToken.term().equals("e") == true);
-assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true);
-assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 0, nextToken.getPositionIncrement() == 0);
-assertTrue(nextToken.startOffset() + " does not equal: " + 32, nextToken.startOffset() == 32);
-assertTrue(nextToken.endOffset() + " does not equal: " + 33, nextToken.endOffset() == 33);
+assertTrue(tf.incrementToken());
+assertTrue(termAtt.term() + " is not equal to " + "e",
+    termAtt.term().equals("e") == true);
+assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true);
+assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 0, posIncrAtt.getPositionIncrement() == 0);
+assertTrue(offsetAtt.startOffset() + " does not equal: " + 32, offsetAtt.startOffset() == 32);
+assertTrue(offsetAtt.endOffset() + " does not equal: " + 33, offsetAtt.endOffset() == 33);

-nextToken = tf.next(reusableToken);
-assertTrue("nextToken is null and it shouldn't be", nextToken != null);
-assertTrue(nextToken.term() + " is not equal to " + "f",
-    nextToken.term().equals("f") == true);
-assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true);
-assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
-assertTrue(nextToken.startOffset() + " does not equal: " + 34, nextToken.startOffset() == 34);
-assertTrue(nextToken.endOffset() + " does not equal: " + 35, nextToken.endOffset() == 35);
+assertTrue(tf.incrementToken());
+assertTrue(termAtt.term() + " is not equal to " + "f",
+    termAtt.term().equals("f") == true);
+assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true);
+assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
+assertTrue(offsetAtt.startOffset() + " does not equal: " + 34, offsetAtt.startOffset() == 34);
+assertTrue(offsetAtt.endOffset() + " does not equal: " + 35, offsetAtt.endOffset() == 35);

-nextToken = tf.next(reusableToken);
-assertTrue("nextToken is null and it shouldn't be", nextToken != null);
-assertTrue(nextToken.term() + " is not equal to " + "g",
-    nextToken.term().equals("g") == true);
-assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true);
-assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
-assertTrue(nextToken.startOffset() + " does not equal: " + 36, nextToken.startOffset() == 36);
-assertTrue(nextToken.endOffset() + " does not equal: " + 37, nextToken.endOffset() == 37);
+assertTrue(tf.incrementToken());
+assertTrue(termAtt.term() + " is not equal to " + "g",
+    termAtt.term().equals("g") == true);
+assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true);
+assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
+assertTrue(offsetAtt.startOffset() + " does not equal: " + 36, offsetAtt.startOffset() == 36);
+assertTrue(offsetAtt.endOffset() + " does not equal: " + 37, offsetAtt.endOffset() == 37);

-nextToken = tf.next(reusableToken);
-assertTrue("nextToken is null and it shouldn't be", nextToken != null);
-assertTrue(nextToken.term() + " is not equal to " + "link",
-    nextToken.term().equals("link") == true);
-assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
-assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, nextToken.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true);
-assertTrue(nextToken.startOffset() + " does not equal: " + 42, nextToken.startOffset() == 42);
-assertTrue(nextToken.endOffset() + " does not equal: " + 46, nextToken.endOffset() == 46);
+assertTrue(tf.incrementToken());
+assertTrue(termAtt.term() + " is not equal to " + "link",
+    termAtt.term().equals("link") == true);
+assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
+assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, typeAtt.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true);
+assertTrue(offsetAtt.startOffset() + " does not equal: " + 42, offsetAtt.startOffset() == 42);
+assertTrue(offsetAtt.endOffset() + " does not equal: " + 46, offsetAtt.endOffset() == 46);

-nextToken = tf.next(reusableToken);
-assertTrue("nextToken is null and it shouldn't be", nextToken != null);
-assertTrue(nextToken.term() + " is not equal to " + "here",
-    nextToken.term().equals("here") == true);
-assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
-assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, nextToken.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true);
-assertTrue(nextToken.startOffset() + " does not equal: " + 47, nextToken.startOffset() == 47);
-assertTrue(nextToken.endOffset() + " does not equal: " + 51, nextToken.endOffset() == 51);
+assertTrue(tf.incrementToken());
+assertTrue(termAtt.term() + " is not equal to " + "here",
+    termAtt.term().equals("here") == true);
+assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
+assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, typeAtt.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true);
+assertTrue(offsetAtt.startOffset() + " does not equal: " + 47, offsetAtt.startOffset() == 47);
+assertTrue(offsetAtt.endOffset() + " does not equal: " + 51, offsetAtt.endOffset() == 51);

-nextToken = tf.next(reusableToken);
-assertTrue("nextToken is null and it shouldn't be", nextToken != null);
-assertTrue(nextToken.term() + " is not equal to " + "link",
-    nextToken.term().equals("link") == true);
-assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
-assertTrue(nextToken.startOffset() + " does not equal: " + 56, nextToken.startOffset() == 56);
-assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, nextToken.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true);
-assertTrue(nextToken.endOffset() + " does not equal: " + 60, nextToken.endOffset() == 60);
+assertTrue(tf.incrementToken());
+assertTrue(termAtt.term() + " is not equal to " + "link",
+    termAtt.term().equals("link") == true);
+assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
+assertTrue(offsetAtt.startOffset() + " does not equal: " + 56, offsetAtt.startOffset() == 56);
+assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, typeAtt.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true);
+assertTrue(offsetAtt.endOffset() + " does not equal: " + 60, offsetAtt.endOffset() == 60);

-nextToken = tf.next(reusableToken);
-assertTrue("nextToken is null and it shouldn't be", nextToken != null);
-assertTrue(nextToken.term() + " is not equal to " + "there",
-    nextToken.term().equals("there") == true);
-assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
-assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, nextToken.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true);
-assertTrue(nextToken.startOffset() + " does not equal: " + 61, nextToken.startOffset() == 61);
-assertTrue(nextToken.endOffset() + " does not equal: " + 66, nextToken.endOffset() == 66);
+assertTrue(tf.incrementToken());
+assertTrue(termAtt.term() + " is not equal to " + "there",
+    termAtt.term().equals("there") == true);
+assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
+assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, typeAtt.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true);
+assertTrue(offsetAtt.startOffset() + " does not equal: " + 61, offsetAtt.startOffset() == 61);
+assertTrue(offsetAtt.endOffset() + " does not equal: " + 66, offsetAtt.endOffset() == 66);

-nextToken = tf.next(reusableToken);
-assertTrue("nextToken is null and it shouldn't be", nextToken != null);
-assertTrue(nextToken.term() + " is not equal to " + "italics here",
-    nextToken.term().equals("italics here") == true);
-assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
-assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.ITALICS, nextToken.type().equals(WikipediaTokenizer.ITALICS) == true);
-assertTrue(nextToken.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, nextToken.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
-assertTrue(nextToken.startOffset() + " does not equal: " + 71, nextToken.startOffset() == 71);
-assertTrue(nextToken.endOffset() + " does not equal: " + 83, nextToken.endOffset() == 83);
+assertTrue(tf.incrementToken());
+assertTrue(termAtt.term() + " is not equal to " + "italics here",
+    termAtt.term().equals("italics here") == true);
+assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
+assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.ITALICS, typeAtt.type().equals(WikipediaTokenizer.ITALICS) == true);
+assertTrue(flagsAtt.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, flagsAtt.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
+assertTrue(offsetAtt.startOffset() + " does not equal: " + 71, offsetAtt.startOffset() == 71);
+assertTrue(offsetAtt.endOffset() + " does not equal: " + 83, offsetAtt.endOffset() == 83);

-nextToken = tf.next(reusableToken);
-assertTrue("nextToken is null and it shouldn't be", nextToken != null);
-assertTrue(nextToken.term() + " is not equal to " + "italics",
-    nextToken.term().equals("italics") == true);
-assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 0, nextToken.getPositionIncrement() == 0);
-assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.ITALICS, nextToken.type().equals(WikipediaTokenizer.ITALICS) == true);
-assertTrue(nextToken.startOffset() + " does not equal: " + 71, nextToken.startOffset() == 71);
-assertTrue(nextToken.endOffset() + " does not equal: " + 78, nextToken.endOffset() == 78);
+assertTrue(tf.incrementToken());
+assertTrue(termAtt.term() + " is not equal to " + "italics",
+    termAtt.term().equals("italics") == true);
+assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 0, posIncrAtt.getPositionIncrement() == 0);
+assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.ITALICS, typeAtt.type().equals(WikipediaTokenizer.ITALICS) == true);
+assertTrue(offsetAtt.startOffset() + " does not equal: " + 71, offsetAtt.startOffset() == 71);
+assertTrue(offsetAtt.endOffset() + " does not equal: " + 78, offsetAtt.endOffset() == 78);

-nextToken = tf.next(reusableToken);
-assertTrue("nextToken is null and it shouldn't be", nextToken != null);
-assertTrue(nextToken.term() + " is not equal to " + "here",
-    nextToken.term().equals("here") == true);
-assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
-assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.ITALICS, nextToken.type().equals(WikipediaTokenizer.ITALICS) == true);
-assertTrue(nextToken.startOffset() + " does not equal: " + 79, nextToken.startOffset() == 79);
-assertTrue(nextToken.endOffset() + " does not equal: " + 83, nextToken.endOffset() == 83);
+assertTrue(tf.incrementToken());
+assertTrue(termAtt.term() + " is not equal to " + "here",
+    termAtt.term().equals("here") == true);
+assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
+assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.ITALICS, typeAtt.type().equals(WikipediaTokenizer.ITALICS) == true);
+assertTrue(offsetAtt.startOffset() + " does not equal: " + 79, offsetAtt.startOffset() == 79);
+assertTrue(offsetAtt.endOffset() + " does not equal: " + 83, offsetAtt.endOffset() == 83);

-nextToken = tf.next(reusableToken);
-assertTrue("nextToken is null and it shouldn't be", nextToken != null);
-assertTrue(nextToken.term() + " is not equal to " + "something",
-    nextToken.term().equals("something") == true);
-assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
-assertTrue(nextToken.startOffset() + " does not equal: " + 86, nextToken.startOffset() == 86);
-assertTrue(nextToken.endOffset() + " does not equal: " + 95, nextToken.endOffset() == 95);
+assertTrue(tf.incrementToken());
+assertTrue(termAtt.term() + " is not equal to " + "something",
+    termAtt.term().equals("something") == true);
+assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
+assertTrue(offsetAtt.startOffset() + " does not equal: " + 86, offsetAtt.startOffset() == 86);
+assertTrue(offsetAtt.endOffset() + " does not equal: " + 95, offsetAtt.endOffset() == 95);

-nextToken = tf.next(reusableToken);
-assertTrue("nextToken is null and it shouldn't be", nextToken != null);
-assertTrue(nextToken.term() + " is not equal to " + "more italics",
-    nextToken.term().equals("more italics") == true);
-assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
-assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.ITALICS, nextToken.type().equals(WikipediaTokenizer.ITALICS) == true);
-assertTrue(nextToken.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, nextToken.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
-assertTrue(nextToken.startOffset() + " does not equal: " + 98, nextToken.startOffset() == 98);
-assertTrue(nextToken.endOffset() + " does not equal: " + 110, nextToken.endOffset() == 110);
+assertTrue(tf.incrementToken());
+assertTrue(termAtt.term() + " is not equal to " + "more italics",
+    termAtt.term().equals("more italics") == true);
+assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
+assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.ITALICS, typeAtt.type().equals(WikipediaTokenizer.ITALICS) == true);
+assertTrue(flagsAtt.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, flagsAtt.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
+assertTrue(offsetAtt.startOffset() + " does not equal: " + 98, offsetAtt.startOffset() == 98);
+assertTrue(offsetAtt.endOffset() + " does not equal: " + 110, offsetAtt.endOffset() == 110);

-nextToken = tf.next(reusableToken);
-assertTrue("nextToken is null and it shouldn't be", nextToken != null);
-assertTrue(nextToken.term() + " is not equal to " + "more",
-    nextToken.term().equals("more") == true);
-assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 0, nextToken.getPositionIncrement() == 0);
-assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.ITALICS, nextToken.type().equals(WikipediaTokenizer.ITALICS) == true);
-assertTrue(nextToken.startOffset() + " does not equal: " + 98, nextToken.startOffset() == 98);
-assertTrue(nextToken.endOffset() + " does not equal: " + 102, nextToken.endOffset() == 102);
+assertTrue(tf.incrementToken());
+assertTrue(termAtt.term() + " is not equal to " + "more",
+    termAtt.term().equals("more") == true);
+assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 0, posIncrAtt.getPositionIncrement() == 0);
+assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.ITALICS, typeAtt.type().equals(WikipediaTokenizer.ITALICS) == true);
+assertTrue(offsetAtt.startOffset() + " does not equal: " + 98, offsetAtt.startOffset() == 98);
+assertTrue(offsetAtt.endOffset() + " does not equal: " + 102, offsetAtt.endOffset() == 102);

-nextToken = tf.next(reusableToken);
-assertTrue("nextToken is null and it shouldn't be", nextToken != null);
-assertTrue(nextToken.term() + " is not equal to " + "italics",
-    nextToken.term().equals("italics") == true);
-assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
-assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.ITALICS, nextToken.type().equals(WikipediaTokenizer.ITALICS) == true);
-assertTrue(nextToken.startOffset() + " does not equal: " + 103, nextToken.startOffset() == 103);
-assertTrue(nextToken.endOffset() + " does not equal: " + 110, nextToken.endOffset() == 110);
+assertTrue(tf.incrementToken());
+assertTrue(termAtt.term() + " is not equal to " + "italics",
+    termAtt.term().equals("italics") == true);
+assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
+assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.ITALICS, typeAtt.type().equals(WikipediaTokenizer.ITALICS) == true);
+assertTrue(offsetAtt.startOffset() + " does not equal: " + 103, offsetAtt.startOffset() == 103);
+assertTrue(offsetAtt.endOffset() + " does not equal: " + 110, offsetAtt.endOffset() == 110);

-nextToken = tf.next(reusableToken);
-assertTrue("nextToken is null and it shouldn't be", nextToken != null);
-assertTrue(nextToken.term() + " is not equal to " + "h i j",
-    nextToken.term().equals("h i j") == true);
-assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
-assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true);
-assertTrue(nextToken.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, nextToken.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
-assertTrue(nextToken.startOffset() + " does not equal: " + 124, nextToken.startOffset() == 124);
-assertTrue(nextToken.endOffset() + " does not equal: " + 133, nextToken.endOffset() == 133);
+assertTrue(tf.incrementToken());
+assertTrue(termAtt.term() + " is not equal to " + "h i j",
+    termAtt.term().equals("h i j") == true);
+assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
+assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true);
+assertTrue(flagsAtt.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, flagsAtt.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
+assertTrue(offsetAtt.startOffset() + " does not equal: " + 124, offsetAtt.startOffset() == 124);
+assertTrue(offsetAtt.endOffset() + " does not equal: " + 133, offsetAtt.endOffset() == 133);

-nextToken = tf.next(reusableToken);
-assertTrue("nextToken is null and it shouldn't be", nextToken != null);
-assertTrue(nextToken.term() + " is not equal to " + "h",
-    nextToken.term().equals("h") == true);
-assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 0, nextToken.getPositionIncrement() == 0);
-assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true);
-assertTrue(nextToken.startOffset() + " does not equal: " + 124, nextToken.startOffset() == 124);
-assertTrue(nextToken.endOffset() + " does not equal: " + 125, nextToken.endOffset() == 125);
+assertTrue(tf.incrementToken());
+assertTrue(termAtt.term() + " is not equal to " + "h",
+    termAtt.term().equals("h") == true);
+assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 0, posIncrAtt.getPositionIncrement() == 0);
+assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true);
+assertTrue(offsetAtt.startOffset() + " does not equal: " + 124, offsetAtt.startOffset() == 124);
+assertTrue(offsetAtt.endOffset() + " does not equal: " + 125, offsetAtt.endOffset() == 125);

-nextToken = tf.next(reusableToken);
-assertTrue("nextToken is null and it shouldn't be", nextToken != null);
-assertTrue(nextToken.term() + " is not equal to " + "i",
-    nextToken.term().equals("i") == true);
-assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
-assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true);
-assertTrue(nextToken.startOffset() + " does not equal: " + 128, nextToken.startOffset() == 128);
-assertTrue(nextToken.endOffset() + " does not equal: " + 129, nextToken.endOffset() == 129);
+assertTrue(tf.incrementToken());
+assertTrue(termAtt.term() + " is not equal to " + "i",
+    termAtt.term().equals("i") == true);
+assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
+assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true);
+assertTrue(offsetAtt.startOffset() + " does not equal: " + 128, offsetAtt.startOffset() == 128);
+assertTrue(offsetAtt.endOffset() + " does not equal: " + 129, offsetAtt.endOffset() == 129);

-nextToken = tf.next(reusableToken);
-assertTrue("nextToken is null and it shouldn't be", nextToken != null);
-assertTrue(nextToken.term() + " is not equal to " + "j",
-    nextToken.term().equals("j") == true);
-assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
-assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true);
-assertTrue(nextToken.startOffset() + " does not equal: " + 132, nextToken.startOffset() == 132);
-assertTrue(nextToken.endOffset() + " does not equal: " + 133, nextToken.endOffset() == 133);
+assertTrue(tf.incrementToken());
+assertTrue(termAtt.term() + " is not equal to " + "j",
+    termAtt.term().equals("j") == true);
+assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
+assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true);
+assertTrue(offsetAtt.startOffset() + " does not equal: " + 132, offsetAtt.startOffset() == 132);
+assertTrue(offsetAtt.endOffset() + " does not equal: " + 133, offsetAtt.endOffset() == 133);

-nextToken = tf.next(reusableToken);
-assertTrue("nextToken is not null and it should be", nextToken == null);
+assertFalse(tf.incrementToken());
 }
 }
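The rewritten test consumes the tokenizer through the attribute-based API: each attribute is requested once with addAttribute(), and every subsequent incrementToken() call refills those same attribute instances in place. A minimal, self-contained sketch of that consumption pattern follows; it is not part of the commit, and the WhitespaceAnalyzer, the field name "f", and the sample text are illustrative assumptions only.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

public class DumpTokens {
  public static void main(String[] args) throws IOException {
    // Hypothetical input; any 2.9-era analyzer is consumed the same way.
    TokenStream ts = new WhitespaceAnalyzer().tokenStream("f", new StringReader("a b c d"));

    // Attributes are requested once, up front; incrementToken() refills them in place.
    TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
    OffsetAttribute offsetAtt = (OffsetAttribute) ts.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posIncrAtt =
        (PositionIncrementAttribute) ts.addAttribute(PositionIncrementAttribute.class);
    TypeAttribute typeAtt = (TypeAttribute) ts.addAttribute(TypeAttribute.class);

    while (ts.incrementToken()) {
      System.out.println(termAtt.term() + " [" + offsetAtt.startOffset() + ","
          + offsetAtt.endOffset() + "] posIncr=" + posIncrAtt.getPositionIncrement()
          + " type=" + typeAtt.type());
    }
  }
}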
@@ -27,9 +27,9 @@ import java.util.List;
 import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.BooleanClause;
@@ -114,10 +114,10 @@ public final class SynExpand {
 
 	// [1] Parse query into separate words so that when we expand we can avoid dups
 	TokenStream ts = a.tokenStream( field, new StringReader( query));
-	final Token reusableToken = new Token();
-	for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) {
-		String word = nextToken.term();
+	TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
+	while (ts.incrementToken()) {
+		String word = termAtt.term();
 		if ( already.add( word))
 			top.add( word);
 	}
@@ -27,8 +27,8 @@ import java.util.List;
 import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.BooleanClause;
@@ -101,9 +101,10 @@ public class SynLookup {
 
 	// [1] Parse query into separate words so that when we expand we can avoid dups
 	TokenStream ts = a.tokenStream( field, new StringReader( query));
-	final Token reusableToken = new Token();
-	for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) {
-		String word = nextToken.term();
+	TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
+	while (ts.incrementToken()) {
+		String word = termAtt.term();
 		if ( already.add( word))
 			top.add( word);
 	}
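SynExpand and SynLookup receive the same mechanical translation used throughout this commit: drop the reusable Token and the next(Token) loop, request a TermAttribute once, and loop on incrementToken(). A small sketch of the resulting word-collection loop, assuming a 2.9-era classpath; the analyzer, field name, and query string below are placeholders, not values from the commit.

import java.io.IOException;
import java.io.StringReader;
import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class CollectWords {
  // Collects the distinct terms the analyzer produces for a query string,
  // mirroring the rewritten SynExpand/SynLookup loops.
  static Set collect(Analyzer a, String field, String query) throws IOException {
    Set words = new HashSet();
    TokenStream ts = a.tokenStream(field, new StringReader(query));
    TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
      words.add(termAtt.term());
    }
    return words;
  }

  public static void main(String[] args) throws IOException {
    System.out.println(collect(new WhitespaceAnalyzer(), "contents", "big dog big"));
  }
}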
@@ -9,8 +9,8 @@ import java.util.HashSet;
 import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.search.similar.MoreLikeThisQuery;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.xmlparser.DOMUtils;
@@ -72,14 +72,14 @@ public class LikeThisQueryBuilder implements QueryBuilder {
 		if((stopWords!=null)&&(fields!=null))
 		{
 			stopWordsSet=new HashSet();
-			final Token reusableToken = new Token();
 			for (int i = 0; i < fields.length; i++)
 			{
 				TokenStream ts = analyzer.tokenStream(fields[i],new StringReader(stopWords));
+				TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
 				try
 				{
-					for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) {
-						stopWordsSet.add(nextToken.term());
+					while(ts.incrementToken()) {
+						stopWordsSet.add(termAtt.term());
 					}
 				}
 				catch(IOException ioe)
@@ -5,8 +5,8 @@ import java.io.StringReader;
 import java.util.ArrayList;
 
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.spans.SpanOrQuery;
 import org.apache.lucene.search.spans.SpanQuery;
@@ -52,9 +52,10 @@ public class SpanOrTermsBuilder extends SpanBuilderBase
 	{
 		ArrayList clausesList=new ArrayList();
 		TokenStream ts=analyzer.tokenStream(fieldName,new StringReader(value));
-		final Token reusableToken = new Token();
-		for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) {
-			SpanTermQuery stq=new SpanTermQuery(new Term(fieldName,nextToken.term()));
+		TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
+		while (ts.incrementToken()) {
+			SpanTermQuery stq=new SpanTermQuery(new Term(fieldName, termAtt.term()));
 			clausesList.add(stq);
 		}
 		SpanOrQuery soq=new SpanOrQuery((SpanQuery[]) clausesList.toArray(new SpanQuery[clausesList.size()]));
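The same loop shape also drives query construction in SpanOrTermsBuilder: one SpanTermQuery per token, then a single SpanOrQuery over the collected clauses. A hedged, standalone sketch of that idea against the new API; the field name and input text are made up for illustration.

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;

public class BuildSpanOr {
  public static void main(String[] args) throws IOException {
    TokenStream ts = new WhitespaceAnalyzer().tokenStream("title", new StringReader("quick brown fox"));
    TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);

    // One SpanTermQuery per analyzed token.
    ArrayList clauses = new ArrayList();
    while (ts.incrementToken()) {
      clauses.add(new SpanTermQuery(new Term("title", termAtt.term())));
    }
    // OR all of the collected span clauses together.
    SpanOrQuery soq = new SpanOrQuery((SpanQuery[]) clauses.toArray(new SpanQuery[clauses.size()]));
    System.out.println(soq);
  }
}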
@@ -4,8 +4,8 @@ import java.io.IOException;
 import java.io.StringReader;
 
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.Filter;
 import org.apache.lucene.search.TermsFilter;
@@ -54,19 +54,19 @@ public class TermsFilterBuilder implements FilterBuilder
 		String text = DOMUtils.getNonBlankTextOrFail(e);
 		String fieldName = DOMUtils.getAttributeWithInheritanceOrFail(e, "fieldName");
 		TokenStream ts = analyzer.tokenStream(fieldName, new StringReader(text));
+		TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
 
 		try
 		{
-			final Token reusableToken = new Token();
 			Term term = null;
-			for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) {
+			while (ts.incrementToken()) {
 				if (term == null)
 				{
-					term = new Term(fieldName, nextToken.term());
+					term = new Term(fieldName, termAtt.term());
 				} else
 				{
 					// create from previous to save fieldName.intern overhead
-					term = term.createTerm(nextToken.term());
+					term = term.createTerm(termAtt.term());
 				}
 				tf.addTerm(term);
 			}
@@ -4,8 +4,8 @@ import java.io.IOException;
 import java.io.StringReader;
 
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.BooleanQuery;
@@ -57,16 +57,16 @@ public class TermsQueryBuilder implements QueryBuilder {
 		TokenStream ts = analyzer.tokenStream(fieldName, new StringReader(text));
 		try
 		{
-			final Token reusableToken = new Token();
+			TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
 			Term term = null;
-			for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) {
+			while (ts.incrementToken()) {
 				if (term == null)
 				{
-					term = new Term(fieldName, nextToken.term());
+					term = new Term(fieldName, termAtt.term());
 				} else
 				{
 					// create from previous to save fieldName.intern overhead
-					term = term.createTerm(nextToken.term());
+					term = term.createTerm(termAtt.term());
 				}
 				bq.add(new BooleanClause(new TermQuery(term),BooleanClause.Occur.SHOULD));
 			}
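TermsFilterBuilder and TermsQueryBuilder keep their existing Term.createTerm() reuse trick and only swap the token source: the first token creates a Term, and later tokens derive new Terms from it so the interned field name is shared. A sketch of that pattern against the new API; the analyzer, field name, and text are illustrative, not taken from the commit.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.TermQuery;

public class BuildTermsQuery {
  public static void main(String[] args) throws IOException {
    TokenStream ts = new WhitespaceAnalyzer().tokenStream("body", new StringReader("red green blue"));
    TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);

    BooleanQuery bq = new BooleanQuery();
    Term term = null;
    while (ts.incrementToken()) {
      if (term == null) {
        term = new Term("body", termAtt.term());
      } else {
        // Reuse the previous Term so the interned field name is shared.
        term = term.createTerm(termAtt.term());
      }
      bq.add(new BooleanClause(new TermQuery(term), BooleanClause.Occur.SHOULD));
    }
    System.out.println(bq);
  }
}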