LUCENE-1333: improvements to Token reuse API and full cutover to reuse API for all core and contrib analyzers

git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@687357 13f79535-47bb-0310-9956-ffa450edef68
Michael McCandless 2008-08-20 14:38:07 +00:00
parent 7675606908
commit bb6b711718
140 changed files with 3224 additions and 2337 deletions

View File

@ -108,6 +108,12 @@ API Changes
16. LUCENE-1334: Add new constructor for Term: Term(String fieldName)
which defaults term text to "". (DM Smith via Mike McCandless)
17. LUCENE-1333: Added Token.reinit(*) APIs to re-initialize (reuse) a
Token. Also added term() method to return a String, with a
performance penalty clearly documented. Also implemented
hashCode() and equals() in Token, and fixed all core and contrib
analyzers to use the re-use APIs. (DM Smith via Mike McCandless)
Bug fixes
1. LUCENE-1134: Fixed BooleanQuery.rewrite to only optimize a single
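For reference, a minimal sketch (not part of this commit) of how a consumer drives the single-argument reuse API described in the LUCENE-1333 entry above. The analyzer, field name, and sample text are arbitrary; only the next(Token)/term() calls reflect the API this change standardizes on.

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;

public class ReuseApiExample {
  public static void main(String[] args) throws IOException {
    TokenStream ts = new WhitespaceAnalyzer()
        .tokenStream("dummy", new StringReader("hello token reuse"));
    // The caller allocates one Token and hands it to next() on every call;
    // the stream may fill and return it instead of allocating a new one.
    final Token reusableToken = new Token();
    for (Token t = ts.next(reusableToken); t != null; t = ts.next(reusableToken)) {
      // term() materializes a String from the term buffer (documented as slower
      // than working with termBuffer()/termLength() directly).
      System.out.println(t.term() + " " + t.startOffset() + "-" + t.endOffset());
    }
    ts.close();
  }
}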

View File

@ -36,7 +36,6 @@ public final class BrazilianStemFilter extends TokenFilter {
/**
* The actual token in the input stream.
*/
private Token token = null;
private BrazilianStemmer stemmer = null;
private Set exclusions = null;
@ -53,22 +52,23 @@ public final class BrazilianStemFilter extends TokenFilter {
/**
* @return Returns the next token in the stream, or null at EOS.
*/
public final Token next()
public final Token next(final Token reusableToken)
throws IOException {
if ((token = input.next()) == null) {
assert reusableToken != null;
Token nextToken = input.next(reusableToken);
if (nextToken == null)
return null;
String term = nextToken.term();
// Check the exclusion table.
if (exclusions == null || !exclusions.contains(term)) {
String s = stemmer.stem(term);
// If not stemmed, don't waste the time adjusting the token.
if ((s != null) && !s.equals(term))
nextToken.setTermBuffer(s);
}
// Check the exclusiontable.
else if (exclusions != null && exclusions.contains(token.termText())) {
return token;
} else {
String s = stemmer.stem(token.termText());
// If not stemmed, dont waste the time creating a new token.
if ((s != null) && !s.equals(token.termText())) {
return new Token(s, token.startOffset(), token.endOffset(), token.type());
}
return token;
}
return nextToken;
}
}

View File

@ -26,7 +26,7 @@ import java.io.Reader;
/**
* CJKTokenizer was modified from StopTokenizer which does a decent job for
* most European languages. It performs other token methods for double-byte
* Characters: the token will return at each two charactors with overlap match.<br>
* Characters: the token will return at each two characters with overlap match.<br>
* Example: "java C1C2C3C4" will be segmented to: "java" "C1C2" "C2C3" "C3C4"; it
* also needs to filter out zero-length tokens ""<br>
* for Digit: digit, '+', '#' will be tokenized as letters<br>
@ -96,24 +96,26 @@ public final class CJKTokenizer extends Tokenizer {
* See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html
* for detail.
*
* @param reusableToken a reusable token
* @return Token
*
* @throws java.io.IOException - throw IOException when read error <br>
* hanppened in the InputStream
* happened in the InputStream
*
*/
public final Token next() throws java.io.IOException {
public final Token next(final Token reusableToken) throws java.io.IOException {
/** how many character(s) has been stored in buffer */
assert reusableToken != null;
int length = 0;
/** the position used to create Token */
int start = offset;
while (true) {
/** current charactor */
/** current character */
char c;
/** unicode block of current charactor for detail */
/** unicode block of current character for detail */
Character.UnicodeBlock ub;
offset++;
@ -198,7 +200,7 @@ public final class CJKTokenizer extends Tokenizer {
}
}
} else {
// non-ASCII letter, eg."C1C2C3C4"
// non-ASCII letter, e.g."C1C2C3C4"
if (Character.isLetter(c)) {
if (length == 0) {
start = offset - 1;
@ -236,8 +238,6 @@ public final class CJKTokenizer extends Tokenizer {
}
}
return new Token(new String(buffer, 0, length), start, start + length,
tokenType
);
return reusableToken.reinit(buffer, 0, length, start, start+length, tokenType);
}
}
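A small usage sketch (not from the patch) illustrating the overlapping-bigram behavior the class javadoc describes, driven through the reusable-token API; the sample text is arbitrary.

import java.io.StringReader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.cjk.CJKTokenizer;

public class CJKTokenizerExample {
  public static void main(String[] args) throws Exception {
    // "java" stays a single token; the four CJK characters come out as
    // overlapping bigrams (C1C2, C2C3, C3C4 in the javadoc's notation).
    CJKTokenizer tokenizer = new CJKTokenizer(new StringReader("java 中国人民"));
    final Token reusableToken = new Token();
    for (Token t = tokenizer.next(reusableToken); t != null; t = tokenizer.next(reusableToken)) {
      System.out.println(t.term() + " [" + t.startOffset() + "," + t.endOffset() + ") " + t.type());
    }
    tokenizer.close();
  }
}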

View File

@ -18,7 +18,10 @@ package org.apache.lucene.analysis.cn;
*/
import java.util.Hashtable;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
/**
* Title: ChineseFilter
@ -61,10 +64,11 @@ public final class ChineseFilter extends TokenFilter {
stopTable.put(STOP_WORDS[i], STOP_WORDS[i]);
}
public final Token next() throws java.io.IOException {
public final Token next(final Token reusableToken) throws java.io.IOException {
assert reusableToken != null;
for (Token token = input.next(); token != null; token = input.next()) {
String text = token.termText();
for (Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken)) {
String text = nextToken.term();
// why not key off token type here assuming ChineseTokenizer comes first?
if (stopTable.get(text) == null) {
@ -75,7 +79,7 @@ public final class ChineseFilter extends TokenFilter {
// English word/token should be larger than 1 character.
if (text.length()>1) {
return token;
return nextToken;
}
break;
case Character.OTHER_LETTER:
@ -83,7 +87,7 @@ public final class ChineseFilter extends TokenFilter {
// One Chinese character as one Chinese word.
// Chinese word extraction to be added later here.
return token;
return nextToken;
}
}

View File

@ -19,7 +19,9 @@ package org.apache.lucene.analysis.cn;
import java.io.Reader;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
/**
@ -75,17 +77,19 @@ public final class ChineseTokenizer extends Tokenizer {
}
private final Token flush() {
private final Token flush(final Token token) {
if (length>0) {
//System.out.println(new String(buffer, 0, length));
return new Token(new String(buffer, 0, length), start, start+length);
//System.out.println(new String(buffer, 0, length));
return token.reinit(buffer, 0, length, start, start+length);
}
else
return null;
}
public final Token next() throws java.io.IOException {
public final Token next(final Token reusableToken) throws java.io.IOException {
assert reusableToken != null;
length = 0;
start = offset;
@ -101,7 +105,7 @@ public final class ChineseTokenizer extends Tokenizer {
bufferIndex = 0;
}
if (dataLen == -1) return flush();
if (dataLen == -1) return flush(reusableToken);
else
c = ioBuffer[bufferIndex++];
@ -112,20 +116,20 @@ public final class ChineseTokenizer extends Tokenizer {
case Character.LOWERCASE_LETTER:
case Character.UPPERCASE_LETTER:
push(c);
if (length == MAX_WORD_LEN) return flush();
if (length == MAX_WORD_LEN) return flush(reusableToken);
break;
case Character.OTHER_LETTER:
if (length>0) {
bufferIndex--;
offset--;
return flush();
return flush(reusableToken);
}
push(c);
return flush();
return flush(reusableToken);
default:
if (length>0) return flush();
if (length>0) return flush(reusableToken);
break;
}
}

View File

@ -105,17 +105,18 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
return dict;
}
public Token next() throws IOException {
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
if (tokens.size() > 0) {
return (Token)tokens.removeFirst();
}
Token token = input.next();
if (token == null) {
Token nextToken = input.next(reusableToken);
if (nextToken == null) {
return null;
}
decompose(token);
decompose(nextToken);
if (tokens.size() > 0) {
return (Token)tokens.removeFirst();
@ -145,17 +146,15 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
protected final Token createToken(final int offset, final int length,
final Token prototype) {
Token t = new Token(prototype.startOffset() + offset, prototype
.startOffset()
+ offset + length, prototype.type());
t.setTermBuffer(prototype.termBuffer(), offset, length);
int newStart = prototype.startOffset() + offset;
Token t = prototype.clone(prototype.termBuffer(), offset, length, newStart, newStart+length);
t.setPositionIncrement(0);
return t;
}
protected void decompose(final Token token) {
// In any case we give the original token back
tokens.add(token);
tokens.add((Token) token.clone());
// Only words longer than minWordSize get processed
if (token.termLength() < this.minWordSize) {

View File

@ -37,7 +37,6 @@ public final class GermanStemFilter extends TokenFilter
/**
* The actual token in the input stream.
*/
private Token token = null;
private GermanStemmer stemmer = null;
private Set exclusionSet = null;
@ -48,7 +47,7 @@ public final class GermanStemFilter extends TokenFilter
}
/**
* Builds a GermanStemFilter that uses an exclusiontable.
* Builds a GermanStemFilter that uses an exclusion table.
*/
public GermanStemFilter( TokenStream in, Set exclusionSet )
{
@ -59,25 +58,24 @@ public final class GermanStemFilter extends TokenFilter
/**
* @return Returns the next token in the stream, or null at EOS
*/
public final Token next()
public final Token next(final Token reusableToken)
throws IOException
{
if ( ( token = input.next() ) == null ) {
assert reusableToken != null;
Token nextToken = input.next(reusableToken);
if (nextToken == null)
return null;
String term = nextToken.term();
// Check the exclusion table.
if (exclusionSet == null || !exclusionSet.contains(term)) {
String s = stemmer.stem(term);
// If not stemmed, don't waste the time adjusting the token.
if ((s != null) && !s.equals(term))
nextToken.setTermBuffer(s);
}
// Check the exclusiontable
else if ( exclusionSet != null && exclusionSet.contains( token.termText() ) ) {
return token;
}
else {
String s = stemmer.stem( token.termText() );
// If not stemmed, dont waste the time creating a new token
if ( !s.equals( token.termText() ) ) {
return new Token( s, token.startOffset(),
token.endOffset(), token.type() );
}
return token;
}
return nextToken;
}
/**

View File

@ -35,25 +35,20 @@ public final class GreekLowerCaseFilter extends TokenFilter
this.charset = charset;
}
public final Token next() throws java.io.IOException
public final Token next(final Token reusableToken) throws java.io.IOException
{
Token t = input.next();
assert reusableToken != null;
Token nextToken = input.next(reusableToken);
if (t == null)
if (nextToken == null)
return null;
String txt = t.termText();
char[] chArray = txt.toCharArray();
for (int i = 0; i < chArray.length; i++)
char[] chArray = nextToken.termBuffer();
int chLen = nextToken.termLength();
for (int i = 0; i < chLen; i++)
{
chArray[i] = GreekCharsets.toLowerCase(chArray[i], charset);
}
String newTxt = new String(chArray);
// create new token
Token newToken = new Token(newTxt, t.startOffset(), t.endOffset());
return newToken;
return nextToken;
}
}
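The filter above is representative of a pattern repeated throughout this change: instead of building a new String and a new Token, the filter edits the reusable token's term buffer in place. A generic sketch of that pattern follows; the class name is made up for illustration and is not part of the commit.

import java.io.IOException;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;

/** Hypothetical filter: lower-cases letters in place on the reused token. */
public final class InPlaceLowerCaseFilter extends TokenFilter {
  public InPlaceLowerCaseFilter(TokenStream in) {
    super(in);
  }

  public final Token next(final Token reusableToken) throws IOException {
    assert reusableToken != null;
    Token nextToken = input.next(reusableToken);
    if (nextToken == null)
      return null;
    // Mutate the term characters directly; no new Token or String is created.
    char[] buffer = nextToken.termBuffer();
    int length = nextToken.termLength();
    for (int i = 0; i < length; i++) {
      buffer[i] = Character.toLowerCase(buffer[i]);
    }
    return nextToken;
  }
}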

View File

@ -38,7 +38,7 @@ import org.apache.lucene.analysis.TokenFilter;
public class ElisionFilter extends TokenFilter {
private Set articles = null;
private static String apostrophes = "'";
private static char[] apostrophes = {'\'', '’'};
public void setArticles(Set articles) {
this.articles = new HashSet();
@ -74,25 +74,36 @@ public class ElisionFilter extends TokenFilter {
}
/**
* Returns the next input Token whith termText() without elisioned start
* Returns the next input Token with term() without elisioned start
*/
public Token next() throws IOException {
Token t = input.next();
if (t == null)
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
Token nextToken = input.next(reusableToken);
if (nextToken == null)
return null;
String text = t.termText();
System.out.println(text);
int minPoz = -1;
int poz;
for (int i = 0; i < apostrophes.length(); i++) {
poz = text.indexOf(apostrophes.charAt(i));
if (poz != -1)
minPoz = (minPoz == -1) ? poz : Math.min(poz, minPoz);
char[] termBuffer = nextToken.termBuffer();
int termLength = nextToken.termLength();
int minPoz = Integer.MAX_VALUE;
for (int i = 0; i < apostrophes.length; i++) {
char apos = apostrophes[i];
// The equivalent of String.indexOf(ch)
for (int poz = 0; poz < termLength ; poz++) {
if (termBuffer[poz] == apos) {
minPoz = Math.min(poz, minPoz);
break;
}
}
}
if (minPoz != -1
&& articles.contains(text.substring(0, minPoz).toLowerCase()))
text = text.substring(minPoz + 1);
return new Token(text, t.startOffset(), t.endOffset(), t.type());
// An apostrophe has been found. If the prefix is an article strip it off.
if (minPoz != Integer.MAX_VALUE
&& articles.contains(new String(nextToken.termBuffer(), 0, minPoz).toLowerCase())) {
nextToken.setTermBuffer(nextToken.termBuffer(), minPoz + 1, nextToken.termLength() - (minPoz + 1));
}
return nextToken;
}
}
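A usage sketch (not from the patch) of the rewritten elision filter. It assumes a two-argument ElisionFilter(TokenStream, Set) constructor in the contrib module; the article set and sample text are illustrative only.

import java.io.StringReader;
import java.util.Arrays;
import java.util.HashSet;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.fr.ElisionFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;

public class ElisionFilterExample {
  public static void main(String[] args) throws Exception {
    // Assumed (TokenStream, Set) constructor; "l'avion" should come out as
    // "avion" with the elided article stripped from the term buffer.
    HashSet articles = new HashSet(Arrays.asList(
        new String[] { "l", "m", "t", "qu", "n", "s", "j" }));
    ElisionFilter filter = new ElisionFilter(
        new StandardTokenizer(new StringReader("l'avion")), articles);
    final Token reusableToken = new Token();
    for (Token t = filter.next(reusableToken); t != null; t = filter.next(reusableToken)) {
      System.out.println(t.term());
    }
    filter.close();
  }
}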

View File

@ -37,12 +37,11 @@ public final class FrenchStemFilter extends TokenFilter {
/**
* The actual token in the input stream.
*/
private Token token = null;
private FrenchStemmer stemmer = null;
private Set exclusions = null;
public FrenchStemFilter( TokenStream in ) {
super(in);
super(in);
stemmer = new FrenchStemmer();
}
@ -55,23 +54,23 @@ public final class FrenchStemFilter extends TokenFilter {
/**
* @return Returns the next token in the stream, or null at EOS
*/
public final Token next()
public final Token next(final Token reusableToken)
throws IOException {
if ( ( token = input.next() ) == null ) {
assert reusableToken != null;
Token nextToken = input.next(reusableToken);
if (nextToken == null)
return null;
String term = nextToken.term();
// Check the exclusion table
if ( exclusions == null || !exclusions.contains( term ) ) {
String s = stemmer.stem( term );
// If not stemmed, don't waste the time adjusting the token.
if ((s != null) && !s.equals( term ) )
nextToken.setTermBuffer(s);
}
// Check the exclusiontable
else if ( exclusions != null && exclusions.contains( token.termText() ) ) {
return token;
}
else {
String s = stemmer.stem( token.termText() );
// If not stemmed, dont waste the time creating a new token
if ( !s.equals( token.termText() ) ) {
return new Token( s, token.startOffset(), token.endOffset(), token.type());
}
return token;
}
return nextToken;
}
/**
* Set a alternative/custom FrenchStemmer for this filter.

View File

@ -27,18 +27,8 @@ import java.io.IOException;
*/
public class EmptyTokenStream extends TokenStream {
public Token next() throws IOException {
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
return null;
}
public Token next(Token result) throws IOException {
return null;
}
public void reset() throws IOException {
}
public void close() throws IOException {
}
}

View File

@ -55,8 +55,9 @@ public class PrefixAndSuffixAwareTokenFilter extends TokenStream {
}
public Token next(Token result) throws IOException {
return suffix.next(result);
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
return suffix.next(reusableToken);
}

View File

@ -41,30 +41,34 @@ public class PrefixAwareTokenFilter extends TokenStream {
prefixExhausted = false;
}
private CopyableToken previousPrefixToken = new CopyableToken();
private Token previousPrefixToken = new Token();
private boolean prefixExhausted;
public Token next(Token result) throws IOException {
Token buf = result;
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
if (!prefixExhausted) {
result = prefix.next(result);
if (result == null) {
Token nextToken = prefix.next(reusableToken);
if (nextToken == null) {
prefixExhausted = true;
} else {
previousPrefixToken.copyFrom(result);
return result;
previousPrefixToken.reinit(nextToken);
// Make it a deep copy
Payload p = previousPrefixToken.getPayload();
if (p != null) {
previousPrefixToken.setPayload((Payload) p.clone());
}
return nextToken;
}
}
result = suffix.next(buf);
if (result == null) {
Token nextToken = suffix.next(reusableToken);
if (nextToken == null) {
return null;
}
return updateSuffixToken(result, previousPrefixToken);
return updateSuffixToken(nextToken, previousPrefixToken);
}
/**
@ -98,7 +102,6 @@ public class PrefixAwareTokenFilter extends TokenStream {
}
public TokenStream getPrefix() {
return prefix;
}
@ -114,35 +117,4 @@ public class PrefixAwareTokenFilter extends TokenStream {
public void setSuffix(TokenStream suffix) {
this.suffix = suffix;
}
public static class CopyableToken extends Token {
private Payload buf = new Payload();
public void copyFrom(Token source) {
if (source.termBuffer() != null) {
setTermBuffer(source.termBuffer(), 0, source.termLength());
} else {
setTermText(null);
setTermLength(0);
}
setPositionIncrement(source.getPositionIncrement());
setFlags(source.getFlags());
setStartOffset(source.startOffset());
setEndOffset(source.endOffset());
setType(source.type());
if (source.getPayload() == null) {
setPayload(null);
} else {
setPayload(buf);
if (buf.getData() == null || buf.getData().length < source.getPayload().length()) {
buf.setData(new byte[source.getPayload().length()]);
}
source.getPayload().copyTo(buf.getData(), 0);
buf.setData(buf.getData(), 0, source.getPayload().length());
}
}
}
}

View File

@ -28,20 +28,23 @@ import java.io.IOException;
public class SingleTokenTokenStream extends TokenStream {
private boolean exhausted = false;
// The token needs to be immutable, so work with clones!
private Token token;
public SingleTokenTokenStream(Token token) {
this.token = token;
assert token != null;
this.token = (Token) token.clone();
}
public Token next(Token result) throws IOException {
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
if (exhausted) {
return null;
}
exhausted = true;
return token;
return (Token) token.clone();
}
@ -50,10 +53,10 @@ public class SingleTokenTokenStream extends TokenStream {
}
public Token getToken() {
return token;
return (Token) token.clone();
}
public void setToken(Token token) {
this.token = token;
this.token = (Token) token.clone();
}
}
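The rewritten stream clones on the way in and on the way out so that neither the caller nor the stream can mutate the other's copy. A brief sketch (not from the patch) of why that matters; the token contents are arbitrary.

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.miscellaneous.SingleTokenTokenStream;

public class SingleTokenCloneExample {
  public static void main(String[] args) throws Exception {
    Token original = new Token(0, 5);
    original.setTermBuffer("hello");

    SingleTokenTokenStream ts = new SingleTokenTokenStream(original);

    // Mutating the caller's Token afterwards must not affect the stream,
    // because the constructor stored a clone rather than the instance itself.
    original.setTermBuffer("changed");

    Token nextToken = ts.next(new Token());
    System.out.println(nextToken.term()); // expected: hello
  }
}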

View File

@ -115,30 +115,30 @@ public class EdgeNGramTokenFilter extends TokenFilter {
}
/** Returns the next token in the stream, or null at EOS. */
public final Token next() throws IOException {
public final Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
if (ngrams.size() > 0) {
return (Token) ngrams.removeFirst();
}
Token token = input.next();
if (token == null) {
Token nextToken = input.next(reusableToken);
if (nextToken == null)
return null;
}
ngram(token);
ngram(nextToken);
if (ngrams.size() > 0)
return (Token) ngrams.removeFirst();
else
return null;
}
private void ngram(Token token) {
String inStr = token.termText();
int inLen = inStr.length();
private void ngram(final Token token) {
int termLength = token.termLength();
char[] termBuffer = token.termBuffer();
int gramSize = minGram;
while (gramSize <= maxGram) {
// if the remaining input is too short, we can't generate any n-grams
if (gramSize > inLen) {
if (gramSize > termLength) {
return;
}
@ -147,13 +147,13 @@ public class EdgeNGramTokenFilter extends TokenFilter {
return;
}
Token tok;
if (side == Side.FRONT) {
tok = new Token(inStr.substring(0, gramSize), 0, gramSize);
}
else {
tok = new Token(inStr.substring(inLen-gramSize), inLen-gramSize, inLen);
}
// grab gramSize chars from front or back
int start = side == Side.FRONT ? 0 : termLength - gramSize;
int end = start + gramSize;
Token tok = (Token) token.clone();
tok.setStartOffset(start);
tok.setEndOffset(end);
tok.setTermBuffer(termBuffer, start, gramSize);
ngrams.add(tok);
gramSize++;
}

View File

@ -19,6 +19,7 @@ package org.apache.lucene.analysis.ngram;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter.Side;
import java.io.IOException;
import java.io.Reader;
@ -113,13 +114,14 @@ public class EdgeNGramTokenizer extends Tokenizer {
}
/** Returns the next token in the stream, or null at EOS. */
public final Token next() throws IOException {
public final Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
// if we are just starting, read the whole input
if (!started) {
started = true;
char[] chars = new char[1024];
input.read(chars);
inStr = new String(chars).trim(); // remove any trailing empty strings
inStr = new String(chars).trim(); // remove any leading or trailing spaces
inLen = inStr.length();
gramSize = minGram;
}
@ -134,15 +136,13 @@ public class EdgeNGramTokenizer extends Tokenizer {
return null;
}
Token tok;
if (side == Side.FRONT) {
tok = new Token(inStr.substring(0, gramSize), 0, gramSize);
}
else {
tok = new Token(inStr.substring(inLen-gramSize), inLen-gramSize, inLen);
}
// grab gramSize chars from front or back
int start = side == Side.FRONT ? 0 : inLen - gramSize;
int end = start + gramSize;
reusableToken.setTermBuffer(inStr, start, gramSize);
reusableToken.setStartOffset(start);
reusableToken.setEndOffset(end);
gramSize++;
return tok;
return reusableToken;
}
}
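A short sketch (mirroring the unit tests further down in this commit) of what the reworked tokenizer produces for front-edge n-grams of sizes 1 through 3; the input string matches the one implied by the tests.

import java.io.StringReader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;

public class EdgeNGramExample {
  public static void main(String[] args) throws Exception {
    EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(
        new StringReader("abcde"), EdgeNGramTokenizer.Side.FRONT, 1, 3);
    final Token reusableToken = new Token();
    // Expected tokens: (a,0,1) (ab,0,2) (abc,0,3)
    for (Token t = tokenizer.next(reusableToken); t != null; t = tokenizer.next(reusableToken)) {
      System.out.println("(" + t.term() + "," + t.startOffset() + "," + t.endOffset() + ")");
    }
  }
}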

View File

@ -63,17 +63,17 @@ public class NGramTokenFilter extends TokenFilter {
}
/** Returns the next token in the stream, or null at EOS. */
public final Token next() throws IOException {
public final Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
if (ngrams.size() > 0) {
return (Token) ngrams.removeFirst();
}
Token token = input.next();
if (token == null) {
Token nextToken = input.next(reusableToken);
if (nextToken == null)
return null;
}
ngram(token);
ngram(nextToken);
if (ngrams.size() > 0)
return (Token) ngrams.removeFirst();
else
@ -81,16 +81,13 @@ public class NGramTokenFilter extends TokenFilter {
}
private void ngram(Token token) {
String inStr = token.termText();
int inLen = inStr.length();
char[] termBuffer = token.termBuffer();
int termLength = token.termLength();
int gramSize = minGram;
while (gramSize <= maxGram) {
int pos = 0; // reset to beginning of string
while (pos+gramSize <= inLen) { // while there is input
String gram = inStr.substring(pos, pos+gramSize);
Token tok = new Token(gram, pos, pos+gramSize);
// tok.setPositionIncrement(pos);
ngrams.add(tok);
while (pos+gramSize <= termLength) { // while there is input
ngrams.add(token.clone(termBuffer, pos, gramSize, pos, pos+gramSize));
pos++;
}
gramSize++; // increase n-gram size

View File

@ -64,7 +64,8 @@ public class NGramTokenizer extends Tokenizer {
}
/** Returns the next token in the stream, or null at EOS. */
public final Token next() throws IOException {
public final Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
if (!started) {
started = true;
gramSize = minGram;
@ -82,9 +83,9 @@ public class NGramTokenizer extends Tokenizer {
if (pos+gramSize > inLen)
return null;
}
String gram = inStr.substring(pos, pos+gramSize);
int oldPos = pos;
pos++;
return new Token(gram, oldPos, oldPos+gramSize);
return reusableToken.reinit(inStr, oldPos, gramSize, oldPos, oldPos+gramSize);
}
}

View File

@ -38,7 +38,6 @@ public final class DutchStemFilter extends TokenFilter {
/**
* The actual token in the input stream.
*/
private Token token = null;
private DutchStemmer stemmer = null;
private Set exclusions = null;
@ -48,7 +47,7 @@ public final class DutchStemFilter extends TokenFilter {
}
/**
* Builds a DutchStemFilter that uses an exclusiontable.
* Builds a DutchStemFilter that uses an exclusion table.
*/
public DutchStemFilter(TokenStream _in, Set exclusiontable) {
this(_in);
@ -66,23 +65,22 @@ public final class DutchStemFilter extends TokenFilter {
/**
* @return Returns the next token in the stream, or null at EOS
*/
public Token next() throws IOException {
if ((token = input.next()) == null) {
public Token next(Token reusableToken) throws IOException {
assert reusableToken != null;
Token nextToken = input.next(reusableToken);
if (nextToken == null)
return null;
}
// Check the exclusiontable
else if (exclusions != null && exclusions.contains(token.termText())) {
return token;
} else {
String s = stemmer.stem(token.termText());
// If not stemmed, dont waste the time creating a new token
if (!s.equals(token.termText())) {
return new Token(s, token.startOffset(),
token.endOffset(), token.type());
}
return token;
String term = nextToken.term();
// Check the exclusion table.
if (exclusions == null || !exclusions.contains(term)) {
String s = stemmer.stem(term);
// If not stemmed, don't waste the time adjusting the token.
if ((s != null) && !s.equals(term))
nextToken.setTermBuffer(s);
}
return nextToken;
}
/**

View File

@ -41,11 +41,12 @@ public class NumericPayloadTokenFilter extends TokenFilter {
this.typeMatch = typeMatch;
}
public Token next(Token result) throws IOException {
result = input.next(result);
if (result != null && result.type().equals(typeMatch)){
result.setPayload(thePayload);
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
Token nextToken = input.next(reusableToken);
if (nextToken != null && nextToken.type().equals(typeMatch)){
nextToken.setPayload(thePayload);
}
return result;
return nextToken;
}
}

View File

@ -38,15 +38,16 @@ public class TokenOffsetPayloadTokenFilter extends TokenFilter {
super(input);
}
public Token next(Token result) throws IOException {
result = input.next(result);
if (result != null){
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
Token nextToken = input.next(reusableToken);
if (nextToken != null){
byte[] data = new byte[8];
PayloadHelper.encodeInt(result.startOffset(), data, 0);
PayloadHelper.encodeInt(result.endOffset(), data, 4);
PayloadHelper.encodeInt(nextToken.startOffset(), data, 0);
PayloadHelper.encodeInt(nextToken.endOffset(), data, 4);
Payload payload = new Payload(data);
result.setPayload(payload);
nextToken.setPayload(payload);
}
return result;
return nextToken;
}
}

View File

@ -39,11 +39,12 @@ public class TypeAsPayloadTokenFilter extends TokenFilter {
}
public Token next(Token result) throws IOException {
result = input.next(result);
if (result != null && result.type() != null && result.type().equals("") == false){
result.setPayload(new Payload(result.type().getBytes("UTF-8")));
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
Token nextToken = input.next(reusableToken);
if (nextToken != null && nextToken.type() != null && nextToken.type().equals("") == false){
nextToken.setPayload(new Payload(nextToken.type().getBytes("UTF-8")));
}
return result;
return nextToken;
}
}

View File

@ -37,25 +37,20 @@ public final class RussianLowerCaseFilter extends TokenFilter
this.charset = charset;
}
public final Token next() throws java.io.IOException
public final Token next(final Token reusableToken) throws java.io.IOException
{
Token t = input.next();
assert reusableToken != null;
Token nextToken = input.next(reusableToken);
if (t == null)
if (nextToken == null)
return null;
String txt = t.termText();
char[] chArray = txt.toCharArray();
for (int i = 0; i < chArray.length; i++)
char[] chArray = nextToken.termBuffer();
int chLen = nextToken.termLength();
for (int i = 0; i < chLen; i++)
{
chArray[i] = RussianCharsets.toLowerCase(chArray[i], charset);
}
String newTxt = new String(chArray);
// create new token
Token newToken = new Token(newTxt, t.startOffset(), t.endOffset());
return newToken;
return nextToken;
}
}

View File

@ -35,7 +35,6 @@ public final class RussianStemFilter extends TokenFilter
/**
* The actual token in the input stream.
*/
private Token token = null;
private RussianStemmer stemmer = null;
public RussianStemFilter(TokenStream in, char[] charset)
@ -47,22 +46,18 @@ public final class RussianStemFilter extends TokenFilter
/**
* @return Returns the next token in the stream, or null at EOS
*/
public final Token next() throws IOException
public final Token next(final Token reusableToken) throws IOException
{
if ((token = input.next()) == null)
{
assert reusableToken != null;
Token nextToken = input.next(reusableToken);
if (nextToken == null)
return null;
}
else
{
String s = stemmer.stem(token.termText());
if (!s.equals(token.termText()))
{
return new Token(s, token.startOffset(), token.endOffset(),
token.type());
}
return token;
}
String term = nextToken.term();
String s = stemmer.stem(term);
if (s != null && !s.equals(term))
nextToken.setTermBuffer(s);
return nextToken;
}
/**

View File

@ -47,7 +47,7 @@ public class ShingleFilter extends TokenFilter {
/**
* filler token for when positionIncrement is more than 1
*/
public static final String FILLER_TOKEN = "_";
public static final char[] FILLER_TOKEN = { '_' };
/**
@ -150,11 +150,12 @@ public class ShingleFilter extends TokenFilter {
}
/* (non-Javadoc)
* @see org.apache.lucene.analysis.TokenStream#next()
*/
public Token next() throws IOException {
* @see org.apache.lucene.analysis.TokenStream#next()
*/
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
if (outputBuf.isEmpty()) {
fillOutputBuf();
fillOutputBuf(reusableToken);
}
Token nextToken = null;
if ( ! outputBuf.isEmpty())
@ -173,16 +174,19 @@ public class ShingleFilter extends TokenFilter {
* @return the next token, or null if at end of input stream
* @throws IOException if the input stream has a problem
*/
private Token getNextToken() throws IOException {
private Token getNextToken(final Token reusableToken) throws IOException {
if (tokenBuf.isEmpty()) {
Token lastToken = input.next();
if (lastToken != null) {
for (int i = 1; i < lastToken.getPositionIncrement(); i++) {
tokenBuf.add(new Token(FILLER_TOKEN, lastToken.startOffset(),
lastToken.startOffset()));
Token nextToken = input.next(reusableToken);
if (nextToken != null) {
for (int i = 1; i < nextToken.getPositionIncrement(); i++) {
Token fillerToken = (Token) nextToken.clone();
// A filler token occupies no space
fillerToken.setEndOffset(fillerToken.startOffset());
fillerToken.setTermBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
tokenBuf.add(fillerToken);
}
tokenBuf.add(lastToken);
return getNextToken();
tokenBuf.add(nextToken.clone());
return getNextToken(nextToken);
} else {
return null;
}
@ -196,15 +200,15 @@ public class ShingleFilter extends TokenFilter {
*
* @throws IOException if there's a problem getting the next token
*/
private void fillOutputBuf() throws IOException {
private void fillOutputBuf(Token token) throws IOException {
boolean addedToken = false;
/*
* Try to fill the shingle buffer.
*/
do {
Token token = getNextToken();
token = getNextToken(token);
if (token != null) {
shingleBuf.add(token);
shingleBuf.add(token.clone());
if (shingleBuf.size() > maxShingleSize)
{
shingleBuf.remove(0);
@ -235,17 +239,17 @@ public class ShingleFilter extends TokenFilter {
}
int i = 0;
Token token = null;
Token shingle = null;
for (Iterator it = shingleBuf.iterator(); it.hasNext(); ) {
token = (Token) it.next();
shingle = (Token) it.next();
for (int j = i; j < shingles.length; j++) {
if (shingles[j].length() != 0) {
shingles[j].append(TOKEN_SEPARATOR);
}
shingles[j].append(token.termBuffer(), 0, token.termLength());
shingles[j].append(shingle.termBuffer(), 0, shingle.termLength());
}
endOffsets[i] = token.endOffset();
endOffsets[i] = shingle.endOffset();
i++;
}
@ -258,17 +262,26 @@ public class ShingleFilter extends TokenFilter {
/*
* Push new tokens to the output buffer.
*/
if (!shingleBuf.isEmpty()) {
Token firstShingle = (Token) shingleBuf.get(0);
shingle = (Token) firstShingle.clone();
shingle.setType(tokenType);
}
for (int j = 1; j < shingleBuf.size(); j++) {
Token shingle = new Token(shingles[j].toString(),
((Token) shingleBuf.get(0)).startOffset(),
endOffsets[j],
tokenType);
shingle.setEndOffset(endOffsets[j]);
StringBuffer buf = shingles[j];
int termLength = buf.length();
char[] termBuffer = shingle.termBuffer();
if (termBuffer.length < termLength)
termBuffer = shingle.resizeTermBuffer(termLength);
buf.getChars(0, termLength, termBuffer, 0);
shingle.setTermLength(termLength);
if ((! outputUnigrams) && j == 1) {
shingle.setPositionIncrement(1);
} else {
shingle.setPositionIncrement(0);
}
outputBuf.add(shingle);
outputBuf.add(shingle.clone());
}
}
}
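A usage sketch (not from the patch) of the reworked ShingleFilter. It assumes the (TokenStream, maxShingleSize) constructor; with a maximum shingle size of 2 and unigram output left on, each word is followed by the bigram it starts.

import java.io.StringReader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.shingle.ShingleFilter;

public class ShingleFilterExample {
  public static void main(String[] args) throws Exception {
    // Assumed (TokenStream, int) constructor; the expected stream is:
    // "please", "please divide", "divide", "divide this", "this",
    // "this sentence", "sentence".
    ShingleFilter filter = new ShingleFilter(
        new WhitespaceTokenizer(new StringReader("please divide this sentence")), 2);
    final Token reusableToken = new Token();
    for (Token t = filter.next(reusableToken); t != null; t = filter.next(reusableToken)) {
      System.out.println(t.term() + " (posIncr=" + t.getPositionIncrement() + ")");
    }
  }
}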

View File

@ -17,15 +17,22 @@ package org.apache.lucene.analysis.shingle;
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Set;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.EmptyTokenStream;
import org.apache.lucene.analysis.payloads.PayloadHelper;
import org.apache.lucene.index.Payload;
import java.io.IOException;
import java.util.*;
/**
* <p>A ShingleFilter constructs shingles (token n-grams) from a token stream.
@ -298,7 +305,8 @@ public class ShingleMatrixFilter extends TokenStream {
private Matrix matrix;
public Token next(Token token) throws IOException {
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
if (matrix == null) {
matrix = new Matrix();
// fill matrix with maximumShingleSize columns
@ -318,7 +326,7 @@ public class ShingleMatrixFilter extends TokenStream {
if (ignoringSinglePrefixOrSuffixShingle
&& currentShingleLength == 1
&& (currentPermutationRows.get(currentPermutationTokensStartOffset).getColumn().isFirst() || currentPermutationRows.get(currentPermutationTokensStartOffset).getColumn().isLast())) {
return next(token);
return next(reusableToken);
}
int termLength = 0;
@ -336,21 +344,21 @@ public class ShingleMatrixFilter extends TokenStream {
// only produce shingles that have not already been created
if (!shinglesSeen.add(shingle)) {
return next(token);
return next(reusableToken);
}
// shingle token factory
StringBuilder sb = new StringBuilder(termLength + 10); // paranormal abillity to forsay the future.
StringBuilder sb = new StringBuilder(termLength + 10); // paranormal ability to foresee the future.
for (Token shingleToken : shingle) {
if (spacerCharacter != null && sb.length() > 0) {
sb.append(spacerCharacter);
}
sb.append(shingleToken.termBuffer(), 0, shingleToken.termLength());
}
token.setTermText(sb.toString());
updateToken(token, shingle, currentPermutationTokensStartOffset, currentPermutationRows, currentPermuationTokens);
reusableToken.setTermBuffer(sb.toString());
updateToken(reusableToken, shingle, currentPermutationTokensStartOffset, currentPermutationRows, currentPermuationTokens);
return token;
return reusableToken;
} else {
@ -360,7 +368,7 @@ public class ShingleMatrixFilter extends TokenStream {
// reset shingle size and move one step to the right in the current tokens permutation
currentPermutationTokensStartOffset++;
currentShingleLength = minimumShingleSize - 1;
return next(token);
return next(reusableToken);
}
@ -411,7 +419,7 @@ public class ShingleMatrixFilter extends TokenStream {
}
nextTokensPermutation();
return next(token);
return next(reusableToken);
}
}
@ -426,7 +434,7 @@ public class ShingleMatrixFilter extends TokenStream {
nextTokensPermutation();
return next(token);
return next(reusableToken);
}
/**

View File

@ -73,10 +73,10 @@ public class DateRecognizerSinkTokenizer extends SinkTokenizer {
//Check to see if this token is a date
if (t != null) {
try {
Date date = dateFormat.parse(new String(t.termBuffer(), 0, t.termLength()));//We don't care about the date, just that we can parse it as a date
Date date = dateFormat.parse(t.term());//We don't care about the date, just that we can parse it as a date
if (date != null) {
t.setType(DATE_TYPE);
lst.add(t.clone());
super.add(t);
}
} catch (ParseException e) {

View File

@ -48,7 +48,7 @@ public class TokenTypeSinkTokenizer extends SinkTokenizer {
public void add(Token t) {
//check to see if this is a Category
if (t != null && typeToMatch.equals(t.type())){
lst.add(t.clone());
super.add(t);
}
}
}

View File

@ -40,31 +40,38 @@ public class ThaiWordFilter extends TokenFilter {
breaker = BreakIterator.getWordInstance(new Locale("th"));
}
public Token next() throws IOException {
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
if (thaiToken != null) {
String text = thaiToken.termText();
int start = breaker.current();
int end = breaker.next();
if (end != BreakIterator.DONE) {
return new Token(text.substring(start, end),
thaiToken.startOffset()+start, thaiToken.startOffset()+end, thaiToken.type());
reusableToken.reinit(thaiToken, thaiToken.termBuffer(), start, end - start);
reusableToken.setStartOffset(thaiToken.startOffset()+start);
reusableToken.setEndOffset(thaiToken.endOffset()+end);
return reusableToken;
}
thaiToken = null;
}
Token tk = input.next();
if (tk == null) {
Token nextToken = input.next(reusableToken);
if (nextToken == null || nextToken.termLength() == 0) {
return null;
}
String text = tk.termText();
String text = nextToken.term();
if (UnicodeBlock.of(text.charAt(0)) != UnicodeBlock.THAI) {
return new Token(text.toLowerCase(), tk.startOffset(), tk.endOffset(), tk.type());
nextToken.setTermBuffer(text.toLowerCase());
return nextToken;
}
thaiToken = tk;
thaiToken = (Token) nextToken.clone();
breaker.setText(text);
int end = breaker.next();
if (end != BreakIterator.DONE) {
return new Token(text.substring(0, end),
thaiToken.startOffset(), thaiToken.startOffset()+end, thaiToken.type());
nextToken.setTermBuffer(text, 0, end);
nextToken.setEndOffset(nextToken.startOffset() + end);
return nextToken;
}
return null;
}

View File

@ -33,14 +33,13 @@ public class TestChineseTokenizer extends TestCase
{
String s = "a天b";
ChineseTokenizer tokenizer = new ChineseTokenizer(new StringReader(s));
Token token;
int correctStartOffset = 0;
int correctEndOffset = 1;
while ((token = tokenizer.next()) != null)
{
assertEquals(correctStartOffset, token.startOffset());
assertEquals(correctEndOffset, token.endOffset());
final Token reusableToken = new Token();
for (Token nextToken = tokenizer.next(reusableToken); nextToken != null; nextToken = tokenizer.next(reusableToken)) {
assertEquals(correctStartOffset, nextToken.startOffset());
assertEquals(correctEndOffset, nextToken.endOffset());
correctStartOffset++;
correctEndOffset++;
}

View File

@ -153,15 +153,16 @@ public class TestCompoundWordTokenFilter extends TestCase {
private void assertFiltersTo(TokenFilter tf, String[] s, int[] startOffset,
int[] endOffset, int[] posIncr) throws Exception {
final Token reusableToken = new Token();
for (int i = 0; i < s.length; ++i) {
Token t = tf.next();
assertNotNull(t);
assertEquals(s[i], new String(t.termBuffer(), 0, t.termLength()));
assertEquals(startOffset[i], t.startOffset());
assertEquals(endOffset[i], t.endOffset());
assertEquals(posIncr[i], t.getPositionIncrement());
Token nextToken = tf.next(reusableToken);
assertNotNull(nextToken);
assertEquals(s[i], nextToken.term());
assertEquals(startOffset[i], nextToken.startOffset());
assertEquals(endOffset[i], nextToken.endOffset());
assertEquals(posIncr[i], nextToken.getPositionIncrement());
}
assertNull(tf.next());
assertNull(tf.next(reusableToken));
}
private void getHyphenationPatternFileContents() {

View File

@ -69,10 +69,11 @@ public class TestGermanStemFilter extends TestCase {
private void check(final String input, final String expected) throws IOException {
StandardTokenizer tokenStream = new StandardTokenizer(new StringReader(input));
GermanStemFilter filter = new GermanStemFilter(tokenStream);
Token t = filter.next();
if (t == null)
final Token reusableToken = new Token();
Token nextToken = filter.next(reusableToken);
if (nextToken == null)
fail();
assertEquals(expected, t.termText());
assertEquals(expected, nextToken.term());
filter.close();
}

View File

@ -42,12 +42,13 @@ public class GreekAnalyzerTest extends TestCase {
*/
private void assertAnalyzesTo(Analyzer a, String input, String[] output) throws Exception {
TokenStream ts = a.tokenStream("dummy", new StringReader(input));
final Token reusableToken = new Token();
for (int i=0; i<output.length; i++) {
Token t = ts.next();
assertNotNull(t);
assertEquals(t.termText(), output[i]);
Token nextToken = ts.next(reusableToken);
assertNotNull(nextToken);
assertEquals(nextToken.term(), output[i]);
}
assertNull(ts.next());
assertNull(ts.next(reusableToken));
ts.close();
}

View File

@ -53,13 +53,9 @@ public class TestElision extends TestCase {
private List filtre(TokenFilter filter) {
List tas = new ArrayList();
try {
boolean encore = true;
Token token;
while (encore) {
token = filter.next();
encore = token != null;
if (token != null)
tas.add(token.termText());
final Token reusableToken = new Token();
for (Token nextToken = filter.next(reusableToken); nextToken != null; nextToken = filter.next(reusableToken)) {
tas.add(nextToken.term());
}
} catch (IOException e) {
e.printStackTrace();

View File

@ -77,12 +77,13 @@ public class TestFrenchAnalyzer extends TestCase {
TokenStream ts = a.tokenStream("dummy", new StringReader(input));
final Token reusableToken = new Token();
for (int i = 0; i < output.length; i++) {
Token t = ts.next();
assertNotNull(t);
assertEquals(t.termText(), output[i]);
Token nextToken = ts.next(reusableToken);
assertNotNull(nextToken);
assertEquals(nextToken.term(), output[i]);
}
assertNull(ts.next());
assertNull(ts.next(reusableToken));
ts.close();
}

View File

@ -30,25 +30,32 @@ public class TestPrefixAndSuffixAwareTokenFilter extends TestCase {
public void test() throws IOException {
PrefixAndSuffixAwareTokenFilter ts = new PrefixAndSuffixAwareTokenFilter(
new SingleTokenTokenStream(new Token("^", 0, 0)),
new SingleTokenTokenStream(createToken("^", 0, 0)),
new WhitespaceTokenizer(new StringReader("hello world")),
new SingleTokenTokenStream(new Token("$", 0, 0)));
new SingleTokenTokenStream(createToken("$", 0, 0)));
assertNext(ts, "^", 0, 0);
assertNext(ts, "hello", 0, 5);
assertNext(ts, "world", 6, 11);
assertNext(ts, "$", 11, 11);
assertNull(ts.next());
Token token = new Token();
assertNext(ts, token, "^", 0, 0);
assertNext(ts, token, "hello", 0, 5);
assertNext(ts, token, "world", 6, 11);
assertNext(ts, token, "$", 11, 11);
assertNull(ts.next(token));
}
private Token assertNext(TokenStream ts, String text, int startOffset, int endOffset) throws IOException {
Token token = ts.next();
assertNotNull(token);
assertEquals(text, new String(token.termBuffer(), 0, token.termLength()));
assertEquals(startOffset, token.startOffset());
assertEquals(endOffset, token.endOffset());
private Token assertNext(TokenStream ts, final Token reusableToken, String text, int startOffset, int endOffset) throws IOException {
Token nextToken = ts.next(reusableToken);
assertNotNull(nextToken);
assertEquals(text, nextToken.term());
assertEquals(startOffset, nextToken.startOffset());
assertEquals(endOffset, nextToken.endOffset());
return nextToken;
}
private static Token createToken(String term, int start, int offset)
{
Token token = new Token(start, offset);
token.setTermBuffer(term);
return token;
}
}

View File

@ -32,33 +32,40 @@ public class TestPrefixAwareTokenFilter extends TestCase {
PrefixAwareTokenFilter ts;
ts = new PrefixAwareTokenFilter(
new SingleTokenTokenStream(new Token("a", 0, 1)),
new SingleTokenTokenStream(new Token("b", 0, 1)));
assertNext(ts, "a", 0, 1);
assertNext(ts, "b", 1, 2);
assertNull(ts.next());
new SingleTokenTokenStream(createToken("a", 0, 1)),
new SingleTokenTokenStream(createToken("b", 0, 1)));
final Token reusableToken = new Token();
assertNext(ts, reusableToken, "a", 0, 1);
assertNext(ts, reusableToken, "b", 1, 2);
assertNull(ts.next(reusableToken));
// prefix and suffix using 2x prefix
ts = new PrefixAwareTokenFilter(new SingleTokenTokenStream(new Token("^", 0, 0)), new WhitespaceTokenizer(new StringReader("hello world")));
ts = new PrefixAwareTokenFilter(ts, new SingleTokenTokenStream(new Token("$", 0, 0)));
ts = new PrefixAwareTokenFilter(new SingleTokenTokenStream(createToken("^", 0, 0)), new WhitespaceTokenizer(new StringReader("hello world")));
ts = new PrefixAwareTokenFilter(ts, new SingleTokenTokenStream(createToken("$", 0, 0)));
assertNext(ts, "^", 0, 0);
assertNext(ts, "hello", 0, 5);
assertNext(ts, "world", 6, 11);
assertNext(ts, "$", 11, 11);
assertNull(ts.next());
assertNext(ts, reusableToken, "^", 0, 0);
assertNext(ts, reusableToken, "hello", 0, 5);
assertNext(ts, reusableToken, "world", 6, 11);
assertNext(ts, reusableToken, "$", 11, 11);
assertNull(ts.next(reusableToken));
}
private Token assertNext(TokenStream ts, String text, int startOffset, int endOffset) throws IOException {
Token token = ts.next();
assertNotNull(token);
assertEquals(text, new String(token.termBuffer(), 0, token.termLength()));
assertEquals(startOffset, token.startOffset());
assertEquals(endOffset, token.endOffset());
private Token assertNext(TokenStream ts, final Token reusableToken, String text, int startOffset, int endOffset) throws IOException {
Token nextToken = ts.next(reusableToken);
assertNotNull(nextToken);
assertEquals(text, nextToken.term());
assertEquals(startOffset, nextToken.startOffset());
assertEquals(endOffset, nextToken.endOffset());
return nextToken;
}
private static Token createToken(String term, int start, int offset)
{
Token token = new Token(start, offset);
token.setTermBuffer(term);
return token;
}
}

View File

@ -17,23 +17,20 @@ package org.apache.lucene.analysis.miscellaneous;
* limitations under the License.
*/
import junit.framework.TestCase;
import java.io.IOException;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.util.LuceneTestCase;
public class TestSingleTokenTokenFilter extends TestCase {
public class TestSingleTokenTokenFilter extends LuceneTestCase {
public void test() throws IOException {
Token token = new Token();
SingleTokenTokenStream ts = new SingleTokenTokenStream(token);
assertEquals(token, ts.next());
assertNull(ts.next());
final Token reusableToken = new Token();
assertEquals(token, ts.next(reusableToken));
assertNull(ts.next(reusableToken));
}
}

View File

@ -68,52 +68,46 @@ public class EdgeNGramTokenFilterTest extends TestCase {
public void testFrontUnigram() throws Exception {
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT, 1, 1);
Token token = null;
token = tokenizer.next();
assertEquals("(a,0,1)", token.toString());
token = tokenizer.next();
assertNull(token);
final Token reusableToken = new Token();
Token nextToken = tokenizer.next(reusableToken);
assertEquals("(a,0,1)", nextToken.toString());
assertNull(tokenizer.next(reusableToken));
}
public void testBackUnigram() throws Exception {
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.BACK, 1, 1);
Token token = null;
token = tokenizer.next();
assertEquals("(e,4,5)", token.toString());
token = tokenizer.next();
assertNull(token);
final Token reusableToken = new Token();
Token nextToken = tokenizer.next(reusableToken);
assertEquals("(e,4,5)", nextToken.toString());
assertNull(tokenizer.next(reusableToken));
}
public void testOversizedNgrams() throws Exception {
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT, 6, 6);
Token token = null;
token = tokenizer.next();
assertNull(token);
assertNull(tokenizer.next(new Token()));
}
public void testFrontRangeOfNgrams() throws Exception {
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT, 1, 3);
Token token = null;
token = tokenizer.next();
assertEquals("(a,0,1)", token.toString());
token = tokenizer.next();
assertEquals("(ab,0,2)", token.toString());
token = tokenizer.next();
assertEquals("(abc,0,3)", token.toString());
token = tokenizer.next();
assertNull(token);
final Token reusableToken = new Token();
Token nextToken = tokenizer.next(reusableToken);
assertEquals("(a,0,1)", nextToken.toString());
nextToken = tokenizer.next(reusableToken);
assertEquals("(ab,0,2)", nextToken.toString());
nextToken = tokenizer.next(reusableToken);
assertEquals("(abc,0,3)", nextToken.toString());
assertNull(tokenizer.next(reusableToken));
}
public void testBackRangeOfNgrams() throws Exception {
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.BACK, 1, 3);
Token token = null;
token = tokenizer.next();
assertEquals("(e,4,5)", token.toString());
token = tokenizer.next();
assertEquals("(de,3,5)", token.toString());
token = tokenizer.next();
assertEquals("(cde,2,5)", token.toString());
token = tokenizer.next();
assertNull(token);
final Token reusableToken = new Token();
Token nextToken = tokenizer.next(reusableToken);
assertEquals("(e,4,5)", nextToken.toString());
nextToken = tokenizer.next(reusableToken);
assertEquals("(de,3,5)", nextToken.toString());
nextToken = tokenizer.next(reusableToken);
assertEquals("(cde,2,5)", nextToken.toString());
assertNull(tokenizer.next(reusableToken));
}
}

View File

@ -66,52 +66,46 @@ public class EdgeNGramTokenizerTest extends TestCase {
public void testFrontUnigram() throws Exception {
EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 1, 1);
Token token = null;
token = tokenizer.next();
assertEquals("(a,0,1)", token.toString());
token = tokenizer.next();
assertNull(token);
final Token reusableToken = new Token();
Token nextToken = tokenizer.next(reusableToken);
assertEquals("(a,0,1)", nextToken.toString());
assertNull(tokenizer.next(reusableToken));
}
public void testBackUnigram() throws Exception {
EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.BACK, 1, 1);
Token token = null;
token = tokenizer.next();
assertEquals("(e,4,5)", token.toString());
token = tokenizer.next();
assertNull(token);
final Token reusableToken = new Token();
Token nextToken = tokenizer.next(reusableToken);
assertEquals("(e,4,5)", nextToken.toString());
assertNull(tokenizer.next(reusableToken));
}
public void testOversizedNgrams() throws Exception {
EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 6, 6);
Token token = null;
token = tokenizer.next();
assertNull(token);
assertNull(tokenizer.next(new Token()));
}
public void testFrontRangeOfNgrams() throws Exception {
EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 1, 3);
Token token = null;
token = tokenizer.next();
assertEquals("(a,0,1)", token.toString());
token = tokenizer.next();
assertEquals("(ab,0,2)", token.toString());
token = tokenizer.next();
assertEquals("(abc,0,3)", token.toString());
token = tokenizer.next();
assertNull(token);
final Token reusableToken = new Token();
Token nextToken = tokenizer.next(reusableToken);
assertEquals("(a,0,1)", nextToken.toString());
nextToken = tokenizer.next(reusableToken);
assertEquals("(ab,0,2)", nextToken.toString());
nextToken = tokenizer.next(reusableToken);
assertEquals("(abc,0,3)", nextToken.toString());
assertNull(tokenizer.next(reusableToken));
}
public void testBackRangeOfNgrams() throws Exception {
EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.BACK, 1, 3);
Token token = null;
token = tokenizer.next();
assertEquals("(e,4,5)", token.toString());
token = tokenizer.next();
assertEquals("(de,3,5)", token.toString());
token = tokenizer.next();
assertEquals("(cde,2,5)", token.toString());
token = tokenizer.next();
assertNull(token);
final Token reusableToken = new Token();
Token nextToken = tokenizer.next(reusableToken);
assertEquals("(e,4,5)", nextToken.toString());
nextToken = tokenizer.next(reusableToken);
assertEquals("(de,3,5)", nextToken.toString());
nextToken = tokenizer.next(reusableToken);
assertEquals("(cde,2,5)", nextToken.toString());
assertNull(tokenizer.next(reusableToken));
}
}

View File

@ -60,17 +60,14 @@ public class NGramTokenFilterTest extends TestCase {
public void testUnigrams() throws Exception {
NGramTokenFilter filter = new NGramTokenFilter(input, 1, 1);
Token token = null;
do {
token = filter.next();
if (token != null) {
tokens.add(token.toString());
// System.out.println(token.termText());
// System.out.println(token);
// Thread.sleep(1000);
}
} while (token != null);
final Token reusableToken = new Token();
for (Token nextToken = filter.next(reusableToken); nextToken != null; nextToken = filter.next(reusableToken)) {
tokens.add(nextToken.toString());
// System.out.println(token.term());
// System.out.println(token);
// Thread.sleep(1000);
}
assertEquals(5, tokens.size());
ArrayList exp = new ArrayList();
@ -80,17 +77,13 @@ public class NGramTokenFilterTest extends TestCase {
public void testBigrams() throws Exception {
NGramTokenFilter filter = new NGramTokenFilter(input, 2, 2);
Token token = null;
do {
token = filter.next();
if (token != null) {
tokens.add(token.toString());
// System.out.println(token.termText());
// System.out.println(token);
// Thread.sleep(1000);
}
} while (token != null);
final Token reusableToken = new Token();
for (Token nextToken = filter.next(reusableToken); nextToken != null; nextToken = filter.next(reusableToken)) {
tokens.add(nextToken.toString());
// System.out.println(token.term());
// System.out.println(token);
// Thread.sleep(1000);
}
assertEquals(4, tokens.size());
ArrayList exp = new ArrayList();
@ -100,17 +93,13 @@ public class NGramTokenFilterTest extends TestCase {
public void testNgrams() throws Exception {
NGramTokenFilter filter = new NGramTokenFilter(input, 1, 3);
Token token = null;
do {
token = filter.next();
if (token != null) {
tokens.add(token.toString());
// System.out.println(token.termText());
// System.out.println(token);
// Thread.sleep(1000);
}
} while (token != null);
final Token reusableToken = new Token();
for (Token nextToken = filter.next(reusableToken); nextToken != null; nextToken = filter.next(reusableToken)) {
tokens.add(nextToken.toString());
// System.out.println(token.term());
// System.out.println(token);
// Thread.sleep(1000);
}
assertEquals(12, tokens.size());
ArrayList exp = new ArrayList();
@ -122,17 +111,13 @@ public class NGramTokenFilterTest extends TestCase {
public void testOversizedNgrams() throws Exception {
NGramTokenFilter filter = new NGramTokenFilter(input, 6, 7);
Token token = null;
do {
token = filter.next();
if (token != null) {
tokens.add(token.toString());
// System.out.println(token.termText());
// System.out.println(token);
// Thread.sleep(1000);
}
} while (token != null);
final Token reusableToken = new Token();
for (Token nextToken = filter.next(reusableToken); nextToken != null; nextToken = filter.next(reusableToken)) {
tokens.add(nextToken.toString());
// System.out.println(token.term());
// System.out.println(token);
// Thread.sleep(1000);
}
assertTrue(tokens.isEmpty());
}

View File

@ -59,16 +59,13 @@ public class NGramTokenizerTest extends TestCase {
public void testUnigrams() throws Exception {
NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 1);
Token token = null;
do {
token = tokenizer.next();
if (token != null) {
tokens.add(token.toString());
// System.out.println(token.termText());
// System.out.println(token);
// Thread.sleep(1000);
}
} while (token != null);
final Token reusableToken = new Token();
for (Token nextToken = tokenizer.next(reusableToken); nextToken != null; nextToken = tokenizer.next(reusableToken)) {
tokens.add(nextToken.toString());
// System.out.println(token.term());
// System.out.println(token);
// Thread.sleep(1000);
}
assertEquals(5, tokens.size());
ArrayList exp = new ArrayList();
@ -78,17 +75,13 @@ public class NGramTokenizerTest extends TestCase {
public void testBigrams() throws Exception {
NGramTokenizer tokenizer = new NGramTokenizer(input, 2, 2);
Token token = null;
do {
token = tokenizer.next();
if (token != null) {
tokens.add(token.toString());
// System.out.println(token.termText());
// System.out.println(token);
// Thread.sleep(1000);
}
} while (token != null);
final Token reusableToken = new Token();
for (Token nextToken = tokenizer.next(reusableToken); nextToken != null; nextToken = tokenizer.next(reusableToken)) {
tokens.add(nextToken.toString());
// System.out.println(token.term());
// System.out.println(token);
// Thread.sleep(1000);
}
assertEquals(4, tokens.size());
ArrayList exp = new ArrayList();
@ -98,17 +91,13 @@ public class NGramTokenizerTest extends TestCase {
public void testNgrams() throws Exception {
NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 3);
Token token = null;
do {
token = tokenizer.next();
if (token != null) {
tokens.add(token.toString());
// System.out.println(token.termText());
// System.out.println(token);
// Thread.sleep(1000);
}
} while (token != null);
final Token reusableToken = new Token();
for (Token nextToken = tokenizer.next(reusableToken); nextToken != null; nextToken = tokenizer.next(reusableToken)) {
tokens.add(nextToken.toString());
// System.out.println(token.term());
// System.out.println(token);
// Thread.sleep(1000);
}
assertEquals(12, tokens.size());
ArrayList exp = new ArrayList();
@@ -120,17 +109,14 @@ public class NGramTokenizerTest extends TestCase {
public void testOversizedNgrams() throws Exception {
NGramTokenizer tokenizer = new NGramTokenizer(input, 6, 7);
Token token = null;
do {
token = tokenizer.next();
if (token != null) {
tokens.add(token.toString());
// System.out.println(token.termText());
// System.out.println(token);
// Thread.sleep(1000);
}
} while (token != null);
final Token reusableToken = new Token();
for (Token nextToken = tokenizer.next(reusableToken); nextToken != null; nextToken = tokenizer.next(reusableToken)) {
tokens.add(nextToken.toString());
// System.out.println(token.term());
// System.out.println(token);
// Thread.sleep(1000);
}
assertTrue(tokens.isEmpty());
}
@@ -43,20 +43,20 @@ public class NumericPayloadTokenFilterTest extends TestCase {
String test = "The quick red fox jumped over the lazy brown dogs";
NumericPayloadTokenFilter nptf = new NumericPayloadTokenFilter(new WordTokenFilter(new WhitespaceTokenizer(new StringReader(test))), 3, "D");
Token tok = new Token();
boolean seenDogs = false;
while ((tok = nptf.next(tok)) != null){
if (tok.termText().equals("dogs")){
final Token reusableToken = new Token();
for (Token nextToken = nptf.next(reusableToken); nextToken != null; nextToken = nptf.next(reusableToken)) {
if (nextToken.term().equals("dogs")){
seenDogs = true;
assertTrue(tok.type() + " is not equal to " + "D", tok.type().equals("D") == true);
assertTrue("tok.getPayload() is null and it shouldn't be", tok.getPayload() != null);
byte [] bytes = tok.getPayload().getData();//safe here to just use the bytes, otherwise we should use offset, length
assertTrue(bytes.length + " does not equal: " + tok.getPayload().length(), bytes.length == tok.getPayload().length());
assertTrue(tok.getPayload().getOffset() + " does not equal: " + 0, tok.getPayload().getOffset() == 0);
assertTrue(nextToken.type() + " is not equal to " + "D", nextToken.type().equals("D") == true);
assertTrue("nextToken.getPayload() is null and it shouldn't be", nextToken.getPayload() != null);
byte [] bytes = nextToken.getPayload().getData();//safe here to just use the bytes, otherwise we should use offset, length
assertTrue(bytes.length + " does not equal: " + nextToken.getPayload().length(), bytes.length == nextToken.getPayload().length());
assertTrue(nextToken.getPayload().getOffset() + " does not equal: " + 0, nextToken.getPayload().getOffset() == 0);
float pay = PayloadHelper.decodeFloat(bytes);
assertTrue(pay + " does not equal: " + 3, pay == 3);
} else {
assertTrue(tok.type() + " is not null and it should be", tok.type().equals("word"));
assertTrue(nextToken.type() + " is not null and it should be", nextToken.type().equals("word"));
}
}
assertTrue(seenDogs + " does not equal: " + true, seenDogs == true);
@@ -67,12 +67,13 @@ public class NumericPayloadTokenFilterTest extends TestCase {
super(input);
}
public Token next(Token result) throws IOException {
result = input.next(result);
if (result != null && result.termText().equals("dogs")) {
result.setType("D");
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
Token nextToken = input.next(reusableToken);
if (nextToken != null && nextToken.term().equals("dogs")) {
nextToken.setType("D");
}
return result;
return nextToken;
}
}
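The producer side of the same contract, as the WordTokenFilter change above shows, is to pass the caller's reusable Token down the chain, mutate it in place, and return it. A minimal sketch, assuming an invented filter name (LowerCaseTypeFilter is not code from this commit):

import java.io.IOException;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;

// Sketch of a filter under the reuse API: delegate with the caller's token, adjust it, return it.
class LowerCaseTypeFilter extends TokenFilter {
  LowerCaseTypeFilter(TokenStream input) {
    super(input);
  }

  public Token next(final Token reusableToken) throws IOException {
    assert reusableToken != null;
    Token nextToken = input.next(reusableToken);
    if (nextToken != null) {
      nextToken.setType(nextToken.type().toLowerCase());
    }
    return nextToken;
  }
}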
@@ -42,17 +42,17 @@ public class TokenOffsetPayloadTokenFilterTest extends TestCase {
String test = "The quick red fox jumped over the lazy brown dogs";
TokenOffsetPayloadTokenFilter nptf = new TokenOffsetPayloadTokenFilter(new WhitespaceTokenizer(new StringReader(test)));
Token tok = new Token();
int count = 0;
while ((tok = nptf.next(tok)) != null){
assertTrue("tok is null and it shouldn't be", tok != null);
Payload pay = tok.getPayload();
final Token reusableToken = new Token();
for (Token nextToken = nptf.next(reusableToken); nextToken != null; nextToken = nptf.next(reusableToken)) {
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
Payload pay = nextToken.getPayload();
assertTrue("pay is null and it shouldn't be", pay != null);
byte [] data = pay.getData();
int start = PayloadHelper.decodeInt(data, 0);
assertTrue(start + " does not equal: " + tok.startOffset(), start == tok.startOffset());
assertTrue(start + " does not equal: " + nextToken.startOffset(), start == nextToken.startOffset());
int end = PayloadHelper.decodeInt(data, 4);
assertTrue(end + " does not equal: " + tok.endOffset(), end == tok.endOffset());
assertTrue(end + " does not equal: " + nextToken.endOffset(), end == nextToken.endOffset());
count++;
}
assertTrue(count + " does not equal: " + 10, count == 10);
@@ -44,14 +44,14 @@ public class TypeAsPayloadTokenFilterTest extends TestCase {
String test = "The quick red fox jumped over the lazy brown dogs";
TypeAsPayloadTokenFilter nptf = new TypeAsPayloadTokenFilter(new WordTokenFilter(new WhitespaceTokenizer(new StringReader(test))));
Token tok = new Token();
int count = 0;
while ((tok = nptf.next(tok)) != null){
assertTrue(tok.type() + " is not null and it should be", tok.type().equals(String.valueOf(Character.toUpperCase(tok.termBuffer()[0]))));
assertTrue("tok.getPayload() is null and it shouldn't be", tok.getPayload() != null);
String type = new String(tok.getPayload().getData(), "UTF-8");
final Token reusableToken = new Token();
for (Token nextToken = nptf.next(reusableToken); nextToken != null; nextToken = nptf.next(reusableToken)) {
assertTrue(nextToken.type() + " is not null and it should be", nextToken.type().equals(String.valueOf(Character.toUpperCase(nextToken.termBuffer()[0]))));
assertTrue("nextToken.getPayload() is null and it shouldn't be", nextToken.getPayload() != null);
String type = new String(nextToken.getPayload().getData(), "UTF-8");
assertTrue("type is null and it shouldn't be", type != null);
assertTrue(type + " is not equal to " + tok.type(), type.equals(tok.type()) == true);
assertTrue(type + " is not equal to " + nextToken.type(), type.equals(nextToken.type()) == true);
count++;
}
assertTrue(count + " does not equal: " + 10, count == 10);
@@ -64,12 +64,13 @@ public class TypeAsPayloadTokenFilterTest extends TestCase {
public Token next(Token result) throws IOException {
result = input.next(result);
if (result != null) {
result.setType(String.valueOf(Character.toUpperCase(result.termBuffer()[0])));
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
Token nextToken = input.next(reusableToken);
if (nextToken != null) {
nextToken.setType(String.valueOf(Character.toUpperCase(nextToken.termBuffer()[0])));
}
return result;
return nextToken;
}
}
@@ -17,12 +17,17 @@ package org.apache.lucene.analysis.ru;
* limitations under the License.
*/
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import junit.framework.TestCase;
import java.io.*;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
/**
* Test case for RussianAnalyzer.
@@ -72,22 +77,26 @@ public class TestRussianAnalyzer extends TestCase
sampleUnicode,
RussianCharsets.UnicodeRussian);
final Token reusableToken = new Token();
final Token reusableSampleToken = new Token();
Token nextToken;
Token nextSampleToken;
for (;;)
{
Token token = in.next();
nextToken = in.next(reusableToken);
if (token == null)
if (nextToken == null)
{
break;
}
Token sampleToken = sample.next();
nextSampleToken = sample.next(reusableSampleToken);
assertEquals(
"Unicode",
token.termText(),
sampleToken == null
nextToken.term(),
nextSampleToken == null
? null
: sampleToken.termText());
: nextSampleToken.term());
}
inWords.close();
@@ -109,22 +118,26 @@ public class TestRussianAnalyzer extends TestCase
sampleKOI8,
RussianCharsets.KOI8);
final Token reusableToken = new Token();
final Token reusableSampleToken = new Token();
Token nextToken;
Token nextSampleToken;
for (;;)
{
Token token = in.next();
nextToken = in.next(reusableToken);
if (token == null)
if (nextToken == null)
{
break;
}
Token sampleToken = sample.next();
nextSampleToken = sample.next(reusableSampleToken);
assertEquals(
"KOI8",
token.termText(),
sampleToken == null
nextToken.term(),
nextSampleToken == null
? null
: sampleToken.termText());
: nextSampleToken.term());
}
@@ -146,22 +159,26 @@ public class TestRussianAnalyzer extends TestCase
sample1251,
RussianCharsets.CP1251);
final Token reusableToken = new Token();
final Token reusableSampleToken = new Token();
Token nextToken;
Token nextSampleToken;
for (;;)
{
Token token = in.next();
nextToken = in.next(reusableToken);
if (token == null)
if (nextToken == null)
{
break;
}
Token sampleToken = sample.next();
nextSampleToken = sample.next(reusableSampleToken);
assertEquals(
"1251",
token.termText(),
sampleToken == null
nextToken.term(),
nextSampleToken == null
? null
: sampleToken.termText());
: nextSampleToken.term());
}
@@ -175,9 +192,10 @@ public class TestRussianAnalyzer extends TestCase
RussianAnalyzer ra = new RussianAnalyzer();
TokenStream stream = ra.tokenStream("", reader);
final Token reusableToken = new Token();
try {
assertEquals("text", stream.next().termText());
assertNotNull("RussianAnalyzer's tokenizer skips numbers from input text", stream.next());
assertEquals("text", stream.next(reusableToken).term());
assertNotNull("RussianAnalyzer's tokenizer skips numbers from input text", stream.next(reusableToken));
}
catch (IOException e)
{
@@ -156,11 +156,11 @@ public class ShingleAnalyzerWrapperTest extends TestCase {
TokenStream ts = analyzer.tokenStream("content",
new StringReader("this sentence"));
Token token;
int j = -1;
while ((token = ts.next()) != null) {
j += token.getPositionIncrement();
String termText = new String(token.termBuffer(), 0, token.termLength());
final Token reusableToken = new Token();
for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) {
j += nextToken.getPositionIncrement();
String termText = nextToken.term();
q.add(new Term("content", termText), j);
}
@@ -182,9 +182,9 @@ public class ShingleAnalyzerWrapperTest extends TestCase {
TokenStream ts = analyzer.tokenStream("content",
new StringReader("test sentence"));
Token token;
while ((token = ts.next()) != null) {
String termText = new String(token.termBuffer(), 0, token.termLength());
final Token reusableToken = new Token();
for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) {
String termText = nextToken.term();
q.add(new TermQuery(new Term("content", termText)),
BooleanClause.Occur.SHOULD);
}
@@ -35,7 +35,8 @@ public class ShingleFilterTest extends TestCase {
this.testToken = testToken;
}
public Token next() throws IOException {
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
if (index < testToken.length) {
return testToken[index++];
} else {
@@ -49,28 +50,28 @@
}
public static final Token[] TEST_TOKEN = new Token[] {
new Token("please", 0, 6),
new Token("divide", 7, 13),
new Token("this", 14, 18),
new Token("sentence", 19, 27),
new Token("into", 28, 32),
new Token("shingles", 33, 39),
createToken("please", 0, 6),
createToken("divide", 7, 13),
createToken("this", 14, 18),
createToken("sentence", 19, 27),
createToken("into", 28, 32),
createToken("shingles", 33, 39),
};
public static Token[] testTokenWithHoles;
public static final Token[] BI_GRAM_TOKENS = new Token[] {
new Token("please", 0, 6),
new Token("please divide", 0, 13),
new Token("divide", 7, 13),
new Token("divide this", 7, 18),
new Token("this", 14, 18),
new Token("this sentence", 14, 27),
new Token("sentence", 19, 27),
new Token("sentence into", 19, 32),
new Token("into", 28, 32),
new Token("into shingles", 28, 39),
new Token("shingles", 33, 39),
createToken("please", 0, 6),
createToken("please divide", 0, 13),
createToken("divide", 7, 13),
createToken("divide this", 7, 18),
createToken("this", 14, 18),
createToken("this sentence", 14, 27),
createToken("sentence", 19, 27),
createToken("sentence into", 19, 32),
createToken("into", 28, 32),
createToken("into shingles", 28, 39),
createToken("shingles", 33, 39),
};
public static final int[] BI_GRAM_POSITION_INCREMENTS = new int[] {
@@ -83,17 +84,17 @@
};
public static final Token[] BI_GRAM_TOKENS_WITH_HOLES = new Token[] {
new Token("please", 0, 6),
new Token("please divide", 0, 13),
new Token("divide", 7, 13),
new Token("divide _", 7, 19),
new Token("_", 19, 19),
new Token("_ sentence", 19, 27),
new Token("sentence", 19, 27),
new Token("sentence _", 19, 33),
new Token("_", 33, 33),
new Token("_ shingles", 33, 39),
new Token("shingles", 33, 39),
createToken("please", 0, 6),
createToken("please divide", 0, 13),
createToken("divide", 7, 13),
createToken("divide _", 7, 19),
createToken("_", 19, 19),
createToken("_ sentence", 19, 27),
createToken("sentence", 19, 27),
createToken("sentence _", 19, 33),
createToken("_", 33, 33),
createToken("_ shingles", 33, 39),
createToken("shingles", 33, 39),
};
public static final int[] BI_GRAM_POSITION_INCREMENTS_WITH_HOLES = new int[] {
@@ -101,21 +102,21 @@
};
public static final Token[] TRI_GRAM_TOKENS = new Token[] {
new Token("please", 0, 6),
new Token("please divide", 0, 13),
new Token("please divide this", 0, 18),
new Token("divide", 7, 13),
new Token("divide this", 7, 18),
new Token("divide this sentence", 7, 27),
new Token("this", 14, 18),
new Token("this sentence", 14, 27),
new Token("this sentence into", 14, 32),
new Token("sentence", 19, 27),
new Token("sentence into", 19, 32),
new Token("sentence into shingles", 19, 39),
new Token("into", 28, 32),
new Token("into shingles", 28, 39),
new Token("shingles", 33, 39)
createToken("please", 0, 6),
createToken("please divide", 0, 13),
createToken("please divide this", 0, 18),
createToken("divide", 7, 13),
createToken("divide this", 7, 18),
createToken("divide this sentence", 7, 27),
createToken("this", 14, 18),
createToken("this sentence", 14, 27),
createToken("this sentence into", 14, 32),
createToken("sentence", 19, 27),
createToken("sentence into", 19, 32),
createToken("sentence into shingles", 19, 39),
createToken("into", 28, 32),
createToken("into shingles", 28, 39),
createToken("shingles", 33, 39)
};
public static final int[] TRI_GRAM_POSITION_INCREMENTS = new int[] {
@@ -135,10 +136,10 @@
protected void setUp() throws Exception {
super.setUp();
testTokenWithHoles = new Token[] {
new Token("please", 0, 6),
new Token("divide", 7, 13),
new Token("sentence", 19, 27),
new Token("shingles", 33, 39),
createToken("please", 0, 6),
createToken("divide", 7, 13),
createToken("sentence", 19, 27),
createToken("shingles", 33, 39),
};
testTokenWithHoles[2].setPositionIncrement(2);
@@ -168,22 +169,27 @@
throws IOException {
TokenStream filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize);
Token token;
int i = 0;
while ((token = filter.next()) != null) {
String termText = new String(token.termBuffer(), 0, token.termLength());
String goldText
= new String(tokensToCompare[i].termBuffer(), 0, tokensToCompare[i].termLength());
final Token reusableToken = new Token();
for (Token nextToken = filter.next(reusableToken); nextToken != null; nextToken = filter.next(reusableToken)) {
String termText = nextToken.term();
String goldText = tokensToCompare[i].term();
assertEquals("Wrong termText", goldText, termText);
assertEquals("Wrong startOffset for token \"" + termText + "\"",
tokensToCompare[i].startOffset(), token.startOffset());
tokensToCompare[i].startOffset(), nextToken.startOffset());
assertEquals("Wrong endOffset for token \"" + termText + "\"",
tokensToCompare[i].endOffset(), token.endOffset());
tokensToCompare[i].endOffset(), nextToken.endOffset());
assertEquals("Wrong positionIncrement for token \"" + termText + "\"",
positionIncrements[i], token.getPositionIncrement());
assertEquals("Wrong type for token \"" + termText + "\"", types[i], token.type());
positionIncrements[i], nextToken.getPositionIncrement());
assertEquals("Wrong type for token \"" + termText + "\"", types[i], nextToken.type());
i++;
}
}
private static Token createToken(String term, int start, int offset)
{
Token token = new Token(start, offset);
token.setTermBuffer(term);
return token;
}
}
@@ -40,29 +40,23 @@ public class TestShingleMatrixFilter extends TestCase {
ShingleMatrixFilter.defaultSettingsCodec = null;
Token token = new Token(); // for debug use only
TokenStream ts;
ts = new ShingleMatrixFilter(new EmptyTokenStream(), 1, 2, ' ', false, new ShingleMatrixFilter.OneDimensionalNonWeightedTokenSettingsCodec());
assertNull(ts.next());
assertNull(ts.next(new Token()));
TokenListStream tls;
LinkedList<Token> tokens;
// test a plain old token stream with synonyms tranlated to rows.
// test a plain old token stream with synonyms translated to rows.
tokens = new LinkedList<Token>();
tokens.add(new Token("please", 0, 6));
tokens.add(new Token("divide", 7, 13));
tokens.add(new Token("this", 14, 18));
tokens.add(new Token("sentence", 19, 27));
tokens.add(new Token("into", 28, 32));
tokens.add(new Token("shingles", 33, 39));
tokens.add(createToken("please", 0, 6));
tokens.add(createToken("divide", 7, 13));
tokens.add(createToken("this", 14, 18));
tokens.add(createToken("sentence", 19, 27));
tokens.add(createToken("into", 28, 32));
tokens.add(createToken("shingles", 33, 39));
tls = new TokenListStream(tokens);
@@ -70,20 +64,22 @@ public class TestShingleMatrixFilter extends TestCase {
ts = new ShingleMatrixFilter(tls, 1, 2, ' ', false, new ShingleMatrixFilter.OneDimensionalNonWeightedTokenSettingsCodec());
assertNext(ts, "please", 0, 6);
assertNext(ts, "please divide", 0, 13);
assertNext(ts, "divide", 7, 13);
assertNext(ts, "divide this", 7, 18);
assertNext(ts, "this", 14, 18);
assertNext(ts, "this sentence", 14, 27);
assertNext(ts, "sentence", 19, 27);
assertNext(ts, "sentence into", 19, 32);
assertNext(ts, "into", 28, 32);
assertNext(ts, "into shingles", 28, 39);
assertNext(ts, "shingles", 33, 39);
Token reusableToken = new Token();
assertNext(ts, reusableToken, "please", 0, 6);
assertNext(ts, reusableToken, "please divide", 0, 13);
assertNext(ts, reusableToken, "divide", 7, 13);
assertNext(ts, reusableToken, "divide this", 7, 18);
assertNext(ts, reusableToken, "this", 14, 18);
assertNext(ts, reusableToken, "this sentence", 14, 27);
assertNext(ts, reusableToken, "sentence", 19, 27);
assertNext(ts, reusableToken, "sentence into", 19, 32);
assertNext(ts, reusableToken, "into", 28, 32);
assertNext(ts, reusableToken, "into shingles", 28, 39);
assertNext(ts, reusableToken, "shingles", 33, 39);
assertNull(ts.next());
assertNull(ts.next(reusableToken));
}
@@ -95,9 +91,6 @@ public class TestShingleMatrixFilter extends TestCase {
ShingleMatrixFilter.defaultSettingsCodec = null;//new ShingleMatrixFilter.SimpleThreeDimensionalTokenSettingsCodec();
Token token = new Token(); // for debug use only
TokenStream ts;
TokenListStream tls;
LinkedList<Token> tokens;
@@ -117,25 +110,26 @@ public class TestShingleMatrixFilter extends TestCase {
ts = new ShingleMatrixFilter(tls, 2, 2, '_', false, new ShingleMatrixFilter.TwoDimensionalNonWeightedSynonymTokenSettingsCodec());
assertNext(ts, "hello_world");
assertNext(ts, "greetings_world");
assertNext(ts, "hello_earth");
assertNext(ts, "greetings_earth");
assertNext(ts, "hello_tellus");
assertNext(ts, "greetings_tellus");
assertNull(ts.next());
final Token reusableToken = new Token();
assertNext(ts, reusableToken, "hello_world");
assertNext(ts, reusableToken, "greetings_world");
assertNext(ts, reusableToken, "hello_earth");
assertNext(ts, reusableToken, "greetings_earth");
assertNext(ts, reusableToken, "hello_tellus");
assertNext(ts, reusableToken, "greetings_tellus");
assertNull(ts.next(reusableToken));
// bi-grams with no spacer character, start offset, end offset
tls.reset();
ts = new ShingleMatrixFilter(tls, 2, 2, null, false, new ShingleMatrixFilter.TwoDimensionalNonWeightedSynonymTokenSettingsCodec());
assertNext(ts, "helloworld", 0, 10);
assertNext(ts, "greetingsworld", 0, 10);
assertNext(ts, "helloearth", 0, 10);
assertNext(ts, "greetingsearth", 0, 10);
assertNext(ts, "hellotellus", 0, 10);
assertNext(ts, "greetingstellus", 0, 10);
assertNull(ts.next());
assertNext(ts, reusableToken, "helloworld", 0, 10);
assertNext(ts, reusableToken, "greetingsworld", 0, 10);
assertNext(ts, reusableToken, "helloearth", 0, 10);
assertNext(ts, reusableToken, "greetingsearth", 0, 10);
assertNext(ts, reusableToken, "hellotellus", 0, 10);
assertNext(ts, reusableToken, "greetingstellus", 0, 10);
assertNull(ts.next(reusableToken));
// add ^_prefix_and_suffix_$
@@ -160,119 +154,119 @@ public class TestShingleMatrixFilter extends TestCase {
ts = new ShingleMatrixFilter(tls, 2, 2, '_', false);
//
// while ((token = ts.next(token)) != null) {
// System.out.println("assertNext(ts, \"" + token.termText() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
// for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) {
// System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
// token.clear();
// }
assertNext(ts, "^_hello", 1, 10.049875f, 0, 4);
assertNext(ts, "^_greetings", 1, 10.049875f, 0, 4);
assertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
assertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10);
assertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
assertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10);
assertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
assertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10);
assertNext(ts, "world_$", 1, 7.1414285f, 5, 10);
assertNext(ts, "earth_$", 1, 7.1414285f, 5, 10);
assertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10);
assertNull(ts.next());
assertNext(ts, reusableToken, "^_hello", 1, 10.049875f, 0, 4);
assertNext(ts, reusableToken, "^_greetings", 1, 10.049875f, 0, 4);
assertNext(ts, reusableToken, "hello_world", 1, 1.4142135f, 0, 10);
assertNext(ts, reusableToken, "greetings_world", 1, 1.4142135f, 0, 10);
assertNext(ts, reusableToken, "hello_earth", 1, 1.4142135f, 0, 10);
assertNext(ts, reusableToken, "greetings_earth", 1, 1.4142135f, 0, 10);
assertNext(ts, reusableToken, "hello_tellus", 1, 1.4142135f, 0, 10);
assertNext(ts, reusableToken, "greetings_tellus", 1, 1.4142135f, 0, 10);
assertNext(ts, reusableToken, "world_$", 1, 7.1414285f, 5, 10);
assertNext(ts, reusableToken, "earth_$", 1, 7.1414285f, 5, 10);
assertNext(ts, reusableToken, "tellus_$", 1, 7.1414285f, 5, 10);
assertNull(ts.next(reusableToken));
// test unlimited size and allow single boundary token as shingle
tls.reset();
ts = new ShingleMatrixFilter(tls, 1, Integer.MAX_VALUE, '_', false);
//
// while ((token = ts.next(token)) != null) {
// System.out.println("assertNext(ts, \"" + token.termText() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
// for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) {
// System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
// token.clear();
// }
assertNext(ts, "^", 1, 10.0f, 0, 0);
assertNext(ts, "^_hello", 1, 10.049875f, 0, 4);
assertNext(ts, "^_hello_world", 1, 10.099504f, 0, 10);
assertNext(ts, "^_hello_world_$", 1, 12.328828f, 0, 10);
assertNext(ts, "hello", 1, 1.0f, 0, 4);
assertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
assertNext(ts, "hello_world_$", 1, 7.2111025f, 0, 10);
assertNext(ts, "world", 1, 1.0f, 5, 10);
assertNext(ts, "world_$", 1, 7.1414285f, 5, 10);
assertNext(ts, "$", 1, 7.071068f, 10, 10);
assertNext(ts, "^_greetings", 1, 10.049875f, 0, 4);
assertNext(ts, "^_greetings_world", 1, 10.099504f, 0, 10);
assertNext(ts, "^_greetings_world_$", 1, 12.328828f, 0, 10);
assertNext(ts, "greetings", 1, 1.0f, 0, 4);
assertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10);
assertNext(ts, "greetings_world_$", 1, 7.2111025f, 0, 10);
assertNext(ts, "^_hello_earth", 1, 10.099504f, 0, 10);
assertNext(ts, "^_hello_earth_$", 1, 12.328828f, 0, 10);
assertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
assertNext(ts, "hello_earth_$", 1, 7.2111025f, 0, 10);
assertNext(ts, "earth", 1, 1.0f, 5, 10);
assertNext(ts, "earth_$", 1, 7.1414285f, 5, 10);
assertNext(ts, "^_greetings_earth", 1, 10.099504f, 0, 10);
assertNext(ts, "^_greetings_earth_$", 1, 12.328828f, 0, 10);
assertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10);
assertNext(ts, "greetings_earth_$", 1, 7.2111025f, 0, 10);
assertNext(ts, "^_hello_tellus", 1, 10.099504f, 0, 10);
assertNext(ts, "^_hello_tellus_$", 1, 12.328828f, 0, 10);
assertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
assertNext(ts, "hello_tellus_$", 1, 7.2111025f, 0, 10);
assertNext(ts, "tellus", 1, 1.0f, 5, 10);
assertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10);
assertNext(ts, "^_greetings_tellus", 1, 10.099504f, 0, 10);
assertNext(ts, "^_greetings_tellus_$", 1, 12.328828f, 0, 10);
assertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10);
assertNext(ts, "greetings_tellus_$", 1, 7.2111025f, 0, 10);
assertNext(ts, reusableToken, "^", 1, 10.0f, 0, 0);
assertNext(ts, reusableToken, "^_hello", 1, 10.049875f, 0, 4);
assertNext(ts, reusableToken, "^_hello_world", 1, 10.099504f, 0, 10);
assertNext(ts, reusableToken, "^_hello_world_$", 1, 12.328828f, 0, 10);
assertNext(ts, reusableToken, "hello", 1, 1.0f, 0, 4);
assertNext(ts, reusableToken, "hello_world", 1, 1.4142135f, 0, 10);
assertNext(ts, reusableToken, "hello_world_$", 1, 7.2111025f, 0, 10);
assertNext(ts, reusableToken, "world", 1, 1.0f, 5, 10);
assertNext(ts, reusableToken, "world_$", 1, 7.1414285f, 5, 10);
assertNext(ts, reusableToken, "$", 1, 7.071068f, 10, 10);
assertNext(ts, reusableToken, "^_greetings", 1, 10.049875f, 0, 4);
assertNext(ts, reusableToken, "^_greetings_world", 1, 10.099504f, 0, 10);
assertNext(ts, reusableToken, "^_greetings_world_$", 1, 12.328828f, 0, 10);
assertNext(ts, reusableToken, "greetings", 1, 1.0f, 0, 4);
assertNext(ts, reusableToken, "greetings_world", 1, 1.4142135f, 0, 10);
assertNext(ts, reusableToken, "greetings_world_$", 1, 7.2111025f, 0, 10);
assertNext(ts, reusableToken, "^_hello_earth", 1, 10.099504f, 0, 10);
assertNext(ts, reusableToken, "^_hello_earth_$", 1, 12.328828f, 0, 10);
assertNext(ts, reusableToken, "hello_earth", 1, 1.4142135f, 0, 10);
assertNext(ts, reusableToken, "hello_earth_$", 1, 7.2111025f, 0, 10);
assertNext(ts, reusableToken, "earth", 1, 1.0f, 5, 10);
assertNext(ts, reusableToken, "earth_$", 1, 7.1414285f, 5, 10);
assertNext(ts, reusableToken, "^_greetings_earth", 1, 10.099504f, 0, 10);
assertNext(ts, reusableToken, "^_greetings_earth_$", 1, 12.328828f, 0, 10);
assertNext(ts, reusableToken, "greetings_earth", 1, 1.4142135f, 0, 10);
assertNext(ts, reusableToken, "greetings_earth_$", 1, 7.2111025f, 0, 10);
assertNext(ts, reusableToken, "^_hello_tellus", 1, 10.099504f, 0, 10);
assertNext(ts, reusableToken, "^_hello_tellus_$", 1, 12.328828f, 0, 10);
assertNext(ts, reusableToken, "hello_tellus", 1, 1.4142135f, 0, 10);
assertNext(ts, reusableToken, "hello_tellus_$", 1, 7.2111025f, 0, 10);
assertNext(ts, reusableToken, "tellus", 1, 1.0f, 5, 10);
assertNext(ts, reusableToken, "tellus_$", 1, 7.1414285f, 5, 10);
assertNext(ts, reusableToken, "^_greetings_tellus", 1, 10.099504f, 0, 10);
assertNext(ts, reusableToken, "^_greetings_tellus_$", 1, 12.328828f, 0, 10);
assertNext(ts, reusableToken, "greetings_tellus", 1, 1.4142135f, 0, 10);
assertNext(ts, reusableToken, "greetings_tellus_$", 1, 7.2111025f, 0, 10);
assertNull(ts.next());
assertNull(ts.next(reusableToken));
// test unlimited size but don't allow single boundary token as shingle
tls.reset();
ts = new ShingleMatrixFilter(tls, 1, Integer.MAX_VALUE, '_', true);
// while ((token = ts.next(token)) != null) {
// System.out.println("assertNext(ts, \"" + token.termText() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
// for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) {
// System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
// token.clear();
// }
assertNext(ts, "^_hello", 1, 10.049875f, 0, 4);
assertNext(ts, "^_hello_world", 1, 10.099504f, 0, 10);
assertNext(ts, "^_hello_world_$", 1, 12.328828f, 0, 10);
assertNext(ts, "hello", 1, 1.0f, 0, 4);
assertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
assertNext(ts, "hello_world_$", 1, 7.2111025f, 0, 10);
assertNext(ts, "world", 1, 1.0f, 5, 10);
assertNext(ts, "world_$", 1, 7.1414285f, 5, 10);
assertNext(ts, "^_greetings", 1, 10.049875f, 0, 4);
assertNext(ts, "^_greetings_world", 1, 10.099504f, 0, 10);
assertNext(ts, "^_greetings_world_$", 1, 12.328828f, 0, 10);
assertNext(ts, "greetings", 1, 1.0f, 0, 4);
assertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10);
assertNext(ts, "greetings_world_$", 1, 7.2111025f, 0, 10);
assertNext(ts, "^_hello_earth", 1, 10.099504f, 0, 10);
assertNext(ts, "^_hello_earth_$", 1, 12.328828f, 0, 10);
assertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
assertNext(ts, "hello_earth_$", 1, 7.2111025f, 0, 10);
assertNext(ts, "earth", 1, 1.0f, 5, 10);
assertNext(ts, "earth_$", 1, 7.1414285f, 5, 10);
assertNext(ts, "^_greetings_earth", 1, 10.099504f, 0, 10);
assertNext(ts, "^_greetings_earth_$", 1, 12.328828f, 0, 10);
assertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10);
assertNext(ts, "greetings_earth_$", 1, 7.2111025f, 0, 10);
assertNext(ts, "^_hello_tellus", 1, 10.099504f, 0, 10);
assertNext(ts, "^_hello_tellus_$", 1, 12.328828f, 0, 10);
assertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
assertNext(ts, "hello_tellus_$", 1, 7.2111025f, 0, 10);
assertNext(ts, "tellus", 1, 1.0f, 5, 10);
assertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10);
assertNext(ts, "^_greetings_tellus", 1, 10.099504f, 0, 10);
assertNext(ts, "^_greetings_tellus_$", 1, 12.328828f, 0, 10);
assertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10);
assertNext(ts, "greetings_tellus_$", 1, 7.2111025f, 0, 10);
assertNext(ts, reusableToken, "^_hello", 1, 10.049875f, 0, 4);
assertNext(ts, reusableToken, "^_hello_world", 1, 10.099504f, 0, 10);
assertNext(ts, reusableToken, "^_hello_world_$", 1, 12.328828f, 0, 10);
assertNext(ts, reusableToken, "hello", 1, 1.0f, 0, 4);
assertNext(ts, reusableToken, "hello_world", 1, 1.4142135f, 0, 10);
assertNext(ts, reusableToken, "hello_world_$", 1, 7.2111025f, 0, 10);
assertNext(ts, reusableToken, "world", 1, 1.0f, 5, 10);
assertNext(ts, reusableToken, "world_$", 1, 7.1414285f, 5, 10);
assertNext(ts, reusableToken, "^_greetings", 1, 10.049875f, 0, 4);
assertNext(ts, reusableToken, "^_greetings_world", 1, 10.099504f, 0, 10);
assertNext(ts, reusableToken, "^_greetings_world_$", 1, 12.328828f, 0, 10);
assertNext(ts, reusableToken, "greetings", 1, 1.0f, 0, 4);
assertNext(ts, reusableToken, "greetings_world", 1, 1.4142135f, 0, 10);
assertNext(ts, reusableToken, "greetings_world_$", 1, 7.2111025f, 0, 10);
assertNext(ts, reusableToken, "^_hello_earth", 1, 10.099504f, 0, 10);
assertNext(ts, reusableToken, "^_hello_earth_$", 1, 12.328828f, 0, 10);
assertNext(ts, reusableToken, "hello_earth", 1, 1.4142135f, 0, 10);
assertNext(ts, reusableToken, "hello_earth_$", 1, 7.2111025f, 0, 10);
assertNext(ts, reusableToken, "earth", 1, 1.0f, 5, 10);
assertNext(ts, reusableToken, "earth_$", 1, 7.1414285f, 5, 10);
assertNext(ts, reusableToken, "^_greetings_earth", 1, 10.099504f, 0, 10);
assertNext(ts, reusableToken, "^_greetings_earth_$", 1, 12.328828f, 0, 10);
assertNext(ts, reusableToken, "greetings_earth", 1, 1.4142135f, 0, 10);
assertNext(ts, reusableToken, "greetings_earth_$", 1, 7.2111025f, 0, 10);
assertNext(ts, reusableToken, "^_hello_tellus", 1, 10.099504f, 0, 10);
assertNext(ts, reusableToken, "^_hello_tellus_$", 1, 12.328828f, 0, 10);
assertNext(ts, reusableToken, "hello_tellus", 1, 1.4142135f, 0, 10);
assertNext(ts, reusableToken, "hello_tellus_$", 1, 7.2111025f, 0, 10);
assertNext(ts, reusableToken, "tellus", 1, 1.0f, 5, 10);
assertNext(ts, reusableToken, "tellus_$", 1, 7.1414285f, 5, 10);
assertNext(ts, reusableToken, "^_greetings_tellus", 1, 10.099504f, 0, 10);
assertNext(ts, reusableToken, "^_greetings_tellus_$", 1, 12.328828f, 0, 10);
assertNext(ts, reusableToken, "greetings_tellus", 1, 1.4142135f, 0, 10);
assertNext(ts, reusableToken, "greetings_tellus_$", 1, 7.2111025f, 0, 10);
assertNull(ts.next());
assertNull(ts.next(reusableToken));
System.currentTimeMillis();
@@ -300,27 +294,27 @@ public class TestShingleMatrixFilter extends TestCase {
ts = new ShingleMatrixFilter(tls, 2, 3, '_', false);
// while ((token = ts.next(token)) != null) {
// System.out.println("assertNext(ts, \"" + token.termText() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
// for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) {
// System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
// token.clear();
// }
// shingle, position increment, weight, start offset, end offset
assertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
assertNext(ts, "greetings_and", 1, 1.4142135f, 0, 4);
assertNext(ts, "greetings_and_salutations", 1, 1.7320508f, 0, 4);
assertNext(ts, "and_salutations", 1, 1.4142135f, 0, 4);
assertNext(ts, "and_salutations_world", 1, 1.7320508f, 0, 10);
assertNext(ts, "salutations_world", 1, 1.4142135f, 0, 10);
assertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
assertNext(ts, "and_salutations_earth", 1, 1.7320508f, 0, 10);
assertNext(ts, "salutations_earth", 1, 1.4142135f, 0, 10);
assertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
assertNext(ts, "and_salutations_tellus", 1, 1.7320508f, 0, 10);
assertNext(ts, "salutations_tellus", 1, 1.4142135f, 0, 10);
assertNext(ts, reusableToken, "hello_world", 1, 1.4142135f, 0, 10);
assertNext(ts, reusableToken, "greetings_and", 1, 1.4142135f, 0, 4);
assertNext(ts, reusableToken, "greetings_and_salutations", 1, 1.7320508f, 0, 4);
assertNext(ts, reusableToken, "and_salutations", 1, 1.4142135f, 0, 4);
assertNext(ts, reusableToken, "and_salutations_world", 1, 1.7320508f, 0, 10);
assertNext(ts, reusableToken, "salutations_world", 1, 1.4142135f, 0, 10);
assertNext(ts, reusableToken, "hello_earth", 1, 1.4142135f, 0, 10);
assertNext(ts, reusableToken, "and_salutations_earth", 1, 1.7320508f, 0, 10);
assertNext(ts, reusableToken, "salutations_earth", 1, 1.4142135f, 0, 10);
assertNext(ts, reusableToken, "hello_tellus", 1, 1.4142135f, 0, 10);
assertNext(ts, reusableToken, "and_salutations_tellus", 1, 1.7320508f, 0, 10);
assertNext(ts, reusableToken, "salutations_tellus", 1, 1.4142135f, 0, 10);
assertNull(ts.next());
assertNull(ts.next(reusableToken));
System.currentTimeMillis();
@@ -361,53 +355,53 @@ public class TestShingleMatrixFilter extends TestCase {
TokenStream ts = new ShingleMatrixFilter(matrix, 2, 4, '_', true, new ShingleMatrixFilter.SimpleThreeDimensionalTokenSettingsCodec());
// Token token = new Token();
// while ((token = ts.next(token)) != null) {
// System.out.println("assertNext(ts, \"" + token.termText() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
// for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) {
// System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
// token.clear();
// }
assertNext(ts, "no_surprise", 1, 1.4142135f, 0, 0);
assertNext(ts, "no_surprise_to", 1, 1.7320508f, 0, 0);
assertNext(ts, "no_surprise_to_see", 1, 2.0f, 0, 0);
assertNext(ts, "surprise_to", 1, 1.4142135f, 0, 0);
assertNext(ts, "surprise_to_see", 1, 1.7320508f, 0, 0);
assertNext(ts, "surprise_to_see_england", 1, 2.0f, 0, 0);
assertNext(ts, "to_see", 1, 1.4142135f, 0, 0);
assertNext(ts, "to_see_england", 1, 1.7320508f, 0, 0);
assertNext(ts, "to_see_england_manager", 1, 2.0f, 0, 0);
assertNext(ts, "see_england", 1, 1.4142135f, 0, 0);
assertNext(ts, "see_england_manager", 1, 1.7320508f, 0, 0);
assertNext(ts, "see_england_manager_svennis", 1, 2.0f, 0, 0);
assertNext(ts, "england_manager", 1, 1.4142135f, 0, 0);
assertNext(ts, "england_manager_svennis", 1, 1.7320508f, 0, 0);
assertNext(ts, "england_manager_svennis_in", 1, 2.0f, 0, 0);
assertNext(ts, "manager_svennis", 1, 1.4142135f, 0, 0);
assertNext(ts, "manager_svennis_in", 1, 1.7320508f, 0, 0);
assertNext(ts, "manager_svennis_in_the", 1, 2.0f, 0, 0);
assertNext(ts, "svennis_in", 1, 1.4142135f, 0, 0);
assertNext(ts, "svennis_in_the", 1, 1.7320508f, 0, 0);
assertNext(ts, "svennis_in_the_croud", 1, 2.0f, 0, 0);
assertNext(ts, "in_the", 1, 1.4142135f, 0, 0);
assertNext(ts, "in_the_croud", 1, 1.7320508f, 0, 0);
assertNext(ts, "the_croud", 1, 1.4142135f, 0, 0);
assertNext(ts, "see_england_manager_sven", 1, 2.0f, 0, 0);
assertNext(ts, "england_manager_sven", 1, 1.7320508f, 0, 0);
assertNext(ts, "england_manager_sven_göran", 1, 2.0f, 0, 0);
assertNext(ts, "manager_sven", 1, 1.4142135f, 0, 0);
assertNext(ts, "manager_sven_göran", 1, 1.7320508f, 0, 0);
assertNext(ts, "manager_sven_göran_eriksson", 1, 2.0f, 0, 0);
assertNext(ts, "sven_göran", 1, 1.4142135f, 0, 0);
assertNext(ts, "sven_göran_eriksson", 1, 1.7320508f, 0, 0);
assertNext(ts, "sven_göran_eriksson_in", 1, 2.0f, 0, 0);
assertNext(ts, "göran_eriksson", 1, 1.4142135f, 0, 0);
assertNext(ts, "göran_eriksson_in", 1, 1.7320508f, 0, 0);
assertNext(ts, "göran_eriksson_in_the", 1, 2.0f, 0, 0);
assertNext(ts, "eriksson_in", 1, 1.4142135f, 0, 0);
assertNext(ts, "eriksson_in_the", 1, 1.7320508f, 0, 0);
assertNext(ts, "eriksson_in_the_croud", 1, 2.0f, 0, 0);
final Token reusableToken = new Token();
assertNext(ts, reusableToken, "no_surprise", 1, 1.4142135f, 0, 0);
assertNext(ts, reusableToken, "no_surprise_to", 1, 1.7320508f, 0, 0);
assertNext(ts, reusableToken, "no_surprise_to_see", 1, 2.0f, 0, 0);
assertNext(ts, reusableToken, "surprise_to", 1, 1.4142135f, 0, 0);
assertNext(ts, reusableToken, "surprise_to_see", 1, 1.7320508f, 0, 0);
assertNext(ts, reusableToken, "surprise_to_see_england", 1, 2.0f, 0, 0);
assertNext(ts, reusableToken, "to_see", 1, 1.4142135f, 0, 0);
assertNext(ts, reusableToken, "to_see_england", 1, 1.7320508f, 0, 0);
assertNext(ts, reusableToken, "to_see_england_manager", 1, 2.0f, 0, 0);
assertNext(ts, reusableToken, "see_england", 1, 1.4142135f, 0, 0);
assertNext(ts, reusableToken, "see_england_manager", 1, 1.7320508f, 0, 0);
assertNext(ts, reusableToken, "see_england_manager_svennis", 1, 2.0f, 0, 0);
assertNext(ts, reusableToken, "england_manager", 1, 1.4142135f, 0, 0);
assertNext(ts, reusableToken, "england_manager_svennis", 1, 1.7320508f, 0, 0);
assertNext(ts, reusableToken, "england_manager_svennis_in", 1, 2.0f, 0, 0);
assertNext(ts, reusableToken, "manager_svennis", 1, 1.4142135f, 0, 0);
assertNext(ts, reusableToken, "manager_svennis_in", 1, 1.7320508f, 0, 0);
assertNext(ts, reusableToken, "manager_svennis_in_the", 1, 2.0f, 0, 0);
assertNext(ts, reusableToken, "svennis_in", 1, 1.4142135f, 0, 0);
assertNext(ts, reusableToken, "svennis_in_the", 1, 1.7320508f, 0, 0);
assertNext(ts, reusableToken, "svennis_in_the_croud", 1, 2.0f, 0, 0);
assertNext(ts, reusableToken, "in_the", 1, 1.4142135f, 0, 0);
assertNext(ts, reusableToken, "in_the_croud", 1, 1.7320508f, 0, 0);
assertNext(ts, reusableToken, "the_croud", 1, 1.4142135f, 0, 0);
assertNext(ts, reusableToken, "see_england_manager_sven", 1, 2.0f, 0, 0);
assertNext(ts, reusableToken, "england_manager_sven", 1, 1.7320508f, 0, 0);
assertNext(ts, reusableToken, "england_manager_sven_göran", 1, 2.0f, 0, 0);
assertNext(ts, reusableToken, "manager_sven", 1, 1.4142135f, 0, 0);
assertNext(ts, reusableToken, "manager_sven_göran", 1, 1.7320508f, 0, 0);
assertNext(ts, reusableToken, "manager_sven_göran_eriksson", 1, 2.0f, 0, 0);
assertNext(ts, reusableToken, "sven_göran", 1, 1.4142135f, 0, 0);
assertNext(ts, reusableToken, "sven_göran_eriksson", 1, 1.7320508f, 0, 0);
assertNext(ts, reusableToken, "sven_göran_eriksson_in", 1, 2.0f, 0, 0);
assertNext(ts, reusableToken, "göran_eriksson", 1, 1.4142135f, 0, 0);
assertNext(ts, reusableToken, "göran_eriksson_in", 1, 1.7320508f, 0, 0);
assertNext(ts, reusableToken, "göran_eriksson_in_the", 1, 2.0f, 0, 0);
assertNext(ts, reusableToken, "eriksson_in", 1, 1.4142135f, 0, 0);
assertNext(ts, reusableToken, "eriksson_in_the", 1, 1.7320508f, 0, 0);
assertNext(ts, reusableToken, "eriksson_in_the_croud", 1, 2.0f, 0, 0);
assertNull(ts.next());
assertNull(ts.next(reusableToken));
}
@@ -417,11 +411,9 @@ public class TestShingleMatrixFilter extends TestCase {
private Token tokenFactory(String text, int posIncr, int startOffset, int endOffset) {
Token token = new Token();
token.setTermText(text);
Token token = new Token(startOffset, endOffset);
token.setTermBuffer(text);
token.setPositionIncrement(posIncr);
token.setStartOffset(startOffset);
token.setEndOffset(endOffset);
return token;
}
@@ -435,61 +427,64 @@ public class TestShingleMatrixFilter extends TestCase {
}
private Token tokenFactory(String text, int posIncr, float weight, int startOffset, int endOffset) {
Token token = new Token();
token.setTermText(text);
Token token = new Token(startOffset, endOffset);
token.setTermBuffer(text);
token.setPositionIncrement(posIncr);
ShingleMatrixFilter.defaultSettingsCodec.setWeight(token, weight);
token.setStartOffset(startOffset);
token.setEndOffset(endOffset);
return token;
}
private Token tokenFactory(String text, int posIncr, float weight, int startOffset, int endOffset, ShingleMatrixFilter.TokenPositioner positioner) {
Token token = new Token();
token.setTermText(text);
Token token = new Token(startOffset, endOffset);
token.setTermBuffer(text);
token.setPositionIncrement(posIncr);
ShingleMatrixFilter.defaultSettingsCodec.setWeight(token, weight);
token.setStartOffset(startOffset);
token.setEndOffset(endOffset);
ShingleMatrixFilter.defaultSettingsCodec.setTokenPositioner(token, positioner);
return token;
}
// assert-methods start here
private Token assertNext(TokenStream ts, String text) throws IOException {
Token token = ts.next(new Token());
assertNotNull(token);
assertEquals(text, new String(token.termBuffer(), 0, token.termLength()));
return token;
private Token assertNext(TokenStream ts, final Token reusableToken, String text) throws IOException {
Token nextToken = ts.next(reusableToken);
assertNotNull(nextToken);
assertEquals(text, nextToken.term());
return nextToken;
}
private Token assertNext(TokenStream ts, String text, int positionIncrement, float boost) throws IOException {
Token token = ts.next(new Token());
assertNotNull(token);
assertEquals(text, new String(token.termBuffer(), 0, token.termLength()));
assertEquals(positionIncrement, token.getPositionIncrement());
assertEquals(boost, token.getPayload() == null ? 1f : PayloadHelper.decodeFloat(token.getPayload().getData()));
return token;
private Token assertNext(TokenStream ts, final Token reusableToken, String text, int positionIncrement, float boost) throws IOException {
Token nextToken = ts.next(reusableToken);
assertNotNull(nextToken);
assertEquals(text, nextToken.term());
assertEquals(positionIncrement, nextToken.getPositionIncrement());
assertEquals(boost, nextToken.getPayload() == null ? 1f : PayloadHelper.decodeFloat(nextToken.getPayload().getData()));
return nextToken;
}
private Token assertNext(TokenStream ts, String text, int positionIncrement, float boost, int startOffset, int endOffset) throws IOException {
Token token = ts.next(new Token());
assertNotNull(token);
assertEquals(text, new String(token.termBuffer(), 0, token.termLength()));
assertEquals(positionIncrement, token.getPositionIncrement());
assertEquals(boost, token.getPayload() == null ? 1f : PayloadHelper.decodeFloat(token.getPayload().getData()));
assertEquals(startOffset, token.startOffset());
assertEquals(endOffset, token.endOffset());
return token;
private Token assertNext(TokenStream ts, final Token reusableToken, String text, int positionIncrement, float boost, int startOffset, int endOffset) throws IOException {
Token nextToken = ts.next(reusableToken);
assertNotNull(nextToken);
assertEquals(text, nextToken.term());
assertEquals(positionIncrement, nextToken.getPositionIncrement());
assertEquals(boost, nextToken.getPayload() == null ? 1f : PayloadHelper.decodeFloat(nextToken.getPayload().getData()));
assertEquals(startOffset, nextToken.startOffset());
assertEquals(endOffset, nextToken.endOffset());
return nextToken;
}
private Token assertNext(TokenStream ts, String text, int startOffset, int endOffset) throws IOException {
Token token = ts.next(new Token());
assertNotNull(token);
assertEquals(text, new String(token.termBuffer(), 0, token.termLength()));
assertEquals(startOffset, token.startOffset());
assertEquals(endOffset, token.endOffset());
private Token assertNext(TokenStream ts, final Token reusableToken, String text, int startOffset, int endOffset) throws IOException {
Token nextToken = ts.next(reusableToken);
assertNotNull(nextToken);
assertEquals(text, nextToken.term());
assertEquals(startOffset, nextToken.startOffset());
assertEquals(endOffset, nextToken.endOffset());
return nextToken;
}
private static Token createToken(String term, int start, int offset)
{
Token token = new Token(start, offset);
token.setTermBuffer(term);
return token;
}
@@ -500,9 +495,9 @@ public class TestShingleMatrixFilter extends TestCase {
public TokenListStream(TokenStream ts) throws IOException {
tokens = new ArrayList<Token>();
Token token;
while ((token = ts.next(new Token())) != null) {
tokens.add(token);
final Token reusableToken = new Token();
for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) {
tokens.add((Token) nextToken.clone());
}
}
@@ -512,14 +507,16 @@ public class TestShingleMatrixFilter extends TestCase {
private Iterator<Token> iterator;
public Token next() throws IOException {
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
if (iterator == null) {
iterator = tokens.iterator();
}
if (!iterator.hasNext()) {
return null;
}
return iterator.next();
Token nextToken = (Token) iterator.next();
return (Token) nextToken.clone();
}
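The clone() calls added to TokenListStream are the key detail of this pattern: next(Token) is free to refill the same instance on every call, so anything cached across calls must be a copy. A minimal sketch of that caching pattern, with an invented class name (TokenBuffer is illustrative, not part of this commit):

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

// Sketch: buffer full copies of each token, never the reusable scratch instance itself.
class TokenBuffer {
  static List fill(TokenStream ts) throws IOException {
    List tokens = new ArrayList();
    final Token reusableToken = new Token();
    for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) {
      tokens.add((Token) nextToken.clone()); // copy; nextToken may be overwritten by the next call
    }
    return tokens;
  }
}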
@@ -43,13 +43,13 @@ public class DateRecognizerSinkTokenizerTest extends TestCase {
DateRecognizerSinkTokenizer sink = new DateRecognizerSinkTokenizer(new SimpleDateFormat("MM/dd/yyyy"));
String test = "The quick red fox jumped over the lazy brown dogs on 7/11/2006 The dogs finally reacted on 7/12/2006";
TeeTokenFilter tee = new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(test)), sink);
Token tok = null;
int count = 0;
while ((tok = tee.next()) != null){
assertTrue("tok is null and it shouldn't be", tok != null);
if (tok.termBuffer()[0] == '7'){
assertTrue(tok.type() + " is not equal to " + DateRecognizerSinkTokenizer.DATE_TYPE,
tok.type().equals(DateRecognizerSinkTokenizer.DATE_TYPE) == true);
final Token reusableToken = new Token();
for (Token nextToken = tee.next(reusableToken); nextToken != null; nextToken = tee.next(reusableToken)) {
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
if (nextToken.termBuffer()[0] == '7'){
assertTrue(nextToken.type() + " is not equal to " + DateRecognizerSinkTokenizer.DATE_TYPE,
nextToken.type().equals(DateRecognizerSinkTokenizer.DATE_TYPE) == true);
}
count++;
}
@@ -42,10 +42,10 @@ public class TokenRangeSinkTokenizerTest extends TestCase {
TokenRangeSinkTokenizer rangeToks = new TokenRangeSinkTokenizer(2, 4);
String test = "The quick red fox jumped over the lazy brown dogs";
TeeTokenFilter tee = new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(test)), rangeToks);
Token tok = null;
int count = 0;
while ((tok = tee.next()) != null){
assertTrue("tok is null and it shouldn't be", tok != null);
final Token reusableToken = new Token();
for (Token nextToken = tee.next(reusableToken); nextToken != null; nextToken = tee.next(reusableToken)) {
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
count++;
}
assertTrue(count + " does not equal: " + 10, count == 10);
@@ -16,13 +16,17 @@ package org.apache.lucene.analysis.sinks;
* limitations under the License.
*/
import junit.framework.TestCase;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.payloads.NumericPayloadTokenFilter;
import java.io.IOException;
import java.io.StringReader;
import junit.framework.TestCase;
import org.apache.lucene.analysis.TeeTokenFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
public class TokenTypeSinkTokenizerTest extends TestCase {
@@ -42,14 +46,14 @@ public class TokenTypeSinkTokenizerTest extends TestCase {
String test = "The quick red fox jumped over the lazy brown dogs";
TeeTokenFilter ttf = new TeeTokenFilter(new WordTokenFilter(new WhitespaceTokenizer(new StringReader(test))), sink);
Token tok = new Token();
boolean seenDogs = false;
while ((tok = ttf.next(tok)) != null) {
if (tok.termText().equals("dogs")) {
final Token reusableToken = new Token();
for (Token nextToken = ttf.next(reusableToken); nextToken != null; nextToken = ttf.next(reusableToken)) {
if (nextToken.term().equals("dogs")) {
seenDogs = true;
assertTrue(tok.type() + " is not equal to " + "D", tok.type().equals("D") == true);
assertTrue(nextToken.type() + " is not equal to " + "D", nextToken.type().equals("D") == true);
} else {
assertTrue(tok.type() + " is not null and it should be", tok.type().equals("word"));
assertTrue(nextToken.type() + " is not null and it should be", nextToken.type().equals("word"));
}
}
assertTrue(seenDogs + " does not equal: " + true, seenDogs == true);
@@ -61,12 +65,13 @@ public class TokenTypeSinkTokenizerTest extends TestCase {
super(input);
}
public Token next(Token result) throws IOException {
result = input.next(result);
if (result != null && result.termText().equals("dogs")) {
result.setType("D");
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
Token nextToken = input.next(reusableToken);
if (nextToken != null && nextToken.term().equals("dogs")) {
nextToken.setType("D");
}
return result;
return nextToken;
}
}
}
@@ -36,13 +36,13 @@ public class TestThaiAnalyzer extends TestCase {
throws Exception {
TokenStream ts = a.tokenStream("dummy", new StringReader(input));
final Token reusableToken = new Token();
for (int i = 0; i < output.length; i++) {
Token t = ts.next();
assertNotNull(t);
assertEquals(t.termText(), output[i]);
Token nextToken = ts.next(reusableToken);
assertNotNull(nextToken);
assertEquals(nextToken.term(), output[i]);
}
assertNull(ts.next());
assertNull(ts.next(reusableToken));
ts.close();
}
@@ -22,6 +22,7 @@ import java.util.ArrayList;
import java.util.Iterator;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.util.PriorityQueue;
@@ -217,7 +218,7 @@ public class Highlighter
try
{
org.apache.lucene.analysis.Token token;
final Token reusableToken = new Token();
String tokenText;
int startOffset;
int endOffset;
@@ -225,10 +226,12 @@ public class Highlighter
textFragmenter.start(text);
TokenGroup tokenGroup=new TokenGroup();
token = tokenStream.next();
while ((token!= null)&&(token.startOffset()< maxDocCharsToAnalyze))
for (Token nextToken = tokenStream.next(reusableToken);
(nextToken!= null)&&(nextToken.startOffset()< maxDocCharsToAnalyze);
nextToken = tokenStream.next(reusableToken))
{
if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct(token)))
if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct(nextToken)))
{
//the current token is distinct from previous tokens -
// markup the cached token group info
@@ -244,7 +247,7 @@ public class Highlighter
tokenGroup.clear();
//check if current token marks the start of a new fragment
if(textFragmenter.isNewFragment(token))
if(textFragmenter.isNewFragment(nextToken))
{
currentFrag.setScore(fragmentScorer.getFragmentScore());
//record stats for a new fragment
@@ -255,13 +258,12 @@ public class Highlighter
}
}
tokenGroup.addToken(token,fragmentScorer.getTokenScore(token));
tokenGroup.addToken(nextToken,fragmentScorer.getTokenScore(nextToken));
// if(lastEndOffset>maxDocBytesToAnalyze)
// {
// break;
// }
token = tokenStream.next();
}
currentFrag.setScore(fragmentScorer.getFragmentScore());
@@ -106,7 +106,7 @@ public class QueryScorer implements Scorer
*/
public float getTokenScore(Token token)
{
String termText=token.termText();
String termText=token.term();
WeightedTerm queryTerm=(WeightedTerm) termsToFind.get(termText);
if(queryTerm==null)
@@ -62,7 +62,7 @@ public class SimpleSpanFragmenter implements Fragmenter {
return false;
}
WeightedSpanTerm wSpanTerm = spanScorer.getWeightedSpanTerm(new String(token.termBuffer(), 0, token.termLength()));
WeightedSpanTerm wSpanTerm = spanScorer.getWeightedSpanTerm(token.term());
if (wSpanTerm != null) {
List positionSpans = wSpanTerm.getPositionSpans();
@@ -121,7 +121,7 @@ public class SpanScorer implements Scorer {
*/
public float getTokenScore(Token token) {
position += token.getPositionIncrement();
String termText = new String(token.termBuffer(), 0, token.termLength());
String termText = token.term();
WeightedSpanTerm weightedSpanTerm;
@@ -61,7 +61,7 @@ public class TokenGroup
tot+=score;
}
}
tokens[numTokens]=token;
tokens[numTokens]= (Token) token.clone();
scores[numTokens]=score;
numTokens++;
}
@@ -147,8 +147,9 @@ public class TokenSources
{
this.tokens=tokens;
}
public Token next()
public Token next(final Token reusableToken)
{
assert reusableToken != null;
if(currentToken>=tokens.length)
{
return null;
@@ -160,6 +161,7 @@ public class TokenSources
String[] terms=tpv.getTerms();
int[] freq=tpv.getTermFrequencies();
int totalTokens=0;
Token newToken = new Token();
for (int t = 0; t < freq.length; t++)
{
totalTokens+=freq[t];
@@ -189,9 +191,8 @@ public class TokenSources
}
for (int tp = 0; tp < offsets.length; tp++)
{
unsortedTokens.add(new Token(terms[t],
offsets[tp].getStartOffset(),
offsets[tp].getEndOffset()));
newToken.reinit(terms[t], offsets[tp].getStartOffset(), offsets[tp].getEndOffset());
unsortedTokens.add(newToken.clone());
}
}
else
@@ -204,9 +205,8 @@ public class TokenSources
//tokens stored with positions - can use this to index straight into sorted array
for (int tp = 0; tp < pos.length; tp++)
{
tokensInOriginalOrder[pos[tp]]=new Token(terms[t],
offsets[tp].getStartOffset(),
offsets[tp].getEndOffset());
newToken.reinit(terms[t], offsets[tp].getStartOffset(), offsets[tp].getEndOffset());
tokensInOriginalOrder[pos[tp]] = (Token) newToken.clone();
}
}
}
@@ -261,7 +261,7 @@ public class TokenSources
}
return getTokenStream(field, contents, analyzer);
}
//conevenience method
//convenience method
public static TokenStream getTokenStream(String field, String contents, Analyzer analyzer){
return analyzer.tokenStream(field,new StringReader(contents));
}
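The reinit(term, startOffset, endOffset) calls above are the allocation-saving half of the reuse API: one scratch Token is re-initialized per term and cloned only when it must be stored. A small sketch with made-up term and offset data (ReinitExample is illustrative, not part of this commit):

import org.apache.lucene.analysis.Token;

// Sketch: reinit() refills an existing Token instead of allocating a new one per term.
class ReinitExample {
  public static void main(String[] args) {
    String[] terms = {"quick", "brown", "fox"}; // illustrative data
    int[] starts = {4, 10, 16};
    int[] ends = {9, 15, 19};

    Token scratch = new Token();
    for (int i = 0; i < terms.length; i++) {
      scratch.reinit(terms[i], starts[i], ends[i]);
      System.out.println(scratch.term() + " " + scratch.startOffset() + "-" + scratch.endOffset());
      // keep (Token) scratch.clone() instead if the token must outlive this iteration
    }
  }
}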
@@ -1127,21 +1127,22 @@ public class HighlighterTest extends TestCase implements Formatter {
{
lst = new ArrayList();
Token t;
t = new Token("hi", 0, 2);
t = createToken("hi", 0, 2);
lst.add(t);
t = new Token("hispeed", 0, 8);
t = createToken("hispeed", 0, 8);
lst.add(t);
t = new Token("speed", 3, 8);
t = createToken("speed", 3, 8);
t.setPositionIncrement(0);
lst.add(t);
t = new Token("10", 8, 10);
t = createToken("10", 8, 10);
lst.add(t);
t = new Token("foo", 11, 14);
t = createToken("foo", 11, 14);
lst.add(t);
iter = lst.iterator();
}
public Token next() throws IOException {
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
return iter.hasNext() ? (Token) iter.next() : null;
}
};
@ -1156,21 +1157,22 @@ public class HighlighterTest extends TestCase implements Formatter {
{
lst = new ArrayList();
Token t;
t = new Token("hispeed", 0, 8);
t = createToken("hispeed", 0, 8);
lst.add(t);
t = new Token("hi", 0, 2);
t = createToken("hi", 0, 2);
t.setPositionIncrement(0);
lst.add(t);
t = new Token("speed", 3, 8);
t = createToken("speed", 3, 8);
lst.add(t);
t = new Token("10", 8, 10);
t = createToken("10", 8, 10);
lst.add(t);
t = new Token("foo", 11, 14);
t = createToken("foo", 11, 14);
lst.add(t);
iter = lst.iterator();
}
public Token next() throws IOException {
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
return iter.hasNext() ? (Token) iter.next() : null;
}
};
@ -1407,6 +1409,13 @@ public class HighlighterTest extends TestCase implements Formatter {
super.tearDown();
}
private static Token createToken(String term, int start, int offset)
{
Token token = new Token(start, offset);
token.setTermBuffer(term);
return token;
}
}
// ===================================================================
@ -1453,31 +1462,32 @@ class SynonymTokenizer extends TokenStream {
this.synonyms = synonyms;
}
public Token next() throws IOException {
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
if (currentRealToken == null) {
Token nextRealToken = realStream.next();
Token nextRealToken = realStream.next(reusableToken);
if (nextRealToken == null) {
return null;
}
String expansions = (String) synonyms.get(nextRealToken.termText());
String expansions = (String) synonyms.get(nextRealToken.term());
if (expansions == null) {
return nextRealToken;
}
st = new StringTokenizer(expansions, ",");
if (st.hasMoreTokens()) {
currentRealToken = nextRealToken;
currentRealToken = (Token) nextRealToken.clone();
}
return currentRealToken;
} else {
String nextExpandedValue = st.nextToken();
Token expandedToken = new Token(nextExpandedValue, currentRealToken.startOffset(),
currentRealToken.endOffset());
expandedToken.setPositionIncrement(0);
reusableToken.reinit(st.nextToken(),
currentRealToken.startOffset(),
currentRealToken.endOffset());
reusableToken.setPositionIncrement(0);
if (!st.hasMoreTokens()) {
currentRealToken = null;
st = null;
}
return expandedToken;
return reusableToken;
}
}

View File

@ -520,12 +520,10 @@ public class InstantiatedIndexWriter {
} else {
tokenStream = analyzer.tokenStream(field.name(), new StringReader(field.stringValue()));
}
Token next = tokenStream.next();
while (next != null) {
next.setTermText(next.termText().intern()); // todo: not sure this needs to be interned?
tokens.add(next); // the vector will be built on commit.
next = tokenStream.next();
final Token reusableToken = new Token();
for (Token nextToken = tokenStream.next(reusableToken); nextToken != null; nextToken = tokenStream.next(reusableToken)) {
tokens.add((Token) nextToken.clone()); // the vector will be built on commit.
fieldSetting.fieldLength++;
if (fieldSetting.fieldLength > maxFieldLength) {
break;
@ -533,7 +531,10 @@ public class InstantiatedIndexWriter {
}
} else {
// untokenized
tokens.add(new Token(field.stringValue().intern(), 0, field.stringValue().length(), "untokenized"));
String fieldVal = field.stringValue();
Token token = new Token(0, fieldVal.length(), "untokenized");
token.setTermBuffer(fieldVal);
tokens.add(token);
fieldSetting.fieldLength++;
}
}
@ -567,10 +568,10 @@ public class InstantiatedIndexWriter {
for (Token token : eField_Tokens.getValue()) {
TermDocumentInformationFactory termDocumentInformationFactory = termDocumentInformationFactoryByTermText.get(token.termText());
TermDocumentInformationFactory termDocumentInformationFactory = termDocumentInformationFactoryByTermText.get(token.term());
if (termDocumentInformationFactory == null) {
termDocumentInformationFactory = new TermDocumentInformationFactory();
termDocumentInformationFactoryByTermText.put(token.termText(), termDocumentInformationFactory);
termDocumentInformationFactoryByTermText.put(token.term(), termDocumentInformationFactory);
}
//termDocumentInformationFactory.termFrequency++;

View File
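For untokenized fields the writer now builds the Token with the offset/type constructor plus setTermBuffer(String) rather than the String-taking constructor. The same construction in isolation, as a small sketch (the helper name and type label are illustrative only):

import org.apache.lucene.analysis.Token;

// Builds a single token covering an entire (hypothetical) field value.
static Token untokenizedToken(String value) {
  Token token = new Token(0, value.length(), "untokenized");
  token.setTermBuffer(value);
  return token;
}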

@ -15,19 +15,32 @@ package org.apache.lucene.store.instantiated;
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import junit.framework.TestCase;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.*;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Payload;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.index.TermPositionVector;
import org.apache.lucene.index.TermPositions;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import java.io.IOException;
import java.util.*;
/**
* Asserts equality of content and behaviour of two index readers.
*/
@ -151,21 +164,24 @@ public class TestIndicesEquals extends TestCase {
document.add(f);
if (i > 4) {
final List<Token> tokens = new ArrayList<Token>(2);
Token t = new Token("the", 0, 2, "text");
Token t = createToken("the", 0, 2, "text");
t.setPayload(new Payload(new byte[]{1, 2, 3}));
tokens.add(t);
t = new Token("end", 3, 5, "text");
t = createToken("end", 3, 5, "text");
t.setPayload(new Payload(new byte[]{2}));
tokens.add(t);
tokens.add(new Token("fin", 7, 9));
tokens.add(createToken("fin", 7, 9));
document.add(new Field("f", new TokenStream() {
Iterator<Token> it = tokens.iterator();
public Token next() throws IOException {
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
if (!it.hasNext()) {
return null;
}
return it.next();
// Resettable token streams need to return clones.
Token nextToken = (Token) it.next();
return (Token) nextToken.clone();
}
public void reset() throws IOException {
@ -466,4 +482,19 @@ public class TestIndicesEquals extends TestCase {
testReader.close();
}
private static Token createToken(String term, int start, int offset)
{
Token token = new Token(start, offset);
token.setTermBuffer(term);
return token;
}
private static Token createToken(String term, int start, int offset, String type)
{
Token token = new Token(start, offset, type);
token.setTermBuffer(term);
return token;
}
}

View File

@ -279,6 +279,7 @@ class LuceneMethods {
Analyzer analyzer = new StandardAnalyzer();
Enumeration fields = doc.fields();
final Token reusableToken = new Token();
while (fields.hasMoreElements()) {
Field field = (Field) fields.nextElement();
String fieldName = field.name();
@ -299,10 +300,10 @@ class LuceneMethods {
// Tokenize field and add to postingTable
TokenStream stream = analyzer.tokenStream(fieldName, reader);
try {
for (Token t = stream.next(); t != null; t = stream.next()) {
position += (t.getPositionIncrement() - 1);
for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
position += (nextToken.getPositionIncrement() - 1);
position++;
String name = t.termText();
String name = nextToken.term();
Integer Count = (Integer) tokenHash.get(name);
if (Count == null) { // not in there yet
tokenHash.put(name, new Integer(1)); //first one

View File

@ -73,10 +73,11 @@ public class AnalyzerUtil {
return new TokenFilter(child.tokenStream(fieldName, reader)) {
private int position = -1;
public Token next() throws IOException {
Token token = input.next(); // from filter super class
log.println(toString(token));
return token;
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
Token nextToken = input.next(reusableToken); // from filter super class
log.println(toString(nextToken));
return nextToken;
}
private String toString(Token token) {
@ -84,7 +85,7 @@ public class AnalyzerUtil {
position += token.getPositionIncrement();
return "[" + logName + ":" + position + ":" + fieldName + ":"
+ token.termText() + ":" + token.startOffset()
+ token.term() + ":" + token.startOffset()
+ "-" + token.endOffset() + ":" + token.type()
+ "]";
}
@ -121,8 +122,9 @@ public class AnalyzerUtil {
return new TokenFilter(child.tokenStream(fieldName, reader)) {
private int todo = maxTokens;
public Token next() throws IOException {
return --todo >= 0 ? input.next() : null;
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
return --todo >= 0 ? input.next(reusableToken) : null;
}
};
}
@ -239,10 +241,11 @@ public class AnalyzerUtil {
final ArrayList tokens2 = new ArrayList();
TokenStream tokenStream = new TokenFilter(child.tokenStream(fieldName, reader)) {
public Token next() throws IOException {
Token token = input.next(); // from filter super class
if (token != null) tokens2.add(token);
return token;
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
Token nextToken = input.next(reusableToken); // from filter super class
if (nextToken != null) tokens2.add(nextToken.clone());
return nextToken;
}
};
@ -253,7 +256,8 @@ public class AnalyzerUtil {
private Iterator iter = tokens.iterator();
public Token next() {
public Token next(Token token) {
assert token != null;
if (!iter.hasNext()) return null;
return (Token) iter.next();
}
@ -300,12 +304,12 @@ public class AnalyzerUtil {
HashMap map = new HashMap();
TokenStream stream = analyzer.tokenStream("", new StringReader(text));
try {
Token token;
while ((token = stream.next()) != null) {
MutableInteger freq = (MutableInteger) map.get(token.termText());
final Token reusableToken = new Token();
for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
MutableInteger freq = (MutableInteger) map.get(nextToken.term());
if (freq == null) {
freq = new MutableInteger(1);
map.put(token.termText(), freq);
map.put(nextToken.term(), freq);
} else {
freq.setValue(freq.intValue() + 1);
}

View File

@ -275,7 +275,8 @@ public class MemoryIndex implements Serializable {
return new TokenStream() {
private Iterator iter = keywords.iterator();
private int start = 0;
public Token next() {
public Token next(final Token reusableToken) {
assert reusableToken != null;
if (!iter.hasNext()) return null;
Object obj = iter.next();
@ -283,9 +284,9 @@ public class MemoryIndex implements Serializable {
throw new IllegalArgumentException("keyword must not be null");
String term = obj.toString();
Token token = new Token(term, start, start + term.length());
        reusableToken.reinit(term, start, start+term.length());
start += term.length() + 1; // separate words by 1 (blank) character
return token;
return reusableToken;
}
};
}
@ -349,14 +350,13 @@ public class MemoryIndex implements Serializable {
HashMap terms = new HashMap();
int numTokens = 0;
int pos = -1;
Token token;
while ((token = stream.next()) != null) {
String term = token.termText();
final Token reusableToken = new Token();
for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
String term = nextToken.term();
if (term.length() == 0) continue; // nothing to do
// if (DEBUG) System.err.println("token='" + term + "'");
numTokens++;
pos += token.getPositionIncrement();
pos += nextToken.getPositionIncrement();
ArrayIntList positions = (ArrayIntList) terms.get(term);
if (positions == null) { // term not seen before
@ -366,7 +366,7 @@ public class MemoryIndex implements Serializable {
if (stride == 1) {
positions.add(pos);
} else {
positions.add(pos, token.startOffset(), token.endOffset());
positions.add(pos, nextToken.startOffset(), nextToken.endOffset());
}
}

View File

@ -334,7 +334,8 @@ public class PatternAnalyzer extends Analyzer {
this.toLowerCase = toLowerCase;
}
public Token next() {
public Token next(final Token reusableToken) {
assert reusableToken != null;
if (matcher == null) return null;
while (true) { // loop takes care of leading and trailing boundary cases
@ -352,7 +353,7 @@ public class PatternAnalyzer extends Analyzer {
if (start != end) { // non-empty match (header/trailer)
String text = str.substring(start, end);
if (toLowerCase) text = text.toLowerCase(locale);
return new Token(text, start, end);
return reusableToken.reinit(text, start, end);
}
if (!isMatch) return null;
}
@ -384,7 +385,8 @@ public class PatternAnalyzer extends Analyzer {
this.stopWords = stopWords;
}
public Token next() {
public Token next(final Token reusableToken) {
assert reusableToken != null;
// cache loop instance vars (performance)
String s = str;
int len = s.length();
@ -422,7 +424,11 @@ public class PatternAnalyzer extends Analyzer {
} while (text != null && isStopWord(text));
pos = i;
return text != null ? new Token(text, start, i) : null;
if (text == null)
{
return null;
}
return reusableToken.reinit(text, start, i);
}
private boolean isTokenChar(char c, boolean isLetter) {

View File

@ -68,48 +68,51 @@ public class SynonymTokenFilter extends TokenFilter {
}
/** Returns the next token in the stream, or null at EOS. */
public Token next() throws IOException {
Token token;
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
while (todo > 0 && index < stack.length) { // pop from stack
token = createToken(stack[index++], current);
if (token != null) {
Token nextToken = createToken(stack[index++], current, reusableToken);
if (nextToken != null) {
todo--;
return token;
return nextToken;
}
}
token = input.next();
if (token == null) return null; // EOS; iterator exhausted
Token nextToken = input.next(reusableToken);
if (nextToken == null) return null; // EOS; iterator exhausted
stack = synonyms.getSynonyms(token.termText()); // push onto stack
stack = synonyms.getSynonyms(nextToken.term()); // push onto stack
if (stack.length > maxSynonyms) randomize(stack);
index = 0;
current = token;
current = (Token) nextToken.clone();
todo = maxSynonyms;
return token;
return nextToken;
}
/**
* Creates and returns a token for the given synonym of the current input
* token; Override for custom (stateless or stateful) behaviour, if desired.
* token; Override for custom (stateless or stateful) behavior, if desired.
*
* @param synonym
* a synonym for the current token's term
* @param current
* the current token from the underlying child stream
* @param reusableToken
* the token to reuse
* @return a new token, or null to indicate that the given synonym should be
* ignored
*/
protected Token createToken(String synonym, Token current) {
Token token = new Token(
synonym, current.startOffset(), current.endOffset(), SYNONYM_TOKEN_TYPE);
token.setPositionIncrement(0);
return token;
protected Token createToken(String synonym, Token current, final Token reusableToken) {
reusableToken.reinit(current, synonym);
reusableToken.setTermBuffer(synonym);
reusableToken.setType(SYNONYM_TOKEN_TYPE);
reusableToken.setPositionIncrement(0);
return reusableToken;
}
/**
* Randomize synonyms to later sample a subset. Uses constant random seed
* for reproducability. Uses "DRand", a simple, fast, uniform pseudo-random
* for reproducibility. Uses "DRand", a simple, fast, uniform pseudo-random
* number generator with medium statistical quality (multiplicative
* congruential method), producing integers in the range [Integer.MIN_VALUE,
* Integer.MAX_VALUE].

View File
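createToken(...) above is the documented extension point: return a token for a synonym, or null to have next() skip it. A hypothetical subclass overriding it (the class name and the length-based policy are invented; the superclass constructor arguments and package are assumed from this contrib class):

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.memory.SynonymMap;
import org.apache.lucene.index.memory.SynonymTokenFilter;

public class ShortSynonymDroppingFilter extends SynonymTokenFilter {
  public ShortSynonymDroppingFilter(TokenStream input, SynonymMap synonyms, int maxSynonyms) {
    super(input, synonyms, maxSynonyms);
  }
  protected Token createToken(String synonym, Token current, final Token reusableToken) {
    if (synonym.length() < 3) return null; // null tells next() to ignore this synonym
    return super.createToken(synonym, current, reusableToken);
  }
}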

@ -197,9 +197,9 @@ public class PatternAnalyzerTest extends TestCase {
private List getTokens(TokenStream stream) throws IOException {
ArrayList tokens = new ArrayList();
Token token;
while ((token = stream.next()) != null) {
tokens.add(token);
final Token reusableToken = new Token();
for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
tokens.add(nextToken.clone());
}
return tokens;
}
@ -211,7 +211,7 @@ public class PatternAnalyzerTest extends TestCase {
for (; i < size; i++) {
Token t1 = (Token) tokens1.get(i);
Token t2 = (Token) tokens2.get(i);
if (!(t1.termText().equals(t2.termText()))) throw new IllegalStateException("termText");
if (!(t1.term().equals(t2.term()))) throw new IllegalStateException("termText");
if (t1.startOffset() != t2.startOffset()) throw new IllegalStateException("startOffset");
if (t1.endOffset() != t2.endOffset()) throw new IllegalStateException("endOffset");
if (!(t1.type().equals(t2.type()))) throw new IllegalStateException("type");
@ -222,8 +222,8 @@ public class PatternAnalyzerTest extends TestCase {
catch (IllegalStateException e) {
if (size > 0) {
System.out.println("i=" + i + ", size=" + size);
System.out.println("t1[size]='" + ((Token) tokens1.get(size-1)).termText() + "'");
System.out.println("t2[size]='" + ((Token) tokens2.get(size-1)).termText() + "'");
System.out.println("t1[size]='" + ((Token) tokens1.get(size-1)).term() + "'");
System.out.println("t2[size]='" + ((Token) tokens2.get(size-1)).term() + "'");
}
throw e;
}
@ -234,7 +234,7 @@ public class PatternAnalyzerTest extends TestCase {
String str = "[";
for (int i=0; i < tokens.size(); i++) {
Token t1 = (Token) tokens.get(i);
str = str + "'" + t1.termText() + "', ";
str = str + "'" + t1.term() + "', ";
}
return str + "]";
}

View File

@ -23,6 +23,7 @@ import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.search.Query;
@ -105,21 +106,23 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar
// get Analyzer from superclass and tokenize the term
TokenStream source = getAnalyzer().tokenStream(field, new StringReader(termStr));
org.apache.lucene.analysis.Token t;
final Token reusableToken = new Token();
Token nextToken;
int countTokens = 0;
while (true) {
try {
t = source.next();
nextToken = source.next(reusableToken);
} catch (IOException e) {
t = null;
nextToken = null;
}
if (t == null) {
if (nextToken == null) {
break;
}
if (!"".equals(t.termText())) {
String term = nextToken.term();
if (!"".equals(term)) {
try {
tlist.set(countTokens++, t.termText());
tlist.set(countTokens++, term);
} catch (IndexOutOfBoundsException ioobe) {
countTokens = -1;
}
@ -189,18 +192,19 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar
// get Analyzer from superclass and tokenize the term
TokenStream source = getAnalyzer().tokenStream(field, new StringReader(termStr));
List tlist = new ArrayList();
org.apache.lucene.analysis.Token t;
final Token reusableToken = new Token();
Token nextToken;
while (true) {
try {
t = source.next();
nextToken = source.next(reusableToken);
} catch (IOException e) {
t = null;
nextToken = null;
}
if (t == null) {
if (nextToken == null) {
break;
}
tlist.add(t.termText());
tlist.add(nextToken.term());
}
try {
@ -238,14 +242,15 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar
throws ParseException {
// get Analyzer from superclass and tokenize the term
TokenStream source = getAnalyzer().tokenStream(field, new StringReader(termStr));
org.apache.lucene.analysis.Token t;
final Token reusableToken = new Token();
Token nextToken;
boolean multipleTokens = false;
try {
t = source.next();
multipleTokens = source.next() != null;
nextToken = source.next(reusableToken);
multipleTokens = source.next(reusableToken) != null;
} catch (IOException e) {
t = null;
nextToken = null;
}
try {
@ -259,7 +264,7 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar
+ " - tokens were added");
}
return (t == null) ? null : super.getFuzzyQuery(field, t.termText(), minSimilarity);
return (nextToken == null) ? null : super.getFuzzyQuery(field, nextToken.term(), minSimilarity);
}
/**
@ -270,18 +275,20 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar
throws ParseException {
// get Analyzer from superclass and tokenize the terms
TokenStream source = getAnalyzer().tokenStream(field, new StringReader(part1));
org.apache.lucene.analysis.Token t;
final Token reusableToken = new Token();
Token nextToken;
Token multipleToken;
boolean multipleTokens = false;
// part1
try {
t = source.next();
if (t != null) {
part1 = t.termText();
nextToken = source.next(reusableToken);
if (nextToken != null) {
part1 = nextToken.term();
}
multipleTokens = source.next() != null;
multipleTokens = source.next(reusableToken) != null;
} catch (IOException e) {
t = null;
nextToken = null;
}
try {
source.close();
@ -293,16 +300,16 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar
+ " - tokens were added to part1");
}
source = getAnalyzer().tokenStream(field, new StringReader(part2));
// part2
source = getAnalyzer().tokenStream(field, new StringReader(part2));
try {
t = source.next();
if (t != null) {
part2 = t.termText();
nextToken = source.next(reusableToken);
if (nextToken != null) {
part2 = nextToken.term();
}
multipleTokens = source.next() != null;
multipleTokens = source.next(reusableToken) != null;
} catch (IOException e) {
t = null;
nextToken = null;
}
try {
source.close();

View File

@ -25,6 +25,20 @@ public interface CharStream {
*/
char readChar() throws java.io.IOException;
/**
* Returns the column position of the character last read.
* @deprecated
* @see #getEndColumn
*/
int getColumn();
/**
* Returns the line number of the character last read.
* @deprecated
* @see #getEndLine
*/
int getLine();
/**
   * Returns the column number of the last character for the current token (being
   * matched after the last call to BeginToken).

View File

@ -1,14 +1,29 @@
/* Generated By:JavaCC: Do not edit this line. PrecedenceQueryParser.java */
package org.apache.lucene.queryParser.precedence;
import java.io.IOException;
import java.io.StringReader;
import java.text.DateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import java.util.Vector;
import java.io.*;
import java.text.*;
import java.util.*;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.index.Term;
import org.apache.lucene.analysis.*;
import org.apache.lucene.document.*;
import org.apache.lucene.search.*;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.MultiPhraseQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.RangeQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.util.Parameter;
/**
@ -296,21 +311,22 @@ public class PrecedenceQueryParser implements PrecedenceQueryParserConstants {
TokenStream source = analyzer.tokenStream(field, new StringReader(queryText));
Vector v = new Vector();
org.apache.lucene.analysis.Token t;
final org.apache.lucene.analysis.Token reusableToken = new org.apache.lucene.analysis.Token();
org.apache.lucene.analysis.Token nextToken;
int positionCount = 0;
boolean severalTokensAtSamePosition = false;
while (true) {
try {
t = source.next();
nextToken = source.next(reusableToken);
}
catch (IOException e) {
t = null;
nextToken = null;
}
if (t == null)
if (nextToken == null)
break;
v.addElement(t);
if (t.getPositionIncrement() == 1)
v.addElement(nextToken.clone());
if (nextToken.getPositionIncrement() == 1)
positionCount++;
else
severalTokensAtSamePosition = true;
@ -325,17 +341,17 @@ public class PrecedenceQueryParser implements PrecedenceQueryParserConstants {
if (v.size() == 0)
return null;
else if (v.size() == 1) {
t = (org.apache.lucene.analysis.Token) v.elementAt(0);
return new TermQuery(new Term(field, t.termText()));
nextToken = (org.apache.lucene.analysis.Token) v.elementAt(0);
return new TermQuery(new Term(field, nextToken.term()));
} else {
if (severalTokensAtSamePosition) {
if (positionCount == 1) {
// no phrase query:
BooleanQuery q = new BooleanQuery();
for (int i = 0; i < v.size(); i++) {
t = (org.apache.lucene.analysis.Token) v.elementAt(i);
nextToken = (org.apache.lucene.analysis.Token) v.elementAt(i);
TermQuery currentQuery = new TermQuery(
new Term(field, t.termText()));
new Term(field, nextToken.term()));
q.add(currentQuery, BooleanClause.Occur.SHOULD);
}
return q;
@ -345,12 +361,12 @@ public class PrecedenceQueryParser implements PrecedenceQueryParserConstants {
MultiPhraseQuery mpq = new MultiPhraseQuery();
List multiTerms = new ArrayList();
for (int i = 0; i < v.size(); i++) {
t = (org.apache.lucene.analysis.Token) v.elementAt(i);
if (t.getPositionIncrement() == 1 && multiTerms.size() > 0) {
nextToken = (org.apache.lucene.analysis.Token) v.elementAt(i);
if (nextToken.getPositionIncrement() == 1 && multiTerms.size() > 0) {
mpq.add((Term[])multiTerms.toArray(new Term[0]));
multiTerms.clear();
}
multiTerms.add(new Term(field, t.termText()));
multiTerms.add(new Term(field, nextToken.term()));
}
mpq.add((Term[])multiTerms.toArray(new Term[0]));
return mpq;
@ -361,7 +377,7 @@ public class PrecedenceQueryParser implements PrecedenceQueryParserConstants {
q.setSlop(phraseSlop);
for (int i = 0; i < v.size(); i++) {
q.add(new Term(field, ((org.apache.lucene.analysis.Token)
v.elementAt(i)).termText()));
v.elementAt(i)).term()));
}
return q;

View File

@ -25,14 +25,29 @@ PARSER_BEGIN(PrecedenceQueryParser)
package org.apache.lucene.queryParser.precedence;
import java.io.IOException;
import java.io.StringReader;
import java.text.DateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import java.util.Vector;
import java.io.*;
import java.text.*;
import java.util.*;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.index.Term;
import org.apache.lucene.analysis.*;
import org.apache.lucene.document.*;
import org.apache.lucene.search.*;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.MultiPhraseQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.RangeQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.util.Parameter;
/**
@ -320,21 +335,22 @@ public class PrecedenceQueryParser {
TokenStream source = analyzer.tokenStream(field, new StringReader(queryText));
Vector v = new Vector();
org.apache.lucene.analysis.Token t;
final org.apache.lucene.analysis.Token reusableToken = new org.apache.lucene.analysis.Token();
org.apache.lucene.analysis.Token nextToken;
int positionCount = 0;
boolean severalTokensAtSamePosition = false;
while (true) {
try {
t = source.next();
nextToken = source.next(reusableToken);
}
catch (IOException e) {
t = null;
nextToken = null;
}
if (t == null)
if (nextToken == null)
break;
v.addElement(t);
if (t.getPositionIncrement() == 1)
v.addElement(nextToken.clone());
if (nextToken.getPositionIncrement() == 1)
positionCount++;
else
severalTokensAtSamePosition = true;
@ -349,17 +365,17 @@ public class PrecedenceQueryParser {
if (v.size() == 0)
return null;
else if (v.size() == 1) {
t = (org.apache.lucene.analysis.Token) v.elementAt(0);
return new TermQuery(new Term(field, t.termText()));
nextToken = (org.apache.lucene.analysis.Token) v.elementAt(0);
return new TermQuery(new Term(field, nextToken.term()));
} else {
if (severalTokensAtSamePosition) {
if (positionCount == 1) {
// no phrase query:
BooleanQuery q = new BooleanQuery();
for (int i = 0; i < v.size(); i++) {
t = (org.apache.lucene.analysis.Token) v.elementAt(i);
nextToken = (org.apache.lucene.analysis.Token) v.elementAt(i);
TermQuery currentQuery = new TermQuery(
new Term(field, t.termText()));
new Term(field, nextToken.term()));
q.add(currentQuery, BooleanClause.Occur.SHOULD);
}
return q;
@ -369,12 +385,12 @@ public class PrecedenceQueryParser {
MultiPhraseQuery mpq = new MultiPhraseQuery();
List multiTerms = new ArrayList();
for (int i = 0; i < v.size(); i++) {
t = (org.apache.lucene.analysis.Token) v.elementAt(i);
if (t.getPositionIncrement() == 1 && multiTerms.size() > 0) {
nextToken = (org.apache.lucene.analysis.Token) v.elementAt(i);
if (nextToken.getPositionIncrement() == 1 && multiTerms.size() > 0) {
mpq.add((Term[])multiTerms.toArray(new Term[0]));
multiTerms.clear();
}
multiTerms.add(new Term(field, t.termText()));
multiTerms.add(new Term(field, nextToken.term()));
}
mpq.add((Term[])multiTerms.toArray(new Term[0]));
return mpq;
@ -385,7 +401,7 @@ public class PrecedenceQueryParser {
q.setSlop(phraseSlop);
for (int i = 0; i < v.size(); i++) {
q.add(new Term(field, ((org.apache.lucene.analysis.Token)
v.elementAt(i)).termText()));
v.elementAt(i)).term()));
}
return q;

View File

@ -1,13 +1,27 @@
/* Generated By:JavaCC: Do not edit this line. PrecedenceQueryParserTokenManager.java */
package org.apache.lucene.queryParser.precedence;
import java.io.IOException;
import java.io.StringReader;
import java.text.DateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Locale;
import java.util.Vector;
import java.io.*;
import java.text.*;
import java.util.*;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.index.Term;
import org.apache.lucene.analysis.*;
import org.apache.lucene.document.*;
import org.apache.lucene.search.*;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.MultiPhraseQuery;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.RangeQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.util.Parameter;
public class PrecedenceQueryParserTokenManager implements PrecedenceQueryParserConstants

View File

@ -57,19 +57,26 @@ public class TestPrecedenceQueryParser extends TestCase {
boolean inPhrase = false;
int savedStart = 0, savedEnd = 0;
public Token next() throws IOException {
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
if (inPhrase) {
inPhrase = false;
return new Token("phrase2", savedStart, savedEnd);
reusableToken.setTermBuffer("phrase2");
reusableToken.setStartOffset(savedStart);
reusableToken.setEndOffset(savedEnd);
return reusableToken;
} else
for (Token token = input.next(); token != null; token = input.next()) {
if (token.termText().equals("phrase")) {
for (Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken)) {
if (nextToken.term().equals("phrase")) {
inPhrase = true;
savedStart = token.startOffset();
savedEnd = token.endOffset();
return new Token("phrase1", savedStart, savedEnd);
} else if (!token.termText().equals("stop"))
return token;
savedStart = nextToken.startOffset();
savedEnd = nextToken.endOffset();
nextToken.setTermBuffer("phrase1");
nextToken.setStartOffset(savedStart);
nextToken.setEndOffset(savedEnd);
return nextToken;
} else if (!nextToken.term().equals("stop"))
return nextToken;
}
return null;
}

View File

@ -104,18 +104,19 @@ public class FuzzyLikeThisQuery extends Query
{
if(f.queryString==null) return;
TokenStream ts=analyzer.tokenStream(f.fieldName,new StringReader(f.queryString));
Token token=ts.next();
final Token reusableToken = new Token();
int corpusNumDocs=reader.numDocs();
Term internSavingTemplateTerm =new Term(f.fieldName); //optimization to avoid constructing new Term() objects
HashSet processedTerms=new HashSet();
while(token!=null)
{
if(!processedTerms.contains(token.termText()))
for (Token nextToken = ts.next(reusableToken); nextToken!=null; nextToken = ts.next(reusableToken))
{
String term = nextToken.term();
if(!processedTerms.contains(term))
{
processedTerms.add(token.termText());
processedTerms.add(term);
ScoreTermQueue variantsQ=new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term
float minScore=0;
Term startTerm=internSavingTemplateTerm.createTerm(token.termText());
Term startTerm=internSavingTemplateTerm.createTerm(term);
FuzzyTermEnum fe=new FuzzyTermEnum(reader,startTerm,f.minSimilarity,f.prefixLength);
TermEnum origEnum = reader.terms(startTerm);
int df=0;
@ -162,8 +163,7 @@ public class FuzzyLikeThisQuery extends Query
q.insert(st);
}
}
token=ts.next();
}
}
}
public Query rewrite(IndexReader reader) throws IOException

View File

@ -28,6 +28,7 @@ import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Hits;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
@ -808,10 +809,11 @@ public final class MoreLikeThis {
throws IOException
{
TokenStream ts = analyzer.tokenStream(fieldName, r);
org.apache.lucene.analysis.Token token;
int tokenCount=0;
while ((token = ts.next()) != null) { // for every token
String word = token.termText();
// for every token
final Token reusableToken = new Token();
for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) {
String word = nextToken.term();
tokenCount++;
if(tokenCount>maxNumTokensParsed)
{
@ -872,7 +874,7 @@ public final class MoreLikeThis {
* For an easier method to call see {@link #retrieveInterestingTerms retrieveInterestingTerms()}.
*
* @param r the reader that has the content of the document
* @return the most intresting words in the document ordered by score, with the highest scoring, or best entry, first
* @return the most interesting words in the document ordered by score, with the highest scoring, or best entry, first
*
* @see #retrieveInterestingTerms
*/

View File

@ -21,6 +21,7 @@ import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
@ -85,12 +86,11 @@ public final class SimilarityQueries
throws IOException
{
TokenStream ts = a.tokenStream( field, new StringReader( body));
org.apache.lucene.analysis.Token t;
BooleanQuery tmp = new BooleanQuery();
Set already = new HashSet(); // ignore dups
while ( (t = ts.next()) != null)
{
String word = t.termText();
final Token reusableToken = new Token();
for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) {
String word = nextToken.term();
// ignore opt stop words
if ( stop != null &&
stop.contains( word)) continue;

View File

@ -18,11 +18,10 @@ package org.apache.lucene.analysis.snowball;
*/
import java.io.IOException;
import java.lang.reflect.Method;
import net.sf.snowball.SnowballProgram;
import net.sf.snowball.ext.*;
import net.sf.snowball.ext.EnglishStemmer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
@ -60,20 +59,22 @@ public class SnowballFilter extends TokenFilter {
}
/** Returns the next input Token, after being stemmed */
public final Token next() throws IOException {
Token token = input.next();
if (token == null)
public final Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
Token nextToken = input.next(reusableToken);
if (nextToken == null)
return null;
stemmer.setCurrent(token.termText());
String originalTerm = nextToken.term();
stemmer.setCurrent(originalTerm);
try {
stemMethod.invoke(stemmer, EMPTY_ARGS);
} catch (Exception e) {
throw new RuntimeException(e.toString());
}
Token newToken = new Token(stemmer.getCurrent(),
token.startOffset(), token.endOffset(), token.type());
newToken.setPositionIncrement(token.getPositionIncrement());
return newToken;
String finalTerm = stemmer.getCurrent();
// Don't bother updating, if it is unchanged.
if (!originalTerm.equals(finalTerm))
nextToken.setTermBuffer(finalTerm);
return nextToken;
}
}

View File

@ -1,64 +1,30 @@
package org.apache.lucene.analysis.snowball;
/* ====================================================================
* The Apache Software License, Version 1.1
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* Copyright (c) 2004 The Apache Software Foundation. All rights
* reserved.
* http://www.apache.org/licenses/LICENSE-2.0
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
*
* 3. The end-user documentation included with the redistribution,
* if any, must include the following acknowledgment:
* "This product includes software developed by the
* Apache Software Foundation (http://www.apache.org/)."
* Alternately, this acknowledgment may appear in the software itself,
* if and wherever such third-party acknowledgments normally appear.
*
* 4. The names "Apache" and "Apache Software Foundation" and
* "Apache Lucene" must not be used to endorse or promote products
* derived from this software without prior written permission. For
* written permission, please contact apache@apache.org.
*
* 5. Products derived from this software may not be called "Apache",
* "Apache Lucene", nor may "Apache" appear in their name, without
* prior written permission of the Apache Software Foundation.
*
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
* ====================================================================
*
* This software consists of voluntary contributions made by many
* individuals on behalf of the Apache Software Foundation. For more
* information on the Apache Software Foundation, please see
* <http://www.apache.org/>.
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.*;
import java.io.StringReader;
import junit.framework.*;
import junit.framework.TestCase;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.index.Payload;
import org.apache.lucene.analysis.TokenStream;
public class TestSnowball extends TestCase {
@ -66,12 +32,12 @@ public class TestSnowball extends TestCase {
String input,
String[] output) throws Exception {
TokenStream ts = a.tokenStream("dummy", new StringReader(input));
final Token reusableToken = new Token();
for (int i = 0; i < output.length; i++) {
Token t = ts.next();
assertNotNull(t);
assertEquals(output[i], t.termText());
Token nextToken = ts.next(reusableToken);
assertEquals(output[i], nextToken.term());
}
assertNull(ts.next());
assertNull(ts.next(reusableToken));
ts.close();
}
@ -83,25 +49,33 @@ public class TestSnowball extends TestCase {
public void testFilterTokens() throws Exception {
final Token tok = new Token("accents", 2, 7, "wrd");
final Token tok = new Token(2, 7, "wrd");
tok.setTermBuffer("accents");
tok.setPositionIncrement(3);
Payload tokPayload = new Payload(new byte[]{0,1,2,3});
tok.setPayload(tokPayload);
int tokFlags = 77;
tok.setFlags(tokFlags);
SnowballFilter filter = new SnowballFilter(
new TokenStream() {
public Token next() {
public Token next(final Token reusableToken) {
assert reusableToken != null;
return tok;
}
},
"English"
);
Token newtok = filter.next();
final Token reusableToken = new Token();
Token nextToken = filter.next(reusableToken);
assertEquals("accent", newtok.termText());
assertEquals(2, newtok.startOffset());
assertEquals(7, newtok.endOffset());
assertEquals("wrd", newtok.type());
assertEquals(3, newtok.getPositionIncrement());
assertEquals("accent", nextToken.term());
assertEquals(2, nextToken.startOffset());
assertEquals(7, nextToken.endOffset());
assertEquals("wrd", nextToken.type());
assertEquals(3, nextToken.getPositionIncrement());
assertEquals(tokFlags, nextToken.getFlags());
assertEquals(tokPayload, nextToken.getPayload());
}
}
}

View File

@ -133,7 +133,8 @@ public class WikipediaTokenizer extends Tokenizer {
*
* @see org.apache.lucene.analysis.TokenStream#next()
*/
public Token next(Token result) throws IOException {
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
if (tokens != null && tokens.hasNext()){
return (Token)tokens.next();
}
@ -144,22 +145,22 @@ public class WikipediaTokenizer extends Tokenizer {
}
String type = WikipediaTokenizerImpl.TOKEN_TYPES[tokenType];
if (tokenOutput == TOKENS_ONLY || untokenizedTypes.contains(type) == false){
setupToken(result);
setupToken(reusableToken);
} else if (tokenOutput == UNTOKENIZED_ONLY && untokenizedTypes.contains(type) == true){
collapseTokens(result, tokenType);
collapseTokens(reusableToken, tokenType);
}
else if (tokenOutput == BOTH){
//collapse into a single token, add it to tokens AND output the individual tokens
//output the untokenized Token first
collapseAndSaveTokens(result, tokenType, type);
collapseAndSaveTokens(reusableToken, tokenType, type);
}
result.setPositionIncrement(scanner.getPositionIncrement());
result.setType(type);
return result;
reusableToken.setPositionIncrement(scanner.getPositionIncrement());
reusableToken.setType(type);
return reusableToken;
}
private void collapseAndSaveTokens(Token result, int tokenType, String type) throws IOException {
private void collapseAndSaveTokens(final Token reusableToken, int tokenType, String type) throws IOException {
//collapse
StringBuffer buffer = new StringBuffer(32);
int numAdded = scanner.setText(buffer);
@ -188,10 +189,10 @@ public class WikipediaTokenizer extends Tokenizer {
}
//trim the buffer
String s = buffer.toString().trim();
result.setTermBuffer(s.toCharArray(), 0, s.length());
result.setStartOffset(theStart);
result.setEndOffset(theStart + s.length());
result.setFlags(UNTOKENIZED_TOKEN_FLAG);
reusableToken.setTermBuffer(s.toCharArray(), 0, s.length());
reusableToken.setStartOffset(theStart);
reusableToken.setEndOffset(theStart + s.length());
reusableToken.setFlags(UNTOKENIZED_TOKEN_FLAG);
//The way the loop is written, we will have proceeded to the next token. We need to pushback the scanner to lastPos
if (tmpTokType != WikipediaTokenizerImpl.YYEOF){
scanner.yypushback(scanner.yylength());
@ -205,7 +206,7 @@ public class WikipediaTokenizer extends Tokenizer {
saved.setType(type);
}
private void collapseTokens(Token result, int tokenType) throws IOException {
private void collapseTokens(final Token reusableToken, int tokenType) throws IOException {
//collapse
StringBuffer buffer = new StringBuffer(32);
int numAdded = scanner.setText(buffer);
@ -227,10 +228,10 @@ public class WikipediaTokenizer extends Tokenizer {
}
//trim the buffer
String s = buffer.toString().trim();
result.setTermBuffer(s.toCharArray(), 0, s.length());
result.setStartOffset(theStart);
result.setEndOffset(theStart + s.length());
result.setFlags(UNTOKENIZED_TOKEN_FLAG);
reusableToken.setTermBuffer(s.toCharArray(), 0, s.length());
reusableToken.setStartOffset(theStart);
reusableToken.setEndOffset(theStart + s.length());
reusableToken.setFlags(UNTOKENIZED_TOKEN_FLAG);
//The way the loop is written, we will have proceeded to the next token. We need to pushback the scanner to lastPos
if (tmpTokType != WikipediaTokenizerImpl.YYEOF){
scanner.yypushback(scanner.yylength());
@ -239,11 +240,11 @@ public class WikipediaTokenizer extends Tokenizer {
}
}
private void setupToken(Token result) {
scanner.getText(result);
private void setupToken(final Token reusableToken) {
scanner.getText(reusableToken);
final int start = scanner.yychar();
result.setStartOffset(start);
result.setEndOffset(start + result.termLength());
reusableToken.setStartOffset(start);
reusableToken.setEndOffset(start + reusableToken.termLength());
}
/*

View File

@ -126,28 +126,28 @@ public class WikipediaTokenizerTest extends TestCase {
tcm.put("3.25", "<NUM>");
tcm.put("3.50", "<NUM>");
WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(test));
Token token = new Token();
int count = 0;
int numItalics = 0;
int numBoldItalics = 0;
int numCategory = 0;
int numCitation = 0;
while ((token = tf.next(token)) != null) {
String tokText = token.termText();
final Token reusableToken = new Token();
for (Token nextToken = tf.next(reusableToken); nextToken != null; nextToken = tf.next(reusableToken)) {
String tokText = nextToken.term();
      //System.out.println("Text: " + tokText + " Type: " + nextToken.type());
assertTrue("token is null and it shouldn't be", token != null);
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
String expectedType = (String) tcm.get(tokText);
assertTrue("expectedType is null and it shouldn't be for: " + token, expectedType != null);
assertTrue(token.type() + " is not equal to " + expectedType + " for " + token, token.type().equals(expectedType) == true);
assertTrue("expectedType is null and it shouldn't be for: " + nextToken, expectedType != null);
assertTrue(nextToken.type() + " is not equal to " + expectedType + " for " + nextToken, nextToken.type().equals(expectedType) == true);
count++;
if (token.type().equals(WikipediaTokenizer.ITALICS) == true){
if (nextToken.type().equals(WikipediaTokenizer.ITALICS) == true){
numItalics++;
} else if (token.type().equals(WikipediaTokenizer.BOLD_ITALICS) == true){
} else if (nextToken.type().equals(WikipediaTokenizer.BOLD_ITALICS) == true){
numBoldItalics++;
} else if (token.type().equals(WikipediaTokenizer.CATEGORY) == true){
} else if (nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true){
numCategory++;
}
else if (token.type().equals(WikipediaTokenizer.CITATION) == true){
else if (nextToken.type().equals(WikipediaTokenizer.CITATION) == true){
numCitation++;
}
}
@ -166,105 +166,105 @@ public class WikipediaTokenizerTest extends TestCase {
}
private void checkLinkPhrases(WikipediaTokenizer tf) throws IOException {
Token token = new Token();
token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "click", new String(token.termBuffer(), 0, token.termLength()).equals("click") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "link", new String(token.termBuffer(), 0, token.termLength()).equals("link") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "here",
new String(token.termBuffer(), 0, token.termLength()).equals("here") == true);
final Token reusableToken = new Token();
Token nextToken = tf.next(reusableToken);
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
assertTrue(nextToken.term() + " is not equal to " + "click", nextToken.term().equals("click") == true);
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
nextToken = tf.next(reusableToken);
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
assertTrue(nextToken.term() + " is not equal to " + "link", nextToken.term().equals("link") == true);
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
nextToken = tf.next(reusableToken);
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
assertTrue(nextToken.term() + " is not equal to " + "here",
nextToken.term().equals("here") == true);
//The link, and here should be at the same position for phrases to work
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "again",
new String(token.termBuffer(), 0, token.termLength()).equals("again") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
nextToken = tf.next(reusableToken);
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
assertTrue(nextToken.term() + " is not equal to " + "again",
nextToken.term().equals("again") == true);
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "click",
new String(token.termBuffer(), 0, token.termLength()).equals("click") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
nextToken = tf.next(reusableToken);
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
assertTrue(nextToken.term() + " is not equal to " + "click",
nextToken.term().equals("click") == true);
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "http://lucene.apache.org",
new String(token.termBuffer(), 0, token.termLength()).equals("http://lucene.apache.org") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
nextToken = tf.next(reusableToken);
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
assertTrue(nextToken.term() + " is not equal to " + "http://lucene.apache.org",
nextToken.term().equals("http://lucene.apache.org") == true);
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "here",
new String(token.termBuffer(), 0, token.termLength()).equals("here") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 0, token.getPositionIncrement() == 0);
nextToken = tf.next(reusableToken);
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
assertTrue(nextToken.term() + " is not equal to " + "here",
nextToken.term().equals("here") == true);
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 0, nextToken.getPositionIncrement() == 0);
token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "again",
new String(token.termBuffer(), 0, token.termLength()).equals("again") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
nextToken = tf.next(reusableToken);
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
assertTrue(nextToken.term() + " is not equal to " + "again",
nextToken.term().equals("again") == true);
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "a",
new String(token.termBuffer(), 0, token.termLength()).equals("a") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
nextToken = tf.next(reusableToken);
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
assertTrue(nextToken.term() + " is not equal to " + "a",
nextToken.term().equals("a") == true);
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "b",
new String(token.termBuffer(), 0, token.termLength()).equals("b") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
nextToken = tf.next(reusableToken);
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
assertTrue(nextToken.term() + " is not equal to " + "b",
nextToken.term().equals("b") == true);
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "c",
new String(token.termBuffer(), 0, token.termLength()).equals("c") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
nextToken = tf.next(reusableToken);
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
assertTrue(nextToken.term() + " is not equal to " + "c",
nextToken.term().equals("c") == true);
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "d",
new String(token.termBuffer(), 0, token.termLength()).equals("d") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
nextToken = tf.next(reusableToken);
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
assertTrue(nextToken.term() + " is not equal to " + "d",
nextToken.term().equals("d") == true);
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
token = tf.next();
assertTrue("token is not null and it should be", token == null);
nextToken = tf.next(reusableToken);
assertTrue("nextToken is not null and it should be", nextToken == null);
}
public void testLinks() throws Exception {
String test = "[http://lucene.apache.org/java/docs/index.html#news here] [http://lucene.apache.org/java/docs/index.html?b=c here] [https://lucene.apache.org/java/docs/index.html?b=c here]";
WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(test));
Token token = new Token();
token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "http://lucene.apache.org/java/docs/index.html#news",
new String(token.termBuffer(), 0, token.termLength()).equals("http://lucene.apache.org/java/docs/index.html#news") == true);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.EXTERNAL_LINK_URL, token.type().equals(WikipediaTokenizer.EXTERNAL_LINK_URL) == true);
tf.next(token);//skip here
token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "http://lucene.apache.org/java/docs/index.html?b=c",
new String(token.termBuffer(), 0, token.termLength()).equals("http://lucene.apache.org/java/docs/index.html?b=c") == true);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.EXTERNAL_LINK_URL, token.type().equals(WikipediaTokenizer.EXTERNAL_LINK_URL) == true);
tf.next(token);//skip here
token = tf.next(token);
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "https://lucene.apache.org/java/docs/index.html?b=c",
new String(token.termBuffer(), 0, token.termLength()).equals("https://lucene.apache.org/java/docs/index.html?b=c") == true);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.EXTERNAL_LINK_URL, token.type().equals(WikipediaTokenizer.EXTERNAL_LINK_URL) == true);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
final Token reusableToken = new Token();
Token nextToken = tf.next(reusableToken);
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
assertTrue(nextToken.term() + " is not equal to " + "http://lucene.apache.org/java/docs/index.html#news",
nextToken.term().equals("http://lucene.apache.org/java/docs/index.html#news") == true);
assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.EXTERNAL_LINK_URL, nextToken.type().equals(WikipediaTokenizer.EXTERNAL_LINK_URL) == true);
tf.next(reusableToken);//skip here
nextToken = tf.next(reusableToken);
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
assertTrue(nextToken.term() + " is not equal to " + "http://lucene.apache.org/java/docs/index.html?b=c",
nextToken.term().equals("http://lucene.apache.org/java/docs/index.html?b=c") == true);
assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.EXTERNAL_LINK_URL, nextToken.type().equals(WikipediaTokenizer.EXTERNAL_LINK_URL) == true);
tf.next(reusableToken);//skip here
nextToken = tf.next(reusableToken);
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
assertTrue(nextToken.term() + " is not equal to " + "https://lucene.apache.org/java/docs/index.html?b=c",
nextToken.term().equals("https://lucene.apache.org/java/docs/index.html?b=c") == true);
assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.EXTERNAL_LINK_URL, nextToken.type().equals(WikipediaTokenizer.EXTERNAL_LINK_URL) == true);
nextToken = tf.next(reusableToken);
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
token = tf.next();
assertTrue("token is not null and it should be", token == null);
nextToken = tf.next(reusableToken);
assertTrue("nextToken is not null and it should be", nextToken == null);
}
@ -277,72 +277,72 @@ public class WikipediaTokenizerTest extends TestCase {
checkLinkPhrases(tf);
String test = "[[Category:a b c d]] [[Category:e f g]] [[link here]] [[link there]] ''italics here'' something ''more italics'' [[Category:h i j]]";
tf = new WikipediaTokenizer(new StringReader(test), WikipediaTokenizer.UNTOKENIZED_ONLY, untoks);
Token token;
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "a b c d",
new String(token.termBuffer(), 0, token.termLength()).equals("a b c d") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.startOffset() + " does not equal: " + 11, token.startOffset() == 11);
assertTrue(token.endOffset() + " does not equal: " + 18, token.endOffset() == 18);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "e f g",
new String(token.termBuffer(), 0, token.termLength()).equals("e f g") == true);
assertTrue(token.startOffset() + " does not equal: " + 32, token.startOffset() == 32);
assertTrue(token.endOffset() + " does not equal: " + 37, token.endOffset() == 37);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "link",
new String(token.termBuffer(), 0, token.termLength()).equals("link") == true);
assertTrue(token.startOffset() + " does not equal: " + 42, token.startOffset() == 42);
assertTrue(token.endOffset() + " does not equal: " + 46, token.endOffset() == 46);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "here",
new String(token.termBuffer(), 0, token.termLength()).equals("here") == true);
assertTrue(token.startOffset() + " does not equal: " + 47, token.startOffset() == 47);
assertTrue(token.endOffset() + " does not equal: " + 51, token.endOffset() == 51);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "link",
new String(token.termBuffer(), 0, token.termLength()).equals("link") == true);
assertTrue(token.startOffset() + " does not equal: " + 56, token.startOffset() == 56);
assertTrue(token.endOffset() + " does not equal: " + 60, token.endOffset() == 60);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "there",
new String(token.termBuffer(), 0, token.termLength()).equals("there") == true);
assertTrue(token.startOffset() + " does not equal: " + 61, token.startOffset() == 61);
assertTrue(token.endOffset() + " does not equal: " + 66, token.endOffset() == 66);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "italics here",
new String(token.termBuffer(), 0, token.termLength()).equals("italics here") == true);
assertTrue(token.startOffset() + " does not equal: " + 71, token.startOffset() == 71);
assertTrue(token.endOffset() + " does not equal: " + 83, token.endOffset() == 83);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "something",
new String(token.termBuffer(), 0, token.termLength()).equals("something") == true);
assertTrue(token.startOffset() + " does not equal: " + 86, token.startOffset() == 86);
assertTrue(token.endOffset() + " does not equal: " + 95, token.endOffset() == 95);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "more italics",
new String(token.termBuffer(), 0, token.termLength()).equals("more italics") == true);
assertTrue(token.startOffset() + " does not equal: " + 98, token.startOffset() == 98);
assertTrue(token.endOffset() + " does not equal: " + 110, token.endOffset() == 110);
final Token reusableToken = new Token();
Token nextToken = tf.next(reusableToken);
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
assertTrue(nextToken.term() + " is not equal to " + "a b c d",
nextToken.term().equals("a b c d") == true);
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
assertTrue(nextToken.startOffset() + " does not equal: " + 11, nextToken.startOffset() == 11);
assertTrue(nextToken.endOffset() + " does not equal: " + 18, nextToken.endOffset() == 18);
nextToken = tf.next(reusableToken);
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
assertTrue(nextToken.term() + " is not equal to " + "e f g",
nextToken.term().equals("e f g") == true);
assertTrue(nextToken.startOffset() + " does not equal: " + 32, nextToken.startOffset() == 32);
assertTrue(nextToken.endOffset() + " does not equal: " + 37, nextToken.endOffset() == 37);
nextToken = tf.next(reusableToken);
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
assertTrue(nextToken.term() + " is not equal to " + "link",
nextToken.term().equals("link") == true);
assertTrue(nextToken.startOffset() + " does not equal: " + 42, nextToken.startOffset() == 42);
assertTrue(nextToken.endOffset() + " does not equal: " + 46, nextToken.endOffset() == 46);
nextToken = tf.next(reusableToken);
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
assertTrue(nextToken.term() + " is not equal to " + "here",
nextToken.term().equals("here") == true);
assertTrue(nextToken.startOffset() + " does not equal: " + 47, nextToken.startOffset() == 47);
assertTrue(nextToken.endOffset() + " does not equal: " + 51, nextToken.endOffset() == 51);
nextToken = tf.next(reusableToken);
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
assertTrue(nextToken.term() + " is not equal to " + "link",
nextToken.term().equals("link") == true);
assertTrue(nextToken.startOffset() + " does not equal: " + 56, nextToken.startOffset() == 56);
assertTrue(nextToken.endOffset() + " does not equal: " + 60, nextToken.endOffset() == 60);
nextToken = tf.next(reusableToken);
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
assertTrue(nextToken.term() + " is not equal to " + "there",
nextToken.term().equals("there") == true);
assertTrue(nextToken.startOffset() + " does not equal: " + 61, nextToken.startOffset() == 61);
assertTrue(nextToken.endOffset() + " does not equal: " + 66, nextToken.endOffset() == 66);
nextToken = tf.next(reusableToken);
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
assertTrue(nextToken.term() + " is not equal to " + "italics here",
nextToken.term().equals("italics here") == true);
assertTrue(nextToken.startOffset() + " does not equal: " + 71, nextToken.startOffset() == 71);
assertTrue(nextToken.endOffset() + " does not equal: " + 83, nextToken.endOffset() == 83);
nextToken = tf.next(reusableToken);
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
assertTrue(nextToken.term() + " is not equal to " + "something",
nextToken.term().equals("something") == true);
assertTrue(nextToken.startOffset() + " does not equal: " + 86, nextToken.startOffset() == 86);
assertTrue(nextToken.endOffset() + " does not equal: " + 95, nextToken.endOffset() == 95);
nextToken = tf.next(reusableToken);
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
assertTrue(nextToken.term() + " is not equal to " + "more italics",
nextToken.term().equals("more italics") == true);
assertTrue(nextToken.startOffset() + " does not equal: " + 98, nextToken.startOffset() == 98);
assertTrue(nextToken.endOffset() + " does not equal: " + 110, nextToken.endOffset() == 110);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "h i j",
new String(token.termBuffer(), 0, token.termLength()).equals("h i j") == true);
assertTrue(token.startOffset() + " does not equal: " + 124, token.startOffset() == 124);
assertTrue(token.endOffset() + " does not equal: " + 133, token.endOffset() == 133);
nextToken = tf.next(reusableToken);
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
assertTrue(nextToken.term() + " is not equal to " + "h i j",
nextToken.term().equals("h i j") == true);
assertTrue(nextToken.startOffset() + " does not equal: " + 124, nextToken.startOffset() == 124);
assertTrue(nextToken.endOffset() + " does not equal: " + 133, nextToken.endOffset() == 133);
token = tf.next();
assertTrue("token is not null and it should be", token == null);
nextToken = tf.next(reusableToken);
assertTrue("nextToken is not null and it should be", nextToken == null);
}
public void testBoth() throws Exception {
@ -352,225 +352,225 @@ public class WikipediaTokenizerTest extends TestCase {
String test = "[[Category:a b c d]] [[Category:e f g]] [[link here]] [[link there]] ''italics here'' something ''more italics'' [[Category:h i j]]";
//should output all the individual tokens plus the untokenized tokens as well. Untokenized tokens
WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(test), WikipediaTokenizer.BOTH, untoks);
Token token;
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "a b c d",
new String(token.termBuffer(), 0, token.termLength()).equals("a b c d") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
assertTrue(token.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, token.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
assertTrue(token.startOffset() + " does not equal: " + 11, token.startOffset() == 11);
assertTrue(token.endOffset() + " does not equal: " + 18, token.endOffset() == 18);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "a",
new String(token.termBuffer(), 0, token.termLength()).equals("a") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 0, token.getPositionIncrement() == 0);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
assertTrue(token.getFlags() + " equals: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG + " and it shouldn't", token.getFlags() != WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
assertTrue(token.startOffset() + " does not equal: " + 11, token.startOffset() == 11);
assertTrue(token.endOffset() + " does not equal: " + 12, token.endOffset() == 12);
final Token reusableToken = new Token();
Token nextToken = tf.next(reusableToken);
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
assertTrue(nextToken.term() + " is not equal to " + "a b c d",
nextToken.term().equals("a b c d") == true);
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true);
assertTrue(nextToken.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, nextToken.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
assertTrue(nextToken.startOffset() + " does not equal: " + 11, nextToken.startOffset() == 11);
assertTrue(nextToken.endOffset() + " does not equal: " + 18, nextToken.endOffset() == 18);
nextToken = tf.next(reusableToken);
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
assertTrue(nextToken.term() + " is not equal to " + "a",
nextToken.term().equals("a") == true);
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 0, nextToken.getPositionIncrement() == 0);
assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true);
assertTrue(nextToken.getFlags() + " equals: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG + " and it shouldn't", nextToken.getFlags() != WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
assertTrue(nextToken.startOffset() + " does not equal: " + 11, nextToken.startOffset() == 11);
assertTrue(nextToken.endOffset() + " does not equal: " + 12, nextToken.endOffset() == 12);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "b",
new String(token.termBuffer(), 0, token.termLength()).equals("b") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
assertTrue(token.startOffset() + " does not equal: " + 13, token.startOffset() == 13);
assertTrue(token.endOffset() + " does not equal: " + 14, token.endOffset() == 14);
nextToken = tf.next(reusableToken);
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
assertTrue(nextToken.term() + " is not equal to " + "b",
nextToken.term().equals("b") == true);
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true);
assertTrue(nextToken.startOffset() + " does not equal: " + 13, nextToken.startOffset() == 13);
assertTrue(nextToken.endOffset() + " does not equal: " + 14, nextToken.endOffset() == 14);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "c",
new String(token.termBuffer(), 0, token.termLength()).equals("c") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
assertTrue(token.startOffset() + " does not equal: " + 15, token.startOffset() == 15);
assertTrue(token.endOffset() + " does not equal: " + 16, token.endOffset() == 16);
nextToken = tf.next(reusableToken);
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
assertTrue(nextToken.term() + " is not equal to " + "c",
nextToken.term().equals("c") == true);
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true);
assertTrue(nextToken.startOffset() + " does not equal: " + 15, nextToken.startOffset() == 15);
assertTrue(nextToken.endOffset() + " does not equal: " + 16, nextToken.endOffset() == 16);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "d",
new String(token.termBuffer(), 0, token.termLength()).equals("d") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
assertTrue(token.startOffset() + " does not equal: " + 17, token.startOffset() == 17);
assertTrue(token.endOffset() + " does not equal: " + 18, token.endOffset() == 18);
nextToken = tf.next(reusableToken);
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
assertTrue(nextToken.term() + " is not equal to " + "d",
nextToken.term().equals("d") == true);
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true);
assertTrue(nextToken.startOffset() + " does not equal: " + 17, nextToken.startOffset() == 17);
assertTrue(nextToken.endOffset() + " does not equal: " + 18, nextToken.endOffset() == 18);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "e f g",
new String(token.termBuffer(), 0, token.termLength()).equals("e f g") == true);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
assertTrue(token.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, token.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
assertTrue(token.startOffset() + " does not equal: " + 32, token.startOffset() == 32);
assertTrue(token.endOffset() + " does not equal: " + 37, token.endOffset() == 37);
nextToken = tf.next(reusableToken);
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
assertTrue(nextToken.term() + " is not equal to " + "e f g",
nextToken.term().equals("e f g") == true);
assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true);
assertTrue(nextToken.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, nextToken.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
assertTrue(nextToken.startOffset() + " does not equal: " + 32, nextToken.startOffset() == 32);
assertTrue(nextToken.endOffset() + " does not equal: " + 37, nextToken.endOffset() == 37);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "e",
new String(token.termBuffer(), 0, token.termLength()).equals("e") == true);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 0, token.getPositionIncrement() == 0);
assertTrue(token.startOffset() + " does not equal: " + 32, token.startOffset() == 32);
assertTrue(token.endOffset() + " does not equal: " + 33, token.endOffset() == 33);
nextToken = tf.next(reusableToken);
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
assertTrue(nextToken.term() + " is not equal to " + "e",
nextToken.term().equals("e") == true);
assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true);
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 0, nextToken.getPositionIncrement() == 0);
assertTrue(nextToken.startOffset() + " does not equal: " + 32, nextToken.startOffset() == 32);
assertTrue(nextToken.endOffset() + " does not equal: " + 33, nextToken.endOffset() == 33);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "f",
new String(token.termBuffer(), 0, token.termLength()).equals("f") == true);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.startOffset() + " does not equal: " + 34, token.startOffset() == 34);
assertTrue(token.endOffset() + " does not equal: " + 35, token.endOffset() == 35);
nextToken = tf.next(reusableToken);
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
assertTrue(nextToken.term() + " is not equal to " + "f",
nextToken.term().equals("f") == true);
assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true);
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
assertTrue(nextToken.startOffset() + " does not equal: " + 34, nextToken.startOffset() == 34);
assertTrue(nextToken.endOffset() + " does not equal: " + 35, nextToken.endOffset() == 35);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "g",
new String(token.termBuffer(), 0, token.termLength()).equals("g") == true);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.startOffset() + " does not equal: " + 36, token.startOffset() == 36);
assertTrue(token.endOffset() + " does not equal: " + 37, token.endOffset() == 37);
nextToken = tf.next(reusableToken);
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
assertTrue(nextToken.term() + " is not equal to " + "g",
nextToken.term().equals("g") == true);
assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true);
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
assertTrue(nextToken.startOffset() + " does not equal: " + 36, nextToken.startOffset() == 36);
assertTrue(nextToken.endOffset() + " does not equal: " + 37, nextToken.endOffset() == 37);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "link",
new String(token.termBuffer(), 0, token.termLength()).equals("link") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, token.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true);
assertTrue(token.startOffset() + " does not equal: " + 42, token.startOffset() == 42);
assertTrue(token.endOffset() + " does not equal: " + 46, token.endOffset() == 46);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "here",
new String(token.termBuffer(), 0, token.termLength()).equals("here") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, token.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true);
assertTrue(token.startOffset() + " does not equal: " + 47, token.startOffset() == 47);
assertTrue(token.endOffset() + " does not equal: " + 51, token.endOffset() == 51);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "link",
new String(token.termBuffer(), 0, token.termLength()).equals("link") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.startOffset() + " does not equal: " + 56, token.startOffset() == 56);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, token.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true);
assertTrue(token.endOffset() + " does not equal: " + 60, token.endOffset() == 60);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "there",
new String(token.termBuffer(), 0, token.termLength()).equals("there") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, token.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true);
assertTrue(token.startOffset() + " does not equal: " + 61, token.startOffset() == 61);
assertTrue(token.endOffset() + " does not equal: " + 66, token.endOffset() == 66);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "italics here",
new String(token.termBuffer(), 0, token.termLength()).equals("italics here") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.ITALICS, token.type().equals(WikipediaTokenizer.ITALICS) == true);
assertTrue(token.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, token.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
assertTrue(token.startOffset() + " does not equal: " + 71, token.startOffset() == 71);
assertTrue(token.endOffset() + " does not equal: " + 83, token.endOffset() == 83);
nextToken = tf.next(reusableToken);
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
assertTrue(nextToken.term() + " is not equal to " + "link",
nextToken.term().equals("link") == true);
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, nextToken.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true);
assertTrue(nextToken.startOffset() + " does not equal: " + 42, nextToken.startOffset() == 42);
assertTrue(nextToken.endOffset() + " does not equal: " + 46, nextToken.endOffset() == 46);
nextToken = tf.next(reusableToken);
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
assertTrue(nextToken.term() + " is not equal to " + "here",
nextToken.term().equals("here") == true);
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, nextToken.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true);
assertTrue(nextToken.startOffset() + " does not equal: " + 47, nextToken.startOffset() == 47);
assertTrue(nextToken.endOffset() + " does not equal: " + 51, nextToken.endOffset() == 51);
nextToken = tf.next(reusableToken);
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
assertTrue(nextToken.term() + " is not equal to " + "link",
nextToken.term().equals("link") == true);
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
assertTrue(nextToken.startOffset() + " does not equal: " + 56, nextToken.startOffset() == 56);
assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, nextToken.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true);
assertTrue(nextToken.endOffset() + " does not equal: " + 60, nextToken.endOffset() == 60);
nextToken = tf.next(reusableToken);
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
assertTrue(nextToken.term() + " is not equal to " + "there",
nextToken.term().equals("there") == true);
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, nextToken.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true);
assertTrue(nextToken.startOffset() + " does not equal: " + 61, nextToken.startOffset() == 61);
assertTrue(nextToken.endOffset() + " does not equal: " + 66, nextToken.endOffset() == 66);
nextToken = tf.next(reusableToken);
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
assertTrue(nextToken.term() + " is not equal to " + "italics here",
nextToken.term().equals("italics here") == true);
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.ITALICS, nextToken.type().equals(WikipediaTokenizer.ITALICS) == true);
assertTrue(nextToken.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, nextToken.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
assertTrue(nextToken.startOffset() + " does not equal: " + 71, nextToken.startOffset() == 71);
assertTrue(nextToken.endOffset() + " does not equal: " + 83, nextToken.endOffset() == 83);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "italics",
new String(token.termBuffer(), 0, token.termLength()).equals("italics") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 0, token.getPositionIncrement() == 0);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.ITALICS, token.type().equals(WikipediaTokenizer.ITALICS) == true);
assertTrue(token.startOffset() + " does not equal: " + 71, token.startOffset() == 71);
assertTrue(token.endOffset() + " does not equal: " + 78, token.endOffset() == 78);
nextToken = tf.next(reusableToken);
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
assertTrue(nextToken.term() + " is not equal to " + "italics",
nextToken.term().equals("italics") == true);
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 0, nextToken.getPositionIncrement() == 0);
assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.ITALICS, nextToken.type().equals(WikipediaTokenizer.ITALICS) == true);
assertTrue(nextToken.startOffset() + " does not equal: " + 71, nextToken.startOffset() == 71);
assertTrue(nextToken.endOffset() + " does not equal: " + 78, nextToken.endOffset() == 78);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "here",
new String(token.termBuffer(), 0, token.termLength()).equals("here") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.ITALICS, token.type().equals(WikipediaTokenizer.ITALICS) == true);
assertTrue(token.startOffset() + " does not equal: " + 79, token.startOffset() == 79);
assertTrue(token.endOffset() + " does not equal: " + 83, token.endOffset() == 83);
nextToken = tf.next(reusableToken);
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
assertTrue(nextToken.term() + " is not equal to " + "here",
nextToken.term().equals("here") == true);
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.ITALICS, nextToken.type().equals(WikipediaTokenizer.ITALICS) == true);
assertTrue(nextToken.startOffset() + " does not equal: " + 79, nextToken.startOffset() == 79);
assertTrue(nextToken.endOffset() + " does not equal: " + 83, nextToken.endOffset() == 83);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "something",
new String(token.termBuffer(), 0, token.termLength()).equals("something") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.startOffset() + " does not equal: " + 86, token.startOffset() == 86);
assertTrue(token.endOffset() + " does not equal: " + 95, token.endOffset() == 95);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "more italics",
new String(token.termBuffer(), 0, token.termLength()).equals("more italics") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.ITALICS, token.type().equals(WikipediaTokenizer.ITALICS) == true);
assertTrue(token.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, token.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
assertTrue(token.startOffset() + " does not equal: " + 98, token.startOffset() == 98);
assertTrue(token.endOffset() + " does not equal: " + 110, token.endOffset() == 110);
nextToken = tf.next(reusableToken);
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
assertTrue(nextToken.term() + " is not equal to " + "something",
nextToken.term().equals("something") == true);
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
assertTrue(nextToken.startOffset() + " does not equal: " + 86, nextToken.startOffset() == 86);
assertTrue(nextToken.endOffset() + " does not equal: " + 95, nextToken.endOffset() == 95);
nextToken = tf.next(reusableToken);
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
assertTrue(nextToken.term() + " is not equal to " + "more italics",
nextToken.term().equals("more italics") == true);
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.ITALICS, nextToken.type().equals(WikipediaTokenizer.ITALICS) == true);
assertTrue(nextToken.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, nextToken.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
assertTrue(nextToken.startOffset() + " does not equal: " + 98, nextToken.startOffset() == 98);
assertTrue(nextToken.endOffset() + " does not equal: " + 110, nextToken.endOffset() == 110);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "more",
new String(token.termBuffer(), 0, token.termLength()).equals("more") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 0, token.getPositionIncrement() == 0);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.ITALICS, token.type().equals(WikipediaTokenizer.ITALICS) == true);
assertTrue(token.startOffset() + " does not equal: " + 98, token.startOffset() == 98);
assertTrue(token.endOffset() + " does not equal: " + 102, token.endOffset() == 102);
nextToken = tf.next(reusableToken);
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
assertTrue(nextToken.term() + " is not equal to " + "more",
nextToken.term().equals("more") == true);
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 0, nextToken.getPositionIncrement() == 0);
assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.ITALICS, nextToken.type().equals(WikipediaTokenizer.ITALICS) == true);
assertTrue(nextToken.startOffset() + " does not equal: " + 98, nextToken.startOffset() == 98);
assertTrue(nextToken.endOffset() + " does not equal: " + 102, nextToken.endOffset() == 102);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "italics",
new String(token.termBuffer(), 0, token.termLength()).equals("italics") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.ITALICS, token.type().equals(WikipediaTokenizer.ITALICS) == true);
nextToken = tf.next(reusableToken);
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
assertTrue(nextToken.term() + " is not equal to " + "italics",
nextToken.term().equals("italics") == true);
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.ITALICS, nextToken.type().equals(WikipediaTokenizer.ITALICS) == true);
assertTrue(token.startOffset() + " does not equal: " + 103, token.startOffset() == 103);
assertTrue(token.endOffset() + " does not equal: " + 110, token.endOffset() == 110);
assertTrue(nextToken.startOffset() + " does not equal: " + 103, nextToken.startOffset() == 103);
assertTrue(nextToken.endOffset() + " does not equal: " + 110, nextToken.endOffset() == 110);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "h i j",
new String(token.termBuffer(), 0, token.termLength()).equals("h i j") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
assertTrue(token.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, token.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
assertTrue(token.startOffset() + " does not equal: " + 124, token.startOffset() == 124);
assertTrue(token.endOffset() + " does not equal: " + 133, token.endOffset() == 133);
nextToken = tf.next(reusableToken);
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
assertTrue(nextToken.term() + " is not equal to " + "h i j",
nextToken.term().equals("h i j") == true);
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true);
assertTrue(nextToken.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, nextToken.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
assertTrue(nextToken.startOffset() + " does not equal: " + 124, nextToken.startOffset() == 124);
assertTrue(nextToken.endOffset() + " does not equal: " + 133, nextToken.endOffset() == 133);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "h",
new String(token.termBuffer(), 0, token.termLength()).equals("h") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 0, token.getPositionIncrement() == 0);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
assertTrue(token.startOffset() + " does not equal: " + 124, token.startOffset() == 124);
assertTrue(token.endOffset() + " does not equal: " + 125, token.endOffset() == 125);
nextToken = tf.next(reusableToken);
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
assertTrue(nextToken.term() + " is not equal to " + "h",
nextToken.term().equals("h") == true);
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 0, nextToken.getPositionIncrement() == 0);
assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true);
assertTrue(nextToken.startOffset() + " does not equal: " + 124, nextToken.startOffset() == 124);
assertTrue(nextToken.endOffset() + " does not equal: " + 125, nextToken.endOffset() == 125);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "i",
new String(token.termBuffer(), 0, token.termLength()).equals("i") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
assertTrue(token.startOffset() + " does not equal: " + 128, token.startOffset() == 128);
assertTrue(token.endOffset() + " does not equal: " + 129, token.endOffset() == 129);
token = tf.next();
assertTrue("token is null and it shouldn't be", token != null);
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "j",
new String(token.termBuffer(), 0, token.termLength()).equals("j") == true);
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
assertTrue(token.startOffset() + " does not equal: " + 132, token.startOffset() == 132);
assertTrue(token.endOffset() + " does not equal: " + 133, token.endOffset() == 133);
nextToken = tf.next(reusableToken);
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
assertTrue(nextToken.term() + " is not equal to " + "i",
nextToken.term().equals("i") == true);
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true);
assertTrue(nextToken.startOffset() + " does not equal: " + 128, nextToken.startOffset() == 128);
assertTrue(nextToken.endOffset() + " does not equal: " + 129, nextToken.endOffset() == 129);
nextToken = tf.next(reusableToken);
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
assertTrue(nextToken.term() + " is not equal to " + "j",
nextToken.term().equals("j") == true);
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true);
assertTrue(nextToken.startOffset() + " does not equal: " + 132, nextToken.startOffset() == 132);
assertTrue(nextToken.endOffset() + " does not equal: " + 133, nextToken.endOffset() == 133);
token = tf.next();
assertTrue("token is not null and it should be", token == null);
nextToken = tf.next(reusableToken);
assertTrue("nextToken is not null and it should be", nextToken == null);
}
}

View File

@ -17,14 +17,28 @@ package org.apache.lucene.wordnet;
* limitations under the License.
*/
import org.apache.lucene.store.*;
import org.apache.lucene.search.*;
import org.apache.lucene.index.*;
import org.apache.lucene.document.*;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.standard.*;
import java.io.*;
import java.util.*;
import java.io.IOException;
import java.io.StringReader;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.FSDirectory;
/**
@ -99,10 +113,10 @@ public final class SynExpand {
// [1] Parse query into separate words so that when we expand we can avoid dups
TokenStream ts = a.tokenStream( field, new StringReader( query));
org.apache.lucene.analysis.Token t;
while ( (t = ts.next()) != null)
{
String word = t.termText();
final Token reusableToken = new Token();
for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) {
String word = nextToken.term();
if ( already.add( word))
top.add( word);
}
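
The SynExpand hunk above replaces the old while ((t = ts.next()) != null) loop with the single-Token reuse idiom this commit introduces: one Token is allocated up front, handed to next() on every call, and the returned token is the one to read. A minimal standalone sketch of that consumption pattern follows; the field name "contents", the sample text, and the choice of StandardAnalyzer are assumptions made purely for illustration.

    import java.io.IOException;
    import java.io.StringReader;
    import java.util.HashSet;
    import java.util.Set;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;

    public class ReuseTokenLoopSketch {
      public static void main(String[] args) throws IOException {
        // "contents" and the sample text are hypothetical, for illustration only.
        TokenStream ts = new StandardAnalyzer().tokenStream(
            "contents", new StringReader("big dog big dog"));
        Set already = new HashSet();
        // Allocate one Token and pass it to next() on every call; the stream
        // refills it instead of creating a fresh Token per term.
        final Token reusableToken = new Token();
        for (Token nextToken = ts.next(reusableToken); nextToken != null;
             nextToken = ts.next(reusableToken)) {
          String word = nextToken.term();
          if (already.add(word))
            System.out.println(word); // each distinct term printed once
        }
      }
    }

The same loop shape is what SynLookup, LikeThisQueryBuilder, SpanOrTermsBuilder, TermsFilterBuilder and TermsQueryBuilder switch to in the hunks below.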

View File

@ -17,13 +17,27 @@ package org.apache.lucene.wordnet;
* limitations under the License.
*/
import org.apache.lucene.store.*;
import org.apache.lucene.search.*;
import org.apache.lucene.index.*;
import org.apache.lucene.document.*;
import org.apache.lucene.analysis.*;
import java.io.*;
import java.util.*;
import java.io.IOException;
import java.io.StringReader;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.FSDirectory;
/**
@ -86,10 +100,9 @@ public class SynLookup {
// [1] Parse query into separate words so that when we expand we can avoid dups
TokenStream ts = a.tokenStream( field, new StringReader( query));
org.apache.lucene.analysis.Token t;
while ( (t = ts.next()) != null)
{
String word = t.termText();
final Token reusableToken = new Token();
for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) {
String word = nextToken.term();
if ( already.add( word))
top.add( word);
}

View File

@ -74,16 +74,14 @@ public class LikeThisQueryBuilder implements QueryBuilder {
if((stopWords!=null)&&(fields!=null))
{
stopWordsSet=new HashSet();
final Token reusableToken = new Token();
for (int i = 0; i < fields.length; i++)
{
TokenStream ts = analyzer.tokenStream(fields[i],new StringReader(stopWords));
try
{
Token stopToken=ts.next();
while(stopToken!=null)
{
stopWordsSet.add(stopToken.termText());
stopToken=ts.next();
for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) {
stopWordsSet.add(nextToken.term());
}
}
catch(IOException ioe)

View File

@ -52,12 +52,10 @@ public class SpanOrTermsBuilder extends SpanBuilderBase
{
ArrayList clausesList=new ArrayList();
TokenStream ts=analyzer.tokenStream(fieldName,new StringReader(value));
Token token=ts.next();
while(token!=null)
{
SpanTermQuery stq=new SpanTermQuery(new Term(fieldName,token.termText()));
final Token reusableToken = new Token();
for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) {
SpanTermQuery stq=new SpanTermQuery(new Term(fieldName,nextToken.term()));
clausesList.add(stq);
token=ts.next();
}
SpanOrQuery soq=new SpanOrQuery((SpanQuery[]) clausesList.toArray(new SpanQuery[clausesList.size()]));
soq.setBoost(DOMUtils.getAttribute(e,"boost",1.0f));

View File

@ -59,20 +59,18 @@ public class TermsFilterBuilder implements FilterBuilder
try
{
Token token = ts.next();
final Token reusableToken = new Token();
Term term = null;
while (token != null)
{
for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) {
if (term == null)
{
term = new Term(fieldName, token.termText());
term = new Term(fieldName, nextToken.term());
} else
{
// create from previous to save fieldName.intern overhead
term = term.createTerm(token.termText());
term = term.createTerm(nextToken.term());
}
tf.addTerm(term);
token = ts.next();
}
}
catch (IOException ioe)
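
The "create from previous to save fieldName.intern overhead" comment refers to how Term behaves: the two-argument constructor interns the field name, while createTerm() on an existing Term reuses that already-interned field and only swaps in new text, so the per-token cost stays small. A tiny sketch, with a hypothetical field name:

    import org.apache.lucene.index.Term;

    public class CreateTermSketch {
      public static void main(String[] args) {
        // First token: the constructor interns the field name once.
        Term term = new Term("contents", "first");   // "contents" is illustrative
        // Later tokens: createTerm() keeps the interned field and only
        // replaces the text, avoiding a repeated intern() per token.
        term = term.createTerm("second");
        System.out.println(term);                    // prints contents:second
      }
    }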

View File

@ -58,20 +58,18 @@ public class TermsQueryBuilder implements QueryBuilder {
TokenStream ts = analyzer.tokenStream(fieldName, new StringReader(text));
try
{
Token token = ts.next();
final Token reusableToken = new Token();
Term term = null;
while (token != null)
{
for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) {
if (term == null)
{
term = new Term(fieldName, token.termText());
term = new Term(fieldName, nextToken.term());
} else
{
// create from previous to save fieldName.intern overhead
term = term.createTerm(token.termText());
term = term.createTerm(nextToken.term());
}
bq.add(new BooleanClause(new TermQuery(term),BooleanClause.Occur.SHOULD));
token = ts.next();
}
}
catch (IOException ioe)

View File

@ -487,7 +487,10 @@ null)
private int jj_gc = 0;
public HTMLParser(java.io.InputStream stream) {
jj_input_stream = new SimpleCharStream(stream, 1, 1);
this(stream, null);
}
public HTMLParser(java.io.InputStream stream, String encoding) {
try { jj_input_stream = new SimpleCharStream(stream, encoding, 1, 1); } catch(java.io.UnsupportedEncodingException e) { throw new RuntimeException(e); }
token_source = new HTMLParserTokenManager(jj_input_stream);
token = new Token();
jj_ntk = -1;
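
The hunk above adds an encoding-aware constructor to the generated demo HTMLParser, delegating to the new SimpleCharStream(InputStream, String encoding, ...) further down; a null encoding falls back to the platform default. A rough usage sketch, where the file name is made up and getTitle() simply stands in for whatever the caller does with the parser afterwards:

    import java.io.FileInputStream;
    import org.apache.lucene.demo.html.HTMLParser;

    public class ParseWithEncodingSketch {
      public static void main(String[] args) throws Exception {
        // "page.html" is a hypothetical path; "UTF-8" selects the charset
        // the underlying SimpleCharStream uses to decode the bytes.
        HTMLParser parser =
            new HTMLParser(new FileInputStream("page.html"), "UTF-8");
        System.out.println(parser.getTitle());
      }
    }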
@ -497,7 +500,10 @@ null)
}
public void ReInit(java.io.InputStream stream) {
jj_input_stream.ReInit(stream, 1, 1);
ReInit(stream, null);
}
public void ReInit(java.io.InputStream stream, String encoding) {
try { jj_input_stream.ReInit(stream, encoding, 1, 1); } catch(java.io.UnsupportedEncodingException e) { throw new RuntimeException(e); }
token_source.ReInit(jj_input_stream);
token = new Token();
jj_ntk = -1;
@ -627,7 +633,9 @@ null)
jj_lasttokens[jj_endpos++] = kind;
} else if (jj_endpos != 0) {
jj_expentry = new int[jj_endpos];
System.arraycopy(jj_lasttokens, 0, jj_expentry, 0, jj_endpos);
for (int i = 0; i < jj_endpos; i++) {
jj_expentry[i] = jj_lasttokens[i];
}
boolean exists = false;
for (java.util.Enumeration e = jj_expentries.elements(); e.hasMoreElements();) {
int[] oldentry = (int[])(e.nextElement());
@ -692,6 +700,7 @@ null)
final private void jj_rescan_token() {
jj_rescan = true;
for (int i = 0; i < 2; i++) {
try {
JJCalls p = jj_2_rtns[i];
do {
if (p.gen > jj_gen) {
@ -703,6 +712,7 @@ null)
}
p = p.next;
} while (p != null);
} catch(LookaheadSuccess ls) { }
}
jj_rescan = false;
}

View File

@ -1457,14 +1457,12 @@ protected SimpleCharStream input_stream;
private final int[] jjrounds = new int[28];
private final int[] jjstateSet = new int[56];
protected char curChar;
public HTMLParserTokenManager(SimpleCharStream stream)
{
public HTMLParserTokenManager(SimpleCharStream stream){
if (SimpleCharStream.staticFlag)
throw new Error("ERROR: Cannot use a static CharStream class with a non-static lexical analyzer.");
input_stream = stream;
}
public HTMLParserTokenManager(SimpleCharStream stream, int lexState)
{
public HTMLParserTokenManager(SimpleCharStream stream, int lexState){
this(stream);
SwitchTo(lexState);
}

View File

@ -98,19 +98,19 @@ public class ParseException extends Exception {
if (!specialConstructor) {
return super.getMessage();
}
String expected = "";
StringBuffer expected = new StringBuffer();
int maxSize = 0;
for (int i = 0; i < expectedTokenSequences.length; i++) {
if (maxSize < expectedTokenSequences[i].length) {
maxSize = expectedTokenSequences[i].length;
}
for (int j = 0; j < expectedTokenSequences[i].length; j++) {
expected += tokenImage[expectedTokenSequences[i][j]] + " ";
expected.append(tokenImage[expectedTokenSequences[i][j]]).append(" ");
}
if (expectedTokenSequences[i][expectedTokenSequences[i].length - 1] != 0) {
expected += "...";
expected.append("...");
}
expected += eol + " ";
expected.append(eol).append(" ");
}
String retval = "Encountered \"";
Token tok = currentToken.next;
@ -130,7 +130,7 @@ public class ParseException extends Exception {
} else {
retval += "Was expecting one of:" + eol + " ";
}
retval += expected;
retval += expected.toString();
return retval;
}
@ -179,7 +179,7 @@ public class ParseException extends Exception {
default:
if ((ch = str.charAt(i)) < 0x20 || ch > 0x7e) {
String s = "0000" + Integer.toString(ch, 16);
retval.append("\\u").append(s.substring(s.length() - 4, s.length()));
retval.append("\\u" + s.substring(s.length() - 4, s.length()));
} else {
retval.append(ch);
}

View File

@ -1,4 +1,4 @@
/* Generated By:JavaCC: Do not edit this line. SimpleCharStream.java Version 3.0 */
/* Generated By:JavaCC: Do not edit this line. SimpleCharStream.java Version 4.0 */
package org.apache.lucene.demo.html;
/**
@ -27,6 +27,11 @@ public class SimpleCharStream
protected char[] buffer;
protected int maxNextCharInd = 0;
protected int inBuf = 0;
protected int tabSize = 8;
protected void setTabSize(int i) { tabSize = i; }
protected int getTabSize(int i) { return tabSize; }
protected void ExpandBuff(boolean wrapAround)
{
@ -162,7 +167,7 @@ public class SimpleCharStream
break;
case '\t' :
column--;
column += (8 - (column & 07));
column += (tabSize - (column % tabSize));
break;
default :
break;
@ -248,7 +253,7 @@ public class SimpleCharStream
}
public SimpleCharStream(java.io.Reader dstream, int startline,
int startcolumn)
int startcolumn)
{
this(dstream, startline, startcolumn, 4096);
}
@ -277,7 +282,7 @@ public class SimpleCharStream
}
public void ReInit(java.io.Reader dstream, int startline,
int startcolumn)
int startcolumn)
{
ReInit(dstream, startline, startcolumn, 4096);
}
@ -286,35 +291,68 @@ public class SimpleCharStream
{
ReInit(dstream, 1, 1, 4096);
}
public SimpleCharStream(java.io.InputStream dstream, int startline,
int startcolumn, int buffersize)
public SimpleCharStream(java.io.InputStream dstream, String encoding, int startline,
int startcolumn, int buffersize) throws java.io.UnsupportedEncodingException
{
this(new java.io.InputStreamReader(dstream), startline, startcolumn, 4096);
this(encoding == null ? new java.io.InputStreamReader(dstream) : new java.io.InputStreamReader(dstream, encoding), startline, startcolumn, buffersize);
}
public SimpleCharStream(java.io.InputStream dstream, int startline,
int startcolumn)
int startcolumn, int buffersize)
{
this(new java.io.InputStreamReader(dstream), startline, startcolumn, buffersize);
}
public SimpleCharStream(java.io.InputStream dstream, String encoding, int startline,
int startcolumn) throws java.io.UnsupportedEncodingException
{
this(dstream, encoding, startline, startcolumn, 4096);
}
public SimpleCharStream(java.io.InputStream dstream, int startline,
int startcolumn)
{
this(dstream, startline, startcolumn, 4096);
}
public SimpleCharStream(java.io.InputStream dstream, String encoding) throws java.io.UnsupportedEncodingException
{
this(dstream, encoding, 1, 1, 4096);
}
public SimpleCharStream(java.io.InputStream dstream)
{
this(dstream, 1, 1, 4096);
}
public void ReInit(java.io.InputStream dstream, String encoding, int startline,
int startcolumn, int buffersize) throws java.io.UnsupportedEncodingException
{
ReInit(encoding == null ? new java.io.InputStreamReader(dstream) : new java.io.InputStreamReader(dstream, encoding), startline, startcolumn, buffersize);
}
public void ReInit(java.io.InputStream dstream, int startline,
int startcolumn, int buffersize)
{
ReInit(new java.io.InputStreamReader(dstream), startline, startcolumn, 4096);
ReInit(new java.io.InputStreamReader(dstream), startline, startcolumn, buffersize);
}
public void ReInit(java.io.InputStream dstream, String encoding) throws java.io.UnsupportedEncodingException
{
ReInit(dstream, encoding, 1, 1, 4096);
}
public void ReInit(java.io.InputStream dstream)
{
ReInit(dstream, 1, 1, 4096);
}
public void ReInit(java.io.InputStream dstream, String encoding, int startline,
int startcolumn) throws java.io.UnsupportedEncodingException
{
ReInit(dstream, encoding, startline, startcolumn, 4096);
}
public void ReInit(java.io.InputStream dstream, int startline,
int startcolumn)
int startcolumn)
{
ReInit(dstream, startline, startcolumn, 4096);
}
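
The ReInit overloads mirror the new constructors, so the same stream object can be pointed at a second source in a different charset. A small sketch, assuming SimpleCharStream is driven directly; the file names and charsets are placeholders:

import java.io.FileInputStream;
import org.apache.lucene.demo.html.SimpleCharStream;

public class CharStreamReuseExample {   // hypothetical example class
  public static void main(String[] args) throws Exception {
    // First document, decoded as UTF-8.
    SimpleCharStream chars = new SimpleCharStream(new FileInputStream("a.html"), "UTF-8", 1, 1);
    // ... hand chars to a token manager and consume the first document ...
    // Rebind the same object to a second document in a different charset.
    chars.ReInit(new FileInputStream("b.html"), "ISO-8859-1", 1, 1);
  }
}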

View File

@ -72,7 +72,7 @@ public class TokenMgrError extends Error
default:
if ((ch = str.charAt(i)) < 0x20 || ch > 0x7e) {
String s = "0000" + Integer.toString(ch, 16);
retval.append("\\u").append(s.substring(s.length() - 4, s.length()));
retval.append("\\u" + s.substring(s.length() - 4, s.length()));
} else {
retval.append(ch);
}

View File

@ -40,11 +40,12 @@ public class CachingTokenFilter extends TokenFilter {
super(input);
}
public Token next() throws IOException {
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
if (cache == null) {
// fill cache lazily
cache = new LinkedList();
fillCache();
fillCache(reusableToken);
iterator = cache.iterator();
}
@ -52,8 +53,9 @@ public class CachingTokenFilter extends TokenFilter {
// the cache is exhausted, return null
return null;
}
return (Token) iterator.next();
// Since the TokenFilter can be reset, the tokens need to be preserved as immutable.
Token nextToken = (Token) iterator.next();
return (Token) nextToken.clone();
}
public void reset() throws IOException {
@ -62,10 +64,9 @@ public class CachingTokenFilter extends TokenFilter {
}
}
private void fillCache() throws IOException {
Token token;
while ( (token = input.next()) != null) {
cache.add(token);
private void fillCache(final Token reusableToken) throws IOException {
for (Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken)) {
cache.add(nextToken.clone());
}
}
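
Because the reuse API lets a producer overwrite one shared Token on every call, anything that stores tokens across calls must defend itself with clone() both when filling its cache and when replaying it, which is exactly what the two changes above do. A minimal consumer-side sketch of the contract; the input text is a placeholder:

import java.io.StringReader;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;

public class CachingReuseExample {   // hypothetical example class
  public static void main(String[] args) throws Exception {
    TokenStream stream = new CachingTokenFilter(
        new WhitespaceTokenizer(new StringReader("hello world")));
    final Token reusableToken = new Token();
    // First pass fills the cache; the filter clones each token before storing it.
    for (Token t = stream.next(reusableToken); t != null; t = stream.next(reusableToken)) {
      System.out.println(t.term());
    }
    stream.reset();
    // Second pass replays clones from the cache, so mutating reusableToken in the
    // first pass could not have corrupted what comes back here.
    for (Token t = stream.next(reusableToken); t != null; t = stream.next(reusableToken)) {
      System.out.println(t.term());
    }
  }
}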

View File

@ -44,11 +44,12 @@ public abstract class CharTokenizer extends Tokenizer {
return c;
}
public final Token next(Token token) throws IOException {
token.clear();
public final Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
reusableToken.clear();
int length = 0;
int start = bufferIndex;
char[] buffer = token.termBuffer();
char[] buffer = reusableToken.termBuffer();
while (true) {
if (bufferIndex >= dataLen) {
@ -70,7 +71,7 @@ public abstract class CharTokenizer extends Tokenizer {
if (length == 0) // start of token
start = offset + bufferIndex - 1;
else if (length == buffer.length)
buffer = token.resizeTermBuffer(1+length);
buffer = reusableToken.resizeTermBuffer(1+length);
buffer[length++] = normalize(c); // buffer it, normalized
@ -81,10 +82,10 @@ public abstract class CharTokenizer extends Tokenizer {
break; // return 'em
}
token.termLength = length;
token.startOffset = start;
token.endOffset = start+length;
return token;
reusableToken.setTermLength(length);
reusableToken.setStartOffset(start);
reusableToken.setEndOffset(start+length);
return reusableToken;
}
public void reset(Reader input) throws IOException {

View File

@ -32,22 +32,23 @@ public class ISOLatin1AccentFilter extends TokenFilter {
private char[] output = new char[256];
private int outputPos;
public final Token next(Token result) throws java.io.IOException {
result = input.next(result);
if (result != null) {
final char[] buffer = result.termBuffer();
final int length = result.termLength();
public final Token next(final Token reusableToken) throws java.io.IOException {
assert reusableToken != null;
Token nextToken = input.next(reusableToken);
if (nextToken != null) {
final char[] buffer = nextToken.termBuffer();
final int length = nextToken.termLength();
// If no characters actually require rewriting then we
// just return token as-is:
for(int i=0;i<length;i++) {
final char c = buffer[i];
if (c >= '\u00c0' && c <= '\uFB06') {
removeAccents(buffer, length);
result.setTermBuffer(output, 0, outputPos);
nextToken.setTermBuffer(output, 0, outputPos);
break;
}
}
return result;
return nextToken;
} else
return null;
}

View File

@ -38,21 +38,22 @@ public class KeywordTokenizer extends Tokenizer {
this.done = false;
}
public Token next(Token result) throws IOException {
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
if (!done) {
done = true;
int upto = 0;
result.clear();
char[] buffer = result.termBuffer();
reusableToken.clear();
char[] buffer = reusableToken.termBuffer();
while (true) {
final int length = input.read(buffer, upto, buffer.length-upto);
if (length == -1) break;
upto += length;
if (upto == buffer.length)
buffer = result.resizeTermBuffer(1+buffer.length);
buffer = reusableToken.resizeTermBuffer(1+buffer.length);
}
result.termLength = upto;
return result;
reusableToken.setTermLength(upto);
return reusableToken;
}
return null;
}

View File

@ -42,16 +42,17 @@ public final class LengthFilter extends TokenFilter {
}
/**
* Returns the next input Token whose termText() is the right len
* Returns the next input Token whose term() is the right len
*/
public final Token next(Token result) throws IOException
public final Token next(final Token reusableToken) throws IOException
{
assert reusableToken != null;
// return the first token whose length falls within the configured bounds
for (Token token = input.next(result); token != null; token = input.next(result))
for (Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken))
{
int len = token.termLength();
int len = nextToken.termLength();
if (len >= min && len <= max) {
return token;
return nextToken;
}
// note: else we ignore it but should we index each part of it?
}
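
The loop above is the general pattern for a filter on the reuse API: keep pulling into the caller-supplied token, skip what you do not want, and return the first acceptable hit without allocating. A hypothetical filter that drops purely numeric tokens, sketched in the same style; the class name and the digit test are illustrative and not part of this commit:

import java.io.IOException;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;

public final class DigitDropFilter extends TokenFilter {   // hypothetical example
  public DigitDropFilter(TokenStream in) {
    super(in);
  }

  public final Token next(final Token reusableToken) throws IOException {
    assert reusableToken != null;
    for (Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken)) {
      // keep anything that is not purely numeric
      if (!isAllDigits(nextToken.termBuffer(), nextToken.termLength()))
        return nextToken;
      // otherwise skip it and keep pulling from the input
    }
    return null;   // end of stream
  }

  private static boolean isAllDigits(char[] buffer, int length) {
    for (int i = 0; i < length; i++)
      if (!Character.isDigit(buffer[i]))
        return false;
    return length > 0;
  }
}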

View File

@ -29,16 +29,17 @@ public final class LowerCaseFilter extends TokenFilter {
super(in);
}
public final Token next(Token result) throws IOException {
result = input.next(result);
if (result != null) {
public final Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
Token nextToken = input.next(reusableToken);
if (nextToken != null) {
final char[] buffer = result.termBuffer();
final int length = result.termLength;
final char[] buffer = nextToken.termBuffer();
final int length = nextToken.termLength();
for(int i=0;i<length;i++)
buffer[i] = Character.toLowerCase(buffer[i]);
return result;
return nextToken;
} else
return null;
}

View File

@ -45,13 +45,14 @@ public final class PorterStemFilter extends TokenFilter {
stemmer = new PorterStemmer();
}
public final Token next(Token result) throws IOException {
result = input.next(result);
if (result != null) {
if (stemmer.stem(result.termBuffer(), 0, result.termLength))
result.setTermBuffer(stemmer.getResultBuffer(), 0, stemmer.getResultLength());
return result;
} else
public final Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
Token nextToken = input.next(reusableToken);
if (nextToken == null)
return null;
if (stemmer.stem(nextToken.termBuffer(), 0, nextToken.termLength()))
nextToken.setTermBuffer(stemmer.getResultBuffer(), 0, stemmer.getResultLength());
return nextToken;
}
}

View File

@ -22,11 +22,11 @@ public class SinkTokenizer extends Tokenizer {
}
public SinkTokenizer() {
this.lst = new ArrayList();
this.lst = new ArrayList/*<Token>*/();
}
public SinkTokenizer(int initCap){
this.lst = new ArrayList(initCap);
this.lst = new ArrayList/*<Token>*/(initCap);
}
/**
@ -35,6 +35,8 @@ public class SinkTokenizer extends Tokenizer {
* WARNING: Adding tokens to this list requires the {@link #reset()} method to be called in order for them
* to be made available. Also, this Tokenizer does nothing to protect against {@link java.util.ConcurrentModificationException}s
* in the case of adds happening while {@link #next(org.apache.lucene.analysis.Token)} is being called.
* <p/>
* WARNING: Since this SinkTokenizer can be reset and the cached tokens made available again, do not modify them. Modify clones instead.
*
* @return A List of {@link org.apache.lucene.analysis.Token}s
*/
@ -47,9 +49,15 @@ public class SinkTokenizer extends Tokenizer {
* @return The next {@link org.apache.lucene.analysis.Token} in the Sink.
* @throws IOException
*/
public Token next() throws IOException {
public Token next(final Token reusableToken) throws IOException {
assert reusableToken != null;
if (iter == null) iter = lst.iterator();
return iter.hasNext() ? (Token) iter.next() : null;
// Since this TokenStream can be reset we have to maintain the tokens as immutable
if (iter.hasNext()) {
Token nextToken = (Token) iter.next();
return (Token) nextToken.clone();
}
return null;
}
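
Returning clones matters because the sink's list can be replayed while the upstream pipeline keeps mutating its own reusable token. A usage sketch, assuming the companion TeeTokenFilter(TokenStream, SinkTokenizer) from the same analysis package; the input text is a placeholder:

import java.io.StringReader;
import org.apache.lucene.analysis.SinkTokenizer;
import org.apache.lucene.analysis.TeeTokenFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;

public class TeeSinkExample {   // hypothetical example class
  public static void main(String[] args) throws Exception {
    SinkTokenizer sink = new SinkTokenizer();
    // The tee copies every token it passes downstream into the sink as a side effect.
    TokenStream source = new TeeTokenFilter(
        new WhitespaceTokenizer(new StringReader("one two three")), sink);
    final Token reusableToken = new Token();
    for (Token t = source.next(reusableToken); t != null; t = source.next(reusableToken)) {
      // primary consumer of the stream; nothing else to do for this sketch
    }
    // Replay the captured tokens; each call hands back a clone, so a careless caller
    // cannot corrupt the cached originals.
    for (Token t = sink.next(reusableToken); t != null; t = sink.next(reusableToken)) {
      System.out.println(t.term());
    }
  }
}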

Some files were not shown because too many files have changed in this diff.