mirror of https://github.com/apache/lucene.git
LUCENE-1333: improvements to Token reuse API and full cutover to reuse API for all core and contrib analyzers
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@687357 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent 7675606908
commit bb6b711718
@@ -108,6 +108,12 @@ API Changes
 16. LUCENE-1334: Add new constructor for Term: Term(String fieldName)
     which defaults term text to "".  (DM Smith via Mike McCandless)
 
+17. LUCENE-1333: Added Token.reinit(*) APIs to re-initialize (reuse) a
+    Token.  Also added term() method to return a String, with a
+    performance penalty clearly documented.  Also implemented
+    hashCode() and equals() in Token, and fixed all core and contrib
+    analyzers to use the re-use APIs.  (DM Smith via Mike McCandless)
+
 Bug fixes
 
  1. LUCENE-1134: Fixed BooleanQuery.rewrite to only optimize a single
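For readers skimming the diff, entry 17 changes how a TokenStream is consumed: the caller allocates one Token up front and hands it to every next(...) call, so producers can refill it instead of allocating a new Token per term. The fragment below is only an illustrative sketch of that consumer-side pattern, not code from this commit; the analyzer, field name, and sample text are invented.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;

public class TokenReuseExample {
  public static void main(String[] args) throws IOException {
    // Any TokenStream works here; WhitespaceAnalyzer just keeps the sketch small.
    TokenStream ts = new WhitespaceAnalyzer()
        .tokenStream("body", new StringReader("reuse one token instance"));

    // A single Token is allocated once and passed to every next(...) call,
    // so the producer can refill it rather than allocate a new Token per term.
    final Token reusableToken = new Token();
    for (Token nextToken = ts.next(reusableToken);
         nextToken != null;
         nextToken = ts.next(reusableToken)) {
      // term() materializes a String from the internal char[] (the documented cost);
      // termBuffer()/termLength() avoid that extra allocation.
      System.out.println(nextToken.term()
          + " [" + nextToken.startOffset() + "," + nextToken.endOffset() + ")");
    }
    ts.close();
  }
}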
@@ -36,7 +36,6 @@ public final class BrazilianStemFilter extends TokenFilter {
  /**
   * The actual token in the input stream.
   */
  private Token token = null;
  private BrazilianStemmer stemmer = null;
  private Set exclusions = null;

@@ -53,22 +52,23 @@ public final class BrazilianStemFilter extends TokenFilter {
  /**
   * @return Returns the next token in the stream, or null at EOS.
   */
  public final Token next()
  public final Token next(final Token reusableToken)
    throws IOException {
    if ((token = input.next()) == null) {
    assert reusableToken != null;
    Token nextToken = input.next(reusableToken);
    if (nextToken == null)
      return null;

    String term = nextToken.term();

    // Check the exclusion table.
    if (exclusions == null || !exclusions.contains(term)) {
      String s = stemmer.stem(term);
      // If not stemmed, don't waste the time adjusting the token.
      if ((s != null) && !s.equals(term))
        nextToken.setTermBuffer(s);
    }
    // Check the exclusiontable.
    else if (exclusions != null && exclusions.contains(token.termText())) {
      return token;
    } else {
      String s = stemmer.stem(token.termText());
      // If not stemmed, dont waste the time creating a new token.
      if ((s != null) && !s.equals(token.termText())) {
        return new Token(s, token.startOffset(), token.endOffset(), token.type());
      }
      return token;
    }
    return nextToken;
  }
}
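The BrazilianStemFilter hunk above is the filter-side half of the cutover; the tokenizer hunks that follow (CJKTokenizer, ChineseTokenizer, the n-gram tokenizers) show the producer-side half, where next(Token) refills the caller's Token through reinit(...) or setTermBuffer(...). A rough sketch of that producer pattern, illustrative rather than taken from the patch (the class and its fixed term list are invented; the reinit overload mirrors the calls visible in the hunks below):

import java.io.IOException;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

// Illustrative only: a tiny producer that refills the caller's Token
// instead of allocating a new one per term.
public class FixedTermsTokenStream extends TokenStream {
  private final String[] terms;   // hypothetical input, not from the patch
  private int index = 0;
  private int offset = 0;

  public FixedTermsTokenStream(String[] terms) {
    this.terms = terms;
  }

  public Token next(final Token reusableToken) throws IOException {
    assert reusableToken != null;
    if (index == terms.length)
      return null;                 // end of stream
    char[] buffer = terms[index].toCharArray();
    int start = offset;
    offset += buffer.length + 1;   // pretend terms are space-separated
    index++;
    // reinit(...) resets the reusable Token in place: term buffer plus offsets.
    return reusableToken.reinit(buffer, 0, buffer.length, start, start + buffer.length);
  }
}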
@ -26,7 +26,7 @@ import java.io.Reader;
|
|||
/**
|
||||
* CJKTokenizer was modified from StopTokenizer which does a decent job for
|
||||
* most European languages. It performs other token methods for double-byte
|
||||
* Characters: the token will return at each two charactors with overlap match.<br>
|
||||
* Characters: the token will return at each two characters with overlap match.<br>
|
||||
* Example: "java C1C2C3C4" will be segment to: "java" "C1C2" "C2C3" "C3C4" it
|
||||
* also need filter filter zero length token ""<br>
|
||||
* for Digit: digit, '+', '#' will token as letter<br>
|
||||
|
@ -96,24 +96,26 @@ public final class CJKTokenizer extends Tokenizer {
|
|||
* See http://java.sun.com/j2se/1.3/docs/api/java/lang/Character.UnicodeBlock.html
|
||||
* for detail.
|
||||
*
|
||||
* @param reusableToken a reusable token
|
||||
* @return Token
|
||||
*
|
||||
* @throws java.io.IOException - throw IOException when read error <br>
|
||||
* hanppened in the InputStream
|
||||
* happened in the InputStream
|
||||
*
|
||||
*/
|
||||
public final Token next() throws java.io.IOException {
|
||||
public final Token next(final Token reusableToken) throws java.io.IOException {
|
||||
/** how many character(s) has been stored in buffer */
|
||||
assert reusableToken != null;
|
||||
int length = 0;
|
||||
|
||||
/** the position used to create Token */
|
||||
int start = offset;
|
||||
|
||||
while (true) {
|
||||
/** current charactor */
|
||||
/** current character */
|
||||
char c;
|
||||
|
||||
/** unicode block of current charactor for detail */
|
||||
/** unicode block of current character for detail */
|
||||
Character.UnicodeBlock ub;
|
||||
|
||||
offset++;
|
||||
|
@ -198,7 +200,7 @@ public final class CJKTokenizer extends Tokenizer {
|
|||
}
|
||||
}
|
||||
} else {
|
||||
// non-ASCII letter, eg."C1C2C3C4"
|
||||
// non-ASCII letter, e.g."C1C2C3C4"
|
||||
if (Character.isLetter(c)) {
|
||||
if (length == 0) {
|
||||
start = offset - 1;
|
||||
|
@ -236,8 +238,6 @@ public final class CJKTokenizer extends Tokenizer {
|
|||
}
|
||||
}
|
||||
|
||||
return new Token(new String(buffer, 0, length), start, start + length,
|
||||
tokenType
|
||||
);
|
||||
return reusableToken.reinit(buffer, 0, length, start, start+length, tokenType);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -18,7 +18,10 @@ package org.apache.lucene.analysis.cn;
|
|||
*/
|
||||
|
||||
import java.util.Hashtable;
|
||||
import org.apache.lucene.analysis.*;
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
/**
|
||||
* Title: ChineseFilter
|
||||
|
@ -61,10 +64,11 @@ public final class ChineseFilter extends TokenFilter {
|
|||
stopTable.put(STOP_WORDS[i], STOP_WORDS[i]);
|
||||
}
|
||||
|
||||
public final Token next() throws java.io.IOException {
|
||||
public final Token next(final Token reusableToken) throws java.io.IOException {
|
||||
assert reusableToken != null;
|
||||
|
||||
for (Token token = input.next(); token != null; token = input.next()) {
|
||||
String text = token.termText();
|
||||
for (Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken)) {
|
||||
String text = nextToken.term();
|
||||
|
||||
// why not key off token type here assuming ChineseTokenizer comes first?
|
||||
if (stopTable.get(text) == null) {
|
||||
|
@ -75,7 +79,7 @@ public final class ChineseFilter extends TokenFilter {
|
|||
|
||||
// English word/token should larger than 1 character.
|
||||
if (text.length()>1) {
|
||||
return token;
|
||||
return nextToken;
|
||||
}
|
||||
break;
|
||||
case Character.OTHER_LETTER:
|
||||
|
@ -83,7 +87,7 @@ public final class ChineseFilter extends TokenFilter {
|
|||
// One Chinese character as one Chinese word.
|
||||
// Chinese word extraction to be added later here.
|
||||
|
||||
return token;
|
||||
return nextToken;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -19,7 +19,9 @@ package org.apache.lucene.analysis.cn;
|
|||
|
||||
|
||||
import java.io.Reader;
|
||||
import org.apache.lucene.analysis.*;
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
|
||||
|
||||
/**
|
||||
|
@ -75,17 +77,19 @@ public final class ChineseTokenizer extends Tokenizer {
|
|||
|
||||
}
|
||||
|
||||
private final Token flush() {
|
||||
private final Token flush(final Token token) {
|
||||
|
||||
if (length>0) {
|
||||
//System.out.println(new String(buffer, 0, length));
|
||||
return new Token(new String(buffer, 0, length), start, start+length);
|
||||
//System.out.println(new String(buffer, 0,
|
||||
//length));
|
||||
return token.reinit(buffer, 0, length, start, start+length);
|
||||
}
|
||||
else
|
||||
return null;
|
||||
}
|
||||
|
||||
public final Token next() throws java.io.IOException {
|
||||
public final Token next(final Token reusableToken) throws java.io.IOException {
|
||||
assert reusableToken != null;
|
||||
|
||||
length = 0;
|
||||
start = offset;
|
||||
|
@ -101,7 +105,7 @@ public final class ChineseTokenizer extends Tokenizer {
|
|||
bufferIndex = 0;
|
||||
}
|
||||
|
||||
if (dataLen == -1) return flush();
|
||||
if (dataLen == -1) return flush(reusableToken);
|
||||
else
|
||||
c = ioBuffer[bufferIndex++];
|
||||
|
||||
|
@ -112,20 +116,20 @@ public final class ChineseTokenizer extends Tokenizer {
|
|||
case Character.LOWERCASE_LETTER:
|
||||
case Character.UPPERCASE_LETTER:
|
||||
push(c);
|
||||
if (length == MAX_WORD_LEN) return flush();
|
||||
if (length == MAX_WORD_LEN) return flush(reusableToken);
|
||||
break;
|
||||
|
||||
case Character.OTHER_LETTER:
|
||||
if (length>0) {
|
||||
bufferIndex--;
|
||||
offset--;
|
||||
return flush();
|
||||
return flush(reusableToken);
|
||||
}
|
||||
push(c);
|
||||
return flush();
|
||||
return flush(reusableToken);
|
||||
|
||||
default:
|
||||
if (length>0) return flush();
|
||||
if (length>0) return flush(reusableToken);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -105,17 +105,18 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
|
|||
return dict;
|
||||
}
|
||||
|
||||
public Token next() throws IOException {
|
||||
public Token next(final Token reusableToken) throws IOException {
|
||||
assert reusableToken != null;
|
||||
if (tokens.size() > 0) {
|
||||
return (Token)tokens.removeFirst();
|
||||
}
|
||||
|
||||
Token token = input.next();
|
||||
if (token == null) {
|
||||
Token nextToken = input.next(reusableToken);
|
||||
if (nextToken == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
decompose(token);
|
||||
decompose(nextToken);
|
||||
|
||||
if (tokens.size() > 0) {
|
||||
return (Token)tokens.removeFirst();
|
||||
|
@ -145,17 +146,15 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
|
|||
|
||||
protected final Token createToken(final int offset, final int length,
|
||||
final Token prototype) {
|
||||
Token t = new Token(prototype.startOffset() + offset, prototype
|
||||
.startOffset()
|
||||
+ offset + length, prototype.type());
|
||||
t.setTermBuffer(prototype.termBuffer(), offset, length);
|
||||
int newStart = prototype.startOffset() + offset;
|
||||
Token t = prototype.clone(prototype.termBuffer(), offset, length, newStart, newStart+length);
|
||||
t.setPositionIncrement(0);
|
||||
return t;
|
||||
}
|
||||
|
||||
protected void decompose(final Token token) {
|
||||
// In any case we give the original token back
|
||||
tokens.add(token);
|
||||
tokens.add((Token) token.clone());
|
||||
|
||||
// Only words longer than minWordSize get processed
|
||||
if (token.termLength() < this.minWordSize) {
|
||||
|
|
|
@ -37,7 +37,6 @@ public final class GermanStemFilter extends TokenFilter
|
|||
/**
|
||||
* The actual token in the input stream.
|
||||
*/
|
||||
private Token token = null;
|
||||
private GermanStemmer stemmer = null;
|
||||
private Set exclusionSet = null;
|
||||
|
||||
|
@ -48,7 +47,7 @@ public final class GermanStemFilter extends TokenFilter
|
|||
}
|
||||
|
||||
/**
|
||||
* Builds a GermanStemFilter that uses an exclusiontable.
|
||||
* Builds a GermanStemFilter that uses an exclusion table.
|
||||
*/
|
||||
public GermanStemFilter( TokenStream in, Set exclusionSet )
|
||||
{
|
||||
|
@ -59,25 +58,24 @@ public final class GermanStemFilter extends TokenFilter
|
|||
/**
|
||||
* @return Returns the next token in the stream, or null at EOS
|
||||
*/
|
||||
public final Token next()
|
||||
public final Token next(final Token reusableToken)
|
||||
throws IOException
|
||||
{
|
||||
if ( ( token = input.next() ) == null ) {
|
||||
assert reusableToken != null;
|
||||
Token nextToken = input.next(reusableToken);
|
||||
|
||||
if (nextToken == null)
|
||||
return null;
|
||||
|
||||
String term = nextToken.term();
|
||||
// Check the exclusion table.
|
||||
if (exclusionSet == null || !exclusionSet.contains(term)) {
|
||||
String s = stemmer.stem(term);
|
||||
// If not stemmed, don't waste the time adjusting the token.
|
||||
if ((s != null) && !s.equals(term))
|
||||
nextToken.setTermBuffer(s);
|
||||
}
|
||||
// Check the exclusiontable
|
||||
else if ( exclusionSet != null && exclusionSet.contains( token.termText() ) ) {
|
||||
return token;
|
||||
}
|
||||
else {
|
||||
String s = stemmer.stem( token.termText() );
|
||||
// If not stemmed, dont waste the time creating a new token
|
||||
if ( !s.equals( token.termText() ) ) {
|
||||
return new Token( s, token.startOffset(),
|
||||
token.endOffset(), token.type() );
|
||||
}
|
||||
return token;
|
||||
}
|
||||
return nextToken;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@@ -35,25 +35,20 @@ public final class GreekLowerCaseFilter extends TokenFilter
    this.charset = charset;
  }

  public final Token next() throws java.io.IOException
  public final Token next(final Token reusableToken) throws java.io.IOException
  {
    Token t = input.next();
    assert reusableToken != null;
    Token nextToken = input.next(reusableToken);

    if (t == null)
    if (nextToken == null)
      return null;

    String txt = t.termText();

    char[] chArray = txt.toCharArray();
    for (int i = 0; i < chArray.length; i++)
    char[] chArray = nextToken.termBuffer();
    int chLen = nextToken.termLength();
    for (int i = 0; i < chLen; i++)
    {
      chArray[i] = GreekCharsets.toLowerCase(chArray[i], charset);
    }

    String newTxt = new String(chArray);
    // create new token
    Token newToken = new Token(newTxt, t.startOffset(), t.endOffset());

    return newToken;
    return nextToken;
  }
}
@ -38,7 +38,7 @@ import org.apache.lucene.analysis.TokenFilter;
|
|||
public class ElisionFilter extends TokenFilter {
|
||||
private Set articles = null;
|
||||
|
||||
private static String apostrophes = "'’";
|
||||
private static char[] apostrophes = {'\'', '’'};
|
||||
|
||||
public void setArticles(Set articles) {
|
||||
this.articles = new HashSet();
|
||||
|
@ -74,25 +74,36 @@ public class ElisionFilter extends TokenFilter {
|
|||
}
|
||||
|
||||
/**
|
||||
* Returns the next input Token whith termText() without elisioned start
|
||||
* Returns the next input Token with term() without elisioned start
|
||||
*/
|
||||
public Token next() throws IOException {
|
||||
Token t = input.next();
|
||||
if (t == null)
|
||||
public Token next(final Token reusableToken) throws IOException {
|
||||
assert reusableToken != null;
|
||||
Token nextToken = input.next(reusableToken);
|
||||
if (nextToken == null)
|
||||
return null;
|
||||
String text = t.termText();
|
||||
System.out.println(text);
|
||||
int minPoz = -1;
|
||||
int poz;
|
||||
for (int i = 0; i < apostrophes.length(); i++) {
|
||||
poz = text.indexOf(apostrophes.charAt(i));
|
||||
if (poz != -1)
|
||||
minPoz = (minPoz == -1) ? poz : Math.min(poz, minPoz);
|
||||
|
||||
char[] termBuffer = nextToken.termBuffer();
|
||||
int termLength = nextToken.termLength();
|
||||
|
||||
int minPoz = Integer.MAX_VALUE;
|
||||
for (int i = 0; i < apostrophes.length; i++) {
|
||||
char apos = apostrophes[i];
|
||||
// The equivalent of String.indexOf(ch)
|
||||
for (int poz = 0; poz < termLength ; poz++) {
|
||||
if (termBuffer[poz] == apos) {
|
||||
minPoz = Math.min(poz, minPoz);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (minPoz != -1
|
||||
&& articles.contains(text.substring(0, minPoz).toLowerCase()))
|
||||
text = text.substring(minPoz + 1);
|
||||
return new Token(text, t.startOffset(), t.endOffset(), t.type());
|
||||
|
||||
// An apostrophe has been found. If the prefix is an article strip it off.
|
||||
if (minPoz != Integer.MAX_VALUE
|
||||
&& articles.contains(new String(nextToken.termBuffer(), 0, minPoz).toLowerCase())) {
|
||||
nextToken.setTermBuffer(nextToken.termBuffer(), minPoz + 1, nextToken.termLength() - (minPoz + 1));
|
||||
}
|
||||
|
||||
return nextToken;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -37,12 +37,11 @@ public final class FrenchStemFilter extends TokenFilter {
|
|||
/**
|
||||
* The actual token in the input stream.
|
||||
*/
|
||||
private Token token = null;
|
||||
private FrenchStemmer stemmer = null;
|
||||
private Set exclusions = null;
|
||||
|
||||
public FrenchStemFilter( TokenStream in ) {
|
||||
super(in);
|
||||
super(in);
|
||||
stemmer = new FrenchStemmer();
|
||||
}
|
||||
|
||||
|
@ -55,23 +54,23 @@ public final class FrenchStemFilter extends TokenFilter {
|
|||
/**
|
||||
* @return Returns the next token in the stream, or null at EOS
|
||||
*/
|
||||
public final Token next()
|
||||
public final Token next(final Token reusableToken)
|
||||
throws IOException {
|
||||
if ( ( token = input.next() ) == null ) {
|
||||
assert reusableToken != null;
|
||||
Token nextToken = input.next(reusableToken);
|
||||
if (nextToken == null)
|
||||
return null;
|
||||
|
||||
String term = nextToken.term();
|
||||
|
||||
// Check the exclusion table
|
||||
if ( exclusions == null || !exclusions.contains( term ) ) {
|
||||
String s = stemmer.stem( term );
|
||||
// If not stemmed, don't waste the time adjusting the token.
|
||||
if ((s != null) && !s.equals( term ) )
|
||||
nextToken.setTermBuffer(s);
|
||||
}
|
||||
// Check the exclusiontable
|
||||
else if ( exclusions != null && exclusions.contains( token.termText() ) ) {
|
||||
return token;
|
||||
}
|
||||
else {
|
||||
String s = stemmer.stem( token.termText() );
|
||||
// If not stemmed, dont waste the time creating a new token
|
||||
if ( !s.equals( token.termText() ) ) {
|
||||
return new Token( s, token.startOffset(), token.endOffset(), token.type());
|
||||
}
|
||||
return token;
|
||||
}
|
||||
return nextToken;
|
||||
}
|
||||
/**
|
||||
* Set a alternative/custom FrenchStemmer for this filter.
|
||||
|
|
|
@@ -27,18 +27,8 @@ import java.io.IOException;
 */
public class EmptyTokenStream extends TokenStream {

  public Token next() throws IOException {
  public Token next(final Token reusableToken) throws IOException {
    assert reusableToken != null;
    return null;
  }

  public Token next(Token result) throws IOException {
    return null;
  }

  public void reset() throws IOException {
  }

  public void close() throws IOException {
  }

}
@@ -55,8 +55,9 @@ public class PrefixAndSuffixAwareTokenFilter extends TokenStream {
  }


  public Token next(Token result) throws IOException {
    return suffix.next(result);
  public Token next(final Token reusableToken) throws IOException {
    assert reusableToken != null;
    return suffix.next(reusableToken);
  }

@ -41,30 +41,34 @@ public class PrefixAwareTokenFilter extends TokenStream {
|
|||
prefixExhausted = false;
|
||||
}
|
||||
|
||||
private CopyableToken previousPrefixToken = new CopyableToken();
|
||||
private Token previousPrefixToken = new Token();
|
||||
|
||||
private boolean prefixExhausted;
|
||||
|
||||
public Token next(Token result) throws IOException {
|
||||
|
||||
Token buf = result;
|
||||
public Token next(final Token reusableToken) throws IOException {
|
||||
assert reusableToken != null;
|
||||
|
||||
if (!prefixExhausted) {
|
||||
result = prefix.next(result);
|
||||
if (result == null) {
|
||||
Token nextToken = prefix.next(reusableToken);
|
||||
if (nextToken == null) {
|
||||
prefixExhausted = true;
|
||||
} else {
|
||||
previousPrefixToken.copyFrom(result);
|
||||
return result;
|
||||
previousPrefixToken.reinit(nextToken);
|
||||
// Make it a deep copy
|
||||
Payload p = previousPrefixToken.getPayload();
|
||||
if (p != null) {
|
||||
previousPrefixToken.setPayload((Payload) p.clone());
|
||||
}
|
||||
return nextToken;
|
||||
}
|
||||
}
|
||||
|
||||
result = suffix.next(buf);
|
||||
if (result == null) {
|
||||
Token nextToken = suffix.next(reusableToken);
|
||||
if (nextToken == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return updateSuffixToken(result, previousPrefixToken);
|
||||
return updateSuffixToken(nextToken, previousPrefixToken);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -98,7 +102,6 @@ public class PrefixAwareTokenFilter extends TokenStream {
|
|||
|
||||
}
|
||||
|
||||
|
||||
public TokenStream getPrefix() {
|
||||
return prefix;
|
||||
}
|
||||
|
@ -114,35 +117,4 @@ public class PrefixAwareTokenFilter extends TokenStream {
|
|||
public void setSuffix(TokenStream suffix) {
|
||||
this.suffix = suffix;
|
||||
}
|
||||
|
||||
|
||||
public static class CopyableToken extends Token {
|
||||
|
||||
private Payload buf = new Payload();
|
||||
|
||||
public void copyFrom(Token source) {
|
||||
if (source.termBuffer() != null) {
|
||||
setTermBuffer(source.termBuffer(), 0, source.termLength());
|
||||
} else {
|
||||
setTermText(null);
|
||||
setTermLength(0);
|
||||
}
|
||||
|
||||
setPositionIncrement(source.getPositionIncrement());
|
||||
setFlags(source.getFlags());
|
||||
setStartOffset(source.startOffset());
|
||||
setEndOffset(source.endOffset());
|
||||
setType(source.type());
|
||||
if (source.getPayload() == null) {
|
||||
setPayload(null);
|
||||
} else {
|
||||
setPayload(buf);
|
||||
if (buf.getData() == null || buf.getData().length < source.getPayload().length()) {
|
||||
buf.setData(new byte[source.getPayload().length()]);
|
||||
}
|
||||
source.getPayload().copyTo(buf.getData(), 0);
|
||||
buf.setData(buf.getData(), 0, source.getPayload().length());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -28,20 +28,23 @@ import java.io.IOException;
|
|||
public class SingleTokenTokenStream extends TokenStream {
|
||||
|
||||
private boolean exhausted = false;
|
||||
// The token needs to be immutable, so work with clones!
|
||||
private Token token;
|
||||
|
||||
|
||||
public SingleTokenTokenStream(Token token) {
|
||||
this.token = token;
|
||||
assert token != null;
|
||||
this.token = (Token) token.clone();
|
||||
}
|
||||
|
||||
|
||||
public Token next(Token result) throws IOException {
|
||||
public Token next(final Token reusableToken) throws IOException {
|
||||
assert reusableToken != null;
|
||||
if (exhausted) {
|
||||
return null;
|
||||
}
|
||||
exhausted = true;
|
||||
return token;
|
||||
return (Token) token.clone();
|
||||
}
|
||||
|
||||
|
||||
|
@ -50,10 +53,10 @@ public class SingleTokenTokenStream extends TokenStream {
|
|||
}
|
||||
|
||||
public Token getToken() {
|
||||
return token;
|
||||
return (Token) token.clone();
|
||||
}
|
||||
|
||||
public void setToken(Token token) {
|
||||
this.token = token;
|
||||
this.token = (Token) token.clone();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -115,30 +115,30 @@ public class EdgeNGramTokenFilter extends TokenFilter {
|
|||
}
|
||||
|
||||
/** Returns the next token in the stream, or null at EOS. */
|
||||
public final Token next() throws IOException {
|
||||
public final Token next(final Token reusableToken) throws IOException {
|
||||
assert reusableToken != null;
|
||||
if (ngrams.size() > 0) {
|
||||
return (Token) ngrams.removeFirst();
|
||||
}
|
||||
|
||||
Token token = input.next();
|
||||
if (token == null) {
|
||||
Token nextToken = input.next(reusableToken);
|
||||
if (nextToken == null)
|
||||
return null;
|
||||
}
|
||||
|
||||
ngram(token);
|
||||
ngram(nextToken);
|
||||
if (ngrams.size() > 0)
|
||||
return (Token) ngrams.removeFirst();
|
||||
else
|
||||
return null;
|
||||
}
|
||||
|
||||
private void ngram(Token token) {
|
||||
String inStr = token.termText();
|
||||
int inLen = inStr.length();
|
||||
private void ngram(final Token token) {
|
||||
int termLength = token.termLength();
|
||||
char[] termBuffer = token.termBuffer();
|
||||
int gramSize = minGram;
|
||||
while (gramSize <= maxGram) {
|
||||
// if the remaining input is too short, we can't generate any n-grams
|
||||
if (gramSize > inLen) {
|
||||
if (gramSize > termLength) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -147,13 +147,13 @@ public class EdgeNGramTokenFilter extends TokenFilter {
|
|||
return;
|
||||
}
|
||||
|
||||
Token tok;
|
||||
if (side == Side.FRONT) {
|
||||
tok = new Token(inStr.substring(0, gramSize), 0, gramSize);
|
||||
}
|
||||
else {
|
||||
tok = new Token(inStr.substring(inLen-gramSize), inLen-gramSize, inLen);
|
||||
}
|
||||
// grab gramSize chars from front or back
|
||||
int start = side == Side.FRONT ? 0 : termLength - gramSize;
|
||||
int end = start + gramSize;
|
||||
Token tok = (Token) token.clone();
|
||||
tok.setStartOffset(start);
|
||||
tok.setEndOffset(end);
|
||||
tok.setTermBuffer(termBuffer, start, gramSize);
|
||||
ngrams.add(tok);
|
||||
gramSize++;
|
||||
}
|
||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.lucene.analysis.ngram;
|
|||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter.Side;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
|
@ -113,13 +114,14 @@ public class EdgeNGramTokenizer extends Tokenizer {
|
|||
}
|
||||
|
||||
/** Returns the next token in the stream, or null at EOS. */
|
||||
public final Token next() throws IOException {
|
||||
public final Token next(final Token reusableToken) throws IOException {
|
||||
assert reusableToken != null;
|
||||
// if we are just starting, read the whole input
|
||||
if (!started) {
|
||||
started = true;
|
||||
char[] chars = new char[1024];
|
||||
input.read(chars);
|
||||
inStr = new String(chars).trim(); // remove any trailing empty strings
|
||||
inStr = new String(chars).trim(); // remove any leading or trailing spaces
|
||||
inLen = inStr.length();
|
||||
gramSize = minGram;
|
||||
}
|
||||
|
@ -134,15 +136,13 @@ public class EdgeNGramTokenizer extends Tokenizer {
|
|||
return null;
|
||||
}
|
||||
|
||||
Token tok;
|
||||
if (side == Side.FRONT) {
|
||||
tok = new Token(inStr.substring(0, gramSize), 0, gramSize);
|
||||
}
|
||||
else {
|
||||
tok = new Token(inStr.substring(inLen-gramSize), inLen-gramSize, inLen);
|
||||
}
|
||||
|
||||
// grab gramSize chars from front or back
|
||||
int start = side == Side.FRONT ? 0 : inLen - gramSize;
|
||||
int end = start + gramSize;
|
||||
reusableToken.setTermBuffer(inStr, start, gramSize);
|
||||
reusableToken.setStartOffset(start);
|
||||
reusableToken.setEndOffset(end);
|
||||
gramSize++;
|
||||
return tok;
|
||||
return reusableToken;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -63,17 +63,17 @@ public class NGramTokenFilter extends TokenFilter {
|
|||
}
|
||||
|
||||
/** Returns the next token in the stream, or null at EOS. */
|
||||
public final Token next() throws IOException {
|
||||
public final Token next(final Token reusableToken) throws IOException {
|
||||
assert reusableToken != null;
|
||||
if (ngrams.size() > 0) {
|
||||
return (Token) ngrams.removeFirst();
|
||||
}
|
||||
|
||||
Token token = input.next();
|
||||
if (token == null) {
|
||||
Token nextToken = input.next(reusableToken);
|
||||
if (nextToken == null)
|
||||
return null;
|
||||
}
|
||||
|
||||
ngram(token);
|
||||
ngram(nextToken);
|
||||
if (ngrams.size() > 0)
|
||||
return (Token) ngrams.removeFirst();
|
||||
else
|
||||
|
@ -81,16 +81,13 @@ public class NGramTokenFilter extends TokenFilter {
|
|||
}
|
||||
|
||||
private void ngram(Token token) {
|
||||
String inStr = token.termText();
|
||||
int inLen = inStr.length();
|
||||
char[] termBuffer = token.termBuffer();
|
||||
int termLength = token.termLength();
|
||||
int gramSize = minGram;
|
||||
while (gramSize <= maxGram) {
|
||||
int pos = 0; // reset to beginning of string
|
||||
while (pos+gramSize <= inLen) { // while there is input
|
||||
String gram = inStr.substring(pos, pos+gramSize);
|
||||
Token tok = new Token(gram, pos, pos+gramSize);
|
||||
// tok.setPositionIncrement(pos);
|
||||
ngrams.add(tok);
|
||||
while (pos+gramSize <= termLength) { // while there is input
|
||||
ngrams.add(token.clone(termBuffer, pos, gramSize, pos, pos+gramSize));
|
||||
pos++;
|
||||
}
|
||||
gramSize++; // increase n-gram size
|
||||
|
|
|
@ -64,7 +64,8 @@ public class NGramTokenizer extends Tokenizer {
|
|||
}
|
||||
|
||||
/** Returns the next token in the stream, or null at EOS. */
|
||||
public final Token next() throws IOException {
|
||||
public final Token next(final Token reusableToken) throws IOException {
|
||||
assert reusableToken != null;
|
||||
if (!started) {
|
||||
started = true;
|
||||
gramSize = minGram;
|
||||
|
@ -82,9 +83,9 @@ public class NGramTokenizer extends Tokenizer {
|
|||
if (pos+gramSize > inLen)
|
||||
return null;
|
||||
}
|
||||
String gram = inStr.substring(pos, pos+gramSize);
|
||||
|
||||
int oldPos = pos;
|
||||
pos++;
|
||||
return new Token(gram, oldPos, oldPos+gramSize);
|
||||
return reusableToken.reinit(inStr, oldPos, gramSize, oldPos, oldPos+gramSize);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -38,7 +38,6 @@ public final class DutchStemFilter extends TokenFilter {
|
|||
/**
|
||||
* The actual token in the input stream.
|
||||
*/
|
||||
private Token token = null;
|
||||
private DutchStemmer stemmer = null;
|
||||
private Set exclusions = null;
|
||||
|
||||
|
@ -48,7 +47,7 @@ public final class DutchStemFilter extends TokenFilter {
|
|||
}
|
||||
|
||||
/**
|
||||
* Builds a DutchStemFilter that uses an exclusiontable.
|
||||
* Builds a DutchStemFilter that uses an exclusion table.
|
||||
*/
|
||||
public DutchStemFilter(TokenStream _in, Set exclusiontable) {
|
||||
this(_in);
|
||||
|
@ -66,23 +65,22 @@ public final class DutchStemFilter extends TokenFilter {
|
|||
/**
|
||||
* @return Returns the next token in the stream, or null at EOS
|
||||
*/
|
||||
public Token next() throws IOException {
|
||||
if ((token = input.next()) == null) {
|
||||
public Token next(Token reusableToken) throws IOException {
|
||||
assert reusableToken != null;
|
||||
Token nextToken = input.next(reusableToken);
|
||||
if (nextToken == null)
|
||||
return null;
|
||||
}
|
||||
|
||||
// Check the exclusiontable
|
||||
else if (exclusions != null && exclusions.contains(token.termText())) {
|
||||
return token;
|
||||
} else {
|
||||
String s = stemmer.stem(token.termText());
|
||||
// If not stemmed, dont waste the time creating a new token
|
||||
if (!s.equals(token.termText())) {
|
||||
return new Token(s, token.startOffset(),
|
||||
token.endOffset(), token.type());
|
||||
}
|
||||
return token;
|
||||
String term = nextToken.term();
|
||||
|
||||
// Check the exclusion table.
|
||||
if (exclusions == null || !exclusions.contains(term)) {
|
||||
String s = stemmer.stem(term);
|
||||
// If not stemmed, don't waste the time adjusting the token.
|
||||
if ((s != null) && !s.equals(term))
|
||||
nextToken.setTermBuffer(s);
|
||||
}
|
||||
return nextToken;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@@ -41,11 +41,12 @@ public class NumericPayloadTokenFilter extends TokenFilter {
    this.typeMatch = typeMatch;
  }

  public Token next(Token result) throws IOException {
    result = input.next(result);
    if (result != null && result.type().equals(typeMatch)){
      result.setPayload(thePayload);
  public Token next(final Token reusableToken) throws IOException {
    assert reusableToken != null;
    Token nextToken = input.next(reusableToken);
    if (nextToken != null && nextToken.type().equals(typeMatch)){
      nextToken.setPayload(thePayload);
    }
    return result;
    return nextToken;
  }
}
@@ -38,15 +38,16 @@ public class TokenOffsetPayloadTokenFilter extends TokenFilter {
    super(input);
  }

  public Token next(Token result) throws IOException {
    result = input.next(result);
    if (result != null){
  public Token next(final Token reusableToken) throws IOException {
    assert reusableToken != null;
    Token nextToken = input.next(reusableToken);
    if (nextToken != null){
      byte[] data = new byte[8];
      PayloadHelper.encodeInt(result.startOffset(), data, 0);
      PayloadHelper.encodeInt(result.endOffset(), data, 4);
      PayloadHelper.encodeInt(nextToken.startOffset(), data, 0);
      PayloadHelper.encodeInt(nextToken.endOffset(), data, 4);
      Payload payload = new Payload(data);
      result.setPayload(payload);
      nextToken.setPayload(payload);
    }
    return result;
    return nextToken;
  }
}
@@ -39,11 +39,12 @@ public class TypeAsPayloadTokenFilter extends TokenFilter {
  }


  public Token next(Token result) throws IOException {
    result = input.next(result);
    if (result != null && result.type() != null && result.type().equals("") == false){
      result.setPayload(new Payload(result.type().getBytes("UTF-8")));
  public Token next(final Token reusableToken) throws IOException {
    assert reusableToken != null;
    Token nextToken = input.next(reusableToken);
    if (nextToken != null && nextToken.type() != null && nextToken.type().equals("") == false){
      nextToken.setPayload(new Payload(nextToken.type().getBytes("UTF-8")));
    }
    return result;
    return nextToken;
  }
}
@@ -37,25 +37,20 @@ public final class RussianLowerCaseFilter extends TokenFilter
    this.charset = charset;
  }

  public final Token next() throws java.io.IOException
  public final Token next(final Token reusableToken) throws java.io.IOException
  {
    Token t = input.next();
    assert reusableToken != null;
    Token nextToken = input.next(reusableToken);

    if (t == null)
    if (nextToken == null)
      return null;

    String txt = t.termText();

    char[] chArray = txt.toCharArray();
    for (int i = 0; i < chArray.length; i++)
    char[] chArray = nextToken.termBuffer();
    int chLen = nextToken.termLength();
    for (int i = 0; i < chLen; i++)
    {
      chArray[i] = RussianCharsets.toLowerCase(chArray[i], charset);
    }

    String newTxt = new String(chArray);
    // create new token
    Token newToken = new Token(newTxt, t.startOffset(), t.endOffset());

    return newToken;
    return nextToken;
  }
}
@ -35,7 +35,6 @@ public final class RussianStemFilter extends TokenFilter
|
|||
/**
|
||||
* The actual token in the input stream.
|
||||
*/
|
||||
private Token token = null;
|
||||
private RussianStemmer stemmer = null;
|
||||
|
||||
public RussianStemFilter(TokenStream in, char[] charset)
|
||||
|
@ -47,22 +46,18 @@ public final class RussianStemFilter extends TokenFilter
|
|||
/**
|
||||
* @return Returns the next token in the stream, or null at EOS
|
||||
*/
|
||||
public final Token next() throws IOException
|
||||
public final Token next(final Token reusableToken) throws IOException
|
||||
{
|
||||
if ((token = input.next()) == null)
|
||||
{
|
||||
assert reusableToken != null;
|
||||
Token nextToken = input.next(reusableToken);
|
||||
if (nextToken == null)
|
||||
return null;
|
||||
}
|
||||
else
|
||||
{
|
||||
String s = stemmer.stem(token.termText());
|
||||
if (!s.equals(token.termText()))
|
||||
{
|
||||
return new Token(s, token.startOffset(), token.endOffset(),
|
||||
token.type());
|
||||
}
|
||||
return token;
|
||||
}
|
||||
|
||||
String term = nextToken.term();
|
||||
String s = stemmer.stem(term);
|
||||
if (s != null && !s.equals(term))
|
||||
nextToken.setTermBuffer(s);
|
||||
return nextToken;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -47,7 +47,7 @@ public class ShingleFilter extends TokenFilter {
|
|||
/**
|
||||
* filler token for when positionIncrement is more than 1
|
||||
*/
|
||||
public static final String FILLER_TOKEN = "_";
|
||||
public static final char[] FILLER_TOKEN = { '_' };
|
||||
|
||||
|
||||
/**
|
||||
|
@ -150,11 +150,12 @@ public class ShingleFilter extends TokenFilter {
|
|||
}
|
||||
|
||||
/* (non-Javadoc)
|
||||
* @see org.apache.lucene.analysis.TokenStream#next()
|
||||
*/
|
||||
public Token next() throws IOException {
|
||||
* @see org.apache.lucene.analysis.TokenStream#next()
|
||||
*/
|
||||
public Token next(final Token reusableToken) throws IOException {
|
||||
assert reusableToken != null;
|
||||
if (outputBuf.isEmpty()) {
|
||||
fillOutputBuf();
|
||||
fillOutputBuf(reusableToken);
|
||||
}
|
||||
Token nextToken = null;
|
||||
if ( ! outputBuf.isEmpty())
|
||||
|
@ -173,16 +174,19 @@ public class ShingleFilter extends TokenFilter {
|
|||
* @return the next token, or null if at end of input stream
|
||||
* @throws IOException if the input stream has a problem
|
||||
*/
|
||||
private Token getNextToken() throws IOException {
|
||||
private Token getNextToken(final Token reusableToken) throws IOException {
|
||||
if (tokenBuf.isEmpty()) {
|
||||
Token lastToken = input.next();
|
||||
if (lastToken != null) {
|
||||
for (int i = 1; i < lastToken.getPositionIncrement(); i++) {
|
||||
tokenBuf.add(new Token(FILLER_TOKEN, lastToken.startOffset(),
|
||||
lastToken.startOffset()));
|
||||
Token nextToken = input.next(reusableToken);
|
||||
if (nextToken != null) {
|
||||
for (int i = 1; i < nextToken.getPositionIncrement(); i++) {
|
||||
Token fillerToken = (Token) nextToken.clone();
|
||||
// A filler token occupies no space
|
||||
fillerToken.setEndOffset(fillerToken.startOffset());
|
||||
fillerToken.setTermBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
|
||||
tokenBuf.add(fillerToken);
|
||||
}
|
||||
tokenBuf.add(lastToken);
|
||||
return getNextToken();
|
||||
tokenBuf.add(nextToken.clone());
|
||||
return getNextToken(nextToken);
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
|
@ -196,15 +200,15 @@ public class ShingleFilter extends TokenFilter {
|
|||
*
|
||||
* @throws IOException if there's a problem getting the next token
|
||||
*/
|
||||
private void fillOutputBuf() throws IOException {
|
||||
private void fillOutputBuf(Token token) throws IOException {
|
||||
boolean addedToken = false;
|
||||
/*
|
||||
* Try to fill the shingle buffer.
|
||||
*/
|
||||
do {
|
||||
Token token = getNextToken();
|
||||
token = getNextToken(token);
|
||||
if (token != null) {
|
||||
shingleBuf.add(token);
|
||||
shingleBuf.add(token.clone());
|
||||
if (shingleBuf.size() > maxShingleSize)
|
||||
{
|
||||
shingleBuf.remove(0);
|
||||
|
@ -235,17 +239,17 @@ public class ShingleFilter extends TokenFilter {
|
|||
}
|
||||
|
||||
int i = 0;
|
||||
Token token = null;
|
||||
Token shingle = null;
|
||||
for (Iterator it = shingleBuf.iterator(); it.hasNext(); ) {
|
||||
token = (Token) it.next();
|
||||
shingle = (Token) it.next();
|
||||
for (int j = i; j < shingles.length; j++) {
|
||||
if (shingles[j].length() != 0) {
|
||||
shingles[j].append(TOKEN_SEPARATOR);
|
||||
}
|
||||
shingles[j].append(token.termBuffer(), 0, token.termLength());
|
||||
shingles[j].append(shingle.termBuffer(), 0, shingle.termLength());
|
||||
}
|
||||
|
||||
endOffsets[i] = token.endOffset();
|
||||
endOffsets[i] = shingle.endOffset();
|
||||
i++;
|
||||
}
|
||||
|
||||
|
@ -258,17 +262,26 @@ public class ShingleFilter extends TokenFilter {
|
|||
/*
|
||||
* Push new tokens to the output buffer.
|
||||
*/
|
||||
if (!shingleBuf.isEmpty()) {
|
||||
Token firstShingle = (Token) shingleBuf.get(0);
|
||||
shingle = (Token) firstShingle.clone();
|
||||
shingle.setType(tokenType);
|
||||
}
|
||||
for (int j = 1; j < shingleBuf.size(); j++) {
|
||||
Token shingle = new Token(shingles[j].toString(),
|
||||
((Token) shingleBuf.get(0)).startOffset(),
|
||||
endOffsets[j],
|
||||
tokenType);
|
||||
shingle.setEndOffset(endOffsets[j]);
|
||||
StringBuffer buf = shingles[j];
|
||||
int termLength = buf.length();
|
||||
char[] termBuffer = shingle.termBuffer();
|
||||
if (termBuffer.length < termLength)
|
||||
termBuffer = shingle.resizeTermBuffer(termLength);
|
||||
buf.getChars(0, termLength, termBuffer, 0);
|
||||
shingle.setTermLength(termLength);
|
||||
if ((! outputUnigrams) && j == 1) {
|
||||
shingle.setPositionIncrement(1);
|
||||
} else {
|
||||
shingle.setPositionIncrement(0);
|
||||
}
|
||||
outputBuf.add(shingle);
|
||||
outputBuf.add(shingle.clone());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,15 +17,22 @@ package org.apache.lucene.analysis.shingle;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.miscellaneous.EmptyTokenStream;
|
||||
import org.apache.lucene.analysis.payloads.PayloadHelper;
|
||||
import org.apache.lucene.index.Payload;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
|
||||
|
||||
/**
|
||||
* <p>A ShingleFilter constructs shingles (token n-grams) from a token stream.
|
||||
|
@ -298,7 +305,8 @@ public class ShingleMatrixFilter extends TokenStream {
|
|||
|
||||
private Matrix matrix;
|
||||
|
||||
public Token next(Token token) throws IOException {
|
||||
public Token next(final Token reusableToken) throws IOException {
|
||||
assert reusableToken != null;
|
||||
if (matrix == null) {
|
||||
matrix = new Matrix();
|
||||
// fill matrix with maximumShingleSize columns
|
||||
|
@ -318,7 +326,7 @@ public class ShingleMatrixFilter extends TokenStream {
|
|||
if (ignoringSinglePrefixOrSuffixShingle
|
||||
&& currentShingleLength == 1
|
||||
&& (currentPermutationRows.get(currentPermutationTokensStartOffset).getColumn().isFirst() || currentPermutationRows.get(currentPermutationTokensStartOffset).getColumn().isLast())) {
|
||||
return next(token);
|
||||
return next(reusableToken);
|
||||
}
|
||||
|
||||
int termLength = 0;
|
||||
|
@ -336,21 +344,21 @@ public class ShingleMatrixFilter extends TokenStream {
|
|||
|
||||
// only produce shingles that not already has been created
|
||||
if (!shinglesSeen.add(shingle)) {
|
||||
return next(token);
|
||||
return next(reusableToken);
|
||||
}
|
||||
|
||||
// shingle token factory
|
||||
StringBuilder sb = new StringBuilder(termLength + 10); // paranormal abillity to forsay the future.
|
||||
StringBuilder sb = new StringBuilder(termLength + 10); // paranormal ability to foresee the future.
|
||||
for (Token shingleToken : shingle) {
|
||||
if (spacerCharacter != null && sb.length() > 0) {
|
||||
sb.append(spacerCharacter);
|
||||
}
|
||||
sb.append(shingleToken.termBuffer(), 0, shingleToken.termLength());
|
||||
}
|
||||
token.setTermText(sb.toString());
|
||||
updateToken(token, shingle, currentPermutationTokensStartOffset, currentPermutationRows, currentPermuationTokens);
|
||||
reusableToken.setTermBuffer(sb.toString());
|
||||
updateToken(reusableToken, shingle, currentPermutationTokensStartOffset, currentPermutationRows, currentPermuationTokens);
|
||||
|
||||
return token;
|
||||
return reusableToken;
|
||||
|
||||
} else {
|
||||
|
||||
|
@ -360,7 +368,7 @@ public class ShingleMatrixFilter extends TokenStream {
|
|||
// reset shingle size and move one step to the right in the current tokens permutation
|
||||
currentPermutationTokensStartOffset++;
|
||||
currentShingleLength = minimumShingleSize - 1;
|
||||
return next(token);
|
||||
return next(reusableToken);
|
||||
}
|
||||
|
||||
|
||||
|
@ -411,7 +419,7 @@ public class ShingleMatrixFilter extends TokenStream {
|
|||
}
|
||||
|
||||
nextTokensPermutation();
|
||||
return next(token);
|
||||
return next(reusableToken);
|
||||
|
||||
}
|
||||
}
|
||||
|
@ -426,7 +434,7 @@ public class ShingleMatrixFilter extends TokenStream {
|
|||
|
||||
nextTokensPermutation();
|
||||
|
||||
return next(token);
|
||||
return next(reusableToken);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@@ -73,10 +73,10 @@ public class DateRecognizerSinkTokenizer extends SinkTokenizer {
    //Check to see if this token is a date
    if (t != null) {
      try {
        Date date = dateFormat.parse(new String(t.termBuffer(), 0, t.termLength()));//We don't care about the date, just that we can parse it as a date
        Date date = dateFormat.parse(t.term());//We don't care about the date, just that we can parse it as a date
        if (date != null) {
          t.setType(DATE_TYPE);
          lst.add(t.clone());
          super.add(t);
        }
      } catch (ParseException e) {

@@ -48,7 +48,7 @@ public class TokenTypeSinkTokenizer extends SinkTokenizer {
  public void add(Token t) {
    //check to see if this is a Category
    if (t != null && typeToMatch.equals(t.type())){
      lst.add(t.clone());
      super.add(t);
    }
  }
}
@ -40,31 +40,38 @@ public class ThaiWordFilter extends TokenFilter {
|
|||
breaker = BreakIterator.getWordInstance(new Locale("th"));
|
||||
}
|
||||
|
||||
public Token next() throws IOException {
|
||||
public Token next(final Token reusableToken) throws IOException {
|
||||
assert reusableToken != null;
|
||||
if (thaiToken != null) {
|
||||
String text = thaiToken.termText();
|
||||
int start = breaker.current();
|
||||
int end = breaker.next();
|
||||
if (end != BreakIterator.DONE) {
|
||||
return new Token(text.substring(start, end),
|
||||
thaiToken.startOffset()+start, thaiToken.startOffset()+end, thaiToken.type());
|
||||
reusableToken.reinit(thaiToken, thaiToken.termBuffer(), start, end - start);
|
||||
reusableToken.setStartOffset(thaiToken.startOffset()+start);
|
||||
reusableToken.setEndOffset(thaiToken.endOffset()+end);
|
||||
return reusableToken;
|
||||
}
|
||||
thaiToken = null;
|
||||
}
|
||||
Token tk = input.next();
|
||||
if (tk == null) {
|
||||
|
||||
Token nextToken = input.next(reusableToken);
|
||||
if (nextToken == null || nextToken.termLength() == 0) {
|
||||
return null;
|
||||
}
|
||||
String text = tk.termText();
|
||||
|
||||
String text = nextToken.term();
|
||||
if (UnicodeBlock.of(text.charAt(0)) != UnicodeBlock.THAI) {
|
||||
return new Token(text.toLowerCase(), tk.startOffset(), tk.endOffset(), tk.type());
|
||||
nextToken.setTermBuffer(text.toLowerCase());
|
||||
return nextToken;
|
||||
}
|
||||
thaiToken = tk;
|
||||
|
||||
thaiToken = (Token) nextToken.clone();
|
||||
breaker.setText(text);
|
||||
int end = breaker.next();
|
||||
if (end != BreakIterator.DONE) {
|
||||
return new Token(text.substring(0, end),
|
||||
thaiToken.startOffset(), thaiToken.startOffset()+end, thaiToken.type());
|
||||
nextToken.setTermBuffer(text, 0, end);
|
||||
nextToken.setEndOffset(nextToken.startOffset() + end);
|
||||
return nextToken;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
|
|
@ -33,14 +33,13 @@ public class TestChineseTokenizer extends TestCase
|
|||
{
|
||||
String s = "a天b";
|
||||
ChineseTokenizer tokenizer = new ChineseTokenizer(new StringReader(s));
|
||||
Token token;
|
||||
|
||||
int correctStartOffset = 0;
|
||||
int correctEndOffset = 1;
|
||||
while ((token = tokenizer.next()) != null)
|
||||
{
|
||||
assertEquals(correctStartOffset, token.startOffset());
|
||||
assertEquals(correctEndOffset, token.endOffset());
|
||||
final Token reusableToken = new Token();
|
||||
for (Token nextToken = tokenizer.next(reusableToken); nextToken != null; nextToken = tokenizer.next(reusableToken)) {
|
||||
assertEquals(correctStartOffset, nextToken.startOffset());
|
||||
assertEquals(correctEndOffset, nextToken.endOffset());
|
||||
correctStartOffset++;
|
||||
correctEndOffset++;
|
||||
}
|
||||
|
|
|
@ -153,15 +153,16 @@ public class TestCompoundWordTokenFilter extends TestCase {
|
|||
|
||||
private void assertFiltersTo(TokenFilter tf, String[] s, int[] startOffset,
|
||||
int[] endOffset, int[] posIncr) throws Exception {
|
||||
final Token reusableToken = new Token();
|
||||
for (int i = 0; i < s.length; ++i) {
|
||||
Token t = tf.next();
|
||||
assertNotNull(t);
|
||||
assertEquals(s[i], new String(t.termBuffer(), 0, t.termLength()));
|
||||
assertEquals(startOffset[i], t.startOffset());
|
||||
assertEquals(endOffset[i], t.endOffset());
|
||||
assertEquals(posIncr[i], t.getPositionIncrement());
|
||||
Token nextToken = tf.next(reusableToken);
|
||||
assertNotNull(nextToken);
|
||||
assertEquals(s[i], nextToken.term());
|
||||
assertEquals(startOffset[i], nextToken.startOffset());
|
||||
assertEquals(endOffset[i], nextToken.endOffset());
|
||||
assertEquals(posIncr[i], nextToken.getPositionIncrement());
|
||||
}
|
||||
assertNull(tf.next());
|
||||
assertNull(tf.next(reusableToken));
|
||||
}
|
||||
|
||||
private void getHyphenationPatternFileContents() {
|
||||
|
|
|
@@ -69,10 +69,11 @@ public class TestGermanStemFilter extends TestCase {
  private void check(final String input, final String expected) throws IOException {
    StandardTokenizer tokenStream = new StandardTokenizer(new StringReader(input));
    GermanStemFilter filter = new GermanStemFilter(tokenStream);
    Token t = filter.next();
    if (t == null)
    final Token reusableToken = new Token();
    Token nextToken = filter.next(reusableToken);
    if (nextToken == null)
      fail();
    assertEquals(expected, t.termText());
    assertEquals(expected, nextToken.term());
    filter.close();
  }

@@ -42,12 +42,13 @@ public class GreekAnalyzerTest extends TestCase {
   */
  private void assertAnalyzesTo(Analyzer a, String input, String[] output) throws Exception {
    TokenStream ts = a.tokenStream("dummy", new StringReader(input));
    final Token reusableToken = new Token();
    for (int i=0; i<output.length; i++) {
      Token t = ts.next();
      assertNotNull(t);
      assertEquals(t.termText(), output[i]);
      Token nextToken = ts.next(reusableToken);
      assertNotNull(nextToken);
      assertEquals(nextToken.term(), output[i]);
    }
    assertNull(ts.next());
    assertNull(ts.next(reusableToken));
    ts.close();
  }

@@ -53,13 +53,9 @@ public class TestElision extends TestCase {
  private List filtre(TokenFilter filter) {
    List tas = new ArrayList();
    try {
      boolean encore = true;
      Token token;
      while (encore) {
        token = filter.next();
        encore = token != null;
        if (token != null)
          tas.add(token.termText());
      final Token reusableToken = new Token();
      for (Token nextToken = filter.next(reusableToken); nextToken != null; nextToken = filter.next(reusableToken)) {
        tas.add(nextToken.term());
      }
    } catch (IOException e) {
      e.printStackTrace();

@ -77,12 +77,13 @@ public class TestFrenchAnalyzer extends TestCase {
|
|||
|
||||
TokenStream ts = a.tokenStream("dummy", new StringReader(input));
|
||||
|
||||
final Token reusableToken = new Token();
|
||||
for (int i = 0; i < output.length; i++) {
|
||||
Token t = ts.next();
|
||||
assertNotNull(t);
|
||||
assertEquals(t.termText(), output[i]);
|
||||
Token nextToken = ts.next(reusableToken);
|
||||
assertNotNull(nextToken);
|
||||
assertEquals(nextToken.term(), output[i]);
|
||||
}
|
||||
assertNull(ts.next());
|
||||
assertNull(ts.next(reusableToken));
|
||||
ts.close();
|
||||
}
|
||||
|
||||
|
|
|
@ -30,25 +30,32 @@ public class TestPrefixAndSuffixAwareTokenFilter extends TestCase {
|
|||
public void test() throws IOException {
|
||||
|
||||
PrefixAndSuffixAwareTokenFilter ts = new PrefixAndSuffixAwareTokenFilter(
|
||||
new SingleTokenTokenStream(new Token("^", 0, 0)),
|
||||
new SingleTokenTokenStream(createToken("^", 0, 0)),
|
||||
new WhitespaceTokenizer(new StringReader("hello world")),
|
||||
new SingleTokenTokenStream(new Token("$", 0, 0)));
|
||||
new SingleTokenTokenStream(createToken("$", 0, 0)));
|
||||
|
||||
assertNext(ts, "^", 0, 0);
|
||||
assertNext(ts, "hello", 0, 5);
|
||||
assertNext(ts, "world", 6, 11);
|
||||
assertNext(ts, "$", 11, 11);
|
||||
assertNull(ts.next());
|
||||
Token token = new Token();
|
||||
assertNext(ts, token, "^", 0, 0);
|
||||
assertNext(ts, token, "hello", 0, 5);
|
||||
assertNext(ts, token, "world", 6, 11);
|
||||
assertNext(ts, token, "$", 11, 11);
|
||||
assertNull(ts.next(token));
|
||||
}
|
||||
|
||||
|
||||
private Token assertNext(TokenStream ts, String text, int startOffset, int endOffset) throws IOException {
|
||||
Token token = ts.next();
|
||||
assertNotNull(token);
|
||||
assertEquals(text, new String(token.termBuffer(), 0, token.termLength()));
|
||||
assertEquals(startOffset, token.startOffset());
|
||||
assertEquals(endOffset, token.endOffset());
|
||||
private Token assertNext(TokenStream ts, final Token reusableToken, String text, int startOffset, int endOffset) throws IOException {
|
||||
Token nextToken = ts.next(reusableToken);
|
||||
assertNotNull(nextToken);
|
||||
assertEquals(text, nextToken.term());
|
||||
assertEquals(startOffset, nextToken.startOffset());
|
||||
assertEquals(endOffset, nextToken.endOffset());
|
||||
return nextToken;
|
||||
}
|
||||
|
||||
private static Token createToken(String term, int start, int offset)
|
||||
{
|
||||
Token token = new Token(start, offset);
|
||||
token.setTermBuffer(term);
|
||||
return token;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -32,33 +32,40 @@ public class TestPrefixAwareTokenFilter extends TestCase {
|
|||
PrefixAwareTokenFilter ts;
|
||||
|
||||
ts = new PrefixAwareTokenFilter(
|
||||
new SingleTokenTokenStream(new Token("a", 0, 1)),
|
||||
new SingleTokenTokenStream(new Token("b", 0, 1)));
|
||||
assertNext(ts, "a", 0, 1);
|
||||
assertNext(ts, "b", 1, 2);
|
||||
assertNull(ts.next());
|
||||
new SingleTokenTokenStream(createToken("a", 0, 1)),
|
||||
new SingleTokenTokenStream(createToken("b", 0, 1)));
|
||||
final Token reusableToken = new Token();
|
||||
assertNext(ts, reusableToken, "a", 0, 1);
|
||||
assertNext(ts, reusableToken, "b", 1, 2);
|
||||
assertNull(ts.next(reusableToken));
|
||||
|
||||
|
||||
// prefix and suffix using 2x prefix
|
||||
|
||||
ts = new PrefixAwareTokenFilter(new SingleTokenTokenStream(new Token("^", 0, 0)), new WhitespaceTokenizer(new StringReader("hello world")));
|
||||
ts = new PrefixAwareTokenFilter(ts, new SingleTokenTokenStream(new Token("$", 0, 0)));
|
||||
ts = new PrefixAwareTokenFilter(new SingleTokenTokenStream(createToken("^", 0, 0)), new WhitespaceTokenizer(new StringReader("hello world")));
|
||||
ts = new PrefixAwareTokenFilter(ts, new SingleTokenTokenStream(createToken("$", 0, 0)));
|
||||
|
||||
assertNext(ts, "^", 0, 0);
|
||||
assertNext(ts, "hello", 0, 5);
|
||||
assertNext(ts, "world", 6, 11);
|
||||
assertNext(ts, "$", 11, 11);
|
||||
assertNull(ts.next());
|
||||
assertNext(ts, reusableToken, "^", 0, 0);
|
||||
assertNext(ts, reusableToken, "hello", 0, 5);
|
||||
assertNext(ts, reusableToken, "world", 6, 11);
|
||||
assertNext(ts, reusableToken, "$", 11, 11);
|
||||
assertNull(ts.next(reusableToken));
|
||||
}
|
||||
|
||||
|
||||
private Token assertNext(TokenStream ts, String text, int startOffset, int endOffset) throws IOException {
|
||||
Token token = ts.next();
|
||||
assertNotNull(token);
|
||||
assertEquals(text, new String(token.termBuffer(), 0, token.termLength()));
|
||||
assertEquals(startOffset, token.startOffset());
|
||||
assertEquals(endOffset, token.endOffset());
|
||||
private Token assertNext(TokenStream ts, final Token reusableToken, String text, int startOffset, int endOffset) throws IOException {
|
||||
Token nextToken = ts.next(reusableToken);
|
||||
assertNotNull(nextToken);
|
||||
assertEquals(text, nextToken.term());
|
||||
assertEquals(startOffset, nextToken.startOffset());
|
||||
assertEquals(endOffset, nextToken.endOffset());
|
||||
return nextToken;
|
||||
}
|
||||
|
||||
private static Token createToken(String term, int start, int offset)
|
||||
{
|
||||
Token token = new Token(start, offset);
|
||||
token.setTermBuffer(term);
|
||||
return token;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@@ -17,23 +17,20 @@ package org.apache.lucene.analysis.miscellaneous;
 * limitations under the License.
 */

import junit.framework.TestCase;

import java.io.IOException;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.util.LuceneTestCase;

public class TestSingleTokenTokenFilter extends TestCase {
public class TestSingleTokenTokenFilter extends LuceneTestCase {

  public void test() throws IOException {

    Token token = new Token();

    SingleTokenTokenStream ts = new SingleTokenTokenStream(token);

    assertEquals(token, ts.next());
    assertNull(ts.next());

    final Token reusableToken = new Token();
    assertEquals(token, ts.next(reusableToken));
    assertNull(ts.next(reusableToken));
  }

}
@ -68,52 +68,46 @@ public class EdgeNGramTokenFilterTest extends TestCase {

public void testFrontUnigram() throws Exception {
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT, 1, 1);
Token token = null;
token = tokenizer.next();
assertEquals("(a,0,1)", token.toString());
token = tokenizer.next();
assertNull(token);
final Token reusableToken = new Token();
Token nextToken = tokenizer.next(reusableToken);
assertEquals("(a,0,1)", nextToken.toString());
assertNull(tokenizer.next(reusableToken));
}

public void testBackUnigram() throws Exception {
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.BACK, 1, 1);
Token token = null;
token = tokenizer.next();
assertEquals("(e,4,5)", token.toString());
token = tokenizer.next();
assertNull(token);
final Token reusableToken = new Token();
Token nextToken = tokenizer.next(reusableToken);
assertEquals("(e,4,5)", nextToken.toString());
assertNull(tokenizer.next(reusableToken));
}

public void testOversizedNgrams() throws Exception {
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT, 6, 6);
Token token = null;
token = tokenizer.next();
assertNull(token);
assertNull(tokenizer.next(new Token()));
}

public void testFrontRangeOfNgrams() throws Exception {
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT, 1, 3);
Token token = null;
token = tokenizer.next();
assertEquals("(a,0,1)", token.toString());
token = tokenizer.next();
assertEquals("(ab,0,2)", token.toString());
token = tokenizer.next();
assertEquals("(abc,0,3)", token.toString());
token = tokenizer.next();
assertNull(token);
final Token reusableToken = new Token();
Token nextToken = tokenizer.next(reusableToken);
assertEquals("(a,0,1)", nextToken.toString());
nextToken = tokenizer.next(reusableToken);
assertEquals("(ab,0,2)", nextToken.toString());
nextToken = tokenizer.next(reusableToken);
assertEquals("(abc,0,3)", nextToken.toString());
assertNull(tokenizer.next(reusableToken));
}

public void testBackRangeOfNgrams() throws Exception {
EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.BACK, 1, 3);
Token token = null;
token = tokenizer.next();
assertEquals("(e,4,5)", token.toString());
token = tokenizer.next();
assertEquals("(de,3,5)", token.toString());
token = tokenizer.next();
assertEquals("(cde,2,5)", token.toString());
token = tokenizer.next();
assertNull(token);
final Token reusableToken = new Token();
Token nextToken = tokenizer.next(reusableToken);
assertEquals("(e,4,5)", nextToken.toString());
nextToken = tokenizer.next(reusableToken);
assertEquals("(de,3,5)", nextToken.toString());
nextToken = tokenizer.next(reusableToken);
assertEquals("(cde,2,5)", nextToken.toString());
assertNull(tokenizer.next(reusableToken));
}
}
@ -66,52 +66,46 @@ public class EdgeNGramTokenizerTest extends TestCase {

public void testFrontUnigram() throws Exception {
EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 1, 1);
Token token = null;
token = tokenizer.next();
assertEquals("(a,0,1)", token.toString());
token = tokenizer.next();
assertNull(token);
final Token reusableToken = new Token();
Token nextToken = tokenizer.next(reusableToken);
assertEquals("(a,0,1)", nextToken.toString());
assertNull(tokenizer.next(reusableToken));
}

public void testBackUnigram() throws Exception {
EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.BACK, 1, 1);
Token token = null;
token = tokenizer.next();
assertEquals("(e,4,5)", token.toString());
token = tokenizer.next();
assertNull(token);
final Token reusableToken = new Token();
Token nextToken = tokenizer.next(reusableToken);
assertEquals("(e,4,5)", nextToken.toString());
assertNull(tokenizer.next(reusableToken));
}

public void testOversizedNgrams() throws Exception {
EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 6, 6);
Token token = null;
token = tokenizer.next();
assertNull(token);
assertNull(tokenizer.next(new Token()));
}

public void testFrontRangeOfNgrams() throws Exception {
EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.FRONT, 1, 3);
Token token = null;
token = tokenizer.next();
assertEquals("(a,0,1)", token.toString());
token = tokenizer.next();
assertEquals("(ab,0,2)", token.toString());
token = tokenizer.next();
assertEquals("(abc,0,3)", token.toString());
token = tokenizer.next();
assertNull(token);
final Token reusableToken = new Token();
Token nextToken = tokenizer.next(reusableToken);
assertEquals("(a,0,1)", nextToken.toString());
nextToken = tokenizer.next(reusableToken);
assertEquals("(ab,0,2)", nextToken.toString());
nextToken = tokenizer.next(reusableToken);
assertEquals("(abc,0,3)", nextToken.toString());
assertNull(tokenizer.next(reusableToken));
}

public void testBackRangeOfNgrams() throws Exception {
EdgeNGramTokenizer tokenizer = new EdgeNGramTokenizer(input, EdgeNGramTokenizer.Side.BACK, 1, 3);
Token token = null;
token = tokenizer.next();
assertEquals("(e,4,5)", token.toString());
token = tokenizer.next();
assertEquals("(de,3,5)", token.toString());
token = tokenizer.next();
assertEquals("(cde,2,5)", token.toString());
token = tokenizer.next();
assertNull(token);
final Token reusableToken = new Token();
Token nextToken = tokenizer.next(reusableToken);
assertEquals("(e,4,5)", nextToken.toString());
nextToken = tokenizer.next(reusableToken);
assertEquals("(de,3,5)", nextToken.toString());
nextToken = tokenizer.next(reusableToken);
assertEquals("(cde,2,5)", nextToken.toString());
assertNull(tokenizer.next(reusableToken));
}
}
@ -60,17 +60,14 @@ public class NGramTokenFilterTest extends TestCase {

public void testUnigrams() throws Exception {
NGramTokenFilter filter = new NGramTokenFilter(input, 1, 1);

Token token = null;
do {
token = filter.next();
if (token != null) {
tokens.add(token.toString());
// System.out.println(token.termText());
// System.out.println(token);
// Thread.sleep(1000);
}
} while (token != null);

final Token reusableToken = new Token();
for (Token nextToken = filter.next(reusableToken); nextToken != null; nextToken = filter.next(reusableToken)) {
tokens.add(nextToken.toString());
// System.out.println(token.term());
// System.out.println(token);
// Thread.sleep(1000);
}

assertEquals(5, tokens.size());
ArrayList exp = new ArrayList();

@ -80,17 +77,13 @@ public class NGramTokenFilterTest extends TestCase {

public void testBigrams() throws Exception {
NGramTokenFilter filter = new NGramTokenFilter(input, 2, 2);

Token token = null;
do {
token = filter.next();
if (token != null) {
tokens.add(token.toString());
// System.out.println(token.termText());
// System.out.println(token);
// Thread.sleep(1000);
}
} while (token != null);
final Token reusableToken = new Token();
for (Token nextToken = filter.next(reusableToken); nextToken != null; nextToken = filter.next(reusableToken)) {
tokens.add(nextToken.toString());
// System.out.println(token.term());
// System.out.println(token);
// Thread.sleep(1000);
}

assertEquals(4, tokens.size());
ArrayList exp = new ArrayList();

@ -100,17 +93,13 @@ public class NGramTokenFilterTest extends TestCase {

public void testNgrams() throws Exception {
NGramTokenFilter filter = new NGramTokenFilter(input, 1, 3);

Token token = null;
do {
token = filter.next();
if (token != null) {
tokens.add(token.toString());
// System.out.println(token.termText());
// System.out.println(token);
// Thread.sleep(1000);
}
} while (token != null);
final Token reusableToken = new Token();
for (Token nextToken = filter.next(reusableToken); nextToken != null; nextToken = filter.next(reusableToken)) {
tokens.add(nextToken.toString());
// System.out.println(token.term());
// System.out.println(token);
// Thread.sleep(1000);
}

assertEquals(12, tokens.size());
ArrayList exp = new ArrayList();

@ -122,17 +111,13 @@ public class NGramTokenFilterTest extends TestCase {

public void testOversizedNgrams() throws Exception {
NGramTokenFilter filter = new NGramTokenFilter(input, 6, 7);

Token token = null;
do {
token = filter.next();
if (token != null) {
tokens.add(token.toString());
// System.out.println(token.termText());
// System.out.println(token);
// Thread.sleep(1000);
}
} while (token != null);
final Token reusableToken = new Token();
for (Token nextToken = filter.next(reusableToken); nextToken != null; nextToken = filter.next(reusableToken)) {
tokens.add(nextToken.toString());
// System.out.println(token.term());
// System.out.println(token);
// Thread.sleep(1000);
}

assertTrue(tokens.isEmpty());
}
@ -59,16 +59,13 @@ public class NGramTokenizerTest extends TestCase {
|
|||
public void testUnigrams() throws Exception {
|
||||
NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 1);
|
||||
|
||||
Token token = null;
|
||||
do {
|
||||
token = tokenizer.next();
|
||||
if (token != null) {
|
||||
tokens.add(token.toString());
|
||||
// System.out.println(token.termText());
|
||||
// System.out.println(token);
|
||||
// Thread.sleep(1000);
|
||||
}
|
||||
} while (token != null);
|
||||
final Token reusableToken = new Token();
|
||||
for (Token nextToken = tokenizer.next(reusableToken); nextToken != null; nextToken = tokenizer.next(reusableToken)) {
|
||||
tokens.add(nextToken.toString());
|
||||
// System.out.println(token.term());
|
||||
// System.out.println(token);
|
||||
// Thread.sleep(1000);
|
||||
}
|
||||
|
||||
assertEquals(5, tokens.size());
|
||||
ArrayList exp = new ArrayList();
|
||||
|
@ -78,17 +75,13 @@ public class NGramTokenizerTest extends TestCase {
|
|||
|
||||
public void testBigrams() throws Exception {
|
||||
NGramTokenizer tokenizer = new NGramTokenizer(input, 2, 2);
|
||||
|
||||
Token token = null;
|
||||
do {
|
||||
token = tokenizer.next();
|
||||
if (token != null) {
|
||||
tokens.add(token.toString());
|
||||
// System.out.println(token.termText());
|
||||
// System.out.println(token);
|
||||
// Thread.sleep(1000);
|
||||
}
|
||||
} while (token != null);
|
||||
final Token reusableToken = new Token();
|
||||
for (Token nextToken = tokenizer.next(reusableToken); nextToken != null; nextToken = tokenizer.next(reusableToken)) {
|
||||
tokens.add(nextToken.toString());
|
||||
// System.out.println(token.term());
|
||||
// System.out.println(token);
|
||||
// Thread.sleep(1000);
|
||||
}
|
||||
|
||||
assertEquals(4, tokens.size());
|
||||
ArrayList exp = new ArrayList();
|
||||
|
@ -98,17 +91,13 @@ public class NGramTokenizerTest extends TestCase {
|
|||
|
||||
public void testNgrams() throws Exception {
|
||||
NGramTokenizer tokenizer = new NGramTokenizer(input, 1, 3);
|
||||
|
||||
Token token = null;
|
||||
do {
|
||||
token = tokenizer.next();
|
||||
if (token != null) {
|
||||
tokens.add(token.toString());
|
||||
// System.out.println(token.termText());
|
||||
// System.out.println(token);
|
||||
// Thread.sleep(1000);
|
||||
}
|
||||
} while (token != null);
|
||||
final Token reusableToken = new Token();
|
||||
for (Token nextToken = tokenizer.next(reusableToken); nextToken != null; nextToken = tokenizer.next(reusableToken)) {
|
||||
tokens.add(nextToken.toString());
|
||||
// System.out.println(token.term());
|
||||
// System.out.println(token);
|
||||
// Thread.sleep(1000);
|
||||
}
|
||||
|
||||
assertEquals(12, tokens.size());
|
||||
ArrayList exp = new ArrayList();
|
||||
|
@ -120,17 +109,14 @@ public class NGramTokenizerTest extends TestCase {
|
|||
|
||||
public void testOversizedNgrams() throws Exception {
|
||||
NGramTokenizer tokenizer = new NGramTokenizer(input, 6, 7);
|
||||
|
||||
Token token = null;
|
||||
do {
|
||||
token = tokenizer.next();
|
||||
if (token != null) {
|
||||
tokens.add(token.toString());
|
||||
// System.out.println(token.termText());
|
||||
// System.out.println(token);
|
||||
// Thread.sleep(1000);
|
||||
}
|
||||
} while (token != null);
|
||||
|
||||
final Token reusableToken = new Token();
|
||||
for (Token nextToken = tokenizer.next(reusableToken); nextToken != null; nextToken = tokenizer.next(reusableToken)) {
|
||||
tokens.add(nextToken.toString());
|
||||
// System.out.println(token.term());
|
||||
// System.out.println(token);
|
||||
// Thread.sleep(1000);
|
||||
}
|
||||
|
||||
assertTrue(tokens.isEmpty());
|
||||
}
|
||||
|
|
|
@ -43,20 +43,20 @@ public class NumericPayloadTokenFilterTest extends TestCase {
|
|||
String test = "The quick red fox jumped over the lazy brown dogs";
|
||||
|
||||
NumericPayloadTokenFilter nptf = new NumericPayloadTokenFilter(new WordTokenFilter(new WhitespaceTokenizer(new StringReader(test))), 3, "D");
|
||||
Token tok = new Token();
|
||||
boolean seenDogs = false;
|
||||
while ((tok = nptf.next(tok)) != null){
|
||||
if (tok.termText().equals("dogs")){
|
||||
final Token reusableToken = new Token();
|
||||
for (Token nextToken = nptf.next(reusableToken); nextToken != null; nextToken = nptf.next(reusableToken)) {
|
||||
if (nextToken.term().equals("dogs")){
|
||||
seenDogs = true;
|
||||
assertTrue(tok.type() + " is not equal to " + "D", tok.type().equals("D") == true);
|
||||
assertTrue("tok.getPayload() is null and it shouldn't be", tok.getPayload() != null);
|
||||
byte [] bytes = tok.getPayload().getData();//safe here to just use the bytes, otherwise we should use offset, length
|
||||
assertTrue(bytes.length + " does not equal: " + tok.getPayload().length(), bytes.length == tok.getPayload().length());
|
||||
assertTrue(tok.getPayload().getOffset() + " does not equal: " + 0, tok.getPayload().getOffset() == 0);
|
||||
assertTrue(nextToken.type() + " is not equal to " + "D", nextToken.type().equals("D") == true);
|
||||
assertTrue("nextToken.getPayload() is null and it shouldn't be", nextToken.getPayload() != null);
|
||||
byte [] bytes = nextToken.getPayload().getData();//safe here to just use the bytes, otherwise we should use offset, length
|
||||
assertTrue(bytes.length + " does not equal: " + nextToken.getPayload().length(), bytes.length == nextToken.getPayload().length());
|
||||
assertTrue(nextToken.getPayload().getOffset() + " does not equal: " + 0, nextToken.getPayload().getOffset() == 0);
|
||||
float pay = PayloadHelper.decodeFloat(bytes);
|
||||
assertTrue(pay + " does not equal: " + 3, pay == 3);
|
||||
} else {
|
||||
assertTrue(tok.type() + " is not null and it should be", tok.type().equals("word"));
|
||||
assertTrue(nextToken.type() + " is not null and it should be", nextToken.type().equals("word"));
|
||||
}
|
||||
}
|
||||
assertTrue(seenDogs + " does not equal: " + true, seenDogs == true);
|
||||
|
@ -67,12 +67,13 @@ public class NumericPayloadTokenFilterTest extends TestCase {
|
|||
super(input);
|
||||
}
|
||||
|
||||
public Token next(Token result) throws IOException {
|
||||
result = input.next(result);
|
||||
if (result != null && result.termText().equals("dogs")) {
|
||||
result.setType("D");
|
||||
public Token next(final Token reusableToken) throws IOException {
|
||||
assert reusableToken != null;
|
||||
Token nextToken = input.next(reusableToken);
|
||||
if (nextToken != null && nextToken.term().equals("dogs")) {
|
||||
nextToken.setType("D");
|
||||
}
|
||||
return result;
|
||||
return nextToken;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -42,17 +42,17 @@ public class TokenOffsetPayloadTokenFilterTest extends TestCase {
|
|||
String test = "The quick red fox jumped over the lazy brown dogs";
|
||||
|
||||
TokenOffsetPayloadTokenFilter nptf = new TokenOffsetPayloadTokenFilter(new WhitespaceTokenizer(new StringReader(test)));
|
||||
Token tok = new Token();
|
||||
int count = 0;
|
||||
while ((tok = nptf.next(tok)) != null){
|
||||
assertTrue("tok is null and it shouldn't be", tok != null);
|
||||
Payload pay = tok.getPayload();
|
||||
final Token reusableToken = new Token();
|
||||
for (Token nextToken = nptf.next(reusableToken); nextToken != null; nextToken = nptf.next(reusableToken)) {
|
||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
||||
Payload pay = nextToken.getPayload();
|
||||
assertTrue("pay is null and it shouldn't be", pay != null);
|
||||
byte [] data = pay.getData();
|
||||
int start = PayloadHelper.decodeInt(data, 0);
|
||||
assertTrue(start + " does not equal: " + tok.startOffset(), start == tok.startOffset());
|
||||
assertTrue(start + " does not equal: " + nextToken.startOffset(), start == nextToken.startOffset());
|
||||
int end = PayloadHelper.decodeInt(data, 4);
|
||||
assertTrue(end + " does not equal: " + tok.endOffset(), end == tok.endOffset());
|
||||
assertTrue(end + " does not equal: " + nextToken.endOffset(), end == nextToken.endOffset());
|
||||
count++;
|
||||
}
|
||||
assertTrue(count + " does not equal: " + 10, count == 10);
|
||||
|
|
|
@ -44,14 +44,14 @@ public class TypeAsPayloadTokenFilterTest extends TestCase {
|
|||
String test = "The quick red fox jumped over the lazy brown dogs";
|
||||
|
||||
TypeAsPayloadTokenFilter nptf = new TypeAsPayloadTokenFilter(new WordTokenFilter(new WhitespaceTokenizer(new StringReader(test))));
|
||||
Token tok = new Token();
|
||||
int count = 0;
|
||||
while ((tok = nptf.next(tok)) != null){
|
||||
assertTrue(tok.type() + " is not null and it should be", tok.type().equals(String.valueOf(Character.toUpperCase(tok.termBuffer()[0]))));
|
||||
assertTrue("tok.getPayload() is null and it shouldn't be", tok.getPayload() != null);
|
||||
String type = new String(tok.getPayload().getData(), "UTF-8");
|
||||
final Token reusableToken = new Token();
|
||||
for (Token nextToken = nptf.next(reusableToken); nextToken != null; nextToken = nptf.next(reusableToken)) {
|
||||
assertTrue(nextToken.type() + " is not null and it should be", nextToken.type().equals(String.valueOf(Character.toUpperCase(nextToken.termBuffer()[0]))));
|
||||
assertTrue("nextToken.getPayload() is null and it shouldn't be", nextToken.getPayload() != null);
|
||||
String type = new String(nextToken.getPayload().getData(), "UTF-8");
|
||||
assertTrue("type is null and it shouldn't be", type != null);
|
||||
assertTrue(type + " is not equal to " + tok.type(), type.equals(tok.type()) == true);
|
||||
assertTrue(type + " is not equal to " + nextToken.type(), type.equals(nextToken.type()) == true);
|
||||
count++;
|
||||
}
|
||||
assertTrue(count + " does not equal: " + 10, count == 10);
|
||||
|
@ -64,12 +64,13 @@ public class TypeAsPayloadTokenFilterTest extends TestCase {
|
|||
|
||||
|
||||
|
||||
public Token next(Token result) throws IOException {
|
||||
result = input.next(result);
|
||||
if (result != null) {
|
||||
result.setType(String.valueOf(Character.toUpperCase(result.termBuffer()[0])));
|
||||
public Token next(final Token reusableToken) throws IOException {
|
||||
assert reusableToken != null;
|
||||
Token nextToken = input.next(reusableToken);
|
||||
if (nextToken != null) {
|
||||
nextToken.setType(String.valueOf(Character.toUpperCase(nextToken.termBuffer()[0])));
|
||||
}
|
||||
return result;
|
||||
return nextToken;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -17,12 +17,17 @@ package org.apache.lucene.analysis.ru;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
import java.io.*;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
/**
|
||||
* Test case for RussianAnalyzer.
|
||||
|
@ -72,22 +77,26 @@ public class TestRussianAnalyzer extends TestCase
|
|||
sampleUnicode,
|
||||
RussianCharsets.UnicodeRussian);
|
||||
|
||||
final Token reusableToken = new Token();
|
||||
final Token reusableSampleToken = new Token();
|
||||
Token nextToken;
|
||||
Token nextSampleToken;
|
||||
for (;;)
|
||||
{
|
||||
Token token = in.next();
|
||||
nextToken = in.next(reusableToken);
|
||||
|
||||
if (token == null)
|
||||
if (nextToken == null)
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
Token sampleToken = sample.next();
|
||||
nextSampleToken = sample.next(reusableSampleToken);
|
||||
assertEquals(
|
||||
"Unicode",
|
||||
token.termText(),
|
||||
sampleToken == null
|
||||
nextToken.term(),
|
||||
nextSampleToken == null
|
||||
? null
|
||||
: sampleToken.termText());
|
||||
: nextSampleToken.term());
|
||||
}
|
||||
|
||||
inWords.close();
|
||||
|
@ -109,22 +118,26 @@ public class TestRussianAnalyzer extends TestCase
|
|||
sampleKOI8,
|
||||
RussianCharsets.KOI8);
|
||||
|
||||
final Token reusableToken = new Token();
|
||||
final Token reusableSampleToken = new Token();
|
||||
Token nextToken;
|
||||
Token nextSampleToken;
|
||||
for (;;)
|
||||
{
|
||||
Token token = in.next();
|
||||
nextToken = in.next(reusableToken);
|
||||
|
||||
if (token == null)
|
||||
if (nextToken == null)
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
Token sampleToken = sample.next();
|
||||
nextSampleToken = sample.next(reusableSampleToken);
|
||||
assertEquals(
|
||||
"KOI8",
|
||||
token.termText(),
|
||||
sampleToken == null
|
||||
nextToken.term(),
|
||||
nextSampleToken == null
|
||||
? null
|
||||
: sampleToken.termText());
|
||||
: nextSampleToken.term());
|
||||
|
||||
}
|
||||
|
||||
|
@ -146,22 +159,26 @@ public class TestRussianAnalyzer extends TestCase
|
|||
sample1251,
|
||||
RussianCharsets.CP1251);
|
||||
|
||||
final Token reusableToken = new Token();
|
||||
final Token reusableSampleToken = new Token();
|
||||
Token nextToken;
|
||||
Token nextSampleToken;
|
||||
for (;;)
|
||||
{
|
||||
Token token = in.next();
|
||||
nextToken = in.next(reusableToken);
|
||||
|
||||
if (token == null)
|
||||
if (nextToken == null)
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
Token sampleToken = sample.next();
|
||||
nextSampleToken = sample.next(reusableSampleToken);
|
||||
assertEquals(
|
||||
"1251",
|
||||
token.termText(),
|
||||
sampleToken == null
|
||||
nextToken.term(),
|
||||
nextSampleToken == null
|
||||
? null
|
||||
: sampleToken.termText());
|
||||
: nextSampleToken.term());
|
||||
|
||||
}
|
||||
|
||||
|
@ -175,9 +192,10 @@ public class TestRussianAnalyzer extends TestCase
|
|||
RussianAnalyzer ra = new RussianAnalyzer();
|
||||
TokenStream stream = ra.tokenStream("", reader);
|
||||
|
||||
final Token reusableToken = new Token();
|
||||
try {
|
||||
assertEquals("text", stream.next().termText());
|
||||
assertNotNull("RussianAnalyzer's tokenizer skips numbers from input text", stream.next());
|
||||
assertEquals("text", stream.next(reusableToken).term());
|
||||
assertNotNull("RussianAnalyzer's tokenizer skips numbers from input text", stream.next(reusableToken));
|
||||
}
|
||||
catch (IOException e)
|
||||
{
|
||||
|
|
|
@ -156,11 +156,11 @@ public class ShingleAnalyzerWrapperTest extends TestCase {
|
|||
|
||||
TokenStream ts = analyzer.tokenStream("content",
|
||||
new StringReader("this sentence"));
|
||||
Token token;
|
||||
int j = -1;
|
||||
while ((token = ts.next()) != null) {
|
||||
j += token.getPositionIncrement();
|
||||
String termText = new String(token.termBuffer(), 0, token.termLength());
|
||||
final Token reusableToken = new Token();
|
||||
for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) {
|
||||
j += nextToken.getPositionIncrement();
|
||||
String termText = nextToken.term();
|
||||
q.add(new Term("content", termText), j);
|
||||
}
|
||||
|
||||
|
@ -182,9 +182,9 @@ public class ShingleAnalyzerWrapperTest extends TestCase {
|
|||
|
||||
TokenStream ts = analyzer.tokenStream("content",
|
||||
new StringReader("test sentence"));
|
||||
Token token;
|
||||
while ((token = ts.next()) != null) {
|
||||
String termText = new String(token.termBuffer(), 0, token.termLength());
|
||||
final Token reusableToken = new Token();
|
||||
for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) {
|
||||
String termText = nextToken.term();
|
||||
q.add(new TermQuery(new Term("content", termText)),
|
||||
BooleanClause.Occur.SHOULD);
|
||||
}
|
||||
|
|
|
@ -35,7 +35,8 @@ public class ShingleFilterTest extends TestCase {
|
|||
this.testToken = testToken;
|
||||
}
|
||||
|
||||
public Token next() throws IOException {
|
||||
public Token next(final Token reusableToken) throws IOException {
|
||||
assert reusableToken != null;
|
||||
if (index < testToken.length) {
|
||||
return testToken[index++];
|
||||
} else {
|
||||
|
@ -49,28 +50,28 @@ public class ShingleFilterTest extends TestCase {
|
|||
}
|
||||
|
||||
public static final Token[] TEST_TOKEN = new Token[] {
|
||||
new Token("please", 0, 6),
|
||||
new Token("divide", 7, 13),
|
||||
new Token("this", 14, 18),
|
||||
new Token("sentence", 19, 27),
|
||||
new Token("into", 28, 32),
|
||||
new Token("shingles", 33, 39),
|
||||
createToken("please", 0, 6),
|
||||
createToken("divide", 7, 13),
|
||||
createToken("this", 14, 18),
|
||||
createToken("sentence", 19, 27),
|
||||
createToken("into", 28, 32),
|
||||
createToken("shingles", 33, 39),
|
||||
};
|
||||
|
||||
public static Token[] testTokenWithHoles;
|
||||
|
||||
public static final Token[] BI_GRAM_TOKENS = new Token[] {
|
||||
new Token("please", 0, 6),
|
||||
new Token("please divide", 0, 13),
|
||||
new Token("divide", 7, 13),
|
||||
new Token("divide this", 7, 18),
|
||||
new Token("this", 14, 18),
|
||||
new Token("this sentence", 14, 27),
|
||||
new Token("sentence", 19, 27),
|
||||
new Token("sentence into", 19, 32),
|
||||
new Token("into", 28, 32),
|
||||
new Token("into shingles", 28, 39),
|
||||
new Token("shingles", 33, 39),
|
||||
createToken("please", 0, 6),
|
||||
createToken("please divide", 0, 13),
|
||||
createToken("divide", 7, 13),
|
||||
createToken("divide this", 7, 18),
|
||||
createToken("this", 14, 18),
|
||||
createToken("this sentence", 14, 27),
|
||||
createToken("sentence", 19, 27),
|
||||
createToken("sentence into", 19, 32),
|
||||
createToken("into", 28, 32),
|
||||
createToken("into shingles", 28, 39),
|
||||
createToken("shingles", 33, 39),
|
||||
};
|
||||
|
||||
public static final int[] BI_GRAM_POSITION_INCREMENTS = new int[] {
|
||||
|
@ -83,17 +84,17 @@ public class ShingleFilterTest extends TestCase {
|
|||
};
|
||||
|
||||
public static final Token[] BI_GRAM_TOKENS_WITH_HOLES = new Token[] {
|
||||
new Token("please", 0, 6),
|
||||
new Token("please divide", 0, 13),
|
||||
new Token("divide", 7, 13),
|
||||
new Token("divide _", 7, 19),
|
||||
new Token("_", 19, 19),
|
||||
new Token("_ sentence", 19, 27),
|
||||
new Token("sentence", 19, 27),
|
||||
new Token("sentence _", 19, 33),
|
||||
new Token("_", 33, 33),
|
||||
new Token("_ shingles", 33, 39),
|
||||
new Token("shingles", 33, 39),
|
||||
createToken("please", 0, 6),
|
||||
createToken("please divide", 0, 13),
|
||||
createToken("divide", 7, 13),
|
||||
createToken("divide _", 7, 19),
|
||||
createToken("_", 19, 19),
|
||||
createToken("_ sentence", 19, 27),
|
||||
createToken("sentence", 19, 27),
|
||||
createToken("sentence _", 19, 33),
|
||||
createToken("_", 33, 33),
|
||||
createToken("_ shingles", 33, 39),
|
||||
createToken("shingles", 33, 39),
|
||||
};
|
||||
|
||||
public static final int[] BI_GRAM_POSITION_INCREMENTS_WITH_HOLES = new int[] {
|
||||
|
@ -101,21 +102,21 @@ public class ShingleFilterTest extends TestCase {
|
|||
};
|
||||
|
||||
public static final Token[] TRI_GRAM_TOKENS = new Token[] {
|
||||
new Token("please", 0, 6),
|
||||
new Token("please divide", 0, 13),
|
||||
new Token("please divide this", 0, 18),
|
||||
new Token("divide", 7, 13),
|
||||
new Token("divide this", 7, 18),
|
||||
new Token("divide this sentence", 7, 27),
|
||||
new Token("this", 14, 18),
|
||||
new Token("this sentence", 14, 27),
|
||||
new Token("this sentence into", 14, 32),
|
||||
new Token("sentence", 19, 27),
|
||||
new Token("sentence into", 19, 32),
|
||||
new Token("sentence into shingles", 19, 39),
|
||||
new Token("into", 28, 32),
|
||||
new Token("into shingles", 28, 39),
|
||||
new Token("shingles", 33, 39)
|
||||
createToken("please", 0, 6),
|
||||
createToken("please divide", 0, 13),
|
||||
createToken("please divide this", 0, 18),
|
||||
createToken("divide", 7, 13),
|
||||
createToken("divide this", 7, 18),
|
||||
createToken("divide this sentence", 7, 27),
|
||||
createToken("this", 14, 18),
|
||||
createToken("this sentence", 14, 27),
|
||||
createToken("this sentence into", 14, 32),
|
||||
createToken("sentence", 19, 27),
|
||||
createToken("sentence into", 19, 32),
|
||||
createToken("sentence into shingles", 19, 39),
|
||||
createToken("into", 28, 32),
|
||||
createToken("into shingles", 28, 39),
|
||||
createToken("shingles", 33, 39)
|
||||
};
|
||||
|
||||
public static final int[] TRI_GRAM_POSITION_INCREMENTS = new int[] {
|
||||
|
@ -135,10 +136,10 @@ public class ShingleFilterTest extends TestCase {
|
|||
protected void setUp() throws Exception {
|
||||
super.setUp();
|
||||
testTokenWithHoles = new Token[] {
|
||||
new Token("please", 0, 6),
|
||||
new Token("divide", 7, 13),
|
||||
new Token("sentence", 19, 27),
|
||||
new Token("shingles", 33, 39),
|
||||
createToken("please", 0, 6),
|
||||
createToken("divide", 7, 13),
|
||||
createToken("sentence", 19, 27),
|
||||
createToken("shingles", 33, 39),
|
||||
};
|
||||
|
||||
testTokenWithHoles[2].setPositionIncrement(2);
|
||||
|
@ -168,22 +169,27 @@ public class ShingleFilterTest extends TestCase {
|
|||
throws IOException {
|
||||
|
||||
TokenStream filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize);
|
||||
Token token;
|
||||
int i = 0;
|
||||
|
||||
while ((token = filter.next()) != null) {
|
||||
String termText = new String(token.termBuffer(), 0, token.termLength());
|
||||
String goldText
|
||||
= new String(tokensToCompare[i].termBuffer(), 0, tokensToCompare[i].termLength());
|
||||
final Token reusableToken = new Token();
|
||||
for (Token nextToken = filter.next(reusableToken); nextToken != null; nextToken = filter.next(reusableToken)) {
|
||||
String termText = nextToken.term();
|
||||
String goldText = tokensToCompare[i].term();
|
||||
assertEquals("Wrong termText", goldText, termText);
|
||||
assertEquals("Wrong startOffset for token \"" + termText + "\"",
|
||||
tokensToCompare[i].startOffset(), token.startOffset());
|
||||
tokensToCompare[i].startOffset(), nextToken.startOffset());
|
||||
assertEquals("Wrong endOffset for token \"" + termText + "\"",
|
||||
tokensToCompare[i].endOffset(), token.endOffset());
|
||||
tokensToCompare[i].endOffset(), nextToken.endOffset());
|
||||
assertEquals("Wrong positionIncrement for token \"" + termText + "\"",
|
||||
positionIncrements[i], token.getPositionIncrement());
|
||||
assertEquals("Wrong type for token \"" + termText + "\"", types[i], token.type());
|
||||
positionIncrements[i], nextToken.getPositionIncrement());
|
||||
assertEquals("Wrong type for token \"" + termText + "\"", types[i], nextToken.type());
|
||||
i++;
|
||||
}
|
||||
}
|
||||
|
||||
private static Token createToken(String term, int start, int offset)
|
||||
{
|
||||
Token token = new Token(start, offset);
|
||||
token.setTermBuffer(term);
|
||||
return token;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -40,29 +40,23 @@ public class TestShingleMatrixFilter extends TestCase {
|
|||
|
||||
ShingleMatrixFilter.defaultSettingsCodec = null;
|
||||
|
||||
Token token = new Token(); // for debug use only
|
||||
|
||||
|
||||
|
||||
|
||||
TokenStream ts;
|
||||
|
||||
|
||||
ts = new ShingleMatrixFilter(new EmptyTokenStream(), 1, 2, ' ', false, new ShingleMatrixFilter.OneDimensionalNonWeightedTokenSettingsCodec());
|
||||
assertNull(ts.next());
|
||||
assertNull(ts.next(new Token()));
|
||||
|
||||
TokenListStream tls;
|
||||
LinkedList<Token> tokens;
|
||||
|
||||
// test a plain old token stream with synonyms tranlated to rows.
|
||||
// test a plain old token stream with synonyms translated to rows.
|
||||
|
||||
tokens = new LinkedList<Token>();
|
||||
tokens.add(new Token("please", 0, 6));
|
||||
tokens.add(new Token("divide", 7, 13));
|
||||
tokens.add(new Token("this", 14, 18));
|
||||
tokens.add(new Token("sentence", 19, 27));
|
||||
tokens.add(new Token("into", 28, 32));
|
||||
tokens.add(new Token("shingles", 33, 39));
|
||||
tokens.add(createToken("please", 0, 6));
|
||||
tokens.add(createToken("divide", 7, 13));
|
||||
tokens.add(createToken("this", 14, 18));
|
||||
tokens.add(createToken("sentence", 19, 27));
|
||||
tokens.add(createToken("into", 28, 32));
|
||||
tokens.add(createToken("shingles", 33, 39));
|
||||
|
||||
tls = new TokenListStream(tokens);
|
||||
|
||||
|
@ -70,20 +64,22 @@ public class TestShingleMatrixFilter extends TestCase {
|
|||
|
||||
ts = new ShingleMatrixFilter(tls, 1, 2, ' ', false, new ShingleMatrixFilter.OneDimensionalNonWeightedTokenSettingsCodec());
|
||||
|
||||
assertNext(ts, "please", 0, 6);
|
||||
assertNext(ts, "please divide", 0, 13);
|
||||
assertNext(ts, "divide", 7, 13);
|
||||
assertNext(ts, "divide this", 7, 18);
|
||||
assertNext(ts, "this", 14, 18);
|
||||
assertNext(ts, "this sentence", 14, 27);
|
||||
assertNext(ts, "sentence", 19, 27);
|
||||
assertNext(ts, "sentence into", 19, 32);
|
||||
assertNext(ts, "into", 28, 32);
|
||||
assertNext(ts, "into shingles", 28, 39);
|
||||
assertNext(ts, "shingles", 33, 39);
|
||||
Token reusableToken = new Token();
|
||||
|
||||
assertNext(ts, reusableToken, "please", 0, 6);
|
||||
assertNext(ts, reusableToken, "please divide", 0, 13);
|
||||
assertNext(ts, reusableToken, "divide", 7, 13);
|
||||
assertNext(ts, reusableToken, "divide this", 7, 18);
|
||||
assertNext(ts, reusableToken, "this", 14, 18);
|
||||
assertNext(ts, reusableToken, "this sentence", 14, 27);
|
||||
assertNext(ts, reusableToken, "sentence", 19, 27);
|
||||
assertNext(ts, reusableToken, "sentence into", 19, 32);
|
||||
assertNext(ts, reusableToken, "into", 28, 32);
|
||||
assertNext(ts, reusableToken, "into shingles", 28, 39);
|
||||
assertNext(ts, reusableToken, "shingles", 33, 39);
|
||||
|
||||
|
||||
assertNull(ts.next());
|
||||
assertNull(ts.next(reusableToken));
|
||||
|
||||
}
|
||||
|
||||
|
@ -95,9 +91,6 @@ public class TestShingleMatrixFilter extends TestCase {
|
|||
|
||||
ShingleMatrixFilter.defaultSettingsCodec = null;//new ShingleMatrixFilter.SimpleThreeDimensionalTokenSettingsCodec();
|
||||
|
||||
Token token = new Token(); // for debug use only
|
||||
|
||||
|
||||
TokenStream ts;
|
||||
TokenListStream tls;
|
||||
LinkedList<Token> tokens;
|
||||
|
@ -117,25 +110,26 @@ public class TestShingleMatrixFilter extends TestCase {
|
|||
|
||||
ts = new ShingleMatrixFilter(tls, 2, 2, '_', false, new ShingleMatrixFilter.TwoDimensionalNonWeightedSynonymTokenSettingsCodec());
|
||||
|
||||
assertNext(ts, "hello_world");
|
||||
assertNext(ts, "greetings_world");
|
||||
assertNext(ts, "hello_earth");
|
||||
assertNext(ts, "greetings_earth");
|
||||
assertNext(ts, "hello_tellus");
|
||||
assertNext(ts, "greetings_tellus");
|
||||
assertNull(ts.next());
|
||||
final Token reusableToken = new Token();
|
||||
assertNext(ts, reusableToken, "hello_world");
|
||||
assertNext(ts, reusableToken, "greetings_world");
|
||||
assertNext(ts, reusableToken, "hello_earth");
|
||||
assertNext(ts, reusableToken, "greetings_earth");
|
||||
assertNext(ts, reusableToken, "hello_tellus");
|
||||
assertNext(ts, reusableToken, "greetings_tellus");
|
||||
assertNull(ts.next(reusableToken));
|
||||
|
||||
// bi-grams with no spacer character, start offset, end offset
|
||||
|
||||
tls.reset();
|
||||
ts = new ShingleMatrixFilter(tls, 2, 2, null, false, new ShingleMatrixFilter.TwoDimensionalNonWeightedSynonymTokenSettingsCodec());
|
||||
assertNext(ts, "helloworld", 0, 10);
|
||||
assertNext(ts, "greetingsworld", 0, 10);
|
||||
assertNext(ts, "helloearth", 0, 10);
|
||||
assertNext(ts, "greetingsearth", 0, 10);
|
||||
assertNext(ts, "hellotellus", 0, 10);
|
||||
assertNext(ts, "greetingstellus", 0, 10);
|
||||
assertNull(ts.next());
|
||||
assertNext(ts, reusableToken, "helloworld", 0, 10);
|
||||
assertNext(ts, reusableToken, "greetingsworld", 0, 10);
|
||||
assertNext(ts, reusableToken, "helloearth", 0, 10);
|
||||
assertNext(ts, reusableToken, "greetingsearth", 0, 10);
|
||||
assertNext(ts, reusableToken, "hellotellus", 0, 10);
|
||||
assertNext(ts, reusableToken, "greetingstellus", 0, 10);
|
||||
assertNull(ts.next(reusableToken));
|
||||
|
||||
|
||||
// add ^_prefix_and_suffix_$
|
||||
|
@ -160,119 +154,119 @@ public class TestShingleMatrixFilter extends TestCase {
|
|||
|
||||
ts = new ShingleMatrixFilter(tls, 2, 2, '_', false);
|
||||
//
|
||||
// while ((token = ts.next(token)) != null) {
|
||||
// System.out.println("assertNext(ts, \"" + token.termText() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
|
||||
// for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) {
|
||||
// System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
|
||||
// token.clear();
|
||||
// }
|
||||
|
||||
assertNext(ts, "^_hello", 1, 10.049875f, 0, 4);
|
||||
assertNext(ts, "^_greetings", 1, 10.049875f, 0, 4);
|
||||
assertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, "world_$", 1, 7.1414285f, 5, 10);
|
||||
assertNext(ts, "earth_$", 1, 7.1414285f, 5, 10);
|
||||
assertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10);
|
||||
assertNull(ts.next());
|
||||
assertNext(ts, reusableToken, "^_hello", 1, 10.049875f, 0, 4);
|
||||
assertNext(ts, reusableToken, "^_greetings", 1, 10.049875f, 0, 4);
|
||||
assertNext(ts, reusableToken, "hello_world", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, reusableToken, "greetings_world", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, reusableToken, "hello_earth", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, reusableToken, "greetings_earth", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, reusableToken, "hello_tellus", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, reusableToken, "greetings_tellus", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, reusableToken, "world_$", 1, 7.1414285f, 5, 10);
|
||||
assertNext(ts, reusableToken, "earth_$", 1, 7.1414285f, 5, 10);
|
||||
assertNext(ts, reusableToken, "tellus_$", 1, 7.1414285f, 5, 10);
|
||||
assertNull(ts.next(reusableToken));
|
||||
|
||||
// test unlimited size and allow single boundary token as shingle
|
||||
tls.reset();
|
||||
ts = new ShingleMatrixFilter(tls, 1, Integer.MAX_VALUE, '_', false);
|
||||
|
||||
//
|
||||
// while ((token = ts.next(token)) != null) {
|
||||
// System.out.println("assertNext(ts, \"" + token.termText() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
|
||||
// for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) {
|
||||
// System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
|
||||
// token.clear();
|
||||
// }
|
||||
|
||||
assertNext(ts, "^", 1, 10.0f, 0, 0);
|
||||
assertNext(ts, "^_hello", 1, 10.049875f, 0, 4);
|
||||
assertNext(ts, "^_hello_world", 1, 10.099504f, 0, 10);
|
||||
assertNext(ts, "^_hello_world_$", 1, 12.328828f, 0, 10);
|
||||
assertNext(ts, "hello", 1, 1.0f, 0, 4);
|
||||
assertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, "hello_world_$", 1, 7.2111025f, 0, 10);
|
||||
assertNext(ts, "world", 1, 1.0f, 5, 10);
|
||||
assertNext(ts, "world_$", 1, 7.1414285f, 5, 10);
|
||||
assertNext(ts, "$", 1, 7.071068f, 10, 10);
|
||||
assertNext(ts, "^_greetings", 1, 10.049875f, 0, 4);
|
||||
assertNext(ts, "^_greetings_world", 1, 10.099504f, 0, 10);
|
||||
assertNext(ts, "^_greetings_world_$", 1, 12.328828f, 0, 10);
|
||||
assertNext(ts, "greetings", 1, 1.0f, 0, 4);
|
||||
assertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, "greetings_world_$", 1, 7.2111025f, 0, 10);
|
||||
assertNext(ts, "^_hello_earth", 1, 10.099504f, 0, 10);
|
||||
assertNext(ts, "^_hello_earth_$", 1, 12.328828f, 0, 10);
|
||||
assertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, "hello_earth_$", 1, 7.2111025f, 0, 10);
|
||||
assertNext(ts, "earth", 1, 1.0f, 5, 10);
|
||||
assertNext(ts, "earth_$", 1, 7.1414285f, 5, 10);
|
||||
assertNext(ts, "^_greetings_earth", 1, 10.099504f, 0, 10);
|
||||
assertNext(ts, "^_greetings_earth_$", 1, 12.328828f, 0, 10);
|
||||
assertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, "greetings_earth_$", 1, 7.2111025f, 0, 10);
|
||||
assertNext(ts, "^_hello_tellus", 1, 10.099504f, 0, 10);
|
||||
assertNext(ts, "^_hello_tellus_$", 1, 12.328828f, 0, 10);
|
||||
assertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, "hello_tellus_$", 1, 7.2111025f, 0, 10);
|
||||
assertNext(ts, "tellus", 1, 1.0f, 5, 10);
|
||||
assertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10);
|
||||
assertNext(ts, "^_greetings_tellus", 1, 10.099504f, 0, 10);
|
||||
assertNext(ts, "^_greetings_tellus_$", 1, 12.328828f, 0, 10);
|
||||
assertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, "greetings_tellus_$", 1, 7.2111025f, 0, 10);
|
||||
assertNext(ts, reusableToken, "^", 1, 10.0f, 0, 0);
|
||||
assertNext(ts, reusableToken, "^_hello", 1, 10.049875f, 0, 4);
|
||||
assertNext(ts, reusableToken, "^_hello_world", 1, 10.099504f, 0, 10);
|
||||
assertNext(ts, reusableToken, "^_hello_world_$", 1, 12.328828f, 0, 10);
|
||||
assertNext(ts, reusableToken, "hello", 1, 1.0f, 0, 4);
|
||||
assertNext(ts, reusableToken, "hello_world", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, reusableToken, "hello_world_$", 1, 7.2111025f, 0, 10);
|
||||
assertNext(ts, reusableToken, "world", 1, 1.0f, 5, 10);
|
||||
assertNext(ts, reusableToken, "world_$", 1, 7.1414285f, 5, 10);
|
||||
assertNext(ts, reusableToken, "$", 1, 7.071068f, 10, 10);
|
||||
assertNext(ts, reusableToken, "^_greetings", 1, 10.049875f, 0, 4);
|
||||
assertNext(ts, reusableToken, "^_greetings_world", 1, 10.099504f, 0, 10);
|
||||
assertNext(ts, reusableToken, "^_greetings_world_$", 1, 12.328828f, 0, 10);
|
||||
assertNext(ts, reusableToken, "greetings", 1, 1.0f, 0, 4);
|
||||
assertNext(ts, reusableToken, "greetings_world", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, reusableToken, "greetings_world_$", 1, 7.2111025f, 0, 10);
|
||||
assertNext(ts, reusableToken, "^_hello_earth", 1, 10.099504f, 0, 10);
|
||||
assertNext(ts, reusableToken, "^_hello_earth_$", 1, 12.328828f, 0, 10);
|
||||
assertNext(ts, reusableToken, "hello_earth", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, reusableToken, "hello_earth_$", 1, 7.2111025f, 0, 10);
|
||||
assertNext(ts, reusableToken, "earth", 1, 1.0f, 5, 10);
|
||||
assertNext(ts, reusableToken, "earth_$", 1, 7.1414285f, 5, 10);
|
||||
assertNext(ts, reusableToken, "^_greetings_earth", 1, 10.099504f, 0, 10);
|
||||
assertNext(ts, reusableToken, "^_greetings_earth_$", 1, 12.328828f, 0, 10);
|
||||
assertNext(ts, reusableToken, "greetings_earth", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, reusableToken, "greetings_earth_$", 1, 7.2111025f, 0, 10);
|
||||
assertNext(ts, reusableToken, "^_hello_tellus", 1, 10.099504f, 0, 10);
|
||||
assertNext(ts, reusableToken, "^_hello_tellus_$", 1, 12.328828f, 0, 10);
|
||||
assertNext(ts, reusableToken, "hello_tellus", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, reusableToken, "hello_tellus_$", 1, 7.2111025f, 0, 10);
|
||||
assertNext(ts, reusableToken, "tellus", 1, 1.0f, 5, 10);
|
||||
assertNext(ts, reusableToken, "tellus_$", 1, 7.1414285f, 5, 10);
|
||||
assertNext(ts, reusableToken, "^_greetings_tellus", 1, 10.099504f, 0, 10);
|
||||
assertNext(ts, reusableToken, "^_greetings_tellus_$", 1, 12.328828f, 0, 10);
|
||||
assertNext(ts, reusableToken, "greetings_tellus", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, reusableToken, "greetings_tellus_$", 1, 7.2111025f, 0, 10);
|
||||
|
||||
assertNull(ts.next());
|
||||
assertNull(ts.next(reusableToken));
|
||||
|
||||
// test unlimited size but don't allow single boundary token as shingle
|
||||
|
||||
tls.reset();
|
||||
ts = new ShingleMatrixFilter(tls, 1, Integer.MAX_VALUE, '_', true);
|
||||
// while ((token = ts.next(token)) != null) {
|
||||
// System.out.println("assertNext(ts, \"" + token.termText() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
|
||||
// for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) {
|
||||
// System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
|
||||
// token.clear();
|
||||
// }
|
||||
|
||||
assertNext(ts, "^_hello", 1, 10.049875f, 0, 4);
|
||||
assertNext(ts, "^_hello_world", 1, 10.099504f, 0, 10);
|
||||
assertNext(ts, "^_hello_world_$", 1, 12.328828f, 0, 10);
|
||||
assertNext(ts, "hello", 1, 1.0f, 0, 4);
|
||||
assertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, "hello_world_$", 1, 7.2111025f, 0, 10);
|
||||
assertNext(ts, "world", 1, 1.0f, 5, 10);
|
||||
assertNext(ts, "world_$", 1, 7.1414285f, 5, 10);
|
||||
assertNext(ts, "^_greetings", 1, 10.049875f, 0, 4);
|
||||
assertNext(ts, "^_greetings_world", 1, 10.099504f, 0, 10);
|
||||
assertNext(ts, "^_greetings_world_$", 1, 12.328828f, 0, 10);
|
||||
assertNext(ts, "greetings", 1, 1.0f, 0, 4);
|
||||
assertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, "greetings_world_$", 1, 7.2111025f, 0, 10);
|
||||
assertNext(ts, "^_hello_earth", 1, 10.099504f, 0, 10);
|
||||
assertNext(ts, "^_hello_earth_$", 1, 12.328828f, 0, 10);
|
||||
assertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, "hello_earth_$", 1, 7.2111025f, 0, 10);
|
||||
assertNext(ts, "earth", 1, 1.0f, 5, 10);
|
||||
assertNext(ts, "earth_$", 1, 7.1414285f, 5, 10);
|
||||
assertNext(ts, "^_greetings_earth", 1, 10.099504f, 0, 10);
|
||||
assertNext(ts, "^_greetings_earth_$", 1, 12.328828f, 0, 10);
|
||||
assertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, "greetings_earth_$", 1, 7.2111025f, 0, 10);
|
||||
assertNext(ts, "^_hello_tellus", 1, 10.099504f, 0, 10);
|
||||
assertNext(ts, "^_hello_tellus_$", 1, 12.328828f, 0, 10);
|
||||
assertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, "hello_tellus_$", 1, 7.2111025f, 0, 10);
|
||||
assertNext(ts, "tellus", 1, 1.0f, 5, 10);
|
||||
assertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10);
|
||||
assertNext(ts, "^_greetings_tellus", 1, 10.099504f, 0, 10);
|
||||
assertNext(ts, "^_greetings_tellus_$", 1, 12.328828f, 0, 10);
|
||||
assertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, "greetings_tellus_$", 1, 7.2111025f, 0, 10);
|
||||
assertNext(ts, reusableToken, "^_hello", 1, 10.049875f, 0, 4);
|
||||
assertNext(ts, reusableToken, "^_hello_world", 1, 10.099504f, 0, 10);
|
||||
assertNext(ts, reusableToken, "^_hello_world_$", 1, 12.328828f, 0, 10);
|
||||
assertNext(ts, reusableToken, "hello", 1, 1.0f, 0, 4);
|
||||
assertNext(ts, reusableToken, "hello_world", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, reusableToken, "hello_world_$", 1, 7.2111025f, 0, 10);
|
||||
assertNext(ts, reusableToken, "world", 1, 1.0f, 5, 10);
|
||||
assertNext(ts, reusableToken, "world_$", 1, 7.1414285f, 5, 10);
|
||||
assertNext(ts, reusableToken, "^_greetings", 1, 10.049875f, 0, 4);
|
||||
assertNext(ts, reusableToken, "^_greetings_world", 1, 10.099504f, 0, 10);
|
||||
assertNext(ts, reusableToken, "^_greetings_world_$", 1, 12.328828f, 0, 10);
|
||||
assertNext(ts, reusableToken, "greetings", 1, 1.0f, 0, 4);
|
||||
assertNext(ts, reusableToken, "greetings_world", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, reusableToken, "greetings_world_$", 1, 7.2111025f, 0, 10);
|
||||
assertNext(ts, reusableToken, "^_hello_earth", 1, 10.099504f, 0, 10);
|
||||
assertNext(ts, reusableToken, "^_hello_earth_$", 1, 12.328828f, 0, 10);
|
||||
assertNext(ts, reusableToken, "hello_earth", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, reusableToken, "hello_earth_$", 1, 7.2111025f, 0, 10);
|
||||
assertNext(ts, reusableToken, "earth", 1, 1.0f, 5, 10);
|
||||
assertNext(ts, reusableToken, "earth_$", 1, 7.1414285f, 5, 10);
|
||||
assertNext(ts, reusableToken, "^_greetings_earth", 1, 10.099504f, 0, 10);
|
||||
assertNext(ts, reusableToken, "^_greetings_earth_$", 1, 12.328828f, 0, 10);
|
||||
assertNext(ts, reusableToken, "greetings_earth", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, reusableToken, "greetings_earth_$", 1, 7.2111025f, 0, 10);
|
||||
assertNext(ts, reusableToken, "^_hello_tellus", 1, 10.099504f, 0, 10);
|
||||
assertNext(ts, reusableToken, "^_hello_tellus_$", 1, 12.328828f, 0, 10);
|
||||
assertNext(ts, reusableToken, "hello_tellus", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, reusableToken, "hello_tellus_$", 1, 7.2111025f, 0, 10);
|
||||
assertNext(ts, reusableToken, "tellus", 1, 1.0f, 5, 10);
|
||||
assertNext(ts, reusableToken, "tellus_$", 1, 7.1414285f, 5, 10);
|
||||
assertNext(ts, reusableToken, "^_greetings_tellus", 1, 10.099504f, 0, 10);
|
||||
assertNext(ts, reusableToken, "^_greetings_tellus_$", 1, 12.328828f, 0, 10);
|
||||
assertNext(ts, reusableToken, "greetings_tellus", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, reusableToken, "greetings_tellus_$", 1, 7.2111025f, 0, 10);
|
||||
|
||||
|
||||
assertNull(ts.next());
|
||||
assertNull(ts.next(reusableToken));
|
||||
|
||||
System.currentTimeMillis();
|
||||
|
||||
|
@ -300,27 +294,27 @@ public class TestShingleMatrixFilter extends TestCase {
|
|||
|
||||
ts = new ShingleMatrixFilter(tls, 2, 3, '_', false);
|
||||
|
||||
// while ((token = ts.next(token)) != null) {
|
||||
// System.out.println("assertNext(ts, \"" + token.termText() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
|
||||
// for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) {
|
||||
// System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
|
||||
// token.clear();
|
||||
// }
|
||||
|
||||
// shingle, position increment, weight, start offset, end offset
|
||||
|
||||
assertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, "greetings_and", 1, 1.4142135f, 0, 4);
|
||||
assertNext(ts, "greetings_and_salutations", 1, 1.7320508f, 0, 4);
|
||||
assertNext(ts, "and_salutations", 1, 1.4142135f, 0, 4);
|
||||
assertNext(ts, "and_salutations_world", 1, 1.7320508f, 0, 10);
|
||||
assertNext(ts, "salutations_world", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, "and_salutations_earth", 1, 1.7320508f, 0, 10);
|
||||
assertNext(ts, "salutations_earth", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, "and_salutations_tellus", 1, 1.7320508f, 0, 10);
|
||||
assertNext(ts, "salutations_tellus", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, reusableToken, "hello_world", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, reusableToken, "greetings_and", 1, 1.4142135f, 0, 4);
|
||||
assertNext(ts, reusableToken, "greetings_and_salutations", 1, 1.7320508f, 0, 4);
|
||||
assertNext(ts, reusableToken, "and_salutations", 1, 1.4142135f, 0, 4);
|
||||
assertNext(ts, reusableToken, "and_salutations_world", 1, 1.7320508f, 0, 10);
|
||||
assertNext(ts, reusableToken, "salutations_world", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, reusableToken, "hello_earth", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, reusableToken, "and_salutations_earth", 1, 1.7320508f, 0, 10);
|
||||
assertNext(ts, reusableToken, "salutations_earth", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, reusableToken, "hello_tellus", 1, 1.4142135f, 0, 10);
|
||||
assertNext(ts, reusableToken, "and_salutations_tellus", 1, 1.7320508f, 0, 10);
|
||||
assertNext(ts, reusableToken, "salutations_tellus", 1, 1.4142135f, 0, 10);
|
||||
|
||||
assertNull(ts.next());
|
||||
assertNull(ts.next(reusableToken));
|
||||
|
||||
System.currentTimeMillis();
|
||||
|
||||
|
@ -361,53 +355,53 @@ public class TestShingleMatrixFilter extends TestCase {
|
|||
|
||||
TokenStream ts = new ShingleMatrixFilter(matrix, 2, 4, '_', true, new ShingleMatrixFilter.SimpleThreeDimensionalTokenSettingsCodec());
|
||||
|
||||
// Token token = new Token();
|
||||
// while ((token = ts.next(token)) != null) {
|
||||
// System.out.println("assertNext(ts, \"" + token.termText() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
|
||||
// for (Token token = ts.next(new Token()); token != null; token = ts.next(token)) {
|
||||
// System.out.println("assertNext(ts, \"" + token.term() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
|
||||
// token.clear();
|
||||
// }
|
||||
|
||||
assertNext(ts, "no_surprise", 1, 1.4142135f, 0, 0);
|
||||
assertNext(ts, "no_surprise_to", 1, 1.7320508f, 0, 0);
|
||||
assertNext(ts, "no_surprise_to_see", 1, 2.0f, 0, 0);
|
||||
assertNext(ts, "surprise_to", 1, 1.4142135f, 0, 0);
|
||||
assertNext(ts, "surprise_to_see", 1, 1.7320508f, 0, 0);
|
||||
assertNext(ts, "surprise_to_see_england", 1, 2.0f, 0, 0);
|
||||
assertNext(ts, "to_see", 1, 1.4142135f, 0, 0);
|
||||
assertNext(ts, "to_see_england", 1, 1.7320508f, 0, 0);
|
||||
assertNext(ts, "to_see_england_manager", 1, 2.0f, 0, 0);
|
||||
assertNext(ts, "see_england", 1, 1.4142135f, 0, 0);
|
||||
assertNext(ts, "see_england_manager", 1, 1.7320508f, 0, 0);
|
||||
assertNext(ts, "see_england_manager_svennis", 1, 2.0f, 0, 0);
|
||||
assertNext(ts, "england_manager", 1, 1.4142135f, 0, 0);
|
||||
assertNext(ts, "england_manager_svennis", 1, 1.7320508f, 0, 0);
|
||||
assertNext(ts, "england_manager_svennis_in", 1, 2.0f, 0, 0);
|
||||
assertNext(ts, "manager_svennis", 1, 1.4142135f, 0, 0);
|
||||
assertNext(ts, "manager_svennis_in", 1, 1.7320508f, 0, 0);
|
||||
assertNext(ts, "manager_svennis_in_the", 1, 2.0f, 0, 0);
|
||||
assertNext(ts, "svennis_in", 1, 1.4142135f, 0, 0);
|
||||
assertNext(ts, "svennis_in_the", 1, 1.7320508f, 0, 0);
|
||||
assertNext(ts, "svennis_in_the_croud", 1, 2.0f, 0, 0);
|
||||
assertNext(ts, "in_the", 1, 1.4142135f, 0, 0);
|
||||
assertNext(ts, "in_the_croud", 1, 1.7320508f, 0, 0);
|
||||
assertNext(ts, "the_croud", 1, 1.4142135f, 0, 0);
|
||||
assertNext(ts, "see_england_manager_sven", 1, 2.0f, 0, 0);
|
||||
assertNext(ts, "england_manager_sven", 1, 1.7320508f, 0, 0);
|
||||
assertNext(ts, "england_manager_sven_göran", 1, 2.0f, 0, 0);
|
||||
assertNext(ts, "manager_sven", 1, 1.4142135f, 0, 0);
|
||||
assertNext(ts, "manager_sven_göran", 1, 1.7320508f, 0, 0);
|
||||
assertNext(ts, "manager_sven_göran_eriksson", 1, 2.0f, 0, 0);
|
||||
assertNext(ts, "sven_göran", 1, 1.4142135f, 0, 0);
|
||||
assertNext(ts, "sven_göran_eriksson", 1, 1.7320508f, 0, 0);
|
||||
assertNext(ts, "sven_göran_eriksson_in", 1, 2.0f, 0, 0);
|
||||
assertNext(ts, "göran_eriksson", 1, 1.4142135f, 0, 0);
|
||||
assertNext(ts, "göran_eriksson_in", 1, 1.7320508f, 0, 0);
|
||||
assertNext(ts, "göran_eriksson_in_the", 1, 2.0f, 0, 0);
|
||||
assertNext(ts, "eriksson_in", 1, 1.4142135f, 0, 0);
|
||||
assertNext(ts, "eriksson_in_the", 1, 1.7320508f, 0, 0);
|
||||
assertNext(ts, "eriksson_in_the_croud", 1, 2.0f, 0, 0);
|
||||
final Token reusableToken = new Token();
|
||||
assertNext(ts, reusableToken, "no_surprise", 1, 1.4142135f, 0, 0);
|
||||
assertNext(ts, reusableToken, "no_surprise_to", 1, 1.7320508f, 0, 0);
|
||||
assertNext(ts, reusableToken, "no_surprise_to_see", 1, 2.0f, 0, 0);
|
||||
assertNext(ts, reusableToken, "surprise_to", 1, 1.4142135f, 0, 0);
|
||||
assertNext(ts, reusableToken, "surprise_to_see", 1, 1.7320508f, 0, 0);
|
||||
assertNext(ts, reusableToken, "surprise_to_see_england", 1, 2.0f, 0, 0);
|
||||
assertNext(ts, reusableToken, "to_see", 1, 1.4142135f, 0, 0);
|
||||
assertNext(ts, reusableToken, "to_see_england", 1, 1.7320508f, 0, 0);
|
||||
assertNext(ts, reusableToken, "to_see_england_manager", 1, 2.0f, 0, 0);
|
||||
assertNext(ts, reusableToken, "see_england", 1, 1.4142135f, 0, 0);
|
||||
assertNext(ts, reusableToken, "see_england_manager", 1, 1.7320508f, 0, 0);
|
||||
assertNext(ts, reusableToken, "see_england_manager_svennis", 1, 2.0f, 0, 0);
|
||||
assertNext(ts, reusableToken, "england_manager", 1, 1.4142135f, 0, 0);
|
||||
assertNext(ts, reusableToken, "england_manager_svennis", 1, 1.7320508f, 0, 0);
|
||||
assertNext(ts, reusableToken, "england_manager_svennis_in", 1, 2.0f, 0, 0);
|
||||
assertNext(ts, reusableToken, "manager_svennis", 1, 1.4142135f, 0, 0);
|
||||
assertNext(ts, reusableToken, "manager_svennis_in", 1, 1.7320508f, 0, 0);
|
||||
assertNext(ts, reusableToken, "manager_svennis_in_the", 1, 2.0f, 0, 0);
|
||||
assertNext(ts, reusableToken, "svennis_in", 1, 1.4142135f, 0, 0);
|
||||
assertNext(ts, reusableToken, "svennis_in_the", 1, 1.7320508f, 0, 0);
|
||||
assertNext(ts, reusableToken, "svennis_in_the_croud", 1, 2.0f, 0, 0);
|
||||
assertNext(ts, reusableToken, "in_the", 1, 1.4142135f, 0, 0);
|
||||
assertNext(ts, reusableToken, "in_the_croud", 1, 1.7320508f, 0, 0);
|
||||
assertNext(ts, reusableToken, "the_croud", 1, 1.4142135f, 0, 0);
|
||||
assertNext(ts, reusableToken, "see_england_manager_sven", 1, 2.0f, 0, 0);
|
||||
assertNext(ts, reusableToken, "england_manager_sven", 1, 1.7320508f, 0, 0);
|
||||
assertNext(ts, reusableToken, "england_manager_sven_göran", 1, 2.0f, 0, 0);
|
||||
assertNext(ts, reusableToken, "manager_sven", 1, 1.4142135f, 0, 0);
|
||||
assertNext(ts, reusableToken, "manager_sven_göran", 1, 1.7320508f, 0, 0);
|
||||
assertNext(ts, reusableToken, "manager_sven_göran_eriksson", 1, 2.0f, 0, 0);
|
||||
assertNext(ts, reusableToken, "sven_göran", 1, 1.4142135f, 0, 0);
|
||||
assertNext(ts, reusableToken, "sven_göran_eriksson", 1, 1.7320508f, 0, 0);
|
||||
assertNext(ts, reusableToken, "sven_göran_eriksson_in", 1, 2.0f, 0, 0);
|
||||
assertNext(ts, reusableToken, "göran_eriksson", 1, 1.4142135f, 0, 0);
|
||||
assertNext(ts, reusableToken, "göran_eriksson_in", 1, 1.7320508f, 0, 0);
|
||||
assertNext(ts, reusableToken, "göran_eriksson_in_the", 1, 2.0f, 0, 0);
|
||||
assertNext(ts, reusableToken, "eriksson_in", 1, 1.4142135f, 0, 0);
|
||||
assertNext(ts, reusableToken, "eriksson_in_the", 1, 1.7320508f, 0, 0);
|
||||
assertNext(ts, reusableToken, "eriksson_in_the_croud", 1, 2.0f, 0, 0);
|
||||
|
||||
assertNull(ts.next());
|
||||
assertNull(ts.next(reusableToken));
|
||||
|
||||
}
|
||||
|
||||
|
@ -417,11 +411,9 @@ public class TestShingleMatrixFilter extends TestCase {
|
|||
|
||||
|
||||
private Token tokenFactory(String text, int posIncr, int startOffset, int endOffset) {
|
||||
Token token = new Token();
|
||||
token.setTermText(text);
|
||||
Token token = new Token(startOffset, endOffset);
|
||||
token.setTermBuffer(text);
|
||||
token.setPositionIncrement(posIncr);
|
||||
token.setStartOffset(startOffset);
|
||||
token.setEndOffset(endOffset);
|
||||
return token;
|
||||
}
|
||||
|
||||
|
@ -435,61 +427,64 @@ public class TestShingleMatrixFilter extends TestCase {
|
|||
}
|
||||
|
||||
private Token tokenFactory(String text, int posIncr, float weight, int startOffset, int endOffset) {
|
||||
Token token = new Token();
|
||||
token.setTermText(text);
|
||||
Token token = new Token(startOffset, endOffset);
|
||||
token.setTermBuffer(text);
|
||||
token.setPositionIncrement(posIncr);
|
||||
ShingleMatrixFilter.defaultSettingsCodec.setWeight(token, weight);
|
||||
token.setStartOffset(startOffset);
|
||||
token.setEndOffset(endOffset);
|
||||
return token;
|
||||
}
|
||||
|
||||
private Token tokenFactory(String text, int posIncr, float weight, int startOffset, int endOffset, ShingleMatrixFilter.TokenPositioner positioner) {
|
||||
Token token = new Token();
|
||||
token.setTermText(text);
|
||||
Token token = new Token(startOffset, endOffset);
|
||||
token.setTermBuffer(text);
|
||||
token.setPositionIncrement(posIncr);
|
||||
ShingleMatrixFilter.defaultSettingsCodec.setWeight(token, weight);
|
||||
token.setStartOffset(startOffset);
|
||||
token.setEndOffset(endOffset);
|
||||
ShingleMatrixFilter.defaultSettingsCodec.setTokenPositioner(token, positioner);
|
||||
return token;
|
||||
}
|
||||
|
||||
// assert-methods start here
|
||||
|
||||
private Token assertNext(TokenStream ts, String text) throws IOException {
|
||||
Token token = ts.next(new Token());
|
||||
assertNotNull(token);
|
||||
assertEquals(text, new String(token.termBuffer(), 0, token.termLength()));
|
||||
return token;
|
||||
private Token assertNext(TokenStream ts, final Token reusableToken, String text) throws IOException {
|
||||
Token nextToken = ts.next(reusableToken);
|
||||
assertNotNull(nextToken);
|
||||
assertEquals(text, nextToken.term());
|
||||
return nextToken;
|
||||
}
|
||||
|
||||
private Token assertNext(TokenStream ts, String text, int positionIncrement, float boost) throws IOException {
|
||||
Token token = ts.next(new Token());
|
||||
assertNotNull(token);
|
||||
assertEquals(text, new String(token.termBuffer(), 0, token.termLength()));
|
||||
assertEquals(positionIncrement, token.getPositionIncrement());
|
||||
assertEquals(boost, token.getPayload() == null ? 1f : PayloadHelper.decodeFloat(token.getPayload().getData()));
|
||||
return token;
|
||||
private Token assertNext(TokenStream ts, final Token reusableToken, String text, int positionIncrement, float boost) throws IOException {
|
||||
Token nextToken = ts.next(reusableToken);
|
||||
assertNotNull(nextToken);
|
||||
assertEquals(text, nextToken.term());
|
||||
assertEquals(positionIncrement, nextToken.getPositionIncrement());
|
||||
assertEquals(boost, nextToken.getPayload() == null ? 1f : PayloadHelper.decodeFloat(nextToken.getPayload().getData()));
|
||||
return nextToken;
|
||||
}
|
||||
|
||||
private Token assertNext(TokenStream ts, String text, int positionIncrement, float boost, int startOffset, int endOffset) throws IOException {
|
||||
Token token = ts.next(new Token());
|
||||
assertNotNull(token);
|
||||
assertEquals(text, new String(token.termBuffer(), 0, token.termLength()));
|
||||
assertEquals(positionIncrement, token.getPositionIncrement());
|
||||
assertEquals(boost, token.getPayload() == null ? 1f : PayloadHelper.decodeFloat(token.getPayload().getData()));
|
||||
assertEquals(startOffset, token.startOffset());
|
||||
assertEquals(endOffset, token.endOffset());
|
||||
return token;
|
||||
private Token assertNext(TokenStream ts, final Token reusableToken, String text, int positionIncrement, float boost, int startOffset, int endOffset) throws IOException {
|
||||
Token nextToken = ts.next(reusableToken);
|
||||
assertNotNull(nextToken);
|
||||
assertEquals(text, nextToken.term());
|
||||
assertEquals(positionIncrement, nextToken.getPositionIncrement());
|
||||
assertEquals(boost, nextToken.getPayload() == null ? 1f : PayloadHelper.decodeFloat(nextToken.getPayload().getData()));
|
||||
assertEquals(startOffset, nextToken.startOffset());
|
||||
assertEquals(endOffset, nextToken.endOffset());
|
||||
return nextToken;
|
||||
}
|
||||
|
||||
private Token assertNext(TokenStream ts, String text, int startOffset, int endOffset) throws IOException {
|
||||
Token token = ts.next(new Token());
|
||||
assertNotNull(token);
|
||||
assertEquals(text, new String(token.termBuffer(), 0, token.termLength()));
|
||||
assertEquals(startOffset, token.startOffset());
|
||||
assertEquals(endOffset, token.endOffset());
|
||||
private Token assertNext(TokenStream ts, final Token reusableToken, String text, int startOffset, int endOffset) throws IOException {
|
||||
Token nextToken = ts.next(reusableToken);
|
||||
assertNotNull(nextToken);
|
||||
assertEquals(text, nextToken.term());
|
||||
assertEquals(startOffset, nextToken.startOffset());
|
||||
assertEquals(endOffset, nextToken.endOffset());
|
||||
return nextToken;
|
||||
}
|
||||
|
||||
private static Token createToken(String term, int start, int offset)
|
||||
{
|
||||
Token token = new Token(start, offset);
|
||||
token.setTermBuffer(term);
|
||||
return token;
|
||||
}
|
||||
|
||||
|
@ -500,9 +495,9 @@ public class TestShingleMatrixFilter extends TestCase {
|
|||
|
||||
public TokenListStream(TokenStream ts) throws IOException {
|
||||
tokens = new ArrayList<Token>();
|
||||
Token token;
|
||||
while ((token = ts.next(new Token())) != null) {
|
||||
tokens.add(token);
|
||||
final Token reusableToken = new Token();
|
||||
for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) {
|
||||
tokens.add((Token) nextToken.clone());
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -512,14 +507,16 @@ public class TestShingleMatrixFilter extends TestCase {
|
|||
|
||||
private Iterator<Token> iterator;
|
||||
|
||||
public Token next() throws IOException {
|
||||
public Token next(final Token reusableToken) throws IOException {
|
||||
assert reusableToken != null;
|
||||
if (iterator == null) {
|
||||
iterator = tokens.iterator();
|
||||
}
|
||||
if (!iterator.hasNext()) {
|
||||
return null;
|
||||
}
|
||||
return iterator.next();
|
||||
Token nextToken = (Token) iterator.next();
|
||||
return (Token) nextToken.clone();
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -43,13 +43,13 @@ public class DateRecognizerSinkTokenizerTest extends TestCase {
|
|||
DateRecognizerSinkTokenizer sink = new DateRecognizerSinkTokenizer(new SimpleDateFormat("MM/dd/yyyy"));
|
||||
String test = "The quick red fox jumped over the lazy brown dogs on 7/11/2006 The dogs finally reacted on 7/12/2006";
|
||||
TeeTokenFilter tee = new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(test)), sink);
|
||||
Token tok = null;
|
||||
int count = 0;
|
||||
while ((tok = tee.next()) != null){
|
||||
assertTrue("tok is null and it shouldn't be", tok != null);
|
||||
if (tok.termBuffer()[0] == '7'){
|
||||
assertTrue(tok.type() + " is not equal to " + DateRecognizerSinkTokenizer.DATE_TYPE,
|
||||
tok.type().equals(DateRecognizerSinkTokenizer.DATE_TYPE) == true);
|
||||
final Token reusableToken = new Token();
|
||||
for (Token nextToken = tee.next(reusableToken); nextToken != null; nextToken = tee.next(reusableToken)) {
|
||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
||||
if (nextToken.termBuffer()[0] == '7'){
|
||||
assertTrue(nextToken.type() + " is not equal to " + DateRecognizerSinkTokenizer.DATE_TYPE,
|
||||
nextToken.type().equals(DateRecognizerSinkTokenizer.DATE_TYPE) == true);
|
||||
}
|
||||
count++;
|
||||
}
|
||||
|
|
|
@ -42,10 +42,10 @@ public class TokenRangeSinkTokenizerTest extends TestCase {
|
|||
TokenRangeSinkTokenizer rangeToks = new TokenRangeSinkTokenizer(2, 4);
|
||||
String test = "The quick red fox jumped over the lazy brown dogs";
|
||||
TeeTokenFilter tee = new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(test)), rangeToks);
|
||||
Token tok = null;
|
||||
int count = 0;
|
||||
while ((tok = tee.next()) != null){
|
||||
assertTrue("tok is null and it shouldn't be", tok != null);
|
||||
final Token reusableToken = new Token();
|
||||
for (Token nextToken = tee.next(reusableToken); nextToken != null; nextToken = tee.next(reusableToken)) {
|
||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
||||
count++;
|
||||
}
|
||||
assertTrue(count + " does not equal: " + 10, count == 10);
|
||||
|
|
|
@ -16,13 +16,17 @@ package org.apache.lucene.analysis.sinks;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import junit.framework.TestCase;
|
||||
import org.apache.lucene.analysis.*;
|
||||
import org.apache.lucene.analysis.payloads.NumericPayloadTokenFilter;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
import org.apache.lucene.analysis.TeeTokenFilter;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
|
||||
public class TokenTypeSinkTokenizerTest extends TestCase {
|
||||
|
||||
|
||||
|
@ -42,14 +46,14 @@ public class TokenTypeSinkTokenizerTest extends TestCase {
|
|||
String test = "The quick red fox jumped over the lazy brown dogs";
|
||||
|
||||
TeeTokenFilter ttf = new TeeTokenFilter(new WordTokenFilter(new WhitespaceTokenizer(new StringReader(test))), sink);
|
||||
Token tok = new Token();
|
||||
boolean seenDogs = false;
|
||||
while ((tok = ttf.next(tok)) != null) {
|
||||
if (tok.termText().equals("dogs")) {
|
||||
final Token reusableToken = new Token();
|
||||
for (Token nextToken = ttf.next(reusableToken); nextToken != null; nextToken = ttf.next(reusableToken)) {
|
||||
if (nextToken.term().equals("dogs")) {
|
||||
seenDogs = true;
|
||||
assertTrue(tok.type() + " is not equal to " + "D", tok.type().equals("D") == true);
|
||||
assertTrue(nextToken.type() + " is not equal to " + "D", nextToken.type().equals("D") == true);
|
||||
} else {
|
||||
assertTrue(tok.type() + " is not null and it should be", tok.type().equals("word"));
|
||||
assertTrue(nextToken.type() + " is not null and it should be", nextToken.type().equals("word"));
|
||||
}
|
||||
}
|
||||
assertTrue(seenDogs + " does not equal: " + true, seenDogs == true);
|
||||
|
@ -61,12 +65,13 @@ public class TokenTypeSinkTokenizerTest extends TestCase {
|
|||
super(input);
|
||||
}
|
||||
|
||||
public Token next(Token result) throws IOException {
|
||||
result = input.next(result);
|
||||
if (result != null && result.termText().equals("dogs")) {
|
||||
result.setType("D");
|
||||
public Token next(final Token reusableToken) throws IOException {
|
||||
assert reusableToken != null;
|
||||
Token nextToken = input.next(reusableToken);
|
||||
if (nextToken != null && nextToken.term().equals("dogs")) {
|
||||
nextToken.setType("D");
|
||||
}
|
||||
return result;
|
||||
return nextToken;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -36,13 +36,13 @@ public class TestThaiAnalyzer extends TestCase {
|
|||
throws Exception {
|
||||
|
||||
TokenStream ts = a.tokenStream("dummy", new StringReader(input));
|
||||
|
||||
final Token reusableToken = new Token();
|
||||
for (int i = 0; i < output.length; i++) {
|
||||
Token t = ts.next();
|
||||
assertNotNull(t);
|
||||
assertEquals(t.termText(), output[i]);
|
||||
Token nextToken = ts.next(reusableToken);
|
||||
assertNotNull(nextToken);
|
||||
assertEquals(nextToken.term(), output[i]);
|
||||
}
|
||||
assertNull(ts.next());
|
||||
assertNull(ts.next(reusableToken));
|
||||
ts.close();
|
||||
}
|
||||
|
||||
|
|
|
@ -22,6 +22,7 @@ import java.util.ArrayList;
|
|||
import java.util.Iterator;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.util.PriorityQueue;
|
||||
|
||||
|
@ -217,7 +218,7 @@ public class Highlighter
|
|||
|
||||
try
|
||||
{
|
||||
org.apache.lucene.analysis.Token token;
|
||||
final Token reusableToken = new Token();
|
||||
String tokenText;
|
||||
int startOffset;
|
||||
int endOffset;
|
||||
|
@ -225,10 +226,12 @@ public class Highlighter
|
|||
textFragmenter.start(text);
|
||||
|
||||
TokenGroup tokenGroup=new TokenGroup();
|
||||
token = tokenStream.next();
|
||||
while ((token!= null)&&(token.startOffset()< maxDocCharsToAnalyze))
|
||||
|
||||
for (Token nextToken = tokenStream.next(reusableToken);
|
||||
(nextToken!= null)&&(nextToken.startOffset()< maxDocCharsToAnalyze);
|
||||
nextToken = tokenStream.next(reusableToken))
|
||||
{
|
||||
if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct(token)))
|
||||
if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct(nextToken)))
|
||||
{
|
||||
//the current token is distinct from previous tokens -
|
||||
// markup the cached token group info
|
||||
|
@ -244,7 +247,7 @@ public class Highlighter
|
|||
tokenGroup.clear();
|
||||
|
||||
//check if current token marks the start of a new fragment
|
||||
if(textFragmenter.isNewFragment(token))
|
||||
if(textFragmenter.isNewFragment(nextToken))
|
||||
{
|
||||
currentFrag.setScore(fragmentScorer.getFragmentScore());
|
||||
//record stats for a new fragment
|
||||
|
@ -255,13 +258,12 @@ public class Highlighter
|
|||
}
|
||||
}
|
||||
|
||||
tokenGroup.addToken(token,fragmentScorer.getTokenScore(token));
|
||||
tokenGroup.addToken(nextToken,fragmentScorer.getTokenScore(nextToken));
|
||||
|
||||
// if(lastEndOffset>maxDocBytesToAnalyze)
|
||||
// {
|
||||
// break;
|
||||
// }
|
||||
token = tokenStream.next();
|
||||
}
|
||||
currentFrag.setScore(fragmentScorer.getFragmentScore());
|
||||
|
||||
|
|
|
@ -106,7 +106,7 @@ public class QueryScorer implements Scorer
|
|||
*/
|
||||
public float getTokenScore(Token token)
|
||||
{
|
||||
String termText=token.termText();
|
||||
String termText=token.term();
|
||||
|
||||
WeightedTerm queryTerm=(WeightedTerm) termsToFind.get(termText);
|
||||
if(queryTerm==null)
|
||||
|
|
|
@ -62,7 +62,7 @@ public class SimpleSpanFragmenter implements Fragmenter {
|
|||
return false;
|
||||
}
|
||||
|
||||
WeightedSpanTerm wSpanTerm = spanScorer.getWeightedSpanTerm(new String(token.termBuffer(), 0, token.termLength()));
|
||||
WeightedSpanTerm wSpanTerm = spanScorer.getWeightedSpanTerm(token.term());
|
||||
|
||||
if (wSpanTerm != null) {
|
||||
List positionSpans = wSpanTerm.getPositionSpans();
|
||||
|
|
|
@ -121,7 +121,7 @@ public class SpanScorer implements Scorer {
|
|||
*/
|
||||
public float getTokenScore(Token token) {
|
||||
position += token.getPositionIncrement();
|
||||
String termText = new String(token.termBuffer(), 0, token.termLength());
|
||||
String termText = token.term();
|
||||
|
||||
WeightedSpanTerm weightedSpanTerm;
|
||||
|
||||
|
|
|
@ -61,7 +61,7 @@ public class TokenGroup
|
|||
tot+=score;
|
||||
}
|
||||
}
|
||||
tokens[numTokens]=token;
|
||||
tokens[numTokens]= (Token) token.clone();
|
||||
scores[numTokens]=score;
|
||||
numTokens++;
|
||||
}
|
||||
|
|
|
@ -147,8 +147,9 @@ public class TokenSources
|
|||
{
|
||||
this.tokens=tokens;
|
||||
}
|
||||
public Token next()
|
||||
public Token next(final Token reusableToken)
|
||||
{
|
||||
assert reusableToken != null;
|
||||
if(currentToken>=tokens.length)
|
||||
{
|
||||
return null;
|
||||
|
@ -160,6 +161,7 @@ public class TokenSources
|
|||
String[] terms=tpv.getTerms();
|
||||
int[] freq=tpv.getTermFrequencies();
|
||||
int totalTokens=0;
|
||||
Token newToken = new Token();
|
||||
for (int t = 0; t < freq.length; t++)
|
||||
{
|
||||
totalTokens+=freq[t];
|
||||
|
@ -189,9 +191,8 @@ public class TokenSources
|
|||
}
|
||||
for (int tp = 0; tp < offsets.length; tp++)
|
||||
{
|
||||
unsortedTokens.add(new Token(terms[t],
|
||||
offsets[tp].getStartOffset(),
|
||||
offsets[tp].getEndOffset()));
|
||||
newToken.reinit(terms[t], offsets[tp].getStartOffset(), offsets[tp].getEndOffset());
|
||||
unsortedTokens.add(newToken.clone());
|
||||
}
|
||||
}
|
||||
else
|
||||
|
@ -204,9 +205,8 @@ public class TokenSources
|
|||
//tokens stored with positions - can use this to index straight into sorted array
|
||||
for (int tp = 0; tp < pos.length; tp++)
|
||||
{
|
||||
tokensInOriginalOrder[pos[tp]]=new Token(terms[t],
|
||||
offsets[tp].getStartOffset(),
|
||||
offsets[tp].getEndOffset());
|
||||
newToken.reinit(terms[t], offsets[tp].getStartOffset(), offsets[tp].getEndOffset());
|
||||
tokensInOriginalOrder[pos[tp]] = (Token) newToken.clone();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -261,7 +261,7 @@ public class TokenSources
|
|||
}
|
||||
return getTokenStream(field, contents, analyzer);
|
||||
}
|
||||
//conevenience method
|
||||
//convenience method
|
||||
public static TokenStream getTokenStream(String field, String contents, Analyzer analyzer){
|
||||
return analyzer.tokenStream(field,new StringReader(contents));
|
||||
}
|
||||
|
|
|
@ -1127,21 +1127,22 @@ public class HighlighterTest extends TestCase implements Formatter {
|
|||
{
|
||||
lst = new ArrayList();
|
||||
Token t;
|
||||
t = new Token("hi", 0, 2);
|
||||
t = createToken("hi", 0, 2);
|
||||
lst.add(t);
|
||||
t = new Token("hispeed", 0, 8);
|
||||
t = createToken("hispeed", 0, 8);
|
||||
lst.add(t);
|
||||
t = new Token("speed", 3, 8);
|
||||
t = createToken("speed", 3, 8);
|
||||
t.setPositionIncrement(0);
|
||||
lst.add(t);
|
||||
t = new Token("10", 8, 10);
|
||||
t = createToken("10", 8, 10);
|
||||
lst.add(t);
|
||||
t = new Token("foo", 11, 14);
|
||||
t = createToken("foo", 11, 14);
|
||||
lst.add(t);
|
||||
iter = lst.iterator();
|
||||
}
|
||||
|
||||
public Token next() throws IOException {
|
||||
public Token next(final Token reusableToken) throws IOException {
|
||||
assert reusableToken != null;
|
||||
return iter.hasNext() ? (Token) iter.next() : null;
|
||||
}
|
||||
};
|
||||
|
@ -1156,21 +1157,22 @@ public class HighlighterTest extends TestCase implements Formatter {
|
|||
{
|
||||
lst = new ArrayList();
|
||||
Token t;
|
||||
t = new Token("hispeed", 0, 8);
|
||||
t = createToken("hispeed", 0, 8);
|
||||
lst.add(t);
|
||||
t = new Token("hi", 0, 2);
|
||||
t = createToken("hi", 0, 2);
|
||||
t.setPositionIncrement(0);
|
||||
lst.add(t);
|
||||
t = new Token("speed", 3, 8);
|
||||
t = createToken("speed", 3, 8);
|
||||
lst.add(t);
|
||||
t = new Token("10", 8, 10);
|
||||
t = createToken("10", 8, 10);
|
||||
lst.add(t);
|
||||
t = new Token("foo", 11, 14);
|
||||
t = createToken("foo", 11, 14);
|
||||
lst.add(t);
|
||||
iter = lst.iterator();
|
||||
}
|
||||
|
||||
public Token next() throws IOException {
|
||||
public Token next(final Token reusableToken) throws IOException {
|
||||
assert reusableToken != null;
|
||||
return iter.hasNext() ? (Token) iter.next() : null;
|
||||
}
|
||||
};
|
||||
|
@ -1407,6 +1409,13 @@ public class HighlighterTest extends TestCase implements Formatter {
|
|||
super.tearDown();
|
||||
}
|
||||
|
||||
private static Token createToken(String term, int start, int offset)
|
||||
{
|
||||
Token token = new Token(start, offset);
|
||||
token.setTermBuffer(term);
|
||||
return token;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// ===================================================================
|
||||
|
@ -1453,31 +1462,32 @@ class SynonymTokenizer extends TokenStream {
|
|||
this.synonyms = synonyms;
|
||||
}
|
||||
|
||||
public Token next() throws IOException {
|
||||
public Token next(final Token reusableToken) throws IOException {
|
||||
assert reusableToken != null;
|
||||
if (currentRealToken == null) {
|
||||
Token nextRealToken = realStream.next();
|
||||
Token nextRealToken = realStream.next(reusableToken);
|
||||
if (nextRealToken == null) {
|
||||
return null;
|
||||
}
|
||||
String expansions = (String) synonyms.get(nextRealToken.termText());
|
||||
String expansions = (String) synonyms.get(nextRealToken.term());
|
||||
if (expansions == null) {
|
||||
return nextRealToken;
|
||||
}
|
||||
st = new StringTokenizer(expansions, ",");
|
||||
if (st.hasMoreTokens()) {
|
||||
currentRealToken = nextRealToken;
|
||||
currentRealToken = (Token) nextRealToken.clone();
|
||||
}
|
||||
return currentRealToken;
|
||||
} else {
|
||||
String nextExpandedValue = st.nextToken();
|
||||
Token expandedToken = new Token(nextExpandedValue, currentRealToken.startOffset(),
|
||||
currentRealToken.endOffset());
|
||||
expandedToken.setPositionIncrement(0);
|
||||
reusableToken.reinit(st.nextToken(),
|
||||
currentRealToken.startOffset(),
|
||||
currentRealToken.endOffset());
|
||||
reusableToken.setPositionIncrement(0);
|
||||
if (!st.hasMoreTokens()) {
|
||||
currentRealToken = null;
|
||||
st = null;
|
||||
}
|
||||
return expandedToken;
|
||||
return reusableToken;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -520,12 +520,10 @@ public class InstantiatedIndexWriter {
|
|||
} else {
|
||||
tokenStream = analyzer.tokenStream(field.name(), new StringReader(field.stringValue()));
|
||||
}
|
||||
Token next = tokenStream.next();
|
||||
|
||||
while (next != null) {
|
||||
next.setTermText(next.termText().intern()); // todo: not sure this needs to be interned?
|
||||
tokens.add(next); // the vector will be built on commit.
|
||||
next = tokenStream.next();
|
||||
final Token reusableToken = new Token();
|
||||
for (Token nextToken = tokenStream.next(reusableToken); nextToken != null; nextToken = tokenStream.next(reusableToken)) {
|
||||
tokens.add((Token) nextToken.clone()); // the vector will be built on commit.
|
||||
fieldSetting.fieldLength++;
|
||||
if (fieldSetting.fieldLength > maxFieldLength) {
|
||||
break;
|
||||
|
@ -533,7 +531,10 @@ public class InstantiatedIndexWriter {
|
|||
}
|
||||
} else {
|
||||
// untokenized
|
||||
tokens.add(new Token(field.stringValue().intern(), 0, field.stringValue().length(), "untokenized"));
|
||||
String fieldVal = field.stringValue();
|
||||
Token token = new Token(0, fieldVal.length(), "untokenized");
|
||||
token.setTermBuffer(fieldVal);
|
||||
tokens.add(token);
|
||||
fieldSetting.fieldLength++;
|
||||
}
|
||||
}
|
||||
|
@ -567,10 +568,10 @@ public class InstantiatedIndexWriter {
|
|||
|
||||
for (Token token : eField_Tokens.getValue()) {
|
||||
|
||||
TermDocumentInformationFactory termDocumentInformationFactory = termDocumentInformationFactoryByTermText.get(token.termText());
|
||||
TermDocumentInformationFactory termDocumentInformationFactory = termDocumentInformationFactoryByTermText.get(token.term());
|
||||
if (termDocumentInformationFactory == null) {
|
||||
termDocumentInformationFactory = new TermDocumentInformationFactory();
|
||||
termDocumentInformationFactoryByTermText.put(token.termText(), termDocumentInformationFactory);
|
||||
termDocumentInformationFactoryByTermText.put(token.term(), termDocumentInformationFactory);
|
||||
}
|
||||
//termDocumentInformationFactory.termFrequency++;
|
||||
|
||||
|
|
|
@ -15,19 +15,32 @@ package org.apache.lucene.store.instantiated;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Comparator;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.document.Field;
|
||||
import org.apache.lucene.index.*;
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.IndexWriter;
|
||||
import org.apache.lucene.index.Payload;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.index.TermDocs;
|
||||
import org.apache.lucene.index.TermEnum;
|
||||
import org.apache.lucene.index.TermFreqVector;
|
||||
import org.apache.lucene.index.TermPositionVector;
|
||||
import org.apache.lucene.index.TermPositions;
|
||||
import org.apache.lucene.store.Directory;
|
||||
import org.apache.lucene.store.RAMDirectory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.*;
|
||||
|
||||
/**
|
||||
* Asserts equality of content and behaviour of two index readers.
|
||||
*/
|
||||
|
@ -151,21 +164,24 @@ public class TestIndicesEquals extends TestCase {
|
|||
document.add(f);
|
||||
if (i > 4) {
|
||||
final List<Token> tokens = new ArrayList<Token>(2);
|
||||
Token t = new Token("the", 0, 2, "text");
|
||||
Token t = createToken("the", 0, 2, "text");
|
||||
t.setPayload(new Payload(new byte[]{1, 2, 3}));
|
||||
tokens.add(t);
|
||||
t = new Token("end", 3, 5, "text");
|
||||
t = createToken("end", 3, 5, "text");
|
||||
t.setPayload(new Payload(new byte[]{2}));
|
||||
tokens.add(t);
|
||||
tokens.add(new Token("fin", 7, 9));
|
||||
tokens.add(createToken("fin", 7, 9));
|
||||
document.add(new Field("f", new TokenStream() {
|
||||
Iterator<Token> it = tokens.iterator();
|
||||
|
||||
public Token next() throws IOException {
|
||||
public Token next(final Token reusableToken) throws IOException {
|
||||
assert reusableToken != null;
|
||||
if (!it.hasNext()) {
|
||||
return null;
|
||||
}
|
||||
return it.next();
|
||||
// Resettable token streams need to return clones.
|
||||
Token nextToken = (Token) it.next();
|
||||
return (Token) nextToken.clone();
|
||||
}
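The comment above states the key contract behind this change: a stream that replays cached tokens must hand out clones, because callers are free to mutate whatever next(...) returns. As a hedged illustration of the same contract from the caller's side (a hypothetical helper, not part of this patch), a consumer that keeps tokens beyond the current iteration clones them before the next call:

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;

// Hypothetical sketch of the caller side of the reuse API: one Token is
// allocated up front, passed to every next(...) call, and cloned whenever
// the caller wants to keep a token past the following call.
class TokenCollector {
  static List<Token> collectTokens(TokenStream stream) throws IOException {
    final Token reusableToken = new Token();
    List<Token> kept = new ArrayList<Token>();
    for (Token nextToken = stream.next(reusableToken);
         nextToken != null;
         nextToken = stream.next(reusableToken)) {
      kept.add((Token) nextToken.clone()); // clone before the stream reuses the instance
    }
    return kept;
  }
}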
|
||||
|
||||
public void reset() throws IOException {
|
||||
|
@ -466,4 +482,19 @@ public class TestIndicesEquals extends TestCase {
|
|||
testReader.close();
|
||||
}
|
||||
|
||||
private static Token createToken(String term, int start, int offset)
|
||||
{
|
||||
Token token = new Token(start, offset);
|
||||
token.setTermBuffer(term);
|
||||
return token;
|
||||
}
|
||||
|
||||
private static Token createToken(String term, int start, int offset, String type)
|
||||
{
|
||||
Token token = new Token(start, offset, type);
|
||||
token.setTermBuffer(term);
|
||||
return token;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -279,6 +279,7 @@ class LuceneMethods {
|
|||
|
||||
Analyzer analyzer = new StandardAnalyzer();
|
||||
Enumeration fields = doc.fields();
|
||||
final Token reusableToken = new Token();
|
||||
while (fields.hasMoreElements()) {
|
||||
Field field = (Field) fields.nextElement();
|
||||
String fieldName = field.name();
|
||||
|
@ -299,10 +300,10 @@ class LuceneMethods {
|
|||
// Tokenize field and add to postingTable
|
||||
TokenStream stream = analyzer.tokenStream(fieldName, reader);
|
||||
try {
|
||||
for (Token t = stream.next(); t != null; t = stream.next()) {
|
||||
position += (t.getPositionIncrement() - 1);
|
||||
for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
|
||||
position += (nextToken.getPositionIncrement() - 1);
|
||||
position++;
|
||||
String name = t.termText();
|
||||
String name = nextToken.term();
|
||||
Integer Count = (Integer) tokenHash.get(name);
|
||||
if (Count == null) { // not in there yet
|
||||
tokenHash.put(name, new Integer(1)); //first one
|
||||
|
|
|
@ -73,10 +73,11 @@ public class AnalyzerUtil {
|
|||
return new TokenFilter(child.tokenStream(fieldName, reader)) {
|
||||
private int position = -1;
|
||||
|
||||
public Token next() throws IOException {
|
||||
Token token = input.next(); // from filter super class
|
||||
log.println(toString(token));
|
||||
return token;
|
||||
public Token next(final Token reusableToken) throws IOException {
|
||||
assert reusableToken != null;
|
||||
Token nextToken = input.next(reusableToken); // from filter super class
|
||||
log.println(toString(nextToken));
|
||||
return nextToken;
|
||||
}
|
||||
|
||||
private String toString(Token token) {
|
||||
|
@ -84,7 +85,7 @@ public class AnalyzerUtil {
|
|||
|
||||
position += token.getPositionIncrement();
|
||||
return "[" + logName + ":" + position + ":" + fieldName + ":"
|
||||
+ token.termText() + ":" + token.startOffset()
|
||||
+ token.term() + ":" + token.startOffset()
|
||||
+ "-" + token.endOffset() + ":" + token.type()
|
||||
+ "]";
|
||||
}
|
||||
|
@ -121,8 +122,9 @@ public class AnalyzerUtil {
|
|||
return new TokenFilter(child.tokenStream(fieldName, reader)) {
|
||||
private int todo = maxTokens;
|
||||
|
||||
public Token next() throws IOException {
|
||||
return --todo >= 0 ? input.next() : null;
|
||||
public Token next(final Token reusableToken) throws IOException {
|
||||
assert reusableToken != null;
|
||||
return --todo >= 0 ? input.next(reusableToken) : null;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
@ -239,10 +241,11 @@ public class AnalyzerUtil {
|
|||
final ArrayList tokens2 = new ArrayList();
|
||||
TokenStream tokenStream = new TokenFilter(child.tokenStream(fieldName, reader)) {
|
||||
|
||||
public Token next() throws IOException {
|
||||
Token token = input.next(); // from filter super class
|
||||
if (token != null) tokens2.add(token);
|
||||
return token;
|
||||
public Token next(final Token reusableToken) throws IOException {
|
||||
assert reusableToken != null;
|
||||
Token nextToken = input.next(reusableToken); // from filter super class
|
||||
if (nextToken != null) tokens2.add(nextToken.clone());
|
||||
return nextToken;
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -253,7 +256,8 @@ public class AnalyzerUtil {
|
|||
|
||||
private Iterator iter = tokens.iterator();
|
||||
|
||||
public Token next() {
|
||||
public Token next(Token token) {
|
||||
assert token != null;
|
||||
if (!iter.hasNext()) return null;
|
||||
return (Token) iter.next();
|
||||
}
|
||||
|
@ -300,12 +304,12 @@ public class AnalyzerUtil {
|
|||
HashMap map = new HashMap();
|
||||
TokenStream stream = analyzer.tokenStream("", new StringReader(text));
|
||||
try {
|
||||
Token token;
|
||||
while ((token = stream.next()) != null) {
|
||||
MutableInteger freq = (MutableInteger) map.get(token.termText());
|
||||
final Token reusableToken = new Token();
|
||||
for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
|
||||
MutableInteger freq = (MutableInteger) map.get(nextToken.term());
|
||||
if (freq == null) {
|
||||
freq = new MutableInteger(1);
|
||||
map.put(token.termText(), freq);
|
||||
map.put(nextToken.term(), freq);
|
||||
} else {
|
||||
freq.setValue(freq.intValue() + 1);
|
||||
}
|
||||
|
|
|
@ -275,7 +275,8 @@ public class MemoryIndex implements Serializable {
|
|||
return new TokenStream() {
|
||||
private Iterator iter = keywords.iterator();
|
||||
private int start = 0;
|
||||
public Token next() {
|
||||
public Token next(final Token reusableToken) {
|
||||
assert reusableToken != null;
|
||||
if (!iter.hasNext()) return null;
|
||||
|
||||
Object obj = iter.next();
|
||||
|
@ -283,9 +284,9 @@ public class MemoryIndex implements Serializable {
|
|||
throw new IllegalArgumentException("keyword must not be null");
|
||||
|
||||
String term = obj.toString();
|
||||
Token token = new Token(term, start, start + term.length());
|
||||
reusableToken.reinit(term, start, start + term.length());
|
||||
start += term.length() + 1; // separate words by 1 (blank) character
|
||||
return token;
|
||||
return reusableToken;
|
||||
}
|
||||
};
|
||||
}
|
||||
|
@ -349,14 +350,13 @@ public class MemoryIndex implements Serializable {
|
|||
HashMap terms = new HashMap();
|
||||
int numTokens = 0;
|
||||
int pos = -1;
|
||||
Token token;
|
||||
|
||||
while ((token = stream.next()) != null) {
|
||||
String term = token.termText();
|
||||
final Token reusableToken = new Token();
|
||||
for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
|
||||
String term = nextToken.term();
|
||||
if (term.length() == 0) continue; // nothing to do
|
||||
// if (DEBUG) System.err.println("token='" + term + "'");
|
||||
numTokens++;
|
||||
pos += token.getPositionIncrement();
|
||||
pos += nextToken.getPositionIncrement();
|
||||
|
||||
ArrayIntList positions = (ArrayIntList) terms.get(term);
|
||||
if (positions == null) { // term not seen before
|
||||
|
@ -366,7 +366,7 @@ public class MemoryIndex implements Serializable {
|
|||
if (stride == 1) {
|
||||
positions.add(pos);
|
||||
} else {
|
||||
positions.add(pos, token.startOffset(), token.endOffset());
|
||||
positions.add(pos, nextToken.startOffset(), nextToken.endOffset());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -334,7 +334,8 @@ public class PatternAnalyzer extends Analyzer {
|
|||
this.toLowerCase = toLowerCase;
|
||||
}
|
||||
|
||||
public Token next() {
|
||||
public Token next(final Token reusableToken) {
|
||||
assert reusableToken != null;
|
||||
if (matcher == null) return null;
|
||||
|
||||
while (true) { // loop takes care of leading and trailing boundary cases
|
||||
|
@ -352,7 +353,7 @@ public class PatternAnalyzer extends Analyzer {
|
|||
if (start != end) { // non-empty match (header/trailer)
|
||||
String text = str.substring(start, end);
|
||||
if (toLowerCase) text = text.toLowerCase(locale);
|
||||
return new Token(text, start, end);
|
||||
return reusableToken.reinit(text, start, end);
|
||||
}
|
||||
if (!isMatch) return null;
|
||||
}
|
||||
|
@ -384,7 +385,8 @@ public class PatternAnalyzer extends Analyzer {
|
|||
this.stopWords = stopWords;
|
||||
}
|
||||
|
||||
public Token next() {
|
||||
public Token next(final Token reusableToken) {
|
||||
assert reusableToken != null;
|
||||
// cache loop instance vars (performance)
|
||||
String s = str;
|
||||
int len = s.length();
|
||||
|
@ -422,7 +424,11 @@ public class PatternAnalyzer extends Analyzer {
|
|||
} while (text != null && isStopWord(text));
|
||||
|
||||
pos = i;
|
||||
return text != null ? new Token(text, start, i) : null;
|
||||
if (text == null)
|
||||
{
|
||||
return null;
|
||||
}
|
||||
return reusableToken.reinit(text, start, i);
|
||||
}
|
||||
|
||||
private boolean isTokenChar(char c, boolean isLetter) {
|
||||
|
|
|
@ -68,48 +68,51 @@ public class SynonymTokenFilter extends TokenFilter {
|
|||
}
|
||||
|
||||
/** Returns the next token in the stream, or null at EOS. */
|
||||
public Token next() throws IOException {
|
||||
Token token;
|
||||
public Token next(final Token reusableToken) throws IOException {
|
||||
assert reusableToken != null;
|
||||
while (todo > 0 && index < stack.length) { // pop from stack
|
||||
token = createToken(stack[index++], current);
|
||||
if (token != null) {
|
||||
Token nextToken = createToken(stack[index++], current, reusableToken);
|
||||
if (nextToken != null) {
|
||||
todo--;
|
||||
return token;
|
||||
return nextToken;
|
||||
}
|
||||
}
|
||||
|
||||
token = input.next();
|
||||
if (token == null) return null; // EOS; iterator exhausted
|
||||
Token nextToken = input.next(reusableToken);
|
||||
if (nextToken == null) return null; // EOS; iterator exhausted
|
||||
|
||||
stack = synonyms.getSynonyms(token.termText()); // push onto stack
|
||||
stack = synonyms.getSynonyms(nextToken.term()); // push onto stack
|
||||
if (stack.length > maxSynonyms) randomize(stack);
|
||||
index = 0;
|
||||
current = token;
|
||||
current = (Token) nextToken.clone();
|
||||
todo = maxSynonyms;
|
||||
return token;
|
||||
return nextToken;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates and returns a token for the given synonym of the current input
|
||||
* token; Override for custom (stateless or stateful) behaviour, if desired.
|
||||
* token; Override for custom (stateless or stateful) behavior, if desired.
|
||||
*
|
||||
* @param synonym
|
||||
* a synonym for the current token's term
|
||||
* @param current
|
||||
* the current token from the underlying child stream
|
||||
* @param reusableToken
|
||||
* the token to reuse
|
||||
* @return a new token, or null to indicate that the given synonym should be
|
||||
* ignored
|
||||
*/
|
||||
protected Token createToken(String synonym, Token current) {
|
||||
Token token = new Token(
|
||||
synonym, current.startOffset(), current.endOffset(), SYNONYM_TOKEN_TYPE);
|
||||
token.setPositionIncrement(0);
|
||||
return token;
|
||||
protected Token createToken(String synonym, Token current, final Token reusableToken) {
|
||||
reusableToken.reinit(current, synonym);
|
||||
reusableToken.setTermBuffer(synonym);
|
||||
reusableToken.setType(SYNONYM_TOKEN_TYPE);
|
||||
reusableToken.setPositionIncrement(0);
|
||||
return reusableToken;
|
||||
}
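The javadoc above describes createToken as the extension point for customizing how synonyms are injected, including returning null to drop a synonym. A minimal sketch of such an override follows; it is not part of this patch, and the package and constructor signature (SynonymTokenFilter(TokenStream, SynonymMap, int) in the contrib memory module) are assumptions to verify against the class before use.

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.memory.SynonymMap;
import org.apache.lucene.index.memory.SynonymTokenFilter;

// Hypothetical subclass: skip synonyms that merely repeat the current term,
// using the "return null to ignore this synonym" behavior documented above.
public class DistinctSynonymTokenFilter extends SynonymTokenFilter {
  public DistinctSynonymTokenFilter(TokenStream input, SynonymMap synonyms, int maxSynonyms) {
    super(input, synonyms, maxSynonyms); // constructor signature assumed, see lead-in
  }

  protected Token createToken(String synonym, Token current, final Token reusableToken) {
    if (synonym.equals(current.term())) {
      return null; // ignore synonyms identical to the original term
    }
    return super.createToken(synonym, current, reusableToken);
  }
}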
|
||||
|
||||
/**
|
||||
* Randomize synonyms to later sample a subset. Uses constant random seed
|
||||
* for reproducability. Uses "DRand", a simple, fast, uniform pseudo-random
|
||||
* for reproducibility. Uses "DRand", a simple, fast, uniform pseudo-random
|
||||
* number generator with medium statistical quality (multiplicative
|
||||
* congruential method), producing integers in the range [Integer.MIN_VALUE,
|
||||
* Integer.MAX_VALUE].
|
||||
|
|
|
@ -197,9 +197,9 @@ public class PatternAnalyzerTest extends TestCase {
|
|||
|
||||
private List getTokens(TokenStream stream) throws IOException {
|
||||
ArrayList tokens = new ArrayList();
|
||||
Token token;
|
||||
while ((token = stream.next()) != null) {
|
||||
tokens.add(token);
|
||||
final Token reusableToken = new Token();
|
||||
for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
|
||||
tokens.add(nextToken.clone());
|
||||
}
|
||||
return tokens;
|
||||
}
|
||||
|
@ -211,7 +211,7 @@ public class PatternAnalyzerTest extends TestCase {
|
|||
for (; i < size; i++) {
|
||||
Token t1 = (Token) tokens1.get(i);
|
||||
Token t2 = (Token) tokens2.get(i);
|
||||
if (!(t1.termText().equals(t2.termText()))) throw new IllegalStateException("termText");
|
||||
if (!(t1.term().equals(t2.term()))) throw new IllegalStateException("termText");
|
||||
if (t1.startOffset() != t2.startOffset()) throw new IllegalStateException("startOffset");
|
||||
if (t1.endOffset() != t2.endOffset()) throw new IllegalStateException("endOffset");
|
||||
if (!(t1.type().equals(t2.type()))) throw new IllegalStateException("type");
|
||||
|
@ -222,8 +222,8 @@ public class PatternAnalyzerTest extends TestCase {
|
|||
catch (IllegalStateException e) {
|
||||
if (size > 0) {
|
||||
System.out.println("i=" + i + ", size=" + size);
|
||||
System.out.println("t1[size]='" + ((Token) tokens1.get(size-1)).termText() + "'");
|
||||
System.out.println("t2[size]='" + ((Token) tokens2.get(size-1)).termText() + "'");
|
||||
System.out.println("t1[size]='" + ((Token) tokens1.get(size-1)).term() + "'");
|
||||
System.out.println("t2[size]='" + ((Token) tokens2.get(size-1)).term() + "'");
|
||||
}
|
||||
throw e;
|
||||
}
|
||||
|
@ -234,7 +234,7 @@ public class PatternAnalyzerTest extends TestCase {
|
|||
String str = "[";
|
||||
for (int i=0; i < tokens.size(); i++) {
|
||||
Token t1 = (Token) tokens.get(i);
|
||||
str = str + "'" + t1.termText() + "', ";
|
||||
str = str + "'" + t1.term() + "', ";
|
||||
}
|
||||
return str + "]";
|
||||
}
|
||||
|
|
|
@ -23,6 +23,7 @@ import java.util.ArrayList;
|
|||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.queryParser.ParseException;
|
||||
import org.apache.lucene.search.Query;
|
||||
|
@ -105,21 +106,23 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar
|
|||
|
||||
// get Analyzer from superclass and tokenize the term
|
||||
TokenStream source = getAnalyzer().tokenStream(field, new StringReader(termStr));
|
||||
org.apache.lucene.analysis.Token t;
|
||||
final Token reusableToken = new Token();
|
||||
Token nextToken;
|
||||
|
||||
int countTokens = 0;
|
||||
while (true) {
|
||||
try {
|
||||
t = source.next();
|
||||
nextToken = source.next(reusableToken);
|
||||
} catch (IOException e) {
|
||||
t = null;
|
||||
nextToken = null;
|
||||
}
|
||||
if (t == null) {
|
||||
if (nextToken == null) {
|
||||
break;
|
||||
}
|
||||
if (!"".equals(t.termText())) {
|
||||
String term = nextToken.term();
|
||||
if (!"".equals(term)) {
|
||||
try {
|
||||
tlist.set(countTokens++, t.termText());
|
||||
tlist.set(countTokens++, term);
|
||||
} catch (IndexOutOfBoundsException ioobe) {
|
||||
countTokens = -1;
|
||||
}
|
||||
|
@ -189,18 +192,19 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar
|
|||
// get Analyzer from superclass and tokenize the term
|
||||
TokenStream source = getAnalyzer().tokenStream(field, new StringReader(termStr));
|
||||
List tlist = new ArrayList();
|
||||
org.apache.lucene.analysis.Token t;
|
||||
final Token reusableToken = new Token();
|
||||
Token nextToken;
|
||||
|
||||
while (true) {
|
||||
try {
|
||||
t = source.next();
|
||||
nextToken = source.next(reusableToken);
|
||||
} catch (IOException e) {
|
||||
t = null;
|
||||
nextToken = null;
|
||||
}
|
||||
if (t == null) {
|
||||
if (nextToken == null) {
|
||||
break;
|
||||
}
|
||||
tlist.add(t.termText());
|
||||
tlist.add(nextToken.term());
|
||||
}
|
||||
|
||||
try {
|
||||
|
@ -238,14 +242,15 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar
|
|||
throws ParseException {
|
||||
// get Analyzer from superclass and tokenize the term
|
||||
TokenStream source = getAnalyzer().tokenStream(field, new StringReader(termStr));
|
||||
org.apache.lucene.analysis.Token t;
|
||||
final Token reusableToken = new Token();
|
||||
Token nextToken;
|
||||
boolean multipleTokens = false;
|
||||
|
||||
try {
|
||||
t = source.next();
|
||||
multipleTokens = source.next() != null;
|
||||
nextToken = source.next(reusableToken);
|
||||
multipleTokens = source.next(reusableToken) != null;
|
||||
} catch (IOException e) {
|
||||
t = null;
|
||||
nextToken = null;
|
||||
}
|
||||
|
||||
try {
|
||||
|
@ -259,7 +264,7 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar
|
|||
+ " - tokens were added");
|
||||
}
|
||||
|
||||
return (t == null) ? null : super.getFuzzyQuery(field, t.termText(), minSimilarity);
|
||||
return (nextToken == null) ? null : super.getFuzzyQuery(field, nextToken.term(), minSimilarity);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -270,18 +275,20 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar
|
|||
throws ParseException {
|
||||
// get Analyzer from superclass and tokenize the terms
|
||||
TokenStream source = getAnalyzer().tokenStream(field, new StringReader(part1));
|
||||
org.apache.lucene.analysis.Token t;
|
||||
final Token reusableToken = new Token();
|
||||
Token nextToken;
|
||||
Token multipleToken;
|
||||
boolean multipleTokens = false;
|
||||
|
||||
// part1
|
||||
try {
|
||||
t = source.next();
|
||||
if (t != null) {
|
||||
part1 = t.termText();
|
||||
nextToken = source.next(reusableToken);
|
||||
if (nextToken != null) {
|
||||
part1 = nextToken.term();
|
||||
}
|
||||
multipleTokens = source.next() != null;
|
||||
multipleTokens = source.next(reusableToken) != null;
|
||||
} catch (IOException e) {
|
||||
t = null;
|
||||
nextToken = null;
|
||||
}
|
||||
try {
|
||||
source.close();
|
||||
|
@ -293,16 +300,16 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar
|
|||
+ " - tokens were added to part1");
|
||||
}
|
||||
|
||||
source = getAnalyzer().tokenStream(field, new StringReader(part2));
|
||||
// part2
|
||||
source = getAnalyzer().tokenStream(field, new StringReader(part2));
|
||||
try {
|
||||
t = source.next();
|
||||
if (t != null) {
|
||||
part2 = t.termText();
|
||||
nextToken = source.next(reusableToken);
|
||||
if (nextToken != null) {
|
||||
part2 = nextToken.term();
|
||||
}
|
||||
multipleTokens = source.next() != null;
|
||||
multipleTokens = source.next(reusableToken) != null;
|
||||
} catch (IOException e) {
|
||||
t = null;
|
||||
nextToken = null;
|
||||
}
|
||||
try {
|
||||
source.close();
|
||||
|
|
|
@ -25,6 +25,20 @@ public interface CharStream {
|
|||
*/
|
||||
char readChar() throws java.io.IOException;
|
||||
|
||||
/**
|
||||
* Returns the column position of the character last read.
|
||||
* @deprecated
|
||||
* @see #getEndColumn
|
||||
*/
|
||||
int getColumn();
|
||||
|
||||
/**
|
||||
* Returns the line number of the character last read.
|
||||
* @deprecated
|
||||
* @see #getEndLine
|
||||
*/
|
||||
int getLine();
|
||||
|
||||
/**
|
||||
* Returns the column number of the last character for current token (being
|
||||
* matched after the last call to BeginToken).
|
||||
|
|
|
@ -1,14 +1,29 @@
|
|||
/* Generated By:JavaCC: Do not edit this line. PrecedenceQueryParser.java */
|
||||
package org.apache.lucene.queryParser.precedence;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.text.DateFormat;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Date;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Vector;
|
||||
import java.io.*;
|
||||
import java.text.*;
|
||||
import java.util.*;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.document.DateTools;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.analysis.*;
|
||||
import org.apache.lucene.document.*;
|
||||
import org.apache.lucene.search.*;
|
||||
import org.apache.lucene.search.BooleanClause;
|
||||
import org.apache.lucene.search.BooleanQuery;
|
||||
import org.apache.lucene.search.FuzzyQuery;
|
||||
import org.apache.lucene.search.MultiPhraseQuery;
|
||||
import org.apache.lucene.search.PhraseQuery;
|
||||
import org.apache.lucene.search.PrefixQuery;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.RangeQuery;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.search.WildcardQuery;
|
||||
import org.apache.lucene.util.Parameter;
|
||||
|
||||
/**
|
||||
|
@ -296,21 +311,22 @@ public class PrecedenceQueryParser implements PrecedenceQueryParserConstants {
|
|||
|
||||
TokenStream source = analyzer.tokenStream(field, new StringReader(queryText));
|
||||
Vector v = new Vector();
|
||||
org.apache.lucene.analysis.Token t;
|
||||
final org.apache.lucene.analysis.Token reusableToken = new org.apache.lucene.analysis.Token();
|
||||
org.apache.lucene.analysis.Token nextToken;
|
||||
int positionCount = 0;
|
||||
boolean severalTokensAtSamePosition = false;
|
||||
|
||||
while (true) {
|
||||
try {
|
||||
t = source.next();
|
||||
nextToken = source.next(reusableToken);
|
||||
}
|
||||
catch (IOException e) {
|
||||
t = null;
|
||||
nextToken = null;
|
||||
}
|
||||
if (t == null)
|
||||
if (nextToken == null)
|
||||
break;
|
||||
v.addElement(t);
|
||||
if (t.getPositionIncrement() == 1)
|
||||
v.addElement(nextToken.clone());
|
||||
if (nextToken.getPositionIncrement() == 1)
|
||||
positionCount++;
|
||||
else
|
||||
severalTokensAtSamePosition = true;
|
||||
|
@ -325,17 +341,17 @@ public class PrecedenceQueryParser implements PrecedenceQueryParserConstants {
|
|||
if (v.size() == 0)
|
||||
return null;
|
||||
else if (v.size() == 1) {
|
||||
t = (org.apache.lucene.analysis.Token) v.elementAt(0);
|
||||
return new TermQuery(new Term(field, t.termText()));
|
||||
nextToken = (org.apache.lucene.analysis.Token) v.elementAt(0);
|
||||
return new TermQuery(new Term(field, nextToken.term()));
|
||||
} else {
|
||||
if (severalTokensAtSamePosition) {
|
||||
if (positionCount == 1) {
|
||||
// no phrase query:
|
||||
BooleanQuery q = new BooleanQuery();
|
||||
for (int i = 0; i < v.size(); i++) {
|
||||
t = (org.apache.lucene.analysis.Token) v.elementAt(i);
|
||||
nextToken = (org.apache.lucene.analysis.Token) v.elementAt(i);
|
||||
TermQuery currentQuery = new TermQuery(
|
||||
new Term(field, t.termText()));
|
||||
new Term(field, nextToken.term()));
|
||||
q.add(currentQuery, BooleanClause.Occur.SHOULD);
|
||||
}
|
||||
return q;
|
||||
|
@ -345,12 +361,12 @@ public class PrecedenceQueryParser implements PrecedenceQueryParserConstants {
|
|||
MultiPhraseQuery mpq = new MultiPhraseQuery();
|
||||
List multiTerms = new ArrayList();
|
||||
for (int i = 0; i < v.size(); i++) {
|
||||
t = (org.apache.lucene.analysis.Token) v.elementAt(i);
|
||||
if (t.getPositionIncrement() == 1 && multiTerms.size() > 0) {
|
||||
nextToken = (org.apache.lucene.analysis.Token) v.elementAt(i);
|
||||
if (nextToken.getPositionIncrement() == 1 && multiTerms.size() > 0) {
|
||||
mpq.add((Term[])multiTerms.toArray(new Term[0]));
|
||||
multiTerms.clear();
|
||||
}
|
||||
multiTerms.add(new Term(field, t.termText()));
|
||||
multiTerms.add(new Term(field, nextToken.term()));
|
||||
}
|
||||
mpq.add((Term[])multiTerms.toArray(new Term[0]));
|
||||
return mpq;
|
||||
|
@ -361,7 +377,7 @@ public class PrecedenceQueryParser implements PrecedenceQueryParserConstants {
|
|||
q.setSlop(phraseSlop);
|
||||
for (int i = 0; i < v.size(); i++) {
|
||||
q.add(new Term(field, ((org.apache.lucene.analysis.Token)
|
||||
v.elementAt(i)).termText()));
|
||||
v.elementAt(i)).term()));
|
||||
|
||||
}
|
||||
return q;
|
||||
|
|
|
@ -25,14 +25,29 @@ PARSER_BEGIN(PrecedenceQueryParser)
|
|||
|
||||
package org.apache.lucene.queryParser.precedence;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.text.DateFormat;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Date;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Vector;
|
||||
import java.io.*;
|
||||
import java.text.*;
|
||||
import java.util.*;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.document.DateTools;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.analysis.*;
|
||||
import org.apache.lucene.document.*;
|
||||
import org.apache.lucene.search.*;
|
||||
import org.apache.lucene.search.BooleanClause;
|
||||
import org.apache.lucene.search.BooleanQuery;
|
||||
import org.apache.lucene.search.FuzzyQuery;
|
||||
import org.apache.lucene.search.MultiPhraseQuery;
|
||||
import org.apache.lucene.search.PhraseQuery;
|
||||
import org.apache.lucene.search.PrefixQuery;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.RangeQuery;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.search.WildcardQuery;
|
||||
import org.apache.lucene.util.Parameter;
|
||||
|
||||
/**
|
||||
|
@ -320,21 +335,22 @@ public class PrecedenceQueryParser {
|
|||
|
||||
TokenStream source = analyzer.tokenStream(field, new StringReader(queryText));
|
||||
Vector v = new Vector();
|
||||
org.apache.lucene.analysis.Token t;
|
||||
final org.apache.lucene.analysis.Token reusableToken = new org.apache.lucene.analysis.Token();
|
||||
org.apache.lucene.analysis.Token nextToken;
|
||||
int positionCount = 0;
|
||||
boolean severalTokensAtSamePosition = false;
|
||||
|
||||
while (true) {
|
||||
try {
|
||||
t = source.next();
|
||||
nextToken = source.next(reusableToken);
|
||||
}
|
||||
catch (IOException e) {
|
||||
t = null;
|
||||
nextToken = null;
|
||||
}
|
||||
if (t == null)
|
||||
if (nextToken == null)
|
||||
break;
|
||||
v.addElement(t);
|
||||
if (t.getPositionIncrement() == 1)
|
||||
v.addElement(nextToken.clone());
|
||||
if (nextToken.getPositionIncrement() == 1)
|
||||
positionCount++;
|
||||
else
|
||||
severalTokensAtSamePosition = true;
|
||||
|
@ -349,17 +365,17 @@ public class PrecedenceQueryParser {
|
|||
if (v.size() == 0)
|
||||
return null;
|
||||
else if (v.size() == 1) {
|
||||
t = (org.apache.lucene.analysis.Token) v.elementAt(0);
|
||||
return new TermQuery(new Term(field, t.termText()));
|
||||
nextToken = (org.apache.lucene.analysis.Token) v.elementAt(0);
|
||||
return new TermQuery(new Term(field, nextToken.term()));
|
||||
} else {
|
||||
if (severalTokensAtSamePosition) {
|
||||
if (positionCount == 1) {
|
||||
// no phrase query:
|
||||
BooleanQuery q = new BooleanQuery();
|
||||
for (int i = 0; i < v.size(); i++) {
|
||||
t = (org.apache.lucene.analysis.Token) v.elementAt(i);
|
||||
nextToken = (org.apache.lucene.analysis.Token) v.elementAt(i);
|
||||
TermQuery currentQuery = new TermQuery(
|
||||
new Term(field, t.termText()));
|
||||
new Term(field, nextToken.term()));
|
||||
q.add(currentQuery, BooleanClause.Occur.SHOULD);
|
||||
}
|
||||
return q;
|
||||
|
@@ -369,12 +385,12 @@ public class PrecedenceQueryParser {
|
|||
MultiPhraseQuery mpq = new MultiPhraseQuery();
|
||||
List multiTerms = new ArrayList();
|
||||
for (int i = 0; i < v.size(); i++) {
|
||||
t = (org.apache.lucene.analysis.Token) v.elementAt(i);
|
||||
if (t.getPositionIncrement() == 1 && multiTerms.size() > 0) {
|
||||
nextToken = (org.apache.lucene.analysis.Token) v.elementAt(i);
|
||||
if (nextToken.getPositionIncrement() == 1 && multiTerms.size() > 0) {
|
||||
mpq.add((Term[])multiTerms.toArray(new Term[0]));
|
||||
multiTerms.clear();
|
||||
}
|
||||
multiTerms.add(new Term(field, t.termText()));
|
||||
multiTerms.add(new Term(field, nextToken.term()));
|
||||
}
|
||||
mpq.add((Term[])multiTerms.toArray(new Term[0]));
|
||||
return mpq;
|
||||
|
@@ -385,7 +401,7 @@ public class PrecedenceQueryParser {
|
|||
q.setSlop(phraseSlop);
|
||||
for (int i = 0; i < v.size(); i++) {
|
||||
q.add(new Term(field, ((org.apache.lucene.analysis.Token)
|
||||
v.elementAt(i)).termText()));
|
||||
v.elementAt(i)).term()));
|
||||
|
||||
}
|
||||
return q;
|
||||
|
|
|
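The getFieldQuery hunks above show the main consumer-side rule of the reuse API: the stream keeps handing back the same reusableToken instance, so any token that is kept around (here, in the Vector v) must be copied first, which is why v.addElement(t) became v.addElement(nextToken.clone()). Below is a minimal sketch of that pattern, assuming only the next(Token), term() and clone() calls that appear in this diff; the analyzer, field name and input text are placeholders, not part of the commit.

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;

public class CachedTokensExample {
  public static void main(String[] args) throws IOException {
    TokenStream ts = new WhitespaceAnalyzer()
        .tokenStream("body", new StringReader("some query text"));
    final Token reusableToken = new Token();
    List cached = new ArrayList();
    for (Token nextToken = ts.next(reusableToken);
         nextToken != null;
         nextToken = ts.next(reusableToken)) {
      // nextToken may be the very same object on every iteration;
      // clone it before keeping a reference past this loop body.
      cached.add(nextToken.clone());
    }
    ts.close();
    for (int i = 0; i < cached.size(); i++) {
      System.out.println(((Token) cached.get(i)).term());
    }
  }
}

If nothing outlives the loop body, the clone() is unnecessary and the plain reuse loop (as in the other files below) avoids per-token allocation entirely.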
@@ -1,13 +1,27 @@
|
|||
/* Generated By:JavaCC: Do not edit this line. PrecedenceQueryParserTokenManager.java */
|
||||
package org.apache.lucene.queryParser.precedence;
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.text.DateFormat;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Date;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Vector;
|
||||
import java.io.*;
|
||||
import java.text.*;
|
||||
import java.util.*;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.document.DateTools;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.analysis.*;
|
||||
import org.apache.lucene.document.*;
|
||||
import org.apache.lucene.search.*;
|
||||
import org.apache.lucene.search.BooleanClause;
|
||||
import org.apache.lucene.search.BooleanQuery;
|
||||
import org.apache.lucene.search.FuzzyQuery;
|
||||
import org.apache.lucene.search.MultiPhraseQuery;
|
||||
import org.apache.lucene.search.PhraseQuery;
|
||||
import org.apache.lucene.search.PrefixQuery;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.RangeQuery;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.search.WildcardQuery;
|
||||
import org.apache.lucene.util.Parameter;
|
||||
|
||||
public class PrecedenceQueryParserTokenManager implements PrecedenceQueryParserConstants
|
||||
|
|
|
@@ -57,19 +57,26 @@ public class TestPrecedenceQueryParser extends TestCase {
|
|||
boolean inPhrase = false;
|
||||
int savedStart = 0, savedEnd = 0;
|
||||
|
||||
public Token next() throws IOException {
|
||||
public Token next(final Token reusableToken) throws IOException {
|
||||
assert reusableToken != null;
|
||||
if (inPhrase) {
|
||||
inPhrase = false;
|
||||
return new Token("phrase2", savedStart, savedEnd);
|
||||
reusableToken.setTermBuffer("phrase2");
|
||||
reusableToken.setStartOffset(savedStart);
|
||||
reusableToken.setEndOffset(savedEnd);
|
||||
return reusableToken;
|
||||
} else
|
||||
for (Token token = input.next(); token != null; token = input.next()) {
|
||||
if (token.termText().equals("phrase")) {
|
||||
for (Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken)) {
|
||||
if (nextToken.term().equals("phrase")) {
|
||||
inPhrase = true;
|
||||
savedStart = token.startOffset();
|
||||
savedEnd = token.endOffset();
|
||||
return new Token("phrase1", savedStart, savedEnd);
|
||||
} else if (!token.termText().equals("stop"))
|
||||
return token;
|
||||
savedStart = nextToken.startOffset();
|
||||
savedEnd = nextToken.endOffset();
|
||||
nextToken.setTermBuffer("phrase1");
|
||||
nextToken.setStartOffset(savedStart);
|
||||
nextToken.setEndOffset(savedEnd);
|
||||
return nextToken;
|
||||
} else if (!nextToken.term().equals("stop"))
|
||||
return nextToken;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
|
|
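The reworked QPTestFilter above illustrates the producer-side counterpart: a filter that wants to emit an extra token later must save plain state (savedStart, savedEnd) instead of holding on to the Token it was given, then rewrite the reusable token when that extra token is due. Here is a rough sketch of the same idea, assuming the next(Token) contract from this commit; the filter name and the injected "echo" term are invented for illustration.

import java.io.IOException;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;

/** After each token, injects a synthetic "echo" token at the same offsets (illustrative only). */
public final class EchoingFilter extends TokenFilter {
  private boolean pendingEcho = false;
  private int savedStart = 0, savedEnd = 0;   // copied primitive state, not a Token reference

  public EchoingFilter(TokenStream input) {
    super(input);
  }

  public Token next(final Token reusableToken) throws IOException {
    assert reusableToken != null;
    if (pendingEcho) {
      pendingEcho = false;
      reusableToken.clear();
      reusableToken.setTermBuffer("echo");
      reusableToken.setStartOffset(savedStart);
      reusableToken.setEndOffset(savedEnd);
      reusableToken.setPositionIncrement(0);   // same position as the original token
      return reusableToken;
    }
    Token nextToken = input.next(reusableToken);
    if (nextToken == null)
      return null;
    savedStart = nextToken.startOffset();
    savedEnd = nextToken.endOffset();
    pendingEcho = true;
    return nextToken;
  }
}

As in the filters changed by this commit, nothing here allocates a Token after construction; the only per-token work is rewriting the shared buffer and offsets.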
@@ -104,18 +104,19 @@ public class FuzzyLikeThisQuery extends Query
|
|||
{
|
||||
if(f.queryString==null) return;
|
||||
TokenStream ts=analyzer.tokenStream(f.fieldName,new StringReader(f.queryString));
|
||||
Token token=ts.next();
|
||||
final Token reusableToken = new Token();
|
||||
int corpusNumDocs=reader.numDocs();
|
||||
Term internSavingTemplateTerm =new Term(f.fieldName); //optimization to avoid constructing new Term() objects
|
||||
HashSet processedTerms=new HashSet();
|
||||
while(token!=null)
|
||||
{
|
||||
if(!processedTerms.contains(token.termText()))
|
||||
for (Token nextToken = ts.next(reusableToken); nextToken!=null; nextToken = ts.next(reusableToken))
|
||||
{
|
||||
String term = nextToken.term();
|
||||
if(!processedTerms.contains(term))
|
||||
{
|
||||
processedTerms.add(token.termText());
|
||||
processedTerms.add(term);
|
||||
ScoreTermQueue variantsQ=new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term
|
||||
float minScore=0;
|
||||
Term startTerm=internSavingTemplateTerm.createTerm(token.termText());
|
||||
Term startTerm=internSavingTemplateTerm.createTerm(term);
|
||||
FuzzyTermEnum fe=new FuzzyTermEnum(reader,startTerm,f.minSimilarity,f.prefixLength);
|
||||
TermEnum origEnum = reader.terms(startTerm);
|
||||
int df=0;
|
||||
|
@@ -162,8 +163,7 @@ public class FuzzyLikeThisQuery extends Query
|
|||
q.insert(st);
|
||||
}
|
||||
}
|
||||
token=ts.next();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public Query rewrite(IndexReader reader) throws IOException
|
||||
|
|
|
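FuzzyLikeThisQuery now walks the analyzer output with a single reusable Token instead of allocating one per term. The same consumption idiom applies anywhere a stream is read once and only String copies are kept; a small sketch follows, assuming the reuse API from this commit, with the analyzer, field name and input text chosen arbitrarily.

import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

public class TermFrequencyExample {
  public static void main(String[] args) throws IOException {
    TokenStream ts = new StandardAnalyzer()
        .tokenStream("contents", new StringReader("reuse the token, reuse the buffer"));
    final Token reusableToken = new Token();
    Map counts = new HashMap();
    for (Token nextToken = ts.next(reusableToken);
         nextToken != null;
         nextToken = ts.next(reusableToken)) {
      String term = nextToken.term();   // materializes a String; this copy is intentional
      Integer n = (Integer) counts.get(term);
      counts.put(term, new Integer(n == null ? 1 : n.intValue() + 1));
    }
    ts.close();
    for (Iterator it = counts.entrySet().iterator(); it.hasNext();) {
      Map.Entry e = (Map.Entry) it.next();
      System.out.println(e.getKey() + " -> " + e.getValue());
    }
  }
}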
@@ -28,6 +28,7 @@ import org.apache.lucene.search.IndexSearcher;
|
|||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.Hits;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||
import org.apache.lucene.document.Document;
|
||||
|
@@ -808,10 +809,11 @@ public final class MoreLikeThis {
|
|||
throws IOException
|
||||
{
|
||||
TokenStream ts = analyzer.tokenStream(fieldName, r);
|
||||
org.apache.lucene.analysis.Token token;
|
||||
int tokenCount=0;
|
||||
while ((token = ts.next()) != null) { // for every token
|
||||
String word = token.termText();
|
||||
// for every token
|
||||
final Token reusableToken = new Token();
|
||||
for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) {
|
||||
String word = nextToken.term();
|
||||
tokenCount++;
|
||||
if(tokenCount>maxNumTokensParsed)
|
||||
{
|
||||
|
@@ -872,7 +874,7 @@ public final class MoreLikeThis {
|
|||
* For an easier method to call see {@link #retrieveInterestingTerms retrieveInterestingTerms()}.
|
||||
*
|
||||
* @param r the reader that has the content of the document
|
||||
* @return the most intresting words in the document ordered by score, with the highest scoring, or best entry, first
|
||||
* @return the most interesting words in the document ordered by score, with the highest scoring, or best entry, first
|
||||
*
|
||||
* @see #retrieveInterestingTerms
|
||||
*/
|
||||
|
|
|
@@ -21,6 +21,7 @@ import java.util.HashSet;
|
|||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.BooleanClause;
|
||||
|
@@ -85,12 +86,11 @@ public final class SimilarityQueries
|
|||
throws IOException
|
||||
{
|
||||
TokenStream ts = a.tokenStream( field, new StringReader( body));
|
||||
org.apache.lucene.analysis.Token t;
|
||||
BooleanQuery tmp = new BooleanQuery();
|
||||
Set already = new HashSet(); // ignore dups
|
||||
while ( (t = ts.next()) != null)
|
||||
{
|
||||
String word = t.termText();
|
||||
final Token reusableToken = new Token();
|
||||
for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) {
|
||||
String word = nextToken.term();
|
||||
// ignore opt stop words
|
||||
if ( stop != null &&
|
||||
stop.contains( word)) continue;
|
||||
|
|
|
@@ -18,11 +18,10 @@ package org.apache.lucene.analysis.snowball;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import java.lang.reflect.Method;
|
||||
|
||||
import net.sf.snowball.SnowballProgram;
|
||||
import net.sf.snowball.ext.*;
|
||||
import net.sf.snowball.ext.EnglishStemmer;
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
|
@@ -60,20 +59,22 @@ public class SnowballFilter extends TokenFilter {
|
|||
}
|
||||
|
||||
/** Returns the next input Token, after being stemmed */
|
||||
public final Token next() throws IOException {
|
||||
Token token = input.next();
|
||||
if (token == null)
|
||||
public final Token next(final Token reusableToken) throws IOException {
|
||||
assert reusableToken != null;
|
||||
Token nextToken = input.next(reusableToken);
|
||||
if (nextToken == null)
|
||||
return null;
|
||||
stemmer.setCurrent(token.termText());
|
||||
String originalTerm = nextToken.term();
|
||||
stemmer.setCurrent(originalTerm);
|
||||
try {
|
||||
stemMethod.invoke(stemmer, EMPTY_ARGS);
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException(e.toString());
|
||||
}
|
||||
|
||||
Token newToken = new Token(stemmer.getCurrent(),
|
||||
token.startOffset(), token.endOffset(), token.type());
|
||||
newToken.setPositionIncrement(token.getPositionIncrement());
|
||||
return newToken;
|
||||
String finalTerm = stemmer.getCurrent();
|
||||
// Don't bother updating, if it is unchanged.
|
||||
if (!originalTerm.equals(finalTerm))
|
||||
nextToken.setTermBuffer(finalTerm);
|
||||
return nextToken;
|
||||
}
|
||||
}
|
||||
|
|
|
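The rewritten SnowballFilter keeps the Token it receives and only calls setTermBuffer() when the stem actually differs, rather than building a new Token per term. A comparable sketch for a trivial in-place normalizer is shown below, assuming the same next(Token) contract; the suffix rule is invented for illustration and is not a real stemmer.

import java.io.IOException;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;

/** Strips a trailing 's' from each term, reusing the caller's Token (illustrative only). */
public final class NaivePluralFilter extends TokenFilter {
  public NaivePluralFilter(TokenStream input) {
    super(input);
  }

  public Token next(final Token reusableToken) throws IOException {
    assert reusableToken != null;
    Token nextToken = input.next(reusableToken);
    if (nextToken == null)
      return null;
    char[] buffer = nextToken.termBuffer();
    int length = nextToken.termLength();
    // Adjust the existing buffer in place; offsets, type and flags stay untouched.
    if (length > 1 && buffer[length - 1] == 's')
      nextToken.setTermLength(length - 1);
    return nextToken;
  }
}

Working directly on termBuffer()/termLength() avoids even the String allocation that term() implies, which is the performance point documented in the CHANGES entry for this issue.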
@@ -1,64 +1,30 @@
|
|||
package org.apache.lucene.analysis.snowball;
|
||||
|
||||
/* ====================================================================
|
||||
* The Apache Software License, Version 1.1
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* Copyright (c) 2004 The Apache Software Foundation. All rights
|
||||
* reserved.
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
*
|
||||
* 3. The end-user documentation included with the redistribution,
|
||||
* if any, must include the following acknowledgment:
|
||||
* "This product includes software developed by the
|
||||
* Apache Software Foundation (http://www.apache.org/)."
|
||||
* Alternately, this acknowledgment may appear in the software itself,
|
||||
* if and wherever such third-party acknowledgments normally appear.
|
||||
*
|
||||
* 4. The names "Apache" and "Apache Software Foundation" and
|
||||
* "Apache Lucene" must not be used to endorse or promote products
|
||||
* derived from this software without prior written permission. For
|
||||
* written permission, please contact apache@apache.org.
|
||||
*
|
||||
* 5. Products derived from this software may not be called "Apache",
|
||||
* "Apache Lucene", nor may "Apache" appear in their name, without
|
||||
* prior written permission of the Apache Software Foundation.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
|
||||
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
|
||||
* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
* DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
|
||||
* ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
|
||||
* USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
|
||||
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
* ====================================================================
|
||||
*
|
||||
* This software consists of voluntary contributions made by many
|
||||
* individuals on behalf of the Apache Software Foundation. For more
|
||||
* information on the Apache Software Foundation, please see
|
||||
* <http://www.apache.org/>.
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.*;
|
||||
import java.io.StringReader;
|
||||
|
||||
import junit.framework.*;
|
||||
import junit.framework.TestCase;
|
||||
|
||||
import org.apache.lucene.analysis.*;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.index.Payload;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
public class TestSnowball extends TestCase {
|
||||
|
||||
|
@@ -66,12 +32,12 @@ public class TestSnowball extends TestCase {
|
|||
String input,
|
||||
String[] output) throws Exception {
|
||||
TokenStream ts = a.tokenStream("dummy", new StringReader(input));
|
||||
final Token reusableToken = new Token();
|
||||
for (int i = 0; i < output.length; i++) {
|
||||
Token t = ts.next();
|
||||
assertNotNull(t);
|
||||
assertEquals(output[i], t.termText());
|
||||
Token nextToken = ts.next(reusableToken);
|
||||
assertEquals(output[i], nextToken.term());
|
||||
}
|
||||
assertNull(ts.next());
|
||||
assertNull(ts.next(reusableToken));
|
||||
ts.close();
|
||||
}
|
||||
|
||||
|
@@ -83,25 +49,33 @@ public class TestSnowball extends TestCase {
|
|||
|
||||
|
||||
public void testFilterTokens() throws Exception {
|
||||
final Token tok = new Token("accents", 2, 7, "wrd");
|
||||
final Token tok = new Token(2, 7, "wrd");
|
||||
tok.setTermBuffer("accents");
|
||||
tok.setPositionIncrement(3);
|
||||
Payload tokPayload = new Payload(new byte[]{0,1,2,3});
|
||||
tok.setPayload(tokPayload);
|
||||
int tokFlags = 77;
|
||||
tok.setFlags(tokFlags);
|
||||
|
||||
SnowballFilter filter = new SnowballFilter(
|
||||
new TokenStream() {
|
||||
public Token next() {
|
||||
public Token next(final Token reusableToken) {
|
||||
assert reusableToken != null;
|
||||
return tok;
|
||||
}
|
||||
},
|
||||
"English"
|
||||
);
|
||||
|
||||
Token newtok = filter.next();
|
||||
final Token reusableToken = new Token();
|
||||
Token nextToken = filter.next(reusableToken);
|
||||
|
||||
assertEquals("accent", newtok.termText());
|
||||
assertEquals(2, newtok.startOffset());
|
||||
assertEquals(7, newtok.endOffset());
|
||||
assertEquals("wrd", newtok.type());
|
||||
assertEquals(3, newtok.getPositionIncrement());
|
||||
assertEquals("accent", nextToken.term());
|
||||
assertEquals(2, nextToken.startOffset());
|
||||
assertEquals(7, nextToken.endOffset());
|
||||
assertEquals("wrd", nextToken.type());
|
||||
assertEquals(3, nextToken.getPositionIncrement());
|
||||
assertEquals(tokFlags, nextToken.getFlags());
|
||||
assertEquals(tokPayload, nextToken.getPayload());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
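TestSnowball above now drives the stream with one reusableToken and asserts on term() instead of termText(). A stripped-down version of that testing pattern follows, assuming JUnit 3 and a WhitespaceAnalyzer; the field name and expected terms are illustrative only.

import java.io.StringReader;

import junit.framework.TestCase;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;

public class TestReuseLoop extends TestCase {
  public void testExpectedTerms() throws Exception {
    String[] expected = {"token", "reuse"};
    TokenStream ts = new WhitespaceAnalyzer()
        .tokenStream("dummy", new StringReader("token reuse"));
    final Token reusableToken = new Token();
    for (int i = 0; i < expected.length; i++) {
      Token nextToken = ts.next(reusableToken);
      assertNotNull(nextToken);                      // stream must not end early
      assertEquals(expected[i], nextToken.term());   // compare the materialized term
    }
    assertNull(ts.next(reusableToken));              // and must end exactly here
    ts.close();
  }
}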
@@ -133,7 +133,8 @@ public class WikipediaTokenizer extends Tokenizer {
|
|||
*
|
||||
* @see org.apache.lucene.analysis.TokenStream#next()
|
||||
*/
|
||||
public Token next(Token result) throws IOException {
|
||||
public Token next(final Token reusableToken) throws IOException {
|
||||
assert reusableToken != null;
|
||||
if (tokens != null && tokens.hasNext()){
|
||||
return (Token)tokens.next();
|
||||
}
|
||||
|
@@ -144,22 +145,22 @@ public class WikipediaTokenizer extends Tokenizer {
|
|||
}
|
||||
String type = WikipediaTokenizerImpl.TOKEN_TYPES[tokenType];
|
||||
if (tokenOutput == TOKENS_ONLY || untokenizedTypes.contains(type) == false){
|
||||
setupToken(result);
|
||||
setupToken(reusableToken);
|
||||
} else if (tokenOutput == UNTOKENIZED_ONLY && untokenizedTypes.contains(type) == true){
|
||||
collapseTokens(result, tokenType);
|
||||
collapseTokens(reusableToken, tokenType);
|
||||
|
||||
}
|
||||
else if (tokenOutput == BOTH){
|
||||
//collapse into a single token, add it to tokens AND output the individual tokens
|
||||
//output the untokenized Token first
|
||||
collapseAndSaveTokens(result, tokenType, type);
|
||||
collapseAndSaveTokens(reusableToken, tokenType, type);
|
||||
}
|
||||
result.setPositionIncrement(scanner.getPositionIncrement());
|
||||
result.setType(type);
|
||||
return result;
|
||||
reusableToken.setPositionIncrement(scanner.getPositionIncrement());
|
||||
reusableToken.setType(type);
|
||||
return reusableToken;
|
||||
}
|
||||
|
||||
private void collapseAndSaveTokens(Token result, int tokenType, String type) throws IOException {
|
||||
private void collapseAndSaveTokens(final Token reusableToken, int tokenType, String type) throws IOException {
|
||||
//collapse
|
||||
StringBuffer buffer = new StringBuffer(32);
|
||||
int numAdded = scanner.setText(buffer);
|
||||
|
@@ -188,10 +189,10 @@ public class WikipediaTokenizer extends Tokenizer {
|
|||
}
|
||||
//trim the buffer
|
||||
String s = buffer.toString().trim();
|
||||
result.setTermBuffer(s.toCharArray(), 0, s.length());
|
||||
result.setStartOffset(theStart);
|
||||
result.setEndOffset(theStart + s.length());
|
||||
result.setFlags(UNTOKENIZED_TOKEN_FLAG);
|
||||
reusableToken.setTermBuffer(s.toCharArray(), 0, s.length());
|
||||
reusableToken.setStartOffset(theStart);
|
||||
reusableToken.setEndOffset(theStart + s.length());
|
||||
reusableToken.setFlags(UNTOKENIZED_TOKEN_FLAG);
|
||||
//The way the loop is written, we will have proceeded to the next token. We need to pushback the scanner to lastPos
|
||||
if (tmpTokType != WikipediaTokenizerImpl.YYEOF){
|
||||
scanner.yypushback(scanner.yylength());
|
||||
|
@@ -205,7 +206,7 @@ public class WikipediaTokenizer extends Tokenizer {
|
|||
saved.setType(type);
|
||||
}
|
||||
|
||||
private void collapseTokens(Token result, int tokenType) throws IOException {
|
||||
private void collapseTokens(final Token reusableToken, int tokenType) throws IOException {
|
||||
//collapse
|
||||
StringBuffer buffer = new StringBuffer(32);
|
||||
int numAdded = scanner.setText(buffer);
|
||||
|
@@ -227,10 +228,10 @@ public class WikipediaTokenizer extends Tokenizer {
|
|||
}
|
||||
//trim the buffer
|
||||
String s = buffer.toString().trim();
|
||||
result.setTermBuffer(s.toCharArray(), 0, s.length());
|
||||
result.setStartOffset(theStart);
|
||||
result.setEndOffset(theStart + s.length());
|
||||
result.setFlags(UNTOKENIZED_TOKEN_FLAG);
|
||||
reusableToken.setTermBuffer(s.toCharArray(), 0, s.length());
|
||||
reusableToken.setStartOffset(theStart);
|
||||
reusableToken.setEndOffset(theStart + s.length());
|
||||
reusableToken.setFlags(UNTOKENIZED_TOKEN_FLAG);
|
||||
//The way the loop is written, we will have proceeded to the next token. We need to pushback the scanner to lastPos
|
||||
if (tmpTokType != WikipediaTokenizerImpl.YYEOF){
|
||||
scanner.yypushback(scanner.yylength());
|
||||
|
@@ -239,11 +240,11 @@ public class WikipediaTokenizer extends Tokenizer {
|
|||
}
|
||||
}
|
||||
|
||||
private void setupToken(Token result) {
|
||||
scanner.getText(result);
|
||||
private void setupToken(final Token reusableToken) {
|
||||
scanner.getText(reusableToken);
|
||||
final int start = scanner.yychar();
|
||||
result.setStartOffset(start);
|
||||
result.setEndOffset(start + result.termLength());
|
||||
reusableToken.setStartOffset(start);
|
||||
reusableToken.setEndOffset(start + reusableToken.termLength());
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
|
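WikipediaTokenizer now fills the caller-supplied Token via setTermBuffer/setStartOffset/setEndOffset instead of constructing new Token objects. Below is a bare-bones tokenizer written against the same contract, as a sketch only; it assumes single-pass reading from the protected input Reader, tracks offsets itself, and the class name is made up.

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Tokenizer;

/** Minimal whitespace tokenizer that reuses the caller's Token (illustrative only). */
public final class SimpleWhitespaceTokenizer extends Tokenizer {
  private int offset = 0;   // absolute offset of the next char to read

  public SimpleWhitespaceTokenizer(Reader in) {
    super(in);
  }

  public Token next(final Token reusableToken) throws IOException {
    assert reusableToken != null;
    reusableToken.clear();
    StringBuffer sb = new StringBuffer();
    int start = -1;
    int c;
    while ((c = input.read()) != -1) {
      offset++;
      if (Character.isWhitespace((char) c)) {
        if (sb.length() > 0)
          break;              // end of the current term
      } else {
        if (sb.length() == 0)
          start = offset - 1; // first char of the term
        sb.append((char) c);
      }
    }
    if (sb.length() == 0)
      return null;            // end of stream
    String term = sb.toString();
    reusableToken.setTermBuffer(term.toCharArray(), 0, term.length());
    reusableToken.setStartOffset(start);
    reusableToken.setEndOffset(start + term.length());
    return reusableToken;
  }
}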
@@ -126,28 +126,28 @@ public class WikipediaTokenizerTest extends TestCase {
|
|||
tcm.put("3.25", "<NUM>");
|
||||
tcm.put("3.50", "<NUM>");
|
||||
WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(test));
|
||||
Token token = new Token();
|
||||
int count = 0;
|
||||
int numItalics = 0;
|
||||
int numBoldItalics = 0;
|
||||
int numCategory = 0;
|
||||
int numCitation = 0;
|
||||
while ((token = tf.next(token)) != null) {
|
||||
String tokText = token.termText();
|
||||
final Token reusableToken = new Token();
|
||||
for (Token nextToken = tf.next(reusableToken); nextToken != null; nextToken = tf.next(reusableToken)) {
|
||||
String tokText = nextToken.term();
|
||||
//System.out.println("Text: " + tokText + " Type: " + token.type());
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
||||
String expectedType = (String) tcm.get(tokText);
|
||||
assertTrue("expectedType is null and it shouldn't be for: " + token, expectedType != null);
|
||||
assertTrue(token.type() + " is not equal to " + expectedType + " for " + token, token.type().equals(expectedType) == true);
|
||||
assertTrue("expectedType is null and it shouldn't be for: " + nextToken, expectedType != null);
|
||||
assertTrue(nextToken.type() + " is not equal to " + expectedType + " for " + nextToken, nextToken.type().equals(expectedType) == true);
|
||||
count++;
|
||||
if (token.type().equals(WikipediaTokenizer.ITALICS) == true){
|
||||
if (nextToken.type().equals(WikipediaTokenizer.ITALICS) == true){
|
||||
numItalics++;
|
||||
} else if (token.type().equals(WikipediaTokenizer.BOLD_ITALICS) == true){
|
||||
} else if (nextToken.type().equals(WikipediaTokenizer.BOLD_ITALICS) == true){
|
||||
numBoldItalics++;
|
||||
} else if (token.type().equals(WikipediaTokenizer.CATEGORY) == true){
|
||||
} else if (nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true){
|
||||
numCategory++;
|
||||
}
|
||||
else if (token.type().equals(WikipediaTokenizer.CITATION) == true){
|
||||
else if (nextToken.type().equals(WikipediaTokenizer.CITATION) == true){
|
||||
numCitation++;
|
||||
}
|
||||
}
|
||||
|
@@ -166,105 +166,105 @@ public class WikipediaTokenizerTest extends TestCase {
|
|||
}
|
||||
|
||||
private void checkLinkPhrases(WikipediaTokenizer tf) throws IOException {
|
||||
Token token = new Token();
|
||||
token = tf.next(token);
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "click", new String(token.termBuffer(), 0, token.termLength()).equals("click") == true);
|
||||
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||
token = tf.next(token);
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "link", new String(token.termBuffer(), 0, token.termLength()).equals("link") == true);
|
||||
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||
token = tf.next(token);
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "here",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("here") == true);
|
||||
final Token reusableToken = new Token();
|
||||
Token nextToken = tf.next(reusableToken);
|
||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
||||
assertTrue(nextToken.term() + " is not equal to " + "click", nextToken.term().equals("click") == true);
|
||||
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
|
||||
nextToken = tf.next(reusableToken);
|
||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
||||
assertTrue(nextToken.term() + " is not equal to " + "link", nextToken.term().equals("link") == true);
|
||||
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
|
||||
nextToken = tf.next(reusableToken);
|
||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
||||
assertTrue(nextToken.term() + " is not equal to " + "here",
|
||||
nextToken.term().equals("here") == true);
|
||||
//The link, and here should be at the same position for phrases to work
|
||||
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||
token = tf.next(token);
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "again",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("again") == true);
|
||||
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
|
||||
nextToken = tf.next(reusableToken);
|
||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
||||
assertTrue(nextToken.term() + " is not equal to " + "again",
|
||||
nextToken.term().equals("again") == true);
|
||||
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
|
||||
|
||||
token = tf.next(token);
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "click",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("click") == true);
|
||||
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||
nextToken = tf.next(reusableToken);
|
||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
||||
assertTrue(nextToken.term() + " is not equal to " + "click",
|
||||
nextToken.term().equals("click") == true);
|
||||
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
|
||||
|
||||
token = tf.next(token);
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "http://lucene.apache.org",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("http://lucene.apache.org") == true);
|
||||
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||
nextToken = tf.next(reusableToken);
|
||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
||||
assertTrue(nextToken.term() + " is not equal to " + "http://lucene.apache.org",
|
||||
nextToken.term().equals("http://lucene.apache.org") == true);
|
||||
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
|
||||
|
||||
token = tf.next(token);
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "here",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("here") == true);
|
||||
assertTrue(token.getPositionIncrement() + " does not equal: " + 0, token.getPositionIncrement() == 0);
|
||||
nextToken = tf.next(reusableToken);
|
||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
||||
assertTrue(nextToken.term() + " is not equal to " + "here",
|
||||
nextToken.term().equals("here") == true);
|
||||
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 0, nextToken.getPositionIncrement() == 0);
|
||||
|
||||
token = tf.next(token);
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "again",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("again") == true);
|
||||
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||
nextToken = tf.next(reusableToken);
|
||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
||||
assertTrue(nextToken.term() + " is not equal to " + "again",
|
||||
nextToken.term().equals("again") == true);
|
||||
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
|
||||
|
||||
token = tf.next(token);
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "a",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("a") == true);
|
||||
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||
nextToken = tf.next(reusableToken);
|
||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
||||
assertTrue(nextToken.term() + " is not equal to " + "a",
|
||||
nextToken.term().equals("a") == true);
|
||||
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
|
||||
|
||||
token = tf.next(token);
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "b",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("b") == true);
|
||||
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||
nextToken = tf.next(reusableToken);
|
||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
||||
assertTrue(nextToken.term() + " is not equal to " + "b",
|
||||
nextToken.term().equals("b") == true);
|
||||
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
|
||||
|
||||
token = tf.next(token);
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "c",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("c") == true);
|
||||
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||
nextToken = tf.next(reusableToken);
|
||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
||||
assertTrue(nextToken.term() + " is not equal to " + "c",
|
||||
nextToken.term().equals("c") == true);
|
||||
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
|
||||
|
||||
token = tf.next(token);
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "d",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("d") == true);
|
||||
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||
nextToken = tf.next(reusableToken);
|
||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
||||
assertTrue(nextToken.term() + " is not equal to " + "d",
|
||||
nextToken.term().equals("d") == true);
|
||||
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
|
||||
|
||||
token = tf.next();
|
||||
assertTrue("token is not null and it should be", token == null);
|
||||
nextToken = tf.next(reusableToken);
|
||||
assertTrue("nextToken is not null and it should be", nextToken == null);
|
||||
}
|
||||
|
||||
public void testLinks() throws Exception {
|
||||
String test = "[http://lucene.apache.org/java/docs/index.html#news here] [http://lucene.apache.org/java/docs/index.html?b=c here] [https://lucene.apache.org/java/docs/index.html?b=c here]";
|
||||
WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(test));
|
||||
Token token = new Token();
|
||||
token = tf.next(token);
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "http://lucene.apache.org/java/docs/index.html#news",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("http://lucene.apache.org/java/docs/index.html#news") == true);
|
||||
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.EXTERNAL_LINK_URL, token.type().equals(WikipediaTokenizer.EXTERNAL_LINK_URL) == true);
|
||||
tf.next(token);//skip here
|
||||
token = tf.next(token);
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "http://lucene.apache.org/java/docs/index.html?b=c",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("http://lucene.apache.org/java/docs/index.html?b=c") == true);
|
||||
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.EXTERNAL_LINK_URL, token.type().equals(WikipediaTokenizer.EXTERNAL_LINK_URL) == true);
|
||||
tf.next(token);//skip here
|
||||
token = tf.next(token);
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "https://lucene.apache.org/java/docs/index.html?b=c",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("https://lucene.apache.org/java/docs/index.html?b=c") == true);
|
||||
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.EXTERNAL_LINK_URL, token.type().equals(WikipediaTokenizer.EXTERNAL_LINK_URL) == true);
|
||||
token = tf.next();
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
final Token reusableToken = new Token();
|
||||
Token nextToken = tf.next(reusableToken);
|
||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
||||
assertTrue(nextToken.term() + " is not equal to " + "http://lucene.apache.org/java/docs/index.html#news",
|
||||
nextToken.term().equals("http://lucene.apache.org/java/docs/index.html#news") == true);
|
||||
assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.EXTERNAL_LINK_URL, nextToken.type().equals(WikipediaTokenizer.EXTERNAL_LINK_URL) == true);
|
||||
tf.next(reusableToken);//skip here
|
||||
nextToken = tf.next(reusableToken);
|
||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
||||
assertTrue(nextToken.term() + " is not equal to " + "http://lucene.apache.org/java/docs/index.html?b=c",
|
||||
nextToken.term().equals("http://lucene.apache.org/java/docs/index.html?b=c") == true);
|
||||
assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.EXTERNAL_LINK_URL, nextToken.type().equals(WikipediaTokenizer.EXTERNAL_LINK_URL) == true);
|
||||
tf.next(reusableToken);//skip here
|
||||
nextToken = tf.next(reusableToken);
|
||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
||||
assertTrue(nextToken.term() + " is not equal to " + "https://lucene.apache.org/java/docs/index.html?b=c",
|
||||
nextToken.term().equals("https://lucene.apache.org/java/docs/index.html?b=c") == true);
|
||||
assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.EXTERNAL_LINK_URL, nextToken.type().equals(WikipediaTokenizer.EXTERNAL_LINK_URL) == true);
|
||||
nextToken = tf.next(reusableToken);
|
||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
||||
|
||||
token = tf.next();
|
||||
assertTrue("token is not null and it should be", token == null);
|
||||
nextToken = tf.next(reusableToken);
|
||||
assertTrue("nextToken is not null and it should be", nextToken == null);
|
||||
|
||||
}
|
||||
|
||||
|
@@ -277,72 +277,72 @@ public class WikipediaTokenizerTest extends TestCase {
|
|||
checkLinkPhrases(tf);
|
||||
String test = "[[Category:a b c d]] [[Category:e f g]] [[link here]] [[link there]] ''italics here'' something ''more italics'' [[Category:h i j]]";
|
||||
tf = new WikipediaTokenizer(new StringReader(test), WikipediaTokenizer.UNTOKENIZED_ONLY, untoks);
|
||||
Token token;
|
||||
token = tf.next();
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "a b c d",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("a b c d") == true);
|
||||
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||
assertTrue(token.startOffset() + " does not equal: " + 11, token.startOffset() == 11);
|
||||
assertTrue(token.endOffset() + " does not equal: " + 18, token.endOffset() == 18);
|
||||
token = tf.next();
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "e f g",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("e f g") == true);
|
||||
assertTrue(token.startOffset() + " does not equal: " + 32, token.startOffset() == 32);
|
||||
assertTrue(token.endOffset() + " does not equal: " + 37, token.endOffset() == 37);
|
||||
token = tf.next();
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "link",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("link") == true);
|
||||
assertTrue(token.startOffset() + " does not equal: " + 42, token.startOffset() == 42);
|
||||
assertTrue(token.endOffset() + " does not equal: " + 46, token.endOffset() == 46);
|
||||
token = tf.next();
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "here",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("here") == true);
|
||||
assertTrue(token.startOffset() + " does not equal: " + 47, token.startOffset() == 47);
|
||||
assertTrue(token.endOffset() + " does not equal: " + 51, token.endOffset() == 51);
|
||||
token = tf.next();
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "link",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("link") == true);
|
||||
assertTrue(token.startOffset() + " does not equal: " + 56, token.startOffset() == 56);
|
||||
assertTrue(token.endOffset() + " does not equal: " + 60, token.endOffset() == 60);
|
||||
token = tf.next();
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "there",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("there") == true);
|
||||
assertTrue(token.startOffset() + " does not equal: " + 61, token.startOffset() == 61);
|
||||
assertTrue(token.endOffset() + " does not equal: " + 66, token.endOffset() == 66);
|
||||
token = tf.next();
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "italics here",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("italics here") == true);
|
||||
assertTrue(token.startOffset() + " does not equal: " + 71, token.startOffset() == 71);
|
||||
assertTrue(token.endOffset() + " does not equal: " + 83, token.endOffset() == 83);
|
||||
token = tf.next();
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "something",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("something") == true);
|
||||
assertTrue(token.startOffset() + " does not equal: " + 86, token.startOffset() == 86);
|
||||
assertTrue(token.endOffset() + " does not equal: " + 95, token.endOffset() == 95);
|
||||
token = tf.next();
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "more italics",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("more italics") == true);
|
||||
assertTrue(token.startOffset() + " does not equal: " + 98, token.startOffset() == 98);
|
||||
assertTrue(token.endOffset() + " does not equal: " + 110, token.endOffset() == 110);
|
||||
final Token reusableToken = new Token();
|
||||
Token nextToken = tf.next(reusableToken);
|
||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
||||
assertTrue(nextToken.term() + " is not equal to " + "a b c d",
|
||||
nextToken.term().equals("a b c d") == true);
|
||||
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
|
||||
assertTrue(nextToken.startOffset() + " does not equal: " + 11, nextToken.startOffset() == 11);
|
||||
assertTrue(nextToken.endOffset() + " does not equal: " + 18, nextToken.endOffset() == 18);
|
||||
nextToken = tf.next(reusableToken);
|
||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
||||
assertTrue(nextToken.term() + " is not equal to " + "e f g",
|
||||
nextToken.term().equals("e f g") == true);
|
||||
assertTrue(nextToken.startOffset() + " does not equal: " + 32, nextToken.startOffset() == 32);
|
||||
assertTrue(nextToken.endOffset() + " does not equal: " + 37, nextToken.endOffset() == 37);
|
||||
nextToken = tf.next(reusableToken);
|
||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
||||
assertTrue(nextToken.term() + " is not equal to " + "link",
|
||||
nextToken.term().equals("link") == true);
|
||||
assertTrue(nextToken.startOffset() + " does not equal: " + 42, nextToken.startOffset() == 42);
|
||||
assertTrue(nextToken.endOffset() + " does not equal: " + 46, nextToken.endOffset() == 46);
|
||||
nextToken = tf.next(reusableToken);
|
||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
||||
assertTrue(nextToken.term() + " is not equal to " + "here",
|
||||
nextToken.term().equals("here") == true);
|
||||
assertTrue(nextToken.startOffset() + " does not equal: " + 47, nextToken.startOffset() == 47);
|
||||
assertTrue(nextToken.endOffset() + " does not equal: " + 51, nextToken.endOffset() == 51);
|
||||
nextToken = tf.next(reusableToken);
|
||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
||||
assertTrue(nextToken.term() + " is not equal to " + "link",
|
||||
nextToken.term().equals("link") == true);
|
||||
assertTrue(nextToken.startOffset() + " does not equal: " + 56, nextToken.startOffset() == 56);
|
||||
assertTrue(nextToken.endOffset() + " does not equal: " + 60, nextToken.endOffset() == 60);
|
||||
nextToken = tf.next(reusableToken);
|
||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
||||
assertTrue(nextToken.term() + " is not equal to " + "there",
|
||||
nextToken.term().equals("there") == true);
|
||||
assertTrue(nextToken.startOffset() + " does not equal: " + 61, nextToken.startOffset() == 61);
|
||||
assertTrue(nextToken.endOffset() + " does not equal: " + 66, nextToken.endOffset() == 66);
|
||||
nextToken = tf.next(reusableToken);
|
||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
||||
assertTrue(nextToken.term() + " is not equal to " + "italics here",
|
||||
nextToken.term().equals("italics here") == true);
|
||||
assertTrue(nextToken.startOffset() + " does not equal: " + 71, nextToken.startOffset() == 71);
|
||||
assertTrue(nextToken.endOffset() + " does not equal: " + 83, nextToken.endOffset() == 83);
|
||||
nextToken = tf.next(reusableToken);
|
||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
||||
assertTrue(nextToken.term() + " is not equal to " + "something",
|
||||
nextToken.term().equals("something") == true);
|
||||
assertTrue(nextToken.startOffset() + " does not equal: " + 86, nextToken.startOffset() == 86);
|
||||
assertTrue(nextToken.endOffset() + " does not equal: " + 95, nextToken.endOffset() == 95);
|
||||
nextToken = tf.next(reusableToken);
|
||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
||||
assertTrue(nextToken.term() + " is not equal to " + "more italics",
|
||||
nextToken.term().equals("more italics") == true);
|
||||
assertTrue(nextToken.startOffset() + " does not equal: " + 98, nextToken.startOffset() == 98);
|
||||
assertTrue(nextToken.endOffset() + " does not equal: " + 110, nextToken.endOffset() == 110);
|
||||
|
||||
token = tf.next();
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "h i j",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("h i j") == true);
|
||||
assertTrue(token.startOffset() + " does not equal: " + 124, token.startOffset() == 124);
|
||||
assertTrue(token.endOffset() + " does not equal: " + 133, token.endOffset() == 133);
|
||||
nextToken = tf.next(reusableToken);
|
||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
||||
assertTrue(nextToken.term() + " is not equal to " + "h i j",
|
||||
nextToken.term().equals("h i j") == true);
|
||||
assertTrue(nextToken.startOffset() + " does not equal: " + 124, nextToken.startOffset() == 124);
|
||||
assertTrue(nextToken.endOffset() + " does not equal: " + 133, nextToken.endOffset() == 133);
|
||||
|
||||
token = tf.next();
|
||||
assertTrue("token is not null and it should be", token == null);
|
||||
nextToken = tf.next(reusableToken);
|
||||
assertTrue("nextToken is not null and it should be", nextToken == null);
|
||||
}
|
||||
|
||||
public void testBoth() throws Exception {
|
||||
|
@@ -352,225 +352,225 @@ public class WikipediaTokenizerTest extends TestCase {
|
|||
String test = "[[Category:a b c d]] [[Category:e f g]] [[link here]] [[link there]] ''italics here'' something ''more italics'' [[Category:h i j]]";
|
||||
//should output all the individual tokens plus the untokenized tokens as well. Untokenized tokens
|
||||
WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(test), WikipediaTokenizer.BOTH, untoks);
|
||||
Token token;
|
||||
token = tf.next();
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "a b c d",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("a b c d") == true);
|
||||
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
|
||||
assertTrue(token.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, token.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
|
||||
assertTrue(token.startOffset() + " does not equal: " + 11, token.startOffset() == 11);
|
||||
assertTrue(token.endOffset() + " does not equal: " + 18, token.endOffset() == 18);
|
||||
token = tf.next();
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "a",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("a") == true);
|
||||
assertTrue(token.getPositionIncrement() + " does not equal: " + 0, token.getPositionIncrement() == 0);
|
||||
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
|
||||
assertTrue(token.getFlags() + " equals: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG + " and it shouldn't", token.getFlags() != WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
|
||||
assertTrue(token.startOffset() + " does not equal: " + 11, token.startOffset() == 11);
|
||||
assertTrue(token.endOffset() + " does not equal: " + 12, token.endOffset() == 12);
|
||||
final Token reusableToken = new Token();
|
||||
Token nextToken = tf.next(reusableToken);
|
||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
||||
assertTrue(nextToken.term() + " is not equal to " + "a b c d",
|
||||
nextToken.term().equals("a b c d") == true);
|
||||
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
|
||||
assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true);
|
||||
assertTrue(nextToken.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, nextToken.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
|
||||
assertTrue(nextToken.startOffset() + " does not equal: " + 11, nextToken.startOffset() == 11);
|
||||
assertTrue(nextToken.endOffset() + " does not equal: " + 18, nextToken.endOffset() == 18);
|
||||
nextToken = tf.next(reusableToken);
|
||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
||||
assertTrue(nextToken.term() + " is not equal to " + "a",
|
||||
nextToken.term().equals("a") == true);
|
||||
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 0, nextToken.getPositionIncrement() == 0);
|
||||
assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true);
|
||||
assertTrue(nextToken.getFlags() + " equals: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG + " and it shouldn't", nextToken.getFlags() != WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
|
||||
assertTrue(nextToken.startOffset() + " does not equal: " + 11, nextToken.startOffset() == 11);
|
||||
assertTrue(nextToken.endOffset() + " does not equal: " + 12, nextToken.endOffset() == 12);
|
||||
|
||||
token = tf.next();
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "b",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("b") == true);
|
||||
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
|
||||
assertTrue(token.startOffset() + " does not equal: " + 13, token.startOffset() == 13);
|
||||
assertTrue(token.endOffset() + " does not equal: " + 14, token.endOffset() == 14);
|
||||
nextToken = tf.next(reusableToken);
|
||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
||||
assertTrue(nextToken.term() + " is not equal to " + "b",
|
||||
nextToken.term().equals("b") == true);
|
||||
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
|
||||
assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true);
|
||||
assertTrue(nextToken.startOffset() + " does not equal: " + 13, nextToken.startOffset() == 13);
|
||||
assertTrue(nextToken.endOffset() + " does not equal: " + 14, nextToken.endOffset() == 14);
|
||||
|
||||
token = tf.next();
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "c",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("c") == true);
|
||||
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
|
||||
assertTrue(token.startOffset() + " does not equal: " + 15, token.startOffset() == 15);
|
||||
assertTrue(token.endOffset() + " does not equal: " + 16, token.endOffset() == 16);
|
||||
nextToken = tf.next(reusableToken);
|
||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
||||
assertTrue(nextToken.term() + " is not equal to " + "c",
|
||||
nextToken.term().equals("c") == true);
|
||||
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
|
||||
assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true);
|
||||
assertTrue(nextToken.startOffset() + " does not equal: " + 15, nextToken.startOffset() == 15);
|
||||
assertTrue(nextToken.endOffset() + " does not equal: " + 16, nextToken.endOffset() == 16);
|
||||
|
||||
token = tf.next();
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "d",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("d") == true);
|
||||
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
|
||||
assertTrue(token.startOffset() + " does not equal: " + 17, token.startOffset() == 17);
|
||||
assertTrue(token.endOffset() + " does not equal: " + 18, token.endOffset() == 18);
|
||||
nextToken = tf.next(reusableToken);
|
||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
||||
assertTrue(nextToken.term() + " is not equal to " + "d",
|
||||
nextToken.term().equals("d") == true);
|
||||
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
|
||||
assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true);
|
||||
assertTrue(nextToken.startOffset() + " does not equal: " + 17, nextToken.startOffset() == 17);
|
||||
assertTrue(nextToken.endOffset() + " does not equal: " + 18, nextToken.endOffset() == 18);
|
||||
|
||||
|
||||
|
||||
token = tf.next();
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "e f g",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("e f g") == true);
|
||||
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
|
||||
assertTrue(token.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, token.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
|
||||
assertTrue(token.startOffset() + " does not equal: " + 32, token.startOffset() == 32);
|
||||
assertTrue(token.endOffset() + " does not equal: " + 37, token.endOffset() == 37);
|
||||
nextToken = tf.next(reusableToken);
|
||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
||||
assertTrue(nextToken.term() + " is not equal to " + "e f g",
|
||||
nextToken.term().equals("e f g") == true);
|
||||
assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true);
|
||||
assertTrue(nextToken.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, nextToken.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
|
||||
assertTrue(nextToken.startOffset() + " does not equal: " + 32, nextToken.startOffset() == 32);
|
||||
assertTrue(nextToken.endOffset() + " does not equal: " + 37, nextToken.endOffset() == 37);
|
||||
|
||||
token = tf.next();
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "e",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("e") == true);
|
||||
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
|
||||
assertTrue(token.getPositionIncrement() + " does not equal: " + 0, token.getPositionIncrement() == 0);
|
||||
assertTrue(token.startOffset() + " does not equal: " + 32, token.startOffset() == 32);
|
||||
assertTrue(token.endOffset() + " does not equal: " + 33, token.endOffset() == 33);
|
||||
nextToken = tf.next(reusableToken);
|
||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
||||
assertTrue(nextToken.term() + " is not equal to " + "e",
|
||||
nextToken.term().equals("e") == true);
|
||||
assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true);
|
||||
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 0, nextToken.getPositionIncrement() == 0);
|
||||
assertTrue(nextToken.startOffset() + " does not equal: " + 32, nextToken.startOffset() == 32);
|
||||
assertTrue(nextToken.endOffset() + " does not equal: " + 33, nextToken.endOffset() == 33);
|
||||
|
||||
token = tf.next();
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "f",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("f") == true);
|
||||
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
|
||||
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||
assertTrue(token.startOffset() + " does not equal: " + 34, token.startOffset() == 34);
|
||||
assertTrue(token.endOffset() + " does not equal: " + 35, token.endOffset() == 35);
|
||||
nextToken = tf.next(reusableToken);
|
||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
||||
assertTrue(nextToken.term() + " is not equal to " + "f",
|
||||
nextToken.term().equals("f") == true);
|
||||
assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true);
|
||||
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
|
||||
assertTrue(nextToken.startOffset() + " does not equal: " + 34, nextToken.startOffset() == 34);
|
||||
assertTrue(nextToken.endOffset() + " does not equal: " + 35, nextToken.endOffset() == 35);
|
||||
|
||||
token = tf.next();
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "g",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("g") == true);
|
||||
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
|
||||
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||
assertTrue(token.startOffset() + " does not equal: " + 36, token.startOffset() == 36);
|
||||
assertTrue(token.endOffset() + " does not equal: " + 37, token.endOffset() == 37);
|
||||
nextToken = tf.next(reusableToken);
|
||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
||||
assertTrue(nextToken.term() + " is not equal to " + "g",
|
||||
nextToken.term().equals("g") == true);
|
||||
assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true);
|
||||
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
|
||||
assertTrue(nextToken.startOffset() + " does not equal: " + 36, nextToken.startOffset() == 36);
|
||||
assertTrue(nextToken.endOffset() + " does not equal: " + 37, nextToken.endOffset() == 37);
|
||||
|
||||
token = tf.next();
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "link",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("link") == true);
|
||||
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, token.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true);
|
||||
assertTrue(token.startOffset() + " does not equal: " + 42, token.startOffset() == 42);
|
||||
assertTrue(token.endOffset() + " does not equal: " + 46, token.endOffset() == 46);
|
||||
token = tf.next();
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "here",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("here") == true);
|
||||
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, token.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true);
|
||||
assertTrue(token.startOffset() + " does not equal: " + 47, token.startOffset() == 47);
|
||||
assertTrue(token.endOffset() + " does not equal: " + 51, token.endOffset() == 51);
|
||||
token = tf.next();
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "link",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("link") == true);
|
||||
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||
assertTrue(token.startOffset() + " does not equal: " + 56, token.startOffset() == 56);
|
||||
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, token.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true);
|
||||
assertTrue(token.endOffset() + " does not equal: " + 60, token.endOffset() == 60);
|
||||
token = tf.next();
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "there",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("there") == true);
|
||||
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, token.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true);
|
||||
assertTrue(token.startOffset() + " does not equal: " + 61, token.startOffset() == 61);
|
||||
assertTrue(token.endOffset() + " does not equal: " + 66, token.endOffset() == 66);
|
||||
token = tf.next();
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "italics here",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("italics here") == true);
|
||||
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.ITALICS, token.type().equals(WikipediaTokenizer.ITALICS) == true);
|
||||
assertTrue(token.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, token.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
|
||||
assertTrue(token.startOffset() + " does not equal: " + 71, token.startOffset() == 71);
|
||||
assertTrue(token.endOffset() + " does not equal: " + 83, token.endOffset() == 83);
|
||||
nextToken = tf.next(reusableToken);
|
||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
||||
assertTrue(nextToken.term() + " is not equal to " + "link",
|
||||
nextToken.term().equals("link") == true);
|
||||
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
|
||||
assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, nextToken.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true);
|
||||
assertTrue(nextToken.startOffset() + " does not equal: " + 42, nextToken.startOffset() == 42);
|
||||
assertTrue(nextToken.endOffset() + " does not equal: " + 46, nextToken.endOffset() == 46);
|
||||
nextToken = tf.next(reusableToken);
|
||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
||||
assertTrue(nextToken.term() + " is not equal to " + "here",
|
||||
nextToken.term().equals("here") == true);
|
||||
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
|
||||
assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, nextToken.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true);
|
||||
assertTrue(nextToken.startOffset() + " does not equal: " + 47, nextToken.startOffset() == 47);
|
||||
assertTrue(nextToken.endOffset() + " does not equal: " + 51, nextToken.endOffset() == 51);
|
||||
nextToken = tf.next(reusableToken);
|
||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
||||
assertTrue(nextToken.term() + " is not equal to " + "link",
|
||||
nextToken.term().equals("link") == true);
|
||||
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
|
||||
assertTrue(nextToken.startOffset() + " does not equal: " + 56, nextToken.startOffset() == 56);
|
||||
assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, nextToken.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true);
|
||||
assertTrue(nextToken.endOffset() + " does not equal: " + 60, nextToken.endOffset() == 60);
|
||||
nextToken = tf.next(reusableToken);
|
||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
||||
assertTrue(nextToken.term() + " is not equal to " + "there",
|
||||
nextToken.term().equals("there") == true);
|
||||
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
|
||||
assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, nextToken.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true);
|
||||
assertTrue(nextToken.startOffset() + " does not equal: " + 61, nextToken.startOffset() == 61);
|
||||
assertTrue(nextToken.endOffset() + " does not equal: " + 66, nextToken.endOffset() == 66);
|
||||
nextToken = tf.next(reusableToken);
|
||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
||||
assertTrue(nextToken.term() + " is not equal to " + "italics here",
|
||||
nextToken.term().equals("italics here") == true);
|
||||
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
|
||||
assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.ITALICS, nextToken.type().equals(WikipediaTokenizer.ITALICS) == true);
|
||||
assertTrue(nextToken.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, nextToken.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
|
||||
assertTrue(nextToken.startOffset() + " does not equal: " + 71, nextToken.startOffset() == 71);
|
||||
assertTrue(nextToken.endOffset() + " does not equal: " + 83, nextToken.endOffset() == 83);
|
||||
|
||||
token = tf.next();
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "italics",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("italics") == true);
|
||||
assertTrue(token.getPositionIncrement() + " does not equal: " + 0, token.getPositionIncrement() == 0);
|
||||
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.ITALICS, token.type().equals(WikipediaTokenizer.ITALICS) == true);
|
||||
assertTrue(token.startOffset() + " does not equal: " + 71, token.startOffset() == 71);
|
||||
assertTrue(token.endOffset() + " does not equal: " + 78, token.endOffset() == 78);
|
||||
nextToken = tf.next(reusableToken);
|
||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
||||
assertTrue(nextToken.term() + " is not equal to " + "italics",
|
||||
nextToken.term().equals("italics") == true);
|
||||
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 0, nextToken.getPositionIncrement() == 0);
|
||||
assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.ITALICS, nextToken.type().equals(WikipediaTokenizer.ITALICS) == true);
|
||||
assertTrue(nextToken.startOffset() + " does not equal: " + 71, nextToken.startOffset() == 71);
|
||||
assertTrue(nextToken.endOffset() + " does not equal: " + 78, nextToken.endOffset() == 78);
|
||||
|
||||
token = tf.next();
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "here",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("here") == true);
|
||||
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.ITALICS, token.type().equals(WikipediaTokenizer.ITALICS) == true);
|
||||
assertTrue(token.startOffset() + " does not equal: " + 79, token.startOffset() == 79);
|
||||
assertTrue(token.endOffset() + " does not equal: " + 83, token.endOffset() == 83);
|
||||
nextToken = tf.next(reusableToken);
|
||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
||||
assertTrue(nextToken.term() + " is not equal to " + "here",
|
||||
nextToken.term().equals("here") == true);
|
||||
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
|
||||
assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.ITALICS, nextToken.type().equals(WikipediaTokenizer.ITALICS) == true);
|
||||
assertTrue(nextToken.startOffset() + " does not equal: " + 79, nextToken.startOffset() == 79);
|
||||
assertTrue(nextToken.endOffset() + " does not equal: " + 83, nextToken.endOffset() == 83);
|
||||
|
||||
token = tf.next();
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "something",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("something") == true);
|
||||
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||
assertTrue(token.startOffset() + " does not equal: " + 86, token.startOffset() == 86);
|
||||
assertTrue(token.endOffset() + " does not equal: " + 95, token.endOffset() == 95);
|
||||
token = tf.next();
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "more italics",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("more italics") == true);
|
||||
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.ITALICS, token.type().equals(WikipediaTokenizer.ITALICS) == true);
|
||||
assertTrue(token.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, token.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
|
||||
assertTrue(token.startOffset() + " does not equal: " + 98, token.startOffset() == 98);
|
||||
assertTrue(token.endOffset() + " does not equal: " + 110, token.endOffset() == 110);
|
||||
nextToken = tf.next(reusableToken);
|
||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
||||
assertTrue(nextToken.term() + " is not equal to " + "something",
|
||||
nextToken.term().equals("something") == true);
|
||||
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
|
||||
assertTrue(nextToken.startOffset() + " does not equal: " + 86, nextToken.startOffset() == 86);
|
||||
assertTrue(nextToken.endOffset() + " does not equal: " + 95, nextToken.endOffset() == 95);
|
||||
nextToken = tf.next(reusableToken);
|
||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
||||
assertTrue(nextToken.term() + " is not equal to " + "more italics",
|
||||
nextToken.term().equals("more italics") == true);
|
||||
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
|
||||
assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.ITALICS, nextToken.type().equals(WikipediaTokenizer.ITALICS) == true);
|
||||
assertTrue(nextToken.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, nextToken.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
|
||||
assertTrue(nextToken.startOffset() + " does not equal: " + 98, nextToken.startOffset() == 98);
|
||||
assertTrue(nextToken.endOffset() + " does not equal: " + 110, nextToken.endOffset() == 110);
|
||||
|
||||
token = tf.next();
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "more",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("more") == true);
|
||||
assertTrue(token.getPositionIncrement() + " does not equal: " + 0, token.getPositionIncrement() == 0);
|
||||
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.ITALICS, token.type().equals(WikipediaTokenizer.ITALICS) == true);
|
||||
assertTrue(token.startOffset() + " does not equal: " + 98, token.startOffset() == 98);
|
||||
assertTrue(token.endOffset() + " does not equal: " + 102, token.endOffset() == 102);
|
||||
nextToken = tf.next(reusableToken);
|
||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
||||
assertTrue(nextToken.term() + " is not equal to " + "more",
|
||||
nextToken.term().equals("more") == true);
|
||||
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 0, nextToken.getPositionIncrement() == 0);
|
||||
assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.ITALICS, nextToken.type().equals(WikipediaTokenizer.ITALICS) == true);
|
||||
assertTrue(nextToken.startOffset() + " does not equal: " + 98, nextToken.startOffset() == 98);
|
||||
assertTrue(nextToken.endOffset() + " does not equal: " + 102, nextToken.endOffset() == 102);
|
||||
|
||||
token = tf.next();
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "italics",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("italics") == true);
|
||||
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.ITALICS, token.type().equals(WikipediaTokenizer.ITALICS) == true);
|
||||
nextToken = tf.next(reusableToken);
|
||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
||||
assertTrue(nextToken.term() + " is not equal to " + "italics",
|
||||
nextToken.term().equals("italics") == true);
|
||||
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
|
||||
assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.ITALICS, nextToken.type().equals(WikipediaTokenizer.ITALICS) == true);
|
||||
|
||||
assertTrue(token.startOffset() + " does not equal: " + 103, token.startOffset() == 103);
|
||||
assertTrue(token.endOffset() + " does not equal: " + 110, token.endOffset() == 110);
|
||||
assertTrue(nextToken.startOffset() + " does not equal: " + 103, nextToken.startOffset() == 103);
|
||||
assertTrue(nextToken.endOffset() + " does not equal: " + 110, nextToken.endOffset() == 110);
|
||||
|
||||
token = tf.next();
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "h i j",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("h i j") == true);
|
||||
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
|
||||
assertTrue(token.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, token.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
|
||||
assertTrue(token.startOffset() + " does not equal: " + 124, token.startOffset() == 124);
|
||||
assertTrue(token.endOffset() + " does not equal: " + 133, token.endOffset() == 133);
|
||||
nextToken = tf.next(reusableToken);
|
||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
||||
assertTrue(nextToken.term() + " is not equal to " + "h i j",
|
||||
nextToken.term().equals("h i j") == true);
|
||||
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
|
||||
assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true);
|
||||
assertTrue(nextToken.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, nextToken.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
|
||||
assertTrue(nextToken.startOffset() + " does not equal: " + 124, nextToken.startOffset() == 124);
|
||||
assertTrue(nextToken.endOffset() + " does not equal: " + 133, nextToken.endOffset() == 133);
|
||||
|
||||
token = tf.next();
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "h",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("h") == true);
|
||||
assertTrue(token.getPositionIncrement() + " does not equal: " + 0, token.getPositionIncrement() == 0);
|
||||
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
|
||||
assertTrue(token.startOffset() + " does not equal: " + 124, token.startOffset() == 124);
|
||||
assertTrue(token.endOffset() + " does not equal: " + 125, token.endOffset() == 125);
|
||||
nextToken = tf.next(reusableToken);
|
||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
||||
assertTrue(nextToken.term() + " is not equal to " + "h",
|
||||
nextToken.term().equals("h") == true);
|
||||
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 0, nextToken.getPositionIncrement() == 0);
|
||||
assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true);
|
||||
assertTrue(nextToken.startOffset() + " does not equal: " + 124, nextToken.startOffset() == 124);
|
||||
assertTrue(nextToken.endOffset() + " does not equal: " + 125, nextToken.endOffset() == 125);
|
||||
|
||||
token = tf.next();
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "i",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("i") == true);
|
||||
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
|
||||
assertTrue(token.startOffset() + " does not equal: " + 128, token.startOffset() == 128);
|
||||
assertTrue(token.endOffset() + " does not equal: " + 129, token.endOffset() == 129);
|
||||
token = tf.next();
|
||||
assertTrue("token is null and it shouldn't be", token != null);
|
||||
assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "j",
|
||||
new String(token.termBuffer(), 0, token.termLength()).equals("j") == true);
|
||||
assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
|
||||
assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
|
||||
assertTrue(token.startOffset() + " does not equal: " + 132, token.startOffset() == 132);
|
||||
assertTrue(token.endOffset() + " does not equal: " + 133, token.endOffset() == 133);
|
||||
nextToken = tf.next(reusableToken);
|
||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
||||
assertTrue(nextToken.term() + " is not equal to " + "i",
|
||||
nextToken.term().equals("i") == true);
|
||||
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
|
||||
assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true);
|
||||
assertTrue(nextToken.startOffset() + " does not equal: " + 128, nextToken.startOffset() == 128);
|
||||
assertTrue(nextToken.endOffset() + " does not equal: " + 129, nextToken.endOffset() == 129);
|
||||
nextToken = tf.next(reusableToken);
|
||||
assertTrue("nextToken is null and it shouldn't be", nextToken != null);
|
||||
assertTrue(nextToken.term() + " is not equal to " + "j",
|
||||
nextToken.term().equals("j") == true);
|
||||
assertTrue(nextToken.getPositionIncrement() + " does not equal: " + 1, nextToken.getPositionIncrement() == 1);
|
||||
assertTrue(nextToken.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, nextToken.type().equals(WikipediaTokenizer.CATEGORY) == true);
|
||||
assertTrue(nextToken.startOffset() + " does not equal: " + 132, nextToken.startOffset() == 132);
|
||||
assertTrue(nextToken.endOffset() + " does not equal: " + 133, nextToken.endOffset() == 133);
|
||||
|
||||
token = tf.next();
|
||||
assertTrue("token is not null and it should be", token == null);
|
||||
nextToken = tf.next(reusableToken);
|
||||
assertTrue("nextToken is not null and it should be", nextToken == null);
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
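Every expected token in the test above is verified with the same four or five assertions, first in the old termBuffer()-based form and then in the new term()-based form. If the test were reworked further, the repetition could be collapsed into a small helper; the sketch below is purely illustrative (the class and method names are invented, and it assumes the JUnit 3 TestCase API the test already uses), not part of this commit.

    import java.io.IOException;
    import junit.framework.TestCase;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;

    // Hypothetical helper: pulls the next token into the shared reusable Token
    // and checks term text, offsets and type in one call.
    public abstract class TokenAssertTestCase extends TestCase {
      protected Token assertNextToken(TokenStream ts, Token reusableToken,
                                      String term, int start, int end, String type)
          throws IOException {
        Token nextToken = ts.next(reusableToken);
        assertNotNull("token is null and it shouldn't be", nextToken);
        assertEquals(term, nextToken.term());   // term() replaces new String(termBuffer(), 0, termLength())
        assertEquals(start, nextToken.startOffset());
        assertEquals(end, nextToken.endOffset());
        assertEquals(type, nextToken.type());
        return nextToken;
      }
    }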
@ -17,14 +17,28 @@ package org.apache.lucene.wordnet;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.store.*;
|
||||
import org.apache.lucene.search.*;
|
||||
import org.apache.lucene.index.*;
|
||||
import org.apache.lucene.document.*;
|
||||
import org.apache.lucene.analysis.*;
|
||||
import org.apache.lucene.analysis.standard.*;
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.BooleanClause;
|
||||
import org.apache.lucene.search.BooleanQuery;
|
||||
import org.apache.lucene.search.Hits;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.Searcher;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.store.FSDirectory;
|
||||
|
||||
|
||||
/**
|
||||
|
@ -99,10 +113,10 @@ public final class SynExpand {
|
|||
|
||||
// [1] Parse query into separate words so that when we expand we can avoid dups
|
||||
TokenStream ts = a.tokenStream( field, new StringReader( query));
|
||||
org.apache.lucene.analysis.Token t;
|
||||
while ( (t = ts.next()) != null)
|
||||
{
|
||||
String word = t.termText();
|
||||
|
||||
final Token reusableToken = new Token();
|
||||
for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) {
|
||||
String word = nextToken.term();
|
||||
if ( already.add( word))
|
||||
top.add( word);
|
||||
}
|
||||
|
|
|
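The SynExpand change above is the canonical consumer-side cutover: instead of while ((t = ts.next()) != null), one Token is allocated up front and passed into every next(Token) call. A self-contained sketch of that idiom, with an arbitrary analyzer and field name chosen only for illustration:

    import java.io.IOException;
    import java.io.StringReader;
    import java.util.HashSet;
    import java.util.Set;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;

    public class ReuseLoopExample {
      public static Set collectTerms(String field, String text) throws IOException {
        Set words = new HashSet();
        TokenStream ts = new StandardAnalyzer().tokenStream(field, new StringReader(text));
        final Token reusableToken = new Token();
        // The same Token instance is refilled on every call; copy out what you need.
        for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) {
          words.add(nextToken.term());
        }
        return words;
      }
    }

SynLookup, rewritten the same way in the next hunk, differs only in what it does with each word.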
@ -17,13 +17,27 @@ package org.apache.lucene.wordnet;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.store.*;
|
||||
import org.apache.lucene.search.*;
|
||||
import org.apache.lucene.index.*;
|
||||
import org.apache.lucene.document.*;
|
||||
import org.apache.lucene.analysis.*;
|
||||
import java.io.*;
|
||||
import java.util.*;
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.document.Document;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.BooleanClause;
|
||||
import org.apache.lucene.search.BooleanQuery;
|
||||
import org.apache.lucene.search.Hits;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.Searcher;
|
||||
import org.apache.lucene.search.TermQuery;
|
||||
import org.apache.lucene.store.FSDirectory;
|
||||
|
||||
|
||||
/**
|
||||
|
@ -86,10 +100,9 @@ public class SynLookup {
|
|||
|
||||
// [1] Parse query into separate words so that when we expand we can avoid dups
|
||||
TokenStream ts = a.tokenStream( field, new StringReader( query));
|
||||
org.apache.lucene.analysis.Token t;
|
||||
while ( (t = ts.next()) != null)
|
||||
{
|
||||
String word = t.termText();
|
||||
final Token reusableToken = new Token();
|
||||
for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) {
|
||||
String word = nextToken.term();
|
||||
if ( already.add( word))
|
||||
top.add( word);
|
||||
}
|
||||
|
|
|
@ -74,16 +74,14 @@ public class LikeThisQueryBuilder implements QueryBuilder {
|
|||
if((stopWords!=null)&&(fields!=null))
|
||||
{
|
||||
stopWordsSet=new HashSet();
|
||||
final Token reusableToken = new Token();
|
||||
for (int i = 0; i < fields.length; i++)
|
||||
{
|
||||
TokenStream ts = analyzer.tokenStream(fields[i],new StringReader(stopWords));
|
||||
try
|
||||
{
|
||||
Token stopToken=ts.next();
|
||||
while(stopToken!=null)
|
||||
{
|
||||
stopWordsSet.add(stopToken.termText());
|
||||
stopToken=ts.next();
|
||||
for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) {
|
||||
stopWordsSet.add(nextToken.term());
|
||||
}
|
||||
}
|
||||
catch(IOException ioe)
|
||||
|
|
|
@ -52,12 +52,10 @@ public class SpanOrTermsBuilder extends SpanBuilderBase
|
|||
{
|
||||
ArrayList clausesList=new ArrayList();
|
||||
TokenStream ts=analyzer.tokenStream(fieldName,new StringReader(value));
|
||||
Token token=ts.next();
|
||||
while(token!=null)
|
||||
{
|
||||
SpanTermQuery stq=new SpanTermQuery(new Term(fieldName,token.termText()));
|
||||
final Token reusableToken = new Token();
|
||||
for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) {
|
||||
SpanTermQuery stq=new SpanTermQuery(new Term(fieldName,nextToken.term()));
|
||||
clausesList.add(stq);
|
||||
token=ts.next();
|
||||
}
|
||||
SpanOrQuery soq=new SpanOrQuery((SpanQuery[]) clausesList.toArray(new SpanQuery[clausesList.size()]));
|
||||
soq.setBoost(DOMUtils.getAttribute(e,"boost",1.0f));
|
||||
|
|
|
@ -59,20 +59,18 @@ public class TermsFilterBuilder implements FilterBuilder
|
|||
|
||||
try
|
||||
{
|
||||
Token token = ts.next();
|
||||
final Token reusableToken = new Token();
|
||||
Term term = null;
|
||||
while (token != null)
|
||||
{
|
||||
for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) {
|
||||
if (term == null)
|
||||
{
|
||||
term = new Term(fieldName, token.termText());
|
||||
term = new Term(fieldName, nextToken.term());
|
||||
} else
|
||||
{
|
||||
// create from previous to save fieldName.intern overhead
|
||||
term = term.createTerm(token.termText());
|
||||
term = term.createTerm(nextToken.term());
|
||||
}
|
||||
tf.addTerm(term);
|
||||
token = ts.next();
|
||||
}
|
||||
}
|
||||
catch (IOException ioe)
|
||||
|
|
|
@ -58,20 +58,18 @@ public class TermsQueryBuilder implements QueryBuilder {
|
|||
TokenStream ts = analyzer.tokenStream(fieldName, new StringReader(text));
|
||||
try
|
||||
{
|
||||
Token token = ts.next();
|
||||
final Token reusableToken = new Token();
|
||||
Term term = null;
|
||||
while (token != null)
|
||||
{
|
||||
for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) {
|
||||
if (term == null)
|
||||
{
|
||||
term = new Term(fieldName, token.termText());
|
||||
term = new Term(fieldName, nextToken.term());
|
||||
} else
|
||||
{
|
||||
// create from previous to save fieldName.intern overhead
|
||||
term = term.createTerm(token.termText());
|
||||
term = term.createTerm(nextToken.term());
|
||||
}
|
||||
bq.add(new BooleanClause(new TermQuery(term),BooleanClause.Occur.SHOULD));
|
||||
token = ts.next();
|
||||
}
|
||||
}
|
||||
catch (IOException ioe)
|
||||
|
|
|
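Both TermsFilterBuilder and TermsQueryBuilder pair the reuse loop with an existing micro-optimization worth noting: the first Term is built with new Term(fieldName, ...), and every later one comes from term.createTerm(...), which skips re-interning the field name. A hedged, standalone sketch of that pattern (class and method names are assumptions, not code from the patch):

    import java.io.IOException;
    import java.io.StringReader;
    import java.util.ArrayList;
    import java.util.List;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.index.Term;

    public class TermListExample {
      // Builds one Term per token, paying the fieldName.intern() cost only once.
      public static List toTerms(String fieldName, String text) throws IOException {
        List terms = new ArrayList();
        TokenStream ts = new StandardAnalyzer().tokenStream(fieldName, new StringReader(text));
        final Token reusableToken = new Token();
        Term term = null;
        for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) {
          if (term == null) {
            term = new Term(fieldName, nextToken.term());
          } else {
            // create from previous to save fieldName.intern overhead
            term = term.createTerm(nextToken.term());
          }
          terms.add(term);
        }
        return terms;
      }
    }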
@ -487,7 +487,10 @@ null)
|
|||
private int jj_gc = 0;
|
||||
|
||||
public HTMLParser(java.io.InputStream stream) {
|
||||
jj_input_stream = new SimpleCharStream(stream, 1, 1);
|
||||
this(stream, null);
|
||||
}
|
||||
public HTMLParser(java.io.InputStream stream, String encoding) {
|
||||
try { jj_input_stream = new SimpleCharStream(stream, encoding, 1, 1); } catch(java.io.UnsupportedEncodingException e) { throw new RuntimeException(e); }
|
||||
token_source = new HTMLParserTokenManager(jj_input_stream);
|
||||
token = new Token();
|
||||
jj_ntk = -1;
|
||||
|
@ -497,7 +500,10 @@ null)
|
|||
}
|
||||
|
||||
public void ReInit(java.io.InputStream stream) {
|
||||
jj_input_stream.ReInit(stream, 1, 1);
|
||||
ReInit(stream, null);
|
||||
}
|
||||
public void ReInit(java.io.InputStream stream, String encoding) {
|
||||
try { jj_input_stream.ReInit(stream, encoding, 1, 1); } catch(java.io.UnsupportedEncodingException e) { throw new RuntimeException(e); }
|
||||
token_source.ReInit(jj_input_stream);
|
||||
token = new Token();
|
||||
jj_ntk = -1;
|
||||
|
@ -627,7 +633,9 @@ null)
|
|||
jj_lasttokens[jj_endpos++] = kind;
|
||||
} else if (jj_endpos != 0) {
|
||||
jj_expentry = new int[jj_endpos];
|
||||
System.arraycopy(jj_lasttokens, 0, jj_expentry, 0, jj_endpos);
|
||||
for (int i = 0; i < jj_endpos; i++) {
|
||||
jj_expentry[i] = jj_lasttokens[i];
|
||||
}
|
||||
boolean exists = false;
|
||||
for (java.util.Enumeration e = jj_expentries.elements(); e.hasMoreElements();) {
|
||||
int[] oldentry = (int[])(e.nextElement());
|
||||
|
@ -692,6 +700,7 @@ null)
|
|||
final private void jj_rescan_token() {
|
||||
jj_rescan = true;
|
||||
for (int i = 0; i < 2; i++) {
|
||||
try {
|
||||
JJCalls p = jj_2_rtns[i];
|
||||
do {
|
||||
if (p.gen > jj_gen) {
|
||||
|
@ -703,6 +712,7 @@ null)
|
|||
}
|
||||
p = p.next;
|
||||
} while (p != null);
|
||||
} catch(LookaheadSuccess ls) { }
|
||||
}
|
||||
jj_rescan = false;
|
||||
}
|
||||
|
|
|
@ -1457,14 +1457,12 @@ protected SimpleCharStream input_stream;
|
|||
private final int[] jjrounds = new int[28];
|
||||
private final int[] jjstateSet = new int[56];
|
||||
protected char curChar;
|
||||
public HTMLParserTokenManager(SimpleCharStream stream)
|
||||
{
|
||||
public HTMLParserTokenManager(SimpleCharStream stream){
|
||||
if (SimpleCharStream.staticFlag)
|
||||
throw new Error("ERROR: Cannot use a static CharStream class with a non-static lexical analyzer.");
|
||||
input_stream = stream;
|
||||
}
|
||||
public HTMLParserTokenManager(SimpleCharStream stream, int lexState)
|
||||
{
|
||||
public HTMLParserTokenManager(SimpleCharStream stream, int lexState){
|
||||
this(stream);
|
||||
SwitchTo(lexState);
|
||||
}
|
||||
|
|
|
@ -98,19 +98,19 @@ public class ParseException extends Exception {
|
|||
if (!specialConstructor) {
|
||||
return super.getMessage();
|
||||
}
|
||||
String expected = "";
|
||||
StringBuffer expected = new StringBuffer();
|
||||
int maxSize = 0;
|
||||
for (int i = 0; i < expectedTokenSequences.length; i++) {
|
||||
if (maxSize < expectedTokenSequences[i].length) {
|
||||
maxSize = expectedTokenSequences[i].length;
|
||||
}
|
||||
for (int j = 0; j < expectedTokenSequences[i].length; j++) {
|
||||
expected += tokenImage[expectedTokenSequences[i][j]] + " ";
|
||||
expected.append(tokenImage[expectedTokenSequences[i][j]]).append(" ");
|
||||
}
|
||||
if (expectedTokenSequences[i][expectedTokenSequences[i].length - 1] != 0) {
|
||||
expected += "...";
|
||||
expected.append("...");
|
||||
}
|
||||
expected += eol + " ";
|
||||
expected.append(eol).append(" ");
|
||||
}
|
||||
String retval = "Encountered \"";
|
||||
Token tok = currentToken.next;
|
||||
|
@ -130,7 +130,7 @@ public class ParseException extends Exception {
|
|||
} else {
|
||||
retval += "Was expecting one of:" + eol + " ";
|
||||
}
|
||||
retval += expected;
|
||||
retval += expected.toString();
|
||||
return retval;
|
||||
}
|
||||
|
||||
|
@ -179,7 +179,7 @@ public class ParseException extends Exception {
|
|||
default:
|
||||
if ((ch = str.charAt(i)) < 0x20 || ch > 0x7e) {
|
||||
String s = "0000" + Integer.toString(ch, 16);
|
||||
retval.append("\\u").append(s.substring(s.length() - 4, s.length()));
|
||||
retval.append("\\u" + s.substring(s.length() - 4, s.length()));
|
||||
} else {
|
||||
retval.append(ch);
|
||||
}
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
/* Generated By:JavaCC: Do not edit this line. SimpleCharStream.java Version 3.0 */
|
||||
/* Generated By:JavaCC: Do not edit this line. SimpleCharStream.java Version 4.0 */
|
||||
package org.apache.lucene.demo.html;
|
||||
|
||||
/**
|
||||
|
@ -27,6 +27,11 @@ public class SimpleCharStream
|
|||
protected char[] buffer;
|
||||
protected int maxNextCharInd = 0;
|
||||
protected int inBuf = 0;
|
||||
protected int tabSize = 8;
|
||||
|
||||
protected void setTabSize(int i) { tabSize = i; }
|
||||
protected int getTabSize(int i) { return tabSize; }
|
||||
|
||||
|
||||
protected void ExpandBuff(boolean wrapAround)
|
||||
{
|
||||
|
@ -162,7 +167,7 @@ public class SimpleCharStream
|
|||
break;
|
||||
case '\t' :
|
||||
column--;
|
||||
column += (8 - (column & 07));
|
||||
column += (tabSize - (column % tabSize));
|
||||
break;
|
||||
default :
|
||||
break;
|
||||
|
@ -248,7 +253,7 @@ public class SimpleCharStream
|
|||
}
|
||||
|
||||
public SimpleCharStream(java.io.Reader dstream, int startline,
|
||||
int startcolumn)
|
||||
int startcolumn)
|
||||
{
|
||||
this(dstream, startline, startcolumn, 4096);
|
||||
}
|
||||
|
@ -277,7 +282,7 @@ public class SimpleCharStream
|
|||
}
|
||||
|
||||
public void ReInit(java.io.Reader dstream, int startline,
|
||||
int startcolumn)
|
||||
int startcolumn)
|
||||
{
|
||||
ReInit(dstream, startline, startcolumn, 4096);
|
||||
}
|
||||
|
@ -286,35 +291,68 @@ public class SimpleCharStream
|
|||
{
|
||||
ReInit(dstream, 1, 1, 4096);
|
||||
}
|
||||
public SimpleCharStream(java.io.InputStream dstream, int startline,
|
||||
int startcolumn, int buffersize)
|
||||
public SimpleCharStream(java.io.InputStream dstream, String encoding, int startline,
|
||||
int startcolumn, int buffersize) throws java.io.UnsupportedEncodingException
|
||||
{
|
||||
this(new java.io.InputStreamReader(dstream), startline, startcolumn, 4096);
|
||||
this(encoding == null ? new java.io.InputStreamReader(dstream) : new java.io.InputStreamReader(dstream, encoding), startline, startcolumn, buffersize);
|
||||
}
|
||||
|
||||
public SimpleCharStream(java.io.InputStream dstream, int startline,
|
||||
int startcolumn)
|
||||
int startcolumn, int buffersize)
|
||||
{
|
||||
this(new java.io.InputStreamReader(dstream), startline, startcolumn, buffersize);
|
||||
}
|
||||
|
||||
public SimpleCharStream(java.io.InputStream dstream, String encoding, int startline,
|
||||
int startcolumn) throws java.io.UnsupportedEncodingException
|
||||
{
|
||||
this(dstream, encoding, startline, startcolumn, 4096);
|
||||
}
|
||||
|
||||
public SimpleCharStream(java.io.InputStream dstream, int startline,
|
||||
int startcolumn)
|
||||
{
|
||||
this(dstream, startline, startcolumn, 4096);
|
||||
}
|
||||
|
||||
public SimpleCharStream(java.io.InputStream dstream, String encoding) throws java.io.UnsupportedEncodingException
|
||||
{
|
||||
this(dstream, encoding, 1, 1, 4096);
|
||||
}
|
||||
|
||||
public SimpleCharStream(java.io.InputStream dstream)
|
||||
{
|
||||
this(dstream, 1, 1, 4096);
|
||||
}
|
||||
|
||||
public void ReInit(java.io.InputStream dstream, String encoding, int startline,
|
||||
int startcolumn, int buffersize) throws java.io.UnsupportedEncodingException
|
||||
{
|
||||
ReInit(encoding == null ? new java.io.InputStreamReader(dstream) : new java.io.InputStreamReader(dstream, encoding), startline, startcolumn, buffersize);
|
||||
}
|
||||
|
||||
public void ReInit(java.io.InputStream dstream, int startline,
|
||||
int startcolumn, int buffersize)
|
||||
{
|
||||
ReInit(new java.io.InputStreamReader(dstream), startline, startcolumn, 4096);
|
||||
ReInit(new java.io.InputStreamReader(dstream), startline, startcolumn, buffersize);
|
||||
}
|
||||
|
||||
public void ReInit(java.io.InputStream dstream, String encoding) throws java.io.UnsupportedEncodingException
|
||||
{
|
||||
ReInit(dstream, encoding, 1, 1, 4096);
|
||||
}
|
||||
|
||||
public void ReInit(java.io.InputStream dstream)
|
||||
{
|
||||
ReInit(dstream, 1, 1, 4096);
|
||||
}
|
||||
public void ReInit(java.io.InputStream dstream, String encoding, int startline,
|
||||
int startcolumn) throws java.io.UnsupportedEncodingException
|
||||
{
|
||||
ReInit(dstream, encoding, startline, startcolumn, 4096);
|
||||
}
|
||||
public void ReInit(java.io.InputStream dstream, int startline,
|
||||
int startcolumn)
|
||||
int startcolumn)
|
||||
{
|
||||
ReInit(dstream, startline, startcolumn, 4096);
|
||||
}
|
||||
|
|
|
@ -72,7 +72,7 @@ public class TokenMgrError extends Error
|
|||
default:
|
||||
if ((ch = str.charAt(i)) < 0x20 || ch > 0x7e) {
|
||||
String s = "0000" + Integer.toString(ch, 16);
|
||||
retval.append("\\u").append(s.substring(s.length() - 4, s.length()));
|
||||
retval.append("\\u" + s.substring(s.length() - 4, s.length()));
|
||||
} else {
|
||||
retval.append(ch);
|
||||
}
|
||||
|
|
|
@ -40,11 +40,12 @@ public class CachingTokenFilter extends TokenFilter {
|
|||
super(input);
|
||||
}
|
||||
|
||||
public Token next() throws IOException {
|
||||
public Token next(final Token reusableToken) throws IOException {
|
||||
assert reusableToken != null;
|
||||
if (cache == null) {
|
||||
// fill cache lazily
|
||||
cache = new LinkedList();
|
||||
fillCache();
|
||||
fillCache(reusableToken);
|
||||
iterator = cache.iterator();
|
||||
}
|
||||
|
||||
|
@ -52,8 +53,9 @@ public class CachingTokenFilter extends TokenFilter {
|
|||
// the cache is exhausted, return null
|
||||
return null;
|
||||
}
|
||||
|
||||
return (Token) iterator.next();
|
||||
// Since the TokenFilter can be reset, the tokens need to be preserved as immutable.
|
||||
Token nextToken = (Token) iterator.next();
|
||||
return (Token) nextToken.clone();
|
||||
}
|
||||
|
||||
public void reset() throws IOException {
|
||||
|
@ -62,10 +64,9 @@ public class CachingTokenFilter extends TokenFilter {
|
|||
}
|
||||
}
|
||||
|
||||
private void fillCache() throws IOException {
|
||||
Token token;
|
||||
while ( (token = input.next()) != null) {
|
||||
cache.add(token);
|
||||
private void fillCache(final Token reusableToken) throws IOException {
|
||||
for (Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken)) {
|
||||
cache.add(nextToken.clone());
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
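CachingTokenFilter now clones in both directions: fillCache(reusableToken) copies each token before storing it, because the upstream producer will overwrite the same instance on the next call, and next(reusableToken) hands back a clone of the cached entry so callers cannot corrupt the cache before a reset(). A stripped-down sketch of that shape (the class name and fields here are illustrative, not the committed code):

    import java.io.IOException;
    import java.util.Iterator;
    import java.util.LinkedList;
    import java.util.List;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;

    public class SimpleCachingFilter extends TokenFilter {
      private List cache;
      private Iterator iterator;

      public SimpleCachingFilter(TokenStream input) {
        super(input);
      }

      public Token next(final Token reusableToken) throws IOException {
        assert reusableToken != null;
        if (cache == null) {
          cache = new LinkedList();
          // Clone into the cache: the producer reuses reusableToken on every call.
          for (Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken)) {
            cache.add(nextToken.clone());
          }
          iterator = cache.iterator();
        }
        if (!iterator.hasNext()) {
          return null;
        }
        // Clone out of the cache so replays after reset() see unmodified tokens.
        Token cachedToken = (Token) iterator.next();
        return (Token) cachedToken.clone();
      }

      public void reset() throws IOException {
        if (cache != null) {
          iterator = cache.iterator();
        }
      }
    }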
@ -44,11 +44,12 @@ public abstract class CharTokenizer extends Tokenizer {
|
|||
return c;
|
||||
}
|
||||
|
||||
public final Token next(Token token) throws IOException {
|
||||
token.clear();
|
||||
public final Token next(final Token reusableToken) throws IOException {
|
||||
assert reusableToken != null;
|
||||
reusableToken.clear();
|
||||
int length = 0;
|
||||
int start = bufferIndex;
|
||||
char[] buffer = token.termBuffer();
|
||||
char[] buffer = reusableToken.termBuffer();
|
||||
while (true) {
|
||||
|
||||
if (bufferIndex >= dataLen) {
|
||||
|
@ -70,7 +71,7 @@ public abstract class CharTokenizer extends Tokenizer {
|
|||
if (length == 0) // start of token
|
||||
start = offset + bufferIndex - 1;
|
||||
else if (length == buffer.length)
|
||||
buffer = token.resizeTermBuffer(1+length);
|
||||
buffer = reusableToken.resizeTermBuffer(1+length);
|
||||
|
||||
buffer[length++] = normalize(c); // buffer it, normalized
|
||||
|
||||
|
@ -81,10 +82,10 @@ public abstract class CharTokenizer extends Tokenizer {
|
|||
break; // return 'em
|
||||
}
|
||||
|
||||
token.termLength = length;
|
||||
token.startOffset = start;
|
||||
token.endOffset = start+length;
|
||||
return token;
|
||||
reusableToken.setTermLength(length);
|
||||
reusableToken.setStartOffset(start);
|
||||
reusableToken.setEndOffset(start+length);
|
||||
return reusableToken;
|
||||
}
|
||||
|
||||
public void reset(Reader input) throws IOException {
|
||||
|
|
|
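CharTokenizer above is the reference for the producer side of the contract: clear() the passed-in token, write characters into its term buffer (growing it with resizeTermBuffer when full), then set the length and offsets and return the same instance. A minimal whitespace tokenizer written directly against that contract might look like the following; the class name and the simplistic splitting are assumptions for illustration only:

    import java.io.IOException;
    import java.io.Reader;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.Tokenizer;

    public class SimpleWhitespaceTokenizer extends Tokenizer {
      private int offset = 0;

      public SimpleWhitespaceTokenizer(Reader input) {
        super(input);
      }

      public Token next(final Token reusableToken) throws IOException {
        assert reusableToken != null;
        reusableToken.clear();
        char[] buffer = reusableToken.termBuffer();
        int length = 0;
        int start = -1;
        int c;
        while ((c = input.read()) != -1) {
          offset++;
          if (!Character.isWhitespace((char) c)) {
            if (length == 0)
              start = offset - 1;                          // token starts here
            else if (length == buffer.length)
              buffer = reusableToken.resizeTermBuffer(1 + length);
            buffer[length++] = (char) c;                   // buffer the character
          } else if (length > 0) {
            break;                                         // end of current token
          }
        }
        if (length == 0)
          return null;                                     // end of stream
        reusableToken.setTermLength(length);
        reusableToken.setStartOffset(start);
        reusableToken.setEndOffset(start + length);
        return reusableToken;
      }
    }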
@ -32,22 +32,23 @@ public class ISOLatin1AccentFilter extends TokenFilter {
|
|||
private char[] output = new char[256];
|
||||
private int outputPos;
|
||||
|
||||
public final Token next(Token result) throws java.io.IOException {
|
||||
result = input.next(result);
|
||||
if (result != null) {
|
||||
final char[] buffer = result.termBuffer();
|
||||
final int length = result.termLength();
|
||||
public final Token next(final Token reusableToken) throws java.io.IOException {
|
||||
assert reusableToken != null;
|
||||
Token nextToken = input.next(reusableToken);
|
||||
if (nextToken != null) {
|
||||
final char[] buffer = nextToken.termBuffer();
|
||||
final int length = nextToken.termLength();
|
||||
// If no characters actually require rewriting then we
|
||||
// just return token as-is:
|
||||
for(int i=0;i<length;i++) {
|
||||
final char c = buffer[i];
|
||||
if (c >= '\u00c0' && c <= '\uFB06') {
|
||||
removeAccents(buffer, length);
|
||||
result.setTermBuffer(output, 0, outputPos);
|
||||
nextToken.setTermBuffer(output, 0, outputPos);
|
||||
break;
|
||||
}
|
||||
}
|
||||
return result;
|
||||
return nextToken;
|
||||
} else
|
||||
return null;
|
||||
}
|
||||
|
|
|
@ -38,21 +38,22 @@ public class KeywordTokenizer extends Tokenizer {
|
|||
this.done = false;
|
||||
}
|
||||
|
||||
public Token next(Token result) throws IOException {
|
||||
public Token next(final Token reusableToken) throws IOException {
|
||||
assert reusableToken != null;
|
||||
if (!done) {
|
||||
done = true;
|
||||
int upto = 0;
|
||||
result.clear();
|
||||
char[] buffer = result.termBuffer();
|
||||
reusableToken.clear();
|
||||
char[] buffer = reusableToken.termBuffer();
|
||||
while (true) {
|
||||
final int length = input.read(buffer, upto, buffer.length-upto);
|
||||
if (length == -1) break;
|
||||
upto += length;
|
||||
if (upto == buffer.length)
|
||||
buffer = result.resizeTermBuffer(1+buffer.length);
|
||||
buffer = reusableToken.resizeTermBuffer(1+buffer.length);
|
||||
}
|
||||
result.termLength = upto;
|
||||
return result;
|
||||
reusableToken.setTermLength(upto);
|
||||
return reusableToken;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
|
|
@ -42,16 +42,17 @@ public final class LengthFilter extends TokenFilter {
|
|||
}
|
||||
|
||||
/**
|
||||
* Returns the next input Token whose termText() is the right len
|
||||
* Returns the next input Token whose term() is the right len
|
||||
*/
|
||||
public final Token next(Token result) throws IOException
|
||||
public final Token next(final Token reusableToken) throws IOException
|
||||
{
|
||||
assert reusableToken != null;
|
||||
// return the first non-stop word found
|
||||
for (Token token = input.next(result); token != null; token = input.next(result))
|
||||
for (Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken))
|
||||
{
|
||||
int len = token.termLength();
|
||||
int len = nextToken.termLength();
|
||||
if (len >= min && len <= max) {
|
||||
return token;
|
||||
return nextToken;
|
||||
}
|
||||
// note: else we ignore it but should we index each part of it?
|
||||
}
|
||||
|
|
|
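LengthFilter shows a filter that may consume several input tokens for each one it emits: it keeps refilling the same reusable Token from input.next(reusableToken) until one qualifies, then returns that instance untouched. The same shape works for any predicate; below is an assumed minimum-length-only variant, named and simplified for illustration:

    import java.io.IOException;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;

    public final class MinLengthFilter extends TokenFilter {
      private final int minLength;

      public MinLengthFilter(TokenStream in, int minLength) {
        super(in);
        this.minLength = minLength;
      }

      public final Token next(final Token reusableToken) throws IOException {
        assert reusableToken != null;
        // Keep asking the input for another token until one is long enough.
        for (Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken)) {
          if (nextToken.termLength() >= minLength) {
            return nextToken;
          }
        }
        return null;
      }
    }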
@ -29,16 +29,17 @@ public final class LowerCaseFilter extends TokenFilter {
|
|||
super(in);
|
||||
}
|
||||
|
||||
public final Token next(Token result) throws IOException {
|
||||
result = input.next(result);
|
||||
if (result != null) {
|
||||
public final Token next(final Token reusableToken) throws IOException {
|
||||
assert reusableToken != null;
|
||||
Token nextToken = input.next(reusableToken);
|
||||
if (nextToken != null) {
|
||||
|
||||
final char[] buffer = result.termBuffer();
|
||||
final int length = result.termLength;
|
||||
final char[] buffer = nextToken.termBuffer();
|
||||
final int length = nextToken.termLength();
|
||||
for(int i=0;i<length;i++)
|
||||
buffer[i] = Character.toLowerCase(buffer[i]);
|
||||
|
||||
return result;
|
||||
return nextToken;
|
||||
} else
|
||||
return null;
|
||||
}
|
||||
|
|
|
@ -45,13 +45,14 @@ public final class PorterStemFilter extends TokenFilter {
|
|||
stemmer = new PorterStemmer();
|
||||
}
|
||||
|
||||
public final Token next(Token result) throws IOException {
|
||||
result = input.next(result);
|
||||
if (result != null) {
|
||||
if (stemmer.stem(result.termBuffer(), 0, result.termLength))
|
||||
result.setTermBuffer(stemmer.getResultBuffer(), 0, stemmer.getResultLength());
|
||||
return result;
|
||||
} else
|
||||
public final Token next(final Token reusableToken) throws IOException {
|
||||
assert reusableToken != null;
|
||||
Token nextToken = input.next(reusableToken);
|
||||
if (nextToken == null)
|
||||
return null;
|
||||
|
||||
if (stemmer.stem(nextToken.termBuffer(), 0, nextToken.termLength()))
|
||||
nextToken.setTermBuffer(stemmer.getResultBuffer(), 0, stemmer.getResultLength());
|
||||
return nextToken;
|
||||
}
|
||||
}
|
||||
|
|
|
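LowerCaseFilter, PorterStemFilter and ISOLatin1AccentFilter all use the simplest filter shape under the reuse API: take whatever input.next(reusableToken) returns, mutate its term buffer in place (or swap in a new buffer with setTermBuffer), and pass the same instance along. An assumed upper-casing filter, written only to show the shape:

    import java.io.IOException;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;

    public final class UpperCaseFilter extends TokenFilter {
      public UpperCaseFilter(TokenStream in) {
        super(in);
      }

      public final Token next(final Token reusableToken) throws IOException {
        assert reusableToken != null;
        Token nextToken = input.next(reusableToken);
        if (nextToken == null)
          return null;
        final char[] buffer = nextToken.termBuffer();
        final int length = nextToken.termLength();
        for (int i = 0; i < length; i++)
          buffer[i] = Character.toUpperCase(buffer[i]);    // mutate in place, no new Token
        return nextToken;
      }
    }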
@ -22,11 +22,11 @@ public class SinkTokenizer extends Tokenizer {
|
|||
}
|
||||
|
||||
public SinkTokenizer() {
|
||||
this.lst = new ArrayList();
|
||||
this.lst = new ArrayList/*<Token>*/();
|
||||
}
|
||||
|
||||
public SinkTokenizer(int initCap){
|
||||
this.lst = new ArrayList(initCap);
|
||||
this.lst = new ArrayList/*<Token>*/(initCap);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -35,6 +35,8 @@ public class SinkTokenizer extends Tokenizer {
|
|||
* WARNING: Adding tokens to this list requires the {@link #reset()} method to be called in order for them
|
||||
* to be made available. Also, this Tokenizer does nothing to protect against {@link java.util.ConcurrentModificationException}s
|
||||
* in the case of adds happening while {@link #next(org.apache.lucene.analysis.Token)} is being called.
|
||||
* <p/>
|
||||
* WARNING: Since this SinkTokenizer can be reset and the cached tokens made available again, do not modify them. Modify clones instead.
|
||||
*
|
||||
* @return A List of {@link org.apache.lucene.analysis.Token}s
|
||||
*/
|
||||
|
@ -47,9 +49,15 @@ public class SinkTokenizer extends Tokenizer {
|
|||
* @return The next {@link org.apache.lucene.analysis.Token} in the Sink.
|
||||
* @throws IOException
|
||||
*/
|
||||
public Token next() throws IOException {
|
||||
public Token next(final Token reusableToken) throws IOException {
|
||||
assert reusableToken != null;
|
||||
if (iter == null) iter = lst.iterator();
|
||||
return iter.hasNext() ? (Token) iter.next() : null;
|
||||
// Since this TokenStream can be reset we have to maintain the tokens as immutable
|
||||
if (iter.hasNext()) {
|
||||
Token nextToken = (Token) iter.next();
|
||||
return (Token) nextToken.clone();
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
|
|
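The SinkTokenizer change is the mirror image of a caveat every caller of the reuse API now inherits: the Token returned by next(reusableToken) is only guaranteed valid until the next call, so code that needs to keep tokens around must store clones, just as the sink and the caching filter do. A short, assumed illustration of the safe collecting pattern:

    import java.io.IOException;
    import java.io.StringReader;
    import java.util.ArrayList;
    import java.util.List;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;

    public class CollectTokensExample {
      public static List collect(String text) throws IOException {
        TokenStream ts = new StandardAnalyzer().tokenStream("contents", new StringReader(text));
        final Token reusableToken = new Token();
        List kept = new ArrayList();
        for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) {
          // WRONG: kept.add(nextToken) would store the same instance repeatedly,
          // and the following next(reusableToken) call would overwrite it.
          kept.add(nextToken.clone());   // RIGHT: keep an independent copy
        }
        return kept;
      }
    }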
Some files were not shown because too many files have changed in this diff.