mirror of https://github.com/apache/lucene.git
LUCENE-1422: New TokenStream API that uses a new class called AttributeSource instead of the now deprecated Token class. All attributes that the Token class had have been moved into separate classes: TermAttribute, OffsetAttribute, PositionIncrementAttribute, PayloadAttribute, TypeAttribute and FlagsAttribute.
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@718798 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in: parent 72e94add53, commit 898cfe87cd
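The diff below converts the core analysis classes to the attribute-based API. As a rough orientation for readers, a consumer of the new API looks something like the following sketch (a minimal example modeled on the package documentation further down; the input text and class name are illustrative, not part of this commit):

    import java.io.IOException;
    import java.io.StringReader;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class PrintTokens {
      public static void main(String[] args) throws IOException {
        TokenStream stream = new WhitespaceTokenizer(
            new StringReader("This is a demo of the new TokenStream API"));

        // One attribute instance per type; it is reused for every token.
        TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);

        // incrementToken() replaces the deprecated next(Token) loop.
        while (stream.incrementToken()) {
          System.out.println(termAtt.term());
        }
        stream.close();
      }
    }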
@@ -25,6 +25,15 @@ API Changes
    and deprecate FSDirectory.getDirectory().  FSDirectory instances
    are not required to be singletons per path. (yonik)
 
+ 4. LUCENE-1422: New TokenStream API that uses a new class called
+    AttributeSource instead of the now deprecated Token class. All attributes
+    that the Token class had have been moved into separate classes:
+    TermAttribute, OffsetAttribute, PositionIncrementAttribute,
+    PayloadAttribute, TypeAttribute and FlagsAttribute. The new API
+    is much more flexible; it allows to combine the Attributes arbitrarily
+    and also to define custom Attributes. The new API has the same performance
+    as the old next(Token) approach. (Michael Busch)
+
 Bug fixes
 
 1. LUCENE-1415: MultiPhraseQuery has incorrect hashCode() and equals()
@@ -22,6 +22,8 @@ import java.util.Iterator;
 import java.util.LinkedList;
 import java.util.List;
 
+import org.apache.lucene.util.AttributeSource;
+
 /**
  * This class can be used if the Tokens of a TokenStream
  * are intended to be consumed more than once. It caches
@@ -34,12 +36,31 @@ import java.util.List;
  */
 public class CachingTokenFilter extends TokenFilter {
   private List cache;
   private Iterator iterator;
 
   public CachingTokenFilter(TokenStream input) {
     super(input);
   }
 
+  public boolean incrementToken() throws IOException {
+    if (cache == null) {
+      // fill cache lazily
+      cache = new LinkedList();
+      fillCache();
+      iterator = cache.iterator();
+    }
+
+    if (!iterator.hasNext()) {
+      // the cache is exhausted, return null
+      return false;
+    }
+    // Since the TokenFilter can be reset, the tokens need to be preserved as immutable.
+    AttributeSource state = (AttributeSource) iterator.next();
+    state.restoreState(this);
+    return true;
+  }
+
+  /** @deprecated */
   public Token next(final Token reusableToken) throws IOException {
     assert reusableToken != null;
     if (cache == null) {
@@ -60,10 +81,17 @@ public class CachingTokenFilter extends TokenFilter {
 
   public void reset() throws IOException {
     if(cache != null) {
       iterator = cache.iterator();
     }
   }
 
+  private void fillCache() throws IOException {
+    while(input.incrementToken()) {
+      cache.add(captureState());
+    }
+  }
+
+  /** @deprecated */
   private void fillCache(final Token reusableToken) throws IOException {
     for (Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken)) {
       cache.add(nextToken.clone());
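CachingTokenFilter above caches one captured state per token so a stream can be consumed more than once. A hedged usage sketch (the analyzer, field name and text are placeholders for illustration):

    TokenStream ts = analyzer.tokenStream("body", new StringReader("some text goes here"));
    CachingTokenFilter cached = new CachingTokenFilter(ts);
    TermAttribute termAtt = (TermAttribute) cached.addAttribute(TermAttribute.class);

    // First pass fills the cache lazily via fillCache()/captureState().
    while (cached.incrementToken()) {
      System.out.println(termAtt.term());
    }

    // reset() rewinds the iterator over the cached states; the second pass
    // replays the captured states with restoreState() instead of re-tokenizing.
    cached.reset();
    while (cached.incrementToken()) {
      System.out.println(termAtt.term());
    }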
@@ -20,16 +20,24 @@ package org.apache.lucene.analysis;
 import java.io.IOException;
 import java.io.Reader;
 
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
 /** An abstract base class for simple, character-oriented tokenizers.*/
 public abstract class CharTokenizer extends Tokenizer {
   public CharTokenizer(Reader input) {
     super(input);
+    offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
   }
 
   private int offset = 0, bufferIndex = 0, dataLen = 0;
   private static final int MAX_WORD_LEN = 255;
   private static final int IO_BUFFER_SIZE = 4096;
   private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
 
+  private TermAttribute termAtt;
+  private OffsetAttribute offsetAtt;
+
   /** Returns true iff a character should be included in a token. This
    * tokenizer generates as tokens adjacent sequences of characters which
@@ -44,6 +52,50 @@ public abstract class CharTokenizer extends Tokenizer {
     return c;
   }
 
+  public final boolean incrementToken() throws IOException {
+    clearAttributes();
+    int length = 0;
+    int start = bufferIndex;
+    char[] buffer = termAtt.termBuffer();
+    while (true) {
+
+      if (bufferIndex >= dataLen) {
+        offset += dataLen;
+        dataLen = input.read(ioBuffer);
+        if (dataLen == -1) {
+          if (length > 0)
+            break;
+          else
+            return false;
+        }
+        bufferIndex = 0;
+      }
+
+      final char c = ioBuffer[bufferIndex++];
+
+      if (isTokenChar(c)) {               // if it's a token char
+
+        if (length == 0)                  // start of token
+          start = offset + bufferIndex - 1;
+        else if (length == buffer.length)
+          buffer = termAtt.resizeTermBuffer(1+length);
+
+        buffer[length++] = normalize(c);  // buffer it, normalized
+
+        if (length == MAX_WORD_LEN)       // buffer overflow!
+          break;
+
+      } else if (length > 0)              // at non-Letter w/ chars
+        break;                            // return 'em
+    }
+
+    termAtt.setTermLength(length);
+    offsetAtt.setStartOffset(start);
+    offsetAtt.setEndOffset(start+length);
+    return true;
+  }
+
+  /** @deprecated */
   public final Token next(final Token reusableToken) throws IOException {
     assert reusableToken != null;
     reusableToken.clear();
@@ -1,5 +1,7 @@
 package org.apache.lucene.analysis;
 
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -27,11 +29,33 @@ package org.apache.lucene.analysis;
 public class ISOLatin1AccentFilter extends TokenFilter {
   public ISOLatin1AccentFilter(TokenStream input) {
     super(input);
+    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
   }
 
   private char[] output = new char[256];
   private int outputPos;
+  private TermAttribute termAtt;
+
+  public final boolean incrementToken() throws java.io.IOException {
+    if (input.incrementToken()) {
+      final char[] buffer = termAtt.termBuffer();
+      final int length = termAtt.termLength();
+      // If no characters actually require rewriting then we
+      // just return token as-is:
+      for(int i=0;i<length;i++) {
+        final char c = buffer[i];
+        if (c >= '\u00c0' && c <= '\uFB06') {
+          removeAccents(buffer, length);
+          termAtt.setTermBuffer(output, 0, outputPos);
+          break;
+        }
+      }
+      return true;
+    } else
+      return false;
+  }
+
+  /** @deprecated */
   public final Token next(final Token reusableToken) throws java.io.IOException {
     assert reusableToken != null;
     Token nextToken = input.next(reusableToken);
@@ -241,7 +265,7 @@ public class ISOLatin1AccentFilter extends TokenFilter {
       case '\uFB06': // st
         output[outputPos++] = 's';
         output[outputPos++] = 't';
         break;
       default :
         output[outputPos++] = c;
         break;
@@ -20,6 +20,9 @@ package org.apache.lucene.analysis;
 import java.io.IOException;
 import java.io.Reader;
 
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
 /**
  * Emits the entire input as a single token.
  */
@@ -28,7 +31,9 @@ public class KeywordTokenizer extends Tokenizer {
   private static final int DEFAULT_BUFFER_SIZE = 256;
 
   private boolean done;
+  private TermAttribute termAtt;
+  private OffsetAttribute offsetAtt;
 
   public KeywordTokenizer(Reader input) {
     this(input, DEFAULT_BUFFER_SIZE);
   }
@@ -36,8 +41,32 @@ public class KeywordTokenizer extends Tokenizer {
   public KeywordTokenizer(Reader input, int bufferSize) {
     super(input);
     this.done = false;
+    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+    offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+  }
+
+  public boolean incrementToken() throws IOException {
+    if (!done) {
+      done = true;
+      int upto = 0;
+      termAtt.clear();
+      char[] buffer = termAtt.termBuffer();
+      while (true) {
+        final int length = input.read(buffer, upto, buffer.length-upto);
+        if (length == -1) break;
+        upto += length;
+        if (upto == buffer.length)
+          buffer = termAtt.resizeTermBuffer(1+buffer.length);
+      }
+      termAtt.setTermLength(upto);
+      offsetAtt.setStartOffset(0);
+      offsetAtt.setEndOffset(upto);
+      return true;
+    }
+    return false;
   }
 
+  /** @deprecated */
   public Token next(final Token reusableToken) throws IOException {
     assert reusableToken != null;
     if (!done) {
@@ -19,6 +19,8 @@ package org.apache.lucene.analysis;
 
 import java.io.IOException;
 
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
 /**
  * Removes words that are too long and too short from the stream.
  *
@@ -29,6 +31,8 @@ public final class LengthFilter extends TokenFilter {
 
   final int min;
   final int max;
 
+  private TermAttribute termAtt;
+
   /**
    * Build a filter that removes words that are too long or too
@@ -39,10 +43,28 @@ public final class LengthFilter extends TokenFilter {
     super(in);
     this.min = min;
     this.max = max;
+    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+  }
+
+  /**
+   * Returns the next input Token whose term() is the right len
+   */
+  public final boolean incrementToken() throws IOException {
+    // return the first non-stop word found
+    while (input.incrementToken()) {
+      int len = termAtt.termLength();
+      if (len >= min && len <= max) {
+        return true;
+      }
+      // note: else we ignore it but should we index each part of it?
+    }
+    // reached EOS -- return null
+    return false;
   }
 
   /**
    * Returns the next input Token whose term() is the right len
+   * @deprecated
    */
   public final Token next(final Token reusableToken) throws IOException
   {
@@ -19,6 +19,8 @@ package org.apache.lucene.analysis;
 
 import java.io.IOException;
 
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
 /**
  * Normalizes token text to lower case.
  *
@@ -27,8 +29,25 @@ import java.io.IOException;
 public final class LowerCaseFilter extends TokenFilter {
   public LowerCaseFilter(TokenStream in) {
     super(in);
+    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
   }
 
+  private TermAttribute termAtt;
+
+  public final boolean incrementToken() throws IOException {
+    if (input.incrementToken()) {
+
+      final char[] buffer = termAtt.termBuffer();
+      final int length = termAtt.termLength();
+      for(int i=0;i<length;i++)
+        buffer[i] = Character.toLowerCase(buffer[i]);
+
+      return true;
+    } else
+      return false;
+  }
+
+  /** @deprecated */
   public final Token next(final Token reusableToken) throws IOException {
     assert reusableToken != null;
     Token nextToken = input.next(reusableToken);
@@ -19,6 +19,8 @@ package org.apache.lucene.analysis;
 
 import java.io.IOException;
 
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
 /** Transforms the token stream as per the Porter stemming algorithm.
    Note: the input to the stemming filter must already be in lower case,
    so you will need to use LowerCaseFilter or LowerCaseTokenizer farther
@@ -39,12 +41,24 @@ import java.io.IOException;
 */
 public final class PorterStemFilter extends TokenFilter {
   private PorterStemmer stemmer;
+  private TermAttribute termAtt;
 
   public PorterStemFilter(TokenStream in) {
     super(in);
     stemmer = new PorterStemmer();
+    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
   }
 
+  public final boolean incrementToken() throws IOException {
+    if (!input.incrementToken())
+      return false;
+
+    if (stemmer.stem(termAtt.termBuffer(), 0, termAtt.termLength()))
+      termAtt.setTermBuffer(stemmer.getResultBuffer(), 0, stemmer.getResultLength());
+    return true;
+  }
+
+  /** @deprecated */
   public final Token next(final Token reusableToken) throws IOException {
     assert reusableToken != null;
     Token nextToken = input.next(reusableToken);
@@ -22,6 +22,8 @@ import java.util.ArrayList;
 import java.util.Iterator;
 import java.util.List;
 
+import org.apache.lucene.util.AttributeSource;
+
 
 /**
  * A SinkTokenizer can be used to cache Tokens for use in an Analyzer
@@ -32,7 +34,7 @@ import java.util.List;
 public class SinkTokenizer extends Tokenizer {
   protected List/*<Token>*/ lst = new ArrayList/*<Token>*/();
   protected Iterator/*<Token>*/ iter;
 
   public SinkTokenizer(List/*<Token>*/ input) {
     this.lst = input;
     if (this.lst == null) this.lst = new ArrayList/*<Token>*/();
@@ -61,10 +63,30 @@ public class SinkTokenizer extends Tokenizer {
     return lst;
   }
 
+  /**
+   * Increments this stream to the next token out of the list of cached tokens
+   * @throws IOException
+   */
+  public boolean incrementToken() throws IOException {
+    if (iter == null) iter = lst.iterator();
+    // Since this TokenStream can be reset we have to maintain the tokens as immutable
+    if (iter.hasNext()) {
+      AttributeSource state = (AttributeSource) iter.next();
+      state.restoreState(this);
+      return true;
+    }
+    return false;
+  }
+
+  public void add(AttributeSource source) throws IOException {
+    lst.add(source);
+  }
+
   /**
    * Returns the next token out of the list of cached tokens
    * @return The next {@link org.apache.lucene.analysis.Token} in the Sink.
    * @throws IOException
+   * @deprecated
    */
   public Token next(final Token reusableToken) throws IOException {
     assert reusableToken != null;
@@ -77,8 +99,6 @@ public class SinkTokenizer extends Tokenizer {
     return null;
   }
 
-
-
   /**
    * Override this method to cache only certain tokens, or new tokens based
    * on the old tokens.
@@ -21,6 +21,9 @@ import java.io.IOException;
 import java.util.Arrays;
 import java.util.Set;
 
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+
 /**
  * Removes stop words from a token stream.
 */
@@ -32,6 +35,9 @@ public final class StopFilter extends TokenFilter {
   private final CharArraySet stopWords;
   private boolean enablePositionIncrements = ENABLE_POSITION_INCREMENTS_DEFAULT;
 
+  private TermAttribute termAtt;
+  private PositionIncrementAttribute posIncrAtt;
+
   /**
    * Construct a token stream filtering the given input.
   */
@@ -47,6 +53,7 @@ public final class StopFilter extends TokenFilter {
   public StopFilter(TokenStream in, String[] stopWords, boolean ignoreCase) {
     super(in);
     this.stopWords = (CharArraySet)makeStopSet(stopWords, ignoreCase);
+    init();
   }
 
 
@@ -74,6 +81,7 @@ public final class StopFilter extends TokenFilter {
       this.stopWords = new CharArraySet(stopWords.size(), ignoreCase);
       this.stopWords.addAll(stopWords);
     }
+    init();
   }
 
  /**
@@ -85,6 +93,11 @@ public final class StopFilter extends TokenFilter {
   public StopFilter(TokenStream in, Set stopWords) {
     this(in, stopWords, false);
   }
+
+  public void init() {
+    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+    posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+  }
 
   /**
    * Builds a Set from an array of stop words,
@@ -109,9 +122,29 @@ public final class StopFilter extends TokenFilter {
     stopSet.addAll(Arrays.asList(stopWords));
     return stopSet;
   }
 
+  /**
+   * Returns the next input Token whose term() is not a stop word.
+   */
+  public final boolean incrementToken() throws IOException {
+    // return the first non-stop word found
+    int skippedPositions = 0;
+    while (input.incrementToken()) {
+      if (!stopWords.contains(termAtt.termBuffer(), 0, termAtt.termLength())) {
+        if (enablePositionIncrements) {
+          posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions);
+        }
+        return true;
+      }
+      skippedPositions += posIncrAtt.getPositionIncrement();
+    }
+    // reached EOS -- return null
+    return false;
+  }
+
   /**
    * Returns the next input Token whose term() is not a stop word.
+   * @deprecated
    */
   public final Token next(final Token reusableToken) throws IOException {
     assert reusableToken != null;
@@ -18,6 +18,7 @@
 package org.apache.lucene.analysis;
 
 import java.io.IOException;
+import java.util.Iterator;
 
 
 /**
@@ -60,8 +61,21 @@ public class TeeTokenFilter extends TokenFilter {
   public TeeTokenFilter(TokenStream input, SinkTokenizer sink) {
     super(input);
     this.sink = sink;
+    Iterator it = getAttributesIterator();
+    while (it.hasNext()) {
+      sink.addAttribute(it.next().getClass());
+    }
+  }
+
+  public boolean incrementToken() throws IOException {
+    if (input.incrementToken()) {
+      sink.add(captureState());
+      return true;
+    }
+    return false;
   }
 
+  /** @deprecated */
   public Token next(final Token reusableToken) throws IOException {
     assert reusableToken != null;
     Token nextToken = input.next(reusableToken);
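TeeTokenFilter above copies each captured state into a SinkTokenizer while passing tokens through, which makes it possible to tokenize once and feed two fields. A hedged usage sketch under the API shown in this diff (the reader variable is a placeholder):

    // 'sink' gets the source's attribute types in the TeeTokenFilter constructor.
    SinkTokenizer sink = new SinkTokenizer(null); // null: the sink creates its own list
    TokenStream source = new TeeTokenFilter(new WhitespaceTokenizer(reader), sink);
    TermAttribute termAtt = (TermAttribute) source.getAttribute(TermAttribute.class);

    // Consuming the source also caches every token in the sink via sink.add(captureState()).
    while (source.incrementToken()) {
      System.out.println("source: " + termAtt.term());
    }

    // Later, the sink replays the cached states with restoreState().
    TermAttribute sinkTermAtt = (TermAttribute) sink.getAttribute(TermAttribute.class);
    while (sink.incrementToken()) {
      System.out.println("sink: " + sinkTermAtt.term());
    }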
@@ -21,7 +21,11 @@ import org.apache.lucene.index.Payload;
 import org.apache.lucene.index.TermPositions; // for javadoc
 import org.apache.lucene.util.ArrayUtil;
 
-/** A Token is an occurrence of a term from the text of a field.  It consists of
+/**
+  This class is now deprecated and a new TokenStream API was introduced with Lucene 2.9.
+  See Javadocs in {@link TokenStream} for further details.
+  <p>
+  A Token is an occurrence of a term from the text of a field.  It consists of
   a term's text, the start and end offset of the term in the text of the field,
   and a type string.
   <p>
@@ -114,6 +118,8 @@ import org.apache.lucene.util.ArrayUtil;
  </p>
 
  @see org.apache.lucene.index.Payload
+ @deprecated A new TokenStream API was introduced with Lucene 2.9.
+             See javadocs in {@link TokenStream} for further details.
 */
 public class Token implements Cloneable {
@@ -22,9 +22,16 @@ import java.io.IOException;
 /** A TokenFilter is a TokenStream whose input is another token stream.
   <p>
   This is an abstract class.
-  NOTE: subclasses must override {@link #next(Token)}.  It's
-  also OK to instead override {@link #next()} but that
-  method is now deprecated in favor of {@link #next(Token)}.
+  NOTE: subclasses must override
+  {@link #incrementToken()} if the new TokenStream API is used
+  and {@link #next(Token)} or {@link #next()} if the old
+  TokenStream API is used.
+  * <p><font color="#FF0000">
+  * WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental.
+  * The APIs introduced in these classes with Lucene 2.9 might change in the future.
+  * We will make our best efforts to keep the APIs backwards-compatible.</font>
+  <p>
+  See {@link TokenStream}
   */
 public abstract class TokenFilter extends TokenStream {
   /** The source of tokens for this filter. */
@@ -32,9 +39,10 @@ public abstract class TokenFilter extends TokenStream {
 
   /** Construct a token stream filtering the given input. */
   protected TokenFilter(TokenStream input) {
+    super(input);
     this.input = input;
   }
 
   /** Close the input TokenStream. */
   public void close() throws IOException {
     input.close();
@@ -45,4 +53,17 @@ public abstract class TokenFilter extends TokenStream {
     super.reset();
     input.reset();
   }
+
+  public boolean useNewAPI() {
+    return input.useNewAPI();
+  }
+
+  /**
+   * Sets whether or not to use the new TokenStream API. Settings this
+   * will apply to this Filter and all TokenStream/Filters upstream.
+   */
+  public void setUseNewAPI(boolean use) {
+    input.setUseNewAPI(use);
+  }
+
 }
@@ -17,9 +17,12 @@ package org.apache.lucene.analysis;
  * limitations under the License.
  */
 
-import org.apache.lucene.index.Payload;
-
 import java.io.IOException;
+import java.util.Iterator;
+
+import org.apache.lucene.index.Payload;
+import org.apache.lucene.util.Attribute;
+import org.apache.lucene.util.AttributeSource;
 
 /** A TokenStream enumerates the sequence of tokens, either from
   fields of a document or from query text.
@@ -31,13 +34,140 @@ import java.io.IOException;
   <li>{@link TokenFilter}, a TokenStream
   whose input is another TokenStream.
   </ul>
-  NOTE: subclasses must override {@link #next(Token)}.  It's
-  also OK to instead override {@link #next()} but that
-  method is now deprecated in favor of {@link #next(Token)}.
+  A new TokenStream API is introduced with Lucene 2.9. Since
+  2.9 Token is deprecated and the preferred way to store
+  the information of a token is to use {@link Attribute}s.
+  <p>
+  For that reason TokenStream extends {@link AttributeSource}
+  now. Note that only one instance per {@link Attribute} is
+  created and reused for every token. This approach reduces
+  object creations and allows local caching of references to
+  the {@link Attribute}s. See {@link #incrementToken()} for further details.
+  <p>
+  <b>The workflow of the new TokenStream API is as follows:</b>
+  <ol>
+    <li>Instantiation of TokenStream/TokenFilters which add/get attributes
+        to/from the {@link AttributeSource}.
+    <li>The consumer calls {@link TokenStream#reset()}.
+    <li>the consumer retrieves attributes from the
+        stream and stores local references to all attributes it wants to access
+    <li>The consumer calls {@link #incrementToken()} until it returns false and
+        consumes the attributes after each call.
+  </ol>
+  To make sure that filters and consumers know which attributes are available
+  the attributes must be added in the during instantiation. Filters and
+  consumers are not required to check for availability of attributes in {@link #incrementToken()}.
+  <p>
+  Sometimes it is desirable to capture a current state of a
+  TokenStream, e. g. for buffering purposes (see {@link CachingTokenFilter},
+  {@link TeeTokenFilter}/{@link SinkTokenizer}). For this usecase
+  {@link AttributeSource#captureState()} and {@link AttributeSource#restoreState(AttributeSource)} can be used.
+  <p>
+  <b>NOTE:</b> In order to enable the new API the method
+  {@link #useNewAPI()} has to be called with useNewAPI=true.
+  Otherwise the deprecated method {@link #next(Token)} will
+  be used by Lucene consumers (indexer and queryparser) to
+  consume the tokens. {@link #next(Token)} will be removed
+  in Lucene 3.0.
+  <p>
+  NOTE: To use the old API subclasses must override {@link #next(Token)}.
+  It's also OK to instead override {@link #next()} but that
+  method is slower compared to {@link #next(Token)}.
+  * <p><font color="#FF0000">
+  * WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental.
+  * The APIs introduced in these classes with Lucene 2.9 might change in the future.
+  * We will make our best efforts to keep the APIs backwards-compatible.</font>
  */
-public abstract class TokenStream {
+
+public abstract class TokenStream extends AttributeSource {
+  private static boolean useNewAPIDefault = false;
+  private boolean useNewAPI = useNewAPIDefault;
+
+  protected TokenStream() {
+    super();
+  }
+
+  protected TokenStream(AttributeSource input) {
+    super(input);
+  }
+
+  /**
+   * Returns whether or not the new TokenStream APIs are used
+   * by default.
+   * (see {@link #incrementToken()}, {@link AttributeSource}).
+   */
+  public static boolean useNewAPIDefault() {
+    return useNewAPIDefault;
+  }
+
+  /**
+   * Use this API to enable or disable the new TokenStream API.
+   * by default. Can be overridden by calling {@link #setUseNewAPI(boolean)}.
+   * (see {@link #incrementToken()}, {@link AttributeSource}).
+   * <p>
+   * If set to true, the indexer will call {@link #incrementToken()}
+   * to consume Tokens from this stream.
+   * <p>
+   * If set to false, the indexer will call {@link #next(Token)}
+   * instead.
+   */
+  public static void setUseNewAPIDefault(boolean use) {
+    useNewAPIDefault = use;
+  }
+
+  /**
+   * Returns whether or not the new TokenStream APIs are used
+   * for this stream.
+   * (see {@link #incrementToken()}, {@link AttributeSource}).
+   */
+  public boolean useNewAPI() {
+    return useNewAPI;
+  }
+
+  /**
+   * Use this API to enable or disable the new TokenStream API
+   * for this stream. Overrides {@link #setUseNewAPIDefault(boolean)}.
+   * (see {@link #incrementToken()}, {@link AttributeSource}).
+   * <p>
+   * If set to true, the indexer will call {@link #incrementToken()}
+   * to consume Tokens from this stream.
+   * <p>
+   * If set to false, the indexer will call {@link #next(Token)}
+   * instead.
+   * <p>
+   * <b>NOTE: All streams and filters in one chain must use the
+   * same API. </b>
+   */
+  public void setUseNewAPI(boolean use) {
+    useNewAPI = use;
+  }
+
+  /**
+   * Consumers (e. g. the indexer) use this method to advance the stream
+   * to the next token. Implementing classes must implement this method
+   * and update the appropriate {@link Attribute}s with content of the
+   * next token.
+   * <p>
+   * This method is called for every token of a document, so an efficient
+   * implementation is crucial for good performance. To avoid calls to
+   * {@link #addAttribute(Class)} and {@link #getAttribute(Class)} and
+   * downcasts, references to all {@link Attribute}s that this stream uses
+   * should be retrieved during instantiation.
+   * <p>
+   * To make sure that filters and consumers know which attributes are available
+   * the attributes must be added during instantiation. Filters and
+   * consumers are not required to check for availability of attributes in {@link #incrementToken()}.
+   *
+   * @return false for end of stream; true otherwise
+   *
+   * <p>
+   * <b>Note that this method will be defined abstract in Lucene 3.0.<b>
+   */
+  public boolean incrementToken() throws IOException {
+    // subclasses must implement this method; will be made abstract in Lucene 3.0
+    return false;
+  }
+
 /** Returns the next token in the stream, or null at EOS.
  *  @deprecated The returned Token is a "full private copy" (not
  *  re-used across calls to next()) but will be slower
@@ -84,6 +214,8 @@ public abstract class TokenStream {
  *  is not required to check for null before using it, but it is a
  *  good idea to assert that it is not null.)
  *  @return next token in the stream or null if end-of-stream was hit
+ *  @deprecated The new {@link #incrementToken()} and {@link AttributeSource}
+ *  APIs should be used instead. See also {@link #useNewAPI()}.
  */
  public Token next(final Token reusableToken) throws IOException {
    // We don't actually use inputToken, but still add this assert
@@ -107,4 +239,25 @@ public abstract class TokenStream {
 
   /** Releases resources associated with this stream. */
   public void close() throws IOException {}
+
+  public String toString() {
+    StringBuffer sb = new StringBuffer();
+    sb.append('(');
+
+    if (hasAttributes()) {
+      // TODO Java 1.5
+      //Iterator<Attribute> it = attributes.values().iterator();
+      Iterator it = getAttributesIterator();
+      if (it.hasNext()) {
+        sb.append(it.next().toString());
+      }
+      while (it.hasNext()) {
+        sb.append(',');
+        sb.append(it.next().toString());
+      }
+    }
+    sb.append(')');
+    return sb.toString();
+  }
+
 }
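The pattern the converted filters follow, add the needed attributes in the constructor and rewrite them in incrementToken(), applies to custom filters as well. A minimal sketch following the same structure as LowerCaseFilter above (the filter name is made up and is not part of this commit):

    import java.io.IOException;

    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    // Hypothetical filter that upper-cases every term.
    public final class UpperCaseFilter extends TokenFilter {
      private TermAttribute termAtt;

      public UpperCaseFilter(TokenStream in) {
        super(in);
        // addAttribute() is idempotent: it returns the instance shared with the input stream.
        termAtt = (TermAttribute) addAttribute(TermAttribute.class);
      }

      public final boolean incrementToken() throws IOException {
        if (!input.incrementToken())
          return false;
        final char[] buffer = termAtt.termBuffer();
        final int length = termAtt.termLength();
        for (int i = 0; i < length; i++)
          buffer[i] = Character.toUpperCase(buffer[i]);
        return true;
      }
    }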
@@ -24,12 +24,23 @@ import java.io.IOException;
   <p>
   This is an abstract class.
   <p>
-  NOTE: subclasses must override {@link #next(Token)}.  It's
-  also OK to instead override {@link #next()} but that
-  method is now deprecated in favor of {@link #next(Token)}.
+  <b>NOTE:</b> In order to enable the new API the method
+  {@link #useNewAPI()} has to be called with useNewAPI=true.
+  Otherwise the deprecated method {@link #next(Token)} will
+  be used by Lucene consumers (indexer and queryparser) to
+  consume the tokens. {@link #next(Token)} will be removed
+  in Lucene 3.0.
   <p>
+  NOTE: To use the old API subclasses must override {@link #next(Token)}.
+  It's also OK to instead override {@link #next()} but that
+  method is slower compared to {@link #next(Token)}.
+  <p>
   NOTE: subclasses overriding {@link #next(Token)} must
   call {@link Token#clear()}.
+  * <p><font color="#FF0000">
+  * WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental.
+  * The APIs introduced in these classes with Lucene 2.9 might change in the future.
+  * We will make our best efforts to keep the APIs backwards-compatible.</font>
  */
 
 public abstract class Tokenizer extends TokenStream {
@@ -35,8 +35,7 @@ application using Lucene to use an appropriate <i>Parser</i> to convert the orig
 <h2>Tokenization</h2>
 <p>
 Plain text passed to Lucene for indexing goes through a process generally called tokenization – namely breaking of the
-input text into small indexing elements –
-{@link org.apache.lucene.analysis.Token Tokens}.
+input text into small indexing elements – tokens.
 The way input text is broken into tokens very
 much dictates further capabilities of search upon that text.
 For instance, sentences beginnings and endings can be identified to provide for more accurate phrase
@@ -72,12 +71,13 @@ providing for several functions, including (but not limited to):
 <li>{@link org.apache.lucene.analysis.Analyzer} – An Analyzer is responsible for building a {@link org.apache.lucene.analysis.TokenStream} which can be consumed
 by the indexing and searching processes.  See below for more information on implementing your own Analyzer.</li>
 <li>{@link org.apache.lucene.analysis.Tokenizer} – A Tokenizer is a {@link org.apache.lucene.analysis.TokenStream} and is responsible for breaking
-up incoming text into {@link org.apache.lucene.analysis.Token}s. In most cases, an Analyzer will use a Tokenizer as the first step in
+up incoming text into tokens. In most cases, an Analyzer will use a Tokenizer as the first step in
 the analysis process.</li>
 <li>{@link org.apache.lucene.analysis.TokenFilter} – A TokenFilter is also a {@link org.apache.lucene.analysis.TokenStream} and is responsible
-for modifying {@link org.apache.lucene.analysis.Token}s that have been created by the Tokenizer.  Common modifications performed by a
+for modifying tokens that have been created by the Tokenizer.  Common modifications performed by a
 TokenFilter are: deletion, stemming, synonym injection, and down casing.  Not all Analyzers require TokenFilters</li>
 </ul>
+<b>Since Lucene 2.9 the TokenStream API was changed. Please see section "New TokenStream API" below for details.</b>
 </p>
 <h2>Hints, Tips and Traps</h2>
 <p>
@@ -140,9 +140,8 @@ providing for several functions, including (but not limited to):
   <PRE>
     Analyzer analyzer = new StandardAnalyzer(); // or any other analyzer
     TokenStream ts = analyzer.tokenStream("myfield",new StringReader("some text goes here"));
-    Token t = ts.next();
-    while (t!=null) {
-      System.out.println("token: "+t));
+    while (ts.incrementToken()) {
+      System.out.println("token: "+ts));
       t = ts.next();
     }
   </PRE>
@@ -179,7 +178,7 @@ the source code of any one of the many samples located in this package.
 <p>
 The following sections discuss some aspects of implementing your own analyzer.
 </p>
-<h3>Field Section Boundaries</h2>
+<h3>Field Section Boundaries</h3>
 <p>
 When {@link org.apache.lucene.document.Document#add(org.apache.lucene.document.Fieldable) document.add(field)}
 is called multiple times for the same field name, we could say that each such call creates a new
@@ -208,10 +207,10 @@ the source code of any one of the many samples located in this package.
       };
     </PRE>
 </p>
-<h3>Token Position Increments</h2>
+<h3>Token Position Increments</h3>
 <p>
 By default, all tokens created by Analyzers and Tokenizers have a
-{@link org.apache.lucene.analysis.Token#getPositionIncrement() position increment} of one.
+{@link org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute#getPositionIncrement() position increment} of one.
 This means that the position stored for that token in the index would be one more than
 that of the previous token.
 Recall that phrase and proximity searches rely on position info.
@@ -227,26 +226,29 @@ the source code of any one of the many samples located in this package.
 If this behavior does not fit the application needs,
 a modified analyzer can be used, that would increment further the positions of
 tokens following a removed stop word, using
-{@link org.apache.lucene.analysis.Token#setPositionIncrement(int)}.
+{@link org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute#setPositionIncrement(int)}.
 This can be done with something like:
 <PRE>
   public TokenStream tokenStream(final String fieldName, Reader reader) {
     final TokenStream ts = someAnalyzer.tokenStream(fieldName, reader);
     TokenStream res = new TokenStream() {
-      public Token next() throws IOException {
+      TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+      PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+
+      public boolean incrementToken() throws IOException {
        int extraIncrement = 0;
        while (true) {
-          Token t = ts.next();
-          if (t!=null) {
-            if (stopWords.contains(t.termText())) {
+          boolean hasNext = ts.incrementToken();
+          if (hasNext) {
+            if (stopWords.contains(termAtt.term())) {
              extraIncrement++; // filter this word
              continue;
            }
            if (extraIncrement>0) {
-              t.setPositionIncrement(t.getPositionIncrement()+extraIncrement);
+              posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement()+extraIncrement);
            }
          }
-          return t;
+          return hasNext;
        }
      }
    };
@ -268,5 +270,336 @@ the source code of any one of the many samples located in this package.
|
||||||
same position as that token, and so would they be seen by phrase and proximity searches.</li>
|
same position as that token, and so would they be seen by phrase and proximity searches.</li>
|
||||||
</ol>
|
</ol>
|
||||||
</p>
|
</p>
|
||||||
|
<h2>New TokenStream API</h2>
|
||||||
|
<p>
|
||||||
|
With Lucene 2.9 we introduce a new TokenStream API. The old API used to produce Tokens. A Token
|
||||||
|
has getter and setter methods for different properties like positionIncrement and termText.
|
||||||
|
While this approach was sufficient for the default indexing format, it is not versatile enough for
|
||||||
|
Flexible Indexing, a term which summarizes the effort of making the Lucene indexer pluggable and extensible for custom
|
||||||
|
index formats.
|
||||||
|
</p>
|
||||||
|
<p>
|
||||||
|
A fully customizable indexer means that users will be able to store custom data structures on disk. Therefore an API
|
||||||
|
is necessary that can transport custom types of data from the documents to the indexer.
|
||||||
|
</p>
|
||||||
|
<h3>Attribute and AttributeSource</h3>
|
||||||
|
Lucene 2.9 therefore introduces a new pair of classes called {@link org.apache.lucene.util.Attribute} and
|
||||||
|
{@link org.apache.lucene.util.AttributeSource}. An Attribute serves as a
|
||||||
|
particular piece of information about a text token. For example, {@link org.apache.lucene.analysis.tokenattributes.TermAttribute}
|
||||||
|
contains the term text of a token, and {@link org.apache.lucene.analysis.tokenattributes.OffsetAttribute} contains the start and end character offsets of a token.
|
||||||
|
An AttributeSource is a collection of Attributes with a restriction: there may be only one instance of each attribute type. TokenStream now extends AttributeSource, which
|
||||||
|
means that one can add Attributes to a TokenStream. Since TokenFilter extends TokenStream, all filters are also
|
||||||
|
AttributeSources.
|
||||||
|
<p>
|
||||||
|
Lucene now provides six Attributes out of the box, which replace the variables the Token class has:
|
||||||
|
<ul>
|
||||||
|
<li>{@link org.apache.lucene.analysis.tokenattributes.TermAttribute}<p>The term text of a token.</p></li>
|
||||||
|
<li>{@link org.apache.lucene.analysis.tokenattributes.OffsetAttribute}<p>The start and end offset of token in characters.</p></li>
|
||||||
|
<li>{@link org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute}<p>See above for detailed information about position increment.</p></li>
|
||||||
|
<li>{@link org.apache.lucene.analysis.tokenattributes.PayloadAttribute}<p>The payload that a Token can optionally have.</p></li>
|
||||||
|
<li>{@link org.apache.lucene.analysis.tokenattributes.TypeAttribute}<p>The type of the token. Default is 'word'.</p></li>
|
||||||
|
<li>{@link org.apache.lucene.analysis.tokenattributes.FlagsAttribute}<p>Optional flags a token can have.</p></li>
|
||||||
|
</ul>
|
||||||
|
</p>
|
||||||
|
<h3>Using the new TokenStream API</h3>
|
||||||
|
There are a few important things to know in order to use the new API efficiently which are summarized here. You may want
|
||||||
|
to walk through the example below first and come back to this section afterwards.
|
||||||
|
<ol><li>
|
||||||
|
Please keep in mind that an AttributeSource can only have one instance of a particular Attribute. Furthermore, if
|
||||||
|
a chain of a TokenStream and multiple TokenFilters is used, then all TokenFilters in that chain share the Attributes
|
||||||
|
with the TokenStream.
|
||||||
|
</li>
|
||||||
|
<br>
|
||||||
|
<li>
|
||||||
|
Attribute instances are reused for all tokens of a document. Thus, a TokenStream/-Filter needs to update
|
||||||
|
the appropriate Attribute(s) in incrementToken(). The consumer, commonly the Lucene indexer, consumes the data in the
|
||||||
|
Attributes and then calls incrementToken() again until it retuns false, which indicates that the end of the stream
|
||||||
|
was reached. This means that in each call of incrementToken() a TokenStream/-Filter can safely overwrite the data in
|
||||||
|
the Attribute instances.
|
||||||
|
</li>
|
||||||
|
<br>
|
||||||
|
<li>
|
||||||
|
For performance reasons a TokenStream/-Filter should add/get Attributes during instantiation; i.e., create an attribute in the
|
||||||
|
constructor and store references to it in an instance variable. Using an instance variable instead of calling addAttribute()/getAttribute()
|
||||||
|
in incrementToken() will avoid expensive casting and attribute lookups for every token in the document.
|
||||||
|
</li>
|
||||||
|
<br>
|
||||||
|
<li>
|
||||||
|
All methods in AttributeSource are idempotent, which means calling them multiple times always yields the same
|
||||||
|
result. This is especially important to know for addAttribute(). The method takes the <b>type</b> (<code>Class</code>)
|
||||||
|
of an Attribute as an argument and returns an <b>instance</b>. If an Attribute of the same type was previously added, then
|
||||||
|
the already existing instance is returned, otherwise a new instance is created and returned. Therefore TokenStreams/-Filters
|
||||||
|
can safely call addAttribute() with the same Attribute type multiple times.
|
||||||
|
</li></ol>
|
||||||
|
<h3>Example</h3>
|
||||||
|
In this example we will create a WhiteSpaceTokenizer and use a LengthFilter to suppress all words that only
|
||||||
|
have two or less characters. The LengthFilter is part of the Lucene core and its implementation will be explained
|
||||||
|
here to illustrate the usage of the new TokenStream API.<br>
|
||||||
|
Then we will develop a custom Attribute, a PartOfSpeechAttribute, and add another filter to the chain which
|
||||||
|
utilizes the new custom attribute, and call it PartOfSpeechTaggingFilter.
|
||||||
|
<h4>Whitespace tokenization</h4>
|
||||||
|
<pre>
|
||||||
|
public class MyAnalyzer extends Analyzer {
|
||||||
|
|
||||||
|
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||||
|
TokenStream stream = new WhitespaceTokenizer(reader);
|
||||||
|
return stream;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void main(String[] args) throws IOException {
|
||||||
|
// text to tokenize
|
||||||
|
final String text = "This is a demo of the new TokenStream API";
|
||||||
|
|
||||||
|
MyAnalyzer analyzer = new MyAnalyzer();
|
||||||
|
TokenStream stream = analyzer.tokenStream("field", new StringReader(text));
|
||||||
|
|
||||||
|
// get the TermAttribute from the TokenStream
|
||||||
|
TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class);
|
||||||
|
|
||||||
|
// print all tokens until stream is exhausted
|
||||||
|
while (stream.incrementToken()) {
|
||||||
|
System.out.println(termAtt.term());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
</pre>
|
||||||
|
In this simple example, plain whitespace tokenization is performed. In main() a loop consumes the stream and
|
||||||
|
prints the term text of the tokens by accessing the TermAttribute that the WhitespaceTokenizer provides.
|
||||||
|
Here is the output:
|
||||||
|
<pre>
|
||||||
|
This
|
||||||
|
is
|
||||||
|
a
|
||||||
|
demo
|
||||||
|
of
|
||||||
|
the
|
||||||
|
new
|
||||||
|
TokenStream
|
||||||
|
API
|
||||||
|
</pre>
|
||||||
|
<h4>Adding a LengthFilter</h4>
|
||||||
|
We want to suppress all tokens that have 2 or fewer characters. We can do that easily by adding a LengthFilter
|
||||||
|
to the chain. Only the tokenStream() method in our analyzer needs to be changed:
|
||||||
|
<pre>
|
||||||
|
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||||
|
TokenStream stream = new WhitespaceTokenizer(reader);
|
||||||
|
stream = new LengthFilter(stream, 3, Integer.MAX_VALUE);
|
||||||
|
return stream;
|
||||||
|
}
|
||||||
|
</pre>
|
||||||
|
Note that now only words with 3 or more characters are contained in the output:
|
||||||
|
<pre>
|
||||||
|
This
|
||||||
|
demo
|
||||||
|
the
|
||||||
|
new
|
||||||
|
TokenStream
|
||||||
|
API
|
||||||
|
</pre>
|
||||||
|
Now let's take a look at how the LengthFilter is implemented (it is part of Lucene's core):
|
||||||
|
<pre>
|
||||||
|
public final class LengthFilter extends TokenFilter {
|
||||||
|
|
||||||
|
final int min;
|
||||||
|
final int max;
|
||||||
|
|
||||||
|
private TermAttribute termAtt;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Build a filter that removes words that are too long or too
|
||||||
|
* short from the text.
|
||||||
|
*/
|
||||||
|
public LengthFilter(TokenStream in, int min, int max)
|
||||||
|
{
|
||||||
|
super(in);
|
||||||
|
this.min = min;
|
||||||
|
this.max = max;
|
||||||
|
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
   * Returns the next input Token whose term() is the right length
|
||||||
|
*/
|
||||||
|
public final boolean incrementToken() throws IOException
|
||||||
|
{
|
||||||
|
assert termAtt != null;
|
||||||
|
    // return the first token whose length is within the bounds
|
||||||
|
while (input.incrementToken()) {
|
||||||
|
int len = termAtt.termLength();
|
||||||
|
if (len >= min && len <= max) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
// note: else we ignore it but should we index each part of it?
|
||||||
|
}
|
||||||
|
    // reached EOS -- return false
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
</pre>
|
||||||
|
The TermAttribute is added in the constructor and stored in the instance variable <code>termAtt</code>.
|
||||||
|
Remember that there can only be a single instance of TermAttribute in the chain, so in our example the
|
||||||
|
<code>addAttribute()</code> call in LengthFilter returns the TermAttribute that the WhitespaceTokenizer already added. The tokens
|
||||||
|
are retrieved from the input stream in the <code>incrementToken()</code> method. By looking at the term text
|
||||||
|
in the TermAttribute the length of the term can be determined and too short or too long tokens are skipped.
|
||||||
|
Note how <code>incrementToken()</code> can efficiently access the instance variable; no attribute lookup or downcasting
|
||||||
|
is necessary. The same is true for the consumer, which can simply use local references to the Attributes.
|
||||||
|
<h4>Adding a custom Attribute</h4>
|
||||||
|
Now we're going to implement our own custom Attribute for part-of-speech tagging and, fittingly, call it
|
||||||
|
<code>PartOfSpeechAttribute</code>:
|
||||||
|
<pre>
|
||||||
|
public static enum PartOfSpeech {
|
||||||
|
Noun, Verb, Adjective, Adverb, Pronoun, Preposition, Conjunction, Article, Unknown
|
||||||
|
}
|
||||||
|
|
||||||
|
public static final class PartOfSpeechAttribute extends Attribute {
|
||||||
|
|
||||||
|
private PartOfSpeech pos = PartOfSpeech.Unknown;
|
||||||
|
|
||||||
|
public void setPartOfSpeech(PartOfSpeech pos) {
|
||||||
|
this.pos = pos;
|
||||||
|
}
|
||||||
|
|
||||||
|
public PartOfSpeech getPartOfSpeech() {
|
||||||
|
return pos;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void clear() {
|
||||||
|
pos = PartOfSpeech.Unknown;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void copyTo(Attribute target) {
|
||||||
|
((PartOfSpeechAttribute) target).pos = pos;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean equals(Object other) {
|
||||||
|
if (other == this) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (other instanceof PartOfSpeechAttribute) {
|
||||||
|
return pos == ((PartOfSpeechAttribute) other).pos;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int hashCode() {
|
||||||
|
return pos.ordinal();
|
||||||
|
}
|
||||||
|
|
||||||
|
public String toString() {
|
||||||
|
return "PartOfSpeech=" + pos;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
</pre>
|
||||||
|
This is a simple Attribute that has only a single variable that stores the part-of-speech of a token. It extends the
|
||||||
|
new <code>Attribute</code> class and therefore implements its abstract methods <code>clear(), copyTo(), equals(), hashCode(), toString()</code>.
|
||||||
|
Now we need a TokenFilter that can set this new PartOfSpeechAttribute for each token. In this example we show a very naive filter
|
||||||
|
that tags every word with a leading upper-case letter as a 'Noun' and all other words as 'Unknown'.
|
||||||
|
<pre>
|
||||||
|
public static class PartOfSpeechTaggingFilter extends TokenFilter {
|
||||||
|
PartOfSpeechAttribute posAtt;
|
||||||
|
TermAttribute termAtt;
|
||||||
|
|
||||||
|
protected PartOfSpeechTaggingFilter(TokenStream input) {
|
||||||
|
super(input);
|
||||||
|
posAtt = (PartOfSpeechAttribute) addAttribute(PartOfSpeechAttribute.class);
|
||||||
|
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean incrementToken() throws IOException {
|
||||||
|
if (!input.incrementToken()) {return false;}
|
||||||
|
posAtt.setPartOfSpeech(determinePOS(termAtt.termBuffer(), 0, termAtt.termLength()));
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// determine the part of speech for the given term
|
||||||
|
protected PartOfSpeech determinePOS(char[] term, int offset, int length) {
|
||||||
|
// naive implementation that tags every uppercased word as noun
|
||||||
|
if (length > 0 && Character.isUpperCase(term[0])) {
|
||||||
|
return PartOfSpeech.Noun;
|
||||||
|
}
|
||||||
|
return PartOfSpeech.Unknown;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
</pre>
|
||||||
|
Just like the LengthFilter, this new filter accesses the attributes it needs in the constructor and
|
||||||
|
stores references in instance variables. Now we need to add the filter to the chain:
|
||||||
|
<pre>
|
||||||
|
public TokenStream tokenStream(String fieldName, Reader reader) {
|
||||||
|
TokenStream stream = new WhitespaceTokenizer(reader);
|
||||||
|
stream = new LengthFilter(stream, 3, Integer.MAX_VALUE);
|
||||||
|
stream = new PartOfSpeechTaggingFilter(stream);
|
||||||
|
return stream;
|
||||||
|
}
|
||||||
|
</pre>
|
||||||
|
Now let's look at the output:
|
||||||
|
<pre>
|
||||||
|
This
|
||||||
|
demo
|
||||||
|
the
|
||||||
|
new
|
||||||
|
TokenStream
|
||||||
|
API
|
||||||
|
</pre>
|
||||||
|
Apparently it hasn't changed, which shows that adding a custom attribute to a TokenStream/Filter chain does not
|
||||||
|
affect any existing consumers, simply because they don't know about the new Attribute. Now let's change the consumer
|
||||||
|
to make use of the new PartOfSpeechAttribute and print it out:
|
||||||
|
<pre>
|
||||||
|
public static void main(String[] args) throws IOException {
|
||||||
|
// text to tokenize
|
||||||
|
final String text = "This is a demo of the new TokenStream API";
|
||||||
|
|
||||||
|
MyAnalyzer analyzer = new MyAnalyzer();
|
||||||
|
TokenStream stream = analyzer.tokenStream("field", new StringReader(text));
|
||||||
|
|
||||||
|
// get the TermAttribute from the TokenStream
|
||||||
|
TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class);
|
||||||
|
|
||||||
|
// get the PartOfSpeechAttribute from the TokenStream
|
||||||
|
PartOfSpeechAttribute posAtt = (PartOfSpeechAttribute) stream.getAttribute(PartOfSpeechAttribute.class);
|
||||||
|
|
||||||
|
// print all tokens until stream is exhausted
|
||||||
|
while (stream.incrementToken()) {
|
||||||
|
System.out.println(termAtt.term() + ": " + posAtt.getPartOfSpeech());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
</pre>
|
||||||
|
The change that was made is to get the PartOfSpeechAttribute from the TokenStream and print out its contents in
|
||||||
|
the while loop that consumes the stream. Here is the new output:
|
||||||
|
<pre>
|
||||||
|
This: Noun
|
||||||
|
demo: Unknown
|
||||||
|
the: Unknown
|
||||||
|
new: Unknown
|
||||||
|
TokenStream: Noun
|
||||||
|
API: Noun
|
||||||
|
</pre>
|
||||||
|
Each word is now followed by its assigned PartOfSpeech tag. Of course this is a naive
|
||||||
|
part-of-speech tagger. The word 'This' should not even be tagged as a noun; it is only capitalized because it
|
||||||
|
is the first word of a sentence. Actually, this is a good opportunity for an exercise. To practice the usage of the new
|
||||||
|
API, the reader could now write an Attribute and a TokenFilter that specify for each word whether it was the first token
|
||||||
|
of a sentence or not. Then the PartOfSpeechTaggingFilter can make use of this knowledge and only tag capitalized words
|
||||||
|
as nouns if they are not the first word of a sentence (we know, this is still not correct behavior, but hey, it's a good exercise).
|
||||||
|
As a small hint, this is how the new Attribute class could begin:
|
||||||
|
<pre>
|
||||||
|
public class FirstTokenOfSentenceAttribute extends Attribute {
|
||||||
|
|
||||||
|
private boolean firstToken;
|
||||||
|
|
||||||
|
public void setFirstToken(boolean firstToken) {
|
||||||
|
this.firstToken = firstToken;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean getFirstToken() {
|
||||||
|
return firstToken;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void clear() {
|
||||||
|
firstToken = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
...
|
||||||
|
</pre>
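To give the exercise a little more shape (just a sketch with a hypothetical filter name; the FirstTokenOfSentenceAttribute above would still need its remaining methods, and a real solution is left to the reader), such a filter could look roughly like this:
<pre>
  public static class FirstTokenOfSentenceTaggingFilter extends TokenFilter {
    FirstTokenOfSentenceAttribute firstAtt;
    TermAttribute termAtt;
    boolean nextIsFirst = true;

    protected FirstTokenOfSentenceTaggingFilter(TokenStream input) {
      super(input);
      firstAtt = (FirstTokenOfSentenceAttribute) addAttribute(FirstTokenOfSentenceAttribute.class);
      termAtt = (TermAttribute) addAttribute(TermAttribute.class);
    }

    public boolean incrementToken() throws IOException {
      if (!input.incrementToken()) { return false; }
      firstAtt.setFirstToken(nextIsFirst);
      // naive heuristic: the following token starts a new sentence if this one ends with '.', '!' or '?'
      int len = termAtt.termLength();
      char last = len > 0 ? termAtt.termBuffer()[len - 1] : ' ';
      nextIsFirst = (last == '.' || last == '!' || last == '?');
      return true;
    }
  }
</pre>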
|
||||||
</body>
|
</body>
|
||||||
</html>
|
</html>
|
||||||
|
|
|
@ -17,9 +17,11 @@ package org.apache.lucene.analysis.standard;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import org.apache.lucene.analysis.TokenFilter;
|
|
||||||
import org.apache.lucene.analysis.Token;
|
import org.apache.lucene.analysis.Token;
|
||||||
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||||
|
|
||||||
/** Normalizes tokens extracted with {@link StandardTokenizer}. */
|
/** Normalizes tokens extracted with {@link StandardTokenizer}. */
|
||||||
|
|
||||||
|
@ -29,15 +31,54 @@ public final class StandardFilter extends TokenFilter {
|
||||||
/** Construct filtering <i>in</i>. */
|
/** Construct filtering <i>in</i>. */
|
||||||
public StandardFilter(TokenStream in) {
|
public StandardFilter(TokenStream in) {
|
||||||
super(in);
|
super(in);
|
||||||
|
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
|
||||||
|
typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static final String APOSTROPHE_TYPE = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.APOSTROPHE];
|
private static final String APOSTROPHE_TYPE = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.APOSTROPHE];
|
||||||
private static final String ACRONYM_TYPE = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM];
|
private static final String ACRONYM_TYPE = StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM];
|
||||||
|
|
||||||
|
  // this filter uses the term and type attributes
|
||||||
|
private TypeAttribute typeAtt;
|
||||||
|
private TermAttribute termAtt;
|
||||||
|
|
||||||
/** Returns the next token in the stream, or null at EOS.
|
/** Returns the next token in the stream, or null at EOS.
|
||||||
* <p>Removes <tt>'s</tt> from the end of words.
|
* <p>Removes <tt>'s</tt> from the end of words.
|
||||||
* <p>Removes dots from acronyms.
|
* <p>Removes dots from acronyms.
|
||||||
*/
|
*/
|
||||||
|
public final boolean incrementToken() throws java.io.IOException {
|
||||||
|
if (!input.incrementToken()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
char[] buffer = termAtt.termBuffer();
|
||||||
|
final int bufferLength = termAtt.termLength();
|
||||||
|
final String type = typeAtt.type();
|
||||||
|
|
||||||
|
if (type == APOSTROPHE_TYPE && // remove 's
|
||||||
|
bufferLength >= 2 &&
|
||||||
|
buffer[bufferLength-2] == '\'' &&
|
||||||
|
(buffer[bufferLength-1] == 's' || buffer[bufferLength-1] == 'S')) {
|
||||||
|
// Strip last 2 characters off
|
||||||
|
termAtt.setTermLength(bufferLength - 2);
|
||||||
|
} else if (type == ACRONYM_TYPE) { // remove dots
|
||||||
|
int upto = 0;
|
||||||
|
for(int i=0;i<bufferLength;i++) {
|
||||||
|
char c = buffer[i];
|
||||||
|
if (c != '.')
|
||||||
|
buffer[upto++] = c;
|
||||||
|
}
|
||||||
|
termAtt.setTermLength(upto);
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns the next token in the stream, or null at EOS.
|
||||||
|
* <p>Removes <tt>'s</tt> from the end of words.
|
||||||
|
* <p>Removes dots from acronyms.
|
||||||
|
* @deprecated
|
||||||
|
*/
|
||||||
public final Token next(final Token reusableToken) throws java.io.IOException {
|
public final Token next(final Token reusableToken) throws java.io.IOException {
|
||||||
assert reusableToken != null;
|
assert reusableToken != null;
|
||||||
Token nextToken = input.next(reusableToken);
|
Token nextToken = input.next(reusableToken);
|
||||||
|
|
|
@ -22,6 +22,10 @@ import java.io.Reader;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Token;
|
import org.apache.lucene.analysis.Token;
|
||||||
import org.apache.lucene.analysis.Tokenizer;
|
import org.apache.lucene.analysis.Tokenizer;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||||
|
|
||||||
/** A grammar-based tokenizer constructed with JFlex
|
/** A grammar-based tokenizer constructed with JFlex
|
||||||
*
|
*
|
||||||
|
@ -84,7 +88,7 @@ public class StandardTokenizer extends Tokenizer {
|
||||||
*
|
*
|
||||||
* @deprecated this should be removed in the next release (3.0).
|
* @deprecated this should be removed in the next release (3.0).
|
||||||
*/
|
*/
|
||||||
private boolean replaceInvalidAcronym = false;
|
private boolean replaceInvalidAcronym;
|
||||||
|
|
||||||
void setInput(Reader reader) {
|
void setInput(Reader reader) {
|
||||||
this.input = reader;
|
this.input = reader;
|
||||||
|
@ -103,14 +107,13 @@ public class StandardTokenizer extends Tokenizer {
|
||||||
return maxTokenLength;
|
return maxTokenLength;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates a new instance of the {@link StandardTokenizer}. Attaches the
|
* Creates a new instance of the {@link StandardTokenizer}. Attaches the
|
||||||
* <code>input</code> to a newly created JFlex scanner.
|
* <code>input</code> to a newly created JFlex scanner.
|
||||||
*/
|
*/
|
||||||
public StandardTokenizer(Reader input) {
|
public StandardTokenizer(Reader input) {
|
||||||
this.input = input;
|
this(input, false);
|
||||||
this.scanner = new StandardTokenizerImpl(input);
|
}
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates a new instance of the {@link org.apache.lucene.analysis.standard.StandardTokenizer}. Attaches
|
* Creates a new instance of the {@link org.apache.lucene.analysis.standard.StandardTokenizer}. Attaches
|
||||||
|
@ -125,13 +128,68 @@ public class StandardTokenizer extends Tokenizer {
|
||||||
this.replaceInvalidAcronym = replaceInvalidAcronym;
|
this.replaceInvalidAcronym = replaceInvalidAcronym;
|
||||||
this.input = input;
|
this.input = input;
|
||||||
this.scanner = new StandardTokenizerImpl(input);
|
this.scanner = new StandardTokenizerImpl(input);
|
||||||
|
termAtt = (TermAttribute) addAttribute(TermAttribute.class);
|
||||||
|
offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
|
||||||
|
posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
|
||||||
|
typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
  // this tokenizer generates four attributes:
|
||||||
|
  // term, offset, positionIncrement and type
|
||||||
|
private TermAttribute termAtt;
|
||||||
|
private OffsetAttribute offsetAtt;
|
||||||
|
private PositionIncrementAttribute posIncrAtt;
|
||||||
|
private TypeAttribute typeAtt;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* (non-Javadoc)
|
* (non-Javadoc)
|
||||||
*
|
*
|
||||||
* @see org.apache.lucene.analysis.TokenStream#next()
|
* @see org.apache.lucene.analysis.TokenStream#next()
|
||||||
*/
|
*/
|
||||||
|
public boolean incrementToken() throws IOException {
|
||||||
|
int posIncr = 1;
|
||||||
|
|
||||||
|
while(true) {
|
||||||
|
int tokenType = scanner.getNextToken();
|
||||||
|
|
||||||
|
if (tokenType == StandardTokenizerImpl.YYEOF) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (scanner.yylength() <= maxTokenLength) {
|
||||||
|
termAtt.clear();
|
||||||
|
posIncrAtt.setPositionIncrement(posIncr);
|
||||||
|
scanner.getText(termAtt);
|
||||||
|
final int start = scanner.yychar();
|
||||||
|
offsetAtt.setStartOffset(start);
|
||||||
|
offsetAtt.setEndOffset(start+termAtt.termLength());
|
||||||
|
// This 'if' should be removed in the next release. For now, it converts
|
||||||
|
// invalid acronyms to HOST. When removed, only the 'else' part should
|
||||||
|
// remain.
|
||||||
|
if (tokenType == StandardTokenizerImpl.ACRONYM_DEP) {
|
||||||
|
if (replaceInvalidAcronym) {
|
||||||
|
typeAtt.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]);
|
||||||
|
termAtt.setTermLength(termAtt.termLength() - 1); // remove extra '.'
|
||||||
|
} else {
|
||||||
|
typeAtt.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
typeAtt.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
} else
|
||||||
|
// When we skip a too-long term, we still increment the
|
||||||
|
// position increment
|
||||||
|
posIncr++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* (non-Javadoc)
|
||||||
|
*
|
||||||
|
* @see org.apache.lucene.analysis.TokenStream#next()
|
||||||
|
*/
|
||||||
|
/** @deprecated */
|
||||||
public Token next(final Token reusableToken) throws IOException {
|
public Token next(final Token reusableToken) throws IOException {
|
||||||
assert reusableToken != null;
|
assert reusableToken != null;
|
||||||
int posIncr = 1;
|
int posIncr = 1;
|
||||||
|
|
|
@ -30,6 +30,7 @@ NOTE: if you change this file and need to regenerate the tokenizer,
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Token;
|
import org.apache.lucene.analysis.Token;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -368,6 +369,13 @@ final void getText(Token t) {
|
||||||
t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
|
t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Fills TermAttribute with the current token text.
|
||||||
|
*/
|
||||||
|
final void getText(TermAttribute t) {
|
||||||
|
t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates a new scanner
|
* Creates a new scanner
|
||||||
|
|
|
@ -29,6 +29,7 @@ NOTE: if you change StandardTokenizerImpl.jflex and need to regenerate
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import org.apache.lucene.analysis.Token;
|
import org.apache.lucene.analysis.Token;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||||
|
|
||||||
%%
|
%%
|
||||||
|
|
||||||
|
@ -69,6 +70,14 @@ public final int yychar()
|
||||||
final void getText(Token t) {
|
final void getText(Token t) {
|
||||||
t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
|
t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Fills TermAttribute with the current token text.
|
||||||
|
*/
|
||||||
|
final void getText(TermAttribute t) {
|
||||||
|
t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
|
||||||
|
}
|
||||||
|
|
||||||
%}
|
%}
|
||||||
|
|
||||||
THAI = [\u0E00-\u0E59]
|
THAI = [\u0E00-\u0E59]
|
||||||
|
|
|
@ -0,0 +1,86 @@
|
||||||
|
package org.apache.lucene.analysis.tokenattributes;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
|
|
||||||
|
import org.apache.lucene.util.Attribute;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This attribute can be used to pass different flags down the tokenizer chain,
|
||||||
|
 * e.g. from one TokenFilter to another.
|
||||||
|
*
|
||||||
|
* <p><font color="#FF0000">
|
||||||
|
* WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental.
|
||||||
|
* The APIs introduced in these classes with Lucene 2.9 might change in the future.
|
||||||
|
* We will make our best efforts to keep the APIs backwards-compatible.</font>
|
||||||
|
|
||||||
|
*/
|
||||||
|
public class FlagsAttribute extends Attribute implements Cloneable, Serializable {
|
||||||
|
private int flags = 0;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* EXPERIMENTAL: While we think this is here to stay, we may want to change it to be a long.
|
||||||
|
* <p/>
|
||||||
|
*
|
||||||
|
* Get the bitset for any bits that have been set. This is completely distinct from {@link TypeAttribute#type()}, although they do share similar purposes.
|
||||||
|
* The flags can be used to encode information about the token for use by other {@link org.apache.lucene.analysis.TokenFilter}s.
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* @return The bits
|
||||||
|
*/
|
||||||
|
public int getFlags() {
|
||||||
|
return flags;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @see #getFlags()
|
||||||
|
*/
|
||||||
|
public void setFlags(int flags) {
|
||||||
|
this.flags = flags;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void clear() {
|
||||||
|
flags = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String toString() {
|
||||||
|
return "flags=" + flags;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean equals(Object other) {
|
||||||
|
if (this == other) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (other instanceof FlagsAttribute) {
|
||||||
|
return ((FlagsAttribute) other).flags == flags;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int hashCode() {
|
||||||
|
return flags;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void copyTo(Attribute target) {
|
||||||
|
FlagsAttribute t = (FlagsAttribute) target;
|
||||||
|
t.setFlags(flags);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,98 @@
|
||||||
|
package org.apache.lucene.analysis.tokenattributes;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
|
|
||||||
|
import org.apache.lucene.util.Attribute;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The start and end character offset of a Token.
|
||||||
|
*
|
||||||
|
* <p><font color="#FF0000">
|
||||||
|
* WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental.
|
||||||
|
* The APIs introduced in these classes with Lucene 2.9 might change in the future.
|
||||||
|
* We will make our best efforts to keep the APIs backwards-compatible.</font>
|
||||||
|
*/
|
||||||
|
public class OffsetAttribute extends Attribute implements Cloneable, Serializable {
|
||||||
|
private int startOffset;
|
||||||
|
private int endOffset;
|
||||||
|
|
||||||
|
/** Returns this Token's starting offset, the position of the first character
|
||||||
|
corresponding to this token in the source text.
|
||||||
|
|
||||||
|
Note that the difference between endOffset() and startOffset() may not be
|
||||||
|
equal to termText.length(), as the term text may have been altered by a
|
||||||
|
stemmer or some other filter. */
|
||||||
|
public int startOffset() {
|
||||||
|
return startOffset;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Set the starting offset.
|
||||||
|
@see #startOffset() */
|
||||||
|
public void setStartOffset(int offset) {
|
||||||
|
this.startOffset = offset;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns this Token's ending offset, one greater than the position of the
|
||||||
|
last character corresponding to this token in the source text. The length
|
||||||
|
of the token in the source text is (endOffset - startOffset). */
|
||||||
|
public int endOffset() {
|
||||||
|
return endOffset;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Set the ending offset.
|
||||||
|
@see #endOffset() */
|
||||||
|
public void setEndOffset(int offset) {
|
||||||
|
this.endOffset = offset;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void clear() {
|
||||||
|
startOffset = 0;
|
||||||
|
endOffset = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String toString() {
|
||||||
|
return "start=" + startOffset + ",end=" + endOffset;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean equals(Object other) {
|
||||||
|
if (other == this) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (other instanceof OffsetAttribute) {
|
||||||
|
OffsetAttribute o = (OffsetAttribute) other;
|
||||||
|
return o.startOffset == startOffset && o.endOffset == endOffset;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int hashCode() {
|
||||||
|
int code = startOffset;
|
||||||
|
code = code * 31 + endOffset;
|
||||||
|
return code;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void copyTo(Attribute target) {
|
||||||
|
OffsetAttribute t = (OffsetAttribute) target;
|
||||||
|
t.setStartOffset(startOffset);
|
||||||
|
t.setEndOffset(endOffset);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,109 @@
|
||||||
|
package org.apache.lucene.analysis.tokenattributes;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
|
|
||||||
|
import org.apache.lucene.index.Payload;
|
||||||
|
import org.apache.lucene.util.Attribute;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The payload of a Token. See also {@link Payload}.
|
||||||
|
*
|
||||||
|
* <p><font color="#FF0000">
|
||||||
|
* WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental.
|
||||||
|
* The APIs introduced in these classes with Lucene 2.9 might change in the future.
|
||||||
|
* We will make our best efforts to keep the APIs backwards-compatible.</font>
|
||||||
|
*/
|
||||||
|
public class PayloadAttribute extends Attribute implements Cloneable, Serializable {
|
||||||
|
private Payload payload;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Initialize this attribute with no payload.
|
||||||
|
*/
|
||||||
|
public PayloadAttribute() {}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Initialize this attribute with the given payload.
|
||||||
|
*/
|
||||||
|
public PayloadAttribute(Payload payload) {
|
||||||
|
this.payload = payload;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns this Token's payload.
|
||||||
|
*/
|
||||||
|
public Payload getPayload() {
|
||||||
|
return this.payload;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sets this Token's payload.
|
||||||
|
*/
|
||||||
|
public void setPayload(Payload payload) {
|
||||||
|
this.payload = payload;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void clear() {
|
||||||
|
payload = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String toString() {
|
||||||
|
if (payload == null) {
|
||||||
|
return "payload=null";
|
||||||
|
}
|
||||||
|
|
||||||
|
return "payload=" + payload.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
public Object clone() {
|
||||||
|
PayloadAttribute clone = (PayloadAttribute) super.clone();
|
||||||
|
if (payload != null) {
|
||||||
|
clone.payload = (Payload) payload.clone();
|
||||||
|
}
|
||||||
|
return clone;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean equals(Object other) {
|
||||||
|
if (other == this) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (other instanceof PayloadAttribute) {
|
||||||
|
PayloadAttribute o = (PayloadAttribute) other;
|
||||||
|
if (o.payload == null || payload == null) {
|
||||||
|
return o.payload == null && payload == null;
|
||||||
|
}
|
||||||
|
|
||||||
|
return o.payload.equals(payload);
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int hashCode() {
|
||||||
|
return (payload == null) ? 0 : payload.hashCode();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void copyTo(Attribute target) {
|
||||||
|
PayloadAttribute t = (PayloadAttribute) target;
|
||||||
|
t.setPayload((payload == null) ? null : (Payload) payload.clone());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,106 @@
|
||||||
|
package org.apache.lucene.analysis.tokenattributes;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
|
|
||||||
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.util.Attribute;
|
||||||
|
|
||||||
|
/** The positionIncrement determines the position of this token
|
||||||
|
* relative to the previous Token in a {@link TokenStream}, used in phrase
|
||||||
|
* searching.
|
||||||
|
*
|
||||||
|
* <p>The default value is one.
|
||||||
|
*
|
||||||
|
* <p>Some common uses for this are:<ul>
|
||||||
|
*
|
||||||
|
* <li>Set it to zero to put multiple terms in the same position. This is
|
||||||
|
* useful if, e.g., a word has multiple stems. Searches for phrases
|
||||||
|
* including either stem will match. In this case, all but the first stem's
|
||||||
|
* increment should be set to zero: the increment of the first instance
|
||||||
|
* should be one. Repeating a token with an increment of zero can also be
|
||||||
|
* used to boost the scores of matches on that token.
|
||||||
|
*
|
||||||
|
* <li>Set it to values greater than one to inhibit exact phrase matches.
|
||||||
|
* If, for example, one does not want phrases to match across removed stop
|
||||||
|
* words, then one could build a stop word filter that removes stop words and
|
||||||
|
* also sets the increment to the number of stop words removed before each
|
||||||
|
* non-stop word. Then exact phrase queries will only match when the terms
|
||||||
|
* occur with no intervening stop words.
|
||||||
|
*
|
||||||
|
* </ul>
|
||||||
|
*
|
||||||
|
* <p><font color="#FF0000">
|
||||||
|
* WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental.
|
||||||
|
* The APIs introduced in these classes with Lucene 2.9 might change in the future.
|
||||||
|
* We will make our best efforts to keep the APIs backwards-compatible.</font>
|
||||||
|
*
|
||||||
|
* @see org.apache.lucene.index.TermPositions
|
||||||
|
*/
|
||||||
|
public class PositionIncrementAttribute extends Attribute implements Cloneable, Serializable {
|
||||||
|
private int positionIncrement = 1;
|
||||||
|
|
||||||
|
/** Set the position increment. The default value is one.
|
||||||
|
*
|
||||||
|
* @param positionIncrement the distance from the prior term
|
||||||
|
*/
|
||||||
|
public void setPositionIncrement(int positionIncrement) {
|
||||||
|
if (positionIncrement < 0)
|
||||||
|
throw new IllegalArgumentException
|
||||||
|
("Increment must be zero or greater: " + positionIncrement);
|
||||||
|
this.positionIncrement = positionIncrement;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns the position increment of this Token.
|
||||||
|
* @see #setPositionIncrement
|
||||||
|
*/
|
||||||
|
public int getPositionIncrement() {
|
||||||
|
return positionIncrement;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void clear() {
|
||||||
|
this.positionIncrement = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String toString() {
|
||||||
|
return "positionIncrement=" + positionIncrement;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean equals(Object other) {
|
||||||
|
if (other == this) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (other instanceof PositionIncrementAttribute) {
|
||||||
|
return positionIncrement == ((PositionIncrementAttribute) other).positionIncrement;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int hashCode() {
|
||||||
|
return positionIncrement;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void copyTo(Attribute target) {
|
||||||
|
PositionIncrementAttribute t = (PositionIncrementAttribute) target;
|
||||||
|
t.setPositionIncrement(positionIncrement);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,242 @@
|
||||||
|
package org.apache.lucene.analysis.tokenattributes;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
|
|
||||||
|
import org.apache.lucene.util.ArrayUtil;
|
||||||
|
import org.apache.lucene.util.Attribute;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The term text of a Token.
|
||||||
|
*
|
||||||
|
* <p><font color="#FF0000">
|
||||||
|
* WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental.
|
||||||
|
* The APIs introduced in these classes with Lucene 2.9 might change in the future.
|
||||||
|
* We will make our best efforts to keep the APIs backwards-compatible.</font>
|
||||||
|
*/
|
||||||
|
public class TermAttribute extends Attribute implements Cloneable, Serializable {
|
||||||
|
private static int MIN_BUFFER_SIZE = 10;
|
||||||
|
|
||||||
|
private char[] termBuffer;
|
||||||
|
private int termLength;
|
||||||
|
|
||||||
|
/** Returns the Token's term text.
|
||||||
|
*
|
||||||
|
* This method has a performance penalty
|
||||||
|
* because the text is stored internally in a char[]. If
|
||||||
|
* possible, use {@link #termBuffer()} and {@link
|
||||||
|
* #termLength()} directly instead. If you really need a
|
||||||
|
* String, use this method, which is nothing more than
|
||||||
|
* a convenience call to <b>new String(token.termBuffer(), 0, token.termLength())</b>
|
||||||
|
*/
|
||||||
|
public String term() {
|
||||||
|
initTermBuffer();
|
||||||
|
return new String(termBuffer, 0, termLength);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Copies the contents of buffer, starting at offset for
|
||||||
|
* length characters, into the termBuffer array.
|
||||||
|
* @param buffer the buffer to copy
|
||||||
|
* @param offset the index in the buffer of the first character to copy
|
||||||
|
* @param length the number of characters to copy
|
||||||
|
*/
|
||||||
|
public void setTermBuffer(char[] buffer, int offset, int length) {
|
||||||
|
char[] newCharBuffer = growTermBuffer(length);
|
||||||
|
if (newCharBuffer != null) {
|
||||||
|
termBuffer = newCharBuffer;
|
||||||
|
}
|
||||||
|
System.arraycopy(buffer, offset, termBuffer, 0, length);
|
||||||
|
termLength = length;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Copies the contents of buffer into the termBuffer array.
|
||||||
|
* @param buffer the buffer to copy
|
||||||
|
*/
|
||||||
|
public void setTermBuffer(String buffer) {
|
||||||
|
int length = buffer.length();
|
||||||
|
char[] newCharBuffer = growTermBuffer(length);
|
||||||
|
if (newCharBuffer != null) {
|
||||||
|
termBuffer = newCharBuffer;
|
||||||
|
}
|
||||||
|
buffer.getChars(0, length, termBuffer, 0);
|
||||||
|
termLength = length;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Copies the contents of buffer, starting at offset and continuing
|
||||||
|
* for length characters, into the termBuffer array.
|
||||||
|
* @param buffer the buffer to copy
|
||||||
|
* @param offset the index in the buffer of the first character to copy
|
||||||
|
* @param length the number of characters to copy
|
||||||
|
*/
|
||||||
|
public void setTermBuffer(String buffer, int offset, int length) {
|
||||||
|
assert offset <= buffer.length();
|
||||||
|
assert offset + length <= buffer.length();
|
||||||
|
char[] newCharBuffer = growTermBuffer(length);
|
||||||
|
if (newCharBuffer != null) {
|
||||||
|
termBuffer = newCharBuffer;
|
||||||
|
}
|
||||||
|
buffer.getChars(offset, offset + length, termBuffer, 0);
|
||||||
|
termLength = length;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns the internal termBuffer character array which
|
||||||
|
* you can then directly alter. If the array is too
|
||||||
|
* small for your token, use {@link
|
||||||
|
* #resizeTermBuffer(int)} to increase it. After
|
||||||
|
* altering the buffer be sure to call {@link
|
||||||
|
* #setTermLength} to record the number of valid
|
||||||
|
* characters that were placed into the termBuffer. */
|
||||||
|
public char[] termBuffer() {
|
||||||
|
initTermBuffer();
|
||||||
|
return termBuffer;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Grows the termBuffer to at least size newSize, preserving the
|
||||||
|
* existing content. Note: If the next operation is to change
|
||||||
|
* the contents of the term buffer use
|
||||||
|
* {@link #setTermBuffer(char[], int, int)},
|
||||||
|
* {@link #setTermBuffer(String)}, or
|
||||||
|
* {@link #setTermBuffer(String, int, int)}
|
||||||
|
* to optimally combine the resize with the setting of the termBuffer.
|
||||||
|
* @param newSize minimum size of the new termBuffer
|
||||||
|
* @return newly created termBuffer with length >= newSize
|
||||||
|
*/
|
||||||
|
public char[] resizeTermBuffer(int newSize) {
|
||||||
|
char[] newCharBuffer = growTermBuffer(newSize);
|
||||||
|
if (termBuffer == null) {
|
||||||
|
      // there is no existing content to preserve
|
||||||
|
// note that if termBuffer is null then newCharBuffer cannot be null
|
||||||
|
assert newCharBuffer != null;
|
||||||
|
termBuffer = newCharBuffer;
|
||||||
|
} else if (newCharBuffer != null) {
|
||||||
|
// Note: if newCharBuffer != null then termBuffer needs to grow.
|
||||||
|
// If there were a termBuffer, then preserve it
|
||||||
|
System.arraycopy(termBuffer, 0, newCharBuffer, 0, termBuffer.length);
|
||||||
|
termBuffer = newCharBuffer;
|
||||||
|
}
|
||||||
|
return termBuffer;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Allocates a buffer char[] of at least newSize
|
||||||
|
* @param newSize minimum size of the buffer
|
||||||
|
* @return newly created buffer with length >= newSize or null if the current termBuffer is big enough
|
||||||
|
*/
|
||||||
|
private char[] growTermBuffer(int newSize) {
|
||||||
|
if (termBuffer != null) {
|
||||||
|
if (termBuffer.length >= newSize)
|
||||||
|
// Already big enough
|
||||||
|
return null;
|
||||||
|
else
|
||||||
|
// Not big enough; create a new array with slight
|
||||||
|
// over allocation:
|
||||||
|
return new char[ArrayUtil.getNextSize(newSize)];
|
||||||
|
} else {
|
||||||
|
|
||||||
|
// determine the best size
|
||||||
|
// The buffer is always at least MIN_BUFFER_SIZE
|
||||||
|
if (newSize < MIN_BUFFER_SIZE) {
|
||||||
|
newSize = MIN_BUFFER_SIZE;
|
||||||
|
}
|
||||||
|
|
||||||
|
return new char[newSize];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: once we remove the deprecated termText() method
|
||||||
|
// and switch entirely to char[] termBuffer we don't need
|
||||||
|
// to use this method anymore
|
||||||
|
private void initTermBuffer() {
|
||||||
|
if (termBuffer == null) {
|
||||||
|
termBuffer = new char[MIN_BUFFER_SIZE];
|
||||||
|
termLength = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Return number of valid characters (length of the term)
|
||||||
|
* in the termBuffer array. */
|
||||||
|
public int termLength() {
|
||||||
|
initTermBuffer();
|
||||||
|
return termLength;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Set number of valid characters (length of the term) in
|
||||||
|
* the termBuffer array. Use this to truncate the termBuffer
|
||||||
|
* or to synchronize with external manipulation of the termBuffer.
|
||||||
|
* Note: to grow the size of the array,
|
||||||
|
* use {@link #resizeTermBuffer(int)} first.
|
||||||
|
* @param length the truncated length
|
||||||
|
*/
|
||||||
|
public void setTermLength(int length) {
|
||||||
|
initTermBuffer();
|
||||||
|
if (length > termBuffer.length)
|
||||||
|
throw new IllegalArgumentException("length " + length + " exceeds the size of the termBuffer (" + termBuffer.length + ")");
|
||||||
|
termLength = length;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int hashCode() {
|
||||||
|
initTermBuffer();
|
||||||
|
int code = termLength;
|
||||||
|
code = code * 31 + ArrayUtil.hashCode(termBuffer, 0, termLength);
|
||||||
|
return code;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void clear() {
|
||||||
|
termLength = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Object clone() {
|
||||||
|
TermAttribute t = (TermAttribute)super.clone();
|
||||||
|
// Do a deep clone
|
||||||
|
if (termBuffer != null) {
|
||||||
|
t.termBuffer = (char[]) termBuffer.clone();
|
||||||
|
}
|
||||||
|
return t;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean equals(Object other) {
|
||||||
|
if (other == this) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (other instanceof TermAttribute) {
|
||||||
|
initTermBuffer();
|
||||||
|
TermAttribute o = ((TermAttribute) other);
|
||||||
|
o.initTermBuffer();
|
||||||
|
|
||||||
|
for(int i=0;i<termLength;i++) {
|
||||||
|
if (termBuffer[i] != o.termBuffer[i]) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String toString() {
|
||||||
|
initTermBuffer();
|
||||||
|
return "term=" + new String(termBuffer, 0, termLength);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void copyTo(Attribute target) {
|
||||||
|
TermAttribute t = (TermAttribute) target;
|
||||||
|
t.setTermBuffer(termBuffer, 0, termLength);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,83 @@
|
||||||
|
package org.apache.lucene.analysis.tokenattributes;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
|
|
||||||
|
import org.apache.lucene.util.Attribute;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A Token's lexical type. The Default value is "word".
|
||||||
|
*
|
||||||
|
* <p><font color="#FF0000">
|
||||||
|
* WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental.
|
||||||
|
* The APIs introduced in these classes with Lucene 2.9 might change in the future.
|
||||||
|
* We will make our best efforts to keep the APIs backwards-compatible.</font>
|
||||||
|
*/
|
||||||
|
public class TypeAttribute extends Attribute implements Cloneable, Serializable {
|
||||||
|
private String type;
|
||||||
|
public static final String DEFAULT_TYPE = "word";
|
||||||
|
|
||||||
|
public TypeAttribute() {
|
||||||
|
this(DEFAULT_TYPE);
|
||||||
|
}
|
||||||
|
|
||||||
|
public TypeAttribute(String type) {
|
||||||
|
this.type = type;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns this Token's lexical type. Defaults to "word". */
|
||||||
|
public String type() {
|
||||||
|
return type;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Set the lexical type.
|
||||||
|
@see #type() */
|
||||||
|
public void setType(String type) {
|
||||||
|
this.type = type;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void clear() {
|
||||||
|
type = DEFAULT_TYPE;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String toString() {
|
||||||
|
return "type=" + type;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean equals(Object other) {
|
||||||
|
if (other == this) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (other instanceof TypeAttribute) {
|
||||||
|
return type.equals(((TypeAttribute) other).type);
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int hashCode() {
|
||||||
|
return type.hashCode();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void copyTo(Attribute target) {
|
||||||
|
TypeAttribute t = (TypeAttribute) target;
|
||||||
|
t.setType(new String(type));
|
||||||
|
}
|
||||||
|
}
|
|
@ -17,12 +17,14 @@ package org.apache.lucene.index;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import java.util.Map;
|
import java.io.IOException;
|
||||||
|
import java.util.Collection;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.Collection;
|
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
import java.io.IOException;
|
import java.util.Map;
|
||||||
|
|
||||||
|
import org.apache.lucene.util.AttributeSource;
|
||||||
|
|
||||||
/** This is a DocFieldConsumer that inverts each field,
|
/** This is a DocFieldConsumer that inverts each field,
|
||||||
* separately, from a Document, and accepts a
|
* separately, from a Document, and accepts a
|
||||||
|
|
|
@ -22,6 +22,8 @@ import java.io.Reader;
|
||||||
import org.apache.lucene.document.Fieldable;
|
import org.apache.lucene.document.Fieldable;
|
||||||
import org.apache.lucene.analysis.Token;
|
import org.apache.lucene.analysis.Token;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Holds state for inverting all occurrences of a single
|
* Holds state for inverting all occurrences of a single
|
||||||
|
@ -79,10 +81,14 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
|
||||||
if (!field.isTokenized()) { // un-tokenized field
|
if (!field.isTokenized()) { // un-tokenized field
|
||||||
String stringValue = field.stringValue();
|
String stringValue = field.stringValue();
|
||||||
final int valueLength = stringValue.length();
|
final int valueLength = stringValue.length();
|
||||||
Token token = perThread.localToken.reinit(stringValue, 0, valueLength);
|
perThread.singleTokenTokenStream.reinit(stringValue, 0, valueLength);
|
||||||
|
fieldState.attributeSource = perThread.singleTokenTokenStream;
|
||||||
|
+          perThread.localTokenStream.reset();
+          consumer.start(field);
+
           boolean success = false;
           try {
-            consumer.add(token);
+            consumer.add();
             success = true;
           } finally {
             if (!success)

@@ -122,7 +128,22 @@ final class DocInverterPerField extends DocFieldConsumerPerField {

       try {
         int offsetEnd = fieldState.offset-1;
-        final Token localToken = perThread.localToken;
+        boolean useNewTokenStreamAPI = stream.useNewAPI();
+        Token localToken = null;
+
+        if (useNewTokenStreamAPI) {
+          fieldState.attributeSource = stream;
+        } else {
+          fieldState.attributeSource = perThread.localTokenStream;
+          localToken = perThread.localToken;
+        }
+
+        consumer.start(field);
+
+        OffsetAttribute offsetAttribute = (OffsetAttribute) fieldState.attributeSource.addAttribute(OffsetAttribute.class);
+        PositionIncrementAttribute posIncrAttribute = (PositionIncrementAttribute) fieldState.attributeSource.addAttribute(PositionIncrementAttribute.class);
+
         for(;;) {

           // If we hit an exception in stream.next below
@@ -131,10 +152,16 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
           // non-aborting and (above) this one document
           // will be marked as deleted, but still
           // consume a docID
-          Token token = stream.next(localToken);
-          if (token == null) break;
-          final int posIncr = token.getPositionIncrement();
+          Token token = null;
+          if (useNewTokenStreamAPI) {
+            if (!stream.incrementToken()) break;
+          } else {
+            token = stream.next(localToken);
+            if (token == null) break;
+            perThread.localTokenStream.set(token);
+          }
+
+          final int posIncr = posIncrAttribute.getPositionIncrement();
           fieldState.position += posIncr - 1;
           if (posIncr == 0)
             fieldState.numOverlap++;
@@ -147,14 +174,14 @@ final class DocInverterPerField extends DocFieldConsumerPerField {
               // internal state of the consumer is now
              // corrupt and should not be flushed to a
               // new segment:
-              consumer.add(token);
+              consumer.add();
               success = true;
             } finally {
               if (!success)
                 docState.docWriter.setAborting();
             }
             fieldState.position++;
-            offsetEnd = fieldState.offset + token.endOffset();
+            offsetEnd = fieldState.offset + offsetAttribute.endOffset();
             if (++fieldState.length >= maxFieldLength) {
               if (docState.infoStream != null)
                 docState.infoStream.println("maxFieldLength " +maxFieldLength+ " reached for field " + fieldInfo.name + ", ignoring following tokens");
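For orientation, the inverter above now follows the new consuming pattern: fetch the Attribute instances once, then call incrementToken() in a loop and read the per-token values from those instances. Below is a minimal stand-alone sketch of that pattern; the analyzer, field name and text are placeholders and not part of this commit, and it assumes the analyzer's streams already support the new API.

    import java.io.StringReader;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
    import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class NewApiConsumerSketch {
      public static void main(String[] args) throws Exception {
        Analyzer analyzer = new WhitespaceAnalyzer();   // placeholder analyzer
        TokenStream stream = analyzer.tokenStream("body", new StringReader("some field text"));

        // Fetch the attributes once; incrementToken() updates them in place for every token.
        TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
        OffsetAttribute offsetAtt = (OffsetAttribute) stream.addAttribute(OffsetAttribute.class);
        PositionIncrementAttribute posIncrAtt =
            (PositionIncrementAttribute) stream.addAttribute(PositionIncrementAttribute.class);

        int position = 0;
        while (stream.incrementToken()) {
          position += posIncrAtt.getPositionIncrement();
          System.out.println(position + ": " + termAtt.term()
              + " [" + offsetAtt.startOffset() + "-" + offsetAtt.endOffset() + "]");
        }
        stream.close();
      }
    }
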
@@ -20,6 +20,14 @@ package org.apache.lucene.index;
 import java.io.IOException;

 import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.Attribute;

 /** This is a DocFieldConsumer that inverts each field,
  *  separately, from a Document, and accepts a
@@ -30,6 +38,94 @@ final class DocInverterPerThread extends DocFieldConsumerPerThread {
   final InvertedDocConsumerPerThread consumer;
   final InvertedDocEndConsumerPerThread endConsumer;
   final Token localToken = new Token();
+  //TODO: change to SingleTokenTokenStream after Token was removed
+  final SingleTokenTokenStream singleTokenTokenStream = new SingleTokenTokenStream();
+  final BackwardsCompatibilityStream localTokenStream = new BackwardsCompatibilityStream();
+
+  static class SingleTokenTokenStream extends TokenStream {
+    TermAttribute termAttribute;
+    OffsetAttribute offsetAttribute;
+
+    SingleTokenTokenStream() {
+      termAttribute = (TermAttribute) addAttribute(TermAttribute.class);
+      offsetAttribute = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+    }
+
+    public void reinit(String stringValue, int startOffset, int endOffset) {
+      termAttribute.setTermBuffer(stringValue);
+      offsetAttribute.setStartOffset(startOffset);
+      offsetAttribute.setEndOffset(endOffset);
+    }
+  }
+
+  /** This stream wrapper is only used to maintain backwards compatibility with the
+   *  old TokenStream API and can be removed in Lucene 3.0
+   * @deprecated
+   */
+  static class BackwardsCompatibilityStream extends TokenStream {
+    private Token token;
+
+    TermAttribute termAttribute = new TermAttribute() {
+      public String term() {
+        return token.term();
+      }
+
+      public char[] termBuffer() {
+        return token.termBuffer();
+      }
+
+      public int termLength() {
+        return token.termLength();
+      }
+    };
+    OffsetAttribute offsetAttribute = new OffsetAttribute() {
+      public int startOffset() {
+        return token.startOffset();
+      }
+
+      public int endOffset() {
+        return token.endOffset();
+      }
+    };
+
+    PositionIncrementAttribute positionIncrementAttribute = new PositionIncrementAttribute() {
+      public int getPositionIncrement() {
+        return token.getPositionIncrement();
+      }
+    };
+
+    FlagsAttribute flagsAttribute = new FlagsAttribute() {
+      public int getFlags() {
+        return token.getFlags();
+      }
+    };
+
+    PayloadAttribute payloadAttribute = new PayloadAttribute() {
+      public Payload getPayload() {
+        return token.getPayload();
+      }
+    };
+
+    TypeAttribute typeAttribute = new TypeAttribute() {
+      public String type() {
+        return token.type();
+      }
+    };
+
+    BackwardsCompatibilityStream() {
+      attributes.put(TermAttribute.class, termAttribute);
+      attributes.put(OffsetAttribute.class, offsetAttribute);
+      attributes.put(PositionIncrementAttribute.class, positionIncrementAttribute);
+      attributes.put(FlagsAttribute.class, flagsAttribute);
+      attributes.put(PayloadAttribute.class, payloadAttribute);
+      attributes.put(TypeAttribute.class, typeAttribute);
+    }
+
+    public void set(Token token) {
+      this.token = token;
+    }
+  };
+
   final DocumentsWriter.DocState docState;

   final FieldInvertState fieldState = new FieldInvertState();
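SingleTokenTokenStream above also shows the producer side of the new API: a stream registers the attributes it will fill in its constructor and then writes into them. A stand-alone sketch of a stream that emits exactly one term; the class name and values are illustrative only and not part of the commit.

    import java.io.IOException;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    // Emits a single token and then signals exhaustion, mirroring SingleTokenTokenStream above.
    public class OneTermStream extends TokenStream {
      private final TermAttribute termAtt;
      private final OffsetAttribute offsetAtt;
      private boolean done = false;

      public OneTermStream(String term) {
        termAtt = (TermAttribute) addAttribute(TermAttribute.class);
        offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
        termAtt.setTermBuffer(term);
        offsetAtt.setStartOffset(0);
        offsetAtt.setEndOffset(term.length());
      }

      public boolean incrementToken() throws IOException {
        if (done) {
          return false;   // no more tokens
        }
        done = true;      // the attributes already hold the single token's values
        return true;
      }
    }
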
@@ -17,6 +17,7 @@
 package org.apache.lucene.index;

 import org.apache.lucene.search.Similarity;
+import org.apache.lucene.util.AttributeSource;

 /**
  * This class tracks the number and position / offset parameters of terms
@@ -32,6 +33,7 @@ public final class FieldInvertState {
   int numOverlap;
   int offset;
   float boost;
+  AttributeSource attributeSource;

   public FieldInvertState() {
   }
@@ -54,6 +56,7 @@ public final class FieldInvertState {
     numOverlap = 0;
     offset = 0;
     boost = docBoost;
+    attributeSource = null;
   }

   /**
@@ -97,4 +100,8 @@ public final class FieldInvertState {
   public float getBoost() {
     return boost;
   }
+
+  public AttributeSource getAttributeSource() {
+    return attributeSource;
+  }
 }
@@ -19,7 +19,7 @@ package org.apache.lucene.index;

 import java.io.IOException;
 import org.apache.lucene.document.Fieldable;
-import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;

 // TODO: break into separate freq and prox writers as
 // codecs; make separate container (tii/tis/skip/*) that can
@@ -32,6 +32,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
   final DocumentsWriter.DocState docState;
   final FieldInvertState fieldState;
   boolean omitTf;
+  PayloadAttribute payloadAttribute;

   public FreqProxTermsWriterPerField(TermsHashPerField termsHashPerField, FreqProxTermsWriterPerThread perThread, FieldInfo fieldInfo) {
     this.termsHashPerField = termsHashPerField;
@@ -53,7 +54,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem

   boolean hasPayloads;

-  void skippingLongTerm(Token t) throws IOException {}
+  void skippingLongTerm() throws IOException {}

   public int compareTo(Object other0) {
     FreqProxTermsWriterPerField other = (FreqProxTermsWriterPerField) other0;
@@ -64,6 +65,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
     // Record, up front, whether our in-RAM format will be
     // with or without term freqs:
     omitTf = fieldInfo.omitTf;
+    payloadAttribute = null;
   }

   boolean start(Fieldable[] fields, int count) {
@@ -72,9 +74,23 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
       return true;
     return false;
   }

-  final void writeProx(Token t, FreqProxTermsWriter.PostingList p, int proxCode) {
-    final Payload payload = t.getPayload();
+  void start(Fieldable f) {
+    if (fieldState.attributeSource.hasAttribute(PayloadAttribute.class)) {
+      payloadAttribute = (PayloadAttribute) fieldState.attributeSource.getAttribute(PayloadAttribute.class);
+    } else {
+      payloadAttribute = null;
+    }
+  }
+
+  final void writeProx(FreqProxTermsWriter.PostingList p, int proxCode) {
+    final Payload payload;
+    if (payloadAttribute == null) {
+      payload = null;
+    } else {
+      payload = payloadAttribute.getPayload();
+    }
+
     if (payload != null && payload.length > 0) {
       termsHashPerField.writeVInt(1, (proxCode<<1)|1);
       termsHashPerField.writeVInt(1, payload.length);
@@ -85,7 +101,7 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
     p.lastPosition = fieldState.position;
   }

-  final void newTerm(Token t, RawPostingList p0) {
+  final void newTerm(RawPostingList p0) {
     // First time we're seeing this term since the last
     // flush
     assert docState.testPoint("FreqProxTermsWriterPerField.newTerm start");
@@ -96,11 +112,11 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
     } else {
       p.lastDocCode = docState.docID << 1;
       p.docFreq = 1;
-      writeProx(t, p, fieldState.position);
+      writeProx(p, fieldState.position);
     }
   }

-  final void addTerm(Token t, RawPostingList p0) {
+  final void addTerm(RawPostingList p0) {

     assert docState.testPoint("FreqProxTermsWriterPerField.addTerm start");

@@ -132,10 +148,10 @@ final class FreqProxTermsWriterPerField extends TermsHashConsumerPerField implem
       p.docFreq = 1;
       p.lastDocCode = (docState.docID - p.lastDocID) << 1;
       p.lastDocID = docState.docID;
-      writeProx(t, p, fieldState.position);
+      writeProx(p, fieldState.position);
     } else {
       p.docFreq++;
-      writeProx(t, p, fieldState.position-p.lastPosition);
+      writeProx(p, fieldState.position-p.lastPosition);
     }
   }
 }
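The per-field start(Fieldable) hook above only binds a PayloadAttribute when the producing stream actually registered one. The same optional-attribute lookup works for any consumer; a small sketch under that assumption (the stream parameter is a placeholder):

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
    import org.apache.lucene.index.Payload;

    class PayloadLookupSketch {
      // Returns the stream's PayloadAttribute, or null if the producer never registered one.
      static PayloadAttribute payloadAttributeOf(TokenStream stream) {
        if (stream.hasAttribute(PayloadAttribute.class)) {
          return (PayloadAttribute) stream.getAttribute(PayloadAttribute.class);
        }
        return null;
      }

      // Call after a successful incrementToken(); may still return null for tokens without a payload.
      static Payload currentPayload(PayloadAttribute att) {
        return att == null ? null : att.getPayload();
      }
    }
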
@@ -17,10 +17,10 @@ package org.apache.lucene.index;
  * limitations under the License.
  */

-import org.apache.lucene.document.Fieldable;
-import org.apache.lucene.analysis.Token;
 import java.io.IOException;

+import org.apache.lucene.document.Fieldable;

 abstract class InvertedDocConsumerPerField {

   // Called once per field, and is given all Fieldable
@@ -29,8 +29,11 @@ abstract class InvertedDocConsumerPerField {
   // fields:
   abstract boolean start(Fieldable[] fields, int count) throws IOException;

+  // Called before a field instance is being processed
+  abstract void start(Fieldable field);
+
   // Called once per inverted token
-  abstract void add(Token token) throws IOException;
+  abstract void add() throws IOException;

   // Called once per field per document, after all Fieldable
   // occurrences are inverted
@@ -19,7 +19,6 @@ package org.apache.lucene.index;

 import java.io.Serializable;

-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.util.ArrayUtil;

@@ -29,7 +28,7 @@ import org.apache.lucene.util.ArrayUtil;
  * specific term.
  * <p>
  * To store payloads in the index a {@link TokenStream} has to be used that
- * produces {@link Token}s containing payload data.
+ * produces payload data.
  * <p>
  * Use {@link TermPositions#getPayloadLength()} and {@link TermPositions#getPayload(byte[], int)}
  * to retrieve the payloads from the index.<br>
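The javadoc above points at TermPositions for reading payloads back out of the index. A brief usage sketch of that retrieval side; the directory, field and term values are placeholders, and error handling is trimmed.

    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.index.TermPositions;
    import org.apache.lucene.store.Directory;

    class PayloadRetrievalSketch {
      static void dumpPayloads(Directory dir) throws Exception {
        IndexReader reader = IndexReader.open(dir);
        TermPositions tp = reader.termPositions(new Term("body", "lucene"));
        while (tp.next()) {                         // one iteration per matching document
          for (int i = 0; i < tp.freq(); i++) {
            tp.nextPosition();
            if (tp.isPayloadAvailable()) {
              byte[] data = tp.getPayload(new byte[tp.getPayloadLength()], 0);
              // data now holds the payload stored at this position
            }
          }
        }
        tp.close();
        reader.close();
      }
    }
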
@@ -18,10 +18,11 @@ package org.apache.lucene.index;
 */

 import java.io.IOException;
-import org.apache.lucene.util.UnicodeUtil;
-import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.document.Fieldable;
 import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.UnicodeUtil;

 final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField {

@@ -37,7 +38,8 @@ final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField {
   boolean doVectorOffsets;

   int maxNumPostings;
+  OffsetAttribute offsetAttribute = null;

   public TermVectorsTermsWriterPerField(TermsHashPerField termsHashPerField, TermVectorsTermsWriterPerThread perThread, FieldInfo fieldInfo) {
     this.termsHashPerField = termsHashPerField;
     this.perThread = perThread;
@@ -191,8 +193,16 @@ final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField {
     termsHashPerField.shrinkHash(maxNumPostings);
     maxNumPostings = 0;
   }

-  void newTerm(Token t, RawPostingList p0) {
+  void start(Fieldable f) {
+    if (doVectorOffsets && fieldState.attributeSource.hasAttribute(OffsetAttribute.class)) {
+      offsetAttribute = (OffsetAttribute) fieldState.attributeSource.getAttribute(OffsetAttribute.class);
+    } else {
+      offsetAttribute = null;
+    }
+  }
+
+  void newTerm(RawPostingList p0) {

     assert docState.testPoint("TermVectorsTermsWriterPerField.newTerm start");

@@ -201,8 +211,9 @@ final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField {
     p.freq = 1;

     if (doVectorOffsets) {
-      final int startOffset = fieldState.offset + t.startOffset();
-      final int endOffset = fieldState.offset + t.endOffset();
+      int startOffset = fieldState.offset + offsetAttribute.startOffset();;
+      int endOffset = fieldState.offset + offsetAttribute.endOffset();

       termsHashPerField.writeVInt(1, startOffset);
       termsHashPerField.writeVInt(1, endOffset - startOffset);
       p.lastOffset = endOffset;
@@ -214,7 +225,7 @@ final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField {
     }
   }

-  void addTerm(Token t, RawPostingList p0) {
+  void addTerm(RawPostingList p0) {

     assert docState.testPoint("TermVectorsTermsWriterPerField.addTerm start");

@@ -222,8 +233,9 @@ final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField {
     p.freq++;

     if (doVectorOffsets) {
-      final int startOffset = fieldState.offset + t.startOffset();
-      final int endOffset = fieldState.offset + t.endOffset();
+      int startOffset = fieldState.offset + offsetAttribute.startOffset();;
+      int endOffset = fieldState.offset + offsetAttribute.endOffset();

       termsHashPerField.writeVInt(1, startOffset - p.lastOffset);
       termsHashPerField.writeVInt(1, endOffset - startOffset);
       p.lastOffset = endOffset;
@@ -235,5 +247,5 @@ final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField {
     }
   }

-  void skippingLongTerm(Token t) {}
+  void skippingLongTerm() {}
 }
@@ -23,14 +23,15 @@ package org.apache.lucene.index;
 *  multiple streams for each unique Token. */

 import java.io.IOException;

 import org.apache.lucene.document.Fieldable;
-import org.apache.lucene.analysis.Token;

 abstract class TermsHashConsumerPerField {
   abstract boolean start(Fieldable[] fields, int count) throws IOException;
   abstract void finish() throws IOException;
-  abstract void skippingLongTerm(Token t) throws IOException;
-  abstract void newTerm(Token t, RawPostingList p) throws IOException;
-  abstract void addTerm(Token t, RawPostingList p) throws IOException;
+  abstract void skippingLongTerm() throws IOException;
+  abstract void start(Fieldable field);
+  abstract void newTerm(RawPostingList p) throws IOException;
+  abstract void addTerm(RawPostingList p) throws IOException;
   abstract int getStreamCount();
 }
@@ -20,8 +20,8 @@ package org.apache.lucene.index;
 import java.io.IOException;
 import java.util.Arrays;

+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.document.Fieldable;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.util.UnicodeUtil;

 final class TermsHashPerField extends InvertedDocConsumerPerField {
@@ -31,7 +31,8 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
   final TermsHashPerThread perThread;
   final DocumentsWriter.DocState docState;
   final FieldInvertState fieldState;
+  TermAttribute termAtt;

   // Copied from our perThread
   final CharBlockPool charPool;
   final IntBlockPool intPool;
@@ -49,7 +50,7 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
   private int postingsHashMask = postingsHashSize-1;
   private RawPostingList[] postingsHash = new RawPostingList[postingsHashSize];
   private RawPostingList p;

   public TermsHashPerField(DocInverterPerField docInverterPerField, final TermsHashPerThread perThread, final TermsHashPerThread nextPerThread, final FieldInfo fieldInfo) {
     this.perThread = perThread;
     intPool = perThread.intPool;
@@ -247,6 +248,14 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
   private boolean doCall;
   private boolean doNextCall;

+  void start(Fieldable f) {
+    termAtt = (TermAttribute) fieldState.attributeSource.getAttribute(TermAttribute.class);
+    consumer.start(f);
+    if (nextPerField != null) {
+      nextPerField.start(f);
+    }
+  }
+
   boolean start(Fieldable[] fields, int count) throws IOException {
     doCall = consumer.start(fields, count);
     if (nextPerField != null)
@@ -257,7 +266,7 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
   // Secondary entry point (for 2nd & subsequent TermsHash),
   // because token text has already been "interned" into
   // textStart, so we hash by textStart
-  public void add(Token token, int textStart) throws IOException {
+  public void add(int textStart) throws IOException {

     int code = textStart;

@@ -320,17 +329,17 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
       }
       p.byteStart = intUptos[intUptoStart];

-      consumer.newTerm(token, p);
+      consumer.newTerm(p);

     } else {
       intUptos = intPool.buffers[p.intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
       intUptoStart = p.intStart & DocumentsWriter.INT_BLOCK_MASK;
-      consumer.addTerm(token, p);
+      consumer.addTerm(p);
     }
   }

   // Primary entry point (for first TermsHash)
-  void add(Token token) throws IOException {
+  void add() throws IOException {

     assert !postingsCompacted;

@@ -338,8 +347,8 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
     // term text into textStart address

     // Get the text of this term.
-    final char[] tokenText = token.termBuffer();
-    final int tokenTextLen = token.termLength();
+    final char[] tokenText = termAtt.termBuffer();;
+    final int tokenTextLen = termAtt.termLength();

     // Compute hashcode & replace any invalid UTF16 sequences
     int downto = tokenTextLen;
@@ -403,7 +412,7 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
         if (docState.maxTermPrefix == null)
           docState.maxTermPrefix = new String(tokenText, 0, 30);

-        consumer.skippingLongTerm(token);
+        consumer.skippingLongTerm();
         return;
       }
       charPool.nextBuffer();
@@ -450,16 +459,16 @@ final class TermsHashPerField extends InvertedDocConsumerPerField {
       }
       p.byteStart = intUptos[intUptoStart];

-      consumer.newTerm(token, p);
+      consumer.newTerm(p);

     } else {
       intUptos = intPool.buffers[p.intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
       intUptoStart = p.intStart & DocumentsWriter.INT_BLOCK_MASK;
-      consumer.addTerm(token, p);
+      consumer.addTerm(p);
     }

     if (doNextCall)
-      nextPerField.add(token, p.textStart);
+      nextPerField.add(p.textStart);
   }

   int[] intUptos;
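TermsHashPerField now pulls the raw characters straight from the TermAttribute; reading termBuffer() and termLength() avoids creating a String per token. A sketch of that pattern in isolation (the stream parameter is a placeholder and is assumed to support the new API):

    import java.io.IOException;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    class TermTextSketch {
      // Counts non-whitespace term characters without allocating a String per token.
      static int countTermChars(TokenStream stream) throws IOException {
        TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
        int total = 0;
        while (stream.incrementToken()) {
          final char[] buffer = termAtt.termBuffer();  // shared buffer, valid until the next incrementToken()
          final int length = termAtt.termLength();     // number of valid chars in the buffer
          for (int i = 0; i < length; i++) {
            if (!Character.isWhitespace(buffer[i])) {
              total++;
            }
          }
        }
        return total;
      }
    }
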
@@ -3,8 +3,8 @@ package org.apache.lucene.queryParser;

 import java.io.IOException;
 import java.io.StringReader;
-import java.text.DateFormat;
 import java.text.Collator;
+import java.text.DateFormat;
 import java.util.ArrayList;
 import java.util.Calendar;
 import java.util.Date;
@@ -15,7 +15,10 @@ import java.util.Map;
 import java.util.Vector;

 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CachingTokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.document.DateField;
 import org.apache.lucene.document.DateTools;
 import org.apache.lucene.index.Term;
@@ -518,48 +521,126 @@ public class QueryParser implements QueryParserConstants {
     // PhraseQuery, or nothing based on the term count

     TokenStream source = analyzer.tokenStream(field, new StringReader(queryText));
-    List list = new ArrayList();
-    final org.apache.lucene.analysis.Token reusableToken = new org.apache.lucene.analysis.Token();
-    org.apache.lucene.analysis.Token nextToken;
+    CachingTokenFilter buffer = new CachingTokenFilter(source);
+    TermAttribute termAtt = null;
+    PositionIncrementAttribute posIncrAtt = null;
+    int numTokens = 0;
+
+    org.apache.lucene.analysis.Token reusableToken = null;
+    org.apache.lucene.analysis.Token nextToken = null;
+
+    boolean useNewAPI = TokenStream.useNewAPIDefault();
+
+    if (useNewAPI) {
+      boolean success = false;
+      try {
+        buffer.reset();
+        success = true;
+      } catch (IOException e) {
+        // success==false if we hit an exception
+      }
+      if (success) {
+        if (buffer.hasAttribute(TermAttribute.class)) {
+          termAtt = (TermAttribute) buffer.getAttribute(TermAttribute.class);
+        }
+        if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
+          posIncrAtt = (PositionIncrementAttribute) buffer.getAttribute(PositionIncrementAttribute.class);
+        }
+      }
+    } else {
+      reusableToken = new org.apache.lucene.analysis.Token();
+    }
+
     int positionCount = 0;
     boolean severalTokensAtSamePosition = false;

-    while (true) {
-      try {
-        nextToken = source.next(reusableToken);
-      }
-      catch (IOException e) {
-        nextToken = null;
-      }
-      if (nextToken == null)
-        break;
-      list.add(nextToken.clone());
-      if (nextToken.getPositionIncrement() != 0)
-        positionCount += nextToken.getPositionIncrement();
-      else
-        severalTokensAtSamePosition = true;
+    if (useNewAPI) {
+      if (termAtt != null) {
+        try {
+          while (buffer.incrementToken()) {
+            numTokens++;
+            int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1;
+            if (positionIncrement != 0) {
+              positionCount += positionIncrement;
+            } else {
+              severalTokensAtSamePosition = true;
+            }
+          }
+        } catch (IOException e) {
+          // ignore
+        }
+      }
+    } else {
+      while (true) {
+        try {
+          nextToken = buffer.next(reusableToken);
+        }
+        catch (IOException e) {
+          nextToken = null;
+        }
+        if (nextToken == null)
+          break;
+        numTokens++;
+        if (nextToken.getPositionIncrement() != 0)
+          positionCount += nextToken.getPositionIncrement();
+        else
+          severalTokensAtSamePosition = true;
+      }
     }
     try {
+      // rewind the buffer stream
+      buffer.reset();
+
+      // close original stream - all tokens buffered
       source.close();
     }
     catch (IOException e) {
       // ignore
     }

-    if (list.size() == 0)
+    if (numTokens == 0)
       return null;
-    else if (list.size() == 1) {
-      nextToken = (org.apache.lucene.analysis.Token) list.get(0);
-      return newTermQuery(new Term(field, nextToken.term()));
+    else if (numTokens == 1) {
+      String term = null;
+      try {
+        if (useNewAPI) {
+          boolean hasNext = buffer.incrementToken();
+          assert hasNext == true;
+          term = termAtt.term();
+        } else {
+          nextToken = buffer.next(reusableToken);
+          assert nextToken != null;
+          term = nextToken.term();
+        }
+      } catch (IOException e) {
+        // safe to ignore, because we know the number of tokens
+      }
+      return newTermQuery(new Term(field, term));
     } else {
       if (severalTokensAtSamePosition) {
         if (positionCount == 1) {
           // no phrase query:
           BooleanQuery q = newBooleanQuery(true);
-          for (int i = 0; i < list.size(); i++) {
-            nextToken = (org.apache.lucene.analysis.Token) list.get(i);
+          for (int i = 0; i < numTokens; i++) {
+            String term = null;
+            try {
+              if (useNewAPI) {
+                boolean hasNext = buffer.incrementToken();
+                assert hasNext == true;
+                term = termAtt.term();
+              } else {
+                nextToken = buffer.next(reusableToken);
+                assert nextToken != null;
+                term = nextToken.term();
+              }
+            } catch (IOException e) {
+              // safe to ignore, because we know the number of tokens
+            }
+
             Query currentQuery = newTermQuery(
-                new Term(field, nextToken.term()));
+                new Term(field, term));
             q.add(currentQuery, BooleanClause.Occur.SHOULD);
           }
           return q;
@@ -570,9 +651,28 @@ public class QueryParser implements QueryParserConstants {
           mpq.setSlop(phraseSlop);
           List multiTerms = new ArrayList();
           int position = -1;
-          for (int i = 0; i < list.size(); i++) {
-            nextToken = (org.apache.lucene.analysis.Token) list.get(i);
-            if (nextToken.getPositionIncrement() > 0 && multiTerms.size() > 0) {
+          for (int i = 0; i < numTokens; i++) {
+            String term = null;
+            int positionIncrement = 1;
+            try {
+              if (useNewAPI) {
+                boolean hasNext = buffer.incrementToken();
+                assert hasNext == true;
+                term = termAtt.term();
+                if (posIncrAtt != null) {
+                  positionIncrement = posIncrAtt.getPositionIncrement();
+                }
+              } else {
+                nextToken = buffer.next(reusableToken);
+                assert nextToken != null;
+                term = nextToken.term();
+                positionIncrement = nextToken.getPositionIncrement();
+              }
+            } catch (IOException e) {
+              // safe to ignore, because we know the number of tokens
+            }
+
+            if (positionIncrement > 0 && multiTerms.size() > 0) {
               if (enablePositionIncrements) {
                 mpq.add((Term[])multiTerms.toArray(new Term[0]),position);
               } else {
@@ -580,8 +680,8 @@ public class QueryParser implements QueryParserConstants {
               }
               multiTerms.clear();
             }
-            position += nextToken.getPositionIncrement();
-            multiTerms.add(new Term(field, nextToken.term()));
+            position += positionIncrement;
+            multiTerms.add(new Term(field, term));
           }
           if (enablePositionIncrements) {
             mpq.add((Term[])multiTerms.toArray(new Term[0]),position);
@@ -595,13 +695,36 @@ public class QueryParser implements QueryParserConstants {
           PhraseQuery pq = newPhraseQuery();
           pq.setSlop(phraseSlop);
           int position = -1;
-          for (int i = 0; i < list.size(); i++) {
-            nextToken = (org.apache.lucene.analysis.Token) list.get(i);
+
+          for (int i = 0; i < numTokens; i++) {
+            String term = null;
+            int positionIncrement = 1;
+
+            try {
+              if (useNewAPI) {
+                boolean hasNext = buffer.incrementToken();
+                assert hasNext == true;
+                term = termAtt.term();
+                if (posIncrAtt != null) {
+                  positionIncrement = posIncrAtt.getPositionIncrement();
+                }
+              } else {
+                nextToken = buffer.next(reusableToken);
+                assert nextToken != null;
+                term = nextToken.term();
+                positionIncrement = nextToken.getPositionIncrement();
+              }
+            } catch (IOException e) {
+              // safe to ignore, because we know the number of tokens
+            }
+
             if (enablePositionIncrements) {
-              position += nextToken.getPositionIncrement();
-              pq.add(new Term(field, nextToken.term()),position);
+              position += positionIncrement;
+              pq.add(new Term(field, term),position);
             } else {
-              pq.add(new Term(field, nextToken.term()));
+              pq.add(new Term(field, term));
             }
           }
           return pq;
@@ -610,6 +733,7 @@ public class QueryParser implements QueryParserConstants {
   }

+
   /**
    * Base implementation delegates to {@link #getFieldQuery(String,String)}.
    * This method may be overridden, for example, to return
@@ -1503,12 +1627,6 @@ public class QueryParser implements QueryParserConstants {
     finally { jj_save(0, xla); }
   }

-  private boolean jj_3R_3() {
-    if (jj_scan_token(STAR)) return true;
-    if (jj_scan_token(COLON)) return true;
-    return false;
-  }
-
   private boolean jj_3R_2() {
     if (jj_scan_token(TERM)) return true;
     if (jj_scan_token(COLON)) return true;
@@ -1525,6 +1643,12 @@ public class QueryParser implements QueryParserConstants {
     return false;
   }

+  private boolean jj_3R_3() {
+    if (jj_scan_token(STAR)) return true;
+    if (jj_scan_token(COLON)) return true;
+    return false;
+  }
+
   /** Generated Token Manager. */
   public QueryParserTokenManager token_source;
   /** Current token. */
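The parser change above relies on CachingTokenFilter so the analyzed tokens can be counted in a first pass and then replayed. A condensed stand-alone version of that two-pass pattern follows; the analyzer, field name and query text are placeholders, error handling is trimmed, and it assumes the analyzer's streams support the new API.

    import java.io.StringReader;
    import java.util.ArrayList;
    import java.util.List;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.CachingTokenFilter;
    import org.apache.lucene.analysis.WhitespaceAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.TermAttribute;

    public class TwoPassAnalysisSketch {
      public static void main(String[] args) throws Exception {
        Analyzer analyzer = new WhitespaceAnalyzer();   // placeholder analyzer
        CachingTokenFilter buffer = new CachingTokenFilter(
            analyzer.tokenStream("contents", new StringReader("new token stream api")));
        buffer.reset();

        TermAttribute termAtt = null;
        if (buffer.hasAttribute(TermAttribute.class)) {
          termAtt = (TermAttribute) buffer.getAttribute(TermAttribute.class);
        }

        // First pass: count the tokens; CachingTokenFilter records them as a side effect.
        int numTokens = 0;
        while (buffer.incrementToken()) {
          numTokens++;
        }

        // Second pass: rewind the cache and replay the same tokens.
        buffer.reset();
        List terms = new ArrayList();
        for (int i = 0; i < numTokens; i++) {
          buffer.incrementToken();
          terms.add(termAtt.term());
        }
        System.out.println(numTokens + " tokens: " + terms);
      }
    }
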
@@ -27,8 +27,8 @@ package org.apache.lucene.queryParser;

 import java.io.IOException;
 import java.io.StringReader;
-import java.text.DateFormat;
 import java.text.Collator;
+import java.text.DateFormat;
 import java.util.ArrayList;
 import java.util.Calendar;
 import java.util.Date;
@@ -39,7 +39,10 @@ import java.util.Map;
 import java.util.Vector;

 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CachingTokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.document.DateField;
 import org.apache.lucene.document.DateTools;
 import org.apache.lucene.index.Term;
@@ -542,48 +545,126 @@ public class QueryParser {
     // PhraseQuery, or nothing based on the term count

     TokenStream source = analyzer.tokenStream(field, new StringReader(queryText));
-    List list = new ArrayList();
-    final org.apache.lucene.analysis.Token reusableToken = new org.apache.lucene.analysis.Token();
-    org.apache.lucene.analysis.Token nextToken;
+    CachingTokenFilter buffer = new CachingTokenFilter(source);
+    TermAttribute termAtt = null;
+    PositionIncrementAttribute posIncrAtt = null;
+    int numTokens = 0;
+
+    org.apache.lucene.analysis.Token reusableToken = null;
+    org.apache.lucene.analysis.Token nextToken = null;
+
+    boolean useNewAPI = TokenStream.useNewAPI();
+
+    if (useNewAPI) {
+      boolean success = false;
+      try {
+        buffer.start();
+        success = true;
+      } catch (IOException e) {
+        // success==false if we hit an exception
+      }
+      if (success) {
+        if (buffer.hasAttribute(TermAttribute.class)) {
+          termAtt = (TermAttribute) buffer.getAttribute(TermAttribute.class);
+        }
+        if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
+          posIncrAtt = (PositionIncrementAttribute) buffer.getAttribute(PositionIncrementAttribute.class);
+        }
+      }
+    } else {
+      reusableToken = new org.apache.lucene.analysis.Token();
+    }
+
     int positionCount = 0;
     boolean severalTokensAtSamePosition = false;

-    while (true) {
-      try {
-        nextToken = source.next(reusableToken);
-      }
-      catch (IOException e) {
-        nextToken = null;
-      }
-      if (nextToken == null)
-        break;
-      list.add(nextToken.clone());
-      if (nextToken.getPositionIncrement() != 0)
-        positionCount += nextToken.getPositionIncrement();
-      else
-        severalTokensAtSamePosition = true;
+    if (useNewAPI) {
+      if (termAtt != null) {
+        try {
+          while (buffer.incrementToken()) {
+            numTokens++;
+            int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1;
+            if (positionIncrement != 0) {
+              positionCount += positionIncrement;
+            } else {
+              severalTokensAtSamePosition = true;
+            }
+          }
+        } catch (IOException e) {
+          // ignore
+        }
+      }
+    } else {
+      while (true) {
+        try {
+          nextToken = buffer.next(reusableToken);
+        }
+        catch (IOException e) {
+          nextToken = null;
+        }
+        if (nextToken == null)
+          break;
+        numTokens++;
+        if (nextToken.getPositionIncrement() != 0)
+          positionCount += nextToken.getPositionIncrement();
+        else
+          severalTokensAtSamePosition = true;
+      }
     }
     try {
+      // rewind the buffer stream
+      buffer.reset();
+
+      // close original stream - all tokens buffered
       source.close();
     }
     catch (IOException e) {
       // ignore
     }

-    if (list.size() == 0)
+    if (numTokens == 0)
       return null;
-    else if (list.size() == 1) {
-      nextToken = (org.apache.lucene.analysis.Token) list.get(0);
-      return newTermQuery(new Term(field, nextToken.term()));
+    else if (numTokens == 1) {
+      String term = null;
+      try {
+        if (useNewAPI) {
+          boolean hasNext = buffer.incrementToken();
+          assert hasNext == true;
+          term = termAtt.term();
+        } else {
+          nextToken = buffer.next(reusableToken);
+          assert nextToken != null;
+          term = nextToken.term();
+        }
+      } catch (IOException e) {
+        // safe to ignore, because we know the number of tokens
+      }
+      return newTermQuery(new Term(field, term));
     } else {
       if (severalTokensAtSamePosition) {
         if (positionCount == 1) {
           // no phrase query:
           BooleanQuery q = newBooleanQuery(true);
-          for (int i = 0; i < list.size(); i++) {
-            nextToken = (org.apache.lucene.analysis.Token) list.get(i);
+          for (int i = 0; i < numTokens; i++) {
+            String term = null;
+            try {
+              if (useNewAPI) {
+                boolean hasNext = buffer.incrementToken();
+                assert hasNext == true;
+                term = termAtt.term();
+              } else {
+                nextToken = buffer.next(reusableToken);
+                assert nextToken != null;
+                term = nextToken.term();
+              }
+            } catch (IOException e) {
+              // safe to ignore, because we know the number of tokens
+            }
+
             Query currentQuery = newTermQuery(
-                new Term(field, nextToken.term()));
+                new Term(field, term));
             q.add(currentQuery, BooleanClause.Occur.SHOULD);
           }
           return q;
@@ -594,9 +675,28 @@ public class QueryParser {
           mpq.setSlop(phraseSlop);
           List multiTerms = new ArrayList();
           int position = -1;
-          for (int i = 0; i < list.size(); i++) {
-            nextToken = (org.apache.lucene.analysis.Token) list.get(i);
-            if (nextToken.getPositionIncrement() > 0 && multiTerms.size() > 0) {
+          for (int i = 0; i < numTokens; i++) {
+            String term = null;
+            int positionIncrement = 1;
+            try {
+              if (useNewAPI) {
+                boolean hasNext = buffer.incrementToken();
+                assert hasNext == true;
+                term = termAtt.term();
+                if (posIncrAtt != null) {
+                  positionIncrement = posIncrAtt.getPositionIncrement();
+                }
+              } else {
+                nextToken = buffer.next(reusableToken);
+                assert nextToken != null;
+                term = nextToken.term();
+                positionIncrement = nextToken.getPositionIncrement();
+              }
+            } catch (IOException e) {
+              // safe to ignore, because we know the number of tokens
+            }
+
+            if (positionIncrement > 0 && multiTerms.size() > 0) {
               if (enablePositionIncrements) {
                 mpq.add((Term[])multiTerms.toArray(new Term[0]),position);
               } else {
@@ -604,8 +704,8 @@ public class QueryParser {
               }
               multiTerms.clear();
             }
-            position += nextToken.getPositionIncrement();
-            multiTerms.add(new Term(field, nextToken.term()));
+            position += positionIncrement;
+            multiTerms.add(new Term(field, term));
           }
           if (enablePositionIncrements) {
             mpq.add((Term[])multiTerms.toArray(new Term[0]),position);
@@ -619,13 +719,36 @@ public class QueryParser {
           PhraseQuery pq = newPhraseQuery();
           pq.setSlop(phraseSlop);
           int position = -1;
-          for (int i = 0; i < list.size(); i++) {
-            nextToken = (org.apache.lucene.analysis.Token) list.get(i);
+
+          for (int i = 0; i < numTokens; i++) {
+            String term = null;
+            int positionIncrement = 1;
+
+            try {
+              if (useNewAPI) {
+                boolean hasNext = buffer.incrementToken();
+                assert hasNext == true;
+                term = termAtt.term();
+                if (posIncrAtt != null) {
+                  positionIncrement = posIncrAtt.getPositionIncrement();
+                }
+              } else {
+                nextToken = buffer.next(reusableToken);
+                assert nextToken != null;
+                term = nextToken.term();
+                positionIncrement = nextToken.getPositionIncrement();
+              }
+            } catch (IOException e) {
+              // safe to ignore, because we know the number of tokens
+            }
+
             if (enablePositionIncrements) {
-              position += nextToken.getPositionIncrement();
-              pq.add(new Term(field, nextToken.term()),position);
+              position += positionIncrement;
+              pq.add(new Term(field, term),position);
             } else {
-              pq.add(new Term(field, nextToken.term()));
+              pq.add(new Term(field, term));
            }
           }
           return pq;
@@ -634,6 +757,7 @@ public class QueryParser {
   }

+
   /**
    * Base implementation delegates to {@link #getFieldQuery(String,String)}.
    * This method may be overridden, for example, to return
@@ -2,8 +2,8 @@
 package org.apache.lucene.queryParser;
 import java.io.IOException;
 import java.io.StringReader;
-import java.text.DateFormat;
 import java.text.Collator;
+import java.text.DateFormat;
 import java.util.ArrayList;
 import java.util.Calendar;
 import java.util.Date;
@@ -13,7 +13,10 @@ import java.util.Locale;
 import java.util.Map;
 import java.util.Vector;
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.CachingTokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.document.DateField;
 import org.apache.lucene.document.DateTools;
 import org.apache.lucene.index.Term;
@@ -29,6 +29,7 @@ import java.util.Map;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.index.TermFreqVector;

 /**
@@ -58,9 +59,17 @@ public class QueryTermVector implements TermFreqVector {
     {
       List terms = new ArrayList();
       try {
-        final Token reusableToken = new Token();
-        for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
-          terms.add(nextToken.term());
+        if (stream.useNewAPI()) {
+          stream.reset();
+          TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class);
+          while (stream.incrementToken()) {
+            terms.add(termAtt.term());
+          }
+        } else {
+          final Token reusableToken = new Token();
+          for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
+            terms.add(nextToken.term());
+          }
         }
         processTerms((String[])terms.toArray(new String[terms.size()]));
       } catch (IOException e) {
@@ -0,0 +1,95 @@
package org.apache.lucene.util;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Serializable;

/**
 * Base class for Attributes that can be added to a
 * {@link org.apache.lucene.util.AttributeSource}.
 * <p>
 * Attributes are used to add data in a dynamic, yet type-safe way to a source
 * of usually streamed objects, e. g. a {@link org.apache.lucene.analysis.TokenStream}.
 * <p><font color="#FF0000">
 * WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental.
 * The APIs introduced in these classes with Lucene 2.9 might change in the future.
 * We will make our best efforts to keep the APIs backwards-compatible.</font>
 */
public abstract class Attribute implements Cloneable, Serializable {
  /**
   * Clears the values in this Attribute and resets it to its
   * default value.
   */
  public abstract void clear();

  /**
   * Subclasses must implement this method and should follow a syntax
   * similar to this one:
   *
   * <pre>
   *   public String toString() {
   *     return "start=" + startOffset + ",end=" + endOffset;
   *   }
   * </pre>
   */
  public abstract String toString();

  /**
   * Subclasses must implement this method and should compute
   * a hashCode similar to this:
   * <pre>
   *   public int hashCode() {
   *     int code = startOffset;
   *     code = code * 31 + endOffset;
   *     return code;
   *   }
   * </pre>
   *
   * see also {@link #equals(Object)}
   */
  public abstract int hashCode();

  /**
   * All values used for computation of {@link #hashCode()}
   * should be checked here for equality.
   *
   * see also {@link Object#equals(Object)}
   */
  public abstract boolean equals(Object other);

  /**
   * Copies the values from this Attribute into the passed-in
   * target attribute. The type of the target must match the type
   * of this attribute.
   */
  public abstract void copyTo(Attribute target);

  /**
   * Shallow clone. Subclasses must override this if they
   * need to clone any members deeply.
   */
  public Object clone() {
    Object clone = null;
    try {
      clone = super.clone();
    } catch (CloneNotSupportedException e) {
      throw new RuntimeException(e);  // shouldn't happen
    }
    return clone;
  }
}
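To make the contract above concrete, here is a minimal sketch of what a user-defined Attribute could look like. The class PartOfSpeechAttribute and its field are hypothetical and not part of this patch; only the abstract methods it implements come from the Attribute base class shown above.

// Hypothetical custom Attribute; illustrates the clear/toString/hashCode/equals/copyTo contract.
public class PartOfSpeechAttribute extends Attribute {
  private String pos = "unknown";

  public String getPartOfSpeech() { return pos; }
  public void setPartOfSpeech(String pos) { this.pos = pos; }

  // reset to the default value, as required by the base class
  public void clear() { pos = "unknown"; }

  public String toString() { return "pos=" + pos; }

  public int hashCode() { return pos.hashCode(); }

  public boolean equals(Object other) {
    return other instanceof PartOfSpeechAttribute
        && ((PartOfSpeechAttribute) other).pos.equals(pos);
  }

  // copy this attribute's value into another instance of the same type
  public void copyTo(Attribute target) {
    ((PartOfSpeechAttribute) target).setPartOfSpeech(pos);
  }
}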
@@ -0,0 +1,274 @@
package org.apache.lucene.util;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;

import org.apache.lucene.analysis.TokenStream;

/**
 * An AttributeSource contains a list of different {@link Attribute}s,
 * and methods to add and get them. There can only be a single instance
 * of an attribute in the same AttributeSource instance. This is ensured
 * by passing in the actual type of the Attribute (Class&lt;Attribute&gt;) to
 * the {@link #addAttribute(Class)}, which then checks if an instance of
 * that type is already present. If yes, it returns the instance, otherwise
 * it creates a new instance and returns it.
 *
 * <p><font color="#FF0000">
 * WARNING: The status of the new TokenStream, AttributeSource and Attributes is experimental.
 * The APIs introduced in these classes with Lucene 2.9 might change in the future.
 * We will make our best efforts to keep the APIs backwards-compatible.</font>
 */
public class AttributeSource {
  /**
   * An AttributeAcceptor defines only a single method {@link #accept(Class)}.
   * It can be used for e. g. buffering purposes to specify which attributes
   * to buffer.
   */
  public static abstract class AttributeAcceptor {
    /** Return true, to accept this attribute; false otherwise */
    public abstract boolean accept(Class attClass);
  }

  /**
   * Default AttributeAcceptor that accepts all attributes.
   */
  public static final AttributeAcceptor AllAcceptor = new AttributeAcceptor() {
    public boolean accept(Class attClass) {return true;}
  };

  /**
   * Holds the Class&lt;Attribute&gt; -&gt; Attribute mapping
   */
  protected Map attributes;

  public AttributeSource() {
    this.attributes = new LinkedHashMap();
  }

  public AttributeSource(AttributeSource input) {
    this.attributes = input.attributes;
  }

  /** Returns an iterator that iterates the attributes
   * in the same order they were added in.
   */
  public Iterator getAttributesIterator() {
    return attributes.values().iterator();
  }

  /**
   * The caller must pass in a Class&lt;? extends Attribute&gt; value.
   * This method first checks if an instance of that class is
   * already in this AttributeSource and returns it. Otherwise a
   * new instance is created, added to this AttributeSource and returned.
   */
  public Attribute addAttribute(Class attClass) {
    Attribute att = (Attribute) attributes.get(attClass);
    if (att == null) {
      try {
        att = (Attribute) attClass.newInstance();
      } catch (InstantiationException e) {
        throw new IllegalArgumentException("Could not instantiate class " + attClass);
      } catch (IllegalAccessException e) {
        throw new IllegalArgumentException("Could not instantiate class " + attClass);
      }

      attributes.put(attClass, att);
    }
    return att;
  }

  /** Returns true, iff this AttributeSource has any attributes */
  public boolean hasAttributes() {
    return !this.attributes.isEmpty();
  }

  /**
   * The caller must pass in a Class&lt;? extends Attribute&gt; value.
   * Returns true, iff this AttributeSource contains the passed-in Attribute.
   */
  public boolean hasAttribute(Class attClass) {
    return this.attributes.containsKey(attClass);
  }

  /**
   * The caller must pass in a Class&lt;? extends Attribute&gt; value.
   * Returns the instance of the passed in Attribute contained in this AttributeSource
   *
   * @throws IllegalArgumentException if this AttributeSource does not contain the
   *         Attribute
   */
  public Attribute getAttribute(Class attClass) {
    Attribute att = (Attribute) this.attributes.get(attClass);
    if (att == null) {
      throw new IllegalArgumentException("This token does not have the attribute '" + attClass + "'.");
    }

    return att;
  }

  /**
   * Resets all Attributes in this AttributeSource by calling
   * {@link Attribute#clear()} on each Attribute.
   */
  public void clearAttributes() {
    Iterator it = getAttributesIterator();
    while (it.hasNext()) {
      ((Attribute) it.next()).clear();
    }
  }

  /**
   * Captures the current state of the passed in TokenStream.
   * <p>
   * This state will contain all of the passed in TokenStream's
   * {@link Attribute}s. If only a subset of the attributes is needed
   * please use {@link #captureState(AttributeAcceptor)}
   */
  public AttributeSource captureState() {
    return captureState(AllAcceptor);
  }

  /**
   * Captures the current state of the passed in TokenStream.
   * <p>
   * This state will contain all of the passed in TokenStream's
   * {@link Attribute}s which the {@link AttributeAcceptor} accepts.
   */
  public AttributeSource captureState(AttributeAcceptor acceptor) {
    AttributeSource state = new AttributeSource();

    Iterator it = getAttributesIterator();
    while(it.hasNext()) {
      Attribute att = (Attribute) it.next();
      if (acceptor.accept(att.getClass())) {
        Attribute clone = (Attribute) att.clone();
        state.attributes.put(att.getClass(), clone);
      }
    }

    return state;
  }

  /**
   * Restores this state by copying the values of all attributes
   * that this state contains into the attributes of the targetStream.
   * The targetStream must contain a corresponding instance for each argument
   * contained in this state.
   * <p>
   * Note that this method does not affect attributes of the targetStream
   * that are not contained in this state. In other words, if for example
   * the targetStream contains an OffsetAttribute, but this state doesn't, then
   * the value of the OffsetAttribute remains unchanged. It might be desirable to
   * reset its value to the default, in which case the caller should first
   * call {@link TokenStream#clearAttributes()} on the targetStream.
   */
  public void restoreState(AttributeSource target) {
    Iterator it = getAttributesIterator();
    while (it.hasNext()) {
      Attribute att = (Attribute) it.next();
      Attribute targetAtt = target.getAttribute(att.getClass());
      att.copyTo(targetAtt);
    }
  }

  public int hashCode() {
    int code = 0;
    if (hasAttributes()) {
      Iterator it = getAttributesIterator();
      while (it.hasNext()) {
        code = code * 31 + it.next().hashCode();
      }
    }

    return code;
  }

  public boolean equals(Object obj) {
    if (obj == this) {
      return true;
    }

    if (obj instanceof AttributeSource) {
      AttributeSource other = (AttributeSource) obj;

      if (hasAttributes()) {
        if (!other.hasAttributes()) {
          return false;
        }

        if (attributes.size() != other.attributes.size()) {
          return false;
        }

        Iterator it = getAttributesIterator();
        while (it.hasNext()) {
          Class attName = it.next().getClass();

          Attribute otherAtt = (Attribute) other.attributes.get(attName);
          if (otherAtt == null || !otherAtt.equals(attributes.get(attName))) {
            return false;
          }
        }
        return true;
      } else {
        return !other.hasAttributes();
      }
    } else
      return false;
  }

  // TODO: Java 1.5
  //  private Map<Class<? extends Attribute>, Attribute> attributes;
  //  public <T extends Attribute> T addAttribute(Class<T> attClass) {
  //    T att = (T) attributes.get(attClass);
  //    if (att == null) {
  //      try {
  //        att = attClass.newInstance();
  //      } catch (InstantiationException e) {
  //        throw new IllegalArgumentException("Could not instantiate class " + attClass);
  //      } catch (IllegalAccessException e) {
  //        throw new IllegalArgumentException("Could not instantiate class " + attClass);
  //      }
  //
  //      attributes.put(attClass, att);
  //    }
  //    return att;
  //  }
  //
  //  public boolean hasAttribute(Class<? extends Attribute> attClass) {
  //    return this.attributes.containsKey(attClass);
  //  }
  //
  //  public <T extends Attribute> T getAttribute(Class<T> attClass) {
  //    Attribute att = this.attributes.get(attClass);
  //    if (att == null) {
  //      throw new IllegalArgumentException("This token does not have the attribute '" + attClass + "'.");
  //    }
  //
  //    return (T) att;
  //  }
  //
}
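Putting the pieces together, a producer or buffering filter would typically interact with the AttributeSource API above roughly as follows. Every call used here (addAttribute, setTermBuffer, setStartOffset, setEndOffset, captureState, restoreState) appears elsewhere in this patch; the variable names and values are illustrative only, and imports from org.apache.lucene.analysis.tokenattributes are omitted.

// Illustrative use of the AttributeSource API introduced above.
AttributeSource source = new AttributeSource();

// addAttribute() creates the instance on first use and returns the same one afterwards
TermAttribute termAtt = (TermAttribute) source.addAttribute(TermAttribute.class);
OffsetAttribute offsetAtt = (OffsetAttribute) source.addAttribute(OffsetAttribute.class);

termAtt.setTermBuffer("example");
offsetAtt.setStartOffset(0);
offsetAtt.setEndOffset(7);

// snapshot the current attribute values, e.g. for buffering in a sink
AttributeSource state = source.captureState();

// later: copy the buffered values back into the live attributes
state.restoreState(source);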
@@ -17,19 +17,20 @@ package org.apache.lucene;
  * limitations under the License.
  */
 
-import org.apache.lucene.analysis.SimpleAnalyzer;
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Token;
-
-import java.io.Reader;
-import java.io.StringReader;
+import java.io.BufferedReader;
 import java.io.File;
 import java.io.FileInputStream;
-import java.io.BufferedReader;
 import java.io.InputStreamReader;
+import java.io.Reader;
+import java.io.StringReader;
 import java.util.Date;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.SimpleAnalyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
 class AnalysisTest {
   static File tmpFile;
   public static void main(String[] args) {
@@ -70,12 +71,15 @@ class AnalysisTest {
     Date start = new Date();
 
     int count = 0;
-    final Token reusableToken = new Token();
-    for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
+    stream.reset();
+    TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class);
+    OffsetAttribute offsetAtt = (OffsetAttribute) stream.getAttribute(OffsetAttribute.class);
+    while (stream.incrementToken()) {
       if (verbose) {
-        System.out.println("Text=" + nextToken.term()
-                           + " start=" + nextToken.startOffset()
-                           + " end=" + nextToken.endOffset());
+        System.out.println("Text=" + termAtt.term()
+                           + " start=" + offsetAtt.startOffset()
+                           + " end=" + offsetAtt.endOffset());
       }
       count++;
     }
@@ -18,6 +18,9 @@ package org.apache.lucene.analysis;
 
 import org.apache.lucene.analysis.standard.StandardFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.English;
 import org.apache.lucene.util.LuceneTestCase;
 
@@ -40,7 +43,8 @@ public class TeeSinkTokenTest extends LuceneTestCase {
     super(s);
   }
 
-  protected void setUp() {
+  protected void setUp() throws Exception {
+    super.setUp();
     tokens1 = new String[]{"The", "quick", "Burgundy", "Fox", "jumped", "over", "the", "lazy", "Red", "Dogs"};
     tokens2 = new String[]{"The", "Lazy", "Dogs", "should", "stay", "on", "the", "porch"};
     buffer1 = new StringBuffer();
@@ -62,24 +66,29 @@ public class TeeSinkTokenTest extends LuceneTestCase {
   public void test() throws IOException {
 
     SinkTokenizer sink1 = new SinkTokenizer(null) {
-      public void add(Token t) {
-        if (t != null && t.term().equalsIgnoreCase("The")) {
-          super.add(t);
+      public void add(AttributeSource a) throws IOException {
+        TermAttribute termAtt = null;
+        if (a.hasAttribute(TermAttribute.class)) {
+          termAtt = (TermAttribute) a.getAttribute(TermAttribute.class);
+        }
+        if (termAtt != null && termAtt.term().equalsIgnoreCase("The")) {
+          super.add(a);
         }
       }
     };
     TokenStream source = new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(buffer1.toString())), sink1);
     int i = 0;
-    final Token reusableToken = new Token();
-    for (Token nextToken = source.next(reusableToken); nextToken != null; nextToken = source.next(reusableToken)) {
-      assertTrue(nextToken.term() + " is not equal to " + tokens1[i], nextToken.term().equals(tokens1[i]) == true);
+    TermAttribute termAtt = (TermAttribute) source.getAttribute(TermAttribute.class);
+    while (source.incrementToken()) {
+      assertTrue(termAtt.term() + " is not equal to " + tokens1[i], termAtt.term().equals(tokens1[i]) == true);
       i++;
     }
     assertTrue(i + " does not equal: " + tokens1.length, i == tokens1.length);
     assertTrue("sink1 Size: " + sink1.getTokens().size() + " is not: " + 2, sink1.getTokens().size() == 2);
     i = 0;
-    for (Token token = sink1.next(reusableToken); token != null; token = sink1.next(reusableToken)) {
-      assertTrue(token.term() + " is not equal to " + "The", token.term().equalsIgnoreCase("The") == true);
+    termAtt = (TermAttribute) sink1.getAttribute(TermAttribute.class);
+    while (sink1.incrementToken()) {
+      assertTrue(termAtt.term() + " is not equal to " + "The", termAtt.term().equalsIgnoreCase("The") == true);
       i++;
     }
     assertTrue(i + " does not equal: " + sink1.getTokens().size(), i == sink1.getTokens().size());
@@ -87,55 +96,67 @@ public class TeeSinkTokenTest extends LuceneTestCase {
 
   public void testMultipleSources() throws Exception {
     SinkTokenizer theDetector = new SinkTokenizer(null) {
-      public void add(Token t) {
-        if (t != null && t.term().equalsIgnoreCase("The")) {
-          super.add(t);
+      public void add(AttributeSource a) throws IOException {
+        TermAttribute termAtt = null;
+        if (a.hasAttribute(TermAttribute.class)) {
+          termAtt = (TermAttribute) a.getAttribute(TermAttribute.class);
+        }
+        if (termAtt != null && termAtt.term().equalsIgnoreCase("The")) {
+          super.add(a);
        }
      }
    };
    SinkTokenizer dogDetector = new SinkTokenizer(null) {
-      public void add(Token t) {
-        if (t != null && t.term().equalsIgnoreCase("Dogs")) {
-          super.add(t);
+      public void add(AttributeSource a) throws IOException {
+        TermAttribute termAtt = null;
+        if (a.hasAttribute(TermAttribute.class)) {
+          termAtt = (TermAttribute) a.getAttribute(TermAttribute.class);
+        }
+        if (termAtt != null && termAtt.term().equalsIgnoreCase("Dogs")) {
+          super.add(a);
        }
      }
    };
    TokenStream source1 = new CachingTokenFilter(new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(buffer1.toString())), theDetector), dogDetector));
    TokenStream source2 = new TeeTokenFilter(new TeeTokenFilter(new WhitespaceTokenizer(new StringReader(buffer2.toString())), theDetector), dogDetector);
    int i = 0;
-    final Token reusableToken = new Token();
-    for (Token nextToken = source1.next(reusableToken); nextToken != null; nextToken = source1.next(reusableToken)) {
-      assertTrue(nextToken.term() + " is not equal to " + tokens1[i], nextToken.term().equals(tokens1[i]) == true);
+    TermAttribute termAtt = (TermAttribute) source1.getAttribute(TermAttribute.class);
+    while (source1.incrementToken()) {
+      assertTrue(termAtt.term() + " is not equal to " + tokens1[i], termAtt.term().equals(tokens1[i]) == true);
      i++;
    }
    assertTrue(i + " does not equal: " + tokens1.length, i == tokens1.length);
    assertTrue("theDetector Size: " + theDetector.getTokens().size() + " is not: " + 2, theDetector.getTokens().size() == 2);
    assertTrue("dogDetector Size: " + dogDetector.getTokens().size() + " is not: " + 1, dogDetector.getTokens().size() == 1);
    i = 0;
-    for (Token nextToken = source2.next(reusableToken); nextToken != null; nextToken = source2.next(reusableToken)) {
-      assertTrue(nextToken.term() + " is not equal to " + tokens2[i], nextToken.term().equals(tokens2[i]) == true);
+    termAtt = (TermAttribute) source2.getAttribute(TermAttribute.class);
+    while (source2.incrementToken()) {
+      assertTrue(termAtt.term() + " is not equal to " + tokens2[i], termAtt.term().equals(tokens2[i]) == true);
      i++;
    }
    assertTrue(i + " does not equal: " + tokens2.length, i == tokens2.length);
    assertTrue("theDetector Size: " + theDetector.getTokens().size() + " is not: " + 4, theDetector.getTokens().size() == 4);
    assertTrue("dogDetector Size: " + dogDetector.getTokens().size() + " is not: " + 2, dogDetector.getTokens().size() == 2);
    i = 0;
-    for (Token nextToken = theDetector.next(reusableToken); nextToken != null; nextToken = theDetector.next(reusableToken)) {
-      assertTrue(nextToken.term() + " is not equal to " + "The", nextToken.term().equalsIgnoreCase("The") == true);
+    termAtt = (TermAttribute) theDetector.getAttribute(TermAttribute.class);
+    while (theDetector.incrementToken()) {
+      assertTrue(termAtt.term() + " is not equal to " + "The", termAtt.term().equalsIgnoreCase("The") == true);
      i++;
    }
    assertTrue(i + " does not equal: " + theDetector.getTokens().size(), i == theDetector.getTokens().size());
    i = 0;
-    for (Token nextToken = dogDetector.next(reusableToken); nextToken != null; nextToken = dogDetector.next(reusableToken)) {
-      assertTrue(nextToken.term() + " is not equal to " + "Dogs", nextToken.term().equalsIgnoreCase("Dogs") == true);
+    termAtt = (TermAttribute) dogDetector.getAttribute(TermAttribute.class);
+    while (dogDetector.incrementToken()) {
+      assertTrue(termAtt.term() + " is not equal to " + "Dogs", termAtt.term().equalsIgnoreCase("Dogs") == true);
      i++;
    }
    assertTrue(i + " does not equal: " + dogDetector.getTokens().size(), i == dogDetector.getTokens().size());
    source1.reset();
    TokenStream lowerCasing = new LowerCaseFilter(source1);
    i = 0;
-    for (Token nextToken = lowerCasing.next(reusableToken); nextToken != null; nextToken = lowerCasing.next(reusableToken)) {
-      assertTrue(nextToken.term() + " is not equal to " + tokens1[i].toLowerCase(), nextToken.term().equals(tokens1[i].toLowerCase()) == true);
+    termAtt = (TermAttribute) lowerCasing.getAttribute(TermAttribute.class);
+    while (lowerCasing.incrementToken()) {
+      assertTrue(termAtt.term() + " is not equal to " + tokens1[i].toLowerCase(), termAtt.term().equals(tokens1[i].toLowerCase()) == true);
      i++;
    }
    assertTrue(i + " does not equal: " + tokens1.length, i == tokens1.length);
@@ -157,21 +178,20 @@ public class TeeSinkTokenTest extends LuceneTestCase {
     }
     //make sure we produce the same tokens
     ModuloSinkTokenizer sink = new ModuloSinkTokenizer(tokCount[k], 100);
-    final Token reusableToken = new Token();
     TokenStream stream = new TeeTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))), sink);
-    while (stream.next(reusableToken) != null) {
+    while (stream.incrementToken()) {
     }
     stream = new ModuloTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))), 100);
     List tmp = new ArrayList();
-    for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
-      tmp.add(nextToken.clone());
+    while (stream.incrementToken()) {
+      tmp.add(stream.captureState());
     }
     List sinkList = sink.getTokens();
     assertTrue("tmp Size: " + tmp.size() + " is not: " + sinkList.size(), tmp.size() == sinkList.size());
     for (int i = 0; i < tmp.size(); i++) {
-      Token tfTok = (Token) tmp.get(i);
-      Token sinkTok = (Token) sinkList.get(i);
-      assertTrue(tfTok.term() + " is not equal to " + sinkTok.term() + " at token: " + i, tfTok.term().equals(sinkTok.term()) == true);
+      AttributeSource tfTok = (AttributeSource) tmp.get(i);
+      AttributeSource sinkTok = (AttributeSource) sinkList.get(i);
+      assertTrue(tfTok + " is not equal to " + sinkTok + " at token: " + i, tfTok.equals(sinkTok) == true);
     }
     //simulate two fields, each being analyzed once, for 20 documents
 
@@ -180,12 +200,14 @@ public class TeeSinkTokenTest extends LuceneTestCase {
     long start = System.currentTimeMillis();
     for (int i = 0; i < 20; i++) {
       stream = new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString())));
-      for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
-        tfPos += nextToken.getPositionIncrement();
+      PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) stream.getAttribute(PositionIncrementAttribute.class);
+      while (stream.incrementToken()) {
+        tfPos += posIncrAtt.getPositionIncrement();
       }
       stream = new ModuloTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))), modCounts[j]);
-      for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
-        tfPos += nextToken.getPositionIncrement();
+      posIncrAtt = (PositionIncrementAttribute) stream.getAttribute(PositionIncrementAttribute.class);
+      while (stream.incrementToken()) {
+        tfPos += posIncrAtt.getPositionIncrement();
       }
     }
     long finish = System.currentTimeMillis();
@@ -196,13 +218,15 @@ public class TeeSinkTokenTest extends LuceneTestCase {
     for (int i = 0; i < 20; i++) {
       sink = new ModuloSinkTokenizer(tokCount[k], modCounts[j]);
       stream = new TeeTokenFilter(new StandardFilter(new StandardTokenizer(new StringReader(buffer.toString()))), sink);
-      for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
-        sinkPos += nextToken.getPositionIncrement();
+      PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) stream.getAttribute(PositionIncrementAttribute.class);
+      while (stream.incrementToken()) {
+        sinkPos += posIncrAtt.getPositionIncrement();
      }
      //System.out.println("Modulo--------");
      stream = sink;
-      for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
-        sinkPos += nextToken.getPositionIncrement();
+      posIncrAtt = (PositionIncrementAttribute) stream.getAttribute(PositionIncrementAttribute.class);
+      while (stream.incrementToken()) {
+        sinkPos += posIncrAtt.getPositionIncrement();
      }
    }
    finish = System.currentTimeMillis();
@@ -228,15 +252,15 @@ public class TeeSinkTokenTest extends LuceneTestCase {
     int count = 0;
 
     //return every 100 tokens
-    public Token next(final Token reusableToken) throws IOException {
-      Token nextToken = null;
-      for (nextToken = input.next(reusableToken);
-           nextToken != null && count % modCount != 0;
-           nextToken = input.next(reusableToken)) {
+    public boolean incrementToken() throws IOException {
+      boolean hasNext;
+      for (hasNext = input.incrementToken();
+           hasNext && count % modCount != 0;
+           hasNext = input.incrementToken()) {
        count++;
      }
      count++;
-      return nextToken;
+      return hasNext;
    }
  }
 
@@ -250,9 +274,9 @@ public class TeeSinkTokenTest extends LuceneTestCase {
       lst = new ArrayList(numToks % mc);
     }
 
-    public void add(Token t) {
-      if (t != null && count % modCount == 0) {
-        super.add(t);
+    public void add(AttributeSource a) throws IOException {
+      if (a != null && count % modCount == 0) {
+        super.add(a);
      }
      count++;
    }
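The TeeSinkTokenTest changes above exercise the tee/sink pattern against the new API. Distilled, the moving parts look roughly like this; the upper-case filtering condition and the sample text are only examples, but every call used is taken from the test code above.

// Sketch: route selected tokens into a sink while the main stream is consumed.
SinkTokenizer sink = new SinkTokenizer(null) {
  public void add(AttributeSource a) throws IOException {
    // keep only tokens whose term starts with an upper-case letter (example condition)
    if (a.hasAttribute(TermAttribute.class)) {
      TermAttribute termAtt = (TermAttribute) a.getAttribute(TermAttribute.class);
      if (Character.isUpperCase(termAtt.term().charAt(0))) {
        super.add(a);
      }
    }
  }
};

TokenStream source = new TeeTokenFilter(new WhitespaceTokenizer(new StringReader("The quick Fox")), sink);
TermAttribute termAtt = (TermAttribute) source.getAttribute(TermAttribute.class);
while (source.incrementToken()) {
  // consuming the source also feeds the sink through the tee
}

// afterwards the sink can be consumed like any other TokenStream
TermAttribute sinkTermAtt = (TermAttribute) sink.getAttribute(TermAttribute.class);
while (sink.incrementToken()) {
  System.out.println(sinkTermAtt.term());
}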
@@ -19,10 +19,10 @@ package org.apache.lucene.analysis;
 
 import java.io.IOException;
 import java.io.StringReader;
-import java.util.LinkedList;
-import java.util.List;
 
 import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.index.Payload;
 import org.apache.lucene.util.LuceneTestCase;
 
@@ -36,13 +36,12 @@ public class TestAnalyzers extends LuceneTestCase {
                      String input,
                      String[] output) throws Exception {
     TokenStream ts = a.tokenStream("dummy", new StringReader(input));
-    final Token reusableToken = new Token();
+    TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
     for (int i=0; i<output.length; i++) {
-      Token nextToken = ts.next(reusableToken);
-      assertNotNull(nextToken);
-      assertEquals(nextToken.term(), output[i]);
+      assertTrue(ts.incrementToken());
+      assertEquals(termAtt.term(), output[i]);
     }
-    assertNull(ts.next(reusableToken));
+    assertFalse(ts.incrementToken());
     ts.close();
   }
 
@@ -95,14 +94,13 @@ public class TestAnalyzers extends LuceneTestCase {
   }
 
   void verifyPayload(TokenStream ts) throws IOException {
-    final Token reusableToken = new Token();
+    PayloadAttribute payloadAtt = (PayloadAttribute) ts.getAttribute(PayloadAttribute.class);
     for(byte b=1;;b++) {
-      reusableToken.clear();
-      Token nextToken = ts.next(reusableToken);
-      if (nextToken==null) break;
+      boolean hasNext = ts.incrementToken();
+      if (!hasNext) break;
       // System.out.println("id="+System.identityHashCode(nextToken) + " " + t);
       // System.out.println("payload=" + (int)nextToken.getPayload().toByteArray()[0]);
-      assertEquals(b, nextToken.getPayload().toByteArray()[0]);
+      assertEquals(b, payloadAtt.getPayload().toByteArray()[0]);
     }
   }
 
@@ -111,13 +109,11 @@ public class TestAnalyzers extends LuceneTestCase {
     String s = "how now brown cow";
     TokenStream ts;
     ts = new WhitespaceTokenizer(new StringReader(s));
-    ts = new BuffTokenFilter(ts);
     ts = new PayloadSetter(ts);
     verifyPayload(ts);
 
     ts = new WhitespaceTokenizer(new StringReader(s));
     ts = new PayloadSetter(ts);
-    ts = new BuffTokenFilter(ts);
     verifyPayload(ts);
   }
 
@@ -136,38 +132,21 @@ public class TestAnalyzers extends LuceneTestCase {
   }
 }
 
-class BuffTokenFilter extends TokenFilter {
-  List lst;
-
-  public BuffTokenFilter(TokenStream input) {
-    super(input);
-  }
-
-  public Token next(final Token reusableToken) throws IOException {
-    if (lst == null) {
-      lst = new LinkedList();
-      for(Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken)) {
-        lst.add(nextToken.clone());
-      }
-    }
-    return lst.size()==0 ? null : (Token)lst.remove(0);
-  }
-}
-
 class PayloadSetter extends TokenFilter {
+  PayloadAttribute payloadAtt;
   public PayloadSetter(TokenStream input) {
     super(input);
+    payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
   }
 
   byte[] data = new byte[1];
   Payload p = new Payload(data,0,1);
 
-  public Token next(final Token reusableToken) throws IOException {
-    assert reusableToken != null;
-    Token nextToken = input.next(reusableToken);
-    if (nextToken==null) return null;
-    nextToken.setPayload(p); // reuse the payload / byte[]
+  public boolean incrementToken() throws IOException {
+    boolean hasNext = input.incrementToken();
+    if (!hasNext) return false;
+    payloadAtt.setPayload(p); // reuse the payload / byte[]
     data[0]++;
-    return nextToken;
+    return true;
   }
 }
@@ -22,6 +22,8 @@ import java.io.IOException;
 
 import org.apache.lucene.util.LuceneTestCase;
 
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.Field.TermVector;
@@ -41,13 +43,17 @@ public class TestCachingTokenFilter extends LuceneTestCase {
     Document doc = new Document();
     TokenStream stream = new TokenStream() {
       private int index = 0;
+      private TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+      private OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
 
-      public Token next(final Token reusableToken) throws IOException {
-        assert reusableToken != null;
+      public boolean incrementToken() throws IOException {
         if (index == tokens.length) {
-          return null;
+          return false;
         } else {
-          return reusableToken.reinit(tokens[index++], 0, 0);
+          termAtt.setTermBuffer(tokens[index++]);
+          offsetAtt.setStartOffset(0);
+          offsetAtt.setEndOffset(0);
+          return true;
         }
       }
 
@@ -92,10 +98,12 @@ public class TestCachingTokenFilter extends LuceneTestCase {
 
   private void checkTokens(TokenStream stream) throws IOException {
     int count = 0;
-    final Token reusableToken = new Token();
-    for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
+
+    TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class);
+    assertNotNull(termAtt);
+    while (stream.incrementToken()) {
       assertTrue(count < tokens.length);
-      assertEquals(tokens[count], nextToken.term());
+      assertEquals(tokens[count], termAtt.term());
       count++;
     }
 
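The anonymous TokenStream in the test above is the producer side of the new API in miniature: register attributes once, then fill them on each incrementToken() call. A standalone version might look like the sketch below; the class and field names are illustrative, but the calls mirror the test code directly.

// Sketch: a TokenStream that emits a fixed array of terms through attributes.
public class StringArrayTokenStream extends TokenStream {
  private final String[] tokens;
  private int index = 0;
  private TermAttribute termAtt;
  private OffsetAttribute offsetAtt;

  public StringArrayTokenStream(String[] tokens) {
    this.tokens = tokens;
    // register the attributes this stream fills on every call
    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
    offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
  }

  public boolean incrementToken() throws IOException {
    if (index == tokens.length) {
      return false;          // stream exhausted
    }
    termAtt.setTermBuffer(tokens[index++]);
    offsetAtt.setStartOffset(0);
    offsetAtt.setEndOffset(0);
    return true;
  }
}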
@@ -17,6 +17,7 @@ package org.apache.lucene.analysis;
  * limitations under the License.
  */
 
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.util.LuceneTestCase;
 
 import java.io.StringReader;
@@ -25,82 +26,87 @@ public class TestISOLatin1AccentFilter extends LuceneTestCase {
   public void testU() throws Exception {
     TokenStream stream = new WhitespaceTokenizer(new StringReader("Des mot clés À LA CHAÎNE À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï IJ Ð Ñ Ò Ó Ô Õ Ö Ø Œ Þ Ù Ú Û Ü Ý Ÿ à á â ã ä å æ ç è é ê ë ì í î ï ij ð ñ ò ó ô õ ö ø œ ß þ ù ú û ü ý ÿ fi fl"));
     ISOLatin1AccentFilter filter = new ISOLatin1AccentFilter(stream);
-    final Token reusableToken = new Token();
-    assertEquals("Des", filter.next(reusableToken).term());
-    assertEquals("mot", filter.next(reusableToken).term());
-    assertEquals("cles", filter.next(reusableToken).term());
-    assertEquals("A", filter.next(reusableToken).term());
-    assertEquals("LA", filter.next(reusableToken).term());
-    assertEquals("CHAINE", filter.next(reusableToken).term());
-    assertEquals("A", filter.next(reusableToken).term());
-    assertEquals("A", filter.next(reusableToken).term());
-    assertEquals("A", filter.next(reusableToken).term());
-    assertEquals("A", filter.next(reusableToken).term());
-    assertEquals("A", filter.next(reusableToken).term());
-    assertEquals("A", filter.next(reusableToken).term());
-    assertEquals("AE", filter.next(reusableToken).term());
-    assertEquals("C", filter.next(reusableToken).term());
-    assertEquals("E", filter.next(reusableToken).term());
-    assertEquals("E", filter.next(reusableToken).term());
-    assertEquals("E", filter.next(reusableToken).term());
-    assertEquals("E", filter.next(reusableToken).term());
-    assertEquals("I", filter.next(reusableToken).term());
-    assertEquals("I", filter.next(reusableToken).term());
-    assertEquals("I", filter.next(reusableToken).term());
-    assertEquals("I", filter.next(reusableToken).term());
-    assertEquals("IJ", filter.next(reusableToken).term());
-    assertEquals("D", filter.next(reusableToken).term());
-    assertEquals("N", filter.next(reusableToken).term());
-    assertEquals("O", filter.next(reusableToken).term());
-    assertEquals("O", filter.next(reusableToken).term());
-    assertEquals("O", filter.next(reusableToken).term());
-    assertEquals("O", filter.next(reusableToken).term());
-    assertEquals("O", filter.next(reusableToken).term());
-    assertEquals("O", filter.next(reusableToken).term());
-    assertEquals("OE", filter.next(reusableToken).term());
-    assertEquals("TH", filter.next(reusableToken).term());
-    assertEquals("U", filter.next(reusableToken).term());
-    assertEquals("U", filter.next(reusableToken).term());
-    assertEquals("U", filter.next(reusableToken).term());
-    assertEquals("U", filter.next(reusableToken).term());
-    assertEquals("Y", filter.next(reusableToken).term());
-    assertEquals("Y", filter.next(reusableToken).term());
-    assertEquals("a", filter.next(reusableToken).term());
-    assertEquals("a", filter.next(reusableToken).term());
-    assertEquals("a", filter.next(reusableToken).term());
-    assertEquals("a", filter.next(reusableToken).term());
-    assertEquals("a", filter.next(reusableToken).term());
-    assertEquals("a", filter.next(reusableToken).term());
-    assertEquals("ae", filter.next(reusableToken).term());
-    assertEquals("c", filter.next(reusableToken).term());
-    assertEquals("e", filter.next(reusableToken).term());
-    assertEquals("e", filter.next(reusableToken).term());
-    assertEquals("e", filter.next(reusableToken).term());
-    assertEquals("e", filter.next(reusableToken).term());
-    assertEquals("i", filter.next(reusableToken).term());
-    assertEquals("i", filter.next(reusableToken).term());
-    assertEquals("i", filter.next(reusableToken).term());
-    assertEquals("i", filter.next(reusableToken).term());
-    assertEquals("ij", filter.next(reusableToken).term());
-    assertEquals("d", filter.next(reusableToken).term());
-    assertEquals("n", filter.next(reusableToken).term());
-    assertEquals("o", filter.next(reusableToken).term());
-    assertEquals("o", filter.next(reusableToken).term());
-    assertEquals("o", filter.next(reusableToken).term());
-    assertEquals("o", filter.next(reusableToken).term());
-    assertEquals("o", filter.next(reusableToken).term());
-    assertEquals("o", filter.next(reusableToken).term());
-    assertEquals("oe", filter.next(reusableToken).term());
-    assertEquals("ss", filter.next(reusableToken).term());
-    assertEquals("th", filter.next(reusableToken).term());
-    assertEquals("u", filter.next(reusableToken).term());
-    assertEquals("u", filter.next(reusableToken).term());
-    assertEquals("u", filter.next(reusableToken).term());
-    assertEquals("u", filter.next(reusableToken).term());
-    assertEquals("y", filter.next(reusableToken).term());
-    assertEquals("y", filter.next(reusableToken).term());
-    assertEquals("fi", filter.next(reusableToken).term());
-    assertEquals("fl", filter.next(reusableToken).term());
-    assertNull(filter.next(reusableToken));
+    TermAttribute termAtt = (TermAttribute) filter.getAttribute(TermAttribute.class);
+    assertTermEquals("Des", filter, termAtt);
+    assertTermEquals("mot", filter, termAtt);
+    assertTermEquals("cles", filter, termAtt);
+    assertTermEquals("A", filter, termAtt);
+    assertTermEquals("LA", filter, termAtt);
+    assertTermEquals("CHAINE", filter, termAtt);
+    assertTermEquals("A", filter, termAtt);
+    assertTermEquals("A", filter, termAtt);
+    assertTermEquals("A", filter, termAtt);
+    assertTermEquals("A", filter, termAtt);
+    assertTermEquals("A", filter, termAtt);
+    assertTermEquals("A", filter, termAtt);
+    assertTermEquals("AE", filter, termAtt);
+    assertTermEquals("C", filter, termAtt);
+    assertTermEquals("E", filter, termAtt);
+    assertTermEquals("E", filter, termAtt);
+    assertTermEquals("E", filter, termAtt);
+    assertTermEquals("E", filter, termAtt);
+    assertTermEquals("I", filter, termAtt);
+    assertTermEquals("I", filter, termAtt);
+    assertTermEquals("I", filter, termAtt);
+    assertTermEquals("I", filter, termAtt);
+    assertTermEquals("IJ", filter, termAtt);
+    assertTermEquals("D", filter, termAtt);
+    assertTermEquals("N", filter, termAtt);
+    assertTermEquals("O", filter, termAtt);
+    assertTermEquals("O", filter, termAtt);
+    assertTermEquals("O", filter, termAtt);
+    assertTermEquals("O", filter, termAtt);
+    assertTermEquals("O", filter, termAtt);
+    assertTermEquals("O", filter, termAtt);
+    assertTermEquals("OE", filter, termAtt);
+    assertTermEquals("TH", filter, termAtt);
+    assertTermEquals("U", filter, termAtt);
+    assertTermEquals("U", filter, termAtt);
+    assertTermEquals("U", filter, termAtt);
+    assertTermEquals("U", filter, termAtt);
+    assertTermEquals("Y", filter, termAtt);
+    assertTermEquals("Y", filter, termAtt);
+    assertTermEquals("a", filter, termAtt);
+    assertTermEquals("a", filter, termAtt);
+    assertTermEquals("a", filter, termAtt);
+    assertTermEquals("a", filter, termAtt);
+    assertTermEquals("a", filter, termAtt);
+    assertTermEquals("a", filter, termAtt);
+    assertTermEquals("ae", filter, termAtt);
+    assertTermEquals("c", filter, termAtt);
+    assertTermEquals("e", filter, termAtt);
+    assertTermEquals("e", filter, termAtt);
+    assertTermEquals("e", filter, termAtt);
+    assertTermEquals("e", filter, termAtt);
+    assertTermEquals("i", filter, termAtt);
+    assertTermEquals("i", filter, termAtt);
+    assertTermEquals("i", filter, termAtt);
+    assertTermEquals("i", filter, termAtt);
+    assertTermEquals("ij", filter, termAtt);
+    assertTermEquals("d", filter, termAtt);
+    assertTermEquals("n", filter, termAtt);
+    assertTermEquals("o", filter, termAtt);
+    assertTermEquals("o", filter, termAtt);
+    assertTermEquals("o", filter, termAtt);
+    assertTermEquals("o", filter, termAtt);
+    assertTermEquals("o", filter, termAtt);
+    assertTermEquals("o", filter, termAtt);
+    assertTermEquals("oe", filter, termAtt);
+    assertTermEquals("ss", filter, termAtt);
+    assertTermEquals("th", filter, termAtt);
+    assertTermEquals("u", filter, termAtt);
+    assertTermEquals("u", filter, termAtt);
+    assertTermEquals("u", filter, termAtt);
+    assertTermEquals("u", filter, termAtt);
+    assertTermEquals("y", filter, termAtt);
+    assertTermEquals("y", filter, termAtt);
+    assertTermEquals("fi", filter, termAtt);
+    assertTermEquals("fl", filter, termAtt);
+    assertFalse(filter.incrementToken());
+  }
+
+  void assertTermEquals(String expected, TokenStream stream, TermAttribute termAtt) throws Exception {
+    assertTrue(stream.incrementToken());
+    assertEquals(expected, termAtt.term());
   }
 }
@@ -19,6 +19,7 @@ package org.apache.lucene.analysis;
 
 import java.io.StringReader;
 
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.index.IndexReader;
@@ -88,9 +89,9 @@ public class TestKeywordAnalyzer extends LuceneTestCase {
   // LUCENE-1441
   public void testOffsets() throws Exception {
     TokenStream stream = new KeywordAnalyzer().tokenStream("field", new StringReader("abcd"));
-    Token token = new Token();
-    assertTrue(stream.next(token) != null);
-    assertEquals(0, token.startOffset);
-    assertEquals(4, token.endOffset);
+    OffsetAttribute offsetAtt = (OffsetAttribute) stream.addAttribute(OffsetAttribute.class);
+    assertTrue(stream.incrementToken());
+    assertEquals(0, offsetAtt.startOffset());
+    assertEquals(4, offsetAtt.endOffset());
   }
 }
@@ -17,6 +17,7 @@ package org.apache.lucene.analysis;
  * limitations under the License.
  */
 
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.util.LuceneTestCase;
 
 import java.io.StringReader;
@@ -27,11 +28,15 @@ public class TestLengthFilter extends LuceneTestCase {
     TokenStream stream = new WhitespaceTokenizer(
         new StringReader("short toolong evenmuchlongertext a ab toolong foo"));
     LengthFilter filter = new LengthFilter(stream, 2, 6);
-    final Token reusableToken = new Token();
-    assertEquals("short", filter.next(reusableToken).term());
-    assertEquals("ab", filter.next(reusableToken).term());
-    assertEquals("foo", filter.next(reusableToken).term());
-    assertNull(filter.next(reusableToken));
+    TermAttribute termAtt = (TermAttribute) filter.getAttribute(TermAttribute.class);
+
+    assertTrue(filter.incrementToken());
+    assertEquals("short", termAtt.term());
+    assertTrue(filter.incrementToken());
+    assertEquals("ab", termAtt.term());
+    assertTrue(filter.incrementToken());
+    assertEquals("foo", termAtt.term());
+    assertFalse(filter.incrementToken());
   }
 
 }
@@ -1,8 +1,10 @@
 package org.apache.lucene.analysis;
 
-import org.apache.lucene.util.LuceneTestCase;
 import java.io.StringReader;
 
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.util.LuceneTestCase;
+
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
@@ -29,17 +31,19 @@ public class TestPerFieldAnalzyerWrapper extends LuceneTestCase {
 
     TokenStream tokenStream = analyzer.tokenStream("field",
         new StringReader(text));
-    final Token reusableToken = new Token();
-    Token nextToken = tokenStream.next(reusableToken);
+    TermAttribute termAtt = (TermAttribute) tokenStream.getAttribute(TermAttribute.class);
+
+    assertTrue(tokenStream.incrementToken());
     assertEquals("WhitespaceAnalyzer does not lowercase",
                  "Qwerty",
-                 nextToken.term());
+                 termAtt.term());
 
     tokenStream = analyzer.tokenStream("special",
         new StringReader(text));
-    nextToken = tokenStream.next(reusableToken);
+    termAtt = (TermAttribute) tokenStream.getAttribute(TermAttribute.class);
+    assertTrue(tokenStream.incrementToken());
     assertEquals("SimpleAnalyzer lowercases",
                  "qwerty",
-                 nextToken.term());
+                 termAtt.term());
   }
 }
@@ -1,6 +1,10 @@
 package org.apache.lucene.analysis;
 
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.util.LuceneTestCase;
 
 import java.io.StringReader;
@@ -35,19 +39,25 @@ public class TestStandardAnalyzer extends LuceneTestCase {
 
   public void assertAnalyzesTo(Analyzer a, String input, String[] expectedImages, String[] expectedTypes, int[] expectedPosIncrs) throws Exception {
     TokenStream ts = a.tokenStream("dummy", new StringReader(input));
-    final Token reusableToken = new Token();
+    // TODO Java 1.5
+    //final TypeAttribute typeAtt = reusableToken.getAttribute(TypeAttribute.class);
+    //final PositionIncrementAttribute posIncrAtt = reusableToken.getAttribute(PositionIncrementAttribute.class);
+
+    final TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
+    final TypeAttribute typeAtt = (TypeAttribute) ts.getAttribute(TypeAttribute.class);
+    final PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) ts.getAttribute(PositionIncrementAttribute.class);
+
     for (int i = 0; i < expectedImages.length; i++) {
-      Token nextToken = ts.next(reusableToken);
-      assertNotNull(nextToken);
-      assertEquals(expectedImages[i], nextToken.term());
+      assertTrue(ts.incrementToken());
+      assertEquals(expectedImages[i], new String(termAtt.termBuffer(), 0, termAtt.termLength()));
       if (expectedTypes != null) {
-        assertEquals(expectedTypes[i], nextToken.type());
+        assertEquals(expectedTypes[i], typeAtt.type());
       }
       if (expectedPosIncrs != null) {
-        assertEquals(expectedPosIncrs[i], nextToken.getPositionIncrement());
+        assertEquals(expectedPosIncrs[i], posIncrAtt.getPositionIncrement());
       }
     }
-    assertNull(ts.next(reusableToken));
+    assertFalse(ts.incrementToken());
     ts.close();
   }
 
@@ -17,6 +17,8 @@ package org.apache.lucene.analysis;
  * limitations under the License.
  */

+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.util.LuceneTestCase;

 import java.io.StringReader;

@@ -45,9 +47,10 @@ public class TestStopAnalyzer extends LuceneTestCase {
     StringReader reader = new StringReader("This is a test of the english stop analyzer");
     TokenStream stream = stop.tokenStream("test", reader);
     assertTrue(stream != null);
-    final Token reusableToken = new Token();
-    for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
-      assertFalse(inValidTokens.contains(nextToken.term()));
+    TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class);
+
+    while (stream.incrementToken()) {
+      assertFalse(inValidTokens.contains(termAtt.term()));
     }
   }

@@ -60,11 +63,13 @@ public class TestStopAnalyzer extends LuceneTestCase {
     StringReader reader = new StringReader("This is a good test of the english stop analyzer");
     TokenStream stream = newStop.tokenStream("test", reader);
     assertNotNull(stream);
-    final Token reusableToken = new Token();
-    for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
-      String text = nextToken.term();
+    TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class);
+    PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) stream.addAttribute(PositionIncrementAttribute.class);
+
+    while (stream.incrementToken()) {
+      String text = termAtt.term();
       assertFalse(stopWordsSet.contains(text));
-      assertEquals(1,nextToken.getPositionIncrement()); // by default stop tokenizer does not apply increments.
+      assertEquals(1,posIncrAtt.getPositionIncrement()); // by default stop tokenizer does not apply increments.
     }
   }

@@ -82,11 +87,13 @@ public class TestStopAnalyzer extends LuceneTestCase {
     TokenStream stream = newStop.tokenStream("test", reader);
     assertNotNull(stream);
     int i = 0;
-    final Token reusableToken = new Token();
-    for (Token nextToken = stream.next(reusableToken); nextToken != null; nextToken = stream.next(reusableToken)) {
-      String text = nextToken.term();
+    TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class);
+    PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) stream.addAttribute(PositionIncrementAttribute.class);
+
+    while (stream.incrementToken()) {
+      String text = termAtt.term();
       assertFalse(stopWordsSet.contains(text));
-      assertEquals(expectedIncr[i++],nextToken.getPositionIncrement());
+      assertEquals(expectedIncr[i++],posIncrAtt.getPositionIncrement());
     }
   } finally {
     StopFilter.setEnablePositionIncrementsDefault(defaultEnable);
@@ -16,6 +16,8 @@ package org.apache.lucene.analysis;
  * limitations under the License.
  */

+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.util.English;
 import org.apache.lucene.util.LuceneTestCase;

@@ -35,19 +37,22 @@ public class TestStopFilter extends LuceneTestCase {
     StringReader reader = new StringReader("Now is The Time");
     String[] stopWords = new String[] { "is", "the", "Time" };
     TokenStream stream = new StopFilter(new WhitespaceTokenizer(reader), stopWords);
-    final Token reusableToken = new Token();
-    assertEquals("Now", stream.next(reusableToken).term());
-    assertEquals("The", stream.next(reusableToken).term());
-    assertEquals(null, stream.next(reusableToken));
+    final TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class);
+    assertTrue(stream.incrementToken());
+    assertEquals("Now", termAtt.term());
+    assertTrue(stream.incrementToken());
+    assertEquals("The", termAtt.term());
+    assertFalse(stream.incrementToken());
   }

   public void testIgnoreCase() throws IOException {
     StringReader reader = new StringReader("Now is The Time");
     String[] stopWords = new String[] { "is", "the", "Time" };
     TokenStream stream = new StopFilter(new WhitespaceTokenizer(reader), stopWords, true);
-    final Token reusableToken = new Token();
-    assertEquals("Now", stream.next(reusableToken).term());
-    assertEquals(null,stream.next(reusableToken));
+    final TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class);
+    assertTrue(stream.incrementToken());
+    assertEquals("Now", termAtt.term());
+    assertFalse(stream.incrementToken());
   }

   public void testStopFilt() throws IOException {

@@ -55,10 +60,12 @@ public class TestStopFilter extends LuceneTestCase {
     String[] stopWords = new String[] { "is", "the", "Time" };
     Set stopSet = StopFilter.makeStopSet(stopWords);
     TokenStream stream = new StopFilter(new WhitespaceTokenizer(reader), stopSet);
-    final Token reusableToken = new Token();
-    assertEquals("Now", stream.next(reusableToken).term());
-    assertEquals("The", stream.next(reusableToken).term());
-    assertEquals(null, stream.next(reusableToken));
+    final TermAttribute termAtt = (TermAttribute) stream.getAttribute(TermAttribute.class);
+    assertTrue(stream.incrementToken());
+    assertEquals("Now", termAtt.term());
+    assertTrue(stream.incrementToken());
+    assertEquals("The", termAtt.term());
+    assertFalse(stream.incrementToken());
   }

   /**

@@ -110,15 +117,16 @@ public class TestStopFilter extends LuceneTestCase {
   private void doTestStopPositons(StopFilter stpf, boolean enableIcrements) throws IOException {
     log("---> test with enable-increments-"+(enableIcrements?"enabled":"disabled"));
     stpf.setEnablePositionIncrements(enableIcrements);
-    final Token reusableToken = new Token();
+    TermAttribute termAtt = (TermAttribute) stpf.getAttribute(TermAttribute.class);
+    PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) stpf.getAttribute(PositionIncrementAttribute.class);
     for (int i=0; i<20; i+=3) {
-      Token nextToken = stpf.next(reusableToken);
-      log("Token "+i+": "+nextToken);
+      assertTrue(stpf.incrementToken());
+      log("Token "+i+": "+stpf);
       String w = English.intToEnglish(i).trim();
-      assertEquals("expecting token "+i+" to be "+w,w,nextToken.term());
-      assertEquals("all but first token must have position increment of 3",enableIcrements?(i==0?1:3):1,nextToken.getPositionIncrement());
+      assertEquals("expecting token "+i+" to be "+w,w,termAtt.term());
+      assertEquals("all but first token must have position increment of 3",enableIcrements?(i==0?1:3):1,posIncrAtt.getPositionIncrement());
     }
-    assertNull(stpf.next(reusableToken));
+    assertFalse(stpf.incrementToken());
   }

   // print debug info depending on VERBOSE
@@ -19,6 +19,7 @@ package org.apache.lucene.analysis;

 import org.apache.lucene.util.LuceneTestCase;

+/** @deprecated */
 public class TestToken extends LuceneTestCase {

   public TestToken(String name) {
@@ -22,12 +22,14 @@ import java.io.Reader;

 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.SimpleAnalyzer;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.WhitespaceAnalyzer;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.Fieldable;

@@ -35,6 +37,7 @@ import org.apache.lucene.document.Field.Index;
 import org.apache.lucene.document.Field.Store;
 import org.apache.lucene.document.Field.TermVector;
 import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.util.AttributeSource;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util._TestUtil;

@@ -138,33 +141,38 @@ public class TestDocumentWriter extends LuceneTestCase {
       public TokenStream tokenStream(String fieldName, Reader reader) {
         return new TokenFilter(new WhitespaceTokenizer(reader)) {
           boolean first=true;
-          Token buffered;
+          AttributeSource state;

-          public Token next(final Token reusableToken) throws IOException {
-            if (buffered != null) {
-              Token nextToken = buffered;
-              buffered=null;
-              return nextToken;
+          public boolean incrementToken() throws IOException {
+            if (state != null) {
+              state.restoreState(this);
+              payloadAtt.setPayload(null);
+              posIncrAtt.setPositionIncrement(0);
+              termAtt.setTermBuffer(new char[]{'b'}, 0, 1);
+              state = null;
+              return true;
             }
-            Token nextToken = input.next(reusableToken);
-            if (nextToken==null) return null;
-            if (Character.isDigit(nextToken.termBuffer()[0])) {
-              nextToken.setPositionIncrement(nextToken.termBuffer()[0] - '0');
+            boolean hasNext = input.incrementToken();
+            if (!hasNext) return false;
+            if (Character.isDigit(termAtt.termBuffer()[0])) {
+              posIncrAtt.setPositionIncrement(termAtt.termBuffer()[0] - '0');
             }
             if (first) {
               // set payload on first position only
-              nextToken.setPayload(new Payload(new byte[]{100}));
+              payloadAtt.setPayload(new Payload(new byte[]{100}));
               first = false;
             }

             // index a "synonym" for every token
-            buffered = (Token)nextToken.clone();
-            buffered.setPayload(null);
-            buffered.setPositionIncrement(0);
-            buffered.setTermBuffer(new char[]{'b'}, 0, 1);
-
-            return nextToken;
+            state = captureState();
+            return true;
           }

+          TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+          PayloadAttribute payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
+          PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
         };
       }
     };

@@ -201,12 +209,14 @@ public class TestDocumentWriter extends LuceneTestCase {
       private String[] tokens = new String[] {"term1", "term2", "term3", "term2"};
       private int index = 0;

-      public Token next(final Token reusableToken) throws IOException {
-        assert reusableToken != null;
+      private TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+
+      public boolean incrementToken() throws IOException {
         if (index == tokens.length) {
-          return null;
+          return false;
         } else {
-          return reusableToken.reinit(tokens[index++], 0, 0);
+          termAtt.setTermBuffer(tokens[index++]);
+          return true;
         }
       }
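The synonym filter above also shows the replacement for cloning and buffering Token instances: a filter captures the current attribute values with captureState() and replays them later with restoreState(). A minimal sketch of that buffering pattern (not part of this commit; the class name and the injected "SYN" term are invented for illustration):

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.util.AttributeSource;

public class SimpleSynonymFilter extends TokenFilter {
  private final TermAttribute termAtt;
  private final PositionIncrementAttribute posIncrAtt;
  private AttributeSource pending; // captured state of the previous token

  public SimpleSynonymFilter(TokenStream input) {
    super(input);
    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
    posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
  }

  public boolean incrementToken() throws IOException {
    if (pending != null) {
      // replay the captured token, then overwrite its term and stack it
      // at the same position as the original token
      pending.restoreState(this);
      termAtt.setTermBuffer("SYN");
      posIncrAtt.setPositionIncrement(0);
      pending = null;
      return true;
    }
    if (!input.incrementToken()) {
      return false;
    }
    pending = captureState(); // emit a stacked synonym on the next call
    return true;
  }
}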
@@ -17,48 +17,48 @@ package org.apache.lucene.index;
  * limitations under the License.
  */

-import java.io.IOException;
-import java.io.Reader;
-import java.io.File;
 import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.IOException;
 import java.io.PrintStream;
-import java.util.Arrays;
+import java.io.Reader;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.List;
 import java.util.Random;

-import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.util.UnicodeUtil;
-
-import org.apache.lucene.analysis.WhitespaceAnalyzer;
-import org.apache.lucene.analysis.WhitespaceTokenizer;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.SinkTokenizer;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
-import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.PhraseQuery;
+import org.apache.lucene.search.Query;
 import org.apache.lucene.search.ScoreDoc;
 import org.apache.lucene.search.TermQuery;
-import org.apache.lucene.search.Query;
 import org.apache.lucene.search.spans.SpanTermQuery;
-import org.apache.lucene.search.PhraseQuery;
+import org.apache.lucene.store.AlreadyClosedException;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FSDirectory;
-import org.apache.lucene.store.RAMDirectory;
 import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.store.IndexOutput;
-import org.apache.lucene.store.AlreadyClosedException;
-import org.apache.lucene.util._TestUtil;
-
-import org.apache.lucene.store.MockRAMDirectory;
-import org.apache.lucene.store.LockFactory;
 import org.apache.lucene.store.Lock;
+import org.apache.lucene.store.LockFactory;
+import org.apache.lucene.store.MockRAMDirectory;
+import org.apache.lucene.store.RAMDirectory;
 import org.apache.lucene.store.SingleInstanceLockFactory;
+import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.util._TestUtil;

 /**
  *

@@ -1793,11 +1793,11 @@ public class TestIndexWriter extends LuceneTestCase
       return new TokenFilter(new StandardTokenizer(reader)) {
         private int count = 0;

-        public Token next(final Token reusableToken) throws IOException {
+        public boolean incrementToken() throws IOException {
           if (count++ == 5) {
             throw new IOException();
           }
-          return input.next(reusableToken);
+          return input.incrementToken();
         }
       };
     }

@@ -1916,10 +1916,10 @@ public class TestIndexWriter extends LuceneTestCase
       this.fieldName = fieldName;
     }

-    public Token next(final Token reusableToken) throws IOException {
+    public boolean incrementToken() throws IOException {
       if (this.fieldName.equals("crash") && count++ >= 4)
         throw new IOException("I'm experiencing problems");
-      return input.next(reusableToken);
+      return input.incrementToken();
     }

     public void reset() throws IOException {

@@ -3577,21 +3577,47 @@ public class TestIndexWriter extends LuceneTestCase
     }
   }

+  private static class MyAnalyzer extends Analyzer {
+
+    public TokenStream tokenStream(String fieldName, Reader reader) {
+      TokenStream s = new WhitespaceTokenizer(reader);
+      s.addAttribute(PositionIncrementAttribute.class);
+      return s;
+    }
+
+  }
+
   // LUCENE-1255
   public void testNegativePositions() throws Throwable {
     SinkTokenizer tokens = new SinkTokenizer();
-    Token t = new Token();
-    t.setTermBuffer("a");
-    t.setPositionIncrement(0);
-    tokens.add(t);
-    t.setTermBuffer("b");
-    t.setPositionIncrement(1);
-    tokens.add(t);
-    t.setTermBuffer("c");
-    tokens.add(t);
+    tokens.addAttribute(TermAttribute.class);
+    tokens.addAttribute(PositionIncrementAttribute.class);
+
+    AttributeSource state = new AttributeSource();
+    TermAttribute termAtt = (TermAttribute) state.addAttribute(TermAttribute.class);
+    PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) state.addAttribute(PositionIncrementAttribute.class);
+    termAtt.setTermBuffer("a");
+    posIncrAtt.setPositionIncrement(0);
+    tokens.add(state);
+
+    state = new AttributeSource();
+    termAtt = (TermAttribute) state.addAttribute(TermAttribute.class);
+    posIncrAtt = (PositionIncrementAttribute) state.addAttribute(PositionIncrementAttribute.class);
+
+    termAtt.setTermBuffer("b");
+    posIncrAtt.setPositionIncrement(1);
+    tokens.add(state);
+
+    state = new AttributeSource();
+    termAtt = (TermAttribute) state.addAttribute(TermAttribute.class);
+    posIncrAtt = (PositionIncrementAttribute) state.addAttribute(PositionIncrementAttribute.class);
+
+    termAtt.setTermBuffer("c");
+    posIncrAtt.setPositionIncrement(1);
+    tokens.add(state);
+
     MockRAMDirectory dir = new MockRAMDirectory();
-    IndexWriter w = new IndexWriter(dir, new WhitespaceAnalyzer(), true, IndexWriter.MaxFieldLength.UNLIMITED);
+    IndexWriter w = new IndexWriter(dir, new MyAnalyzer(), true, IndexWriter.MaxFieldLength.UNLIMITED);
     Document doc = new Document();
     doc.add(new Field("field", tokens));
     w.addDocument(doc);
@@ -20,19 +20,18 @@ package org.apache.lucene.index;
 import java.io.IOException;
 import java.io.Reader;

-import org.apache.lucene.util.LuceneTestCase;
-
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.LowerCaseTokenizer;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.Field.Index;
 import org.apache.lucene.document.Field.Store;
 import org.apache.lucene.store.IndexInput;
 import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.util.LuceneTestCase;

 /**
  * This testcase tests whether multi-level skipping is being used

@@ -99,17 +98,19 @@ public class TestMultiLevelSkipList extends LuceneTestCase {
   private static class PayloadFilter extends TokenFilter {
     static int count = 0;

+    PayloadAttribute payloadAtt;
+
     protected PayloadFilter(TokenStream input) {
       super(input);
+      payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
     }

-    public Token next(final Token reusableToken) throws IOException {
-      assert reusableToken != null;
-      Token nextToken = input.next(reusableToken);
-      if (nextToken != null) {
-        nextToken.setPayload(new Payload(new byte[] { (byte) count++ }));
-      }
-      return nextToken;
+    public boolean incrementToken() throws IOException {
+      boolean hasNext = input.incrementToken();
+      if (hasNext) {
+        payloadAtt.setPayload(new Payload(new byte[] { (byte) count++ }));
+      }
+      return hasNext;
     }

   }
@@ -27,20 +27,20 @@ import java.util.List;
 import java.util.Map;
 import java.util.Random;

-import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.util.UnicodeUtil;
-
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.WhitespaceAnalyzer;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FSDirectory;
 import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.UnicodeUtil;


 public class TestPayloads extends LuceneTestCase {

@@ -442,32 +442,33 @@ public class TestPayloads extends LuceneTestCase {
     private int length;
     private int offset;
     Payload payload = new Payload();
+    PayloadAttribute payloadAtt;

     public PayloadFilter(TokenStream in, byte[] data, int offset, int length) {
       super(in);
       this.data = data;
       this.length = length;
       this.offset = offset;
+      payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
     }

-    public Token next(final Token reusableToken) throws IOException {
-      assert reusableToken != null;
-      Token nextToken = input.next(reusableToken);
-      if (nextToken != null) {
+    public boolean incrementToken() throws IOException {
+      boolean hasNext = input.incrementToken();
+      if (hasNext) {
         if (offset + length <= data.length) {
           Payload p = null;
           if (p == null) {
             p = new Payload();
-            nextToken.setPayload(p);
+            payloadAtt.setPayload(p);
           }
           p.setData(data, offset, length);
           offset += length;
         } else {
-          nextToken.setPayload(null);
+          payloadAtt.setPayload(null);
         }
       }

-      return nextToken;
+      return hasNext;
     }
   }

@@ -529,19 +530,25 @@ public class TestPayloads extends LuceneTestCase {
     private boolean first;
     private ByteArrayPool pool;
     private String term;

+    TermAttribute termAtt;
+    PayloadAttribute payloadAtt;
+
     PoolingPayloadTokenStream(ByteArrayPool pool) {
       this.pool = pool;
       payload = pool.get();
       generateRandomData(payload);
       term = pool.bytesToString(payload);
       first = true;
+      payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
+      termAtt = (TermAttribute) addAttribute(TermAttribute.class);
     }

-    public Token next(final Token reusableToken) throws IOException {
-      if (!first) return null;
-      reusableToken.reinit(term, 0, 0);
-      reusableToken.setPayload(new Payload(payload));
-      return reusableToken;
+    public boolean incrementToken() throws IOException {
+      if (!first) return false;
+      termAtt.setTermBuffer(term);
+      payloadAtt.setPayload(new Payload(payload));
+      return true;
     }

     public void close() throws IOException {
@@ -17,14 +17,6 @@ package org.apache.lucene.index;
  * limitations under the License.
  */

-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.store.MockRAMDirectory;
-import org.apache.lucene.util.LuceneTestCase;
-
 import java.io.IOException;
 import java.io.Reader;
 import java.util.Arrays;

@@ -32,6 +24,16 @@ import java.util.Iterator;
 import java.util.Map;
 import java.util.SortedSet;

+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.store.MockRAMDirectory;
+import org.apache.lucene.util.LuceneTestCase;

 public class TestTermVectorsReader extends LuceneTestCase {
   //Must be lexicographically sorted, will do in setup, versus trying to maintain here
   private String[] testFields = {"f1", "f2", "f3", "f4"};

@@ -118,17 +120,31 @@ public class TestTermVectorsReader extends LuceneTestCase {

   private class MyTokenStream extends TokenStream {
     int tokenUpto;
-    public Token next(final Token reusableToken) {
+
+    TermAttribute termAtt;
+    PositionIncrementAttribute posIncrAtt;
+    OffsetAttribute offsetAtt;
+
+    public MyTokenStream() {
+      termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+      posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+      offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+    }
+
+    public boolean incrementToken() {
       if (tokenUpto >= tokens.length)
-        return null;
+        return false;
       else {
         final TestToken testToken = tokens[tokenUpto++];
-        reusableToken.reinit(testToken.text, testToken.startOffset, testToken.endOffset);
-        if (tokenUpto > 1)
-          reusableToken.setPositionIncrement(testToken.pos - tokens[tokenUpto-2].pos);
-        else
-          reusableToken.setPositionIncrement(testToken.pos+1);
-        return reusableToken;
+        termAtt.setTermBuffer(testToken.text);
+        offsetAtt.setStartOffset(testToken.startOffset);
+        offsetAtt.setEndOffset(testToken.endOffset);
+        if (tokenUpto > 1) {
+          posIncrAtt.setPositionIncrement(testToken.pos - tokens[tokenUpto-2].pos);
+        } else {
+          posIncrAtt.setPositionIncrement(testToken.pos+1);
+        }
+        return true;
       }
     }
   }
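MyTokenStream above illustrates the producer side of the new API: a TokenStream registers the attributes it will fill in its constructor via addAttribute() and then only updates those instances in incrementToken(). A small stand-alone sketch of such a producer (not part of this commit; the class name and offset bookkeeping are illustrative only):

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class ListTokenStream extends TokenStream {
  private final String[] terms;
  private int upto = 0;
  private int offset = 0;

  private final TermAttribute termAtt;
  private final OffsetAttribute offsetAtt;

  public ListTokenStream(String[] terms) {
    this.terms = terms;
    // declare up front which attributes this stream provides
    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
    offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
  }

  public boolean incrementToken() {
    if (upto == terms.length) {
      return false;
    }
    String term = terms[upto++];
    termAtt.setTermBuffer(term);
    offsetAtt.setStartOffset(offset);
    offsetAtt.setEndOffset(offset + term.length());
    offset += term.length() + 1; // assume terms were separated by one character
    return true;
  }
}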
@@ -17,18 +17,18 @@ package org.apache.lucene.index;
  */


-import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.RAMDirectory;
+import java.io.IOException;
+import java.io.Reader;
+import java.util.Random;

 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
-import java.io.Reader;
-import java.io.IOException;
-import java.util.Random;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.util.LuceneTestCase;

 /**
  * @version $Id$

@@ -36,15 +36,21 @@ import java.util.Random;

 class RepeatingTokenStream extends TokenStream {
   public int num;
-  Token t;
+  TermAttribute termAtt;
+  String value;

   public RepeatingTokenStream(String val) {
-    t = new Token(0,val.length());
-    t.setTermBuffer(val);
+    this.value = val;
+    this.termAtt = (TermAttribute) addAttribute(TermAttribute.class);
   }

-  public Token next(final Token reusableToken) throws IOException {
-    return --num<0 ? null : (Token) t.clone();
+  public boolean incrementToken() throws IOException {
+    num--;
+    if (num >= 0) {
+      termAtt.setTermBuffer(value);
+      return true;
+    }
+    return false;
   }
 }
@@ -17,17 +17,20 @@ package org.apache.lucene.queryParser;
  * limitations under the License.
  */

+import java.io.IOException;
 import java.io.Reader;

-import org.apache.lucene.util.LuceneTestCase;
-
-import org.apache.lucene.search.Query;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.LowerCaseFilter;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.util.LuceneTestCase;

 /**
  * Test QueryParser's ability to deal with Analyzers that return more

@@ -140,34 +143,49 @@ public class TestMultiAnalyzer extends LuceneTestCase {

   private final class TestFilter extends TokenFilter {

-    private Token prevToken;
+    private String prevType;
+    private int prevStartOffset;
+    private int prevEndOffset;
+
+    TermAttribute termAtt;
+    PositionIncrementAttribute posIncrAtt;
+    OffsetAttribute offsetAtt;
+    TypeAttribute typeAtt;

     public TestFilter(TokenStream in) {
       super(in);
+      termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+      posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+      offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+      typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
     }

-    public final Token next(final Token reusableToken) throws java.io.IOException {
+    public final boolean incrementToken() throws java.io.IOException {
       if (multiToken > 0) {
-        reusableToken.reinit("multi"+(multiToken+1), prevToken.startOffset(), prevToken.endOffset(), prevToken.type());
-        reusableToken.setPositionIncrement(0);
+        termAtt.setTermBuffer("multi"+(multiToken+1));
+        offsetAtt.setStartOffset(prevStartOffset);
+        offsetAtt.setEndOffset(prevEndOffset);
+        typeAtt.setType(prevType);
+        posIncrAtt.setPositionIncrement(0);
         multiToken--;
-        return reusableToken;
+        return true;
       } else {
-        Token nextToken = input.next(reusableToken);
-        if (nextToken == null) {
-          prevToken = null;
-          return null;
+        boolean next = input.incrementToken();
+        if (next == false) {
+          return false;
         }
-        prevToken = (Token) nextToken.clone();
-        String text = nextToken.term();
+        prevType = typeAtt.type();
+        prevStartOffset = offsetAtt.startOffset();
+        prevEndOffset = offsetAtt.endOffset();
+        String text = termAtt.term();
         if (text.equals("triplemulti")) {
           multiToken = 2;
-          return nextToken;
+          return true;
         } else if (text.equals("multi")) {
           multiToken = 1;
-          return nextToken;
+          return true;
         } else {
-          return nextToken;
+          return true;
         }
       }
     }

@@ -192,23 +210,28 @@ public class TestMultiAnalyzer extends LuceneTestCase {

   private final class TestPosIncrementFilter extends TokenFilter {

+    TermAttribute termAtt;
+    PositionIncrementAttribute posIncrAtt;
+
     public TestPosIncrementFilter(TokenStream in) {
       super(in);
+      termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+      posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
     }

-    public final Token next(final Token reusableToken) throws java.io.IOException {
-      for (Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken)) {
-        if (nextToken.term().equals("the")) {
+    public final boolean incrementToken () throws java.io.IOException {
+      while(input.incrementToken()) {
+        if (termAtt.term().equals("the")) {
           // stopword, do nothing
-        } else if (nextToken.term().equals("quick")) {
-          nextToken.setPositionIncrement(2);
-          return nextToken;
+        } else if (termAtt.term().equals("quick")) {
+          posIncrAtt.setPositionIncrement(2);
+          return true;
         } else {
-          nextToken.setPositionIncrement(1);
-          return nextToken;
+          posIncrAtt.setPositionIncrement(1);
+          return true;
         }
       }
-      return null;
+      return false;
     }
   }
@@ -22,7 +22,6 @@ import java.util.HashMap;
 import java.util.Map;

 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.document.Document;
@@ -19,8 +19,8 @@ package org.apache.lucene.queryParser;

 import java.io.IOException;
 import java.io.Reader;
-import java.text.DateFormat;
 import java.text.Collator;
+import java.text.DateFormat;
 import java.util.Calendar;
 import java.util.Date;
 import java.util.Locale;

@@ -31,11 +31,12 @@ import org.apache.lucene.analysis.LowerCaseTokenizer;
 import org.apache.lucene.analysis.SimpleAnalyzer;
 import org.apache.lucene.analysis.StopAnalyzer;
 import org.apache.lucene.analysis.StopFilter;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.WhitespaceAnalyzer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.document.DateField;
 import org.apache.lucene.document.DateTools;
 import org.apache.lucene.document.Document;

@@ -64,36 +65,47 @@ public class TestQueryParser extends LuceneTestCase {
   public static Analyzer qpAnalyzer = new QPTestAnalyzer();

   public static class QPTestFilter extends TokenFilter {
+    TermAttribute termAtt;
+    OffsetAttribute offsetAtt;
+
     /**
      * Filter which discards the token 'stop' and which expands the
      * token 'phrase' into 'phrase1 phrase2'
      */
     public QPTestFilter(TokenStream in) {
       super(in);
+      termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+      offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
     }

     boolean inPhrase = false;
     int savedStart = 0, savedEnd = 0;

-    public Token next(final Token reusableToken) throws IOException {
-      assert reusableToken != null;
+    public boolean incrementToken() throws IOException {
       if (inPhrase) {
         inPhrase = false;
-        return reusableToken.reinit("phrase2", savedStart, savedEnd);
+        termAtt.setTermBuffer("phrase2");
+        offsetAtt.setStartOffset(savedStart);
+        offsetAtt.setEndOffset(savedEnd);
+        return true;
       } else
-        for (Token nextToken = input.next(reusableToken); nextToken != null; nextToken = input.next(reusableToken)) {
-          if (nextToken.term().equals("phrase")) {
+        while (input.incrementToken()) {
+          if (termAtt.term().equals("phrase")) {
             inPhrase = true;
-            savedStart = nextToken.startOffset();
-            savedEnd = nextToken.endOffset();
-            return nextToken.reinit("phrase1", savedStart, savedEnd);
-          } else if (!nextToken.term().equals("stop"))
-            return nextToken;
+            savedStart = offsetAtt.startOffset();
+            savedEnd = offsetAtt.endOffset();
+            termAtt.setTermBuffer("phrase1");
+            offsetAtt.setStartOffset(savedStart);
+            offsetAtt.setEndOffset(savedEnd);
+            return true;
+          } else if (!termAtt.term().equals("stop"))
+            return true;
         }
-      return null;
+      return false;
     }
   }


   public static class QPTestAnalyzer extends Analyzer {

     /** Filters LowerCaseTokenizer with StopFilter. */
@@ -17,14 +17,16 @@ package org.apache.lucene.search;
  * limitations under the License.
  */

+import java.io.IOException;
 import java.io.Reader;
-import java.io.StringReader;

 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.StopFilter;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.index.IndexWriter;

@@ -49,14 +51,19 @@ public class TestPositionIncrement extends LuceneTestCase {
       private final int[] INCREMENTS = {1, 2, 1, 0, 1};
       private int i = 0;

-      public Token next(final Token reusableToken) {
-        assert reusableToken != null;
+      PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+      TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+      OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
+
+      public boolean incrementToken() {
         if (i == TOKENS.length)
-          return null;
-        reusableToken.reinit(TOKENS[i], i, i);
-        reusableToken.setPositionIncrement(INCREMENTS[i]);
+          return false;
+        termAtt.setTermBuffer(TOKENS[i]);
+        offsetAtt.setStartOffset(i);
+        offsetAtt.setEndOffset(i);
+        posIncrAtt.setPositionIncrement(INCREMENTS[i]);
         i++;
-        return reusableToken;
+        return true;
       }
     };
   }

@@ -196,18 +203,4 @@ public class TestPositionIncrement extends LuceneTestCase {
     StopFilter.setEnablePositionIncrementsDefault(dflt);
   }
 }

-  /**
-   * Basic analyzer behavior should be to keep sequential terms in one
-   * increment from one another.
-   */
-  public void testIncrementingPositions() throws Exception {
-    Analyzer analyzer = new WhitespaceAnalyzer();
-    TokenStream ts = analyzer.tokenStream("field",
-        new StringReader("one two three four five"));
-    final Token reusableToken = new Token();
-    for (Token nextToken = ts.next(reusableToken); nextToken != null; nextToken = ts.next(reusableToken)) {
-      assertEquals(nextToken.term(), 1, nextToken.getPositionIncrement());
-    }
-  }
 }
@@ -26,7 +26,7 @@ import org.apache.lucene.store.RAMDirectory;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;

 import org.apache.lucene.util.LuceneTestCase;
 import java.io.IOException;

@@ -236,23 +236,25 @@ public class TestRangeQuery extends LuceneTestCase {
   private static class SingleCharTokenizer extends Tokenizer {
     char[] buffer = new char[1];
     boolean done;
+    TermAttribute termAtt;

     public SingleCharTokenizer(Reader r) {
       super(r);
+      termAtt = (TermAttribute) addAttribute(TermAttribute.class);
     }

-    public final Token next(final Token reusableToken) throws IOException {
+    public boolean incrementToken() throws IOException {
       int count = input.read(buffer);
       if (done)
-        return null;
+        return false;
       else {
         done = true;
         if (count == 1) {
-          reusableToken.termBuffer()[0] = buffer[0];
-          reusableToken.setTermLength(1);
+          termAtt.termBuffer()[0] = buffer[0];
+          termAtt.setTermLength(1);
         } else
-          reusableToken.setTermLength(0);
-        return reusableToken;
+          termAtt.setTermLength(0);
+        return true;
       }
     }
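SingleCharTokenizer above writes directly into the shared term buffer rather than allocating per-token objects; the same technique works in a filter. A short sketch (not part of this commit; the filter name and its capitalization rule are invented for illustration):

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class UpperCaseFirstCharFilter extends TokenFilter {
  private final TermAttribute termAtt;

  public UpperCaseFirstCharFilter(TokenStream input) {
    super(input);
    termAtt = (TermAttribute) addAttribute(TermAttribute.class);
  }

  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) {
      return false;
    }
    // edit the shared term buffer in place instead of allocating a new Token
    char[] buffer = termAtt.termBuffer();
    if (termAtt.termLength() > 0) {
      buffer[0] = Character.toUpperCase(buffer[0]);
    }
    return true;
  }
}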
@@ -2,6 +2,7 @@ package org.apache.lucene.search.payloads;


 import org.apache.lucene.analysis.*;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.index.Payload;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.store.RAMDirectory;
@@ -41,34 +42,36 @@ public class PayloadHelper {
   public class PayloadFilter extends TokenFilter {
     String fieldName;
     int numSeen = 0;
+    PayloadAttribute payloadAtt;

     public PayloadFilter(TokenStream input, String fieldName) {
       super(input);
       this.fieldName = fieldName;
+      payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
     }

-    public Token next() throws IOException {
-      Token result = input.next();
-      if (result != null) {
+    public boolean incrementToken() throws IOException {
+      if (input.incrementToken()) {
         if (fieldName.equals(FIELD))
         {
-          result.setPayload(new Payload(payloadField));
+          payloadAtt.setPayload(new Payload(payloadField));
         }
         else if (fieldName.equals(MULTI_FIELD))
         {
           if (numSeen % 2 == 0)
           {
-            result.setPayload(new Payload(payloadMultiField1));
+            payloadAtt.setPayload(new Payload(payloadMultiField1));
           }
           else
           {
-            result.setPayload(new Payload(payloadMultiField2));
+            payloadAtt.setPayload(new Payload(payloadMultiField2));
           }
           numSeen++;
         }
+        return true;
       }
-      return result;
+      return false;
     }
   }
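For payload-producing filters the change has the same shape: instead of returning a Token and calling setPayload() on it, the filter advances the wrapped stream and writes into a PayloadAttribute registered up front. A stripped-down sketch of that idea follows; the class name and the constant payload are illustrative only, and the snippet assumes imports for TokenFilter, TokenStream, PayloadAttribute, Payload and IOException.

    // Hedged sketch: a filter that attaches one fixed payload to every token it passes through.
    class ConstantPayloadFilter extends TokenFilter {
      private final PayloadAttribute payloadAtt;
      private final byte[] bytes;

      ConstantPayloadFilter(TokenStream input, byte[] bytes) {
        super(input);
        this.bytes = bytes;
        payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
      }

      public boolean incrementToken() throws IOException {
        if (!input.incrementToken()) {
          return false;                               // wrapped stream is exhausted
        }
        payloadAtt.setPayload(new Payload(bytes));    // decorate the current token in place
        return true;
      }
    }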
@@ -21,9 +21,9 @@ import java.io.Reader;

 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.LowerCaseTokenizer;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.index.IndexWriter;
@@ -66,29 +66,32 @@ public class TestBoostingTermQuery extends LuceneTestCase {
   private class PayloadFilter extends TokenFilter {
     String fieldName;
     int numSeen = 0;

+    PayloadAttribute payloadAtt;

     public PayloadFilter(TokenStream input, String fieldName) {
       super(input);
       this.fieldName = fieldName;
+      payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
     }

-    public Token next(final Token reusableToken) throws IOException {
-      assert reusableToken != null;
-      Token nextToken = input.next(reusableToken);
-      if (nextToken != null) {
+    public boolean incrementToken() throws IOException {
+      boolean hasNext = input.incrementToken();
+      if (hasNext) {
         if (fieldName.equals("field")) {
-          nextToken.setPayload(new Payload(payloadField));
+          payloadAtt.setPayload(new Payload(payloadField));
         } else if (fieldName.equals("multiField")) {
           if (numSeen % 2 == 0) {
-            nextToken.setPayload(new Payload(payloadMultiField1));
+            payloadAtt.setPayload(new Payload(payloadMultiField1));
           } else {
-            nextToken.setPayload(new Payload(payloadMultiField2));
+            payloadAtt.setPayload(new Payload(payloadMultiField2));
           }
           numSeen++;
         }
+        return true;
+      } else {
+        return false;
       }
-      return nextToken;
     }
   }
@@ -27,9 +27,11 @@ import junit.framework.TestCase;

 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.LowerCaseTokenizer;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.index.IndexReader;
@@ -43,8 +45,9 @@ import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.payloads.PayloadHelper;
 import org.apache.lucene.search.payloads.PayloadSpanUtil;
 import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.util.LuceneTestCase;

-public class TestPayloadSpans extends TestCase {
+public class TestPayloadSpans extends LuceneTestCase {
   private final static boolean DEBUG = false;
   private IndexSearcher searcher;
   private Similarity similarity = new DefaultSimilarity();
@@ -54,7 +57,8 @@ public class TestPayloadSpans extends TestCase {
     super(s);
   }

-  protected void setUp() throws IOException {
+  protected void setUp() throws Exception {
+    super.setUp();
     PayloadHelper helper = new PayloadHelper();
     searcher = helper.setUp(similarity, 1000);
     indexReader = searcher.getIndexReader();
@@ -345,6 +349,9 @@ public class TestPayloadSpans extends TestCase {
     Set entities = new HashSet();
     Set nopayload = new HashSet();
     int pos;
+    PayloadAttribute payloadAtt;
+    TermAttribute termAtt;
+    PositionIncrementAttribute posIncrAtt;

     public PayloadFilter(TokenStream input, String fieldName) {
       super(input);
@@ -354,24 +361,26 @@ public class TestPayloadSpans extends TestCase {
       entities.add("one");
       nopayload.add("nopayload");
       nopayload.add("np");
+      termAtt = (TermAttribute) addAttribute(TermAttribute.class);
+      posIncrAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
+      payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
     }

-    public Token next() throws IOException {
-      Token result = input.next();
-      if (result != null) {
-        String token = new String(result.termBuffer(), 0, result.termLength());
+    public boolean incrementToken() throws IOException {
+      if (input.incrementToken()) {
+        String token = new String(termAtt.termBuffer(), 0, termAtt.termLength());

         if (!nopayload.contains(token)) {
           if (entities.contains(token)) {
-            result.setPayload(new Payload((token + ":Entity:"+ pos ).getBytes()));
+            payloadAtt.setPayload(new Payload((token + ":Entity:"+ pos ).getBytes()));
           } else {
-            result.setPayload(new Payload((token + ":Noise:" + pos ).getBytes()));
+            payloadAtt.setPayload(new Payload((token + ":Noise:" + pos ).getBytes()));
           }
         }
-        pos += result.getPositionIncrement();
+        pos += posIncrAtt.getPositionIncrement();
+        return true;
       }
-      return result;
+      return false;
     }
   }
 }
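Because each attribute is an independent object on the shared AttributeSource, one filter or consumer can combine as many attributes as it needs, exactly as the converted PayloadFilter above does with term, position-increment and payload attributes. A small consumer-side sketch of reading several attributes together; the stream is assumed to be constructed elsewhere, and the required tokenattributes imports are as in the hunk above.

    // Hedged sketch: tracking absolute positions while reading terms and payload presence.
    TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
    PositionIncrementAttribute posIncrAtt =
        (PositionIncrementAttribute) stream.addAttribute(PositionIncrementAttribute.class);
    PayloadAttribute payloadAtt = (PayloadAttribute) stream.addAttribute(PayloadAttribute.class);

    int position = -1;
    while (stream.incrementToken()) {
      position += posIncrAtt.getPositionIncrement();
      String term = new String(termAtt.termBuffer(), 0, termAtt.termLength());
      boolean hasPayload = payloadAtt.getPayload() != null;
      System.out.println(position + " " + term + (hasPayload ? " [payload]" : ""));
    }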
@@ -17,6 +17,7 @@ package org.apache.lucene.util;
  * limitations under the License.
  */

+import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.index.ConcurrentMergeScheduler;
 import junit.framework.TestCase;

@@ -42,6 +43,7 @@ public abstract class LuceneTestCase extends TestCase {

   protected void setUp() throws Exception {
     ConcurrentMergeScheduler.setTestMode();
+    TokenStream.setUseNewAPIDefault(true);
   }

   protected void tearDown() throws Exception {
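With LuceneTestCase.setUp() now switching the default to the new API, any test that overrides setUp() has to call super.setUp() — which is why the TestPayloadSpans change above adds that call. A minimal sketch of a conforming test; the class and method names are placeholders, not part of this patch.

    // Hedged sketch: a test that inherits the new-API default from LuceneTestCase.
    public class MyTokenStreamTest extends LuceneTestCase {
      protected void setUp() throws Exception {
        super.setUp();  // sets ConcurrentMergeScheduler test mode and TokenStream.setUseNewAPIDefault(true)
        // test-specific fixtures go here
      }

      public void testTokens() throws Exception {
        // streams built here will be consumed through incrementToken() by default
      }
    }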