mirror of https://github.com/apache/lucene.git
LUCENE-2372: switch over remaining uses of TermAttribute
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@950008 13f79535-47bb-0310-9956-ffa450edef68
parent 7661fe8c04
commit ad0e495911
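
This commit applies one mechanical substitution across the contrib modules touched below: the deprecated TermAttribute is replaced by CharTermAttribute (and, in MemoryIndex, by TermToBytesRefAttribute). Concretely, term() becomes toString(), setTermBuffer(...) becomes setEmpty().append(...) or copyBuffer(...), and termBuffer()/termLength() become buffer()/length(). A minimal sketch of the consuming pattern on this API, with an illustrative analyzer and field name that are not taken from this commit:

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

class CharTermMigrationSketch {
  // Consume a TokenStream with the new attribute; "body" is an illustrative field name.
  static void printTerms(Analyzer analyzer, String text) throws IOException {
    TokenStream ts = analyzer.tokenStream("body", new StringReader(text));
    // Before: TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      // Before: String term = termAtt.term();
      String term = termAtt.toString();          // copies the term chars into a String
      // Allocation-free alternative: read termAtt.buffer() up to termAtt.length()
      System.out.println(term);
    }
    ts.close();
  }
}

The producing side mirrors this: after clearAttributes(), a TokenStream now fills the attribute with setEmpty().append(...) or copyBuffer(...) instead of setTermBuffer(...), as the hunks below show.
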
@@ -26,9 +26,10 @@ import java.util.List;
 import java.util.Locale;
 
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.MockAnalyzer;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.benchmark.BenchmarkTestCase;
 import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
 import org.apache.lucene.benchmark.byTask.feeds.ReutersQueryMaker;
@@ -918,11 +919,11 @@ public class TestPerfTasksLogic extends BenchmarkTestCase {
 TokenStream ts2 = a2.tokenStream("bogus", new StringReader(text));
 ts1.reset();
 ts2.reset();
-TermAttribute termAtt1 = ts1.addAttribute(TermAttribute.class);
-TermAttribute termAtt2 = ts2.addAttribute(TermAttribute.class);
+CharTermAttribute termAtt1 = ts1.addAttribute(CharTermAttribute.class);
+CharTermAttribute termAtt2 = ts2.addAttribute(CharTermAttribute.class);
 assertTrue(ts1.incrementToken());
 assertTrue(ts2.incrementToken());
-assertEquals(termAtt1.term(), termAtt2.term());
+assertEquals(termAtt1.toString(), termAtt2.toString());
 assertFalse(ts1.incrementToken());
 assertFalse(ts2.incrementToken());
 ts1.close();
@@ -994,21 +995,7 @@ public class TestPerfTasksLogic extends BenchmarkTestCase {
 
 private void assertEqualShingle
 (Analyzer analyzer, String text, String[] expected) throws Exception {
-TokenStream stream = analyzer.tokenStream("bogus", new StringReader(text));
-stream.reset();
-TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
-int termNum = 0;
-while (stream.incrementToken()) {
-assertTrue("Extra output term(s), starting with '"
-+ new String(termAtt.termBuffer(), 0, termAtt.termLength()) + "'",
-termNum < expected.length);
-assertEquals("Mismatch in output term # " + termNum + " - ",
-expected[termNum],
-new String(termAtt.termBuffer(), 0, termAtt.termLength()));
-++termNum;
-}
-assertEquals("Too few output terms", expected.length, termNum);
-stream.close();
+BaseTokenStreamTestCase.assertAnalyzesTo(analyzer, text, expected);
 }
 
 private String[] getShingleConfig(String params) {

@@ -23,9 +23,9 @@ import java.util.Iterator;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.util.PriorityQueue;
 
 /**
@@ -191,7 +191,7 @@ public class Highlighter
 ArrayList<TextFragment> docFrags = new ArrayList<TextFragment>();
 StringBuilder newText=new StringBuilder();
 
-TermAttribute termAtt = tokenStream.addAttribute(TermAttribute.class);
+CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
 OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
 tokenStream.addAttribute(PositionIncrementAttribute.class);
 tokenStream.reset();
@@ -225,7 +225,7 @@ public class Highlighter
 (offsetAtt.startOffset()>text.length())
 )
 {
-throw new InvalidTokenOffsetsException("Token "+ termAtt.term()
+throw new InvalidTokenOffsetsException("Token "+ termAtt.toString()
 +" exceeds length of provided text sized "+text.length());
 }
 if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct()))

@@ -25,8 +25,8 @@ import java.util.Set;
 
 import org.apache.lucene.analysis.CachingTokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.memory.MemoryIndex;
 import org.apache.lucene.search.Query;
@@ -46,7 +46,7 @@ public class QueryScorer implements Scorer {
 private float maxTermWeight;
 private int position = -1;
 private String defaultField;
-private TermAttribute termAtt;
+private CharTermAttribute termAtt;
 private PositionIncrementAttribute posIncAtt;
 private boolean expandMultiTermQuery = true;
 private Query query;
@@ -145,7 +145,7 @@ public class QueryScorer implements Scorer {
 */
 public float getTokenScore() {
 position += posIncAtt.getPositionIncrement();
-String termText = termAtt.term();
+String termText = termAtt.toString();
 
 WeightedSpanTerm weightedSpanTerm;
 
@@ -175,7 +175,7 @@ public class QueryScorer implements Scorer {
 */
 public TokenStream init(TokenStream tokenStream) throws IOException {
 position = -1;
-termAtt = tokenStream.addAttribute(TermAttribute.class);
+termAtt = tokenStream.addAttribute(CharTermAttribute.class);
 posIncAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
 if(!skipInitExtractor) {
 if(fieldWeightedSpanTerms != null) {

@@ -21,7 +21,7 @@ import java.util.HashMap;
 import java.util.HashSet;
 
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.search.Query;
 
@@ -41,7 +41,7 @@ public class QueryTermScorer implements Scorer {
 float maxTermWeight = 0;
 private HashMap<String,WeightedTerm> termsToFind;
 
-private TermAttribute termAtt;
+private CharTermAttribute termAtt;
 
 /**
 *
@@ -95,7 +95,7 @@ public class QueryTermScorer implements Scorer {
 * @see org.apache.lucene.search.highlight.Scorer#init(org.apache.lucene.analysis.TokenStream)
 */
 public TokenStream init(TokenStream tokenStream) {
-termAtt = tokenStream.addAttribute(TermAttribute.class);
+termAtt = tokenStream.addAttribute(CharTermAttribute.class);
 return null;
 }
 
@@ -118,7 +118,7 @@ public class QueryTermScorer implements Scorer {
 * @see org.apache.lucene.search.highlight.Scorer#getTokenScore()
 */
 public float getTokenScore() {
-String termText = termAtt.term();
+String termText = termAtt.toString();
 
 WeightedTerm queryTerm = termsToFind.get(termText);
 if (queryTerm == null) {

@@ -20,9 +20,9 @@ package org.apache.lucene.search.highlight;
 import java.util.List;
 
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.search.spans.Spans;
 
 
@@ -38,7 +38,7 @@ public class SimpleSpanFragmenter implements Fragmenter {
 private QueryScorer queryScorer;
 private int waitForPos = -1;
 private int textSize;
-private TermAttribute termAtt;
+private CharTermAttribute termAtt;
 private PositionIncrementAttribute posIncAtt;
 private OffsetAttribute offsetAtt;
 
@@ -70,7 +70,7 @@ public class SimpleSpanFragmenter implements Fragmenter {
 return false;
 }
 
-WeightedSpanTerm wSpanTerm = queryScorer.getWeightedSpanTerm(termAtt.term());
+WeightedSpanTerm wSpanTerm = queryScorer.getWeightedSpanTerm(termAtt.toString());
 
 if (wSpanTerm != null) {
 List<PositionSpan> positionSpans = wSpanTerm.getPositionSpans();
@@ -101,7 +101,7 @@ public class SimpleSpanFragmenter implements Fragmenter {
 position = -1;
 currentNumFrags = 1;
 textSize = originalText.length();
-termAtt = tokenStream.addAttribute(TermAttribute.class);
+termAtt = tokenStream.addAttribute(CharTermAttribute.class);
 posIncAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
 offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
 }

@@ -19,8 +19,8 @@ package org.apache.lucene.search.highlight;
 
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
 /**
 * One, or several overlapping tokens, along with the score(s) and the scope of
@@ -38,11 +38,11 @@ public class TokenGroup {
 int matchStartOffset, matchEndOffset;
 
 private OffsetAttribute offsetAtt;
-private TermAttribute termAtt;
+private CharTermAttribute termAtt;
 
 public TokenGroup(TokenStream tokenStream) {
 offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
-termAtt = tokenStream.addAttribute(TermAttribute.class);
+termAtt = tokenStream.addAttribute(CharTermAttribute.class);
 }
 
 void addToken(float score) {
@@ -68,7 +68,7 @@ public class TokenGroup {
 }
 }
 Token token = new Token(termStartOffset, termEndOffset);
-token.setTermBuffer(termAtt.term());
+token.setEmpty().append(termAtt);
 tokens[numTokens] = token;
 scores[numTokens] = score;
 numTokens++;

@@ -29,8 +29,8 @@ import java.util.Comparator;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.TermFreqVector;
@@ -153,13 +153,13 @@ public class TokenSources {
 
 int currentToken = 0;
 
-TermAttribute termAtt;
+CharTermAttribute termAtt;
 
 OffsetAttribute offsetAtt;
 
 StoredTokenStream(Token tokens[]) {
 this.tokens = tokens;
-termAtt = addAttribute(TermAttribute.class);
+termAtt = addAttribute(CharTermAttribute.class);
 offsetAtt = addAttribute(OffsetAttribute.class);
 }
 
@@ -170,7 +170,7 @@ public class TokenSources {
 }
 Token token = tokens[currentToken++];
 clearAttributes();
-termAtt.setTermBuffer(token.term());
+termAtt.setEmpty().append(token);
 offsetAtt.setOffset(token.startOffset(), token.endOffset());
 return true;
 }
@@ -204,9 +204,8 @@ public class TokenSources {
 unsortedTokens = new ArrayList<Token>();
 }
 for (int tp = 0; tp < offsets.length; tp++) {
-Token token = new Token(offsets[tp].getStartOffset(), offsets[tp]
+Token token = new Token(terms[t], offsets[tp].getStartOffset(), offsets[tp]
 .getEndOffset());
-token.setTermBuffer(terms[t]);
 unsortedTokens.add(token);
 }
 } else {

@@ -25,9 +25,9 @@ import java.util.List;
 
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.index.TermPositionVector;
 import org.apache.lucene.index.TermVectorOffsetInfo;
 
@@ -37,7 +37,7 @@ public final class TokenStreamFromTermPositionVector extends TokenStream {
 
 private Iterator<Token> tokensAtCurrentPosition;
 
-private TermAttribute termAttribute;
+private CharTermAttribute termAttribute;
 
 private PositionIncrementAttribute positionIncrementAttribute;
 
@@ -51,7 +51,7 @@ public final class TokenStreamFromTermPositionVector extends TokenStream {
 */
 public TokenStreamFromTermPositionVector(
 final TermPositionVector termPositionVector) {
-termAttribute = addAttribute(TermAttribute.class);
+termAttribute = addAttribute(CharTermAttribute.class);
 positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
 offsetAttribute = addAttribute(OffsetAttribute.class);
 final String[] terms = termPositionVector.getTerms();
@@ -65,7 +65,7 @@ public final class TokenStreamFromTermPositionVector extends TokenStream {
 offsets[j].getStartOffset(), offsets[j].getEndOffset());
 } else {
 token = new Token();
-token.setTermBuffer(terms[i]);
+token.setEmpty().append(terms[i]);
 }
 // Yes - this is the position, not the increment! This is for
 // sorting. This value
@@ -100,7 +100,7 @@ public final class TokenStreamFromTermPositionVector extends TokenStream {
 if (this.tokensAtCurrentPosition.hasNext()) {
 final Token next = this.tokensAtCurrentPosition.next();
 clearAttributes();
-termAttribute.setTermBuffer(next.term());
+termAttribute.setEmpty().append(next);
 positionIncrementAttribute.setPositionIncrement(next
 .getPositionIncrement());
 offsetAttribute.setOffset(next.startOffset(), next.endOffset());

@@ -25,7 +25,7 @@ import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.Field.Index;
@@ -296,16 +296,11 @@ public class HighlighterPhraseTest extends LuceneTestCase {
 
 private int i = -1;
 
-private TermAttribute termAttribute;
-
-private OffsetAttribute offsetAttribute;
-
-private PositionIncrementAttribute positionIncrementAttribute;
+private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
+private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
+private final PositionIncrementAttribute positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
 
 public TokenStreamSparse() {
-termAttribute = addAttribute(TermAttribute.class);
-offsetAttribute = addAttribute(OffsetAttribute.class);
-positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
 reset();
 }
 
@@ -316,8 +311,7 @@ public class HighlighterPhraseTest extends LuceneTestCase {
 return false;
 }
 clearAttributes();
-termAttribute.setTermBuffer(this.tokens[i].term(), 0, this.tokens[i]
-.term().length());
+termAttribute.setEmpty().append(this.tokens[i]);
 offsetAttribute.setOffset(this.tokens[i].startOffset(), this.tokens[i]
 .endOffset());
 positionIncrementAttribute.setPositionIncrement(this.tokens[i]
@@ -342,16 +336,11 @@ public class HighlighterPhraseTest extends LuceneTestCase {
 
 private int i = -1;
 
-private TermAttribute termAttribute;
-
-private OffsetAttribute offsetAttribute;
-
-private PositionIncrementAttribute positionIncrementAttribute;
+private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
+private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
+private final PositionIncrementAttribute positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
 
 public TokenStreamConcurrent() {
-termAttribute = addAttribute(TermAttribute.class);
-offsetAttribute = addAttribute(OffsetAttribute.class);
-positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
 reset();
 }
 
@@ -362,8 +351,7 @@ public class HighlighterPhraseTest extends LuceneTestCase {
 return false;
 }
 clearAttributes();
-termAttribute.setTermBuffer(this.tokens[i].term(), 0, this.tokens[i]
-.term().length());
+termAttribute.setEmpty().append(this.tokens[i]);
 offsetAttribute.setOffset(this.tokens[i].startOffset(), this.tokens[i]
 .endOffset());
 positionIncrementAttribute.setPositionIncrement(this.tokens[i]

@@ -41,7 +41,7 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.NumericField;
@@ -1424,13 +1424,10 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
 return new TokenStream() {
 Iterator<Token> iter;
 List<Token> lst;
-private TermAttribute termAtt;
-private PositionIncrementAttribute posIncrAtt;
-private OffsetAttribute offsetAtt;
+private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
 {
-termAtt = addAttribute(TermAttribute.class);
-posIncrAtt = addAttribute(PositionIncrementAttribute.class);
-offsetAtt = addAttribute(OffsetAttribute.class);
 lst = new ArrayList<Token>();
 Token t;
 t = createToken("hi", 0, 2);
@@ -1456,7 +1453,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
 if(iter.hasNext()) {
 Token token = iter.next();
 clearAttributes();
-termAtt.setTermBuffer(token.term());
+termAtt.setEmpty().append(token);
 posIncrAtt.setPositionIncrement(token.getPositionIncrement());
 offsetAtt.setOffset(token.startOffset(), token.endOffset());
 return true;
@@ -1473,13 +1470,10 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
 return new TokenStream() {
 Iterator<Token> iter;
 List<Token> lst;
-private TermAttribute termAtt;
-private PositionIncrementAttribute posIncrAtt;
-private OffsetAttribute offsetAtt;
+private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
 {
-termAtt = addAttribute(TermAttribute.class);
-posIncrAtt = addAttribute(PositionIncrementAttribute.class);
-offsetAtt = addAttribute(OffsetAttribute.class);
 lst = new ArrayList<Token>();
 Token t;
 t = createToken("hispeed", 0, 8);
@@ -1505,7 +1499,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
 if(iter.hasNext()) {
 Token token = iter.next();
 clearAttributes();
-termAtt.setTermBuffer(token.term());
+termAtt.setEmpty().append(token);
 posIncrAtt.setPositionIncrement(token.getPositionIncrement());
 offsetAtt.setOffset(token.startOffset(), token.endOffset());
 return true;
@@ -1762,9 +1756,7 @@ public class HighlighterTest extends BaseTokenStreamTestCase implements Formatte
 
 private static Token createToken(String term, int start, int offset)
 {
-Token token = new Token(start, offset);
-token.setTermBuffer(term);
-return token;
+return new Token(term, start, offset);
 }
 
 }
@@ -1795,7 +1787,7 @@ final class SynonymAnalyzer extends Analyzer {
 @Override
 public TokenStream tokenStream(String arg0, Reader arg1) {
 Tokenizer stream = new MockTokenizer(arg1, MockTokenizer.SIMPLE, true);
-stream.addAttribute(TermAttribute.class);
+stream.addAttribute(CharTermAttribute.class);
 stream.addAttribute(PositionIncrementAttribute.class);
 stream.addAttribute(OffsetAttribute.class);
 return new SynonymTokenizer(stream, synonyms);
@@ -1811,21 +1803,21 @@ final class SynonymTokenizer extends TokenStream {
 private Token currentRealToken = null;
 private Map<String,String> synonyms;
 StringTokenizer st = null;
-private TermAttribute realTermAtt;
+private CharTermAttribute realTermAtt;
 private PositionIncrementAttribute realPosIncrAtt;
 private OffsetAttribute realOffsetAtt;
-private TermAttribute termAtt;
+private CharTermAttribute termAtt;
 private PositionIncrementAttribute posIncrAtt;
 private OffsetAttribute offsetAtt;
 
 public SynonymTokenizer(TokenStream realStream, Map<String,String> synonyms) {
 this.realStream = realStream;
 this.synonyms = synonyms;
-realTermAtt = realStream.addAttribute(TermAttribute.class);
+realTermAtt = realStream.addAttribute(CharTermAttribute.class);
 realPosIncrAtt = realStream.addAttribute(PositionIncrementAttribute.class);
 realOffsetAtt = realStream.addAttribute(OffsetAttribute.class);
 
-termAtt = addAttribute(TermAttribute.class);
+termAtt = addAttribute(CharTermAttribute.class);
 posIncrAtt = addAttribute(PositionIncrementAttribute.class);
 offsetAtt = addAttribute(OffsetAttribute.class);
 }
@@ -1840,25 +1832,25 @@ final class SynonymTokenizer extends TokenStream {
 }
 //Token nextRealToken = new Token(, offsetAtt.startOffset(), offsetAtt.endOffset());
 clearAttributes();
-termAtt.setTermBuffer(realTermAtt.term());
+termAtt.copyBuffer(realTermAtt.buffer(), 0, realTermAtt.length());
 offsetAtt.setOffset(realOffsetAtt.startOffset(), realOffsetAtt.endOffset());
 posIncrAtt.setPositionIncrement(realPosIncrAtt.getPositionIncrement());
 
-String expansions = synonyms.get(realTermAtt.term());
+String expansions = synonyms.get(realTermAtt.toString());
 if (expansions == null) {
 return true;
 }
 st = new StringTokenizer(expansions, ",");
 if (st.hasMoreTokens()) {
 currentRealToken = new Token(realOffsetAtt.startOffset(), realOffsetAtt.endOffset());
-currentRealToken.setTermBuffer(realTermAtt.term());
+currentRealToken.copyBuffer(realTermAtt.buffer(), 0, realTermAtt.length());
 }
 
 return true;
 } else {
 String tok = st.nextToken();
 clearAttributes();
-termAtt.setTermBuffer(tok);
+termAtt.setEmpty().append(tok);
 offsetAtt.setOffset(currentRealToken.startOffset(), currentRealToken.endOffset());
 posIncrAtt.setPositionIncrement(0);
 if (!st.hasMoreTokens()) {

@@ -26,8 +26,8 @@ import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.document.Field.Index;
@@ -221,14 +221,14 @@ public abstract class AbstractTestCase extends LuceneTestCase {
 ch = 0;
 }
 
-TermAttribute termAtt = addAttribute(TermAttribute.class);
+CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
 OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
 @Override
 public boolean incrementToken() throws IOException {
 if( !getNextPartialSnippet() )
 return false;
 clearAttributes();
-termAtt.setTermBuffer(snippet, startTerm, lenTerm);
+termAtt.setEmpty().append(snippet, startTerm, startTerm + lenTerm);
 offsetAtt.setOffset(correctOffset(startOffset), correctOffset(startOffset + lenTerm));
 return true;
 }

@@ -25,7 +25,7 @@ import java.util.Set;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.BooleanClause.Occur;
 import org.apache.lucene.util.AttributeImpl;
@@ -301,7 +301,7 @@ public class IndexTimeSynonymTest extends AbstractTestCase {
 @Override
 public TokenStream tokenStream(String fieldName, Reader reader) {
 TokenStream ts = new TokenStream(Token.TOKEN_ATTRIBUTE_FACTORY) {
-final AttributeImpl reusableToken = (AttributeImpl) addAttribute(TermAttribute.class);
+final AttributeImpl reusableToken = (AttributeImpl) addAttribute(CharTermAttribute.class);
 int p = 0;
 
 @Override

@@ -561,7 +561,7 @@ public class InstantiatedIndexWriter implements Closeable {
 // untokenized
 String fieldVal = field.stringValue();
 Token token = new Token(0, fieldVal.length(), "untokenized");
-token.setTermBuffer(fieldVal);
+token.setEmpty().append(fieldVal);
 tokens.add(token);
 fieldSetting.fieldLength++;
 }
@@ -596,10 +596,10 @@ public class InstantiatedIndexWriter implements Closeable {
 
 for (Token token : eField_Tokens.getValue()) {
 
-TermDocumentInformationFactory termDocumentInformationFactory = termDocumentInformationFactoryByTermText.get(token.term());
+TermDocumentInformationFactory termDocumentInformationFactory = termDocumentInformationFactoryByTermText.get(token.toString());
 if (termDocumentInformationFactory == null) {
 termDocumentInformationFactory = new TermDocumentInformationFactory();
-termDocumentInformationFactoryByTermText.put(token.term(), termDocumentInformationFactory);
+termDocumentInformationFactoryByTermText.put(token.toString(), termDocumentInformationFactory);
 }
 //termDocumentInformationFactory.termFrequency++;
 

@@ -25,7 +25,7 @@ import java.util.List;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.MockAnalyzer;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.index.IndexReader;
@@ -278,7 +278,7 @@ public class TestIndicesEquals extends LuceneTestCase {
 tokens.add(t);
 tokens.add(createToken("fin", 7, 9));
 TokenStream ts = new TokenStream(Token.TOKEN_ATTRIBUTE_FACTORY) {
-final AttributeImpl reusableToken = (AttributeImpl) addAttribute(TermAttribute.class);
+final AttributeImpl reusableToken = (AttributeImpl) addAttribute(CharTermAttribute.class);
 Iterator<Token> it = tokens.iterator();
 
 @Override
@@ -601,16 +601,12 @@ public class TestIndicesEquals extends LuceneTestCase {
 
 private static Token createToken(String term, int start, int offset)
 {
-Token token = new Token(start, offset);
-token.setTermBuffer(term);
-return token;
+return new Token(term, start, offset);
 }
 
 private static Token createToken(String term, int start, int offset, String type)
 {
-Token token = new Token(start, offset, type);
-token.setTermBuffer(term);
-return token;
+return new Token(term, start, offset, type);
 }
 
 

@@ -36,8 +36,8 @@ import jline.ConsoleReader;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Fieldable;
 import org.apache.lucene.index.IndexReader;
@@ -303,14 +303,14 @@ class LuceneMethods {
 int position = 0;
 // Tokenize field and add to postingTable
 TokenStream stream = analyzer.tokenStream(fieldName, reader);
-TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
+CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
 PositionIncrementAttribute posIncrAtt = stream.addAttribute(PositionIncrementAttribute.class);
 
 try {
 while (stream.incrementToken()) {
 position += (posIncrAtt.getPositionIncrement() - 1);
 position++;
-String name = termAtt.term();
+String name = termAtt.toString();
 Integer Count = tokenMap.get(name);
 if (Count == null) { // not in there yet
 tokenMap.put(name, Integer.valueOf(1)); //first one

@@ -30,9 +30,10 @@ import java.util.Map;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.FieldSelector;
 import org.apache.lucene.index.IndexReader;
@@ -51,6 +52,7 @@ import org.apache.lucene.search.Searcher;
 import org.apache.lucene.search.Scorer;
 import org.apache.lucene.search.Similarity;
 import org.apache.lucene.store.RAMDirectory; // for javadocs
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.Constants; // for javadocs
 
 /**
@@ -276,8 +278,8 @@ public class MemoryIndex implements Serializable {
 return new TokenStream() {
 private Iterator<T> iter = keywords.iterator();
 private int start = 0;
-private TermAttribute termAtt = addAttribute(TermAttribute.class);
-private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
 
 @Override
 public boolean incrementToken() {
@@ -289,8 +291,8 @@ public class MemoryIndex implements Serializable {
 
 String term = obj.toString();
 clearAttributes();
-termAtt.setTermBuffer(term);
-offsetAtt.setOffset(start, start+termAtt.termLength());
+termAtt.setEmpty().append(term);
+offsetAtt.setOffset(start, start+termAtt.length());
 start += term.length() + 1; // separate words by 1 (blank) character
 return true;
 }
@@ -340,13 +342,15 @@ public class MemoryIndex implements Serializable {
 int numOverlapTokens = 0;
 int pos = -1;
 
-TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
+TermToBytesRefAttribute termAtt = stream.addAttribute(TermToBytesRefAttribute.class);
 PositionIncrementAttribute posIncrAttribute = stream.addAttribute(PositionIncrementAttribute.class);
 OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
+BytesRef ref = new BytesRef(10);
 stream.reset();
 while (stream.incrementToken()) {
-String term = termAtt.term();
+termAtt.toBytesRef(ref);
+// TODO: support non-UTF8 strings (like numerics) here
+String term = ref.utf8ToString();
 if (term.length() == 0) continue; // nothing to do
 // if (DEBUG) System.err.println("token='" + term + "'");
 numTokens++;

@@ -26,7 +26,7 @@ import java.util.Iterator;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.util.BytesRef;
@@ -185,14 +185,14 @@ public class FuzzyLikeThisQuery extends Query
 {
 if(f.queryString==null) return;
 TokenStream ts=analyzer.tokenStream(f.fieldName,new StringReader(f.queryString));
-TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
+CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
 
 int corpusNumDocs=reader.numDocs();
 Term internSavingTemplateTerm =new Term(f.fieldName); //optimization to avoid constructing new Term() objects
 HashSet<String> processedTerms=new HashSet<String>();
 while (ts.incrementToken())
 {
-String term = termAtt.term();
+String term = termAtt.toString();
 if(!processedTerms.contains(term))
 {
 processedTerms.add(term);

@@ -32,7 +32,7 @@ import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Term;
@@ -884,10 +884,10 @@ public final class MoreLikeThis {
 TokenStream ts = analyzer.tokenStream(fieldName, r);
 int tokenCount=0;
 // for every token
-TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
+CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
 
 while (ts.incrementToken()) {
-String word = termAtt.term();
+String word = termAtt.toString();
 tokenCount++;
 if(tokenCount>maxNumTokensParsed)
 {

@@ -22,7 +22,7 @@ import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.BooleanQuery;
@@ -86,12 +86,12 @@ public final class SimilarityQueries
 throws IOException
 {
 TokenStream ts = a.tokenStream( field, new StringReader( body));
-TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
+CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
 
 BooleanQuery tmp = new BooleanQuery();
 Set<String> already = new HashSet<String>(); // ignore dups
 while (ts.incrementToken()) {
-String word = termAtt.term();
+String word = termAtt.toString();
 // ignore opt stop words
 if ( stop != null &&
 stop.contains( word)) continue;

@@ -24,7 +24,7 @@ import java.util.List;

 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.queryParser.ParseException;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.util.Version;
@@ -107,7 +107,7 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar

     // get Analyzer from superclass and tokenize the term
     TokenStream source = getAnalyzer().tokenStream(field, new StringReader(termStr));
-    TermAttribute termAtt = source.addAttribute(TermAttribute.class);
+    CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);

     int countTokens = 0;
     while (true) {
@@ -116,7 +116,7 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar
       } catch (IOException e) {
         break;
       }
-      String term = termAtt.term();
+      String term = termAtt.toString();
       if (!"".equals(term)) {
         try {
           tlist.set(countTokens++, term);
@@ -190,7 +190,7 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar
     // get Analyzer from superclass and tokenize the term
     TokenStream source = getAnalyzer().tokenStream(field, new StringReader(termStr));
     List<String> tlist = new ArrayList<String>();
-    TermAttribute termAtt = source.addAttribute(TermAttribute.class);
+    CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);

     while (true) {
       try {
@@ -198,7 +198,7 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar
       } catch (IOException e) {
         break;
       }
-      tlist.add(termAtt.term());
+      tlist.add(termAtt.toString());
     }

     try {
@@ -237,13 +237,13 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar
       throws ParseException {
     // get Analyzer from superclass and tokenize the term
     TokenStream source = getAnalyzer().tokenStream(field, new StringReader(termStr));
-    TermAttribute termAtt = source.addAttribute(TermAttribute.class);
+    CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
     String nextToken = null;
     boolean multipleTokens = false;

     try {
       if (source.incrementToken()) {
-        nextToken = termAtt.term();
+        nextToken = termAtt.toString();
       }
       multipleTokens = source.incrementToken();
     } catch (IOException e) {
@@ -273,13 +273,13 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar
       throws ParseException {
     // get Analyzer from superclass and tokenize the terms
     TokenStream source = getAnalyzer().tokenStream(field, new StringReader(part1));
-    TermAttribute termAtt = source.addAttribute(TermAttribute.class);
+    CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
     boolean multipleTokens = false;

     // part1
     try {
       if (source.incrementToken()) {
-        part1 = termAtt.term();
+        part1 = termAtt.toString();
       }
       multipleTokens = source.incrementToken();
     } catch (IOException e) {
@@ -297,11 +297,11 @@ public class AnalyzingQueryParser extends org.apache.lucene.queryParser.QueryPar

     // part2
     source = getAnalyzer().tokenStream(field, new StringReader(part2));
-    termAtt = source.addAttribute(TermAttribute.class);
+    termAtt = source.addAttribute(CharTermAttribute.class);

     try {
       if (source.incrementToken()) {
-        part2 = termAtt.term();
+        part2 = termAtt.toString();
       }
       multipleTokens = source.incrementToken();
     } catch (IOException e) {

@@ -307,7 +307,7 @@ public class PrecedenceQueryParser implements PrecedenceQueryParserConstants {
     List<AttributeSource.State> list = new ArrayList<AttributeSource.State>();
     int positionCount = 0;
     boolean severalTokensAtSamePosition = false;
-    TermAttribute termAtt = source.addAttribute(TermAttribute.class);
+    CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
     PositionIncrementAttribute posincrAtt = source.addAttribute(PositionIncrementAttribute.class);

     try {
@@ -328,7 +328,7 @@ public class PrecedenceQueryParser implements PrecedenceQueryParserConstants {
       return null;
     else if (list.size() == 1) {
       source.restoreState(list.get(0));
-      return new TermQuery(new Term(field, termAtt.term()));
+      return new TermQuery(new Term(field, termAtt.toString()));
     } else {
       if (severalTokensAtSamePosition) {
         if (positionCount == 1) {
@@ -337,7 +337,7 @@ public class PrecedenceQueryParser implements PrecedenceQueryParserConstants {
           for (int i = 0; i < list.size(); i++) {
             source.restoreState(list.get(i));
             TermQuery currentQuery = new TermQuery(
-                new Term(field, termAtt.term()));
+                new Term(field, termAtt.toString()));
             q.add(currentQuery, BooleanClause.Occur.SHOULD);
           }
           return q;
@@ -352,7 +352,7 @@ public class PrecedenceQueryParser implements PrecedenceQueryParserConstants {
               mpq.add(multiTerms.toArray(new Term[0]));
               multiTerms.clear();
             }
-            multiTerms.add(new Term(field, termAtt.term()));
+            multiTerms.add(new Term(field, termAtt.toString()));
           }
           mpq.add(multiTerms.toArray(new Term[0]));
           return mpq;
@@ -363,7 +363,7 @@ public class PrecedenceQueryParser implements PrecedenceQueryParserConstants {
         q.setSlop(phraseSlop);
         for (int i = 0; i < list.size(); i++) {
           source.restoreState(list.get(i));
-          q.add(new Term(field, termAtt.term()));
+          q.add(new Term(field, termAtt.toString()));
         }
         return q;
       }

@@ -331,7 +331,7 @@ public class PrecedenceQueryParser {
     List<AttributeSource.State> list = new ArrayList<AttributeSource.State>();
     int positionCount = 0;
     boolean severalTokensAtSamePosition = false;
-    TermAttribute termAtt = source.addAttribute(TermAttribute.class);
+    CharTermAttribute termAtt = source.addAttribute(CharTermAttribute.class);
     PositionIncrementAttribute posincrAtt = source.addAttribute(PositionIncrementAttribute.class);

     try {
@@ -352,7 +352,7 @@ public class PrecedenceQueryParser {
       return null;
     else if (list.size() == 1) {
       source.restoreState(list.get(0));
-      return new TermQuery(new Term(field, termAtt.term()));
+      return new TermQuery(new Term(field, termAtt.toString()));
     } else {
       if (severalTokensAtSamePosition) {
         if (positionCount == 1) {
@@ -361,7 +361,7 @@ public class PrecedenceQueryParser {
           for (int i = 0; i < list.size(); i++) {
             source.restoreState(list.get(i));
             TermQuery currentQuery = new TermQuery(
-                new Term(field, termAtt.term()));
+                new Term(field, termAtt.toString()));
             q.add(currentQuery, BooleanClause.Occur.SHOULD);
           }
           return q;
@@ -376,7 +376,7 @@ public class PrecedenceQueryParser {
               mpq.add(multiTerms.toArray(new Term[0]));
               multiTerms.clear();
             }
-            multiTerms.add(new Term(field, termAtt.term()));
+            multiTerms.add(new Term(field, termAtt.toString()));
           }
           mpq.add(multiTerms.toArray(new Term[0]));
           return mpq;
@@ -387,7 +387,7 @@ public class PrecedenceQueryParser {
         q.setSlop(phraseSlop);
         for (int i = 0; i < list.size(); i++) {
           source.restoreState(list.get(i));
-          q.add(new Term(field, termAtt.term()));
+          q.add(new Term(field, termAtt.toString()));
         }
         return q;
       }

@@ -26,8 +26,8 @@ import java.util.List;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.CachingTokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.queryParser.core.QueryNodeException;
 import org.apache.lucene.queryParser.core.config.QueryConfigHandler;
 import org.apache.lucene.queryParser.core.nodes.FieldQueryNode;
@@ -162,11 +162,11 @@ public class AnalyzerQueryNodeProcessor extends QueryNodeProcessorImpl {
       // ignore
     }

-    if (!buffer.hasAttribute(TermAttribute.class)) {
+    if (!buffer.hasAttribute(CharTermAttribute.class)) {
       return new NoTokenFoundQueryNode();
     }

-    TermAttribute termAtt = buffer.getAttribute(TermAttribute.class);
+    CharTermAttribute termAtt = buffer.getAttribute(CharTermAttribute.class);

     if (numTokens == 0) {
       return new NoTokenFoundQueryNode();
@@ -177,7 +177,7 @@ public class AnalyzerQueryNodeProcessor extends QueryNodeProcessorImpl {
         boolean hasNext;
         hasNext = buffer.incrementToken();
         assert hasNext == true;
-        term = termAtt.term();
+        term = termAtt.toString();

       } catch (IOException e) {
         // safe to ignore, because we know the number of tokens
@@ -197,7 +197,7 @@ public class AnalyzerQueryNodeProcessor extends QueryNodeProcessorImpl {
         try {
           boolean hasNext = buffer.incrementToken();
           assert hasNext == true;
-          term = termAtt.term();
+          term = termAtt.toString();

         } catch (IOException e) {
           // safe to ignore, because we know the number of tokens
@@ -224,7 +224,7 @@ public class AnalyzerQueryNodeProcessor extends QueryNodeProcessorImpl {
         try {
           boolean hasNext = buffer.incrementToken();
           assert hasNext == true;
-          term = termAtt.term();
+          term = termAtt.toString();
           if (posIncrAtt != null) {
             positionIncrement = posIncrAtt.getPositionIncrement();
           }
@@ -290,7 +290,7 @@ public class AnalyzerQueryNodeProcessor extends QueryNodeProcessorImpl {
         try {
           boolean hasNext = buffer.incrementToken();
           assert hasNext == true;
-          term = termAtt.term();
+          term = termAtt.toString();

           if (posIncrAtt != null) {
             positionIncrement = posIncrAtt.getPositionIncrement();

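AnalyzerQueryNodeProcessor checks hasAttribute before calling getAttribute, because getAttribute throws when the requested attribute was never registered on the stream. A hedged sketch of that defensive idiom (AttributeProbe and firstTermOrNull are hypothetical helpers, not from the commit):

import java.io.IOException;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

final class AttributeProbe {
  // Returns the stream's first term, or null if the stream either
  // carries no character terms or is already exhausted.
  static String firstTermOrNull(TokenStream buffer) throws IOException {
    if (!buffer.hasAttribute(CharTermAttribute.class)) {
      return null; // getAttribute would throw here
    }
    CharTermAttribute termAtt = buffer.getAttribute(CharTermAttribute.class);
    return buffer.incrementToken() ? termAtt.toString() : null;
  }
}
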
@@ -23,8 +23,8 @@ import org.apache.lucene.analysis.MockTokenFilter;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.document.DateTools;
 import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.FuzzyQuery;
@@ -68,7 +68,7 @@ public class TestPrecedenceQueryParser extends LocalizedTestCase {
     boolean inPhrase = false;
     int savedStart = 0, savedEnd = 0;

-    TermAttribute termAtt = addAttribute(TermAttribute.class);
+    CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
     OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

     @Override
@@ -76,19 +76,19 @@ public class TestPrecedenceQueryParser extends LocalizedTestCase {
       clearAttributes();
       if (inPhrase) {
         inPhrase = false;
-        termAtt.setTermBuffer("phrase2");
+        termAtt.setEmpty().append("phrase2");
         offsetAtt.setOffset(savedStart, savedEnd);
         return true;
       } else
         while(input.incrementToken())
-          if (termAtt.term().equals("phrase")) {
+          if (termAtt.toString().equals("phrase")) {
             inPhrase = true;
             savedStart = offsetAtt.startOffset();
             savedEnd = offsetAtt.endOffset();
-            termAtt.setTermBuffer("phrase1");
+            termAtt.setEmpty().append("phrase1");
             offsetAtt.setOffset(savedStart, savedEnd);
             return true;
-          } else if (!termAtt.term().equals("stop"))
+          } else if (!termAtt.toString().equals("stop"))
             return true;
       return false;
     }

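Writer-side note: token producers switch from setTermBuffer(String) to the chained setEmpty().append(String), which works because CharTermAttribute implements Appendable. A minimal sketch of a rewriting filter in the post-commit style (RewriteFooFilter and its foo/bar tokens are invented for illustration):

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

final class RewriteFooFilter extends TokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  RewriteFooFilter(TokenStream in) {
    super(in);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) return false;
    if (termAtt.toString().equals("foo")) {
      // setEmpty() resets the reused term buffer; append() refills it.
      // The pre-commit spelling was termAtt.setTermBuffer("bar").
      termAtt.setEmpty().append("bar");
    }
    return true;
  }
}
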
@@ -23,9 +23,9 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.queryParser.core.QueryNodeException;
 import org.apache.lucene.queryParser.standard.config.DefaultOperatorAttribute.Operator;
@@ -163,24 +163,19 @@ public class TestMultiAnalyzerQPHelper extends LuceneTestCase {
     private int prevStartOffset;
     private int prevEndOffset;

-    TermAttribute termAtt;
-    PositionIncrementAttribute posIncrAtt;
-    OffsetAttribute offsetAtt;
-    TypeAttribute typeAtt;
+    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+    private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+    private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);

     public TestFilter(TokenStream in) {
       super(in);
-      termAtt = addAttribute(TermAttribute.class);
-      posIncrAtt = addAttribute(PositionIncrementAttribute.class);
-      offsetAtt = addAttribute(OffsetAttribute.class);
-      typeAtt = addAttribute(TypeAttribute.class);
-
     }

     @Override
     public final boolean incrementToken() throws java.io.IOException {
       if (multiToken > 0) {
-        termAtt.setTermBuffer("multi" + (multiToken + 1));
+        termAtt.setEmpty().append("multi" + (multiToken + 1));
         offsetAtt.setOffset(prevStartOffset, prevEndOffset);
         typeAtt.setType(prevType);
         posIncrAtt.setPositionIncrement(0);
@@ -194,7 +189,7 @@ public class TestMultiAnalyzerQPHelper extends LuceneTestCase {
       prevType = typeAtt.type();
       prevStartOffset = offsetAtt.startOffset();
       prevEndOffset = offsetAtt.endOffset();
-      String text = termAtt.term();
+      String text = termAtt.toString();
       if (text.equals("triplemulti")) {
         multiToken = 2;
         return true;
@@ -228,21 +223,19 @@ public class TestMultiAnalyzerQPHelper extends LuceneTestCase {

   private class TestPosIncrementFilter extends TokenFilter {

-    TermAttribute termAtt;
-    PositionIncrementAttribute posIncrAtt;
+    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+    private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);

     public TestPosIncrementFilter(TokenStream in) {
       super(in);
-      termAtt = addAttribute(TermAttribute.class);
-      posIncrAtt = addAttribute(PositionIncrementAttribute.class);
     }

     @Override
     public final boolean incrementToken() throws java.io.IOException {
       while (input.incrementToken()) {
-        if (termAtt.term().equals("the")) {
+        if (termAtt.toString().equals("the")) {
           // stopword, do nothing
-        } else if (termAtt.term().equals("quick")) {
+        } else if (termAtt.toString().equals("quick")) {
           posIncrAtt.setPositionIncrement(2);
           return true;
         } else {

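The test filters above also move their addAttribute calls out of the constructor and into final field initializers. That is safe because attributes are per-stream singletons keyed by interface: addAttribute is idempotent, and instance initializers run after the TokenFilter super-constructor has set the stream up. A stripped-down sketch of the resulting shape (PassThroughFilter is a made-up name):

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

final class PassThroughFilter extends TokenFilter {
  // Registering attributes at declaration removes constructor boilerplate
  // and lets the fields be final.
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);

  PassThroughFilter(TokenStream in) {
    super(in);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) return false;
    // termAtt and posIncrAtt now reflect the current token and could be
    // read or rewritten here; this sketch just passes tokens through.
    return true;
  }
}
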
@@ -23,9 +23,9 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.queryParser.ParseException;
 import org.apache.lucene.util.LuceneTestCase;
@@ -157,24 +157,19 @@ public class TestMultiAnalyzerWrapper extends LuceneTestCase {
     private int prevStartOffset;
     private int prevEndOffset;

-    TermAttribute termAtt;
-    PositionIncrementAttribute posIncrAtt;
-    OffsetAttribute offsetAtt;
-    TypeAttribute typeAtt;
+    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+    private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+    private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);

     public TestFilter(TokenStream in) {
       super(in);
-      termAtt = addAttribute(TermAttribute.class);
-      posIncrAtt = addAttribute(PositionIncrementAttribute.class);
-      offsetAtt = addAttribute(OffsetAttribute.class);
-      typeAtt = addAttribute(TypeAttribute.class);
-
     }

     @Override
     public final boolean incrementToken() throws java.io.IOException {
       if (multiToken > 0) {
-        termAtt.setTermBuffer("multi" + (multiToken + 1));
+        termAtt.setEmpty().append("multi" + (multiToken + 1));
         offsetAtt.setOffset(prevStartOffset, prevEndOffset);
         typeAtt.setType(prevType);
         posIncrAtt.setPositionIncrement(0);
@@ -188,7 +183,7 @@ public class TestMultiAnalyzerWrapper extends LuceneTestCase {
       prevType = typeAtt.type();
       prevStartOffset = offsetAtt.startOffset();
       prevEndOffset = offsetAtt.endOffset();
-      String text = termAtt.term();
+      String text = termAtt.toString();
       if (text.equals("triplemulti")) {
         multiToken = 2;
         return true;
@@ -222,21 +217,19 @@ public class TestMultiAnalyzerWrapper extends LuceneTestCase {

   private class TestPosIncrementFilter extends TokenFilter {

-    TermAttribute termAtt;
-    PositionIncrementAttribute posIncrAtt;
+    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+    private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);

     public TestPosIncrementFilter(TokenStream in) {
       super(in);
-      termAtt = addAttribute(TermAttribute.class);
-      posIncrAtt = addAttribute(PositionIncrementAttribute.class);
     }

     @Override
     public final boolean incrementToken() throws java.io.IOException {
       while (input.incrementToken()) {
-        if (termAtt.term().equals("the")) {
+        if (termAtt.toString().equals("the")) {
           // stopword, do nothing
-        } else if (termAtt.term().equals("quick")) {
+        } else if (termAtt.toString().equals("quick")) {
           posIncrAtt.setPositionIncrement(2);
           return true;
         } else {

@@ -37,8 +37,8 @@ import org.apache.lucene.analysis.MockTokenFilter;
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.document.DateField;
 import org.apache.lucene.document.DateTools;
@@ -96,8 +96,8 @@ public class TestQPHelper extends LocalizedTestCase {
   public static Analyzer qpAnalyzer = new QPTestAnalyzer();

   public static final class QPTestFilter extends TokenFilter {
-    TermAttribute termAtt;
-    OffsetAttribute offsetAtt;
+    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

     /**
      * Filter which discards the token 'stop' and which expands the token
@@ -105,8 +105,6 @@ public class TestQPHelper extends LocalizedTestCase {
      */
     public QPTestFilter(TokenStream in) {
       super(in);
-      termAtt = addAttribute(TermAttribute.class);
-      offsetAtt = addAttribute(OffsetAttribute.class);
     }

     boolean inPhrase = false;
@@ -117,19 +115,19 @@ public class TestQPHelper extends LocalizedTestCase {
       if (inPhrase) {
         inPhrase = false;
         clearAttributes();
-        termAtt.setTermBuffer("phrase2");
+        termAtt.setEmpty().append("phrase2");
         offsetAtt.setOffset(savedStart, savedEnd);
         return true;
       } else
         while (input.incrementToken()) {
-          if (termAtt.term().equals("phrase")) {
+          if (termAtt.toString().equals("phrase")) {
            inPhrase = true;
            savedStart = offsetAtt.startOffset();
            savedEnd = offsetAtt.endOffset();
-            termAtt.setTermBuffer("phrase1");
+            termAtt.setEmpty().append("phrase1");
            offsetAtt.setOffset(savedStart, savedEnd);
            return true;
-          } else if (!termAtt.term().equals("stop"))
+          } else if (!termAtt.toString().equals("stop"))
            return true;
         }
       return false;
@@ -1158,7 +1156,7 @@ public class TestQPHelper extends LocalizedTestCase {
   private class CannedTokenStream extends TokenStream {
     private int upto = 0;
     final PositionIncrementAttribute posIncr = addAttribute(PositionIncrementAttribute.class);
-    final TermAttribute term = addAttribute(TermAttribute.class);
+    final CharTermAttribute term = addAttribute(CharTermAttribute.class);
     @Override
     public boolean incrementToken() {
       clearAttributes();
@@ -1167,16 +1165,16 @@ public class TestQPHelper extends LocalizedTestCase {
       }
       if (upto == 0) {
         posIncr.setPositionIncrement(1);
-        term.setTermBuffer("a");
+        term.setEmpty().append("a");
       } else if (upto == 1) {
         posIncr.setPositionIncrement(1);
-        term.setTermBuffer("b");
+        term.setEmpty().append("b");
       } else if (upto == 2) {
         posIncr.setPositionIncrement(0);
-        term.setTermBuffer("c");
+        term.setEmpty().append("c");
       } else {
         posIncr.setPositionIncrement(0);
-        term.setTermBuffer("d");
+        term.setEmpty().append("d");
       }
       upto++;
       return true;

@@ -36,7 +36,7 @@ import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.document.DateField;
 import org.apache.lucene.document.DateTools;
 import org.apache.lucene.document.Document;
@@ -93,8 +93,8 @@ public class TestQueryParserWrapper extends LocalizedTestCase {
   public static Analyzer qpAnalyzer = new QPTestAnalyzer();

   public static final class QPTestFilter extends TokenFilter {
-    TermAttribute termAtt;
-    OffsetAttribute offsetAtt;
+    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

     /**
      * Filter which discards the token 'stop' and which expands the token
@@ -102,8 +102,6 @@ public class TestQueryParserWrapper extends LocalizedTestCase {
      */
     public QPTestFilter(TokenStream in) {
       super(in);
-      termAtt = addAttribute(TermAttribute.class);
-      offsetAtt = addAttribute(OffsetAttribute.class);
     }

     boolean inPhrase = false;
@@ -114,19 +112,19 @@ public class TestQueryParserWrapper extends LocalizedTestCase {
       if (inPhrase) {
         inPhrase = false;
         clearAttributes();
-        termAtt.setTermBuffer("phrase2");
+        termAtt.setEmpty().append("phrase2");
         offsetAtt.setOffset(savedStart, savedEnd);
         return true;
       } else
         while (input.incrementToken()) {
-          if (termAtt.term().equals("phrase")) {
+          if (termAtt.toString().equals("phrase")) {
             inPhrase = true;
             savedStart = offsetAtt.startOffset();
             savedEnd = offsetAtt.endOffset();
-            termAtt.setTermBuffer("phrase1");
+            termAtt.setEmpty().append("phrase1");
             offsetAtt.setOffset(savedStart, savedEnd);
             return true;
-          } else if (!termAtt.term().equals("stop"))
+          } else if (!termAtt.toString().equals("stop"))
             return true;
         }
       return false;

@@ -29,7 +29,7 @@ import java.util.Set;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Term;
@@ -117,10 +117,10 @@ public final class SynExpand {

     // [1] Parse query into separate words so that when we expand we can avoid dups
     TokenStream ts = a.tokenStream( field, new StringReader( query));
-    TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
+    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);

     while (ts.incrementToken()) {
-      String word = termAtt.term();
+      String word = termAtt.toString();
       if ( already.add( word))
         top.add( word);
     }

@@ -28,7 +28,7 @@ import java.util.Set;

 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Term;
@@ -125,10 +125,10 @@ public class SynLookup {

     // [1] Parse query into separate words so that when we expand we can avoid dups
     TokenStream ts = a.tokenStream( field, new StringReader( query));
-    TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
+    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);

     while (ts.incrementToken()) {
-      String word = termAtt.term();
+      String word = termAtt.toString();
       if ( already.add( word))
         top.add( word);
     }

@@ -21,8 +21,8 @@ import java.io.IOException;

 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.util.AttributeSource;

@@ -45,9 +45,9 @@ public class SynonymTokenFilter extends TokenFilter {
   private AttributeSource.State current = null;
   private int todo = 0;

-  private TermAttribute termAtt;
-  private TypeAttribute typeAtt;
-  private PositionIncrementAttribute posIncrAtt;
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);

   /**
    * Creates an instance for the given underlying stream and synonym table.
@@ -71,10 +71,6 @@ public class SynonymTokenFilter extends TokenFilter {

     this.synonyms = synonyms;
     this.maxSynonyms = maxSynonyms;
-
-    this.termAtt = addAttribute(TermAttribute.class);
-    this.typeAtt = addAttribute(TypeAttribute.class);
-    this.posIncrAtt = addAttribute(PositionIncrementAttribute.class);
   }

   /** Returns the next token in the stream, or null at EOS. */
@@ -89,7 +85,7 @@ public class SynonymTokenFilter extends TokenFilter {

       if (!input.incrementToken()) return false; // EOS; iterator exhausted

-      stack = synonyms.getSynonyms(termAtt.term()); // push onto stack
+      stack = synonyms.getSynonyms(termAtt.toString()); // push onto stack
       if (stack.length > maxSynonyms) randomize(stack);
       index = 0;
       current = captureState();
@@ -110,7 +106,7 @@ public class SynonymTokenFilter extends TokenFilter {
    */
   protected boolean createToken(String synonym, AttributeSource.State current) {
     restoreState(current);
-    termAtt.setTermBuffer(synonym);
+    termAtt.setEmpty().append(synonym);
     typeAtt.setType(SYNONYM_TOKEN_TYPE);
     posIncrAtt.setPositionIncrement(0);
     return true;

@@ -10,7 +10,7 @@ import java.util.Set;

 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.search.similar.MoreLikeThisQuery;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.xmlparser.DOMUtils;
@@ -77,11 +77,11 @@ public class LikeThisQueryBuilder implements QueryBuilder {
     for (int i = 0; i < fields.length; i++)
     {
       TokenStream ts = analyzer.tokenStream(fields[i],new StringReader(stopWords));
-      TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
+      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
       try
       {
         while(ts.incrementToken()) {
-          stopWordsSet.add(termAtt.term());
+          stopWordsSet.add(termAtt.toString());
         }
       }
       catch(IOException ioe)

@@ -6,7 +6,7 @@ import java.util.ArrayList;

 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.spans.SpanOrQuery;
 import org.apache.lucene.search.spans.SpanQuery;
@@ -56,10 +56,10 @@ public class SpanOrTermsBuilder extends SpanBuilderBase
   {
     ArrayList<SpanQuery> clausesList=new ArrayList<SpanQuery>();
     TokenStream ts=analyzer.tokenStream(fieldName,new StringReader(value));
-    TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
+    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);

     while (ts.incrementToken()) {
-      SpanTermQuery stq=new SpanTermQuery(new Term(fieldName, termAtt.term()));
+      SpanTermQuery stq=new SpanTermQuery(new Term(fieldName, termAtt.toString()));
       clausesList.add(stq);
     }
     SpanOrQuery soq=new SpanOrQuery(clausesList.toArray(new SpanQuery[clausesList.size()]));

@@ -5,7 +5,7 @@ import java.io.StringReader;

 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.Filter;
 import org.apache.lucene.search.TermsFilter;
@@ -57,7 +57,7 @@ public class TermsFilterBuilder implements FilterBuilder
     String text = DOMUtils.getNonBlankTextOrFail(e);
     String fieldName = DOMUtils.getAttributeWithInheritanceOrFail(e, "fieldName");
     TokenStream ts = analyzer.tokenStream(fieldName, new StringReader(text));
-    TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
+    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);

     try
     {
@@ -65,11 +65,11 @@ public class TermsFilterBuilder implements FilterBuilder
       while (ts.incrementToken()) {
         if (term == null)
         {
-          term = new Term(fieldName, termAtt.term());
+          term = new Term(fieldName, termAtt.toString());
         } else
         {
           // create from previous to save fieldName.intern overhead
-          term = term.createTerm(termAtt.term());
+          term = term.createTerm(termAtt.toString());
         }
         tf.addTerm(term);
       }

@@ -5,7 +5,7 @@ import java.io.StringReader;

 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.BooleanQuery;
@@ -57,16 +57,16 @@ public class TermsQueryBuilder implements QueryBuilder {
     TokenStream ts = analyzer.tokenStream(fieldName, new StringReader(text));
     try
     {
-      TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
+      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
       Term term = null;
       while (ts.incrementToken()) {
         if (term == null)
         {
-          term = new Term(fieldName, termAtt.term());
+          term = new Term(fieldName, termAtt.toString());
         } else
         {
           // create from previous to save fieldName.intern overhead
-          term = term.createTerm(termAtt.term());
+          term = term.createTerm(termAtt.toString());
         }
         bq.add(new BooleanClause(new TermQuery(term),BooleanClause.Occur.SHOULD));
       }

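Both XML builders above keep the pre-existing Term reuse idiom: the first token constructs a Term with the field name, and later tokens derive from it via createTerm, which skips re-interning the field name each time. A small sketch of the idiom in isolation (TermReuse and next are hypothetical names):

import org.apache.lucene.index.Term;

final class TermReuse {
  // Builds a Term per token while paying the field-name intern cost
  // only once, as TermsFilterBuilder and TermsQueryBuilder do above.
  static Term next(Term prev, String fieldName, String text) {
    return prev == null ? new Term(fieldName, text) : prev.createTerm(text);
  }
}
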
@@ -21,7 +21,7 @@ import java.io.IOException;

 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

 /**
  * A {@link TokenFilter} that applies {@link ArabicNormalizer} to normalize the orthography.
@@ -29,21 +29,18 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute;
  */

 public final class ArabicNormalizationFilter extends TokenFilter {
-  private final ArabicNormalizer normalizer;
-  private final TermAttribute termAtt;
+  private final ArabicNormalizer normalizer = new ArabicNormalizer();
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

   public ArabicNormalizationFilter(TokenStream input) {
     super(input);
-    normalizer = new ArabicNormalizer();
-    termAtt = addAttribute(TermAttribute.class);
   }

   @Override
   public boolean incrementToken() throws IOException {
     if (input.incrementToken()) {
-      int newlen = normalizer.normalize(termAtt.termBuffer(), termAtt.termLength());
-      termAtt.setTermLength(newlen);
+      int newlen = normalizer.normalize(termAtt.buffer(), termAtt.length());
+      termAtt.setLength(newlen);
       return true;
     }
     return false;

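The normalization filter above and the stem filters that follow mutate the term in place: termBuffer()/termLength()/setTermLength() become buffer()/length()/setLength() on CharTermAttribute, with no String round-trip. A hedged sketch of the same in-place pattern (TrimTrailingSFilter is invented; real stemmers do considerably more):

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

final class TrimTrailingSFilter extends TokenFilter {
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

  TrimTrailingSFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) return false;
    char[] buf = termAtt.buffer(); // direct view of the term's characters
    int len = termAtt.length();
    if (len > 1 && buf[len - 1] == 's') {
      termAtt.setLength(len - 1);  // shrink in place, without copying,
    }                              // as the stem filters below do
    return true;
  }
}
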
@@ -23,7 +23,7 @@ import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; // javadoc
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

 /**
  * A {@link TokenFilter} that applies {@link ArabicStemmer} to stem Arabic words..
@@ -35,24 +35,20 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute;
  * @see KeywordMarkerFilter */

 public final class ArabicStemFilter extends TokenFilter {
-  private final ArabicStemmer stemmer;
-  private final TermAttribute termAtt;
-  private final KeywordAttribute keywordAttr;
+  private final ArabicStemmer stemmer = new ArabicStemmer();
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);

   public ArabicStemFilter(TokenStream input) {
     super(input);
-    stemmer = new ArabicStemmer();
-    termAtt = addAttribute(TermAttribute.class);
-    keywordAttr = addAttribute(KeywordAttribute.class);
-
   }

   @Override
   public boolean incrementToken() throws IOException {
     if (input.incrementToken()) {
       if(!keywordAttr.isKeyword()) {
-        final int newlen = stemmer.stem(termAtt.termBuffer(), termAtt.termLength());
-        termAtt.setTermLength(newlen);
+        final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
+        termAtt.setLength(newlen);
       }
       return true;
     } else {

@@ -23,7 +23,7 @@ import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; // for java
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

 /**
  * A {@link TokenFilter} that applies {@link BulgarianStemmer} to stem Bulgarian
@@ -35,23 +35,20 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute;
  * </p>
  */
 public final class BulgarianStemFilter extends TokenFilter {
-  private final BulgarianStemmer stemmer;
-  private final TermAttribute termAtt;
-  private final KeywordAttribute keywordAttr;
+  private final BulgarianStemmer stemmer = new BulgarianStemmer();
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);

   public BulgarianStemFilter(final TokenStream input) {
     super(input);
-    stemmer = new BulgarianStemmer();
-    termAtt = addAttribute(TermAttribute.class);
-    keywordAttr = addAttribute(KeywordAttribute.class);
   }

   @Override
   public boolean incrementToken() throws IOException {
     if (input.incrementToken()) {
       if(!keywordAttr.isKeyword()) {
-        final int newlen = stemmer.stem(termAtt.termBuffer(), termAtt.termLength());
-        termAtt.setTermLength(newlen);
+        final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
+        termAtt.setLength(newlen);
       }
       return true;
     } else {

@@ -24,7 +24,7 @@ import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; // for java
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 
 /**
  * A {@link TokenFilter} that applies {@link BrazilianStemmer}.
@@ -41,10 +41,10 @@ public final class BrazilianStemFilter extends TokenFilter {
   /**
    * {@link BrazilianStemmer} in use by this filter.
    */
-  private BrazilianStemmer stemmer = null;
+  private BrazilianStemmer stemmer = new BrazilianStemmer();
   private Set<?> exclusions = null;
-  private final TermAttribute termAtt;
-  private final KeywordAttribute keywordAttr;
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
 
   /**
    * Creates a new BrazilianStemFilter
@@ -53,9 +53,6 @@ public final class BrazilianStemFilter extends TokenFilter {
    */
   public BrazilianStemFilter(TokenStream in) {
     super(in);
-    stemmer = new BrazilianStemmer();
-    termAtt = addAttribute(TermAttribute.class);
-    keywordAttr = addAttribute(KeywordAttribute.class);
   }
 
   /**
@@ -74,13 +71,13 @@ public final class BrazilianStemFilter extends TokenFilter {
   @Override
   public boolean incrementToken() throws IOException {
     if (input.incrementToken()) {
-      final String term = termAtt.term();
+      final String term = termAtt.toString();
       // Check the exclusion table.
       if (!keywordAttr.isKeyword() && (exclusions == null || !exclusions.contains(term))) {
        final String s = stemmer.stem(term);
        // If not stemmed, don't waste the time adjusting the token.
        if ((s != null) && !s.equals(term))
-         termAtt.setTermBuffer(s);
+         termAtt.setEmpty().append(s);
       }
       return true;
     } else {

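For String-based stemmers such as BrazilianStemFilter above (GermanStemFilter and FrenchStemFilter below follow suit), the pairing is term()/setTermBuffer(String) replaced by toString()/setEmpty().append(String). setEmpty() resets the length but keeps the allocated buffer, and it returns the attribute, so the append chains. A small helper capturing just that exchange; the class is illustrative, not part of the commit:

    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    final class TermReplace {
      // Replaces the old term()/setTermBuffer(String) round trip; the
      // equality check mirrors the "don't waste the time" guard above.
      static void replace(CharTermAttribute termAtt, String stemmed) {
        String term = termAtt.toString();        // was: termAtt.term()
        if (stemmed != null && !stemmed.equals(term)) {
          termAtt.setEmpty().append(stemmed);    // was: termAtt.setTermBuffer(stemmed)
        }
      }
    }
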
@@ -22,7 +22,7 @@ import java.io.Reader;
 
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.util.AttributeSource;
 
@@ -98,9 +98,9 @@ public final class CJKTokenizer extends Tokenizer {
      */
     private boolean preIsTokened = false;
 
-    private TermAttribute termAtt;
-    private OffsetAttribute offsetAtt;
-    private TypeAttribute typeAtt;
+    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+    private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
 
     //~ Constructors -----------------------------------------------------------
 
@@ -111,23 +111,14 @@ public final class CJKTokenizer extends Tokenizer {
      */
     public CJKTokenizer(Reader in) {
       super(in);
-      init();
     }
 
     public CJKTokenizer(AttributeSource source, Reader in) {
       super(source, in);
-      init();
     }
 
     public CJKTokenizer(AttributeFactory factory, Reader in) {
       super(factory, in);
-      init();
-    }
-
-    private void init() {
-      termAtt = addAttribute(TermAttribute.class);
-      offsetAtt = addAttribute(OffsetAttribute.class);
-      typeAtt = addAttribute(TypeAttribute.class);
     }
 
     //~ Methods ----------------------------------------------------------------
@@ -287,7 +278,7 @@ public final class CJKTokenizer extends Tokenizer {
         }
 
         if (length > 0) {
-          termAtt.setTermBuffer(buffer, 0, length);
+          termAtt.copyBuffer(buffer, 0, length);
          offsetAtt.setOffset(correctOffset(start), correctOffset(start+length));
          typeAtt.setType(TOKEN_TYPE_NAMES[tokenType]);
          return true;

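Two independent changes meet in the CJKTokenizer hunks: token text assembled in a scratch array is handed over with copyBuffer(char[], int, int) instead of setTermBuffer(...), and attribute registration moves into field initializers, which is what lets all three constructors drop the shared init() helper. A self-contained sketch of that shape; the whitespace tokenization is invented for illustration and is not CJKTokenizer's logic:

    import java.io.IOException;
    import java.io.Reader;

    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

    public final class SketchTokenizer extends Tokenizer {
      // addAttribute() works in field initializers, so no per-constructor init().
      private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
      private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

      private final char[] buffer = new char[255];
      private int offset = 0;

      public SketchTokenizer(Reader in) {
        super(in);
      }

      @Override
      public boolean incrementToken() throws IOException {
        clearAttributes();
        int length = 0;
        int start = offset;
        int c;
        while (length < buffer.length && (c = input.read()) != -1) {
          offset++;
          if (Character.isWhitespace(c)) {
            if (length > 0) break;  // token complete
            start = offset;         // skip leading whitespace
          } else {
            buffer[length++] = (char) c;
          }
        }
        if (length == 0) return false;
        termAtt.copyBuffer(buffer, 0, length);  // was: setTermBuffer(buffer, 0, length)
        offsetAtt.setOffset(correctOffset(start), correctOffset(start + length));
        return true;
      }
    }
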
@@ -23,7 +23,7 @@ import java.util.Arrays;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.core.StopFilter;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.util.Version;
 
@@ -61,21 +61,20 @@ public final class ChineseFilter extends TokenFilter {
 
     private CharArraySet stopTable;
 
-    private TermAttribute termAtt;
+    private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
 
     public ChineseFilter(TokenStream in) {
         super(in);
 
         stopTable = new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList(STOP_WORDS), false);
-        termAtt = addAttribute(TermAttribute.class);
     }
 
     @Override
     public boolean incrementToken() throws IOException {
 
         while (input.incrementToken()) {
-            char text[] = termAtt.termBuffer();
-            int termLength = termAtt.termLength();
+            char text[] = termAtt.buffer();
+            int termLength = termAtt.length();
 
           // why not key off token type here assuming ChineseTokenizer comes first?
             if (!stopTable.contains(text, 0, termLength)) {

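Because buffer() exposes the live backing array, the stopword check in ChineseFilter above never builds a String per token. The same shape in isolation, with the stop set supplied by the caller; the class is a sketch, not part of the commit:

    import java.io.IOException;

    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.analysis.util.CharArraySet;

    public final class SketchStopFilter extends TokenFilter {
      private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
      private final CharArraySet stopTable;

      public SketchStopFilter(TokenStream in, CharArraySet stopTable) {
        super(in);
        this.stopTable = stopTable;
      }

      @Override
      public boolean incrementToken() throws IOException {
        while (input.incrementToken()) {
          // contains(char[], int, int) tests the term in place, no String needed.
          if (!stopTable.contains(termAtt.buffer(), 0, termAtt.length())) {
            return true;
          }
        }
        return false;
      }
    }
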
@@ -23,8 +23,8 @@ import java.io.Reader;
 
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.util.AttributeSource;
 
 
@@ -62,24 +62,16 @@ public final class ChineseTokenizer extends Tokenizer {
 
     public ChineseTokenizer(Reader in) {
         super(in);
-        init();
     }
 
     public ChineseTokenizer(AttributeSource source, Reader in) {
         super(source, in);
-        init();
     }
 
     public ChineseTokenizer(AttributeFactory factory, Reader in) {
         super(factory, in);
-        init();
     }
 
-    private void init() {
-        termAtt = addAttribute(TermAttribute.class);
-        offsetAtt = addAttribute(OffsetAttribute.class);
-    }
-
     private int offset = 0, bufferIndex=0, dataLen=0;
     private final static int MAX_WORD_LEN = 255;
     private final static int IO_BUFFER_SIZE = 1024;
@@ -90,8 +82,8 @@ public final class ChineseTokenizer extends Tokenizer {
     private int length;
     private int start;
 
-    private TermAttribute termAtt;
-    private OffsetAttribute offsetAtt;
+    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
 
     private final void push(char c) {
 
@@ -105,7 +97,7 @@ public final class ChineseTokenizer extends Tokenizer {
         if (length>0) {
             //System.out.println(new String(buffer, 0,
             //length));
-            termAtt.setTermBuffer(buffer, 0, length);
+            termAtt.copyBuffer(buffer, 0, length);
             offsetAtt.setOffset(correctOffset(start), correctOffset(start+length));
             return true;
         }

@@ -30,7 +30,7 @@ import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.util.Version;
@@ -69,12 +69,12 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
   protected final int maxSubwordSize;
   protected final boolean onlyLongestMatch;
 
-  private TermAttribute termAtt;
-  private OffsetAttribute offsetAtt;
-  private FlagsAttribute flagsAtt;
-  private PositionIncrementAttribute posIncAtt;
-  private TypeAttribute typeAtt;
-  private PayloadAttribute payloadAtt;
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+  private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
+  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+  private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
 
   private final Token wrapper = new Token();
   /**
@@ -160,13 +160,6 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
       this.dictionary = new CharArraySet(matchVersion, dictionary.size(), false);
       addAllLowerCase(this.dictionary, dictionary);
     }
-
-    termAtt = addAttribute(TermAttribute.class);
-    offsetAtt = addAttribute(OffsetAttribute.class);
-    flagsAtt = addAttribute(FlagsAttribute.class);
-    posIncAtt = addAttribute(PositionIncrementAttribute.class);
-    typeAtt = addAttribute(TypeAttribute.class);
-    payloadAtt = addAttribute(PayloadAttribute.class);
   }
 
   /**
@@ -192,7 +185,7 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
 
   private final void setToken(final Token token) throws IOException {
     clearAttributes();
-    termAtt.setTermBuffer(token.termBuffer(), 0, token.termLength());
+    termAtt.copyBuffer(token.buffer(), 0, token.length());
     flagsAtt.setFlags(token.getFlags());
     typeAtt.setType(token.type());
     offsetAtt.setOffset(token.startOffset(), token.endOffset());
@@ -210,7 +203,7 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
     if (!input.incrementToken())
       return false;
 
-    wrapper.setTermBuffer(termAtt.termBuffer(), 0, termAtt.termLength());
+    wrapper.copyBuffer(termAtt.buffer(), 0, termAtt.length());
     wrapper.setStartOffset(offsetAtt.startOffset());
     wrapper.setEndOffset(offsetAtt.endOffset());
     wrapper.setFlags(flagsAtt.getFlags());
@@ -248,7 +241,7 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
   protected final Token createToken(final int offset, final int length,
     final Token prototype) {
     int newStart = prototype.startOffset() + offset;
-    Token t = prototype.clone(prototype.termBuffer(), offset, length, newStart, newStart+length);
+    Token t = prototype.clone(prototype.buffer(), offset, length, newStart, newStart+length);
     t.setPositionIncrement(0);
     return t;
   }
@@ -258,7 +251,7 @@ public abstract class CompoundWordTokenFilterBase extends TokenFilter {
     tokens.add((Token) token.clone());
 
     // Only words longer than minWordSize get processed
-    if (token.termLength() < this.minWordSize) {
+    if (token.length() < this.minWordSize) {
       return;
     }
 

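CompoundWordTokenFilterBase shuttles text between its scratch Token and the stream attributes in both directions. Token exposes the same buffer()/length()/copyBuffer() surface as CharTermAttribute, so the transfer is symmetric. A small sketch of the two moves used above; the helper class is illustrative:

    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    final class TokenBridge {
      // attribute -> scratch Token (the wrapper.copyBuffer(...) call above)
      static void toToken(Token wrapper, CharTermAttribute termAtt) {
        wrapper.copyBuffer(termAtt.buffer(), 0, termAtt.length());
      }

      // Token -> attribute (the setToken(...) direction above)
      static void toAttribute(CharTermAttribute termAtt, Token token) {
        termAtt.copyBuffer(token.buffer(), 0, token.length());
      }
    }
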
@@ -191,22 +191,22 @@ public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBa
   @Override
   protected void decomposeInternal(final Token token) {
     // Only words longer than minWordSize get processed
-    if (token.termLength() < this.minWordSize) {
+    if (token.length() < this.minWordSize) {
       return;
     }
 
-    char[] lowerCaseTermBuffer=makeLowerCaseCopy(token.termBuffer());
+    char[] lowerCaseTermBuffer=makeLowerCaseCopy(token.buffer());
 
-    for (int i=0;i<token.termLength()-this.minSubwordSize;++i) {
+    for (int i=0;i<token.length()-this.minSubwordSize;++i) {
         Token longestMatchToken=null;
         for (int j=this.minSubwordSize-1;j<this.maxSubwordSize;++j) {
-            if(i+j>token.termLength()) {
+            if(i+j>token.length()) {
                 break;
             }
             if(dictionary.contains(lowerCaseTermBuffer, i, j)) {
                 if (this.onlyLongestMatch) {
                    if (longestMatchToken!=null) {
-                     if (longestMatchToken.termLength()<j) {
+                     if (longestMatchToken.length()<j) {
                        longestMatchToken=createToken(i,j,token);
                      }
                    } else {

@@ -302,15 +302,15 @@ public class HyphenationCompoundWordTokenFilter extends
   @Override
   protected void decomposeInternal(final Token token) {
     // get the hyphenation points
-    Hyphenation hyphens = hyphenator.hyphenate(token.termBuffer(), 0, token
-        .termLength(), 1, 1);
+    Hyphenation hyphens = hyphenator.hyphenate(token.buffer(), 0, token
+        .length(), 1, 1);
     // No hyphen points found -> exit
     if (hyphens == null) {
       return;
     }
 
     final int[] hyp = hyphens.getHyphenationPoints();
-    char[] lowerCaseTermBuffer=makeLowerCaseCopy(token.termBuffer());
+    char[] lowerCaseTermBuffer=makeLowerCaseCopy(token.buffer());
 
     for (int i = 0; i < hyp.length; ++i) {
       int remaining = hyp.length - i;
@@ -335,7 +335,7 @@ public class HyphenationCompoundWordTokenFilter extends
       if (dictionary == null || dictionary.contains(lowerCaseTermBuffer, start, partLength)) {
         if (this.onlyLongestMatch) {
           if (longestMatchToken != null) {
-            if (longestMatchToken.termLength() < partLength) {
+            if (longestMatchToken.length() < partLength) {
               longestMatchToken = createToken(start, partLength, token);
             }
           } else {
@@ -352,7 +352,7 @@ public class HyphenationCompoundWordTokenFilter extends
         // characters
         if (this.onlyLongestMatch) {
           if (longestMatchToken != null) {
-            if (longestMatchToken.termLength() < partLength - 1) {
+            if (longestMatchToken.length() < partLength - 1) {
              longestMatchToken = createToken(start, partLength - 1, token);
             }
           } else {

@@ -6,7 +6,7 @@ import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; // for java
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -37,23 +37,20 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute;
  * @see KeywordMarkerFilter
  */
 public final class CzechStemFilter extends TokenFilter {
-  private final CzechStemmer stemmer;
-  private final TermAttribute termAtt;
-  private final KeywordAttribute keywordAttr;
+  private final CzechStemmer stemmer = new CzechStemmer();
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
 
   public CzechStemFilter(TokenStream input) {
     super(input);
-    stemmer = new CzechStemmer();
-    termAtt = addAttribute(TermAttribute.class);
-    keywordAttr = addAttribute(KeywordAttribute.class);
   }
 
   @Override
   public boolean incrementToken() throws IOException {
     if (input.incrementToken()) {
       if(!keywordAttr.isKeyword()) {
-        final int newlen = stemmer.stem(termAtt.termBuffer(), termAtt.termLength());
-        termAtt.setTermLength(newlen);
+        final int newlen = stemmer.stem(termAtt.buffer(), termAtt.length());
+        termAtt.setLength(newlen);
       }
       return true;
     } else {

@@ -24,7 +24,7 @@ import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; // for java
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 
 /**
  * A {@link TokenFilter} that stems German words.
@@ -45,11 +45,11 @@ public final class GermanStemFilter extends TokenFilter
     /**
      * The actual token in the input stream.
      */
-    private GermanStemmer stemmer = null;
+    private GermanStemmer stemmer = new GermanStemmer();
     private Set<?> exclusionSet = null;
 
-    private final TermAttribute termAtt;
-    private final KeywordAttribute keywordAttr;
+    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+    private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
 
     /**
      * Creates a {@link GermanStemFilter} instance
@@ -58,9 +58,6 @@ public final class GermanStemFilter extends TokenFilter
     public GermanStemFilter( TokenStream in )
     {
       super(in);
-      stemmer = new GermanStemmer();
-      termAtt = addAttribute(TermAttribute.class);
-      keywordAttr = addAttribute(KeywordAttribute.class);
     }
 
     /**
@@ -80,13 +77,13 @@ public final class GermanStemFilter extends TokenFilter
     @Override
     public boolean incrementToken() throws IOException {
       if (input.incrementToken()) {
-        String term = termAtt.term();
+        String term = termAtt.toString();
         // Check the exclusion table.
         if (!keywordAttr.isKeyword() && (exclusionSet == null || !exclusionSet.contains(term))) {
           String s = stemmer.stem(term);
           // If not stemmed, don't waste the time adjusting the token.
          if ((s != null) && !s.equals(term))
-            termAtt.setTermBuffer(s);
+            termAtt.setEmpty().append(s);
         }
         return true;
       } else {

@@ -21,7 +21,7 @@ import java.io.IOException;
 
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 
 /**
  * A {@link TokenFilter} that applies {@link PersianNormalizer} to normalize the
@@ -30,22 +30,19 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute;
  */
 
 public final class PersianNormalizationFilter extends TokenFilter {
-  private final PersianNormalizer normalizer;
-  private final TermAttribute termAtt;
+  private final PersianNormalizer normalizer = new PersianNormalizer();
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
 
   public PersianNormalizationFilter(TokenStream input) {
     super(input);
-    normalizer = new PersianNormalizer();
-    termAtt = addAttribute(TermAttribute.class);
   }
 
   @Override
   public boolean incrementToken() throws IOException {
     if (input.incrementToken()) {
-      final int newlen = normalizer.normalize(termAtt.termBuffer(), termAtt
-          .termLength());
-      termAtt.setTermLength(newlen);
+      final int newlen = normalizer.normalize(termAtt.buffer(),
+          termAtt.length());
+      termAtt.setLength(newlen);
       return true;
     }
     return false;

@@ -23,7 +23,7 @@ import java.util.Arrays;
 import org.apache.lucene.analysis.standard.StandardTokenizer; // for javadocs
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.util.Version;
 
@@ -37,7 +37,7 @@ import org.apache.lucene.util.Version;
  */
 public final class ElisionFilter extends TokenFilter {
   private CharArraySet articles = CharArraySet.EMPTY_SET;
-  private final TermAttribute termAtt;
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private static final CharArraySet DEFAULT_ARTICLES = CharArraySet.unmodifiableSet(
       new CharArraySet(Version.LUCENE_CURRENT, Arrays.asList(
           "l", "m", "t", "qu", "n", "s", "j"), true));
@@ -100,7 +100,6 @@ public final class ElisionFilter extends TokenFilter {
     super(input);
     this.articles = CharArraySet.unmodifiableSet(
         new CharArraySet(matchVersion, articles, true));
-    termAtt = addAttribute(TermAttribute.class);
   }
 
   /**
@@ -115,13 +114,13 @@ public final class ElisionFilter extends TokenFilter {
   }
 
   /**
-   * Increments the {@link TokenStream} with a {@link TermAttribute} without elisioned start
+   * Increments the {@link TokenStream} with a {@link CharTermAttribute} without elisioned start
    */
   @Override
   public final boolean incrementToken() throws IOException {
     if (input.incrementToken()) {
-      char[] termBuffer = termAtt.termBuffer();
-      int termLength = termAtt.termLength();
+      char[] termBuffer = termAtt.buffer();
+      int termLength = termAtt.length();
 
       int minPoz = Integer.MAX_VALUE;
       for (int i = 0; i < apostrophes.length; i++) {
@@ -137,8 +136,8 @@ public final class ElisionFilter extends TokenFilter {
 
       // An apostrophe has been found. If the prefix is an article strip it off.
       if (minPoz != Integer.MAX_VALUE
-          && articles.contains(termAtt.termBuffer(), 0, minPoz)) {
-        termAtt.setTermBuffer(termAtt.termBuffer(), minPoz + 1, termAtt.termLength() - (minPoz + 1));
+          && articles.contains(termAtt.buffer(), 0, minPoz)) {
+        termAtt.copyBuffer(termAtt.buffer(), minPoz + 1, termAtt.length() - (minPoz + 1));
       }
 
       return true;

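One detail in the ElisionFilter hunk: the replacement for setTermBuffer(termAtt.termBuffer(), minPoz + 1, ...) is a copyBuffer() whose source is the attribute's own backing array, an in-place left shift. System.arraycopy underneath copies as if through a temporary, so the overlap is safe. As a named helper, illustrative only:

    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    final class PrefixStrip {
      // Drops the first prefixLen chars (article plus apostrophe above)
      // by copying the attribute's own buffer onto itself.
      static void strip(CharTermAttribute termAtt, int prefixLen) {
        termAtt.copyBuffer(termAtt.buffer(), prefixLen, termAtt.length() - prefixLen);
      }
    }
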
@@ -22,7 +22,7 @@ import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.snowball.SnowballFilter;
 import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 
 import java.io.IOException;
 import java.util.HashSet;
@@ -51,17 +51,14 @@ public final class FrenchStemFilter extends TokenFilter {
   /**
    * The actual token in the input stream.
    */
-  private FrenchStemmer stemmer = null;
+  private FrenchStemmer stemmer = new FrenchStemmer();
   private Set<?> exclusions = null;
 
-  private final TermAttribute termAtt;
-  private final KeywordAttribute keywordAttr;
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
 
   public FrenchStemFilter( TokenStream in ) {
     super(in);
-    stemmer = new FrenchStemmer();
-    termAtt = addAttribute(TermAttribute.class);
-    keywordAttr = addAttribute(KeywordAttribute.class);
   }
 
   /**
@@ -82,14 +79,14 @@ public final class FrenchStemFilter extends TokenFilter {
   @Override
   public boolean incrementToken() throws IOException {
     if (input.incrementToken()) {
-      String term = termAtt.term();
+      String term = termAtt.toString();
 
       // Check the exclusion table
       if ( !keywordAttr.isKeyword() && (exclusions == null || !exclusions.contains( term )) ) {
        String s = stemmer.stem( term );
        // If not stemmed, don't waste the time adjusting the token.
        if ((s != null) && !s.equals( term ) )
-         termAtt.setTermBuffer(s);
+         termAtt.setEmpty().append(s);
       }
       return true;
     } else {

@@ -23,7 +23,7 @@ import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; // javadoc
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 
 /**
  * A {@link TokenFilter} that applies {@link HindiNormalizer} to normalize the
@@ -39,7 +39,7 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 public final class HindiNormalizationFilter extends TokenFilter {
 
   private final HindiNormalizer normalizer = new HindiNormalizer();
-  private final TermAttribute termAtt = addAttribute(TermAttribute.class);
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
 
   public HindiNormalizationFilter(TokenStream input) {
@@ -50,8 +50,8 @@ public final class HindiNormalizationFilter extends TokenFilter {
   public boolean incrementToken() throws IOException {
     if (input.incrementToken()) {
       if (!keywordAtt.isKeyword())
-        termAtt.setTermLength(normalizer.normalize(termAtt.termBuffer(),
-            termAtt.termLength()));
+        termAtt.setLength(normalizer.normalize(termAtt.buffer(),
+            termAtt.length()));
       return true;
     }
     return false;

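The Persian and Hindi normalization hunks above, and the Hindi/Indic filters just below, share one idiom: the worker rewrites a prefix of the buffer and returns the new length, which goes straight into setLength(). A self-contained sketch; the normalizer here is hypothetical (the real ones rewrite language-specific codepoints):

    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    final class NormalizeInPlace {
      // The shared pattern: normalize(buffer, length) compacts/rewrites
      // the prefix of the array and reports how much is still the term.
      static void apply(CharTermAttribute termAtt) {
        termAtt.setLength(normalize(termAtt.buffer(), termAtt.length()));
      }

      // Hypothetical normalizer: deletes U+0640 (tatweel) and lowercases,
      // compacting the buffer as it goes.
      private static int normalize(char[] s, int len) {
        int out = 0;
        for (int i = 0; i < len; i++) {
          char c = s[i];
          if (c == '\u0640') continue;           // delete
          s[out++] = Character.toLowerCase(c);   // rewrite in place
        }
        return out;
      }
    }
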
@@ -22,13 +22,13 @@ import java.io.IOException;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 
 /**
  * A {@link TokenFilter} that applies {@link HindiStemmer} to stem Hindi words.
  */
 public final class HindiStemFilter extends TokenFilter {
-  private final TermAttribute termAtt = addAttribute(TermAttribute.class);
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
   private final HindiStemmer stemmer = new HindiStemmer();
 
@@ -40,7 +40,7 @@ public final class HindiStemFilter extends TokenFilter {
   public boolean incrementToken() throws IOException {
     if (input.incrementToken()) {
       if (!keywordAtt.isKeyword())
-        termAtt.setTermLength(stemmer.stem(termAtt.termBuffer(), termAtt.termLength()));
+        termAtt.setLength(stemmer.stem(termAtt.buffer(), termAtt.length()));
       return true;
     } else {
       return false;

@@ -21,14 +21,14 @@ import java.io.IOException;
 
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 
 /**
  * A {@link TokenFilter} that applies {@link IndicNormalizer} to normalize text
  * in Indian Languages.
  */
 public final class IndicNormalizationFilter extends TokenFilter {
-  private final TermAttribute termAtt = addAttribute(TermAttribute.class);
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final IndicNormalizer normalizer = new IndicNormalizer();
 
   public IndicNormalizationFilter(TokenStream input) {
@@ -38,7 +38,7 @@ public final class IndicNormalizationFilter extends TokenFilter {
   @Override
   public boolean incrementToken() throws IOException {
     if (input.incrementToken()) {
-      termAtt.setTermLength(normalizer.normalize(termAtt.termBuffer(), termAtt.termLength()));
+      termAtt.setLength(normalizer.normalize(termAtt.buffer(), termAtt.length()));
       return true;
     } else {
       return false;

@@ -30,8 +30,8 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.core.StopAnalyzer;
 import org.apache.lucene.analysis.core.StopFilter;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.util.Version;
 
@@ -332,8 +332,8 @@ public final class PatternAnalyzer extends Analyzer {
     private Matcher matcher;
     private int pos = 0;
     private static final Locale locale = Locale.getDefault();
-    private TermAttribute termAtt = addAttribute(TermAttribute.class);
-    private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
 
     public PatternTokenizer(String str, Pattern pattern, boolean toLowerCase) {
       this.str = str;
@@ -360,7 +360,7 @@ public final class PatternAnalyzer extends Analyzer {
         if (start != end) { // non-empty match (header/trailer)
           String text = str.substring(start, end);
           if (toLowerCase) text = text.toLowerCase(locale);
-          termAtt.setTermBuffer(text);
+          termAtt.setEmpty().append(text);
           offsetAtt.setOffset(start, end);
           return true;
         }
@@ -392,8 +392,8 @@ public final class PatternAnalyzer extends Analyzer {
     private final boolean toLowerCase;
     private final Set<?> stopWords;
     private static final Locale locale = Locale.getDefault();
-    private TermAttribute termAtt = addAttribute(TermAttribute.class);
-    private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
 
     public FastStringTokenizer(String str, boolean isLetter, boolean toLowerCase, Set<?> stopWords) {
       this.str = str;
@@ -446,7 +446,7 @@ public final class PatternAnalyzer extends Analyzer {
       {
         return false;
       }
-      termAtt.setTermBuffer(text);
+      termAtt.setEmpty().append(text);
       offsetAtt.setOffset(start, i);
       return true;
     }

@@ -23,7 +23,7 @@ import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.index.Payload;
 
@@ -44,14 +44,14 @@ public class PrefixAwareTokenFilter extends TokenStream {
   private TokenStream prefix;
   private TokenStream suffix;
 
-  private TermAttribute termAtt;
+  private CharTermAttribute termAtt;
   private PositionIncrementAttribute posIncrAtt;
   private PayloadAttribute payloadAtt;
   private OffsetAttribute offsetAtt;
   private TypeAttribute typeAtt;
   private FlagsAttribute flagsAtt;
 
-  private TermAttribute p_termAtt;
+  private CharTermAttribute p_termAtt;
   private PositionIncrementAttribute p_posIncrAtt;
   private PayloadAttribute p_payloadAtt;
   private OffsetAttribute p_offsetAtt;
@@ -64,14 +64,14 @@ public class PrefixAwareTokenFilter extends TokenStream {
     this.prefix = prefix;
     prefixExhausted = false;
 
-    termAtt = addAttribute(TermAttribute.class);
+    termAtt = addAttribute(CharTermAttribute.class);
     posIncrAtt = addAttribute(PositionIncrementAttribute.class);
     payloadAtt = addAttribute(PayloadAttribute.class);
     offsetAtt = addAttribute(OffsetAttribute.class);
     typeAtt = addAttribute(TypeAttribute.class);
     flagsAtt = addAttribute(FlagsAttribute.class);
 
-    p_termAtt = prefix.addAttribute(TermAttribute.class);
+    p_termAtt = prefix.addAttribute(CharTermAttribute.class);
     p_posIncrAtt = prefix.addAttribute(PositionIncrementAttribute.class);
     p_payloadAtt = prefix.addAttribute(PayloadAttribute.class);
     p_offsetAtt = prefix.addAttribute(OffsetAttribute.class);
@@ -115,7 +115,7 @@ public class PrefixAwareTokenFilter extends TokenStream {
   private void setCurrentToken(Token token) {
     if (token == null) return;
     clearAttributes();
-    termAtt.setTermBuffer(token.termBuffer(), 0, token.termLength());
+    termAtt.copyBuffer(token.buffer(), 0, token.length());
     posIncrAtt.setPositionIncrement(token.getPositionIncrement());
     flagsAtt.setFlags(token.getFlags());
     offsetAtt.setOffset(token.startOffset(), token.endOffset());
@@ -125,7 +125,7 @@ public class PrefixAwareTokenFilter extends TokenStream {
 
   private Token getNextPrefixInputToken(Token token) throws IOException {
     if (!prefix.incrementToken()) return null;
-    token.setTermBuffer(p_termAtt.termBuffer(), 0, p_termAtt.termLength());
+    token.copyBuffer(p_termAtt.buffer(), 0, p_termAtt.length());
     token.setPositionIncrement(p_posIncrAtt.getPositionIncrement());
     token.setFlags(p_flagsAtt.getFlags());
     token.setOffset(p_offsetAtt.startOffset(), p_offsetAtt.endOffset());
@@ -136,7 +136,7 @@ public class PrefixAwareTokenFilter extends TokenStream {
 
   private Token getNextSuffixInputToken(Token token) throws IOException {
     if (!suffix.incrementToken()) return null;
-    token.setTermBuffer(termAtt.termBuffer(), 0, termAtt.termLength());
+    token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
     token.setPositionIncrement(posIncrAtt.getPositionIncrement());
     token.setFlags(flagsAtt.getFlags());
     token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());

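PrefixAwareTokenFilter is the one consumer in this patch that reads attributes from two streams at once. Note that it asks each stream for its own CharTermAttribute (termAtt versus p_termAtt): attribute instances belong to the AttributeSource that created them. A small illustrative helper:

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    final class StreamTerm {
      // addAttribute() returns the stream-local instance (creating it
      // if needed), which is why the filter above keeps a parallel
      // p_* attribute set for the wrapped prefix stream.
      static String currentTerm(TokenStream stream) {
        return stream.addAttribute(CharTermAttribute.class).toString();
      }
    }
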
@@ -22,7 +22,7 @@ import java.io.IOException;
 import org.apache.lucene.util.AttributeImpl;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 
 /**
  * A {@link TokenStream} containing a single token.
@@ -41,7 +41,7 @@ public final class SingleTokenTokenStream extends TokenStream {
     assert token != null;
     this.singleToken = (Token) token.clone();
 
-    tokenAtt = (AttributeImpl) addAttribute(TermAttribute.class);
+    tokenAtt = (AttributeImpl) addAttribute(CharTermAttribute.class);
     assert (tokenAtt instanceof Token);
   }
 

@@ -23,7 +23,7 @@ import java.util.Map;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.util.CharArrayMap;
 import org.apache.lucene.util.Version;
 
@@ -34,7 +34,7 @@ import org.apache.lucene.util.Version;
 public final class StemmerOverrideFilter extends TokenFilter {
   private final CharArrayMap<String> dictionary;
 
-  private final TermAttribute termAtt = addAttribute(TermAttribute.class);
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);
 
   /**
@@ -56,9 +56,9 @@ public final class StemmerOverrideFilter extends TokenFilter {
   public boolean incrementToken() throws IOException {
     if (input.incrementToken()) {
       if (!keywordAtt.isKeyword()) { // don't muck with already-keyworded terms
-        String stem = dictionary.get(termAtt.termBuffer(), 0, termAtt.termLength());
+        String stem = dictionary.get(termAtt.buffer(), 0, termAtt.length());
         if (stem != null) {
-          termAtt.setTermBuffer(stem);
+          termAtt.setEmpty().append(stem);
          keywordAtt.setKeyword(true);
         }
       }

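StemmerOverrideFilter combines two idioms from earlier hunks: a CharArrayMap probe over the raw buffer, then a setEmpty().append() rewrite plus setKeyword(true) so downstream stemmers leave the overridden term alone. A usage sketch; the dictionary contents are invented:

    import org.apache.lucene.analysis.util.CharArrayMap;
    import org.apache.lucene.util.Version;

    final class OverrideDictionary {
      // Hypothetical dictionary for the filter above: surface forms
      // mapped to fixed stems. Lookups run against the term buffer
      // directly via dictionary.get(termAtt.buffer(), 0, termAtt.length()).
      static CharArrayMap<String> build() {
        CharArrayMap<String> dictionary =
            new CharArrayMap<String>(Version.LUCENE_CURRENT, 2, true /* ignoreCase */);
        dictionary.put("mice", "mouse");
        dictionary.put("geese", "goose");
        return dictionary;
      }
    }
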
@@ -20,7 +20,7 @@ package org.apache.lucene.analysis.ngram;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 
 import java.io.IOException;
 
@@ -72,8 +72,8 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
   private int curGramSize;
   private int tokStart;
 
-  private final TermAttribute termAtt;
-  private final OffsetAttribute offsetAtt;
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
 
   /**
    * Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
@@ -101,8 +101,6 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
     this.minGram = minGram;
     this.maxGram = maxGram;
     this.side = side;
-    this.termAtt = addAttribute(TermAttribute.class);
-    this.offsetAtt = addAttribute(OffsetAttribute.class);
   }
 
   /**
@@ -124,8 +122,8 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
       if (!input.incrementToken()) {
         return false;
       } else {
-        curTermBuffer = termAtt.termBuffer().clone();
-        curTermLength = termAtt.termLength();
+        curTermBuffer = termAtt.buffer().clone();
+        curTermLength = termAtt.length();
         curGramSize = minGram;
         tokStart = offsetAtt.startOffset();
       }
@@ -138,7 +136,7 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
         int end = start + curGramSize;
         clearAttributes();
         offsetAtt.setOffset(tokStart + start, tokStart + end);
-        termAtt.setTermBuffer(curTermBuffer, start, curGramSize);
+        termAtt.copyBuffer(curTermBuffer, start, curGramSize);
         curGramSize++;
         return true;
       }

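EdgeNGramTokenFilter keeps curTermLength next to the cloned array because buffer() returns the backing array, whose capacity can exceed the logical term. An exact-size snapshot is the equivalent without the extra bookkeeping; this is an illustrative alternative, not what the filter does:

    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    final class TermSnapshot {
      // buffer().clone() above copies capacity, not length; this copies
      // exactly length() chars, so no separate length field is needed.
      static char[] exactCopy(CharTermAttribute termAtt) {
        char[] copy = new char[termAtt.length()];
        System.arraycopy(termAtt.buffer(), 0, copy, 0, copy.length);
        return copy;
      }
    }
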
@@ -18,8 +18,8 @@ package org.apache.lucene.analysis.ngram;
  */
 
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.util.AttributeSource;
 
 import java.io.IOException;
@@ -37,8 +37,8 @@ public final class EdgeNGramTokenizer extends Tokenizer {
   public static final int DEFAULT_MAX_GRAM_SIZE = 1;
   public static final int DEFAULT_MIN_GRAM_SIZE = 1;
 
-  private TermAttribute termAtt;
-  private OffsetAttribute offsetAtt;
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
 
   /** Specifies which side of the input the n-gram should be generated from */
   public static enum Side {
@@ -173,10 +173,6 @@ public final class EdgeNGramTokenizer extends Tokenizer {
     this.minGram = minGram;
     this.maxGram = maxGram;
     this.side = side;
-
-    this.termAtt = addAttribute(TermAttribute.class);
-    this.offsetAtt = addAttribute(OffsetAttribute.class);
-
   }
 
   /** Returns the next token in the stream, or null at EOS. */
@@ -206,7 +202,7 @@ public final class EdgeNGramTokenizer extends Tokenizer {
     // grab gramSize chars from front or back
     int start = side == Side.FRONT ? 0 : inLen - gramSize;
     int end = start + gramSize;
-    termAtt.setTermBuffer(inStr, start, gramSize);
+    termAtt.setEmpty().append(inStr, start, end);
     offsetAtt.setOffset(correctOffset(start), correctOffset(end));
     gramSize++;
     return true;

@@ -22,7 +22,7 @@ import java.io.IOException;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 
 /**
  * Tokenizes the input into n-grams of the given size(s).
@@ -39,8 +39,8 @@ public final class NGramTokenFilter extends TokenFilter {
   private int curPos;
   private int tokStart;
 
-  private TermAttribute termAtt;
-  private OffsetAttribute offsetAtt;
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
 
   /**
    * Creates NGramTokenFilter with given min and max n-grams.
@@ -58,9 +58,6 @@ public final class NGramTokenFilter extends TokenFilter {
     }
     this.minGram = minGram;
     this.maxGram = maxGram;
-
-    this.termAtt = addAttribute(TermAttribute.class);
-    this.offsetAtt = addAttribute(OffsetAttribute.class);
   }
 
   /**
@@ -79,8 +76,8 @@ public final class NGramTokenFilter extends TokenFilter {
       if (!input.incrementToken()) {
         return false;
       } else {
-        curTermBuffer = termAtt.termBuffer().clone();
-        curTermLength = termAtt.termLength();
+        curTermBuffer = termAtt.buffer().clone();
+        curTermLength = termAtt.length();
         curGramSize = minGram;
         curPos = 0;
         tokStart = offsetAtt.startOffset();
@@ -89,7 +86,7 @@ public final class NGramTokenFilter extends TokenFilter {
     while (curGramSize <= maxGram) {
       while (curPos+curGramSize <= curTermLength) { // while there is input
         clearAttributes();
-        termAtt.setTermBuffer(curTermBuffer, curPos, curGramSize);
+        termAtt.copyBuffer(curTermBuffer, curPos, curGramSize);
         offsetAtt.setOffset(tokStart + curPos, tokStart + curPos + curGramSize);
         curPos++;
         return true;
@@ -18,8 +18,8 @@ package org.apache.lucene.analysis.ngram;
  */
 
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.util.AttributeSource;
 
 import java.io.IOException;
@@ -39,8 +39,8 @@ public final class NGramTokenizer extends Tokenizer {
   private String inStr;
   private boolean started = false;
 
-  private TermAttribute termAtt;
-  private OffsetAttribute offsetAtt;
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
 
   /**
    * Creates NGramTokenizer with given min and max n-grams.
@@ -94,9 +94,6 @@ public final class NGramTokenizer extends Tokenizer {
     }
     this.minGram = minGram;
     this.maxGram = maxGram;
-
-    this.termAtt = addAttribute(TermAttribute.class);
-    this.offsetAtt = addAttribute(OffsetAttribute.class);
   }
 
   /** Returns the next token in the stream, or null at EOS. */
@@ -123,7 +120,7 @@ public final class NGramTokenizer extends Tokenizer {
 
     int oldPos = pos;
     pos++;
-    termAtt.setTermBuffer(inStr, oldPos, gramSize);
+    termAtt.setEmpty().append(inStr, oldPos, oldPos+gramSize);
     offsetAtt.setOffset(correctOffset(oldPos), correctOffset(oldPos+gramSize));
     return true;
   }
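For consumers the change is equally mechanical: term text is read through CharTermAttribute. A sketch of a consuming loop, assuming ts is some already-constructed TokenStream (the construction is illustrative, not part of this commit):

    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(term.toString()); // was term.term() on TermAttribute
    }
    ts.close();
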
@@ -28,7 +28,7 @@ import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.snowball.SnowballFilter;
 import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 
 /**
  * A {@link TokenFilter} that stems Dutch words.
@@ -52,17 +52,14 @@ public final class DutchStemFilter extends TokenFilter {
   /**
    * The actual token in the input stream.
    */
-  private DutchStemmer stemmer = null;
+  private DutchStemmer stemmer = new DutchStemmer();
   private Set<?> exclusions = null;
 
-  private final TermAttribute termAtt;
-  private final KeywordAttribute keywordAttr;
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
 
   public DutchStemFilter(TokenStream _in) {
     super(_in);
-    stemmer = new DutchStemmer();
-    termAtt = addAttribute(TermAttribute.class);
-    keywordAttr = addAttribute(KeywordAttribute.class);
   }
 
   /**
@@ -99,14 +96,14 @@ public final class DutchStemFilter extends TokenFilter {
   @Override
   public boolean incrementToken() throws IOException {
     if (input.incrementToken()) {
-      final String term = termAtt.term();
+      final String term = termAtt.toString();
 
       // Check the exclusion table.
       if (!keywordAttr.isKeyword() && (exclusions == null || !exclusions.contains(term))) {
         final String s = stemmer.stem(term);
         // If not stemmed, don't waste the time adjusting the token.
         if ((s != null) && !s.equals(term))
-          termAtt.setTermBuffer(s);
+          termAtt.setEmpty().append(s);
       }
       return true;
     } else {
@@ -21,7 +21,7 @@ import java.io.IOException;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 
 
 /**
@@ -39,15 +39,13 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 public final class DelimitedPayloadTokenFilter extends TokenFilter {
   public static final char DEFAULT_DELIMITER = '|';
   private final char delimiter;
-  private final TermAttribute termAtt;
-  private final PayloadAttribute payAtt;
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final PayloadAttribute payAtt = addAttribute(PayloadAttribute.class);
   private final PayloadEncoder encoder;
 
 
   public DelimitedPayloadTokenFilter(TokenStream input, char delimiter, PayloadEncoder encoder) {
     super(input);
-    termAtt = addAttribute(TermAttribute.class);
-    payAtt = addAttribute(PayloadAttribute.class);
     this.delimiter = delimiter;
     this.encoder = encoder;
   }
@@ -55,12 +53,12 @@ public final class DelimitedPayloadTokenFilter extends TokenFilter {
   @Override
   public boolean incrementToken() throws IOException {
     if (input.incrementToken()) {
-      final char[] buffer = termAtt.termBuffer();
-      final int length = termAtt.termLength();
+      final char[] buffer = termAtt.buffer();
+      final int length = termAtt.length();
       for (int i = 0; i < length; i++) {
         if (buffer[i] == delimiter) {
           payAtt.setPayload(encoder.encode(buffer, i + 1, (length - (i + 1))));
-          termAtt.setTermLength(i); // simply set a new length
+          termAtt.setLength(i); // simply set a new length
           return true;
         }
       }
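The setLength(i) call above truncates the term at the delimiter without any copying. A standalone sketch against CharTermAttributeImpl, the concrete implementation (values illustrative):

    CharTermAttribute t = new CharTermAttributeImpl();
    t.setEmpty().append("hello|0.5");
    // the loop finds the delimiter at i == 5 and encodes "0.5" as the payload;
    // the term itself is then simply cut short:
    t.setLength(5); // t.toString() is now "hello"
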
@@ -35,16 +35,14 @@ public class NumericPayloadTokenFilter extends TokenFilter {
   private String typeMatch;
   private Payload thePayload;
 
-  private PayloadAttribute payloadAtt;
-  private TypeAttribute typeAtt;
+  private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
+  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
 
   public NumericPayloadTokenFilter(TokenStream input, float payload, String typeMatch) {
     super(input);
     //Need to encode the payload
     thePayload = new Payload(PayloadHelper.encodeFloat(payload));
     this.typeMatch = typeMatch;
-    payloadAtt = addAttribute(PayloadAttribute.class);
-    typeAtt = addAttribute(TypeAttribute.class);
   }
 
   @Override
@@ -33,13 +33,11 @@ import org.apache.lucene.index.Payload;
  *
 **/
 public class TokenOffsetPayloadTokenFilter extends TokenFilter {
-  protected OffsetAttribute offsetAtt;
-  protected PayloadAttribute payAtt;
+  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+  private final PayloadAttribute payAtt = addAttribute(PayloadAttribute.class);
 
   public TokenOffsetPayloadTokenFilter(TokenStream input) {
     super(input);
-    offsetAtt = addAttribute(OffsetAttribute.class);
-    payAtt = addAttribute(PayloadAttribute.class);
   }
 
   @Override
@@ -33,13 +33,11 @@ import java.io.IOException;
  *
 **/
 public class TypeAsPayloadTokenFilter extends TokenFilter {
-  private PayloadAttribute payloadAtt;
-  private TypeAttribute typeAtt;
+  private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
+  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
 
   public TypeAsPayloadTokenFilter(TokenStream input) {
     super(input);
-    payloadAtt = addAttribute(PayloadAttribute.class);
-    typeAtt = addAttribute(TypeAttribute.class);
   }
 
 
@@ -35,7 +35,7 @@ public final class PositionFilter extends TokenFilter {
   /** The first token must have non-zero positionIncrement **/
   private boolean firstTokenPositioned = false;
 
-  private PositionIncrementAttribute posIncrAtt;
+  private PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
 
   /**
    * Constructs a PositionFilter that assigns a position increment of zero to
@@ -45,7 +45,6 @@ public final class PositionFilter extends TokenFilter {
    */
   public PositionFilter(final TokenStream input) {
     super(input);
-    posIncrAtt = addAttribute(PositionIncrementAttribute.class);
   }
 
   /**
@@ -19,7 +19,7 @@ package org.apache.lucene.analysis.reverse;
 
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.util.Version;
 
 import java.io.IOException;
@@ -42,7 +42,7 @@ import java.io.IOException;
 */
 public final class ReverseStringFilter extends TokenFilter {
 
-  private TermAttribute termAtt;
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final char marker;
   private final Version matchVersion;
   private static final char NOMARKER = '\uFFFF';
@@ -131,20 +131,19 @@ public final class ReverseStringFilter extends TokenFilter {
     super(in);
     this.matchVersion = matchVersion;
     this.marker = marker;
-    termAtt = addAttribute(TermAttribute.class);
   }
 
   @Override
   public boolean incrementToken() throws IOException {
     if (input.incrementToken()) {
-      int len = termAtt.termLength();
+      int len = termAtt.length();
       if (marker != NOMARKER) {
         len++;
-        termAtt.resizeTermBuffer(len);
-        termAtt.termBuffer()[len - 1] = marker;
+        termAtt.resizeBuffer(len);
+        termAtt.buffer()[len - 1] = marker;
       }
-      reverse( matchVersion, termAtt.termBuffer(), 0, len );
-      termAtt.setTermLength(len);
+      reverse( matchVersion, termAtt.buffer(), 0, len );
+      termAtt.setLength(len);
       return true;
     } else {
       return false;
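ReverseStringFilter edits the term buffer in place; resizeBuffer (formerly resizeTermBuffer) guarantees capacity while preserving the existing characters. The append-a-marker idiom in isolation (marker value arbitrary):

    CharTermAttribute t = new CharTermAttributeImpl();
    t.setEmpty().append("abc");
    int len = t.length() + 1;
    t.resizeBuffer(len);            // grow if needed; existing chars are kept
    t.buffer()[len - 1] = '\u0001'; // write the marker straight into the buffer
    t.setLength(len);               // make the marker part of the term
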
@@ -22,7 +22,7 @@ import java.io.IOException;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 
 /**
  * Normalizes token text to lower case.
@@ -32,20 +32,19 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 @Deprecated
 public final class RussianLowerCaseFilter extends TokenFilter
 {
-    private TermAttribute termAtt;
+    private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
 
     public RussianLowerCaseFilter(TokenStream in)
     {
         super(in);
-        termAtt = addAttribute(TermAttribute.class);
     }
 
     @Override
     public final boolean incrementToken() throws IOException
     {
       if (input.incrementToken()) {
-        char[] chArray = termAtt.termBuffer();
-        int chLen = termAtt.termLength();
+        char[] chArray = termAtt.buffer();
+        int chLen = termAtt.length();
        for (int i = 0; i < chLen; i++)
        {
          chArray[i] = Character.toLowerCase(chArray[i]);
@@ -22,7 +22,7 @@ import org.apache.lucene.analysis.miscellaneous.KeywordMarkerFilter; // for java
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.ru.RussianStemmer;//javadoc @link
 import org.apache.lucene.analysis.snowball.SnowballFilter; // javadoc @link
 
@@ -51,17 +51,14 @@ public final class RussianStemFilter extends TokenFilter
     /**
      * The actual token in the input stream.
      */
-    private RussianStemmer stemmer = null;
+    private RussianStemmer stemmer = new RussianStemmer();
 
-    private final TermAttribute termAtt;
-    private final KeywordAttribute keywordAttr;
+    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+    private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
 
     public RussianStemFilter(TokenStream in)
     {
         super(in);
-        stemmer = new RussianStemmer();
-        termAtt = addAttribute(TermAttribute.class);
-        keywordAttr = addAttribute(KeywordAttribute.class);
     }
     /**
      * Returns the next token in the stream, or null at EOS
@@ -71,10 +68,10 @@ public final class RussianStemFilter extends TokenFilter
     {
       if (input.incrementToken()) {
         if(!keywordAttr.isKeyword()) {
-          final String term = termAtt.term();
+          final String term = termAtt.toString();
           final String s = stemmer.stem(term);
           if (s != null && !s.equals(term))
-            termAtt.setTermBuffer(s);
+            termAtt.setEmpty().append(s);
         }
         return true;
       } else {
@@ -137,10 +137,10 @@ public final class ShingleFilter extends TokenFilter {
    */
   private boolean isOutputHere = false;
 
-  private final CharTermAttribute termAtt;
-  private final OffsetAttribute offsetAtt;
-  private final PositionIncrementAttribute posIncrAtt;
-  private final TypeAttribute typeAtt;
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
 
 
   /**
@@ -155,10 +155,6 @@ public final class ShingleFilter extends TokenFilter {
     super(input);
     setMaxShingleSize(maxShingleSize);
     setMinShingleSize(minShingleSize);
-    this.termAtt = addAttribute(CharTermAttribute.class);
-    this.offsetAtt = addAttribute(OffsetAttribute.class);
-    this.posIncrAtt = addAttribute(PositionIncrementAttribute.class);
-    this.typeAtt = addAttribute(TypeAttribute.class);
   }
 
   /**
@@ -31,11 +31,11 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.miscellaneous.EmptyTokenStream;
 import org.apache.lucene.analysis.payloads.PayloadHelper;
 import org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix.Column.Row;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.index.Payload;
 
@@ -193,14 +193,14 @@ public final class ShingleMatrixFilter extends TokenStream {
 
   private TokenStream input;
 
-  private TermAttribute termAtt;
+  private CharTermAttribute termAtt;
   private PositionIncrementAttribute posIncrAtt;
   private PayloadAttribute payloadAtt;
   private OffsetAttribute offsetAtt;
   private TypeAttribute typeAtt;
   private FlagsAttribute flagsAtt;
 
-  private TermAttribute in_termAtt;
+  private CharTermAttribute in_termAtt;
   private PositionIncrementAttribute in_posIncrAtt;
   private PayloadAttribute in_payloadAtt;
   private OffsetAttribute in_offsetAtt;
@@ -229,7 +229,7 @@ public final class ShingleMatrixFilter extends TokenStream {
     this.ignoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle;
     this.settingsCodec = settingsCodec;
 
-    termAtt = addAttribute(TermAttribute.class);
+    termAtt = addAttribute(CharTermAttribute.class);
     posIncrAtt = addAttribute(PositionIncrementAttribute.class);
     payloadAtt = addAttribute(PayloadAttribute.class);
     offsetAtt = addAttribute(OffsetAttribute.class);
@@ -239,7 +239,7 @@ public final class ShingleMatrixFilter extends TokenStream {
     // set the input to be an empty token stream, we already have the data.
     this.input = new EmptyTokenStream();
 
-    in_termAtt = input.addAttribute(TermAttribute.class);
+    in_termAtt = input.addAttribute(CharTermAttribute.class);
     in_posIncrAtt = input.addAttribute(PositionIncrementAttribute.class);
     in_payloadAtt = input.addAttribute(PayloadAttribute.class);
     in_offsetAtt = input.addAttribute(OffsetAttribute.class);
@@ -311,14 +311,14 @@ public final class ShingleMatrixFilter extends TokenStream {
     this.spacerCharacter = spacerCharacter;
     this.ignoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle;
     this.settingsCodec = settingsCodec;
-    termAtt = addAttribute(TermAttribute.class);
+    termAtt = addAttribute(CharTermAttribute.class);
     posIncrAtt = addAttribute(PositionIncrementAttribute.class);
     payloadAtt = addAttribute(PayloadAttribute.class);
     offsetAtt = addAttribute(OffsetAttribute.class);
     typeAtt = addAttribute(TypeAttribute.class);
     flagsAtt = addAttribute(FlagsAttribute.class);
 
-    in_termAtt = input.addAttribute(TermAttribute.class);
+    in_termAtt = input.addAttribute(CharTermAttribute.class);
     in_posIncrAtt = input.addAttribute(PositionIncrementAttribute.class);
     in_payloadAtt = input.addAttribute(PayloadAttribute.class);
     in_offsetAtt = input.addAttribute(OffsetAttribute.class);
@@ -377,7 +377,7 @@ public final class ShingleMatrixFilter extends TokenStream {
     if (token == null) return false;
 
     clearAttributes();
-    termAtt.setTermBuffer(token.termBuffer(), 0, token.termLength());
+    termAtt.copyBuffer(token.buffer(), 0, token.length());
     posIncrAtt.setPositionIncrement(token.getPositionIncrement());
     flagsAtt.setFlags(token.getFlags());
     offsetAtt.setOffset(token.startOffset(), token.endOffset());
@@ -388,7 +388,7 @@ public final class ShingleMatrixFilter extends TokenStream {
 
   private Token getNextInputToken(Token token) throws IOException {
     if (!input.incrementToken()) return null;
-    token.setTermBuffer(in_termAtt.termBuffer(), 0, in_termAtt.termLength());
+    token.copyBuffer(in_termAtt.buffer(), 0, in_termAtt.length());
     token.setPositionIncrement(in_posIncrAtt.getPositionIncrement());
     token.setFlags(in_flagsAtt.getFlags());
     token.setOffset(in_offsetAtt.startOffset(), in_offsetAtt.endOffset());
@@ -399,7 +399,7 @@ public final class ShingleMatrixFilter extends TokenStream {
 
   private Token getNextToken(Token token) throws IOException {
     if (!this.incrementToken()) return null;
-    token.setTermBuffer(termAtt.termBuffer(), 0, termAtt.termLength());
+    token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
     token.setPositionIncrement(posIncrAtt.getPositionIncrement());
     token.setFlags(flagsAtt.getFlags());
     token.setOffset(offsetAtt.startOffset(), offsetAtt.endOffset());
@@ -441,7 +441,7 @@ public final class ShingleMatrixFilter extends TokenStream {
 
     for (int i = 0; i < currentShingleLength; i++) {
       Token shingleToken = currentPermuationTokens.get(i + currentPermutationTokensStartOffset);
-      termLength += shingleToken.termLength();
+      termLength += shingleToken.length();
       shingle.add(shingleToken);
     }
     if (spacerCharacter != null) {
@@ -459,9 +459,9 @@ public final class ShingleMatrixFilter extends TokenStream {
       if (spacerCharacter != null && sb.length() > 0) {
         sb.append(spacerCharacter);
       }
-      sb.append(shingleToken.termBuffer(), 0, shingleToken.termLength());
+      sb.append(shingleToken.buffer(), 0, shingleToken.length());
     }
-    reusableToken.setTermBuffer(sb.toString());
+    reusableToken.setEmpty().append(sb);
     updateToken(reusableToken, shingle, currentPermutationTokensStartOffset, currentPermutationRows, currentPermuationTokens);
 
     return reusableToken;
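The Token hunks above work because Token exposes the same CharTermAttribute surface in this tree, so buffer()/length()/copyBuffer()/setEmpty() apply to tokens and attributes alike. Sketch (offsets arbitrary):

    Token tok = new Token(0, 5);
    tok.setEmpty().append("hello");
    Token copy = new Token(0, 5);
    copy.copyBuffer(tok.buffer(), 0, tok.length()); // same calls as on the attribute
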
@@ -21,7 +21,7 @@ import java.text.DateFormat;
 import java.text.ParseException;
 import java.util.Date;
 
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.util.AttributeSource;
 
 /**
@@ -34,7 +34,7 @@ public class DateRecognizerSinkFilter extends TeeSinkTokenFilter.SinkFilter {
   public static final String DATE_TYPE = "date";
 
   protected DateFormat dateFormat;
-  protected TermAttribute termAtt;
+  protected CharTermAttribute termAtt;
 
   /**
    * Uses {@link java.text.SimpleDateFormat#getDateInstance()} as the {@link java.text.DateFormat} object.
@@ -50,10 +50,10 @@ public class DateRecognizerSinkFilter extends TeeSinkTokenFilter.SinkFilter {
   @Override
   public boolean accept(AttributeSource source) {
     if (termAtt == null) {
-      termAtt = source.addAttribute(TermAttribute.class);
+      termAtt = source.addAttribute(CharTermAttribute.class);
     }
     try {
-      Date date = dateFormat.parse(termAtt.term());//We don't care about the date, just that we can parse it as a date
+      Date date = dateFormat.parse(termAtt.toString());//We don't care about the date, just that we can parse it as a date
       if (date != null) {
         return true;
       }
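termAtt.toString() replaces the old term() accessor. It builds a fresh String from the internal buffer on each call, so per-token hot paths generally prefer buffer()/length(); here a String is needed anyway for DateFormat.parse. Sketch:

    CharTermAttribute t = new CharTermAttributeImpl();
    t.setEmpty().append("5/31/2010");
    String s = t.toString(); // equivalent to the old term()
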
@@ -23,7 +23,7 @@ import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.core.LowerCaseFilter;
 import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tr.TurkishLowerCaseFilter; // javadoc @link
 import org.tartarus.snowball.SnowballProgram;
 
@@ -42,7 +42,7 @@ public final class SnowballFilter extends TokenFilter {
 
   private final SnowballProgram stemmer;
 
-  private final TermAttribute termAtt = addAttribute(TermAttribute.class);
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final KeywordAttribute keywordAttr = addAttribute(KeywordAttribute.class);
 
   public SnowballFilter(TokenStream input, SnowballProgram stemmer) {
@@ -76,16 +76,16 @@ public final class SnowballFilter extends TokenFilter {
   public final boolean incrementToken() throws IOException {
     if (input.incrementToken()) {
       if (!keywordAttr.isKeyword()) {
-        char termBuffer[] = termAtt.termBuffer();
-        final int length = termAtt.termLength();
+        char termBuffer[] = termAtt.buffer();
+        final int length = termAtt.length();
         stemmer.setCurrent(termBuffer, length);
         stemmer.stem();
         final char finalTerm[] = stemmer.getCurrentBuffer();
         final int newLength = stemmer.getCurrentBufferLength();
         if (finalTerm != termBuffer)
-          termAtt.setTermBuffer(finalTerm, 0, newLength);
+          termAtt.copyBuffer(finalTerm, 0, newLength);
         else
-          termAtt.setTermLength(newLength);
+          termAtt.setLength(newLength);
       }
       return true;
     } else {
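The branch above copies only when the stemmer produced a buffer of its own; if it stemmed in place, adjusting the length is enough. The same idiom in isolation (stemOutput stands in for a stemmer result):

    CharTermAttribute t = new CharTermAttributeImpl();
    t.setEmpty().append("running");
    char[] stemOutput = "run".toCharArray(); // pretend stemmer result
    if (stemOutput != t.buffer()) {
      t.copyBuffer(stemOutput, 0, stemOutput.length); // stemmer used its own buffer
    } else {
      t.setLength(stemOutput.length); // stemmer worked in place
    }
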
@@ -133,7 +133,7 @@ public final class SynonymFilter extends TokenFilter {
         OffsetAttribute lastOffsetAtt = lastTok.addAttribute(OffsetAttribute.class);
 
         newOffsetAtt.setOffset(newOffsetAtt.startOffset(), lastOffsetAtt.endOffset());
-        newTermAtt.copyBuffer(repTok.termBuffer(), 0, repTok.termLength());
+        newTermAtt.copyBuffer(repTok.buffer(), 0, repTok.length());
         repPos += repTok.getPositionIncrement();
         if (i==0) repPos=origPos; // make position of first token equal to original
 
@@ -103,8 +103,7 @@ public class SynonymMap {
     List<Token> ret = new ArrayList<Token>(strings.size());
     for (String str : strings) {
       //Token newTok = new Token(str,0,0,"SYNONYM");
-      Token newTok = new Token(0,0,"SYNONYM");
-      newTok.setTermBuffer(str.toCharArray(), 0, str.length());
+      Token newTok = new Token(str, 0,0,"SYNONYM");
       ret.add(newTok);
     }
     return ret;
@@ -137,7 +136,7 @@ public class SynonymMap {
     while(tok1!=null || tok2!=null) {
       while (tok1 != null && (pos1 <= pos2 || tok2==null)) {
         Token tok = new Token(tok1.startOffset(), tok1.endOffset(), tok1.type());
-        tok.setTermBuffer(tok1.termBuffer(), 0, tok1.termLength());
+        tok.copyBuffer(tok1.buffer(), 0, tok1.length());
         tok.setPositionIncrement(pos1-pos);
         result.add(tok);
         pos=pos1;
@@ -146,7 +145,7 @@ public class SynonymMap {
       }
       while (tok2 != null && (pos2 <= pos1 || tok1==null)) {
         Token tok = new Token(tok2.startOffset(), tok2.endOffset(), tok2.type());
-        tok.setTermBuffer(tok2.termBuffer(), 0, tok2.termLength());
+        tok.copyBuffer(tok2.buffer(), 0, tok2.length());
         tok.setPositionIncrement(pos2-pos);
         result.add(tok);
         pos=pos2;
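Token's CharSequence-taking constructor collapses the old create-then-setTermBuffer two-step, as the first hunk above shows. Side by side:

    // before: two steps and an extra char[]
    // Token syn = new Token(0, 0, "SYNONYM");
    // syn.setTermBuffer("car".toCharArray(), 0, 3);
    Token syn = new Token("car", 0, 0, "SYNONYM"); // after: one step
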
@@ -21,7 +21,7 @@ import java.io.IOException;
 
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 
 /**
  * Normalizes Turkish token text to lower case.
@@ -37,7 +37,7 @@ public final class TurkishLowerCaseFilter extends TokenFilter {
   private static final int LATIN_SMALL_LETTER_I = '\u0069';
   private static final int LATIN_SMALL_LETTER_DOTLESS_I = '\u0131';
   private static final int COMBINING_DOT_ABOVE = '\u0307';
-  private final TermAttribute termAtt;
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
 
   /**
    * Create a new TurkishLowerCaseFilter, that normalizes Turkish token text
@@ -47,7 +47,6 @@ public final class TurkishLowerCaseFilter extends TokenFilter {
    */
   public TurkishLowerCaseFilter(TokenStream in) {
     super(in);
-    termAtt = addAttribute(TermAttribute.class);
   }
 
   @Override
@@ -55,8 +54,8 @@ public final class TurkishLowerCaseFilter extends TokenFilter {
     boolean iOrAfter = false;
 
     if (input.incrementToken()) {
-      final char[] buffer = termAtt.termBuffer();
-      int length = termAtt.termLength();
+      final char[] buffer = termAtt.buffer();
+      int length = termAtt.length();
       for (int i = 0; i < length;) {
         final int ch = Character.codePointAt(buffer, i);
 
@@ -88,7 +87,7 @@ public final class TurkishLowerCaseFilter extends TokenFilter {
           i += Character.toChars(Character.toLowerCase(ch), buffer, i);
       }
 
-      termAtt.setTermLength(length);
+      termAtt.setLength(length);
       return true;
     } else
       return false;
@@ -18,10 +18,10 @@
 package org.apache.lucene.analysis.wikipedia;
 
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 import org.apache.lucene.util.AttributeSource;
 
@@ -116,11 +116,11 @@ public final class WikipediaTokenizer extends Tokenizer {
   private Set<String> untokenizedTypes = Collections.emptySet();
   private Iterator<AttributeSource.State> tokens = null;
 
-  private OffsetAttribute offsetAtt;
-  private TypeAttribute typeAtt;
-  private PositionIncrementAttribute posIncrAtt;
-  private TermAttribute termAtt;
-  private FlagsAttribute flagsAtt;
+  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
 
   /**
    * Creates a new instance of the {@link WikipediaTokenizer}. Attaches the
@@ -176,12 +176,7 @@ public final class WikipediaTokenizer extends Tokenizer {
 
   private void init(int tokenOutput, Set<String> untokenizedTypes) {
     this.tokenOutput = tokenOutput;
     this.untokenizedTypes = untokenizedTypes;
-    this.offsetAtt = addAttribute(OffsetAttribute.class);
-    this.typeAtt = addAttribute(TypeAttribute.class);
-    this.posIncrAtt = addAttribute(PositionIncrementAttribute.class);
-    this.termAtt = addAttribute(TermAttribute.class);
-    this.flagsAtt = addAttribute(FlagsAttribute.class);
   }
 
   /*
@@ -245,8 +240,9 @@ public final class WikipediaTokenizer extends Tokenizer {
       lastPos = currPos + numAdded;
     }
     //trim the buffer
+    // TODO: this is inefficient
     String s = buffer.toString().trim();
-    termAtt.setTermBuffer(s.toCharArray(), 0, s.length());
+    termAtt.setEmpty().append(s);
     offsetAtt.setOffset(correctOffset(theStart), correctOffset(theStart + s.length()));
     flagsAtt.setFlags(UNTOKENIZED_TOKEN_FLAG);
     //The way the loop is written, we will have proceeded to the next token. We need to pushback the scanner to lastPos
@@ -283,8 +279,9 @@ public final class WikipediaTokenizer extends Tokenizer {
       lastPos = currPos + numAdded;
     }
     //trim the buffer
+    // TODO: this is inefficient
    String s = buffer.toString().trim();
-    termAtt.setTermBuffer(s.toCharArray(), 0, s.length());
+    termAtt.setEmpty().append(s);
     offsetAtt.setOffset(correctOffset(theStart), correctOffset(theStart + s.length()));
     flagsAtt.setFlags(UNTOKENIZED_TOKEN_FLAG);
     //The way the loop is written, we will have proceeded to the next token. We need to pushback the scanner to lastPos
@@ -298,7 +295,7 @@ public final class WikipediaTokenizer extends Tokenizer {
   private void setupToken() {
     scanner.getText(termAtt);
     final int start = scanner.yychar();
-    offsetAtt.setOffset(correctOffset(start), correctOffset(start + termAtt.termLength()));
+    offsetAtt.setOffset(correctOffset(start), correctOffset(start + termAtt.length()));
   }
 
   /*
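setEmpty().append(s) writes the String directly into the attribute's buffer, where the replaced setTermBuffer(s.toCharArray(), 0, s.length()) allocated a throwaway char[] first; the added TODO notes the remaining StringBuilder round-trip. Sketch:

    String s = buffer.toString().trim(); // buffer is a StringBuilder, as above
    // was: termAtt.setTermBuffer(s.toCharArray(), 0, s.length());
    termAtt.setEmpty().append(s);        // no intermediate char[]
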
@ -1,4 +1,4 @@
|
||||||
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 17.05.10 14:51 */
|
/* The following code was generated by JFlex 1.5.0-SNAPSHOT on 5/31/10 3:11 PM */
|
||||||
|
|
||||||
package org.apache.lucene.analysis.wikipedia;
|
package org.apache.lucene.analysis.wikipedia;
|
||||||
|
|
||||||
|
@ -19,14 +19,14 @@ package org.apache.lucene.analysis.wikipedia;
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This class is a scanner generated by
|
* This class is a scanner generated by
|
||||||
* <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
|
* <a href="http://www.jflex.de/">JFlex</a> 1.5.0-SNAPSHOT
|
||||||
* on 17.05.10 14:51 from the specification file
|
* on 5/31/10 3:11 PM from the specification file
|
||||||
* <tt>C:/Users/Uwe Schindler/Projects/lucene/newtrunk/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex</tt>
|
* <tt>C:/Users/rmuir/workspace/solrcene/modules/analysis/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex</tt>
|
||||||
*/
|
*/
|
||||||
class WikipediaTokenizerImpl {
|
class WikipediaTokenizerImpl {
|
||||||
|
|
||||||
|
@ -37,16 +37,16 @@ class WikipediaTokenizerImpl {
|
||||||
private static final int ZZ_BUFFERSIZE = 16384;
|
private static final int ZZ_BUFFERSIZE = 16384;
|
||||||
|
|
||||||
/** lexical states */
|
/** lexical states */
|
||||||
public static final int CATEGORY_STATE = 2;
|
public static final int THREE_SINGLE_QUOTES_STATE = 10;
|
||||||
public static final int DOUBLE_EQUALS_STATE = 14;
|
|
||||||
public static final int EXTERNAL_LINK_STATE = 6;
|
public static final int EXTERNAL_LINK_STATE = 6;
|
||||||
|
public static final int DOUBLE_EQUALS_STATE = 14;
|
||||||
public static final int INTERNAL_LINK_STATE = 4;
|
public static final int INTERNAL_LINK_STATE = 4;
|
||||||
public static final int DOUBLE_BRACE_STATE = 16;
|
public static final int DOUBLE_BRACE_STATE = 16;
|
||||||
public static final int FIVE_SINGLE_QUOTES_STATE = 12;
|
public static final int CATEGORY_STATE = 2;
|
||||||
public static final int STRING = 18;
|
|
||||||
public static final int TWO_SINGLE_QUOTES_STATE = 8;
|
|
||||||
public static final int YYINITIAL = 0;
|
public static final int YYINITIAL = 0;
|
||||||
public static final int THREE_SINGLE_QUOTES_STATE = 10;
|
public static final int STRING = 18;
|
||||||
|
public static final int FIVE_SINGLE_QUOTES_STATE = 12;
|
||||||
|
public static final int TWO_SINGLE_QUOTES_STATE = 8;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l
|
* ZZ_LEXSTATE[l] is the state in the DFA for the lexical state l
|
||||||
|
@ -487,8 +487,8 @@ public final int getPositionIncrement(){
|
||||||
/**
|
/**
|
||||||
* Fills Lucene token with the current token text.
|
* Fills Lucene token with the current token text.
|
||||||
*/
|
*/
|
||||||
final void getText(TermAttribute t) {
|
final void getText(CharTermAttribute t) {
|
||||||
t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
|
t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
|
||||||
}
|
}
|
||||||
|
|
||||||
final int setText(StringBuilder buffer){
|
final int setText(StringBuilder buffer){
|
||||||
@@ -803,184 +803,184 @@ final int setText(StringBuilder buffer){
       zzMarkedPos = zzMarkedPosL;

       switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
-        case 25:
-          { numWikiTokensSeen = 0; positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);
-          }
-        case 46: break;
-        case 30:
-          { numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end italics*/
-          }
-        case 47: break;
-        case 41:
-          { numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end bold italics*/
-          }
-        case 48: break;
-        case 14:
-          { yybegin(STRING); numWikiTokensSeen++; return currentTokType;
-          }
-        case 49: break;
-        case 23:
-          { numWikiTokensSeen = 0; positionInc = 1; yybegin(DOUBLE_EQUALS_STATE);
-          }
-        case 50: break;
-        case 34:
-          { positionInc = 1; return NUM;
-          }
-        case 51: break;
-        case 18:
-          { /* ignore STRING */
-          }
-        case 52: break;
-        case 12:
-          { currentTokType = ITALICS; numWikiTokensSeen++; yybegin(STRING); return currentTokType;/*italics*/
-          }
-        case 53: break;
-        case 37:
-          { numBalanced = 0;currentTokType = ALPHANUM;yybegin(YYINITIAL);/*end bold*/
-          }
-        case 54: break;
-        case 31:
-          { numBalanced = 0; numWikiTokensSeen = 0; currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE);
-          }
-        case 55: break;
-        case 10:
-          { numLinkToks = 0; positionInc = 0; yybegin(YYINITIAL);
-          }
-        case 56: break;
-        case 38:
-          { numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end sub header*/
-          }
-        case 57: break;
-        case 19:
-          { yybegin(STRING); numWikiTokensSeen++; return currentTokType;/* STRING ALPHANUM*/
-          }
-        case 58: break;
-        case 11:
-          { currentTokType = BOLD; yybegin(THREE_SINGLE_QUOTES_STATE);
-          }
-        case 59: break;
-        case 1:
-          { numWikiTokensSeen = 0; positionInc = 1;
-          }
-        case 60: break;
-        case 33:
-          { positionInc = 1; return HOST;
-          }
-        case 61: break;
-        case 3:
-          { positionInc = 1; return CJ;
-          }
-        case 62: break;
-        case 17:
-          { yybegin(DOUBLE_BRACE_STATE); numWikiTokensSeen = 0; return currentTokType;
-          }
-        case 63: break;
-        case 32:
-          { positionInc = 1; return APOSTROPHE;
-          }
-        case 64: break;
-        case 8:
-          { /* ignore */
-          }
-        case 65: break;
-        case 4:
-          { numWikiTokensSeen = 0; positionInc = 1; currentTokType = EXTERNAL_LINK_URL; yybegin(EXTERNAL_LINK_STATE);
-          }
-        case 66: break;
-        case 2:
-          { positionInc = 1; return ALPHANUM;
-          }
-        case 67: break;
-        case 26:
-          { yybegin(YYINITIAL);
-          }
-        case 68: break;
-        case 43:
-          { numWikiTokensSeen = 0; positionInc = 1; currentTokType = CATEGORY; yybegin(CATEGORY_STATE);
-          }
-        case 69: break;
-        case 36:
-          { currentTokType = BOLD_ITALICS; yybegin(FIVE_SINGLE_QUOTES_STATE);
-          }
-        case 70: break;
-        case 13:
-          { currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE);
-          }
-        case 71: break;
-        case 24:
-          { numWikiTokensSeen = 0; positionInc = 1; currentTokType = INTERNAL_LINK; yybegin(INTERNAL_LINK_STATE);
-          }
-        case 72: break;
-        case 27:
-          { numLinkToks = 0; yybegin(YYINITIAL);
-          }
-        case 73: break;
-        case 15:
-          { currentTokType = SUB_HEADING; numWikiTokensSeen = 0; yybegin(STRING);
-          }
-        case 74: break;
-        case 28:
-          { currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE);
-          }
-        case 75: break;
-        case 39:
-          { positionInc = 1; return ACRONYM;
-          }
-        case 76: break;
-        case 29:
-          { currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE);
-          }
-        case 77: break;
-        case 7:
-          { yybegin(INTERNAL_LINK_STATE); numWikiTokensSeen++; return currentTokType;
-          }
-        case 78: break;
         case 16:
           { currentTokType = HEADING; yybegin(DOUBLE_EQUALS_STATE); numWikiTokensSeen++; return currentTokType;
           }
-        case 79: break;
+        case 46: break;
+        case 39:
+          { positionInc = 1; return ACRONYM;
+          }
+        case 47: break;
+        case 8:
+          { /* ignore */
+          }
+        case 48: break;
         case 20:
           { numBalanced = 0; numWikiTokensSeen = 0; currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE);
           }
-        case 80: break;
+        case 49: break;
         case 35:
           { positionInc = 1; return COMPANY;
           }
+        case 50: break;
+        case 4:
+          { numWikiTokensSeen = 0; positionInc = 1; currentTokType = EXTERNAL_LINK_URL; yybegin(EXTERNAL_LINK_STATE);
+          }
+        case 51: break;
+        case 25:
+          { numWikiTokensSeen = 0; positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);
+          }
+        case 52: break;
+        case 43:
+          { numWikiTokensSeen = 0; positionInc = 1; currentTokType = CATEGORY; yybegin(CATEGORY_STATE);
+          }
+        case 53: break;
+        case 22:
+          { numWikiTokensSeen = 0; positionInc = 1; if (numBalanced == 0){numBalanced++;yybegin(TWO_SINGLE_QUOTES_STATE);} else{numBalanced = 0;}
+          }
+        case 54: break;
+        case 34:
+          { positionInc = 1; return NUM;
+          }
+        case 55: break;
+        case 32:
+          { positionInc = 1; return APOSTROPHE;
+          }
+        case 56: break;
+        case 23:
+          { numWikiTokensSeen = 0; positionInc = 1; yybegin(DOUBLE_EQUALS_STATE);
+          }
+        case 57: break;
+        case 21:
+          { yybegin(STRING); return currentTokType;/*pipe*/
+          }
+        case 58: break;
+        case 2:
+          { positionInc = 1; return ALPHANUM;
+          }
+        case 59: break;
+        case 29:
+          { currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE);
+          }
+        case 60: break;
+        case 17:
+          { yybegin(DOUBLE_BRACE_STATE); numWikiTokensSeen = 0; return currentTokType;
+          }
+        case 61: break;
+        case 44:
+          { currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE);
+          }
+        case 62: break;
+        case 26:
+          { yybegin(YYINITIAL);
+          }
+        case 63: break;
+        case 3:
+          { positionInc = 1; return CJ;
+          }
+        case 64: break;
+        case 38:
+          { numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end sub header*/
+          }
+        case 65: break;
+        case 15:
+          { currentTokType = SUB_HEADING; numWikiTokensSeen = 0; yybegin(STRING);
+          }
+        case 66: break;
+        case 30:
+          { numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end italics*/
+          }
+        case 67: break;
+        case 6:
+          { yybegin(CATEGORY_STATE); numWikiTokensSeen++; return currentTokType;
+          }
+        case 68: break;
+        case 5:
+          { positionInc = 1;
+          }
+        case 69: break;
+        case 19:
+          { yybegin(STRING); numWikiTokensSeen++; return currentTokType;/* STRING ALPHANUM*/
+          }
+        case 70: break;
+        case 42:
+          { positionInc = 1; numWikiTokensSeen++; yybegin(EXTERNAL_LINK_STATE); return currentTokType;
+          }
+        case 71: break;
+        case 27:
+          { numLinkToks = 0; yybegin(YYINITIAL);
+          }
+        case 72: break;
+        case 11:
+          { currentTokType = BOLD; yybegin(THREE_SINGLE_QUOTES_STATE);
+          }
+        case 73: break;
+        case 13:
+          { currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE);
+          }
+        case 74: break;
+        case 14:
+          { yybegin(STRING); numWikiTokensSeen++; return currentTokType;
+          }
+        case 75: break;
+        case 45:
+          { numBalanced = 0; numWikiTokensSeen = 0; currentTokType = CATEGORY;yybegin(CATEGORY_STATE);
+          }
+        case 76: break;
+        case 28:
+          { currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE);
+          }
+        case 77: break;
+        case 37:
+          { numBalanced = 0;currentTokType = ALPHANUM;yybegin(YYINITIAL);/*end bold*/
+          }
+        case 78: break;
+        case 9:
+          { if (numLinkToks == 0){positionInc = 0;} else{positionInc = 1;} numWikiTokensSeen++; currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE); numLinkToks++; return currentTokType;
+          }
+        case 79: break;
+        case 7:
+          { yybegin(INTERNAL_LINK_STATE); numWikiTokensSeen++; return currentTokType;
+          }
+        case 80: break;
+        case 24:
+          { numWikiTokensSeen = 0; positionInc = 1; currentTokType = INTERNAL_LINK; yybegin(INTERNAL_LINK_STATE);
+          }
         case 81: break;
         case 40:
           { positionInc = 1; return EMAIL;
           }
         case 82: break;
-        case 42:
-          { positionInc = 1; numWikiTokensSeen++; yybegin(EXTERNAL_LINK_STATE); return currentTokType;
+        case 1:
+          { numWikiTokensSeen = 0; positionInc = 1;
           }
         case 83: break;
-        case 6:
-          { yybegin(CATEGORY_STATE); numWikiTokensSeen++; return currentTokType;
+        case 18:
+          { /* ignore STRING */
           }
         case 84: break;
-        case 44:
-          { currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE);
+        case 36:
+          { currentTokType = BOLD_ITALICS; yybegin(FIVE_SINGLE_QUOTES_STATE);
           }
         case 85: break;
-        case 5:
-          { positionInc = 1;
+        case 33:
+          { positionInc = 1; return HOST;
           }
         case 86: break;
-        case 9:
-          { if (numLinkToks == 0){positionInc = 0;} else{positionInc = 1;} numWikiTokensSeen++; currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE); numLinkToks++; return currentTokType;
+        case 31:
+          { numBalanced = 0; numWikiTokensSeen = 0; currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE);
           }
         case 87: break;
-        case 45:
-          { numBalanced = 0; numWikiTokensSeen = 0; currentTokType = CATEGORY;yybegin(CATEGORY_STATE);
+        case 41:
+          { numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end bold italics*/
           }
         case 88: break;
-        case 22:
-          { numWikiTokensSeen = 0; positionInc = 1; if (numBalanced == 0){numBalanced++;yybegin(TWO_SINGLE_QUOTES_STATE);} else{numBalanced = 0;}
+        case 12:
+          { currentTokType = ITALICS; numWikiTokensSeen++; yybegin(STRING); return currentTokType;/*italics*/
          }
         case 89: break;
-        case 21:
-          { yybegin(STRING); return currentTokType;/*pipe*/
+        case 10:
+          { numLinkToks = 0; positionInc = 0; yybegin(YYINITIAL);
           }
         case 90: break;
         default:
@@ -17,7 +17,7 @@ package org.apache.lucene.analysis.wikipedia;
  * limitations under the License.
  */

-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

 %%

@@ -81,8 +81,8 @@ public final int getPositionIncrement(){
 /**
  * Fills Lucene token with the current token text.
  */
-final void getText(TermAttribute t) {
-  t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
+final void getText(CharTermAttribute t) {
+  t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
 }

 final int setText(StringBuilder buffer){
@@ -17,8 +17,6 @@ package org.apache.lucene.analysis.compound;
  * limitations under the License.
  */

-import java.io.File;
-import java.io.FileInputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
 import java.io.StringReader;
@@ -27,7 +25,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

 public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
   public void testHyphenationCompoundWordsDA() throws Exception {
@@ -176,15 +174,15 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
         CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
         CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);

-    TermAttribute termAtt = tf.getAttribute(TermAttribute.class);
+    CharTermAttribute termAtt = tf.getAttribute(CharTermAttribute.class);
     assertTrue(tf.incrementToken());
-    assertEquals("Rindfleischüberwachungsgesetz", termAtt.term());
+    assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
     assertTrue(tf.incrementToken());
-    assertEquals("Rind", termAtt.term());
+    assertEquals("Rind", termAtt.toString());
     wsTokenizer.reset(new StringReader("Rindfleischüberwachungsgesetz"));
     tf.reset();
     assertTrue(tf.incrementToken());
-    assertEquals("Rindfleischüberwachungsgesetz", termAtt.term());
+    assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
   }

   private Reader getHyphenationReader() throws Exception {
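
Note: the tests in this hunk read terms through CharTermAttribute, which implements
CharSequence, so toString() replaces TermAttribute.term(). A minimal consumer-loop
sketch of that pattern (a generic helper for illustration, not code from this commit):

    import java.io.IOException;
    import java.util.ArrayList;
    import java.util.List;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    /** Collects every term of a stream as a String. */
    static List<String> collectTerms(TokenStream stream) throws IOException {
      List<String> terms = new ArrayList<String>();
      CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
      stream.reset();
      while (stream.incrementToken()) {
        terms.add(termAtt.toString()); // was termAtt.term() with TermAttribute
      }
      stream.close();
      return terms;
    }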
@@ -28,6 +28,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;

 /**
@@ -50,9 +51,9 @@ public class TestElision extends BaseTokenStreamTestCase {

   private List<String> filter(TokenFilter filter) throws IOException {
     List<String> tas = new ArrayList<String>();
-    TermAttribute termAtt = filter.getAttribute(TermAttribute.class);
+    CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
     while (filter.incrementToken()) {
-      tas.add(termAtt.term());
+      tas.add(termAtt.toString());
     }
     return tas;
   }
@@ -41,8 +41,6 @@ public class TestPrefixAndSuffixAwareTokenFilter extends BaseTokenStreamTestCase

   private static Token createToken(String term, int start, int offset)
   {
-    Token token = new Token(start, offset);
-    token.setTermBuffer(term);
-    return token;
+    return new Token(term, start, offset);
   }
 }
@@ -52,8 +52,6 @@ public class TestPrefixAwareTokenFilter extends BaseTokenStreamTestCase {

   private static Token createToken(String term, int start, int offset)
   {
-    Token token = new Token(start, offset);
-    token.setTermBuffer(term);
-    return token;
+    return new Token(term, start, offset);
   }
 }
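
Note: both createToken() helpers above now use the Token(String, int, int)
constructor, which sets the term text and both offsets in one call. A before/after
sketch of the same helper body:

    import org.apache.lucene.analysis.Token;

    // before: construct with offsets, then fill the term buffer separately
    Token legacy = new Token(0, 4);
    legacy.setTermBuffer("text");

    // after: term text, start offset and end offset in a single constructor call
    Token token = new Token("text", 0, 4);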
@@ -51,7 +51,7 @@ public class TestRemoveDuplicatesTokenFilter extends BaseTokenStreamTestCase {
         if (toks.hasNext()) {
           clearAttributes();
           Token tok = toks.next();
-          termAtt.setEmpty().append(tok.term());
+          termAtt.setEmpty().append(tok);
           offsetAtt.setOffset(tok.startOffset(), tok.endOffset());
           posIncAtt.setPositionIncrement(tok.getPositionIncrement());
           return true;
@@ -22,14 +22,14 @@ import java.io.IOException;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.AttributeImpl;
 import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

 public class TestSingleTokenTokenFilter extends LuceneTestCase {

   public void test() throws IOException {
     Token token = new Token();
     SingleTokenTokenStream ts = new SingleTokenTokenStream(token);
-    AttributeImpl tokenAtt = (AttributeImpl) ts.addAttribute(TermAttribute.class);
+    AttributeImpl tokenAtt = (AttributeImpl) ts.addAttribute(CharTermAttribute.class);
     assertTrue(tokenAtt instanceof Token);
     ts.reset();

@@ -97,7 +97,7 @@ public class TestTrimFilter extends BaseTokenStreamTestCase {
       else {
         clearAttributes();
         Token token = tokens[index++];
-        termAtt.setEmpty().append(token.term());
+        termAtt.setEmpty().append(token);
         offsetAtt.setOffset(token.startOffset(), token.endOffset());
         posIncAtt.setPositionIncrement(token.getPositionIncrement());
         flagsAtt.setFlags(token.getFlags());
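
Note: in the two hunks above, setEmpty().append(tok) can take the Token directly
because Token exposes the CharTermAttribute interface (a CharSequence) after this
migration, so no intermediate String from tok.term() is needed:

    // before: copies the term out of the token, then appends the String
    termAtt.setEmpty().append(tok.term());
    // after: appends the token's characters directly, no String allocation
    termAtt.setEmpty().append(tok);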
@@ -18,8 +18,8 @@ package org.apache.lucene.analysis.payloads;

 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.index.Payload;
 import org.apache.lucene.util.LuceneTestCase;

@@ -32,7 +32,7 @@ public class DelimitedPayloadTokenFilterTest extends LuceneTestCase {
     DelimitedPayloadTokenFilter filter = new DelimitedPayloadTokenFilter
       (new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(test)),
        DelimitedPayloadTokenFilter.DEFAULT_DELIMITER, new IdentityEncoder());
-    TermAttribute termAtt = filter.getAttribute(TermAttribute.class);
+    CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
     PayloadAttribute payAtt = filter.getAttribute(PayloadAttribute.class);
     assertTermEquals("The", filter, termAtt, payAtt, null);
     assertTermEquals("quick", filter, termAtt, payAtt, "JJ".getBytes("UTF-8"));
@@ -70,7 +70,7 @@ public class DelimitedPayloadTokenFilterTest extends LuceneTestCase {
   public void testFloatEncoding() throws Exception {
     String test = "The quick|1.0 red|2.0 fox|3.5 jumped|0.5 over the lazy|5 brown|99.3 dogs|83.7";
     DelimitedPayloadTokenFilter filter = new DelimitedPayloadTokenFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(test)), '|', new FloatEncoder());
-    TermAttribute termAtt = filter.getAttribute(TermAttribute.class);
+    CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
     PayloadAttribute payAtt = filter.getAttribute(PayloadAttribute.class);
     assertTermEquals("The", filter, termAtt, payAtt, null);
     assertTermEquals("quick", filter, termAtt, payAtt, PayloadHelper.encodeFloat(1.0f));
@@ -88,7 +88,7 @@ public class DelimitedPayloadTokenFilterTest extends LuceneTestCase {
   public void testIntEncoding() throws Exception {
     String test = "The quick|1 red|2 fox|3 jumped over the lazy|5 brown|99 dogs|83";
     DelimitedPayloadTokenFilter filter = new DelimitedPayloadTokenFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(test)), '|', new IntegerEncoder());
-    TermAttribute termAtt = filter.getAttribute(TermAttribute.class);
+    CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
     PayloadAttribute payAtt = filter.getAttribute(PayloadAttribute.class);
     assertTermEquals("The", filter, termAtt, payAtt, null);
     assertTermEquals("quick", filter, termAtt, payAtt, PayloadHelper.encodeInt(1));
@@ -104,10 +104,10 @@ public class DelimitedPayloadTokenFilterTest extends LuceneTestCase {
   }

   void assertTermEquals(String expected, TokenStream stream, byte[] expectPay) throws Exception {
-    TermAttribute termAtt = stream.getAttribute(TermAttribute.class);
+    CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
     PayloadAttribute payloadAtt = stream.getAttribute(PayloadAttribute.class);
     assertTrue(stream.incrementToken());
-    assertEquals(expected, termAtt.term());
+    assertEquals(expected, termAtt.toString());
     Payload payload = payloadAtt.getPayload();
     if (payload != null) {
       assertTrue(payload.length() + " does not equal: " + expectPay.length, payload.length() == expectPay.length);
@@ -121,9 +121,9 @@ public class DelimitedPayloadTokenFilterTest extends LuceneTestCase {
   }


-  void assertTermEquals(String expected, TokenStream stream, TermAttribute termAtt, PayloadAttribute payAtt, byte[] expectPay) throws Exception {
+  void assertTermEquals(String expected, TokenStream stream, CharTermAttribute termAtt, PayloadAttribute payAtt, byte[] expectPay) throws Exception {
     assertTrue(stream.incrementToken());
-    assertEquals(expected, termAtt.term());
+    assertEquals(expected, termAtt.toString());
     Payload payload = payAtt.getPayload();
     if (payload != null) {
       assertTrue(payload.length() + " does not equal: " + expectPay.length, payload.length() == expectPay.length);
@@ -20,8 +20,8 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

 import java.io.IOException;
@@ -39,11 +39,11 @@ public class NumericPayloadTokenFilterTest extends BaseTokenStreamTestCase {

     NumericPayloadTokenFilter nptf = new NumericPayloadTokenFilter(new WordTokenFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(test))), 3, "D");
     boolean seenDogs = false;
-    TermAttribute termAtt = nptf.getAttribute(TermAttribute.class);
+    CharTermAttribute termAtt = nptf.getAttribute(CharTermAttribute.class);
     TypeAttribute typeAtt = nptf.getAttribute(TypeAttribute.class);
     PayloadAttribute payloadAtt = nptf.getAttribute(PayloadAttribute.class);
     while (nptf.incrementToken()) {
-      if (termAtt.term().equals("dogs")) {
+      if (termAtt.toString().equals("dogs")) {
         seenDogs = true;
         assertTrue(typeAtt.type() + " is not equal to " + "D", typeAtt.type().equals("D") == true);
         assertTrue("payloadAtt.getPayload() is null and it shouldn't be", payloadAtt.getPayload() != null);
@@ -60,19 +60,17 @@ public class NumericPayloadTokenFilterTest extends BaseTokenStreamTestCase {
   }

   private final class WordTokenFilter extends TokenFilter {
-    private TermAttribute termAtt;
-    private TypeAttribute typeAtt;
+    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+    private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);

     private WordTokenFilter(TokenStream input) {
       super(input);
-      termAtt = addAttribute(TermAttribute.class);
-      typeAtt = addAttribute(TypeAttribute.class);
     }

     @Override
     public boolean incrementToken() throws IOException {
       if (input.incrementToken()) {
-        if (termAtt.term().equals("dogs"))
+        if (termAtt.toString().equals("dogs"))
           typeAtt.setType("D");
         return true;
       } else {
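
Note: the WordTokenFilter change above also moves attribute registration out of the
constructor and into final field initializers, the idiomatic form for new code. A
minimal filter skeleton in that style (the class name is illustrative only):

    import java.io.IOException;
    import org.apache.lucene.analysis.TokenFilter;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    final class PassThroughFilter extends TokenFilter {
      // addAttribute() registers (or reuses) the attribute on the shared
      // AttributeSource as soon as the instance is created.
      private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

      PassThroughFilter(TokenStream input) {
        super(input);
      }

      @Override
      public boolean incrementToken() throws IOException {
        return input.incrementToken(); // termAtt is filled by the input stream
      }
    }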
@@ -21,7 +21,7 @@ import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
 import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

 import java.io.IOException;
@@ -39,12 +39,12 @@ public class TypeAsPayloadTokenFilterTest extends BaseTokenStreamTestCase {

     TypeAsPayloadTokenFilter nptf = new TypeAsPayloadTokenFilter(new WordTokenFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(test))));
     int count = 0;
-    TermAttribute termAtt = nptf.getAttribute(TermAttribute.class);
+    CharTermAttribute termAtt = nptf.getAttribute(CharTermAttribute.class);
     TypeAttribute typeAtt = nptf.getAttribute(TypeAttribute.class);
     PayloadAttribute payloadAtt = nptf.getAttribute(PayloadAttribute.class);

     while (nptf.incrementToken()) {
-      assertTrue(typeAtt.type() + " is not null and it should be", typeAtt.type().equals(String.valueOf(Character.toUpperCase(termAtt.termBuffer()[0]))));
+      assertTrue(typeAtt.type() + " is not null and it should be", typeAtt.type().equals(String.valueOf(Character.toUpperCase(termAtt.buffer()[0]))));
       assertTrue("nextToken.getPayload() is null and it shouldn't be", payloadAtt.getPayload() != null);
       String type = new String(payloadAtt.getPayload().getData(), "UTF-8");
       assertTrue(type + " is not equal to " + typeAtt.type(), type.equals(typeAtt.type()) == true);
@@ -55,19 +55,17 @@ public class TypeAsPayloadTokenFilterTest extends BaseTokenStreamTestCase {
   }

   private final class WordTokenFilter extends TokenFilter {
-    private TermAttribute termAtt;
-    private TypeAttribute typeAtt;
+    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+    private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);

     private WordTokenFilter(TokenStream input) {
       super(input);
-      termAtt = addAttribute(TermAttribute.class);
-      typeAtt = addAttribute(TypeAttribute.class);
     }

     @Override
     public boolean incrementToken() throws IOException {
       if (input.incrementToken()) {
-        typeAtt.setType(String.valueOf(Character.toUpperCase(termAtt.termBuffer()[0])));
+        typeAtt.setType(String.valueOf(Character.toUpperCase(termAtt.buffer()[0])));
         return true;
       } else {
         return false;
@@ -22,7 +22,7 @@ import java.io.IOException;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.shingle.ShingleFilter;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

 public class PositionFilterTest extends BaseTokenStreamTestCase {

@@ -30,19 +30,18 @@ public class PositionFilterTest extends BaseTokenStreamTestCase {

     protected int index = 0;
     protected String[] testToken;
-    protected TermAttribute termAtt;
+    protected final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

     public TestTokenStream(String[] testToken) {
       super();
       this.testToken = testToken;
-      termAtt = addAttribute(TermAttribute.class);
     }

     @Override
     public final boolean incrementToken() throws IOException {
       clearAttributes();
       if (index < testToken.length) {
-        termAtt.setTermBuffer(testToken[index++]);
+        termAtt.setEmpty().append(testToken[index++]);
         return true;
       } else {
         return false;
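
Note: setEmpty().append(...) is the replacement for setTermBuffer(String). setEmpty()
resets the term length to zero and returns the attribute, and append() then writes
the new text. The reset matters because the attribute keeps its character buffer
across tokens; a small sketch of the difference (fragment, assuming termAtt was
obtained via addAttribute):

    termAtt.setEmpty().append("first");   // term is "first"
    termAtt.append("-more");              // term is now "first-more" (append concatenates)
    termAtt.setEmpty().append("second");  // term is "second" again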
@@ -26,7 +26,6 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.core.LetterTokenizer;
 import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.index.IndexReader;
@@ -176,9 +175,6 @@ public class QueryAutoStopWordAnalyzerTest extends BaseTokenStreamTestCase {
     QueryAutoStopWordAnalyzer a = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, new WhitespaceAnalyzer(TEST_VERSION_CURRENT));
     a.addStopWords(reader, 10);
     TokenStream ts = a.tokenStream("repetitiveField", new StringReader("this boring"));
-    TermAttribute termAtt = ts.getAttribute(TermAttribute.class);
-    assertTrue(ts.incrementToken());
-    assertEquals("this", termAtt.term());
-    assertFalse(ts.incrementToken());
+    assertTokenStreamContents(ts, new String[] { "this" });
   }
 }
@@ -21,46 +21,22 @@ import java.io.StringReader;

 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.util.Version;

 public class TestReverseStringFilter extends BaseTokenStreamTestCase {
   public void testFilter() throws Exception {
     TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
         new StringReader("Do have a nice day")); // 1-4 length string
     ReverseStringFilter filter = new ReverseStringFilter(TEST_VERSION_CURRENT, stream);
-    TermAttribute text = filter.getAttribute(TermAttribute.class);
-    assertTrue(filter.incrementToken());
-    assertEquals("oD", text.term());
-    assertTrue(filter.incrementToken());
-    assertEquals("evah", text.term());
-    assertTrue(filter.incrementToken());
-    assertEquals("a", text.term());
-    assertTrue(filter.incrementToken());
-    assertEquals("ecin", text.term());
-    assertTrue(filter.incrementToken());
-    assertEquals("yad", text.term());
-    assertFalse(filter.incrementToken());
+    assertTokenStreamContents(filter, new String[] { "oD", "evah", "a", "ecin", "yad" });
   }

   public void testFilterWithMark() throws Exception {
     TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(
         "Do have a nice day")); // 1-4 length string
     ReverseStringFilter filter = new ReverseStringFilter(TEST_VERSION_CURRENT, stream, '\u0001');
-    TermAttribute text = filter
-        .getAttribute(TermAttribute.class);
-    assertTrue(filter.incrementToken());
-    assertEquals("\u0001oD", text.term());
-    assertTrue(filter.incrementToken());
-    assertEquals("\u0001evah", text.term());
-    assertTrue(filter.incrementToken());
-    assertEquals("\u0001a", text.term());
-    assertTrue(filter.incrementToken());
-    assertEquals("\u0001ecin", text.term());
-    assertTrue(filter.incrementToken());
-    assertEquals("\u0001yad", text.term());
-    assertFalse(filter.incrementToken());
+    assertTokenStreamContents(filter,
+        new String[] { "\u0001oD", "\u0001evah", "\u0001a", "\u0001ecin", "\u0001yad" });
   }

   public void testReverseString() throws Exception {
@@ -17,17 +17,13 @@ package org.apache.lucene.analysis.ru;
  * limitations under the License.
  */

-import java.io.File;
-import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStreamReader;
-import java.io.Reader;
-import java.io.StringReader;

 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.util.CharArraySet;
 import org.apache.lucene.util.Version;

@@ -65,8 +61,8 @@ public class TestRussianAnalyzer extends BaseTokenStreamTestCase
         new RussianLetterTokenizer(TEST_VERSION_CURRENT,
             sampleUnicode);

-    TermAttribute text = in.getAttribute(TermAttribute.class);
-    TermAttribute sampleText = sample.getAttribute(TermAttribute.class);
+    CharTermAttribute text = in.getAttribute(CharTermAttribute.class);
+    CharTermAttribute sampleText = sample.getAttribute(CharTermAttribute.class);

     for (;;)
     {
@@ -76,34 +72,21 @@ public class TestRussianAnalyzer extends BaseTokenStreamTestCase
       boolean nextSampleToken = sample.incrementToken();
       assertEquals(
           "Unicode",
-          text.term(),
+          text.toString(),
           nextSampleToken == false
               ? null
-              : sampleText.term());
+              : sampleText.toString());
     }

     inWords.close();
     sampleUnicode.close();
   }

-  public void testDigitsInRussianCharset()
+  /** Check that RussianAnalyzer doesnt discard any numbers */
+  public void testDigitsInRussianCharset() throws IOException
   {
-    Reader reader = new StringReader("text 1000");
-    RussianAnalyzer ra = new RussianAnalyzer(TEST_VERSION_CURRENT);
-    TokenStream stream = ra.tokenStream("", reader);
-
-    TermAttribute termText = stream.getAttribute(TermAttribute.class);
-    try {
-      assertTrue(stream.incrementToken());
-      assertEquals("text", termText.term());
-      assertTrue(stream.incrementToken());
-      assertEquals("RussianAnalyzer's tokenizer skips numbers from input text", "1000", termText.term());
-      assertFalse(stream.incrementToken());
-    }
-    catch (IOException e)
-    {
-      fail("unexpected IOException");
-    }
+    RussianAnalyzer ra = new RussianAnalyzer(TEST_VERSION_CURRENT);
+    assertAnalyzesTo(ra, "text 1000", new String[] { "text", "1000" });
   }

   /** @deprecated remove this test in Lucene 4.0: stopwords changed */
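
Note: the rewritten tests above rely on the two BaseTokenStreamTestCase helpers, which
replace hand-rolled incrementToken()/assertEquals() loops and also verify that the
stream ends where expected. Their call shapes, as the hunks above use them:

    // stream-level: the caller builds the TokenStream first
    assertTokenStreamContents(ts, new String[] { "this" });
    // analyzer-level: the helper calls analyzer.tokenStream() itself
    assertAnalyzesTo(ra, "text 1000", new String[] { "text", "1000" });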
@@ -26,8 +26,8 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.core.LetterTokenizer;
 import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
 import org.apache.lucene.analysis.core.WhitespaceTokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
 import org.apache.lucene.index.IndexWriter;
@@ -159,11 +159,11 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
     int j = -1;

     PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
-    TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
+    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);

     while (ts.incrementToken()) {
       j += posIncrAtt.getPositionIncrement();
-      String termText = termAtt.term();
+      String termText = termAtt.toString();
       q.add(new Term("content", termText), j);
     }

@@ -186,10 +186,10 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
     TokenStream ts = analyzer.tokenStream("content",
         new StringReader("test sentence"));

-    TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
+    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);

     while (ts.incrementToken()) {
-      String termText = termAtt.term();
+      String termText = termAtt.toString();
       q.add(new TermQuery(new Term("content", termText)),
           BooleanClause.Occur.SHOULD);
     }

@ -31,7 +31,12 @@ import org.apache.lucene.analysis.miscellaneous.SingleTokenTokenStream;
|
||||||
import org.apache.lucene.analysis.payloads.PayloadHelper;
|
import org.apache.lucene.analysis.payloads.PayloadHelper;
|
||||||
import org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix;
|
import org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix;
|
||||||
import org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix.Column;
|
import org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix.Column;
|
||||||
import org.apache.lucene.analysis.tokenattributes.*;
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||||
|
|
||||||
public class TestShingleMatrixFilter extends BaseTokenStreamTestCase {
|
public class TestShingleMatrixFilter extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
|
@ -415,7 +420,7 @@ public class TestShingleMatrixFilter extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
private Token tokenFactory(String text, int posIncr, int startOffset, int endOffset) {
|
private Token tokenFactory(String text, int posIncr, int startOffset, int endOffset) {
|
||||||
Token token = new Token(startOffset, endOffset);
|
Token token = new Token(startOffset, endOffset);
|
||||||
token.setTermBuffer(text);
|
token.setEmpty().append(text);
|
||||||
token.setPositionIncrement(posIncr);
|
token.setPositionIncrement(posIncr);
|
||||||
return token;
|
return token;
|
||||||
}
|
}
|
||||||
|
@ -427,7 +432,7 @@ public class TestShingleMatrixFilter extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
private Token tokenFactory(String text, int posIncr, float weight, int startOffset, int endOffset) {
|
private Token tokenFactory(String text, int posIncr, float weight, int startOffset, int endOffset) {
|
||||||
Token token = new Token(startOffset, endOffset);
|
Token token = new Token(startOffset, endOffset);
|
||||||
token.setTermBuffer(text);
|
token.setEmpty().append(text);
|
||||||
token.setPositionIncrement(posIncr);
|
token.setPositionIncrement(posIncr);
|
||||||
ShingleMatrixFilter.defaultSettingsCodec.setWeight(token, weight);
|
ShingleMatrixFilter.defaultSettingsCodec.setWeight(token, weight);
|
||||||
return token;
|
return token;
|
||||||
|
@ -435,7 +440,7 @@ public class TestShingleMatrixFilter extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
private Token tokenFactory(String text, int posIncr, float weight, int startOffset, int endOffset, ShingleMatrixFilter.TokenPositioner positioner) {
|
private Token tokenFactory(String text, int posIncr, float weight, int startOffset, int endOffset, ShingleMatrixFilter.TokenPositioner positioner) {
|
||||||
Token token = new Token(startOffset, endOffset);
|
Token token = new Token(startOffset, endOffset);
|
||||||
token.setTermBuffer(text);
|
token.setEmpty().append(text);
|
||||||
token.setPositionIncrement(posIncr);
|
token.setPositionIncrement(posIncr);
|
||||||
ShingleMatrixFilter.defaultSettingsCodec.setWeight(token, weight);
|
ShingleMatrixFilter.defaultSettingsCodec.setWeight(token, weight);
|
||||||
ShingleMatrixFilter.defaultSettingsCodec.setTokenPositioner(token, positioner);
|
ShingleMatrixFilter.defaultSettingsCodec.setTokenPositioner(token, positioner);
|
||||||
|
@ -445,20 +450,20 @@ public class TestShingleMatrixFilter extends BaseTokenStreamTestCase {
|
||||||
// assert-methods start here
|
// assert-methods start here
|
||||||
|
|
||||||
private void assertNext(TokenStream ts, String text) throws IOException {
|
private void assertNext(TokenStream ts, String text) throws IOException {
|
||||||
TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
|
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
|
||||||
|
|
||||||
assertTrue(ts.incrementToken());
|
assertTrue(ts.incrementToken());
|
||||||
assertEquals(text, termAtt.term());
|
assertEquals(text, termAtt.toString());
|
||||||
}
|
}
|
||||||
|
|
||||||
private void assertNext(TokenStream ts, String text, int positionIncrement, float boost, int startOffset, int endOffset) throws IOException {
|
private void assertNext(TokenStream ts, String text, int positionIncrement, float boost, int startOffset, int endOffset) throws IOException {
|
||||||
TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
|
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
|
||||||
PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
|
PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
|
||||||
PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
|
PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
|
||||||
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
|
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
|
||||||
|
|
||||||
assertTrue(ts.incrementToken());
|
assertTrue(ts.incrementToken());
|
||||||
assertEquals(text, termAtt.term());
|
assertEquals(text, termAtt.toString());
|
||||||
assertEquals(positionIncrement, posIncrAtt.getPositionIncrement());
|
assertEquals(positionIncrement, posIncrAtt.getPositionIncrement());
|
||||||
assertEquals(boost, payloadAtt.getPayload() == null ? 1f : PayloadHelper.decodeFloat(payloadAtt.getPayload().getData()), 0);
|
assertEquals(boost, payloadAtt.getPayload() == null ? 1f : PayloadHelper.decodeFloat(payloadAtt.getPayload().getData()), 0);
|
||||||
assertEquals(startOffset, offsetAtt.startOffset());
|
assertEquals(startOffset, offsetAtt.startOffset());
|
||||||
|
@ -466,11 +471,11 @@ public class TestShingleMatrixFilter extends BaseTokenStreamTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
private void assertNext(TokenStream ts, String text, int startOffset, int endOffset) throws IOException {
|
private void assertNext(TokenStream ts, String text, int startOffset, int endOffset) throws IOException {
|
||||||
TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
|
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
|
||||||
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
|
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
|
||||||
|
|
||||||
assertTrue(ts.incrementToken());
|
assertTrue(ts.incrementToken());
|
||||||
assertEquals(text, termAtt.term());
|
assertEquals(text, termAtt.toString());
|
||||||
assertEquals(startOffset, offsetAtt.startOffset());
|
assertEquals(startOffset, offsetAtt.startOffset());
|
||||||
assertEquals(endOffset, offsetAtt.endOffset());
|
assertEquals(endOffset, offsetAtt.endOffset());
|
||||||
}
|
}
|
||||||
|
@ -478,7 +483,7 @@ public class TestShingleMatrixFilter extends BaseTokenStreamTestCase {
|
||||||
private static Token createToken(String term, int start, int offset)
|
private static Token createToken(String term, int start, int offset)
|
||||||
{
|
{
|
||||||
Token token = new Token(start, offset);
|
Token token = new Token(start, offset);
|
||||||
token.setTermBuffer(term);
|
token.setEmpty().append(term);
|
||||||
return token;
|
return token;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -486,21 +491,15 @@ public class TestShingleMatrixFilter extends BaseTokenStreamTestCase {
|
||||||
public final static class TokenListStream extends TokenStream {
|
public final static class TokenListStream extends TokenStream {
|
||||||
|
|
||||||
private Collection<Token> tokens;
|
private Collection<Token> tokens;
|
||||||
TermAttribute termAtt;
|
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||||
PositionIncrementAttribute posIncrAtt;
|
private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
|
||||||
PayloadAttribute payloadAtt;
|
private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
|
||||||
OffsetAttribute offsetAtt;
|
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||||
TypeAttribute typeAtt;
|
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
|
||||||
FlagsAttribute flagsAtt;
|
private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
|
||||||
|
|
||||||
public TokenListStream(Collection<Token> tokens) {
|
public TokenListStream(Collection<Token> tokens) {
|
||||||
this.tokens = tokens;
|
this.tokens = tokens;
|
||||||
termAtt = addAttribute(TermAttribute.class);
|
|
||||||
posIncrAtt = addAttribute(PositionIncrementAttribute.class);
|
|
||||||
payloadAtt = addAttribute(PayloadAttribute.class);
|
|
||||||
offsetAtt = addAttribute(OffsetAttribute.class);
|
|
||||||
typeAtt = addAttribute(TypeAttribute.class);
|
|
||||||
flagsAtt = addAttribute(FlagsAttribute.class);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private Iterator<Token> iterator;
|
private Iterator<Token> iterator;
|
||||||
|
@ -515,7 +514,7 @@ public class TestShingleMatrixFilter extends BaseTokenStreamTestCase {
|
||||||
}
|
}
|
||||||
Token prototype = iterator.next();
|
Token prototype = iterator.next();
|
||||||
clearAttributes();
|
clearAttributes();
|
||||||
termAtt.setTermBuffer(prototype.termBuffer(), 0, prototype.termLength());
|
termAtt.copyBuffer(prototype.buffer(), 0, prototype.length());
|
||||||
posIncrAtt.setPositionIncrement(prototype.getPositionIncrement());
|
posIncrAtt.setPositionIncrement(prototype.getPositionIncrement());
|
||||||
flagsAtt.setFlags(prototype.getFlags());
|
flagsAtt.setFlags(prototype.getFlags());
|
||||||
offsetAtt.setOffset(prototype.startOffset(), prototype.endOffset());
|
offsetAtt.setOffset(prototype.startOffset(), prototype.endOffset());
|
||||||
|
|
|
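The incrementToken() change above replaces setTermBuffer(char[], int, int) with copyBuffer(char[], int, int), reading the prototype Token through its new buffer()/length() accessors. A sketch of the same copy, assuming the trunk-era CharTermAttributeImpl; CopyBufferDemo is a hypothetical name.

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttributeImpl;

final class CopyBufferDemo {
  private CopyBufferDemo() {}

  static CharTermAttribute termOf(Token prototype) {
    CharTermAttribute termAtt = new CharTermAttributeImpl();
    // Copies prototype.length() chars starting at index 0 of the
    // prototype's internal buffer, growing the target as needed.
    termAtt.copyBuffer(prototype.buffer(), 0, prototype.length());
    return termAtt;
  }
}
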
@@ -23,7 +23,7 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
import org.apache.lucene.analysis.TokenFilter;
|
import org.apache.lucene.analysis.TokenFilter;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
|
||||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||||
|
|
||||||
public class TokenTypeSinkTokenizerTest extends BaseTokenStreamTestCase {
|
public class TokenTypeSinkTokenizerTest extends BaseTokenStreamTestCase {
|
||||||
|
@@ -41,11 +41,11 @@ public class TokenTypeSinkTokenizerTest extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
boolean seenDogs = false;
|
boolean seenDogs = false;
|
||||||
|
|
||||||
TermAttribute termAtt = ttf.addAttribute(TermAttribute.class);
|
CharTermAttribute termAtt = ttf.addAttribute(CharTermAttribute.class);
|
||||||
TypeAttribute typeAtt = ttf.addAttribute(TypeAttribute.class);
|
TypeAttribute typeAtt = ttf.addAttribute(TypeAttribute.class);
|
||||||
ttf.reset();
|
ttf.reset();
|
||||||
while (ttf.incrementToken()) {
|
while (ttf.incrementToken()) {
|
||||||
if (termAtt.term().equals("dogs")) {
|
if (termAtt.toString().equals("dogs")) {
|
||||||
seenDogs = true;
|
seenDogs = true;
|
||||||
assertTrue(typeAtt.type() + " is not equal to " + "D", typeAtt.type().equals("D") == true);
|
assertTrue(typeAtt.type() + " is not equal to " + "D", typeAtt.type().equals("D") == true);
|
||||||
} else {
|
} else {
|
||||||
|
@@ -64,20 +64,18 @@ public class TokenTypeSinkTokenizerTest extends BaseTokenStreamTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
private class WordTokenFilter extends TokenFilter {
|
private class WordTokenFilter extends TokenFilter {
|
||||||
private TermAttribute termAtt;
|
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||||
private TypeAttribute typeAtt;
|
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
|
||||||
|
|
||||||
private WordTokenFilter(TokenStream input) {
|
private WordTokenFilter(TokenStream input) {
|
||||||
super(input);
|
super(input);
|
||||||
termAtt = addAttribute(TermAttribute.class);
|
|
||||||
typeAtt = addAttribute(TypeAttribute.class);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public final boolean incrementToken() throws IOException {
|
public final boolean incrementToken() throws IOException {
|
||||||
if (!input.incrementToken()) return false;
|
if (!input.incrementToken()) return false;
|
||||||
|
|
||||||
if (termAtt.term().equals("dogs")) {
|
if (termAtt.toString().equals("dogs")) {
|
||||||
typeAtt.setType("D");
|
typeAtt.setType("D");
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
|
|
|
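The same final-field pattern applies to TokenFilter subclasses: addAttribute() on the filter resolves to the attribute instances shared with the wrapped input stream. A sketch mirroring WordTokenFilter follows; DogTypingFilter is a hypothetical name, and contentEquals is shown as an assumed alternative to the diff's termAtt.toString().equals(...), since CharTermAttribute implements CharSequence.

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

final class DogTypingFilter extends TokenFilter {
  // addAttribute() on a filter returns the instances shared with input.
  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);

  DogTypingFilter(TokenStream input) {
    super(input);
  }

  @Override
  public boolean incrementToken() throws IOException {
    if (!input.incrementToken()) return false;
    // CharTermAttribute is a CharSequence, so the comparison needs no
    // intermediate String allocation.
    if ("dogs".contentEquals(termAtt)) {
      typeAtt.setType("D");
    }
    return true;
  }
}
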
@@ -22,11 +22,11 @@ import org.apache.lucene.analysis.Analyzer;
|
||||||
import org.apache.lucene.index.Payload;
|
import org.apache.lucene.index.Payload;
|
||||||
import org.apache.lucene.analysis.TokenStream;
|
import org.apache.lucene.analysis.TokenStream;
|
||||||
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
import org.apache.lucene.analysis.standard.StandardAnalyzer;
|
||||||
|
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
|
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
|
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
|
||||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||||
import org.apache.lucene.util.Version;
|
import org.apache.lucene.util.Version;
|
||||||
|
|
||||||
|
@@ -93,7 +93,7 @@ public class TestSnowball extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
public void testFilterTokens() throws Exception {
|
public void testFilterTokens() throws Exception {
|
||||||
SnowballFilter filter = new SnowballFilter(new TestTokenStream(), "English");
|
SnowballFilter filter = new SnowballFilter(new TestTokenStream(), "English");
|
||||||
TermAttribute termAtt = filter.getAttribute(TermAttribute.class);
|
CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
|
||||||
OffsetAttribute offsetAtt = filter.getAttribute(OffsetAttribute.class);
|
OffsetAttribute offsetAtt = filter.getAttribute(OffsetAttribute.class);
|
||||||
TypeAttribute typeAtt = filter.getAttribute(TypeAttribute.class);
|
TypeAttribute typeAtt = filter.getAttribute(TypeAttribute.class);
|
||||||
PayloadAttribute payloadAtt = filter.getAttribute(PayloadAttribute.class);
|
PayloadAttribute payloadAtt = filter.getAttribute(PayloadAttribute.class);
|
||||||
|
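Note that testFilterTokens fetches attributes with getAttribute() rather than addAttribute(): getAttribute() throws IllegalArgumentException when the attribute was never registered, so a stream that stopped producing terms fails the test instead of silently receiving a fresh, empty attribute. A minimal sketch; AttributeLookup and requireTerm are hypothetical names.

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

final class AttributeLookup {
  private AttributeLookup() {}

  static CharTermAttribute requireTerm(TokenStream ts) {
    // Throws IllegalArgumentException if no term attribute was registered,
    // rather than silently adding an empty one as addAttribute() would.
    return ts.getAttribute(CharTermAttribute.class);
  }
}
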
@@ -102,7 +102,7 @@ public class TestSnowball extends BaseTokenStreamTestCase {
|
||||||
|
|
||||||
filter.incrementToken();
|
filter.incrementToken();
|
||||||
|
|
||||||
assertEquals("accent", termAtt.term());
|
assertEquals("accent", termAtt.toString());
|
||||||
assertEquals(2, offsetAtt.startOffset());
|
assertEquals(2, offsetAtt.startOffset());
|
||||||
assertEquals(7, offsetAtt.endOffset());
|
assertEquals(7, offsetAtt.endOffset());
|
||||||
assertEquals("wrd", typeAtt.type());
|
assertEquals("wrd", typeAtt.type());
|
||||||
|
@@ -112,27 +112,21 @@ public class TestSnowball extends BaseTokenStreamTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
private final class TestTokenStream extends TokenStream {
|
private final class TestTokenStream extends TokenStream {
|
||||||
private TermAttribute termAtt;
|
private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
|
||||||
private OffsetAttribute offsetAtt;
|
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
|
||||||
private TypeAttribute typeAtt;
|
private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
|
||||||
private PayloadAttribute payloadAtt;
|
private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
|
||||||
private PositionIncrementAttribute posIncAtt;
|
private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
|
||||||
private FlagsAttribute flagsAtt;
|
private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
|
||||||
|
|
||||||
TestTokenStream() {
|
TestTokenStream() {
|
||||||
super();
|
super();
|
||||||
termAtt = addAttribute(TermAttribute.class);
|
|
||||||
offsetAtt = addAttribute(OffsetAttribute.class);
|
|
||||||
typeAtt = addAttribute(TypeAttribute.class);
|
|
||||||
payloadAtt = addAttribute(PayloadAttribute.class);
|
|
||||||
posIncAtt = addAttribute(PositionIncrementAttribute.class);
|
|
||||||
flagsAtt = addAttribute(FlagsAttribute.class);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean incrementToken() {
|
public boolean incrementToken() {
|
||||||
clearAttributes();
|
clearAttributes();
|
||||||
termAtt.setTermBuffer("accents");
|
termAtt.setEmpty().append("accents");
|
||||||
offsetAtt.setOffset(2, 7);
|
offsetAtt.setOffset(2, 7);
|
||||||
typeAtt.setType("wrd");
|
typeAtt.setType("wrd");
|
||||||
posIncAtt.setPositionIncrement(3);
|
posIncAtt.setPositionIncrement(3);
|
||||||
|
|
|
@@ -404,7 +404,7 @@ public class TestSynonymFilter extends BaseTokenStreamTestCase {
|
||||||
else {
|
else {
|
||||||
clearAttributes();
|
clearAttributes();
|
||||||
Token token = tokens[index++];
|
Token token = tokens[index++];
|
||||||
termAtt.setEmpty().append(token.term());
|
termAtt.setEmpty().append(token);
|
||||||
offsetAtt.setOffset(token.startOffset(), token.endOffset());
|
offsetAtt.setOffset(token.startOffset(), token.endOffset());
|
||||||
posIncAtt.setPositionIncrement(token.getPositionIncrement());
|
posIncAtt.setPositionIncrement(token.getPositionIncrement());
|
||||||
flagsAtt.setFlags(token.getFlags());
|
flagsAtt.setFlags(token.getFlags());
|
||||||
|
|
|
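The one-line change above, append(token.term()) to append(token), works because the trunk-era Token exposes the CharTermAttribute API and can be appended directly, skipping the intermediate String that term() allocated. A sketch under that assumption; AppendDemo is a hypothetical name.

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttributeImpl;

final class AppendDemo {
  private AppendDemo() {}

  static CharTermAttribute fromPrototype(Token prototype) {
    CharTermAttribute termAtt = new CharTermAttributeImpl();
    // The Token itself is appended; no prototype.term() String is created.
    termAtt.setEmpty().append(prototype);
    return termAtt;
  }
}
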
@@ -20,30 +20,20 @@ package org.apache.lucene.analysis.wikipedia;
|
||||||
|
|
||||||
import java.io.StringReader;
|
import java.io.StringReader;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
|
|
||||||
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
|
||||||
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
|
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
|
||||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
|
||||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
|
||||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
|
||||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
|
||||||
|
|
||||||
|
import static org.apache.lucene.analysis.wikipedia.WikipediaTokenizer.*;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
*
|
* Basic Tests for {@link WikipediaTokenizer}
|
||||||
*
|
|
||||||
**/
|
**/
|
||||||
public class WikipediaTokenizerTest extends BaseTokenStreamTestCase {
|
public class WikipediaTokenizerTest extends BaseTokenStreamTestCase {
|
||||||
protected static final String LINK_PHRASES = "click [[link here again]] click [http://lucene.apache.org here again] [[Category:a b c d]]";
|
protected static final String LINK_PHRASES = "click [[link here again]] click [http://lucene.apache.org here again] [[Category:a b c d]]";
|
||||||
|
|
||||||
public WikipediaTokenizerTest(String s) {
|
|
||||||
super(s);
|
|
||||||
}
|
|
||||||
|
|
||||||
public void testSimple() throws Exception {
|
public void testSimple() throws Exception {
|
||||||
String text = "This is a [[Category:foo]]";
|
String text = "This is a [[Category:foo]]";
|
||||||
WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(text));
|
WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(text));
|
||||||
|
@@ -51,216 +41,85 @@ public class WikipediaTokenizerTest extends BaseTokenStreamTestCase {
|
||||||
new String[] { "This", "is", "a", "foo" },
|
new String[] { "This", "is", "a", "foo" },
|
||||||
new int[] { 0, 5, 8, 21 },
|
new int[] { 0, 5, 8, 21 },
|
||||||
new int[] { 4, 7, 9, 24 },
|
new int[] { 4, 7, 9, 24 },
|
||||||
new String[] { "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", WikipediaTokenizer.CATEGORY },
|
new String[] { "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", CATEGORY },
|
||||||
new int[] { 1, 1, 1, 1, },
|
new int[] { 1, 1, 1, 1, },
|
||||||
text.length());
|
text.length());
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testHandwritten() throws Exception {
|
public void testHandwritten() throws Exception {
|
||||||
//make sure all tokens are in only one type
|
// make sure all tokens are in only one type
|
||||||
String test = "[[link]] This is a [[Category:foo]] Category This is a linked [[:Category:bar none withstanding]] " +
|
String test = "[[link]] This is a [[Category:foo]] Category This is a linked [[:Category:bar none withstanding]] "
|
||||||
"Category This is (parens) This is a [[link]] This is an external URL [http://lucene.apache.org] " +
|
+ "Category This is (parens) This is a [[link]] This is an external URL [http://lucene.apache.org] "
|
||||||
"Here is ''italics'' and ''more italics'', '''bold''' and '''''five quotes''''' " +
|
+ "Here is ''italics'' and ''more italics'', '''bold''' and '''''five quotes''''' "
|
||||||
" This is a [[link|display info]] This is a period. Here is $3.25 and here is 3.50. Here's Johnny. " +
|
+ " This is a [[link|display info]] This is a period. Here is $3.25 and here is 3.50. Here's Johnny. "
|
||||||
"==heading== ===sub head=== followed by some text [[Category:blah| ]] " +
|
+ "==heading== ===sub head=== followed by some text [[Category:blah| ]] "
|
||||||
"''[[Category:ital_cat]]'' here is some that is ''italics [[Category:foo]] but is never closed." +
|
+ "''[[Category:ital_cat]]'' here is some that is ''italics [[Category:foo]] but is never closed."
|
||||||
"'''same [[Category:foo]] goes for this '''''and2 [[Category:foo]] and this" +
|
+ "'''same [[Category:foo]] goes for this '''''and2 [[Category:foo]] and this"
|
||||||
" [http://foo.boo.com/test/test/ Test Test] [http://foo.boo.com/test/test/test.html Test Test]" +
|
+ " [http://foo.boo.com/test/test/ Test Test] [http://foo.boo.com/test/test/test.html Test Test]"
|
||||||
" [http://foo.boo.com/test/test/test.html?g=b&c=d Test Test] <ref>Citation</ref> <sup>martian</sup> <span class=\"glue\">code</span>";
|
+ " [http://foo.boo.com/test/test/test.html?g=b&c=d Test Test] <ref>Citation</ref> <sup>martian</sup> <span class=\"glue\">code</span>";
|
||||||
Map<String,String> tcm = new HashMap<String,String>();//map tokens to types
|
|
||||||
tcm.put("link", WikipediaTokenizer.INTERNAL_LINK);
|
|
||||||
tcm.put("display", WikipediaTokenizer.INTERNAL_LINK);
|
|
||||||
tcm.put("info", WikipediaTokenizer.INTERNAL_LINK);
|
|
||||||
|
|
||||||
tcm.put("http://lucene.apache.org", WikipediaTokenizer.EXTERNAL_LINK_URL);
|
|
||||||
tcm.put("http://foo.boo.com/test/test/", WikipediaTokenizer.EXTERNAL_LINK_URL);
|
|
||||||
tcm.put("http://foo.boo.com/test/test/test.html", WikipediaTokenizer.EXTERNAL_LINK_URL);
|
|
||||||
tcm.put("http://foo.boo.com/test/test/test.html?g=b&c=d", WikipediaTokenizer.EXTERNAL_LINK_URL);
|
|
||||||
tcm.put("Test", WikipediaTokenizer.EXTERNAL_LINK);
|
|
||||||
|
|
||||||
//alphanums
|
|
||||||
tcm.put("This", "<ALPHANUM>");
|
|
||||||
tcm.put("is", "<ALPHANUM>");
|
|
||||||
tcm.put("a", "<ALPHANUM>");
|
|
||||||
tcm.put("Category", "<ALPHANUM>");
|
|
||||||
tcm.put("linked", "<ALPHANUM>");
|
|
||||||
tcm.put("parens", "<ALPHANUM>");
|
|
||||||
tcm.put("external", "<ALPHANUM>");
|
|
||||||
tcm.put("URL", "<ALPHANUM>");
|
|
||||||
tcm.put("and", "<ALPHANUM>");
|
|
||||||
tcm.put("period", "<ALPHANUM>");
|
|
||||||
tcm.put("Here", "<ALPHANUM>");
|
|
||||||
tcm.put("Here's", "<APOSTROPHE>");
|
|
||||||
tcm.put("here", "<ALPHANUM>");
|
|
||||||
tcm.put("Johnny", "<ALPHANUM>");
|
|
||||||
tcm.put("followed", "<ALPHANUM>");
|
|
||||||
tcm.put("by", "<ALPHANUM>");
|
|
||||||
tcm.put("text", "<ALPHANUM>");
|
|
||||||
tcm.put("that", "<ALPHANUM>");
|
|
||||||
tcm.put("but", "<ALPHANUM>");
|
|
||||||
tcm.put("never", "<ALPHANUM>");
|
|
||||||
tcm.put("closed", "<ALPHANUM>");
|
|
||||||
tcm.put("goes", "<ALPHANUM>");
|
|
||||||
tcm.put("for", "<ALPHANUM>");
|
|
||||||
tcm.put("this", "<ALPHANUM>");
|
|
||||||
tcm.put("an", "<ALPHANUM>");
|
|
||||||
tcm.put("some", "<ALPHANUM>");
|
|
||||||
tcm.put("martian", "<ALPHANUM>");
|
|
||||||
tcm.put("code", "<ALPHANUM>");
|
|
||||||
|
|
||||||
tcm.put("foo", WikipediaTokenizer.CATEGORY);
|
|
||||||
tcm.put("bar", WikipediaTokenizer.CATEGORY);
|
|
||||||
tcm.put("none", WikipediaTokenizer.CATEGORY);
|
|
||||||
tcm.put("withstanding", WikipediaTokenizer.CATEGORY);
|
|
||||||
tcm.put("blah", WikipediaTokenizer.CATEGORY);
|
|
||||||
tcm.put("ital", WikipediaTokenizer.CATEGORY);
|
|
||||||
tcm.put("cat", WikipediaTokenizer.CATEGORY);
|
|
||||||
|
|
||||||
tcm.put("italics", WikipediaTokenizer.ITALICS);
|
|
||||||
tcm.put("more", WikipediaTokenizer.ITALICS);
|
|
||||||
tcm.put("bold", WikipediaTokenizer.BOLD);
|
|
||||||
tcm.put("same", WikipediaTokenizer.BOLD);
|
|
||||||
tcm.put("five", WikipediaTokenizer.BOLD_ITALICS);
|
|
||||||
tcm.put("and2", WikipediaTokenizer.BOLD_ITALICS);
|
|
||||||
tcm.put("quotes", WikipediaTokenizer.BOLD_ITALICS);
|
|
||||||
|
|
||||||
tcm.put("heading", WikipediaTokenizer.HEADING);
|
|
||||||
tcm.put("sub", WikipediaTokenizer.SUB_HEADING);
|
|
||||||
tcm.put("head", WikipediaTokenizer.SUB_HEADING);
|
|
||||||
|
|
||||||
tcm.put("Citation", WikipediaTokenizer.CITATION);
|
|
||||||
|
|
||||||
tcm.put("3.25", "<NUM>");
|
|
||||||
tcm.put("3.50", "<NUM>");
|
|
||||||
WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(test));
|
WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(test));
|
||||||
int count = 0;
|
assertTokenStreamContents(tf,
|
||||||
int numItalics = 0;
|
new String[] {"link", "This", "is", "a",
|
||||||
int numBoldItalics = 0;
|
"foo", "Category", "This", "is", "a", "linked", "bar", "none",
|
||||||
int numCategory = 0;
|
"withstanding", "Category", "This", "is", "parens", "This", "is", "a",
|
||||||
int numCitation = 0;
|
"link", "This", "is", "an", "external", "URL",
|
||||||
TermAttribute termAtt = tf.addAttribute(TermAttribute.class);
|
"http://lucene.apache.org", "Here", "is", "italics", "and", "more",
|
||||||
TypeAttribute typeAtt = tf.addAttribute(TypeAttribute.class);
|
"italics", "bold", "and", "five", "quotes", "This", "is", "a", "link",
|
||||||
|
"display", "info", "This", "is", "a", "period", "Here", "is", "3.25",
|
||||||
while (tf.incrementToken()) {
|
"and", "here", "is", "3.50", "Here's", "Johnny", "heading", "sub",
|
||||||
String tokText = termAtt.term();
|
"head", "followed", "by", "some", "text", "blah", "ital", "cat",
|
||||||
//System.out.println("Text: " + tokText + " Type: " + token.type());
|
"here", "is", "some", "that", "is", "italics", "foo", "but", "is",
|
||||||
String expectedType = tcm.get(tokText);
|
"never", "closed", "same", "foo", "goes", "for", "this", "and2", "foo",
|
||||||
assertTrue("expectedType is null and it shouldn't be for: " + tf.toString(), expectedType != null);
|
"and", "this", "http://foo.boo.com/test/test/", "Test", "Test",
|
||||||
assertTrue(typeAtt.type() + " is not equal to " + expectedType + " for " + tf.toString(), typeAtt.type().equals(expectedType) == true);
|
"http://foo.boo.com/test/test/test.html", "Test", "Test",
|
||||||
count++;
|
"http://foo.boo.com/test/test/test.html?g=b&c=d", "Test", "Test",
|
||||||
if (typeAtt.type().equals(WikipediaTokenizer.ITALICS) == true){
|
"Citation", "martian", "code"},
|
||||||
numItalics++;
|
new String[] {INTERNAL_LINK,
|
||||||
} else if (typeAtt.type().equals(WikipediaTokenizer.BOLD_ITALICS) == true){
|
"<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", CATEGORY, "<ALPHANUM>",
|
||||||
numBoldItalics++;
|
"<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", CATEGORY,
|
||||||
} else if (typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true){
|
CATEGORY, CATEGORY, "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>",
|
||||||
numCategory++;
|
"<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", INTERNAL_LINK,
|
||||||
}
|
"<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>",
|
||||||
else if (typeAtt.type().equals(WikipediaTokenizer.CITATION) == true){
|
EXTERNAL_LINK_URL, "<ALPHANUM>", "<ALPHANUM>", ITALICS, "<ALPHANUM>",
|
||||||
numCitation++;
|
ITALICS, ITALICS, BOLD, "<ALPHANUM>", BOLD_ITALICS, BOLD_ITALICS,
|
||||||
}
|
"<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", INTERNAL_LINK, INTERNAL_LINK,
|
||||||
}
|
INTERNAL_LINK, "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>",
|
||||||
assertTrue("We have not seen enough tokens: " + count + " is not >= " + tcm.size(), count >= tcm.size());
|
"<ALPHANUM>", "<ALPHANUM>", "<NUM>", "<ALPHANUM>", "<ALPHANUM>",
|
||||||
assertTrue(numItalics + " does not equal: " + 4 + " for numItalics", numItalics == 4);
|
"<ALPHANUM>", "<NUM>", "<APOSTROPHE>", "<ALPHANUM>", HEADING,
|
||||||
assertTrue(numBoldItalics + " does not equal: " + 3 + " for numBoldItalics", numBoldItalics == 3);
|
SUB_HEADING, SUB_HEADING, "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>",
|
||||||
assertTrue(numCategory + " does not equal: " + 10 + " for numCategory", numCategory == 10);
|
"<ALPHANUM>", CATEGORY, CATEGORY, CATEGORY, "<ALPHANUM>", "<ALPHANUM>",
|
||||||
assertTrue(numCitation + " does not equal: " + 1 + " for numCitation", numCitation == 1);
|
"<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", ITALICS, CATEGORY,
|
||||||
|
"<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", BOLD, CATEGORY,
|
||||||
|
"<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", BOLD_ITALICS, CATEGORY,
|
||||||
|
"<ALPHANUM>", "<ALPHANUM>", EXTERNAL_LINK_URL, EXTERNAL_LINK,
|
||||||
|
EXTERNAL_LINK, EXTERNAL_LINK_URL, EXTERNAL_LINK, EXTERNAL_LINK,
|
||||||
|
EXTERNAL_LINK_URL, EXTERNAL_LINK, EXTERNAL_LINK, CITATION,
|
||||||
|
"<ALPHANUM>", "<ALPHANUM>"});
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testLinkPhrases() throws Exception {
|
public void testLinkPhrases() throws Exception {
|
||||||
|
|
||||||
WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(LINK_PHRASES));
|
WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(LINK_PHRASES));
|
||||||
checkLinkPhrases(tf);
|
checkLinkPhrases(tf);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private void checkLinkPhrases(WikipediaTokenizer tf) throws IOException {
|
private void checkLinkPhrases(WikipediaTokenizer tf) throws IOException {
|
||||||
TermAttribute termAtt = tf.addAttribute(TermAttribute.class);
|
assertTokenStreamContents(tf,
|
||||||
PositionIncrementAttribute posIncrAtt = tf.addAttribute(PositionIncrementAttribute.class);
|
new String[] { "click", "link", "here", "again", "click",
|
||||||
|
"http://lucene.apache.org", "here", "again", "a", "b", "c", "d" },
|
||||||
assertTrue(tf.incrementToken());
|
new int[] { 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1 });
|
||||||
assertTrue(termAtt.term() + " is not equal to " + "click", termAtt.term().equals("click") == true);
|
|
||||||
assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
|
|
||||||
assertTrue(tf.incrementToken());
|
|
||||||
assertTrue(termAtt.term() + " is not equal to " + "link", termAtt.term().equals("link") == true);
|
|
||||||
assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
|
|
||||||
assertTrue(tf.incrementToken());
|
|
||||||
assertTrue(termAtt.term() + " is not equal to " + "here",
|
|
||||||
termAtt.term().equals("here") == true);
|
|
||||||
//The link, and here should be at the same position for phrases to work
|
|
||||||
assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
|
|
||||||
assertTrue(tf.incrementToken());
|
|
||||||
assertTrue(termAtt.term() + " is not equal to " + "again",
|
|
||||||
termAtt.term().equals("again") == true);
|
|
||||||
assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
|
|
||||||
|
|
||||||
assertTrue(tf.incrementToken());
|
|
||||||
assertTrue(termAtt.term() + " is not equal to " + "click",
|
|
||||||
termAtt.term().equals("click") == true);
|
|
||||||
assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
|
|
||||||
|
|
||||||
assertTrue(tf.incrementToken());
|
|
||||||
assertTrue(termAtt.term() + " is not equal to " + "http://lucene.apache.org",
|
|
||||||
termAtt.term().equals("http://lucene.apache.org") == true);
|
|
||||||
assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
|
|
||||||
|
|
||||||
assertTrue(tf.incrementToken());
|
|
||||||
assertTrue(termAtt.term() + " is not equal to " + "here",
|
|
||||||
termAtt.term().equals("here") == true);
|
|
||||||
assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 0, posIncrAtt.getPositionIncrement() == 0);
|
|
||||||
|
|
||||||
assertTrue(tf.incrementToken());
|
|
||||||
assertTrue(termAtt.term() + " is not equal to " + "again",
|
|
||||||
termAtt.term().equals("again") == true);
|
|
||||||
assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
|
|
||||||
|
|
||||||
assertTrue(tf.incrementToken());
|
|
||||||
assertTrue(termAtt.term() + " is not equal to " + "a",
|
|
||||||
termAtt.term().equals("a") == true);
|
|
||||||
assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
|
|
||||||
|
|
||||||
assertTrue(tf.incrementToken());
|
|
||||||
assertTrue(termAtt.term() + " is not equal to " + "b",
|
|
||||||
termAtt.term().equals("b") == true);
|
|
||||||
assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
|
|
||||||
|
|
||||||
assertTrue(tf.incrementToken());
|
|
||||||
assertTrue(termAtt.term() + " is not equal to " + "c",
|
|
||||||
termAtt.term().equals("c") == true);
|
|
||||||
assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
|
|
||||||
|
|
||||||
assertTrue(tf.incrementToken());
|
|
||||||
assertTrue(termAtt.term() + " is not equal to " + "d",
|
|
||||||
termAtt.term().equals("d") == true);
|
|
||||||
assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
|
|
||||||
|
|
||||||
assertFalse(tf.incrementToken());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testLinks() throws Exception {
|
public void testLinks() throws Exception {
|
||||||
String test = "[http://lucene.apache.org/java/docs/index.html#news here] [http://lucene.apache.org/java/docs/index.html?b=c here] [https://lucene.apache.org/java/docs/index.html?b=c here]";
|
String test = "[http://lucene.apache.org/java/docs/index.html#news here] [http://lucene.apache.org/java/docs/index.html?b=c here] [https://lucene.apache.org/java/docs/index.html?b=c here]";
|
||||||
WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(test));
|
WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(test));
|
||||||
TermAttribute termAtt = tf.addAttribute(TermAttribute.class);
|
assertTokenStreamContents(tf,
|
||||||
TypeAttribute typeAtt = tf.addAttribute(TypeAttribute.class);
|
new String[] { "http://lucene.apache.org/java/docs/index.html#news", "here",
|
||||||
|
"http://lucene.apache.org/java/docs/index.html?b=c", "here",
|
||||||
assertTrue(tf.incrementToken());
|
"https://lucene.apache.org/java/docs/index.html?b=c", "here" },
|
||||||
assertTrue(termAtt.term() + " is not equal to " + "http://lucene.apache.org/java/docs/index.html#news",
|
new String[] { EXTERNAL_LINK_URL, EXTERNAL_LINK,
|
||||||
termAtt.term().equals("http://lucene.apache.org/java/docs/index.html#news") == true);
|
EXTERNAL_LINK_URL, EXTERNAL_LINK,
|
||||||
assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.EXTERNAL_LINK_URL, typeAtt.type().equals(WikipediaTokenizer.EXTERNAL_LINK_URL) == true);
|
EXTERNAL_LINK_URL, EXTERNAL_LINK, });
|
||||||
tf.incrementToken();//skip here
|
|
||||||
|
|
||||||
assertTrue(tf.incrementToken());
|
|
||||||
assertTrue(termAtt.term() + " is not equal to " + "http://lucene.apache.org/java/docs/index.html?b=c",
|
|
||||||
termAtt.term().equals("http://lucene.apache.org/java/docs/index.html?b=c") == true);
|
|
||||||
assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.EXTERNAL_LINK_URL, typeAtt.type().equals(WikipediaTokenizer.EXTERNAL_LINK_URL) == true);
|
|
||||||
tf.incrementToken();//skip here
|
|
||||||
|
|
||||||
assertTrue(tf.incrementToken());
|
|
||||||
assertTrue(termAtt.term() + " is not equal to " + "https://lucene.apache.org/java/docs/index.html?b=c",
|
|
||||||
termAtt.term().equals("https://lucene.apache.org/java/docs/index.html?b=c") == true);
|
|
||||||
assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.EXTERNAL_LINK_URL, typeAtt.type().equals(WikipediaTokenizer.EXTERNAL_LINK_URL) == true);
|
|
||||||
|
|
||||||
assertTrue(tf.incrementToken());
|
|
||||||
assertFalse(tf.incrementToken());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testLucene1133() throws Exception {
|
public void testLucene1133() throws Exception {
|
||||||
|
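The WikipediaTokenizer tests above collapse their hand-rolled incrementToken() loops into single assertTokenStreamContents(...) calls, which drive the stream and verify terms, offsets, types, and position increments together. A minimal sketch of the (stream, terms, startOffsets, endOffsets) overload follows; it assumes a test class extending BaseTokenStreamTestCase, the TEST_VERSION_CURRENT field from LuceneTestCase, and the trunk-era WhitespaceTokenizer(Version, Reader) constructor.

import java.io.StringReader;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;

public class WhitespaceSmokeTest extends BaseTokenStreamTestCase {
  public void testTwoTokens() throws Exception {
    assertTokenStreamContents(
        new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("foo bar")),
        new String[] { "foo", "bar" },  // expected terms
        new int[] { 0, 4 },             // expected start offsets
        new int[] { 3, 7 });            // expected end offsets
  }
}
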
@@ -272,73 +131,13 @@ public class WikipediaTokenizerTest extends BaseTokenStreamTestCase {
|
||||||
checkLinkPhrases(tf);
|
checkLinkPhrases(tf);
|
||||||
String test = "[[Category:a b c d]] [[Category:e f g]] [[link here]] [[link there]] ''italics here'' something ''more italics'' [[Category:h i j]]";
|
String test = "[[Category:a b c d]] [[Category:e f g]] [[link here]] [[link there]] ''italics here'' something ''more italics'' [[Category:h i j]]";
|
||||||
tf = new WikipediaTokenizer(new StringReader(test), WikipediaTokenizer.UNTOKENIZED_ONLY, untoks);
|
tf = new WikipediaTokenizer(new StringReader(test), WikipediaTokenizer.UNTOKENIZED_ONLY, untoks);
|
||||||
TermAttribute termAtt = tf.addAttribute(TermAttribute.class);
|
assertTokenStreamContents(tf,
|
||||||
PositionIncrementAttribute posIncrAtt = tf.addAttribute(PositionIncrementAttribute.class);
|
new String[] { "a b c d", "e f g", "link", "here", "link",
|
||||||
OffsetAttribute offsetAtt = tf.addAttribute(OffsetAttribute.class);
|
"there", "italics here", "something", "more italics", "h i j" },
|
||||||
|
new int[] { 11, 32, 42, 47, 56, 61, 71, 86, 98, 124 },
|
||||||
assertTrue(tf.incrementToken());
|
new int[] { 18, 37, 46, 51, 60, 66, 83, 95, 110, 133 },
|
||||||
assertTrue(termAtt.term() + " is not equal to " + "a b c d",
|
new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }
|
||||||
termAtt.term().equals("a b c d") == true);
|
);
|
||||||
assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
|
|
||||||
assertTrue(offsetAtt.startOffset() + " does not equal: " + 11, offsetAtt.startOffset() == 11);
|
|
||||||
assertTrue(offsetAtt.endOffset() + " does not equal: " + 18, offsetAtt.endOffset() == 18);
|
|
||||||
|
|
||||||
assertTrue(tf.incrementToken());
|
|
||||||
assertTrue(termAtt.term() + " is not equal to " + "e f g",
|
|
||||||
termAtt.term().equals("e f g") == true);
|
|
||||||
assertTrue(offsetAtt.startOffset() + " does not equal: " + 32, offsetAtt.startOffset() == 32);
|
|
||||||
assertTrue(offsetAtt.endOffset() + " does not equal: " + 37, offsetAtt.endOffset() == 37);
|
|
||||||
|
|
||||||
assertTrue(tf.incrementToken());
|
|
||||||
assertTrue(termAtt.term() + " is not equal to " + "link",
|
|
||||||
termAtt.term().equals("link") == true);
|
|
||||||
assertTrue(offsetAtt.startOffset() + " does not equal: " + 42, offsetAtt.startOffset() == 42);
|
|
||||||
assertTrue(offsetAtt.endOffset() + " does not equal: " + 46, offsetAtt.endOffset() == 46);
|
|
||||||
|
|
||||||
assertTrue(tf.incrementToken());
|
|
||||||
assertTrue(termAtt.term() + " is not equal to " + "here",
|
|
||||||
termAtt.term().equals("here") == true);
|
|
||||||
assertTrue(offsetAtt.startOffset() + " does not equal: " + 47, offsetAtt.startOffset() == 47);
|
|
||||||
assertTrue(offsetAtt.endOffset() + " does not equal: " + 51, offsetAtt.endOffset() == 51);
|
|
||||||
|
|
||||||
assertTrue(tf.incrementToken());
|
|
||||||
assertTrue(termAtt.term() + " is not equal to " + "link",
|
|
||||||
termAtt.term().equals("link") == true);
|
|
||||||
assertTrue(offsetAtt.startOffset() + " does not equal: " + 56, offsetAtt.startOffset() == 56);
|
|
||||||
assertTrue(offsetAtt.endOffset() + " does not equal: " + 60, offsetAtt.endOffset() == 60);
|
|
||||||
|
|
||||||
assertTrue(tf.incrementToken());
|
|
||||||
assertTrue(termAtt.term() + " is not equal to " + "there",
|
|
||||||
termAtt.term().equals("there") == true);
|
|
||||||
|
|
||||||
assertTrue(offsetAtt.startOffset() + " does not equal: " + 61, offsetAtt.startOffset() == 61);
|
|
||||||
assertTrue(offsetAtt.endOffset() + " does not equal: " + 66, offsetAtt.endOffset() == 66);
|
|
||||||
|
|
||||||
assertTrue(tf.incrementToken());
|
|
||||||
assertTrue(termAtt.term() + " is not equal to " + "italics here",
|
|
||||||
termAtt.term().equals("italics here") == true);
|
|
||||||
assertTrue(offsetAtt.startOffset() + " does not equal: " + 71, offsetAtt.startOffset() == 71);
|
|
||||||
assertTrue(offsetAtt.endOffset() + " does not equal: " + 83, offsetAtt.endOffset() == 83);
|
|
||||||
|
|
||||||
assertTrue(tf.incrementToken());
|
|
||||||
assertTrue(termAtt.term() + " is not equal to " + "something",
|
|
||||||
termAtt.term().equals("something") == true);
|
|
||||||
assertTrue(offsetAtt.startOffset() + " does not equal: " + 86, offsetAtt.startOffset() == 86);
|
|
||||||
assertTrue(offsetAtt.endOffset() + " does not equal: " + 95, offsetAtt.endOffset() == 95);
|
|
||||||
|
|
||||||
assertTrue(tf.incrementToken());
|
|
||||||
assertTrue(termAtt.term() + " is not equal to " + "more italics",
|
|
||||||
termAtt.term().equals("more italics") == true);
|
|
||||||
assertTrue(offsetAtt.startOffset() + " does not equal: " + 98, offsetAtt.startOffset() == 98);
|
|
||||||
assertTrue(offsetAtt.endOffset() + " does not equal: " + 110, offsetAtt.endOffset() == 110);
|
|
||||||
|
|
||||||
assertTrue(tf.incrementToken());
|
|
||||||
assertTrue(termAtt.term() + " is not equal to " + "h i j",
|
|
||||||
termAtt.term().equals("h i j") == true);
|
|
||||||
assertTrue(offsetAtt.startOffset() + " does not equal: " + 124, offsetAtt.startOffset() == 124);
|
|
||||||
assertTrue(offsetAtt.endOffset() + " does not equal: " + 133, offsetAtt.endOffset() == 133);
|
|
||||||
|
|
||||||
assertFalse(tf.incrementToken());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public void testBoth() throws Exception {
|
public void testBoth() throws Exception {
|
||||||
|
@@ -348,211 +147,26 @@ public class WikipediaTokenizerTest extends BaseTokenStreamTestCase {
|
||||||
String test = "[[Category:a b c d]] [[Category:e f g]] [[link here]] [[link there]] ''italics here'' something ''more italics'' [[Category:h i j]]";
|
String test = "[[Category:a b c d]] [[Category:e f g]] [[link here]] [[link there]] ''italics here'' something ''more italics'' [[Category:h i j]]";
|
||||||
//should output all the individual tokens plus the untokenized tokens as well. Untokenized tokens
|
//should output all the individual tokens plus the untokenized tokens as well. Untokenized tokens
|
||||||
WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(test), WikipediaTokenizer.BOTH, untoks);
|
WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(test), WikipediaTokenizer.BOTH, untoks);
|
||||||
TermAttribute termAtt = tf.addAttribute(TermAttribute.class);
|
assertTokenStreamContents(tf,
|
||||||
TypeAttribute typeAtt = tf.addAttribute(TypeAttribute.class);
|
new String[] { "a b c d", "a", "b", "c", "d", "e f g", "e", "f", "g",
|
||||||
PositionIncrementAttribute posIncrAtt = tf.addAttribute(PositionIncrementAttribute.class);
|
"link", "here", "link", "there", "italics here", "italics", "here",
|
||||||
OffsetAttribute offsetAtt = tf.addAttribute(OffsetAttribute.class);
|
"something", "more italics", "more", "italics", "h i j", "h", "i", "j" },
|
||||||
|
new int[] { 11, 11, 13, 15, 17, 32, 32, 34, 36, 42, 47, 56, 61, 71, 71, 79, 86, 98, 98, 103, 124, 124, 128, 132 },
|
||||||
|
new int[] { 18, 12, 14, 16, 18, 37, 33, 35, 37, 46, 51, 60, 66, 83, 78, 83, 95, 110, 102, 110, 133, 125, 129, 133 },
|
||||||
|
new int[] { 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1 }
|
||||||
|
);
|
||||||
|
|
||||||
|
// now check the flags, TODO: add way to check flags from BaseTokenStreamTestCase?
|
||||||
|
tf = new WikipediaTokenizer(new StringReader(test), WikipediaTokenizer.BOTH, untoks);
|
||||||
|
int expectedFlags[] = new int[] { UNTOKENIZED_TOKEN_FLAG, 0, 0, 0, 0, UNTOKENIZED_TOKEN_FLAG, 0, 0, 0, 0,
|
||||||
|
0, 0, 0, UNTOKENIZED_TOKEN_FLAG, 0, 0, 0, UNTOKENIZED_TOKEN_FLAG, 0, 0, UNTOKENIZED_TOKEN_FLAG, 0, 0, 0 };
|
||||||
FlagsAttribute flagsAtt = tf.addAttribute(FlagsAttribute.class);
|
FlagsAttribute flagsAtt = tf.addAttribute(FlagsAttribute.class);
|
||||||
|
tf.reset();
|
||||||
assertTrue(tf.incrementToken());
|
for (int i = 0; i < expectedFlags.length; i++) {
|
||||||
assertTrue(termAtt.term() + " is not equal to " + "a b c d",
|
assertTrue(tf.incrementToken());
|
||||||
termAtt.term().equals("a b c d") == true);
|
assertEquals("flags " + i, expectedFlags[i], flagsAtt.getFlags());
|
||||||
assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
|
}
|
||||||
assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true);
|
|
||||||
assertTrue(flagsAtt.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, flagsAtt.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
|
|
||||||
assertTrue(offsetAtt.startOffset() + " does not equal: " + 11, offsetAtt.startOffset() == 11);
|
|
||||||
assertTrue(offsetAtt.endOffset() + " does not equal: " + 18, offsetAtt.endOffset() == 18);
|
|
||||||
|
|
||||||
assertTrue(tf.incrementToken());
|
|
||||||
assertTrue(termAtt.term() + " is not equal to " + "a",
|
|
||||||
termAtt.term().equals("a") == true);
|
|
||||||
assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 0, posIncrAtt.getPositionIncrement() == 0);
|
|
||||||
assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true);
|
|
||||||
assertTrue(flagsAtt.getFlags() + " equals: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG + " and it shouldn't", flagsAtt.getFlags() != WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
|
|
||||||
assertTrue(offsetAtt.startOffset() + " does not equal: " + 11, offsetAtt.startOffset() == 11);
|
|
||||||
assertTrue(offsetAtt.endOffset() + " does not equal: " + 12, offsetAtt.endOffset() == 12);
|
|
||||||
|
|
||||||
assertTrue(tf.incrementToken());
|
|
||||||
assertTrue(termAtt.term() + " is not equal to " + "b",
|
|
||||||
termAtt.term().equals("b") == true);
|
|
||||||
assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
|
|
||||||
assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true);
|
|
||||||
assertTrue(offsetAtt.startOffset() + " does not equal: " + 13, offsetAtt.startOffset() == 13);
|
|
||||||
assertTrue(offsetAtt.endOffset() + " does not equal: " + 14, offsetAtt.endOffset() == 14);
|
|
||||||
|
|
||||||
assertTrue(tf.incrementToken());
|
|
||||||
assertTrue(termAtt.term() + " is not equal to " + "c",
|
|
||||||
termAtt.term().equals("c") == true);
|
|
||||||
assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
|
|
||||||
assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true);
|
|
||||||
assertTrue(offsetAtt.startOffset() + " does not equal: " + 15, offsetAtt.startOffset() == 15);
|
|
||||||
assertTrue(offsetAtt.endOffset() + " does not equal: " + 16, offsetAtt.endOffset() == 16);
|
|
||||||
|
|
||||||
assertTrue(tf.incrementToken());
|
|
||||||
assertTrue(termAtt.term() + " is not equal to " + "d",
|
|
||||||
termAtt.term().equals("d") == true);
|
|
||||||
assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
|
|
||||||
assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true);
|
|
||||||
assertTrue(offsetAtt.startOffset() + " does not equal: " + 17, offsetAtt.startOffset() == 17);
|
|
||||||
assertTrue(offsetAtt.endOffset() + " does not equal: " + 18, offsetAtt.endOffset() == 18);
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
assertTrue(tf.incrementToken());
|
|
||||||
assertTrue(termAtt.term() + " is not equal to " + "e f g",
|
|
||||||
termAtt.term().equals("e f g") == true);
|
|
||||||
assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true);
|
|
||||||
assertTrue(flagsAtt.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, flagsAtt.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
|
|
||||||
assertTrue(offsetAtt.startOffset() + " does not equal: " + 32, offsetAtt.startOffset() == 32);
|
|
||||||
assertTrue(offsetAtt.endOffset() + " does not equal: " + 37, offsetAtt.endOffset() == 37);
|
|
||||||
|
|
||||||
assertTrue(tf.incrementToken());
|
|
||||||
assertTrue(termAtt.term() + " is not equal to " + "e",
|
|
||||||
termAtt.term().equals("e") == true);
|
|
||||||
assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true);
|
|
||||||
assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 0, posIncrAtt.getPositionIncrement() == 0);
|
|
||||||
assertTrue(offsetAtt.startOffset() + " does not equal: " + 32, offsetAtt.startOffset() == 32);
|
|
||||||
assertTrue(offsetAtt.endOffset() + " does not equal: " + 33, offsetAtt.endOffset() == 33);
|
|
||||||
|
|
||||||
assertTrue(tf.incrementToken());
|
|
||||||
assertTrue(termAtt.term() + " is not equal to " + "f",
|
|
||||||
termAtt.term().equals("f") == true);
|
|
||||||
assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true);
|
|
||||||
assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
|
|
||||||
assertTrue(offsetAtt.startOffset() + " does not equal: " + 34, offsetAtt.startOffset() == 34);
|
|
||||||
assertTrue(offsetAtt.endOffset() + " does not equal: " + 35, offsetAtt.endOffset() == 35);
|
|
||||||
|
|
||||||
assertTrue(tf.incrementToken());
|
|
||||||
assertTrue(termAtt.term() + " is not equal to " + "g",
|
|
||||||
termAtt.term().equals("g") == true);
|
|
||||||
assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true);
|
|
||||||
assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
|
|
||||||
assertTrue(offsetAtt.startOffset() + " does not equal: " + 36, offsetAtt.startOffset() == 36);
|
|
||||||
assertTrue(offsetAtt.endOffset() + " does not equal: " + 37, offsetAtt.endOffset() == 37);
|
|
||||||
|
|
||||||
assertTrue(tf.incrementToken());
|
|
||||||
assertTrue(termAtt.term() + " is not equal to " + "link",
|
|
||||||
termAtt.term().equals("link") == true);
|
|
||||||
assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
|
|
||||||
assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, typeAtt.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true);
|
|
||||||
assertTrue(offsetAtt.startOffset() + " does not equal: " + 42, offsetAtt.startOffset() == 42);
|
|
||||||
assertTrue(offsetAtt.endOffset() + " does not equal: " + 46, offsetAtt.endOffset() == 46);
|
|
||||||
|
|
||||||
assertTrue(tf.incrementToken());
|
|
||||||
assertTrue(termAtt.term() + " is not equal to " + "here",
|
|
||||||
termAtt.term().equals("here") == true);
|
|
||||||
assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
|
|
||||||
assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, typeAtt.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true);
|
|
||||||
assertTrue(offsetAtt.startOffset() + " does not equal: " + 47, offsetAtt.startOffset() == 47);
|
|
||||||
assertTrue(offsetAtt.endOffset() + " does not equal: " + 51, offsetAtt.endOffset() == 51);
|
|
||||||
|
|
||||||
assertTrue(tf.incrementToken());
|
|
||||||
assertTrue(termAtt.term() + " is not equal to " + "link",
|
|
||||||
termAtt.term().equals("link") == true);
|
|
||||||
assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
|
|
||||||
assertTrue(offsetAtt.startOffset() + " does not equal: " + 56, offsetAtt.startOffset() == 56);
|
|
||||||
assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, typeAtt.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true);
|
|
||||||
assertTrue(offsetAtt.endOffset() + " does not equal: " + 60, offsetAtt.endOffset() == 60);
|
|
||||||
|
|
||||||
assertTrue(tf.incrementToken());
|
|
||||||
assertTrue(termAtt.term() + " is not equal to " + "there",
|
|
||||||
termAtt.term().equals("there") == true);
|
|
||||||
assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
|
|
||||||
assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, typeAtt.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true);
|
|
||||||
assertTrue(offsetAtt.startOffset() + " does not equal: " + 61, offsetAtt.startOffset() == 61);
|
|
||||||
assertTrue(offsetAtt.endOffset() + " does not equal: " + 66, offsetAtt.endOffset() == 66);
|
|
||||||
|
|
||||||
assertTrue(tf.incrementToken());
|
|
||||||
assertTrue(termAtt.term() + " is not equal to " + "italics here",
|
|
||||||
termAtt.term().equals("italics here") == true);
|
|
||||||
assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
|
|
||||||
assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.ITALICS, typeAtt.type().equals(WikipediaTokenizer.ITALICS) == true);
|
|
||||||
assertTrue(flagsAtt.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, flagsAtt.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
|
|
||||||
assertTrue(offsetAtt.startOffset() + " does not equal: " + 71, offsetAtt.startOffset() == 71);
|
|
||||||
assertTrue(offsetAtt.endOffset() + " does not equal: " + 83, offsetAtt.endOffset() == 83);
|
|
||||||
|
|
||||||
assertTrue(tf.incrementToken());
|
|
||||||
assertTrue(termAtt.term() + " is not equal to " + "italics",
|
|
||||||
termAtt.term().equals("italics") == true);
|
|
||||||
assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 0, posIncrAtt.getPositionIncrement() == 0);
|
|
||||||
assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.ITALICS, typeAtt.type().equals(WikipediaTokenizer.ITALICS) == true);
|
|
||||||
assertTrue(offsetAtt.startOffset() + " does not equal: " + 71, offsetAtt.startOffset() == 71);
|
|
||||||
assertTrue(offsetAtt.endOffset() + " does not equal: " + 78, offsetAtt.endOffset() == 78);
|
|
||||||
|
|
||||||
assertTrue(tf.incrementToken());
|
|
||||||
assertTrue(termAtt.term() + " is not equal to " + "here",
|
|
||||||
termAtt.term().equals("here") == true);
|
|
||||||
assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
|
|
||||||
assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.ITALICS, typeAtt.type().equals(WikipediaTokenizer.ITALICS) == true);
|
|
||||||
assertTrue(offsetAtt.startOffset() + " does not equal: " + 79, offsetAtt.startOffset() == 79);
|
|
||||||
assertTrue(offsetAtt.endOffset() + " does not equal: " + 83, offsetAtt.endOffset() == 83);
|
|
||||||
|
|
||||||
assertTrue(tf.incrementToken());
|
|
||||||
assertTrue(termAtt.term() + " is not equal to " + "something",
|
|
||||||
termAtt.term().equals("something") == true);
|
|
||||||
assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
|
|
||||||
assertTrue(offsetAtt.startOffset() + " does not equal: " + 86, offsetAtt.startOffset() == 86);
|
|
||||||
assertTrue(offsetAtt.endOffset() + " does not equal: " + 95, offsetAtt.endOffset() == 95);
|
|
||||||
|
|
||||||
assertTrue(tf.incrementToken());
|
|
||||||
assertTrue(termAtt.term() + " is not equal to " + "more italics",
|
|
||||||
termAtt.term().equals("more italics") == true);
|
|
||||||
assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
|
|
||||||
assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.ITALICS, typeAtt.type().equals(WikipediaTokenizer.ITALICS) == true);
|
|
||||||
assertTrue(flagsAtt.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, flagsAtt.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
|
|
||||||
assertTrue(offsetAtt.startOffset() + " does not equal: " + 98, offsetAtt.startOffset() == 98);
|
|
||||||
assertTrue(offsetAtt.endOffset() + " does not equal: " + 110, offsetAtt.endOffset() == 110);
|
|
||||||
|
|
||||||
assertTrue(tf.incrementToken());
|
|
||||||
assertTrue(termAtt.term() + " is not equal to " + "more",
|
|
||||||
termAtt.term().equals("more") == true);
|
|
||||||
assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 0, posIncrAtt.getPositionIncrement() == 0);
|
|
||||||
assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.ITALICS, typeAtt.type().equals(WikipediaTokenizer.ITALICS) == true);
|
|
||||||
assertTrue(offsetAtt.startOffset() + " does not equal: " + 98, offsetAtt.startOffset() == 98);
|
|
||||||
assertTrue(offsetAtt.endOffset() + " does not equal: " + 102, offsetAtt.endOffset() == 102);
|
|
||||||
|
|
||||||
assertTrue(tf.incrementToken());
|
|
||||||
assertTrue(termAtt.term() + " is not equal to " + "italics",
|
|
||||||
termAtt.term().equals("italics") == true);
|
|
||||||
assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
|
|
||||||
assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.ITALICS, typeAtt.type().equals(WikipediaTokenizer.ITALICS) == true);
|
|
||||||
|
|
||||||
assertTrue(offsetAtt.startOffset() + " does not equal: " + 103, offsetAtt.startOffset() == 103);
|
|
||||||
assertTrue(offsetAtt.endOffset() + " does not equal: " + 110, offsetAtt.endOffset() == 110);
|
|
||||||
|
|
||||||
assertTrue(tf.incrementToken());
|
|
||||||
assertTrue(termAtt.term() + " is not equal to " + "h i j",
|
|
||||||
termAtt.term().equals("h i j") == true);
|
|
||||||
assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
|
|
||||||
assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true);
|
|
||||||
assertTrue(flagsAtt.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, flagsAtt.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
|
|
||||||
assertTrue(offsetAtt.startOffset() + " does not equal: " + 124, offsetAtt.startOffset() == 124);
|
|
||||||
assertTrue(offsetAtt.endOffset() + " does not equal: " + 133, offsetAtt.endOffset() == 133);
|
|
||||||
|
|
||||||
assertTrue(tf.incrementToken());
|
|
||||||
assertTrue(termAtt.term() + " is not equal to " + "h",
|
|
||||||
termAtt.term().equals("h") == true);
|
|
||||||
assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 0, posIncrAtt.getPositionIncrement() == 0);
|
|
||||||
assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true);
|
|
||||||
assertTrue(offsetAtt.startOffset() + " does not equal: " + 124, offsetAtt.startOffset() == 124);
|
|
||||||
assertTrue(offsetAtt.endOffset() + " does not equal: " + 125, offsetAtt.endOffset() == 125);
|
|
||||||
|
|
||||||
assertTrue(tf.incrementToken());
|
|
||||||
assertTrue(termAtt.term() + " is not equal to " + "i",
|
|
||||||
termAtt.term().equals("i") == true);
|
|
||||||
assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
|
|
||||||
assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true);
|
|
||||||
assertTrue(offsetAtt.startOffset() + " does not equal: " + 128, offsetAtt.startOffset() == 128);
|
|
||||||
assertTrue(offsetAtt.endOffset() + " does not equal: " + 129, offsetAtt.endOffset() == 129);
|
|
||||||
|
|
||||||
assertTrue(tf.incrementToken());
|
|
||||||
assertTrue(termAtt.term() + " is not equal to " + "j",
|
|
||||||
termAtt.term().equals("j") == true);
|
|
||||||
assertTrue(posIncrAtt.getPositionIncrement() + " does not equal: " + 1, posIncrAtt.getPositionIncrement() == 1);
|
|
||||||
assertTrue(typeAtt.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, typeAtt.type().equals(WikipediaTokenizer.CATEGORY) == true);
|
|
||||||
assertTrue(offsetAtt.startOffset() + " does not equal: " + 132, offsetAtt.startOffset() == 132);
|
|
||||||
assertTrue(offsetAtt.endOffset() + " does not equal: " + 133, offsetAtt.endOffset() == 133);
|
|
||||||
|
|
||||||
assertFalse(tf.incrementToken());
|
assertFalse(tf.incrementToken());
|
||||||
|
tf.close();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
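The TODO in the testBoth hunk notes that BaseTokenStreamTestCase cannot yet verify flags, hence the manual loop over expectedFlags. A hypothetical helper in the same spirit, a sketch only and not part of this commit:

import java.io.IOException;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;

final class FlagsAsserts {
  private FlagsAsserts() {}

  static void assertFlags(TokenStream ts, int[] expectedFlags) throws IOException {
    FlagsAttribute flagsAtt = ts.addAttribute(FlagsAttribute.class);
    ts.reset();
    for (int i = 0; i < expectedFlags.length; i++) {
      if (!ts.incrementToken()) {
        throw new AssertionError("token " + i + " is missing");
      }
      if (flagsAtt.getFlags() != expectedFlags[i]) {
        throw new AssertionError("flags " + i + ": expected " + expectedFlags[i]
            + " but was " + flagsAtt.getFlags());
      }
    }
    if (ts.incrementToken()) {
      throw new AssertionError("too many tokens");
    }
    ts.end();
    ts.close();
  }
}
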
Some files were not shown because too many files have changed in this diff.