mirror of https://github.com/apache/lucene.git
LUCENE-2285: Code cleanups to remove compiler warnings in eclipse.
git-svn-id: https://svn.apache.org/repos/asf/lucene/java/trunk@917019 13f79535-47bb-0310-9956-ffa450edef68
parent e358c3f2dd
commit efb74380fd
@@ -187,6 +187,8 @@ Optimizations
 * LUCENE-2195: Speedup CharArraySet if set is empty.
   (Simon Willnauer via Robert Muir)
 
+* LUCENE-2285: Code cleanup. (Shai Erera via Uwe Schindler)
+
 Build
 
 * LUCENE-2124: Moved the JDK-based collation support from contrib/collation

@@ -209,9 +211,10 @@ Test Cases
 
 * LUCENE-2170: Fix thread starvation problems. (Uwe Schindler)
 
-* LUCENE-2248, LUCENE-2251: Refactor tests to not use Version.LUCENE_CURRENT,
-  but instead use a global static value from LuceneTestCase(J4), that
-  contains the release version. (Uwe Schindler, Simon Willnauer)
+* LUCENE-2248, LUCENE-2251, LUCENE-2285: Refactor tests to not use
+  Version.LUCENE_CURRENT, but instead use a global static value
+  from LuceneTestCase(J4), that contains the release version.
+  (Uwe Schindler, Simon Willnauer, Shai Erera)
 
 ================== Release 2.9.2 / 3.0.1 2010-02-26 ====================
 
@@ -25,7 +25,6 @@ import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.LowerCaseFilter;
-import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
 import org.apache.lucene.analysis.StopFilter;

@@ -162,14 +161,16 @@ public final class ArabicAnalyzer extends StopwordAnalyzerBase {
     this(matchVersion, WordlistLoader.getWordSet( stopwords, STOPWORDS_COMMENT));
   }
 
 
   /**
-   * Creates {@link TokenStreamComponents} used to tokenize all the text in the provided {@link Reader}.
+   * Creates
+   * {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
+   * used to tokenize all the text in the provided {@link Reader}.
    *
-   * @return {@link TokenStreamComponents} built from an {@link ArabicLetterTokenizer} filtered with
-   *         {@link LowerCaseFilter}, {@link StopFilter}, {@link ArabicNormalizationFilter},
-   *         {@link KeywordMarkerTokenFilter} if a stem exclusion set is provided
-   *         and {@link ArabicStemFilter}.
+   * @return {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
+   *         built from an {@link ArabicLetterTokenizer} filtered with
+   *         {@link LowerCaseFilter}, {@link StopFilter},
+   *         {@link ArabicNormalizationFilter}, {@link KeywordMarkerTokenFilter}
+   *         if a stem exclusion set is provided and {@link ArabicStemFilter}.
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName,
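The change repeated across the analyzer files in this commit is the same: an import kept only so that a short {@link} tag in javadoc resolves is reported by Eclipse as an unused import, so the import is removed and the javadoc spells out the fully qualified nested type instead. A minimal sketch of the idea, using a hypothetical demo class that is not part of the commit:

// Before the cleanup, Eclipse flags this import as unused because only javadoc refers to it:
//   import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
// After the cleanup, the import is gone and the javadoc names the type in full.

/**
 * Demo only. See
 * {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
 * for the type the analyzers document this way.
 */
public final class JavadocLinkDemo {
  private JavadocLinkDemo() {}
}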
@@ -24,7 +24,6 @@ import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.LowerCaseFilter;
-import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
 import org.apache.lucene.analysis.StopFilter;

@@ -119,13 +118,16 @@ public final class BulgarianAnalyzer extends StopwordAnalyzerBase {
         matchVersion, stemExclusionSet)); }
 
   /**
-   * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
-   * {@link Reader}.
+   * Creates a
+   * {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
+   * which tokenizes all the text in the provided {@link Reader}.
    *
-   * @return A {@link TokenStreamComponents} built from an {@link StandardTokenizer}
-   *         filtered with {@link StandardFilter}, {@link LowerCaseFilter},
-   *         {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
-   *         exclusion set is provided and {@link BulgarianStemFilter}.
+   * @return A
+   *         {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
+   *         built from an {@link StandardTokenizer} filtered with
+   *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
+   *         , {@link KeywordMarkerTokenFilter} if a stem exclusion set is
+   *         provided and {@link BulgarianStemFilter}.
    */
   @Override
   public TokenStreamComponents createComponents(String fieldName, Reader reader) {
@@ -29,7 +29,6 @@ import java.util.Set;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.LowerCaseFilter;
-import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
 import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
 import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.StopwordAnalyzerBase;

@@ -191,12 +190,16 @@ public final class BrazilianAnalyzer extends StopwordAnalyzerBase {
     excltable = WordlistLoader.getWordSet( exclusionlist );
     setPreviousTokenStream(null); // force a new stemmer to be created
   }
 
   /**
-   * Creates {@link TokenStreamComponents} used to tokenize all the text in the provided {@link Reader}.
+   * Creates
+   * {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
+   * used to tokenize all the text in the provided {@link Reader}.
    *
-   * @return {@link TokenStreamComponents} built from a {@link StandardTokenizer} filtered with
-   *         {@link LowerCaseFilter}, {@link StandardFilter}, {@link StopFilter}, and
-   *         {@link BrazilianStemFilter}.
+   * @return {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
+   *         built from a {@link StandardTokenizer} filtered with
+   *         {@link LowerCaseFilter}, {@link StandardFilter}, {@link StopFilter}
+   *         , and {@link BrazilianStemFilter}.
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName,
@@ -19,7 +19,6 @@ package org.apache.lucene.analysis.cjk;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.CharArraySet;
-import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
 import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.Tokenizer;
@@ -20,7 +20,6 @@ package org.apache.lucene.analysis.cn;
 import java.io.Reader;
 
 import org.apache.lucene.analysis.ReusableAnalyzerBase;
-import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
 import org.apache.lucene.analysis.standard.StandardAnalyzer; // javadoc @link
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.Tokenizer;

@@ -35,11 +34,13 @@ import org.apache.lucene.analysis.Tokenizer;
 public final class ChineseAnalyzer extends ReusableAnalyzerBase {
 
   /**
-   * Creates {@link TokenStreamComponents} used to tokenize all the text in the
-   * provided {@link Reader}.
+   * Creates
+   * {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
+   * used to tokenize all the text in the provided {@link Reader}.
    *
-   * @return {@link TokenStreamComponents} built from a
-   *         {@link ChineseTokenizer} filtered with {@link ChineseFilter}
+   * @return {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
+   *         built from a {@link ChineseTokenizer} filtered with
+   *         {@link ChineseFilter}
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName,
@@ -113,7 +113,7 @@ public class HyphenationCompoundWordTokenFilter extends
    *          strings.
    */
   public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
-      HyphenationTree hyphenator, Set dictionary) {
+      HyphenationTree hyphenator, Set<?> dictionary) {
     this(input, hyphenator, dictionary, DEFAULT_MIN_WORD_SIZE,
         DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
   }

@@ -145,7 +145,7 @@ public class HyphenationCompoundWordTokenFilter extends
    *          Add only the longest matching subword to the stream
    */
   public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
-      HyphenationTree hyphenator, Set dictionary, int minWordSize,
+      HyphenationTree hyphenator, Set<?> dictionary, int minWordSize,
       int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
     super(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize,
         onlyLongestMatch);

@@ -201,7 +201,7 @@ public class HyphenationCompoundWordTokenFilter extends
    */
   @Deprecated
   public HyphenationCompoundWordTokenFilter(TokenStream input,
-      HyphenationTree hyphenator, Set dictionary) {
+      HyphenationTree hyphenator, Set<?> dictionary) {
     this(Version.LUCENE_30, input, hyphenator, dictionary, DEFAULT_MIN_WORD_SIZE,
         DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
   }

@@ -223,7 +223,7 @@ public class HyphenationCompoundWordTokenFilter extends
    */
   @Deprecated
   public HyphenationCompoundWordTokenFilter(TokenStream input,
-      HyphenationTree hyphenator, Set dictionary, int minWordSize,
+      HyphenationTree hyphenator, Set<?> dictionary, int minWordSize,
       int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
     super(Version.LUCENE_30, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize,
         onlyLongestMatch);
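These four constructors change a raw Set parameter to the unbounded wildcard Set<?>, which removes the raw-type warning without forcing callers onto a specific element type. A rough sketch of the difference, using a hypothetical helper rather than the real filter:

import java.util.HashSet;
import java.util.Set;

public final class RawTypeDemo {
  // Raw type: the compiler reports a raw-type warning for this signature.
  static int sizeRaw(Set dictionary) {
    return dictionary.size();
  }

  // Unbounded wildcard: still accepts a set of anything, no warning, and the
  // compiler prevents adding arbitrary elements through this reference.
  static int sizeWildcard(Set<?> dictionary) {
    return dictionary.size();
  }

  public static void main(String[] args) {
    Set<String> words = new HashSet<String>();
    words.add("lucene");
    System.out.println(sizeRaw(words) + " " + sizeWildcard(words));
  }
}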
@@ -83,7 +83,7 @@ public class CharVector implements Cloneable, Serializable {
 
   @Override
   public Object clone() {
-    CharVector cv = new CharVector((char[]) array.clone(), blockSize);
+    CharVector cv = new CharVector(array.clone(), blockSize);
     cv.n = this.n;
     return cv;
   }
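Since Java 5, clone() on an array has a covariant return type: char[].clone() is already typed as char[], so the explicit cast is redundant and Eclipse flags it. A small self-contained sketch of what the CharVector, TernaryTree and n-gram filter changes below all do:

public final class ArrayCloneDemo {
  public static void main(String[] args) {
    char[] original = {'a', 'b', 'c'};

    // Pre-Java-5 style: the cast is unnecessary and reported as such.
    char[] copyWithCast = (char[]) original.clone();

    // Covariant return type: no cast needed, no warning.
    char[] copy = original.clone();

    System.out.println(new String(copyWithCast) + " " + new String(copy));
  }
}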
@@ -26,11 +26,6 @@ public class Hyphenation {
 
   private int[] hyphenPoints;
 
-  /**
-   * number of hyphenation points in word
-   */
-  private int len;
-
   /**
    * rawWord as made of alternating strings and {@link Hyphen Hyphen} instances
    */
@@ -44,7 +44,7 @@ public class HyphenationTree extends TernaryTree implements PatternConsumer,
   /**
    * This map stores hyphenation exceptions
    */
-  protected HashMap<String,ArrayList> stoplist;
+  protected HashMap<String,ArrayList<Object>> stoplist;
 
   /**
    * This map stores the character classes

@@ -57,7 +57,7 @@ public class HyphenationTree extends TernaryTree implements PatternConsumer,
   private transient TernaryTree ivalues;
 
   public HyphenationTree() {
-    stoplist = new HashMap<String,ArrayList>(23); // usually a small table
+    stoplist = new HashMap<String,ArrayList<Object>>(23); // usually a small table
     classmap = new TernaryTree();
     vspace = new ByteVector();
     vspace.alloc(1); // this reserves index 0, which we don't use

@@ -363,7 +363,7 @@ public class HyphenationTree extends TernaryTree implements PatternConsumer,
     if (stoplist.containsKey(sw)) {
       // assume only simple hyphens (Hyphen.pre="-", Hyphen.post = Hyphen.no =
       // null)
-      ArrayList hw = stoplist.get(sw);
+      ArrayList<Object> hw = stoplist.get(sw);
       int j = 0;
       for (i = 0; i < hw.size(); i++) {
         Object o = hw.get(i);

@@ -443,7 +443,7 @@ public class HyphenationTree extends TernaryTree implements PatternConsumer,
    * @param hyphenatedword a vector of alternating strings and
    *        {@link Hyphen hyphen} objects.
    */
-  public void addException(String word, ArrayList hyphenatedword) {
+  public void addException(String word, ArrayList<Object> hyphenatedword) {
     stoplist.put(word, hyphenatedword);
   }
 
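Where the collections are actually read and written (the stoplist map and the exception lists), the cleanup parameterizes them rather than using wildcards; the lists hold a mix of strings and Hyphen markers, so Object is the honest element type. A hedged sketch of the same shape with stand-in names, not the real HyphenationTree:

import java.util.ArrayList;
import java.util.HashMap;

public final class GenerifyFieldsDemo {
  // Before: HashMap<String, ArrayList> — every get/put raises raw-type warnings.
  // After: the element type is spelled out, even if it can only be Object.
  private final HashMap<String, ArrayList<Object>> stoplist =
      new HashMap<String, ArrayList<Object>>();

  void addException(String word, ArrayList<Object> hyphenatedWord) {
    stoplist.put(word, hyphenatedWord);
  }

  public static void main(String[] args) {
    GenerifyFieldsDemo demo = new GenerifyFieldsDemo();
    ArrayList<Object> parts = new ArrayList<Object>();
    parts.add("hy");
    parts.add("phen");
    demo.addException("hyphen", parts);
    System.out.println(demo.stoplist);
  }
}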
@@ -42,7 +42,7 @@ public interface PatternConsumer {
    * his own hyphenation. A hyphenatedword is a vector of alternating String's
    * and {@link Hyphen Hyphen} instances
    */
-  void addException(String word, ArrayList hyphenatedword);
+  void addException(String word, ArrayList<Object> hyphenatedword);
 
   /**
    * Add hyphenation patterns.
@@ -51,7 +51,7 @@ public class PatternParser extends DefaultHandler implements PatternConsumer {
 
   StringBuilder token;
 
-  ArrayList exception;
+  ArrayList<Object> exception;
 
   char hyphenChar;
 

@@ -199,8 +199,8 @@ public class PatternParser extends DefaultHandler implements PatternConsumer {
     return pat.toString();
   }
 
-  protected ArrayList normalizeException(ArrayList ex) {
-    ArrayList res = new ArrayList();
+  protected ArrayList<Object> normalizeException(ArrayList<?> ex) {
+    ArrayList<Object> res = new ArrayList<Object>();
     for (int i = 0; i < ex.size(); i++) {
       Object item = ex.get(i);
       if (item instanceof String) {

@@ -230,7 +230,7 @@ public class PatternParser extends DefaultHandler implements PatternConsumer {
     return res;
   }
 
-  protected String getExceptionWord(ArrayList ex) {
+  protected String getExceptionWord(ArrayList<?> ex) {
     StringBuilder res = new StringBuilder();
     for (int i = 0; i < ex.size(); i++) {
       Object item = ex.get(i);

@@ -291,7 +291,7 @@ public class PatternParser extends DefaultHandler implements PatternConsumer {
       currElement = ELEM_PATTERNS;
     } else if (local.equals("exceptions")) {
       currElement = ELEM_EXCEPTIONS;
-      exception = new ArrayList();
+      exception = new ArrayList<Object>();
     } else if (local.equals("hyphen")) {
       if (token.length() > 0) {
         exception.add(token.toString());

@@ -308,6 +308,7 @@ public class PatternParser extends DefaultHandler implements PatternConsumer {
    * java.lang.String, java.lang.String)
    */
   @Override
+  @SuppressWarnings("unchecked")
   public void endElement(String uri, String local, String raw) {
 
     if (token.length() > 0) {

@@ -344,6 +345,7 @@ public class PatternParser extends DefaultHandler implements PatternConsumer {
   /**
    * @see org.xml.sax.ContentHandler#characters(char[], int, int)
    */
+  @SuppressWarnings("unchecked")
   @Override
   public void characters(char ch[], int start, int length) {
     StringBuffer chars = new StringBuffer(length);

@@ -428,7 +430,7 @@ public class PatternParser extends DefaultHandler implements PatternConsumer {
       System.out.println("class: " + c);
     }
 
-  public void addException(String w, ArrayList e) {
+  public void addException(String w, ArrayList<Object> e) {
     System.out.println("exception: " + w + " : " + e.toString());
   }
 
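The two SAX callbacks above gain @SuppressWarnings("unchecked") instead of a type change, presumably because they make unchecked calls that cannot be expressed cleanly against the surrounding raw collections. Roughly, the annotation confines the warning to a single method rather than the whole class; a hypothetical example, not the parser itself:

import java.util.ArrayList;

public final class SuppressUncheckedDemo {
  // Stand-in for legacy code that still returns a raw list.
  static ArrayList legacyList() {
    return new ArrayList();
  }

  // The unchecked conversion below would normally be reported; the annotation
  // scopes the suppression to this one method.
  @SuppressWarnings("unchecked")
  static ArrayList<Object> adopt() {
    ArrayList<Object> items = legacyList();
    items.add("token");
    return items;
  }

  public static void main(String[] args) {
    System.out.println(adopt());
  }
}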
@@ -351,10 +351,10 @@ public class TernaryTree implements Cloneable, Serializable {
   @Override
   public Object clone() {
     TernaryTree t = new TernaryTree();
-    t.lo = (char[]) this.lo.clone();
-    t.hi = (char[]) this.hi.clone();
-    t.eq = (char[]) this.eq.clone();
-    t.sc = (char[]) this.sc.clone();
+    t.lo = this.lo.clone();
+    t.hi = this.hi.clone();
+    t.eq = this.eq.clone();
+    t.sc = this.sc.clone();
     t.kv = (CharVector) this.kv.clone();
     t.root = this.root;
     t.freenode = this.freenode;
@@ -18,7 +18,6 @@ package org.apache.lucene.analysis.cz;
  */
 
 import org.apache.lucene.analysis.ReusableAnalyzerBase;
-import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.KeywordMarkerTokenFilter;

@@ -216,16 +215,20 @@ public final class CzechAnalyzer extends ReusableAnalyzerBase {
       stoptable = Collections.emptySet();
     }
   }
 
   /**
-   * Creates {@link TokenStreamComponents} used to tokenize all the text in the provided
-   * {@link Reader}.
+   * Creates
+   * {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
+   * used to tokenize all the text in the provided {@link Reader}.
    *
-   * @return {@link TokenStreamComponents} built from a {@link StandardTokenizer}
-   *         filtered with {@link StandardFilter}, {@link LowerCaseFilter},
-   *         {@link StopFilter}, and {@link CzechStemFilter} (only if version is
-   *         >= LUCENE_31). If a version is >= LUCENE_31 and a stem exclusion set
-   *         is provided via {@link #CzechAnalyzer(Version, Set, Set)} a
-   *         {@link KeywordMarkerTokenFilter} is added before {@link CzechStemFilter}.
+   * @return {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
+   *         built from a {@link StandardTokenizer} filtered with
+   *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
+   *         , and {@link CzechStemFilter} (only if version is >= LUCENE_31). If
+   *         a version is >= LUCENE_31 and a stem exclusion set is provided via
+   *         {@link #CzechAnalyzer(Version, Set, Set)} a
+   *         {@link KeywordMarkerTokenFilter} is added before
+   *         {@link CzechStemFilter}.
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName,
@@ -30,7 +30,6 @@ import org.apache.lucene.analysis.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.WordlistLoader;
-import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
 import org.apache.lucene.analysis.snowball.SnowballFilter;
 import org.apache.lucene.analysis.standard.StandardFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;

@@ -106,13 +105,16 @@ public final class DanishAnalyzer extends StopwordAnalyzerBase {
   }
 
   /**
-   * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
-   * {@link Reader}.
+   * Creates a
+   * {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
+   * which tokenizes all the text in the provided {@link Reader}.
    *
-   * @return A {@link TokenStreamComponents} built from an {@link StandardTokenizer}
-   *         filtered with {@link StandardFilter}, {@link LowerCaseFilter},
-   *         {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
-   *         exclusion set is provided and {@link SnowballFilter}.
+   * @return A
+   *         {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
+   *         built from an {@link StandardTokenizer} filtered with
+   *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
+   *         , {@link KeywordMarkerTokenFilter} if a stem exclusion set is
+   *         provided and {@link SnowballFilter}.
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName,
@@ -29,7 +29,6 @@ import java.util.Set;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.LowerCaseFilter;
-import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
 import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
 import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.StopwordAnalyzerBase;

@@ -224,14 +223,15 @@ public final class GermanAnalyzer extends StopwordAnalyzerBase {
   }
 
   /**
-   * Creates {@link TokenStreamComponents} used to tokenize all the text in the
-   * provided {@link Reader}.
+   * Creates
+   * {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
+   * used to tokenize all the text in the provided {@link Reader}.
    *
-   * @return {@link TokenStreamComponents} built from a
-   *         {@link StandardTokenizer} filtered with {@link StandardFilter},
-   *         {@link LowerCaseFilter}, {@link StopFilter},
-   *         {@link KeywordMarkerTokenFilter} if a stem exclusion set is provided, and
-   *         {@link SnowballFilter}
+   * @return {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
+   *         built from a {@link StandardTokenizer} filtered with
+   *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
+   *         , {@link KeywordMarkerTokenFilter} if a stem exclusion set is
+   *         provided, and {@link SnowballFilter}
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName,
@@ -19,7 +19,6 @@ package org.apache.lucene.analysis.el;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.CharArraySet;
-import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
 import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.TokenStream;

@@ -122,12 +121,14 @@ public final class GreekAnalyzer extends StopwordAnalyzerBase
   }
 
   /**
-   * Creates {@link TokenStreamComponents} used to tokenize all the text in the
-   * provided {@link Reader}.
+   * Creates
+   * {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
+   * used to tokenize all the text in the provided {@link Reader}.
    *
-   * @return {@link TokenStreamComponents} built from a
-   *         {@link StandardTokenizer} filtered with
-   *         {@link GreekLowerCaseFilter}, {@link StandardFilter} and {@link StopFilter}
+   * @return {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
+   *         built from a {@link StandardTokenizer} filtered with
+   *         {@link GreekLowerCaseFilter}, {@link StandardFilter} and
+   *         {@link StopFilter}
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName,
@@ -29,7 +29,6 @@ import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.analysis.standard.StandardFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;

@@ -90,13 +89,16 @@ public final class EnglishAnalyzer extends StopwordAnalyzerBase {
   }
 
   /**
-   * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
-   * {@link Reader}.
+   * Creates a
+   * {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
+   * which tokenizes all the text in the provided {@link Reader}.
    *
-   * @return A {@link TokenStreamComponents} built from an {@link StandardTokenizer}
-   *         filtered with {@link StandardFilter}, {@link LowerCaseFilter},
-   *         {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
-   *         exclusion set is provided and {@link PorterStemFilter}.
+   * @return A
+   *         {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
+   *         built from an {@link StandardTokenizer} filtered with
+   *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
+   *         , {@link KeywordMarkerTokenFilter} if a stem exclusion set is
+   *         provided and {@link PorterStemFilter}.
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName,
@@ -30,7 +30,6 @@ import org.apache.lucene.analysis.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.WordlistLoader;
-import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
 import org.apache.lucene.analysis.snowball.SnowballFilter;
 import org.apache.lucene.analysis.standard.StandardFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;

@@ -106,13 +105,16 @@ public final class SpanishAnalyzer extends StopwordAnalyzerBase {
   }
 
   /**
-   * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
-   * {@link Reader}.
+   * Creates a
+   * {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
+   * which tokenizes all the text in the provided {@link Reader}.
    *
-   * @return A {@link TokenStreamComponents} built from an {@link StandardTokenizer}
-   *         filtered with {@link StandardFilter}, {@link LowerCaseFilter},
-   *         {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
-   *         exclusion set is provided and {@link SnowballFilter}.
+   * @return A
+   *         {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
+   *         built from an {@link StandardTokenizer} filtered with
+   *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
+   *         , {@link KeywordMarkerTokenFilter} if a stem exclusion set is
+   *         provided and {@link SnowballFilter}.
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName,
@@ -25,7 +25,6 @@ import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.LowerCaseFilter;
-import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
 import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.TokenStream;

@@ -136,12 +135,13 @@ public final class PersianAnalyzer extends StopwordAnalyzerBase {
   }
 
   /**
-   * Creates {@link TokenStreamComponents} used to tokenize all the text in the provided
-   * {@link Reader}.
+   * Creates
+   * {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
+   * used to tokenize all the text in the provided {@link Reader}.
    *
-   * @return {@link TokenStreamComponents} built from a {@link ArabicLetterTokenizer}
-   *         filtered with {@link LowerCaseFilter},
-   *         {@link ArabicNormalizationFilter},
+   * @return {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
+   *         built from a {@link ArabicLetterTokenizer} filtered with
+   *         {@link LowerCaseFilter}, {@link ArabicNormalizationFilter},
    *         {@link PersianNormalizationFilter} and Persian Stop words
    */
   @Override
@@ -30,7 +30,6 @@ import org.apache.lucene.analysis.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.WordlistLoader;
-import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
 import org.apache.lucene.analysis.snowball.SnowballFilter;
 import org.apache.lucene.analysis.standard.StandardFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;

@@ -106,13 +105,16 @@ public final class FinnishAnalyzer extends StopwordAnalyzerBase {
   }
 
   /**
-   * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
-   * {@link Reader}.
+   * Creates a
+   * {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
+   * which tokenizes all the text in the provided {@link Reader}.
    *
-   * @return A {@link TokenStreamComponents} built from an {@link StandardTokenizer}
-   *         filtered with {@link StandardFilter}, {@link LowerCaseFilter},
-   *         {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
-   *         exclusion set is provided and {@link SnowballFilter}.
+   * @return A
+   *         {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
+   *         built from an {@link StandardTokenizer} filtered with
+   *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
+   *         , {@link KeywordMarkerTokenFilter} if a stem exclusion set is
+   *         provided and {@link SnowballFilter}.
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName,
@@ -20,7 +20,6 @@ package org.apache.lucene.analysis.fr;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.LowerCaseFilter;
-import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
 import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
 import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.StopwordAnalyzerBase;

@@ -225,14 +224,16 @@ public final class FrenchAnalyzer extends StopwordAnalyzerBase {
   }
 
   /**
-   * Creates {@link TokenStreamComponents} used to tokenize all the text in the provided
-   * {@link Reader}.
+   * Creates
+   * {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
+   * used to tokenize all the text in the provided {@link Reader}.
    *
-   * @return {@link TokenStreamComponents} built from a {@link StandardTokenizer}
-   *         filtered with {@link StandardFilter}, {@link ElisionFilter},
+   * @return {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
+   *         built from a {@link StandardTokenizer} filtered with
+   *         {@link StandardFilter}, {@link ElisionFilter},
    *         {@link LowerCaseFilter}, {@link StopFilter},
-   *         {@link KeywordMarkerTokenFilter} if a stem exclusion set is provided,
-   *         and {@link SnowballFilter}
+   *         {@link KeywordMarkerTokenFilter} if a stem exclusion set is
+   *         provided, and {@link SnowballFilter}
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName,
@@ -22,7 +22,6 @@ import java.io.Reader;
 import java.util.Set;
 
 import org.apache.lucene.analysis.LowerCaseFilter;
-import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
 import org.apache.lucene.analysis.StopFilter;

@@ -106,15 +105,16 @@ public final class HindiAnalyzer extends StopwordAnalyzerBase {
   }
 
   /**
-   * Creates {@link TokenStreamComponents} used to tokenize all the text in the provided
-   * {@link Reader}.
+   * Creates
+   * {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
+   * used to tokenize all the text in the provided {@link Reader}.
    *
-   * @return {@link TokenStreamComponents} built from a {@link IndicTokenizer}
-   *         filtered with {@link LowerCaseFilter},
-   *         {@link IndicNormalizationFilter},
-   *         {@link HindiNormalizationFilter},
-   *         {@link KeywordMarkerTokenFilter} if a stem exclusion set is provided,
-   *         {@link HindiStemFilter}, and Hindi Stop words
+   * @return {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
+   *         built from a {@link IndicTokenizer} filtered with
+   *         {@link LowerCaseFilter}, {@link IndicNormalizationFilter},
+   *         {@link HindiNormalizationFilter}, {@link KeywordMarkerTokenFilter}
+   *         if a stem exclusion set is provided, {@link HindiStemFilter}, and
+   *         Hindi Stop words
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName,
@@ -30,7 +30,6 @@ import org.apache.lucene.analysis.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.WordlistLoader;
-import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
 import org.apache.lucene.analysis.snowball.SnowballFilter;
 import org.apache.lucene.analysis.standard.StandardFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;

@@ -106,13 +105,16 @@ public final class HungarianAnalyzer extends StopwordAnalyzerBase {
   }
 
   /**
-   * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
-   * {@link Reader}.
+   * Creates a
+   * {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
+   * which tokenizes all the text in the provided {@link Reader}.
    *
-   * @return A {@link TokenStreamComponents} built from an {@link StandardTokenizer}
-   *         filtered with {@link StandardFilter}, {@link LowerCaseFilter},
-   *         {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
-   *         exclusion set is provided and {@link SnowballFilter}.
+   * @return A
+   *         {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
+   *         built from an {@link StandardTokenizer} filtered with
+   *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
+   *         , {@link KeywordMarkerTokenFilter} if a stem exclusion set is
+   *         provided and {@link SnowballFilter}.
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName,
@@ -30,7 +30,6 @@ import org.apache.lucene.analysis.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.WordlistLoader;
-import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
 import org.apache.lucene.analysis.snowball.SnowballFilter;
 import org.apache.lucene.analysis.standard.StandardFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;

@@ -106,13 +105,16 @@ public final class ItalianAnalyzer extends StopwordAnalyzerBase {
   }
 
   /**
-   * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
-   * {@link Reader}.
+   * Creates a
+   * {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
+   * which tokenizes all the text in the provided {@link Reader}.
    *
-   * @return A {@link TokenStreamComponents} built from an {@link StandardTokenizer}
-   *         filtered with {@link StandardFilter}, {@link LowerCaseFilter},
-   *         {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
-   *         exclusion set is provided and {@link SnowballFilter}.
+   * @return A
+   *         {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
+   *         built from an {@link StandardTokenizer} filtered with
+   *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
+   *         , {@link KeywordMarkerTokenFilter} if a stem exclusion set is
+   *         provided and {@link SnowballFilter}.
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName,
@@ -311,7 +311,7 @@ public final class PatternAnalyzer extends Analyzer {
 
       return new String(output, 0, len);
     } finally {
-      if (input != null) input.close();
+      input.close();
     }
   }
 
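In PatternAnalyzer the guard on input is dropped in the finally block, presumably because the reference is provably non-null at that point and the check was flagged as dead code. A sketch of the pattern with a hypothetical reader, not the analyzer itself:

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;

public final class RedundantNullCheckDemo {
  static String drain(String text) throws IOException {
    Reader input = new StringReader(text); // assigned once, never null afterwards
    try {
      StringBuilder out = new StringBuilder();
      int c;
      while ((c = input.read()) != -1) {
        out.append((char) c);
      }
      return out.toString();
    } finally {
      // Before: if (input != null) input.close();  -- flagged, input cannot be null here.
      input.close();
    }
  }

  public static void main(String[] args) throws IOException {
    System.out.println(drain("lucene"));
  }
}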
@@ -124,7 +124,7 @@ public final class EdgeNGramTokenFilter extends TokenFilter {
       if (!input.incrementToken()) {
         return false;
       } else {
-        curTermBuffer = (char[]) termAtt.termBuffer().clone();
+        curTermBuffer = termAtt.termBuffer().clone();
         curTermLength = termAtt.termLength();
         curGramSize = minGram;
         tokStart = offsetAtt.startOffset();
@@ -79,7 +79,7 @@ public final class NGramTokenFilter extends TokenFilter {
       if (!input.incrementToken()) {
         return false;
       } else {
-        curTermBuffer = (char[]) termAtt.termBuffer().clone();
+        curTermBuffer = termAtt.termBuffer().clone();
        curTermLength = termAtt.termLength();
         curGramSize = minGram;
         curPos = 0;
@@ -30,7 +30,6 @@ import org.apache.lucene.analysis.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.WordlistLoader;
-import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
 import org.apache.lucene.analysis.snowball.SnowballFilter;
 import org.apache.lucene.analysis.standard.StandardFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;

@@ -106,13 +105,16 @@ public final class NorwegianAnalyzer extends StopwordAnalyzerBase {
   }
 
   /**
-   * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
-   * {@link Reader}.
+   * Creates a
+   * {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
+   * which tokenizes all the text in the provided {@link Reader}.
    *
-   * @return A {@link TokenStreamComponents} built from an {@link StandardTokenizer}
-   *         filtered with {@link StandardFilter}, {@link LowerCaseFilter},
-   *         {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
-   *         exclusion set is provided and {@link SnowballFilter}.
+   * @return A
+   *         {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
+   *         built from an {@link StandardTokenizer} filtered with
+   *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
+   *         , {@link KeywordMarkerTokenFilter} if a stem exclusion set is
+   *         provided and {@link SnowballFilter}.
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName,
@@ -30,7 +30,6 @@ import org.apache.lucene.analysis.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.WordlistLoader;
-import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
 import org.apache.lucene.analysis.snowball.SnowballFilter;
 import org.apache.lucene.analysis.standard.StandardFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;

@@ -106,13 +105,16 @@ public final class PortugueseAnalyzer extends StopwordAnalyzerBase {
   }
 
   /**
-   * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
-   * {@link Reader}.
+   * Creates a
+   * {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
+   * which tokenizes all the text in the provided {@link Reader}.
    *
-   * @return A {@link TokenStreamComponents} built from an {@link StandardTokenizer}
-   *         filtered with {@link StandardFilter}, {@link LowerCaseFilter},
-   *         {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
-   *         exclusion set is provided and {@link SnowballFilter}.
+   * @return A
+   *         {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
+   *         built from an {@link StandardTokenizer} filtered with
+   *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
+   *         , {@link KeywordMarkerTokenFilter} if a stem exclusion set is
+   *         provided and {@link SnowballFilter}.
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName,
@@ -192,7 +192,7 @@ public final class QueryAutoStopWordAnalyzer extends Analyzer {
      * if there stopwords, it is a StopFilter around wrapped.
      */
     TokenStream withStopFilter;
-  };
+  }
 
   @Override
   public TokenStream reusableTokenStream(String fieldName, Reader reader)
@@ -29,7 +29,6 @@ import org.apache.lucene.analysis.StopFilter;
 import org.apache.lucene.analysis.StopwordAnalyzerBase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
 import org.apache.lucene.analysis.snowball.SnowballFilter;
 import org.apache.lucene.analysis.standard.StandardFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;

@@ -110,13 +109,16 @@ public final class RomanianAnalyzer extends StopwordAnalyzerBase {
   }
 
   /**
-   * Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
-   * {@link Reader}.
+   * Creates a
+   * {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
+   * which tokenizes all the text in the provided {@link Reader}.
    *
-   * @return A {@link TokenStreamComponents} built from an {@link StandardTokenizer}
-   *         filtered with {@link StandardFilter}, {@link LowerCaseFilter},
-   *         {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
-   *         exclusion set is provided and {@link SnowballFilter}.
+   * @return A
+   *         {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
+   *         built from an {@link StandardTokenizer} filtered with
+   *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
+   *         , {@link KeywordMarkerTokenFilter} if a stem exclusion set is
+   *         provided and {@link SnowballFilter}.
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName,
@@ -26,7 +26,6 @@ import java.util.Set;
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.CharArraySet;
 import org.apache.lucene.analysis.LowerCaseFilter;
-import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
 import org.apache.lucene.analysis.snowball.SnowballFilter;
 import org.apache.lucene.analysis.standard.StandardFilter;
 import org.apache.lucene.analysis.standard.StandardTokenizer;

@@ -161,14 +160,15 @@ public final class RussianAnalyzer extends StopwordAnalyzerBase
   }
 
   /**
-   * Creates {@link TokenStreamComponents} used to tokenize all the text in the
-   * provided {@link Reader}.
+   * Creates
+   * {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
+   * used to tokenize all the text in the provided {@link Reader}.
    *
-   * @return {@link TokenStreamComponents} built from a
-   *         {@link StandardTokenizer} filtered with {@link StandardFilter},
-   *         {@link LowerCaseFilter}, {@link StopFilter},
-   *         {@link KeywordMarkerTokenFilter} if a stem exclusion set is provided,
-   *         and {@link SnowballFilter}
+   * @return {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
+   *         built from a {@link StandardTokenizer} filtered with
+   *         {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
+   *         , {@link KeywordMarkerTokenFilter} if a stem exclusion set is
+   *         provided, and {@link SnowballFilter}
    */
   @Override
   protected TokenStreamComponents createComponents(String fieldName,
@@ -26,7 +26,7 @@ package org.apache.lucene.analysis.ru;
class RussianStemmer
{
// positions of RV, R1 and R2 respectively
-private int RV, R1, R2;
+private int RV, /*R1,*/ R2;

// letters (currently unused letters are commented out)
private final static char A = '\u0430';
@@ -263,11 +263,7 @@ class RussianStemmer
if (!findAndRemoveEnding(stemmingZone, adjectiveEndings))
return false;
// if adjective ending was found, try for participle ending.
-// variable r is unused, we are just interested in the side effect of
-// findAndRemoveEnding():
-boolean r =
-findAndRemoveEnding(stemmingZone, participleEndings1, participle1Predessors)
-||
+if (!findAndRemoveEnding(stemmingZone, participleEndings1, participle1Predessors))
findAndRemoveEnding(stemmingZone, participleEndings2);
return true;
}
@@ -391,7 +387,7 @@ class RussianStemmer
private void markPositions(String word)
{
RV = 0;
-R1 = 0;
+// R1 = 0;
R2 = 0;
int i = 0;
// find RV
@@ -409,7 +405,7 @@ class RussianStemmer
}
if (word.length() - 1 < ++i)
return; // R1 zone is empty
-R1 = i;
+// R1 = i;
// find R2
while (word.length() > i && !isVowel(word.charAt(i)))
{
@@ -532,13 +528,9 @@ class RussianStemmer
if (!perfectiveGerund(stemmingZone))
{
reflexive(stemmingZone);
-// variable r is unused, we are just interested in the flow that gets
-// created by logical expression: apply adjectival(); if that fails,
-// apply verb() etc
-boolean r =
-adjectival(stemmingZone)
-|| verb(stemmingZone)
-|| noun(stemmingZone);
+if (!adjectival(stemmingZone))
+if (!verb(stemmingZone))
+noun(stemmingZone);
}
// Step 2
removeI(stemmingZone);
@@ -391,8 +391,8 @@ public final class ShingleFilter extends TokenFilter {
}

/**
-* {@see #advance()}
* @return the current value.
+* @see #advance()
*/
public int getValue() {
return value;
@@ -30,7 +30,6 @@ import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.EmptyTokenStream;
import org.apache.lucene.analysis.payloads.PayloadHelper;
-import org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix.Column;
import org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix.Column.Row;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
@@ -19,7 +19,6 @@ package org.apache.lucene.analysis.sinks;

import java.text.DateFormat;
import java.text.ParseException;
-import java.text.SimpleDateFormat;
import java.util.Date;

import org.apache.lucene.analysis.TeeSinkTokenFilter.SinkFilter;
@@ -42,7 +41,7 @@ public class DateRecognizerSinkFilter extends SinkFilter {
* Uses {@link java.text.SimpleDateFormat#getDateInstance()} as the {@link java.text.DateFormat} object.
*/
public DateRecognizerSinkFilter() {
-this(SimpleDateFormat.getDateInstance());
+this(DateFormat.getDateInstance());
}

public DateRecognizerSinkFilter(DateFormat dateFormat) {
@@ -30,7 +30,6 @@ import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WordlistLoader;
-import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
@@ -106,13 +105,16 @@ public final class SwedishAnalyzer extends StopwordAnalyzerBase {
}

/**
-* Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
-* {@link Reader}.
+* Creates a
+* {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
+* which tokenizes all the text in the provided {@link Reader}.
*
-* @return A {@link TokenStreamComponents} built from an {@link StandardTokenizer}
-* filtered with {@link StandardFilter}, {@link LowerCaseFilter},
-* {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
-* exclusion set is provided and {@link SnowballFilter}.
+* @return A
+* {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
+* built from an {@link StandardTokenizer} filtered with
+* {@link StandardFilter}, {@link LowerCaseFilter}, {@link StopFilter}
+* , {@link KeywordMarkerTokenFilter} if a stem exclusion set is
+* provided and {@link SnowballFilter}.
*/
@Override
protected TokenStreamComponents createComponents(String fieldName,
@@ -19,7 +19,6 @@ package org.apache.lucene.analysis.th;
import java.io.Reader;

import org.apache.lucene.analysis.ReusableAnalyzerBase;
-import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
@@ -45,12 +44,14 @@ public final class ThaiAnalyzer extends ReusableAnalyzerBase {
}

/**
-* Creates {@link TokenStreamComponents} used to tokenize all the text in the
-* provided {@link Reader}.
+* Creates
+* {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
+* used to tokenize all the text in the provided {@link Reader}.
*
-* @return {@link TokenStreamComponents} built from a
-* {@link StandardTokenizer} filtered with {@link StandardFilter},
-* {@link ThaiWordFilter}, and {@link StopFilter}
+* @return {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
+* built from a {@link StandardTokenizer} filtered with
+* {@link StandardFilter}, {@link ThaiWordFilter}, and
+* {@link StopFilter}
*/
@Override
protected TokenStreamComponents createComponents(String fieldName,
@@ -28,7 +28,6 @@ import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.StopwordAnalyzerBase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents; // javadoc @link
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
@@ -109,11 +108,14 @@ public final class TurkishAnalyzer extends StopwordAnalyzerBase {
}

/**
-* Creates a {@link TokenStreamComponents} which tokenizes all the text in the provided
-* {@link Reader}.
+* Creates a
+* {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
+* which tokenizes all the text in the provided {@link Reader}.
*
-* @return A {@link TokenStreamComponents} built from an {@link StandardTokenizer}
-* filtered with {@link StandardFilter}, {@link TurkishLowerCaseFilter},
+* @return A
+* {@link org.apache.lucene.analysis.ReusableAnalyzerBase.TokenStreamComponents}
+* built from an {@link StandardTokenizer} filtered with
+* {@link StandardFilter}, {@link TurkishLowerCaseFilter},
* {@link StopFilter}, {@link KeywordMarkerTokenFilter} if a stem
* exclusion set is provided and {@link SnowballFilter}.
*/
@@ -24,7 +24,6 @@ import java.util.Set;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharArraySet;
-import org.apache.lucene.util.Version;

/**
* Test the Arabic Analyzer
@@ -35,14 +34,14 @@ public class TestArabicAnalyzer extends BaseTokenStreamTestCase {
/** This test fails with NPE when the
* stopwords file is missing in classpath */
public void testResourcesAvailable() {
-new ArabicAnalyzer(Version.LUCENE_CURRENT);
+new ArabicAnalyzer(TEST_VERSION_CURRENT);
}

/**
* Some simple tests showing some features of the analyzer, how some regular forms will conflate
*/
public void testBasicFeatures() throws Exception {
-ArabicAnalyzer a = new ArabicAnalyzer(Version.LUCENE_CURRENT);
+ArabicAnalyzer a = new ArabicAnalyzer(TEST_VERSION_CURRENT);
assertAnalyzesTo(a, "كبير", new String[] { "كبير" });
assertAnalyzesTo(a, "كبيرة", new String[] { "كبير" }); // feminine marker

@@ -63,7 +62,7 @@ public class TestArabicAnalyzer extends BaseTokenStreamTestCase {
* Simple tests to show things are getting reset correctly, etc.
*/
public void testReusableTokenStream() throws Exception {
-ArabicAnalyzer a = new ArabicAnalyzer(Version.LUCENE_CURRENT);
+ArabicAnalyzer a = new ArabicAnalyzer(TEST_VERSION_CURRENT);
assertAnalyzesToReuse(a, "كبير", new String[] { "كبير" });
assertAnalyzesToReuse(a, "كبيرة", new String[] { "كبير" }); // feminine marker
}
@@ -72,7 +71,7 @@ public class TestArabicAnalyzer extends BaseTokenStreamTestCase {
* Non-arabic text gets treated in a similar way as SimpleAnalyzer.
*/
public void testEnglishInput() throws Exception {
-assertAnalyzesTo(new ArabicAnalyzer(Version.LUCENE_CURRENT), "English text.", new String[] {
+assertAnalyzesTo(new ArabicAnalyzer(TEST_VERSION_CURRENT), "English text.", new String[] {
"english", "text" });
}

@@ -82,7 +81,7 @@ public class TestArabicAnalyzer extends BaseTokenStreamTestCase {
public void testCustomStopwords() throws Exception {
Set<String> set = new HashSet<String>();
Collections.addAll(set, "the", "and", "a");
-ArabicAnalyzer a = new ArabicAnalyzer(Version.LUCENE_CURRENT, set);
+ArabicAnalyzer a = new ArabicAnalyzer(TEST_VERSION_CURRENT, set);
assertAnalyzesTo(a, "The quick brown fox.", new String[] { "quick",
"brown", "fox" });
}
@@ -90,12 +89,12 @@ public class TestArabicAnalyzer extends BaseTokenStreamTestCase {
public void testWithStemExclusionSet() throws IOException {
Set<String> set = new HashSet<String>();
set.add("ساهدهات");
-ArabicAnalyzer a = new ArabicAnalyzer(Version.LUCENE_CURRENT, CharArraySet.EMPTY_SET, set);
+ArabicAnalyzer a = new ArabicAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, set);
assertAnalyzesTo(a, "كبيرة the quick ساهدهات", new String[] { "كبير","the", "quick", "ساهدهات" });
assertAnalyzesToReuse(a, "كبيرة the quick ساهدهات", new String[] { "كبير","the", "quick", "ساهدهات" });


-a = new ArabicAnalyzer(Version.LUCENE_CURRENT, CharArraySet.EMPTY_SET, CharArraySet.EMPTY_SET);
+a = new ArabicAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, CharArraySet.EMPTY_SET);
assertAnalyzesTo(a, "كبيرة the quick ساهدهات", new String[] { "كبير","the", "quick", "ساهد" });
assertAnalyzesToReuse(a, "كبيرة the quick ساهدهات", new String[] { "كبير","the", "quick", "ساهد" });
}
@@ -21,11 +21,9 @@ import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.util.Version;

/**
* Test the Arabic Normalization Filter
-*
*/
public class TestArabicNormalizationFilter extends BaseTokenStreamTestCase {

@@ -86,7 +84,7 @@ public class TestArabicNormalizationFilter extends BaseTokenStreamTestCase {
}

private void check(final String input, final String expected) throws IOException {
-ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(Version.LUCENE_CURRENT, new StringReader(input));
+ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
ArabicNormalizationFilter filter = new ArabicNormalizationFilter(tokenStream);
assertTokenStreamContents(filter, new String[]{expected});
}
@@ -23,7 +23,6 @@ import java.io.StringReader;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
-import org.apache.lucene.util.Version;

/**
* Test the Arabic Normalization Filter
@@ -116,16 +115,16 @@ public class TestArabicStemFilter extends BaseTokenStreamTestCase {
}

public void testWithKeywordAttribute() throws IOException {
-CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
set.add("ساهدهات");
-ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(Version.LUCENE_CURRENT, new StringReader("ساهدهات"));
+ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(TEST_VERSION_CURRENT, new StringReader("ساهدهات"));

ArabicStemFilter filter = new ArabicStemFilter(new KeywordMarkerTokenFilter(tokenStream, set));
assertTokenStreamContents(filter, new String[]{"ساهدهات"});
}

private void check(final String input, final String expected) throws IOException {
-ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(Version.LUCENE_CURRENT, new StringReader(input));
+ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(TEST_VERSION_CURRENT, new StringReader(input));
ArabicStemFilter filter = new ArabicStemFilter(tokenStream);
assertTokenStreamContents(filter, new String[]{expected});
}
@@ -34,23 +34,23 @@ public class TestBulgarianAnalyzer extends BaseTokenStreamTestCase {
* This test fails with NPE when the stopwords file is missing in classpath
*/
public void testResourcesAvailable() {
-new BulgarianAnalyzer(Version.LUCENE_CURRENT);
+new BulgarianAnalyzer(TEST_VERSION_CURRENT);
}

public void testStopwords() throws IOException {
-Analyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT);
+Analyzer a = new BulgarianAnalyzer(TEST_VERSION_CURRENT);
assertAnalyzesTo(a, "Как се казваш?", new String[] {"казваш"});
}

public void testCustomStopwords() throws IOException {
-Analyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT, Collections
+Analyzer a = new BulgarianAnalyzer(TEST_VERSION_CURRENT, Collections
.emptySet());
assertAnalyzesTo(a, "Как се казваш?",
new String[] {"как", "се", "казваш"});
}

public void testReusableTokenStream() throws IOException {
-Analyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT);
+Analyzer a = new BulgarianAnalyzer(TEST_VERSION_CURRENT);
assertAnalyzesToReuse(a, "документи", new String[] {"документ"});
assertAnalyzesToReuse(a, "документ", new String[] {"документ"});
}
@@ -59,7 +59,7 @@ public class TestBulgarianAnalyzer extends BaseTokenStreamTestCase {
* Test some examples from the paper
*/
public void testBasicExamples() throws IOException {
-Analyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT);
+Analyzer a = new BulgarianAnalyzer(TEST_VERSION_CURRENT);
assertAnalyzesTo(a, "енергийни кризи", new String[] {"енергийн", "криз"});
assertAnalyzesTo(a, "Атомната енергия", new String[] {"атомн", "енерг"});

@@ -72,7 +72,7 @@ public class TestBulgarianAnalyzer extends BaseTokenStreamTestCase {
public void testWithStemExclusionSet() throws IOException {
CharArraySet set = new CharArraySet(Version.LUCENE_31, 1, true);
set.add("строеве");
-Analyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT, CharArraySet.EMPTY_SET, set);
+Analyzer a = new BulgarianAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, set);
assertAnalyzesTo(a, "строевете строеве", new String[] { "строй", "строеве" });
}
}
@@ -35,7 +35,7 @@ public class TestBulgarianStemmer extends BaseTokenStreamTestCase {
* common (and some rare) plural pattern is listed.
*/
public void testMasculineNouns() throws IOException {
-BulgarianAnalyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT);
+BulgarianAnalyzer a = new BulgarianAnalyzer(TEST_VERSION_CURRENT);

// -и pattern
assertAnalyzesTo(a, "град", new String[] {"град"});
@@ -101,7 +101,7 @@ public class TestBulgarianStemmer extends BaseTokenStreamTestCase {
* Test showing how feminine noun forms conflate
*/
public void testFeminineNouns() throws IOException {
-BulgarianAnalyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT);
+BulgarianAnalyzer a = new BulgarianAnalyzer(TEST_VERSION_CURRENT);

assertAnalyzesTo(a, "вест", new String[] {"вест"});
assertAnalyzesTo(a, "вестта", new String[] {"вест"});
@@ -114,7 +114,7 @@ public class TestBulgarianStemmer extends BaseTokenStreamTestCase {
* plural pattern is listed
*/
public void testNeuterNouns() throws IOException {
-BulgarianAnalyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT);
+BulgarianAnalyzer a = new BulgarianAnalyzer(TEST_VERSION_CURRENT);

// -а pattern
assertAnalyzesTo(a, "дърво", new String[] {"дърв"});
@@ -142,7 +142,7 @@ public class TestBulgarianStemmer extends BaseTokenStreamTestCase {
* Test showing how adjectival forms conflate
*/
public void testAdjectives() throws IOException {
-BulgarianAnalyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT);
+BulgarianAnalyzer a = new BulgarianAnalyzer(TEST_VERSION_CURRENT);
assertAnalyzesTo(a, "красив", new String[] {"красив"});
assertAnalyzesTo(a, "красивия", new String[] {"красив"});
assertAnalyzesTo(a, "красивият", new String[] {"красив"});
@@ -158,7 +158,7 @@ public class TestBulgarianStemmer extends BaseTokenStreamTestCase {
* Test some exceptional rules, implemented as rewrites.
*/
public void testExceptions() throws IOException {
-BulgarianAnalyzer a = new BulgarianAnalyzer(Version.LUCENE_CURRENT);
+BulgarianAnalyzer a = new BulgarianAnalyzer(TEST_VERSION_CURRENT);

// ци -> к
assertAnalyzesTo(a, "собственик", new String[] {"собственик"});
@@ -215,7 +215,7 @@ public class TestBulgarianStemmer extends BaseTokenStreamTestCase {
public void testWithKeywordAttribute() throws IOException {
CharArraySet set = new CharArraySet(Version.LUCENE_31, 1, true);
set.add("строеве");
-WhitespaceTokenizer tokenStream = new WhitespaceTokenizer(Version.LUCENE_CURRENT,
+WhitespaceTokenizer tokenStream = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
new StringReader("строевете строеве"));

BulgarianStemFilter filter = new BulgarianStemFilter(
@@ -25,7 +25,6 @@ import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
import org.apache.lucene.analysis.LowerCaseTokenizer;
-import org.apache.lucene.util.Version;

/**
* Test the Brazilian Stem Filter, which only modifies the term text.
@@ -128,7 +127,7 @@ public class TestBrazilianStemmer extends BaseTokenStreamTestCase {
}

public void testReusableTokenStream() throws Exception {
-Analyzer a = new BrazilianAnalyzer(Version.LUCENE_CURRENT);
+Analyzer a = new BrazilianAnalyzer(TEST_VERSION_CURRENT);
checkReuse(a, "boa", "boa");
checkReuse(a, "boainain", "boainain");
checkReuse(a, "boas", "boas");
@@ -136,35 +135,35 @@ public class TestBrazilianStemmer extends BaseTokenStreamTestCase {
}

public void testStemExclusionTable() throws Exception {
-BrazilianAnalyzer a = new BrazilianAnalyzer(Version.LUCENE_CURRENT);
+BrazilianAnalyzer a = new BrazilianAnalyzer(TEST_VERSION_CURRENT);
a.setStemExclusionTable(new String[] { "quintessência" });
checkReuse(a, "quintessência", "quintessência"); // excluded words will be completely unchanged.
}

public void testStemExclusionTableBWCompat() throws IOException {
-CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
set.add("Brasília");
BrazilianStemFilter filter = new BrazilianStemFilter(
-new LowerCaseTokenizer(Version.LUCENE_CURRENT, new StringReader("Brasília Brasilia")), set);
+new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader("Brasília Brasilia")), set);
assertTokenStreamContents(filter, new String[] { "brasília", "brasil" });
}

public void testWithKeywordAttribute() throws IOException {
-CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
set.add("Brasília");
BrazilianStemFilter filter = new BrazilianStemFilter(
-new KeywordMarkerTokenFilter(new LowerCaseTokenizer(Version.LUCENE_CURRENT, new StringReader(
+new KeywordMarkerTokenFilter(new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(
"Brasília Brasilia")), set));
assertTokenStreamContents(filter, new String[] { "brasília", "brasil" });
}

public void testWithKeywordAttributeAndExclusionTable() throws IOException {
-CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
set.add("Brasília");
-CharArraySet set1 = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+CharArraySet set1 = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
set1.add("Brasilia");
BrazilianStemFilter filter = new BrazilianStemFilter(
-new KeywordMarkerTokenFilter(new LowerCaseTokenizer(Version.LUCENE_CURRENT, new StringReader(
+new KeywordMarkerTokenFilter(new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(
"Brasília Brasilia")), set), set1);
assertTokenStreamContents(filter, new String[] { "brasília", "brasilia" });
}
@@ -174,14 +173,14 @@ public class TestBrazilianStemmer extends BaseTokenStreamTestCase {
* when using reusable token streams.
*/
public void testExclusionTableReuse() throws Exception {
-BrazilianAnalyzer a = new BrazilianAnalyzer(Version.LUCENE_CURRENT);
+BrazilianAnalyzer a = new BrazilianAnalyzer(TEST_VERSION_CURRENT);
checkReuse(a, "quintessência", "quintessente");
a.setStemExclusionTable(new String[] { "quintessência" });
checkReuse(a, "quintessência", "quintessência");
}

private void check(final String input, final String expected) throws Exception {
-checkOneTerm(new BrazilianAnalyzer(Version.LUCENE_CURRENT), input, expected);
+checkOneTerm(new BrazilianAnalyzer(TEST_VERSION_CURRENT), input, expected);
}

private void checkReuse(Analyzer a, String input, String expected) throws Exception {
@@ -21,7 +21,6 @@ import java.io.IOException;

import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.util.Version;

public class TestCJKTokenizer extends BaseTokenStreamTestCase {

@@ -42,7 +41,7 @@ public class TestCJKTokenizer extends BaseTokenStreamTestCase {
}

public void checkCJKToken(final String str, final TestToken[] out_tokens) throws IOException {
-Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_CURRENT);
+Analyzer analyzer = new CJKAnalyzer(TEST_VERSION_CURRENT);
String terms[] = new String[out_tokens.length];
int startOffsets[] = new int[out_tokens.length];
int endOffsets[] = new int[out_tokens.length];
@@ -57,7 +56,7 @@ public class TestCJKTokenizer extends BaseTokenStreamTestCase {
}

public void checkCJKTokenReusable(final Analyzer a, final String str, final TestToken[] out_tokens) throws IOException {
-Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_CURRENT);
+Analyzer analyzer = new CJKAnalyzer(TEST_VERSION_CURRENT);
String terms[] = new String[out_tokens.length];
int startOffsets[] = new int[out_tokens.length];
int endOffsets[] = new int[out_tokens.length];
@@ -213,13 +212,13 @@ public class TestCJKTokenizer extends BaseTokenStreamTestCase {
}

public void testTokenStream() throws Exception {
-Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_CURRENT);
+Analyzer analyzer = new CJKAnalyzer(TEST_VERSION_CURRENT);
assertAnalyzesTo(analyzer, "\u4e00\u4e01\u4e02",
new String[] { "\u4e00\u4e01", "\u4e01\u4e02"});
}

public void testReusableTokenStream() throws Exception {
-Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_CURRENT);
+Analyzer analyzer = new CJKAnalyzer(TEST_VERSION_CURRENT);
String str = "\u3042\u3044\u3046\u3048\u304aabc\u304b\u304d\u304f\u3051\u3053";

TestToken[] out_tokens = {
@@ -28,17 +28,11 @@ import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
-import org.apache.lucene.util.Version;

public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
static final File dataDir = new File(System.getProperty("dataDir", "./bin"));
static final File testFile = new File(dataDir, "org/apache/lucene/analysis/compound/da_UTF8.xml");

-@Override
-protected void setUp() throws Exception {
-super.setUp();
-}
-
public void testHyphenationCompoundWordsDA() throws Exception {
String[] dict = { "læse", "hest" };

@@ -47,8 +41,8 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
.getHyphenationTree(reader);

-HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(Version.LUCENE_CURRENT,
-new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader(
+HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT,
+new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(
"min veninde som er lidt af en læsehest")), hyphenator,
dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
@@ -67,8 +61,8 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
.getHyphenationTree(reader);

// the word basket will not be added due to the longest match option
-HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(Version.LUCENE_CURRENT,
-new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader(
+HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT,
+new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(
"basketballkurv")), hyphenator, dict,
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, 40, true);
@@ -84,8 +78,8 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
"Pelar", "Glas", "Ögon", "Fodral", "Bas", "Fiol", "Makare", "Gesäll",
"Sko", "Vind", "Rute", "Torkare", "Blad" };

-DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(Version.LUCENE_CURRENT,
-new WhitespaceTokenizer(Version.LUCENE_CURRENT,
+DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
+new WhitespaceTokenizer(TEST_VERSION_CURRENT,
new StringReader(
"Bildörr Bilmotor Biltak Slagborr Hammarborr Pelarborr Glasögonfodral Basfiolsfodral Basfiolsfodralmakaregesäll Skomakare Vindrutetorkare Vindrutetorkarblad abba")),
dict);
@@ -113,8 +107,8 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
"Pelar", "Glas", "Ögon", "Fodral", "Bas", "Fiols", "Makare", "Gesäll",
"Sko", "Vind", "Rute", "Torkare", "Blad", "Fiolsfodral" };

-DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(Version.LUCENE_CURRENT,
-new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader("Basfiolsfodralmakaregesäll")),
+DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
+new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("Basfiolsfodralmakaregesäll")),
dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, true);
@@ -129,9 +123,9 @@ public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
String[] dict = { "Rind", "Fleisch", "Draht", "Schere", "Gesetz",
"Aufgabe", "Überwachung" };

-Tokenizer wsTokenizer = new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader(
+Tokenizer wsTokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(
"Rindfleischüberwachungsgesetz"));
-DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(Version.LUCENE_CURRENT,
+DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
wsTokenizer, dict,
CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
@@ -48,7 +48,7 @@ public class TestCzechAnalyzer extends BaseTokenStreamTestCase {
}

public void testStopWord() throws Exception {
-assertAnalyzesTo(new CzechAnalyzer(Version.LUCENE_CURRENT), "Pokud mluvime o volnem",
+assertAnalyzesTo(new CzechAnalyzer(TEST_VERSION_CURRENT), "Pokud mluvime o volnem",
new String[] { "mluvim", "voln" });
}

@@ -63,7 +63,7 @@ public class TestCzechAnalyzer extends BaseTokenStreamTestCase {
}

public void testReusableTokenStream() throws Exception {
-Analyzer analyzer = new CzechAnalyzer(Version.LUCENE_CURRENT);
+Analyzer analyzer = new CzechAnalyzer(TEST_VERSION_CURRENT);
assertAnalyzesToReuse(analyzer, "Pokud mluvime o volnem", new String[] { "mluvim", "voln" });
assertAnalyzesToReuse(analyzer, "Česká Republika", new String[] { "česk", "republik" });
}
@@ -112,9 +112,9 @@ public class TestCzechAnalyzer extends BaseTokenStreamTestCase {
}

public void testWithStemExclusionSet() throws IOException{
-CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
set.add("hole");
-CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_CURRENT, CharArraySet.EMPTY_SET, set);
+CzechAnalyzer cz = new CzechAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, set);
assertAnalyzesTo(cz, "hole desek", new String[] {"hole", "desk"});
}
}
@@ -24,7 +24,6 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.KeywordMarkerTokenFilter;
import org.apache.lucene.analysis.WhitespaceTokenizer;
-import org.apache.lucene.util.Version;

/**
* Test the Czech Stemmer.
@@ -38,7 +37,7 @@ public class TestCzechStemmer extends BaseTokenStreamTestCase {
* Test showing how masculine noun forms conflate
*/
public void testMasculineNouns() throws IOException {
-CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_CURRENT);
+CzechAnalyzer cz = new CzechAnalyzer(TEST_VERSION_CURRENT);

/* animate ending with a hard consonant */
assertAnalyzesTo(cz, "pán", new String[] { "pán" });
@@ -106,7 +105,7 @@ public class TestCzechStemmer extends BaseTokenStreamTestCase {
* Test showing how feminine noun forms conflate
*/
public void testFeminineNouns() throws IOException {
-CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_CURRENT);
+CzechAnalyzer cz = new CzechAnalyzer(TEST_VERSION_CURRENT);

/* ending with hard consonant */
assertAnalyzesTo(cz, "kost", new String[] { "kost" });
@@ -150,7 +149,7 @@ public class TestCzechStemmer extends BaseTokenStreamTestCase {
* Test showing how neuter noun forms conflate
*/
public void testNeuterNouns() throws IOException {
-CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_CURRENT);
+CzechAnalyzer cz = new CzechAnalyzer(TEST_VERSION_CURRENT);

/* ending with o */
assertAnalyzesTo(cz, "město", new String[] { "měst" });
@@ -193,7 +192,7 @@ public class TestCzechStemmer extends BaseTokenStreamTestCase {
* Test showing how adjectival forms conflate
*/
public void testAdjectives() throws IOException {
-CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_CURRENT);
+CzechAnalyzer cz = new CzechAnalyzer(TEST_VERSION_CURRENT);

/* ending with ý/á/é */
assertAnalyzesTo(cz, "mladý", new String[] { "mlad" });
@@ -221,7 +220,7 @@ public class TestCzechStemmer extends BaseTokenStreamTestCase {
* Test some possessive suffixes
*/
public void testPossessive() throws IOException {
-CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_CURRENT);
+CzechAnalyzer cz = new CzechAnalyzer(TEST_VERSION_CURRENT);
assertAnalyzesTo(cz, "Karlův", new String[] { "karl" });
assertAnalyzesTo(cz, "jazykový", new String[] { "jazyk" });
}
@@ -230,7 +229,7 @@ public class TestCzechStemmer extends BaseTokenStreamTestCase {
* Test some exceptional rules, implemented as rewrites.
*/
public void testExceptions() throws IOException {
-CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_CURRENT);
+CzechAnalyzer cz = new CzechAnalyzer(TEST_VERSION_CURRENT);

/* rewrite of št -> sk */
assertAnalyzesTo(cz, "český", new String[] { "česk" });
@@ -270,16 +269,16 @@ public class TestCzechStemmer extends BaseTokenStreamTestCase {
* Test that very short words are not stemmed.
*/
public void testDontStem() throws IOException {
-CzechAnalyzer cz = new CzechAnalyzer(Version.LUCENE_CURRENT);
+CzechAnalyzer cz = new CzechAnalyzer(TEST_VERSION_CURRENT);
assertAnalyzesTo(cz, "e", new String[] { "e" });
assertAnalyzesTo(cz, "zi", new String[] { "zi" });
}

public void testWithKeywordAttribute() throws IOException {
-CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
set.add("hole");
CzechStemFilter filter = new CzechStemFilter(new KeywordMarkerTokenFilter(
-new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader("hole desek")), set));
+new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("hole desek")), set));
assertTokenStreamContents(filter, new String[] { "hole", "desk" });
}

@@ -23,18 +23,17 @@ import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.util.Version;

public class TestDanishAnalyzer extends BaseTokenStreamTestCase {
/** This test fails with NPE when the
* stopwords file is missing in classpath */
public void testResourcesAvailable() {
-new DanishAnalyzer(Version.LUCENE_CURRENT);
+new DanishAnalyzer(TEST_VERSION_CURRENT);
}

/** test stopwords and stemming */
public void testBasics() throws IOException {
-Analyzer a = new DanishAnalyzer(Version.LUCENE_CURRENT);
+Analyzer a = new DanishAnalyzer(TEST_VERSION_CURRENT);
// stemming
checkOneTermReuse(a, "undersøg", "undersøg");
checkOneTermReuse(a, "undersøgelse", "undersøg");
@@ -46,7 +45,7 @@ public class TestDanishAnalyzer extends BaseTokenStreamTestCase {
public void testExclude() throws IOException {
Set<String> exclusionSet = new HashSet<String>();
exclusionSet.add("undersøgelse");
-Analyzer a = new DanishAnalyzer(Version.LUCENE_CURRENT,
+Analyzer a = new DanishAnalyzer(TEST_VERSION_CURRENT,
DanishAnalyzer.getDefaultStopSet(), exclusionSet);
checkOneTermReuse(a, "undersøgelse", "undersøgelse");
checkOneTermReuse(a, "undersøg", "undersøg");
@@ -29,38 +29,38 @@ import org.apache.lucene.util.Version;
 
 public class TestGermanAnalyzer extends BaseTokenStreamTestCase {
   public void testReusableTokenStream() throws Exception {
-    Analyzer a = new GermanAnalyzer(Version.LUCENE_CURRENT);
+    Analyzer a = new GermanAnalyzer(TEST_VERSION_CURRENT);
     checkOneTermReuse(a, "Tisch", "tisch");
     checkOneTermReuse(a, "Tische", "tisch");
     checkOneTermReuse(a, "Tischen", "tisch");
   }
 
   public void testExclusionTableBWCompat() throws IOException {
-    GermanStemFilter filter = new GermanStemFilter(new LowerCaseTokenizer(Version.LUCENE_CURRENT,
+    GermanStemFilter filter = new GermanStemFilter(new LowerCaseTokenizer(TEST_VERSION_CURRENT,
         new StringReader("Fischen Trinken")));
-    CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+    CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
     set.add("fischen");
     filter.setExclusionSet(set);
     assertTokenStreamContents(filter, new String[] { "fischen", "trink" });
   }
 
   public void testWithKeywordAttribute() throws IOException {
-    CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+    CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
     set.add("fischen");
     GermanStemFilter filter = new GermanStemFilter(
-        new KeywordMarkerTokenFilter(new LowerCaseTokenizer(Version.LUCENE_CURRENT, new StringReader(
+        new KeywordMarkerTokenFilter(new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(
             "Fischen Trinken")), set));
     assertTokenStreamContents(filter, new String[] { "fischen", "trink" });
   }
 
   public void testWithKeywordAttributeAndExclusionTable() throws IOException {
-    CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+    CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
     set.add("fischen");
-    CharArraySet set1 = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+    CharArraySet set1 = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
     set1.add("trinken");
     set1.add("fischen");
     GermanStemFilter filter = new GermanStemFilter(
-        new KeywordMarkerTokenFilter(new LowerCaseTokenizer(Version.LUCENE_CURRENT, new StringReader(
+        new KeywordMarkerTokenFilter(new LowerCaseTokenizer(TEST_VERSION_CURRENT, new StringReader(
             "Fischen Trinken")), set));
     filter.setExclusionSet(set1);
     assertTokenStreamContents(filter, new String[] { "fischen", "trinken" });
@@ -71,7 +71,7 @@ public class TestGermanAnalyzer extends BaseTokenStreamTestCase {
    * when using reusable token streams.
    */
   public void testExclusionTableReuse() throws Exception {
-    GermanAnalyzer a = new GermanAnalyzer(Version.LUCENE_CURRENT);
+    GermanAnalyzer a = new GermanAnalyzer(TEST_VERSION_CURRENT);
     checkOneTermReuse(a, "tischen", "tisch");
     a.setStemExclusionTable(new String[] { "tischen" });
     checkOneTermReuse(a, "tischen", "tischen");
@@ -81,7 +81,7 @@ public class TestGermanAnalyzer extends BaseTokenStreamTestCase {
    * these only pass with LUCENE_CURRENT, not if you use o.a.l.a.de.GermanStemmer
    */
   public void testGermanSpecials() throws Exception {
-    GermanAnalyzer a = new GermanAnalyzer(Version.LUCENE_CURRENT);
+    GermanAnalyzer a = new GermanAnalyzer(TEST_VERSION_CURRENT);
     // a/o/u + e is equivalent to the umlaut form
     checkOneTermReuse(a, "Schaltflächen", "schaltflach");
     checkOneTermReuse(a, "Schaltflaechen", "schaltflach");
@@ -28,7 +28,6 @@ import org.apache.lucene.analysis.KeywordTokenizer;
 import org.apache.lucene.analysis.LowerCaseFilter;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.util.Version;
 
 /**
  * Test the German stemmer. The stemming algorithm is known to work less
@@ -40,7 +39,7 @@ public class TestGermanStemFilter extends BaseTokenStreamTestCase {
 
   public void testStemming() throws Exception {
     Tokenizer tokenizer = new KeywordTokenizer(new StringReader(""));
-    TokenFilter filter = new GermanStemFilter(new LowerCaseFilter(Version.LUCENE_CURRENT, tokenizer));
+    TokenFilter filter = new GermanStemFilter(new LowerCaseFilter(TEST_VERSION_CURRENT, tokenizer));
     // read test cases from external file:
     File dataDir = new File(System.getProperty("dataDir", "./bin"));
     File testFile = new File(dataDir, "org/apache/lucene/analysis/de/data.txt");
@@ -32,7 +32,7 @@ public class GreekAnalyzerTest extends BaseTokenStreamTestCase {
    * @throws Exception in case an error occurs
    */
   public void testAnalyzer() throws Exception {
-    Analyzer a = new GreekAnalyzer(Version.LUCENE_CURRENT);
+    Analyzer a = new GreekAnalyzer(TEST_VERSION_CURRENT);
     // Verify the correct analysis of capitals and small accented letters
     assertAnalyzesTo(a, "\u039c\u03af\u03b1 \u03b5\u03be\u03b1\u03b9\u03c1\u03b5\u03c4\u03b9\u03ba\u03ac \u03ba\u03b1\u03bb\u03ae \u03ba\u03b1\u03b9 \u03c0\u03bb\u03bf\u03cd\u03c3\u03b9\u03b1 \u03c3\u03b5\u03b9\u03c1\u03ac \u03c7\u03b1\u03c1\u03b1\u03ba\u03c4\u03ae\u03c1\u03c9\u03bd \u03c4\u03b7\u03c2 \u0395\u03bb\u03bb\u03b7\u03bd\u03b9\u03ba\u03ae\u03c2 \u03b3\u03bb\u03ce\u03c3\u03c3\u03b1\u03c2",
         new String[] { "\u03bc\u03b9\u03b1", "\u03b5\u03be\u03b1\u03b9\u03c1\u03b5\u03c4\u03b9\u03ba\u03b1", "\u03ba\u03b1\u03bb\u03b7", "\u03c0\u03bb\u03bf\u03c5\u03c3\u03b9\u03b1", "\u03c3\u03b5\u03b9\u03c1\u03b1", "\u03c7\u03b1\u03c1\u03b1\u03ba\u03c4\u03b7\u03c1\u03c9\u03bd",
@@ -48,7 +48,7 @@ public class GreekAnalyzerTest extends BaseTokenStreamTestCase {
   }
 
   public void testReusableTokenStream() throws Exception {
-    Analyzer a = new GreekAnalyzer(Version.LUCENE_CURRENT);
+    Analyzer a = new GreekAnalyzer(TEST_VERSION_CURRENT);
     // Verify the correct analysis of capitals and small accented letters
     assertAnalyzesToReuse(a, "\u039c\u03af\u03b1 \u03b5\u03be\u03b1\u03b9\u03c1\u03b5\u03c4\u03b9\u03ba\u03ac \u03ba\u03b1\u03bb\u03ae \u03ba\u03b1\u03b9 \u03c0\u03bb\u03bf\u03cd\u03c3\u03b9\u03b1 \u03c3\u03b5\u03b9\u03c1\u03ac \u03c7\u03b1\u03c1\u03b1\u03ba\u03c4\u03ae\u03c1\u03c9\u03bd \u03c4\u03b7\u03c2 \u0395\u03bb\u03bb\u03b7\u03bd\u03b9\u03ba\u03ae\u03c2 \u03b3\u03bb\u03ce\u03c3\u03c3\u03b1\u03c2",
         new String[] { "\u03bc\u03b9\u03b1", "\u03b5\u03be\u03b1\u03b9\u03c1\u03b5\u03c4\u03b9\u03ba\u03b1", "\u03ba\u03b1\u03bb\u03b7", "\u03c0\u03bb\u03bf\u03c5\u03c3\u03b9\u03b1", "\u03c3\u03b5\u03b9\u03c1\u03b1", "\u03c7\u03b1\u03c1\u03b1\u03ba\u03c4\u03b7\u03c1\u03c9\u03bd",
@@ -23,18 +23,17 @@ import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.util.Version;
 
 public class TestEnglishAnalyzer extends BaseTokenStreamTestCase {
   /** This test fails with NPE when the
    * stopwords file is missing in classpath */
   public void testResourcesAvailable() {
-    new EnglishAnalyzer(Version.LUCENE_CURRENT);
+    new EnglishAnalyzer(TEST_VERSION_CURRENT);
   }
 
   /** test stopwords and stemming */
   public void testBasics() throws IOException {
-    Analyzer a = new EnglishAnalyzer(Version.LUCENE_CURRENT);
+    Analyzer a = new EnglishAnalyzer(TEST_VERSION_CURRENT);
     // stemming
     checkOneTermReuse(a, "books", "book");
     checkOneTermReuse(a, "book", "book");
@@ -46,7 +45,7 @@ public class TestEnglishAnalyzer extends BaseTokenStreamTestCase {
   public void testExclude() throws IOException {
     Set<String> exclusionSet = new HashSet<String>();
     exclusionSet.add("books");
-    Analyzer a = new EnglishAnalyzer(Version.LUCENE_CURRENT,
+    Analyzer a = new EnglishAnalyzer(TEST_VERSION_CURRENT,
         EnglishAnalyzer.getDefaultStopSet(), exclusionSet);
     checkOneTermReuse(a, "books", "books");
     checkOneTermReuse(a, "book", "book");
@@ -23,18 +23,17 @@ import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.util.Version;
 
 public class TestSpanishAnalyzer extends BaseTokenStreamTestCase {
   /** This test fails with NPE when the
    * stopwords file is missing in classpath */
   public void testResourcesAvailable() {
-    new SpanishAnalyzer(Version.LUCENE_CURRENT);
+    new SpanishAnalyzer(TEST_VERSION_CURRENT);
   }
 
   /** test stopwords and stemming */
   public void testBasics() throws IOException {
-    Analyzer a = new SpanishAnalyzer(Version.LUCENE_CURRENT);
+    Analyzer a = new SpanishAnalyzer(TEST_VERSION_CURRENT);
     // stemming
     checkOneTermReuse(a, "chicana", "chican");
     checkOneTermReuse(a, "chicano", "chican");
@@ -46,7 +45,7 @@ public class TestSpanishAnalyzer extends BaseTokenStreamTestCase {
   public void testExclude() throws IOException {
     Set<String> exclusionSet = new HashSet<String>();
     exclusionSet.add("chicano");
-    Analyzer a = new SpanishAnalyzer(Version.LUCENE_CURRENT,
+    Analyzer a = new SpanishAnalyzer(TEST_VERSION_CURRENT,
         SpanishAnalyzer.getDefaultStopSet(), exclusionSet);
     checkOneTermReuse(a, "chicana", "chican");
     checkOneTermReuse(a, "chicano", "chicano");
@@ -19,7 +19,6 @@ package org.apache.lucene.analysis.fa;
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.util.Version;
 
 /**
  * Test the Persian Analyzer
@@ -31,7 +30,7 @@ public class TestPersianAnalyzer extends BaseTokenStreamTestCase {
    * This test fails with NPE when the stopwords file is missing in classpath
    */
   public void testResourcesAvailable() {
-    new PersianAnalyzer(Version.LUCENE_CURRENT);
+    new PersianAnalyzer(TEST_VERSION_CURRENT);
   }
 
   /**
@@ -42,7 +41,7 @@ public class TestPersianAnalyzer extends BaseTokenStreamTestCase {
    * These verb forms are from http://en.wikipedia.org/wiki/Persian_grammar
    */
   public void testBehaviorVerbs() throws Exception {
-    Analyzer a = new PersianAnalyzer(Version.LUCENE_CURRENT);
+    Analyzer a = new PersianAnalyzer(TEST_VERSION_CURRENT);
     // active present indicative
     assertAnalyzesTo(a, "میخورد", new String[] { "خورد" });
     // active preterite indicative
@@ -118,7 +117,7 @@ public class TestPersianAnalyzer extends BaseTokenStreamTestCase {
    * These verb forms are from http://en.wikipedia.org/wiki/Persian_grammar
    */
   public void testBehaviorVerbsDefective() throws Exception {
-    Analyzer a = new PersianAnalyzer(Version.LUCENE_CURRENT);
+    Analyzer a = new PersianAnalyzer(TEST_VERSION_CURRENT);
     // active present indicative
     assertAnalyzesTo(a, "مي خورد", new String[] { "خورد" });
     // active preterite indicative
@@ -189,7 +188,7 @@ public class TestPersianAnalyzer extends BaseTokenStreamTestCase {
    * nouns, removing the plural -ha.
    */
   public void testBehaviorNouns() throws Exception {
-    Analyzer a = new PersianAnalyzer(Version.LUCENE_CURRENT);
+    Analyzer a = new PersianAnalyzer(TEST_VERSION_CURRENT);
     assertAnalyzesTo(a, "برگ ها", new String[] { "برگ" });
     assertAnalyzesTo(a, "برگها", new String[] { "برگ" });
   }
@@ -199,7 +198,7 @@ public class TestPersianAnalyzer extends BaseTokenStreamTestCase {
    * (lowercased, etc)
    */
   public void testBehaviorNonPersian() throws Exception {
-    Analyzer a = new PersianAnalyzer(Version.LUCENE_CURRENT);
+    Analyzer a = new PersianAnalyzer(TEST_VERSION_CURRENT);
     assertAnalyzesTo(a, "English test.", new String[] { "english", "test" });
   }
 
@@ -207,7 +206,7 @@ public class TestPersianAnalyzer extends BaseTokenStreamTestCase {
    * Basic test ensuring that reusableTokenStream works correctly.
    */
   public void testReusableTokenStream() throws Exception {
-    Analyzer a = new PersianAnalyzer(Version.LUCENE_CURRENT);
+    Analyzer a = new PersianAnalyzer(TEST_VERSION_CURRENT);
     assertAnalyzesToReuse(a, "خورده مي شده بوده باشد", new String[] { "خورده" });
     assertAnalyzesToReuse(a, "برگها", new String[] { "برگ" });
   }
@@ -216,7 +215,7 @@ public class TestPersianAnalyzer extends BaseTokenStreamTestCase {
    * Test that custom stopwords work, and are not case-sensitive.
    */
   public void testCustomStopwords() throws Exception {
-    PersianAnalyzer a = new PersianAnalyzer(Version.LUCENE_CURRENT, new String[] { "the", "and", "a" });
+    PersianAnalyzer a = new PersianAnalyzer(TEST_VERSION_CURRENT, new String[] { "the", "and", "a" });
     assertAnalyzesTo(a, "The quick brown fox.", new String[] { "quick",
         "brown", "fox" });
   }
@@ -22,7 +22,6 @@ import java.io.StringReader;
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.ar.ArabicLetterTokenizer;
-import org.apache.lucene.util.Version;
 
 /**
  * Test the Persian Normalization Filter
@@ -55,7 +54,7 @@ public class TestPersianNormalizationFilter extends BaseTokenStreamTestCase {
   }
 
   private void check(final String input, final String expected) throws IOException {
-    ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(Version.LUCENE_CURRENT,
+    ArabicLetterTokenizer tokenStream = new ArabicLetterTokenizer(TEST_VERSION_CURRENT,
         new StringReader(input));
     PersianNormalizationFilter filter = new PersianNormalizationFilter(
         tokenStream);
@@ -23,18 +23,17 @@ import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.util.Version;
 
 public class TestFinnishAnalyzer extends BaseTokenStreamTestCase {
   /** This test fails with NPE when the
    * stopwords file is missing in classpath */
   public void testResourcesAvailable() {
-    new FinnishAnalyzer(Version.LUCENE_CURRENT);
+    new FinnishAnalyzer(TEST_VERSION_CURRENT);
   }
 
   /** test stopwords and stemming */
   public void testBasics() throws IOException {
-    Analyzer a = new FinnishAnalyzer(Version.LUCENE_CURRENT);
+    Analyzer a = new FinnishAnalyzer(TEST_VERSION_CURRENT);
     // stemming
     checkOneTermReuse(a, "edeltäjiinsä", "edeltäj");
     checkOneTermReuse(a, "edeltäjistään", "edeltäj");
@@ -46,7 +45,7 @@ public class TestFinnishAnalyzer extends BaseTokenStreamTestCase {
   public void testExclude() throws IOException {
     Set<String> exclusionSet = new HashSet<String>();
     exclusionSet.add("edeltäjistään");
-    Analyzer a = new FinnishAnalyzer(Version.LUCENE_CURRENT,
+    Analyzer a = new FinnishAnalyzer(TEST_VERSION_CURRENT,
         FinnishAnalyzer.getDefaultStopSet(), exclusionSet);
     checkOneTermReuse(a, "edeltäjiinsä", "edeltäj");
     checkOneTermReuse(a, "edeltäjistään", "edeltäjistään");
@@ -29,7 +29,6 @@ import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.standard.StandardTokenizer;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
-import org.apache.lucene.util.Version;
 
 /**
  *
@@ -38,19 +37,19 @@ public class TestElision extends BaseTokenStreamTestCase {
 
   public void testElision() throws Exception {
     String test = "Plop, juste pour voir l'embrouille avec O'brian. M'enfin.";
-    Tokenizer tokenizer = new StandardTokenizer(Version.LUCENE_CURRENT, new StringReader(test));
-    Set articles = new HashSet();
+    Tokenizer tokenizer = new StandardTokenizer(TEST_VERSION_CURRENT, new StringReader(test));
+    Set<String> articles = new HashSet<String>();
     articles.add("l");
     articles.add("M");
-    TokenFilter filter = new ElisionFilter(Version.LUCENE_CURRENT, tokenizer, articles);
-    List tas = filtre(filter);
+    TokenFilter filter = new ElisionFilter(TEST_VERSION_CURRENT, tokenizer, articles);
+    List<String> tas = filter(filter);
     assertEquals("embrouille", tas.get(4));
     assertEquals("O'brian", tas.get(6));
     assertEquals("enfin", tas.get(7));
   }
 
-  private List filtre(TokenFilter filter) throws IOException {
-    List tas = new ArrayList();
+  private List<String> filter(TokenFilter filter) throws IOException {
+    List<String> tas = new ArrayList<String>();
     TermAttribute termAtt = filter.getAttribute(TermAttribute.class);
     while (filter.incrementToken()) {
       tas.add(termAtt.term());
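Besides the version constant, the TestElision hunk above removes raw-type usage (Set, List, ArrayList without type parameters), which is what produces unchecked warnings in Eclipse, and it renames the helper filtre to filter. A minimal illustration of the raw-versus-parameterized difference follows; the class and variable names are generic, not taken from Lucene:

import java.util.ArrayList;
import java.util.List;

public class RawTypeDemo {
  public static void main(String[] args) {
    List raw = new ArrayList();                    // raw type: javac/Eclipse flag add() with an "unchecked" warning
    raw.add("token");
    List<String> typed = new ArrayList<String>();  // parameterized (pre-Java 7 style): no warning
    typed.add("token");
    System.out.println(raw.size() + " " + typed.size());
  }
}

The behavior is identical at runtime; the parameterized form just lets the compiler check element types and keeps the warning count at zero.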
@@ -32,7 +32,7 @@ import org.apache.lucene.util.Version;
 public class TestFrenchAnalyzer extends BaseTokenStreamTestCase {
 
   public void testAnalyzer() throws Exception {
-    FrenchAnalyzer fa = new FrenchAnalyzer(Version.LUCENE_CURRENT);
+    FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT);
 
     assertAnalyzesTo(fa, "", new String[] {
     });
@@ -204,7 +204,7 @@ public class TestFrenchAnalyzer extends BaseTokenStreamTestCase {
   }
 
   public void testReusableTokenStream() throws Exception {
-    FrenchAnalyzer fa = new FrenchAnalyzer(Version.LUCENE_CURRENT);
+    FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT);
     // stopwords
     assertAnalyzesToReuse(
         fa,
@@ -229,27 +229,27 @@ public class TestFrenchAnalyzer extends BaseTokenStreamTestCase {
    * when using reusable token streams.
    */
   public void testExclusionTableReuse() throws Exception {
-    FrenchAnalyzer fa = new FrenchAnalyzer(Version.LUCENE_CURRENT);
+    FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT);
     assertAnalyzesToReuse(fa, "habitable", new String[] { "habit" });
     fa.setStemExclusionTable(new String[] { "habitable" });
     assertAnalyzesToReuse(fa, "habitable", new String[] { "habitable" });
   }
 
   public void testExclusionTableViaCtor() throws Exception {
-    CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+    CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
     set.add("habitable");
-    FrenchAnalyzer fa = new FrenchAnalyzer(Version.LUCENE_CURRENT,
+    FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT,
         CharArraySet.EMPTY_SET, set);
     assertAnalyzesToReuse(fa, "habitable chiste", new String[] { "habitable",
         "chist" });
 
-    fa = new FrenchAnalyzer(Version.LUCENE_CURRENT, CharArraySet.EMPTY_SET, set);
+    fa = new FrenchAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, set);
     assertAnalyzesTo(fa, "habitable chiste", new String[] { "habitable",
         "chist" });
   }
 
   public void testElision() throws Exception {
-    FrenchAnalyzer fa = new FrenchAnalyzer(Version.LUCENE_CURRENT);
+    FrenchAnalyzer fa = new FrenchAnalyzer(TEST_VERSION_CURRENT);
     assertAnalyzesTo(fa, "voir l'embrouille", new String[] { "voir", "embrouill" });
   }
 
@@ -5,7 +5,6 @@ import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.util.Version;
 
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -31,11 +30,11 @@ public class TestHindiAnalyzer extends BaseTokenStreamTestCase {
   /** This test fails with NPE when the
    * stopwords file is missing in classpath */
   public void testResourcesAvailable() {
-    new HindiAnalyzer(Version.LUCENE_CURRENT);
+    new HindiAnalyzer(TEST_VERSION_CURRENT);
   }
 
   public void testBasics() throws Exception {
-    Analyzer a = new HindiAnalyzer(Version.LUCENE_CURRENT);
+    Analyzer a = new HindiAnalyzer(TEST_VERSION_CURRENT);
     // two ways to write 'hindi' itself.
     checkOneTermReuse(a, "हिन्दी", "हिंद");
     checkOneTermReuse(a, "हिंदी", "हिंद");
@@ -44,7 +43,7 @@ public class TestHindiAnalyzer extends BaseTokenStreamTestCase {
   public void testExclusionSet() throws Exception {
     Set<String> exclusionSet = new HashSet<String>();
     exclusionSet.add("हिंदी");
-    Analyzer a = new HindiAnalyzer(Version.LUCENE_CURRENT,
+    Analyzer a = new HindiAnalyzer(TEST_VERSION_CURRENT,
         HindiAnalyzer.getDefaultStopSet(), exclusionSet);
     checkOneTermReuse(a, "हिंदी", "हिंदी");
   }
@@ -24,7 +24,6 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
-import org.apache.lucene.util.Version;
 
 /**
  * Test HindiNormalizer
@@ -60,7 +59,7 @@ public class TestHindiNormalizer extends BaseTokenStreamTestCase {
     check("आईऊॠॡऐऔीूॄॣैौ", "अइउऋऌएओिुृॢेो");
   }
   private void check(String input, String output) throws IOException {
-    Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_CURRENT,
+    Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
         new StringReader(input));
     TokenFilter tf = new HindiNormalizationFilter(tokenizer);
     assertTokenStreamContents(tf, new String[] { output });
@@ -24,7 +24,6 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
-import org.apache.lucene.util.Version;
 
 /**
  * Test HindiStemmer
@@ -82,7 +81,7 @@ public class TestHindiStemmer extends BaseTokenStreamTestCase {
   }
 
   private void check(String input, String output) throws IOException {
-    Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_CURRENT,
+    Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
         new StringReader(input));
     TokenFilter tf = new HindiStemFilter(tokenizer);
     assertTokenStreamContents(tf, new String[] { output });
@@ -23,18 +23,17 @@ import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.util.Version;
 
 public class TestHungarianAnalyzer extends BaseTokenStreamTestCase {
   /** This test fails with NPE when the
    * stopwords file is missing in classpath */
   public void testResourcesAvailable() {
-    new HungarianAnalyzer(Version.LUCENE_CURRENT);
+    new HungarianAnalyzer(TEST_VERSION_CURRENT);
   }
 
   /** test stopwords and stemming */
   public void testBasics() throws IOException {
-    Analyzer a = new HungarianAnalyzer(Version.LUCENE_CURRENT);
+    Analyzer a = new HungarianAnalyzer(TEST_VERSION_CURRENT);
     // stemming
     checkOneTermReuse(a, "babakocsi", "babakocs");
     checkOneTermReuse(a, "babakocsijáért", "babakocs");
@@ -46,7 +45,7 @@ public class TestHungarianAnalyzer extends BaseTokenStreamTestCase {
   public void testExclude() throws IOException {
     Set<String> exclusionSet = new HashSet<String>();
     exclusionSet.add("babakocsi");
-    Analyzer a = new HungarianAnalyzer(Version.LUCENE_CURRENT,
+    Analyzer a = new HungarianAnalyzer(TEST_VERSION_CURRENT,
         HungarianAnalyzer.getDefaultStopSet(), exclusionSet);
     checkOneTermReuse(a, "babakocsi", "babakocsi");
     checkOneTermReuse(a, "babakocsijáért", "babakocs");
@@ -24,7 +24,6 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
-import org.apache.lucene.util.Version;
 
 /**
  * Test IndicNormalizer
@@ -45,7 +44,7 @@ public class TestIndicNormalizer extends BaseTokenStreamTestCase {
   }
 
   private void check(String input, String output) throws IOException {
-    Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_CURRENT,
+    Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
         new StringReader(input));
     TokenFilter tf = new IndicNormalizationFilter(tokenizer);
     assertTokenStreamContents(tf, new String[] { output });
@@ -22,7 +22,6 @@ import java.io.StringReader;
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.util.Version;
 
 /**
  * Test IndicTokenizer
@@ -30,7 +29,7 @@ import org.apache.lucene.util.Version;
 public class TestIndicTokenizer extends BaseTokenStreamTestCase {
   /** Test tokenizing Indic vowels, signs, and punctuation */
   public void testBasics() throws IOException {
-    TokenStream ts = new IndicTokenizer(Version.LUCENE_CURRENT,
+    TokenStream ts = new IndicTokenizer(TEST_VERSION_CURRENT,
         new StringReader("मुझे हिंदी का और अभ्यास करना होगा ।"));
     assertTokenStreamContents(ts,
         new String[] { "मुझे", "हिंदी", "का", "और", "अभ्यास", "करना", "होगा" });
@@ -38,7 +37,7 @@ public class TestIndicTokenizer extends BaseTokenStreamTestCase {
 
   /** Test that words with format chars such as ZWJ are kept */
   public void testFormat() throws Exception {
-    TokenStream ts = new IndicTokenizer(Version.LUCENE_CURRENT,
+    TokenStream ts = new IndicTokenizer(TEST_VERSION_CURRENT,
         new StringReader("शार्मा शार्मा"));
     assertTokenStreamContents(ts, new String[] { "शार्मा", "शार्मा" });
   }
@@ -23,18 +23,17 @@ import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.util.Version;
 
 public class TestItalianAnalyzer extends BaseTokenStreamTestCase {
   /** This test fails with NPE when the
    * stopwords file is missing in classpath */
   public void testResourcesAvailable() {
-    new ItalianAnalyzer(Version.LUCENE_CURRENT);
+    new ItalianAnalyzer(TEST_VERSION_CURRENT);
   }
 
   /** test stopwords and stemming */
   public void testBasics() throws IOException {
-    Analyzer a = new ItalianAnalyzer(Version.LUCENE_CURRENT);
+    Analyzer a = new ItalianAnalyzer(TEST_VERSION_CURRENT);
     // stemming
     checkOneTermReuse(a, "abbandonata", "abbandon");
     checkOneTermReuse(a, "abbandonati", "abbandon");
@@ -46,7 +45,7 @@ public class TestItalianAnalyzer extends BaseTokenStreamTestCase {
   public void testExclude() throws IOException {
     Set<String> exclusionSet = new HashSet<String>();
     exclusionSet.add("abbandonata");
-    Analyzer a = new ItalianAnalyzer(Version.LUCENE_CURRENT,
+    Analyzer a = new ItalianAnalyzer(TEST_VERSION_CURRENT,
         ItalianAnalyzer.getDefaultStopSet(), exclusionSet);
     checkOneTermReuse(a, "abbandonata", "abbandonata");
     checkOneTermReuse(a, "abbandonati", "abbandon");
@@ -24,7 +24,6 @@ import java.util.regex.Pattern;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.StopAnalyzer;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.util.Version;
 
 /**
  * Verifies the behavior of PatternAnalyzer.
@@ -37,13 +36,13 @@ public class PatternAnalyzerTest extends BaseTokenStreamTestCase {
    */
   public void testNonWordPattern() throws IOException {
     // Split on non-letter pattern, do not lowercase, no stopwords
-    PatternAnalyzer a = new PatternAnalyzer(Version.LUCENE_CURRENT, PatternAnalyzer.NON_WORD_PATTERN,
+    PatternAnalyzer a = new PatternAnalyzer(TEST_VERSION_CURRENT, PatternAnalyzer.NON_WORD_PATTERN,
         false, null);
     check(a, "The quick brown Fox,the abcd1234 (56.78) dc.", new String[] {
         "The", "quick", "brown", "Fox", "the", "abcd", "dc" });
 
     // split on non-letter pattern, lowercase, english stopwords
-    PatternAnalyzer b = new PatternAnalyzer(Version.LUCENE_CURRENT, PatternAnalyzer.NON_WORD_PATTERN,
+    PatternAnalyzer b = new PatternAnalyzer(TEST_VERSION_CURRENT, PatternAnalyzer.NON_WORD_PATTERN,
         true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
     check(b, "The quick brown Fox,the abcd1234 (56.78) dc.", new String[] {
         "quick", "brown", "fox", "abcd", "dc" });
@@ -55,13 +54,13 @@ public class PatternAnalyzerTest extends BaseTokenStreamTestCase {
    */
   public void testWhitespacePattern() throws IOException {
     // Split on whitespace patterns, do not lowercase, no stopwords
-    PatternAnalyzer a = new PatternAnalyzer(Version.LUCENE_CURRENT, PatternAnalyzer.WHITESPACE_PATTERN,
+    PatternAnalyzer a = new PatternAnalyzer(TEST_VERSION_CURRENT, PatternAnalyzer.WHITESPACE_PATTERN,
         false, null);
     check(a, "The quick brown Fox,the abcd1234 (56.78) dc.", new String[] {
         "The", "quick", "brown", "Fox,the", "abcd1234", "(56.78)", "dc." });
 
     // Split on whitespace patterns, lowercase, english stopwords
-    PatternAnalyzer b = new PatternAnalyzer(Version.LUCENE_CURRENT, PatternAnalyzer.WHITESPACE_PATTERN,
+    PatternAnalyzer b = new PatternAnalyzer(TEST_VERSION_CURRENT, PatternAnalyzer.WHITESPACE_PATTERN,
         true, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
     check(b, "The quick brown Fox,the abcd1234 (56.78) dc.", new String[] {
         "quick", "brown", "fox,the", "abcd1234", "(56.78)", "dc." });
@@ -73,12 +72,12 @@ public class PatternAnalyzerTest extends BaseTokenStreamTestCase {
    */
   public void testCustomPattern() throws IOException {
     // Split on comma, do not lowercase, no stopwords
-    PatternAnalyzer a = new PatternAnalyzer(Version.LUCENE_CURRENT, Pattern.compile(","), false, null);
+    PatternAnalyzer a = new PatternAnalyzer(TEST_VERSION_CURRENT, Pattern.compile(","), false, null);
     check(a, "Here,Are,some,Comma,separated,words,", new String[] { "Here",
         "Are", "some", "Comma", "separated", "words" });
 
     // split on comma, lowercase, english stopwords
-    PatternAnalyzer b = new PatternAnalyzer(Version.LUCENE_CURRENT, Pattern.compile(","), true,
+    PatternAnalyzer b = new PatternAnalyzer(TEST_VERSION_CURRENT, Pattern.compile(","), true,
         StopAnalyzer.ENGLISH_STOP_WORDS_SET);
     check(b, "Here,Are,some,Comma,separated,words,", new String[] { "here",
         "some", "comma", "separated", "words" });
|
@ -103,7 +102,7 @@ public class PatternAnalyzerTest extends BaseTokenStreamTestCase {
|
||||||
document.append(largeWord2);
|
document.append(largeWord2);
|
||||||
|
|
||||||
// Split on whitespace patterns, do not lowercase, no stopwords
|
// Split on whitespace patterns, do not lowercase, no stopwords
|
||||||
PatternAnalyzer a = new PatternAnalyzer(Version.LUCENE_CURRENT, PatternAnalyzer.WHITESPACE_PATTERN,
|
PatternAnalyzer a = new PatternAnalyzer(TEST_VERSION_CURRENT, PatternAnalyzer.WHITESPACE_PATTERN,
|
||||||
false, null);
|
false, null);
|
||||||
check(a, document.toString(), new String[] { new String(largeWord),
|
check(a, document.toString(), new String[] { new String(largeWord),
|
||||||
new String(largeWord2) });
|
new String(largeWord2) });
|
||||||
|
|
|
@@ -20,7 +20,6 @@ package org.apache.lucene.analysis.miscellaneous;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
-import org.apache.lucene.util.Version;
 
 import java.io.IOException;
 import java.io.StringReader;
@@ -31,7 +30,7 @@ public class TestPrefixAndSuffixAwareTokenFilter extends BaseTokenStreamTestCase
 
     PrefixAndSuffixAwareTokenFilter ts = new PrefixAndSuffixAwareTokenFilter(
         new SingleTokenTokenStream(createToken("^", 0, 0)),
-        new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader("hello world")),
+        new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("hello world")),
         new SingleTokenTokenStream(createToken("$", 0, 0)));
 
     assertTokenStreamContents(ts,
@@ -20,7 +20,6 @@ package org.apache.lucene.analysis.miscellaneous;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
-import org.apache.lucene.util.Version;
 
 import java.io.IOException;
 import java.io.StringReader;
@@ -42,7 +41,7 @@ public class TestPrefixAwareTokenFilter extends BaseTokenStreamTestCase {
     // prefix and suffix using 2x prefix
 
     ts = new PrefixAwareTokenFilter(new SingleTokenTokenStream(createToken("^", 0, 0)),
-        new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader("hello world")));
+        new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("hello world")));
     ts = new PrefixAwareTokenFilter(ts, new SingleTokenTokenStream(createToken("$", 0, 0)));
 
     assertTokenStreamContents(ts,
@@ -10,7 +10,6 @@ import org.apache.lucene.analysis.KeywordTokenizer;
 import org.apache.lucene.analysis.PorterStemFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.util.Version;
 
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -38,7 +37,7 @@ public class TestStemmerOverrideFilter extends BaseTokenStreamTestCase {
     dictionary.put("booked", "books");
     Tokenizer tokenizer = new KeywordTokenizer(new StringReader("booked"));
     TokenStream stream = new PorterStemFilter(
-        new StemmerOverrideFilter(Version.LUCENE_CURRENT, tokenizer, dictionary));
+        new StemmerOverrideFilter(TEST_VERSION_CURRENT, tokenizer, dictionary));
     assertTokenStreamContents(stream, new String[] { "books" });
   }
 }
@@ -20,7 +20,6 @@ package org.apache.lucene.analysis.ngram;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.util.Version;
 
 import java.io.StringReader;
 
@@ -31,9 +30,9 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
   private TokenStream input;
 
   @Override
-  public void setUp() throws Exception {
+  protected void setUp() throws Exception {
     super.setUp();
-    input = new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader("abcde"));
+    input = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abcde"));
   }
 
   public void testInvalidInput() throws Exception {
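This hunk, and the matching ones in the other n-gram tests below, also narrows the setUp() override from public back to protected, which is the visibility junit.framework.TestCase declares for that hook. A bare sketch of the resulting shape, using a made-up test class name and fixture rather than the actual Lucene sources:

// Hypothetical test class, shown only to illustrate the override shape; it assumes
// it sits where BaseTokenStreamTestCase is visible without an extra import.
public class SomeTokenFilterTest extends BaseTokenStreamTestCase {
  private String fixture;

  @Override
  protected void setUp() throws Exception {
    super.setUp();     // let the base class prepare its own state first
    fixture = "abcde"; // per-test fixture initialisation goes here
  }
}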
@@ -92,13 +91,13 @@ public class EdgeNGramTokenFilterTest extends BaseTokenStreamTestCase {
   }
 
   public void testSmallTokenInStream() throws Exception {
-    input = new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader("abc de fgh"));
+    input = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abc de fgh"));
     EdgeNGramTokenFilter tokenizer = new EdgeNGramTokenFilter(input, EdgeNGramTokenFilter.Side.FRONT, 3, 3);
     assertTokenStreamContents(tokenizer, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10});
   }
 
   public void testReset() throws Exception {
-    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader("abcde"));
+    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abcde"));
     EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(tokenizer, EdgeNGramTokenFilter.Side.FRONT, 1, 3);
     assertTokenStreamContents(filter, new String[]{"a","ab","abc"}, new int[]{0,0,0}, new int[]{1,2,3});
     tokenizer.reset(new StringReader("abcde"));
@@ -29,7 +29,7 @@ public class EdgeNGramTokenizerTest extends BaseTokenStreamTestCase {
   private StringReader input;
 
   @Override
-  public void setUp() throws Exception {
+  protected void setUp() throws Exception {
     super.setUp();
     input = new StringReader("abcde");
   }
@@ -20,7 +20,6 @@ package org.apache.lucene.analysis.ngram;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.util.Version;
 
 import java.io.StringReader;
 
@@ -31,9 +30,9 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
   private TokenStream input;
 
   @Override
-  public void setUp() throws Exception {
+  protected void setUp() throws Exception {
     super.setUp();
-    input = new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader("abcde"));
+    input = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abcde"));
   }
 
   public void testInvalidInput() throws Exception {
@@ -81,13 +80,13 @@ public class NGramTokenFilterTest extends BaseTokenStreamTestCase {
   }
 
   public void testSmallTokenInStream() throws Exception {
-    input = new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader("abc de fgh"));
+    input = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abc de fgh"));
     NGramTokenFilter filter = new NGramTokenFilter(input, 3, 3);
     assertTokenStreamContents(filter, new String[]{"abc","fgh"}, new int[]{0,7}, new int[]{3,10});
   }
 
   public void testReset() throws Exception {
-    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader("abcde"));
+    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("abcde"));
     NGramTokenFilter filter = new NGramTokenFilter(tokenizer, 1, 1);
     assertTokenStreamContents(filter, new String[]{"a","b","c","d","e"}, new int[]{0,1,2,3,4}, new int[]{1,2,3,4,5});
     tokenizer.reset(new StringReader("abcde"));

@@ -29,7 +29,7 @@ public class NGramTokenizerTest extends BaseTokenStreamTestCase {
   private StringReader input;
 
   @Override
-  public void setUp() throws Exception {
+  protected void setUp() throws Exception {
     super.setUp();
     input = new StringReader("abcde");
   }

@@ -127,14 +127,14 @@ public class TestDutchStemmer extends BaseTokenStreamTestCase {
   }
 
   public void testSnowballCorrectness() throws Exception {
-    Analyzer a = new DutchAnalyzer(Version.LUCENE_CURRENT);
+    Analyzer a = new DutchAnalyzer(TEST_VERSION_CURRENT);
     checkOneTermReuse(a, "opheffen", "opheff");
     checkOneTermReuse(a, "opheffende", "opheff");
     checkOneTermReuse(a, "opheffing", "opheff");
   }
 
   public void testReusableTokenStream() throws Exception {
-    Analyzer a = new DutchAnalyzer(Version.LUCENE_CURRENT);
+    Analyzer a = new DutchAnalyzer(TEST_VERSION_CURRENT);
     checkOneTermReuse(a, "lichaamsziek", "lichaamsziek");
     checkOneTermReuse(a, "lichamelijk", "licham");
     checkOneTermReuse(a, "lichamelijke", "licham");
@@ -146,7 +146,7 @@ public class TestDutchStemmer extends BaseTokenStreamTestCase {
    * when using reusable token streams.
    */
   public void testExclusionTableReuse() throws Exception {
-    DutchAnalyzer a = new DutchAnalyzer(Version.LUCENE_CURRENT);
+    DutchAnalyzer a = new DutchAnalyzer(TEST_VERSION_CURRENT);
     checkOneTermReuse(a, "lichamelijk", "licham");
     a.setStemExclusionTable(new String[] { "lichamelijk" });
     checkOneTermReuse(a, "lichamelijk", "lichamelijk");
@@ -157,10 +157,10 @@ public class TestDutchStemmer extends BaseTokenStreamTestCase {
   public void testExclusionTableViaCtor() throws IOException {
     CharArraySet set = new CharArraySet(Version.LUCENE_30, 1, true);
     set.add("lichamelijk");
-    DutchAnalyzer a = new DutchAnalyzer(Version.LUCENE_CURRENT, CharArraySet.EMPTY_SET, set);
+    DutchAnalyzer a = new DutchAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, set);
     assertAnalyzesToReuse(a, "lichamelijk lichamelijke", new String[] { "lichamelijk", "licham" });
 
-    a = new DutchAnalyzer(Version.LUCENE_CURRENT, CharArraySet.EMPTY_SET, set);
+    a = new DutchAnalyzer(TEST_VERSION_CURRENT, CharArraySet.EMPTY_SET, set);
     assertAnalyzesTo(a, "lichamelijk lichamelijke", new String[] { "lichamelijk", "licham" });
 
   }
@@ -170,7 +170,7 @@ public class TestDutchStemmer extends BaseTokenStreamTestCase {
    * when using reusable token streams.
    */
   public void testStemDictionaryReuse() throws Exception {
-    DutchAnalyzer a = new DutchAnalyzer(Version.LUCENE_CURRENT);
+    DutchAnalyzer a = new DutchAnalyzer(TEST_VERSION_CURRENT);
     checkOneTermReuse(a, "lichamelijk", "licham");
     a.setStemDictionary(customDictFile);
     checkOneTermReuse(a, "lichamelijk", "somethingentirelydifferent");
@@ -196,7 +196,7 @@ public class TestDutchStemmer extends BaseTokenStreamTestCase {
   }
 
   private void check(final String input, final String expected) throws Exception {
-    checkOneTerm(new DutchAnalyzer(Version.LUCENE_CURRENT), input, expected);
+    checkOneTerm(new DutchAnalyzer(TEST_VERSION_CURRENT), input, expected);
   }
 
 }

@@ -23,18 +23,17 @@ import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.util.Version;
 
 public class TestNorwegianAnalyzer extends BaseTokenStreamTestCase {
   /** This test fails with NPE when the
    * stopwords file is missing in classpath */
   public void testResourcesAvailable() {
-    new NorwegianAnalyzer(Version.LUCENE_CURRENT);
+    new NorwegianAnalyzer(TEST_VERSION_CURRENT);
   }
 
   /** test stopwords and stemming */
   public void testBasics() throws IOException {
-    Analyzer a = new NorwegianAnalyzer(Version.LUCENE_CURRENT);
+    Analyzer a = new NorwegianAnalyzer(TEST_VERSION_CURRENT);
     // stemming
     checkOneTermReuse(a, "havnedistriktene", "havnedistrikt");
     checkOneTermReuse(a, "havnedistrikter", "havnedistrikt");
@@ -46,7 +45,7 @@ public class TestNorwegianAnalyzer extends BaseTokenStreamTestCase {
   public void testExclude() throws IOException {
     Set<String> exclusionSet = new HashSet<String>();
     exclusionSet.add("havnedistriktene");
-    Analyzer a = new NorwegianAnalyzer(Version.LUCENE_CURRENT,
+    Analyzer a = new NorwegianAnalyzer(TEST_VERSION_CURRENT,
         NorwegianAnalyzer.getDefaultStopSet(), exclusionSet);
     checkOneTermReuse(a, "havnedistriktene", "havnedistriktene");
     checkOneTermReuse(a, "havnedistrikter", "havnedistrikt");

@@ -22,21 +22,15 @@ import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.index.Payload;
 import org.apache.lucene.util.LuceneTestCase;
-import org.apache.lucene.util.Version;
 
 import java.io.StringReader;
 
-
-/**
- *
- *
- **/
 public class DelimitedPayloadTokenFilterTest extends LuceneTestCase {
 
   public void testPayloads() throws Exception {
     String test = "The quick|JJ red|JJ fox|NN jumped|VB over the lazy|JJ brown|JJ dogs|NN";
     DelimitedPayloadTokenFilter filter = new DelimitedPayloadTokenFilter
-      (new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader(test)),
+      (new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(test)),
        DelimitedPayloadTokenFilter.DEFAULT_DELIMITER, new IdentityEncoder());
     TermAttribute termAtt = filter.getAttribute(TermAttribute.class);
     PayloadAttribute payAtt = filter.getAttribute(PayloadAttribute.class);
@@ -57,7 +51,7 @@ public class DelimitedPayloadTokenFilterTest extends LuceneTestCase {
 
     String test = "The quick|JJ red|JJ fox|NN jumped|VB over the lazy|JJ brown|JJ dogs|NN";
     DelimitedPayloadTokenFilter filter = new DelimitedPayloadTokenFilter
-      (new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader(test)),
+      (new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(test)),
        DelimitedPayloadTokenFilter.DEFAULT_DELIMITER, new IdentityEncoder());
     assertTermEquals("The", filter, null);
     assertTermEquals("quick", filter, "JJ".getBytes("UTF-8"));
@@ -75,7 +69,7 @@ public class DelimitedPayloadTokenFilterTest extends LuceneTestCase {
 
   public void testFloatEncoding() throws Exception {
     String test = "The quick|1.0 red|2.0 fox|3.5 jumped|0.5 over the lazy|5 brown|99.3 dogs|83.7";
-    DelimitedPayloadTokenFilter filter = new DelimitedPayloadTokenFilter(new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader(test)), '|', new FloatEncoder());
+    DelimitedPayloadTokenFilter filter = new DelimitedPayloadTokenFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(test)), '|', new FloatEncoder());
     TermAttribute termAtt = filter.getAttribute(TermAttribute.class);
     PayloadAttribute payAtt = filter.getAttribute(PayloadAttribute.class);
     assertTermEquals("The", filter, termAtt, payAtt, null);
@@ -93,7 +87,7 @@ public class DelimitedPayloadTokenFilterTest extends LuceneTestCase {
 
   public void testIntEncoding() throws Exception {
     String test = "The quick|1 red|2 fox|3 jumped over the lazy|5 brown|99 dogs|83";
-    DelimitedPayloadTokenFilter filter = new DelimitedPayloadTokenFilter(new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader(test)), '|', new IntegerEncoder());
+    DelimitedPayloadTokenFilter filter = new DelimitedPayloadTokenFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(test)), '|', new IntegerEncoder());
     TermAttribute termAtt = filter.getAttribute(TermAttribute.class);
     PayloadAttribute payAtt = filter.getAttribute(PayloadAttribute.class);
     assertTermEquals("The", filter, termAtt, payAtt, null);

@@ -23,7 +23,6 @@ import org.apache.lucene.analysis.WhitespaceTokenizer;
 import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
-import org.apache.lucene.util.Version;
 
 import java.io.IOException;
 import java.io.StringReader;
@@ -38,7 +37,7 @@ public class NumericPayloadTokenFilterTest extends BaseTokenStreamTestCase {
   public void test() throws IOException {
     String test = "The quick red fox jumped over the lazy brown dogs";
 
-    NumericPayloadTokenFilter nptf = new NumericPayloadTokenFilter(new WordTokenFilter(new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader(test))), 3, "D");
+    NumericPayloadTokenFilter nptf = new NumericPayloadTokenFilter(new WordTokenFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(test))), 3, "D");
     boolean seenDogs = false;
     TermAttribute termAtt = nptf.getAttribute(TermAttribute.class);
     TypeAttribute typeAtt = nptf.getAttribute(TypeAttribute.class);

@@ -21,7 +21,6 @@ import org.apache.lucene.analysis.WhitespaceTokenizer;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.index.Payload;
-import org.apache.lucene.util.Version;
 
 import java.io.IOException;
 import java.io.StringReader;
@@ -36,7 +35,7 @@ public class TokenOffsetPayloadTokenFilterTest extends BaseTokenStreamTestCase {
   public void test() throws IOException {
     String test = "The quick red fox jumped over the lazy brown dogs";
 
-    TokenOffsetPayloadTokenFilter nptf = new TokenOffsetPayloadTokenFilter(new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader(test)));
+    TokenOffsetPayloadTokenFilter nptf = new TokenOffsetPayloadTokenFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(test)));
     int count = 0;
     PayloadAttribute payloadAtt = nptf.getAttribute(PayloadAttribute.class);
     OffsetAttribute offsetAtt = nptf.getAttribute(OffsetAttribute.class);

@@ -23,7 +23,6 @@ import org.apache.lucene.analysis.WhitespaceTokenizer;
 import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
-import org.apache.lucene.util.Version;
 
 import java.io.IOException;
 import java.io.StringReader;
@@ -38,7 +37,7 @@ public class TypeAsPayloadTokenFilterTest extends BaseTokenStreamTestCase {
   public void test() throws IOException {
     String test = "The quick red fox jumped over the lazy brown dogs";
 
-    TypeAsPayloadTokenFilter nptf = new TypeAsPayloadTokenFilter(new WordTokenFilter(new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader(test))));
+    TypeAsPayloadTokenFilter nptf = new TypeAsPayloadTokenFilter(new WordTokenFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(test))));
     int count = 0;
     TermAttribute termAtt = nptf.getAttribute(TermAttribute.class);
     TypeAttribute typeAtt = nptf.getAttribute(TypeAttribute.class);
@@ -48,7 +47,6 @@ public class TypeAsPayloadTokenFilterTest extends BaseTokenStreamTestCase {
       assertTrue(typeAtt.type() + " is not null and it should be", typeAtt.type().equals(String.valueOf(Character.toUpperCase(termAtt.termBuffer()[0]))));
       assertTrue("nextToken.getPayload() is null and it shouldn't be", payloadAtt.getPayload() != null);
       String type = new String(payloadAtt.getPayload().getData(), "UTF-8");
-      assertTrue("type is null and it shouldn't be", type != null);
       assertTrue(type + " is not equal to " + typeAtt.type(), type.equals(typeAtt.type()) == true);
       count++;
     }

@@ -23,18 +23,17 @@ import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.util.Version;
 
 public class TestPortugueseAnalyzer extends BaseTokenStreamTestCase {
   /** This test fails with NPE when the
    * stopwords file is missing in classpath */
   public void testResourcesAvailable() {
-    new PortugueseAnalyzer(Version.LUCENE_CURRENT);
+    new PortugueseAnalyzer(TEST_VERSION_CURRENT);
   }
 
   /** test stopwords and stemming */
   public void testBasics() throws IOException {
-    Analyzer a = new PortugueseAnalyzer(Version.LUCENE_CURRENT);
+    Analyzer a = new PortugueseAnalyzer(TEST_VERSION_CURRENT);
     // stemming
     checkOneTermReuse(a, "quilométricas", "quilométr");
     checkOneTermReuse(a, "quilométricos", "quilométr");
@@ -46,7 +45,7 @@ public class TestPortugueseAnalyzer extends BaseTokenStreamTestCase {
   public void testExclude() throws IOException {
     Set<String> exclusionSet = new HashSet<String>();
     exclusionSet.add("quilométricas");
-    Analyzer a = new PortugueseAnalyzer(Version.LUCENE_CURRENT,
+    Analyzer a = new PortugueseAnalyzer(TEST_VERSION_CURRENT,
         PortugueseAnalyzer.getDefaultStopSet(), exclusionSet);
     checkOneTermReuse(a, "quilométricas", "quilométricas");
     checkOneTermReuse(a, "quilométricos", "quilométr");

@@ -37,7 +37,6 @@ import org.apache.lucene.queryParser.QueryParser;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.store.RAMDirectory;
-import org.apache.lucene.util.Version;
 
 public class QueryAutoStopWordAnalyzerTest extends BaseTokenStreamTestCase {
   String variedFieldValues[] = {"the", "quick", "brown", "fox", "jumped", "over", "the", "lazy", "boring", "dog"};
@@ -51,7 +50,7 @@ public class QueryAutoStopWordAnalyzerTest extends BaseTokenStreamTestCase {
   protected void setUp() throws Exception {
     super.setUp();
     dir = new RAMDirectory();
-    appAnalyzer = new WhitespaceAnalyzer(Version.LUCENE_CURRENT);
+    appAnalyzer = new WhitespaceAnalyzer(TEST_VERSION_CURRENT);
     IndexWriter writer = new IndexWriter(dir, appAnalyzer, true, IndexWriter.MaxFieldLength.UNLIMITED);
     int numDocs = 200;
     for (int i = 0; i < numDocs; i++) {
@@ -64,7 +63,7 @@ public class QueryAutoStopWordAnalyzerTest extends BaseTokenStreamTestCase {
     }
     writer.close();
     reader = IndexReader.open(dir, true);
-    protectedAnalyzer = new QueryAutoStopWordAnalyzer(Version.LUCENE_CURRENT, appAnalyzer);
+    protectedAnalyzer = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, appAnalyzer);
   }
 
   @Override
@@ -75,7 +74,7 @@ public class QueryAutoStopWordAnalyzerTest extends BaseTokenStreamTestCase {
 
   //Helper method to query
   private int search(Analyzer a, String queryString) throws IOException, ParseException {
-    QueryParser qp = new QueryParser(Version.LUCENE_CURRENT, "repetitiveField", a);
+    QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, "repetitiveField", a);
     Query q = qp.parse(queryString);
     return new IndexSearcher(reader).search(q, null, 1000).totalHits;
   }
@@ -157,14 +156,14 @@ public class QueryAutoStopWordAnalyzerTest extends BaseTokenStreamTestCase {
     @Override
     public TokenStream tokenStream(String fieldName, Reader reader) {
       if (++invocationCount % 2 == 0)
-        return new WhitespaceTokenizer(Version.LUCENE_CURRENT, reader);
+        return new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
       else
-        return new LetterTokenizer(Version.LUCENE_CURRENT, reader);
+        return new LetterTokenizer(TEST_VERSION_CURRENT, reader);
     }
   }
 
   public void testWrappingNonReusableAnalyzer() throws Exception {
-    QueryAutoStopWordAnalyzer a = new QueryAutoStopWordAnalyzer(Version.LUCENE_CURRENT, new NonreusableAnalyzer());
+    QueryAutoStopWordAnalyzer a = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, new NonreusableAnalyzer());
     a.addStopWords(reader, 10);
     int numHits = search(a, "repetitiveField:boring");
     assertTrue(numHits == 0);
@@ -173,7 +172,7 @@ public class QueryAutoStopWordAnalyzerTest extends BaseTokenStreamTestCase {
   }
 
   public void testTokenStream() throws Exception {
-    QueryAutoStopWordAnalyzer a = new QueryAutoStopWordAnalyzer(Version.LUCENE_CURRENT, new WhitespaceAnalyzer(Version.LUCENE_CURRENT));
+    QueryAutoStopWordAnalyzer a = new QueryAutoStopWordAnalyzer(TEST_VERSION_CURRENT, new WhitespaceAnalyzer(TEST_VERSION_CURRENT));
     a.addStopWords(reader, 10);
     TokenStream ts = a.tokenStream("repetitiveField", new StringReader("this boring"));
     TermAttribute termAtt = ts.getAttribute(TermAttribute.class);

@@ -27,9 +27,9 @@ import org.apache.lucene.util.Version;
 
 public class TestReverseStringFilter extends BaseTokenStreamTestCase {
   public void testFilter() throws Exception {
-    TokenStream stream = new WhitespaceTokenizer(Version.LUCENE_CURRENT,
+    TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
         new StringReader("Do have a nice day")); // 1-4 length string
-    ReverseStringFilter filter = new ReverseStringFilter(Version.LUCENE_CURRENT, stream);
+    ReverseStringFilter filter = new ReverseStringFilter(TEST_VERSION_CURRENT, stream);
     TermAttribute text = filter.getAttribute(TermAttribute.class);
     assertTrue(filter.incrementToken());
     assertEquals("oD", text.term());
@@ -45,9 +45,9 @@ public class TestReverseStringFilter extends BaseTokenStreamTestCase {
   }
 
   public void testFilterWithMark() throws Exception {
-    TokenStream stream = new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader(
+    TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(
         "Do have a nice day")); // 1-4 length string
-    ReverseStringFilter filter = new ReverseStringFilter(Version.LUCENE_CURRENT, stream, '\u0001');
+    ReverseStringFilter filter = new ReverseStringFilter(TEST_VERSION_CURRENT, stream, '\u0001');
     TermAttribute text = filter
         .getAttribute(TermAttribute.class);
     assertTrue(filter.incrementToken());
@@ -64,14 +64,14 @@ public class TestReverseStringFilter extends BaseTokenStreamTestCase {
   }
 
   public void testReverseString() throws Exception {
-    assertEquals( "A", ReverseStringFilter.reverse( "A" ) );
+    assertEquals( "A", ReverseStringFilter.reverse(TEST_VERSION_CURRENT, "A" ) );
-    assertEquals( "BA", ReverseStringFilter.reverse( "AB" ) );
+    assertEquals( "BA", ReverseStringFilter.reverse(TEST_VERSION_CURRENT, "AB" ) );
-    assertEquals( "CBA", ReverseStringFilter.reverse( "ABC" ) );
+    assertEquals( "CBA", ReverseStringFilter.reverse(TEST_VERSION_CURRENT, "ABC" ) );
   }
 
   public void testReverseChar() throws Exception {
     char[] buffer = { 'A', 'B', 'C', 'D', 'E', 'F' };
-    ReverseStringFilter.reverse( buffer, 2, 3 );
+    ReverseStringFilter.reverse(TEST_VERSION_CURRENT, buffer, 2, 3 );
     assertEquals( "ABEDCF", new String( buffer ) );
   }
 
@@ -84,37 +84,37 @@ public class TestReverseStringFilter extends BaseTokenStreamTestCase {
 
   public void testReverseSupplementary() throws Exception {
     // supplementary at end
-    assertEquals("𩬅艱鍟䇹愯瀛", ReverseStringFilter.reverse(Version.LUCENE_CURRENT, "瀛愯䇹鍟艱𩬅"));
+    assertEquals("𩬅艱鍟䇹愯瀛", ReverseStringFilter.reverse(TEST_VERSION_CURRENT, "瀛愯䇹鍟艱𩬅"));
     // supplementary at end - 1
-    assertEquals("a𩬅艱鍟䇹愯瀛", ReverseStringFilter.reverse(Version.LUCENE_CURRENT, "瀛愯䇹鍟艱𩬅a"));
+    assertEquals("a𩬅艱鍟䇹愯瀛", ReverseStringFilter.reverse(TEST_VERSION_CURRENT, "瀛愯䇹鍟艱𩬅a"));
    // supplementary at start
-    assertEquals("fedcba𩬅", ReverseStringFilter.reverse(Version.LUCENE_CURRENT, "𩬅abcdef"));
+    assertEquals("fedcba𩬅", ReverseStringFilter.reverse(TEST_VERSION_CURRENT, "𩬅abcdef"));
     // supplementary at start + 1
-    assertEquals("fedcba𩬅z", ReverseStringFilter.reverse(Version.LUCENE_CURRENT, "z𩬅abcdef"));
+    assertEquals("fedcba𩬅z", ReverseStringFilter.reverse(TEST_VERSION_CURRENT, "z𩬅abcdef"));
     // supplementary medial
-    assertEquals("gfe𩬅dcba", ReverseStringFilter.reverse(Version.LUCENE_CURRENT, "abcd𩬅efg"));
+    assertEquals("gfe𩬅dcba", ReverseStringFilter.reverse(TEST_VERSION_CURRENT, "abcd𩬅efg"));
   }
 
   public void testReverseSupplementaryChar() throws Exception {
     // supplementary at end
     char[] buffer = "abc瀛愯䇹鍟艱𩬅".toCharArray();
-    ReverseStringFilter.reverse(Version.LUCENE_CURRENT, buffer, 3, 7);
+    ReverseStringFilter.reverse(TEST_VERSION_CURRENT, buffer, 3, 7);
     assertEquals("abc𩬅艱鍟䇹愯瀛", new String(buffer));
     // supplementary at end - 1
     buffer = "abc瀛愯䇹鍟艱𩬅d".toCharArray();
-    ReverseStringFilter.reverse(Version.LUCENE_CURRENT, buffer, 3, 8);
+    ReverseStringFilter.reverse(TEST_VERSION_CURRENT, buffer, 3, 8);
     assertEquals("abcd𩬅艱鍟䇹愯瀛", new String(buffer));
     // supplementary at start
     buffer = "abc𩬅瀛愯䇹鍟艱".toCharArray();
-    ReverseStringFilter.reverse(Version.LUCENE_CURRENT, buffer, 3, 7);
+    ReverseStringFilter.reverse(TEST_VERSION_CURRENT, buffer, 3, 7);
     assertEquals("abc艱鍟䇹愯瀛𩬅", new String(buffer));
     // supplementary at start + 1
     buffer = "abcd𩬅瀛愯䇹鍟艱".toCharArray();
-    ReverseStringFilter.reverse(Version.LUCENE_CURRENT, buffer, 3, 8);
+    ReverseStringFilter.reverse(TEST_VERSION_CURRENT, buffer, 3, 8);
     assertEquals("abc艱鍟䇹愯瀛𩬅d", new String(buffer));
     // supplementary medial
     buffer = "abc瀛愯𩬅def".toCharArray();
-    ReverseStringFilter.reverse(Version.LUCENE_CURRENT, buffer, 3, 7);
+    ReverseStringFilter.reverse(TEST_VERSION_CURRENT, buffer, 3, 7);
     assertEquals("abcfed𩬅愯瀛", new String(buffer));
   }
 }

@@ -23,18 +23,17 @@ import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.util.Version;
 
 public class TestRomanianAnalyzer extends BaseTokenStreamTestCase {
   /** This test fails with NPE when the
    * stopwords file is missing in classpath */
   public void testResourcesAvailable() {
-    new RomanianAnalyzer(Version.LUCENE_CURRENT);
+    new RomanianAnalyzer(TEST_VERSION_CURRENT);
   }
 
   /** test stopwords and stemming */
   public void testBasics() throws IOException {
-    Analyzer a = new RomanianAnalyzer(Version.LUCENE_CURRENT);
+    Analyzer a = new RomanianAnalyzer(TEST_VERSION_CURRENT);
     // stemming
     checkOneTermReuse(a, "absenţa", "absenţ");
     checkOneTermReuse(a, "absenţi", "absenţ");
@@ -46,7 +45,7 @@ public class TestRomanianAnalyzer extends BaseTokenStreamTestCase {
   public void testExclude() throws IOException {
     Set<String> exclusionSet = new HashSet<String>();
     exclusionSet.add("absenţa");
-    Analyzer a = new RomanianAnalyzer(Version.LUCENE_CURRENT,
+    Analyzer a = new RomanianAnalyzer(TEST_VERSION_CURRENT,
         RomanianAnalyzer.getDefaultStopSet(), exclusionSet);
     checkOneTermReuse(a, "absenţa", "absenţa");
     checkOneTermReuse(a, "absenţi", "absenţ");

@@ -44,8 +44,7 @@ public class TestRussianAnalyzer extends BaseTokenStreamTestCase
     private File dataDir;
 
     @Override
-    protected void setUp() throws Exception
-    {
+    protected void setUp() throws Exception {
       super.setUp();
       dataDir = new File(System.getProperty("dataDir", "./bin"));
     }
@@ -71,7 +70,7 @@ public class TestRussianAnalyzer extends BaseTokenStreamTestCase
         TokenStream in = ra.tokenStream("all", inWords);
 
         RussianLetterTokenizer sample =
-            new RussianLetterTokenizer(Version.LUCENE_CURRENT,
+            new RussianLetterTokenizer(TEST_VERSION_CURRENT,
                 sampleUnicode);
 
         TermAttribute text = in.getAttribute(TermAttribute.class);
@@ -98,7 +97,7 @@ public class TestRussianAnalyzer extends BaseTokenStreamTestCase
     public void testDigitsInRussianCharset()
     {
         Reader reader = new StringReader("text 1000");
-        RussianAnalyzer ra = new RussianAnalyzer(Version.LUCENE_CURRENT);
+        RussianAnalyzer ra = new RussianAnalyzer(TEST_VERSION_CURRENT);
         TokenStream stream = ra.tokenStream("", reader);
 
         TermAttribute termText = stream.getAttribute(TermAttribute.class);
@@ -126,7 +125,7 @@ public class TestRussianAnalyzer extends BaseTokenStreamTestCase
     }
 
     public void testReusableTokenStream() throws Exception {
-      Analyzer a = new RussianAnalyzer(Version.LUCENE_CURRENT);
+      Analyzer a = new RussianAnalyzer(TEST_VERSION_CURRENT);
       assertAnalyzesToReuse(a, "Вместе с тем о силе электромагнитной энергии имели представление еще",
           new String[] { "вмест", "сил", "электромагнитн", "энерг", "имел", "представлен" });
       assertAnalyzesToReuse(a, "Но знание это хранилось в тайне",
@@ -135,9 +134,9 @@ public class TestRussianAnalyzer extends BaseTokenStreamTestCase
 
 
     public void testWithStemExclusionSet() throws Exception {
-      CharArraySet set = new CharArraySet(Version.LUCENE_CURRENT, 1, true);
+      CharArraySet set = new CharArraySet(TEST_VERSION_CURRENT, 1, true);
      set.add("представление");
-      Analyzer a = new RussianAnalyzer(Version.LUCENE_CURRENT, RussianAnalyzer.getDefaultStopSet() , set);
+      Analyzer a = new RussianAnalyzer(TEST_VERSION_CURRENT, RussianAnalyzer.getDefaultStopSet() , set);
       assertAnalyzesToReuse(a, "Вместе с тем о силе электромагнитной энергии имели представление еще",
           new String[] { "вмест", "сил", "электромагнитн", "энерг", "имел", "представление" });
 

@@ -30,8 +30,8 @@ import java.util.ArrayList;
 @Deprecated
 public class TestRussianStem extends LuceneTestCase
 {
-    private ArrayList words = new ArrayList();
+    private ArrayList<String> words = new ArrayList<String>();
-    private ArrayList stems = new ArrayList();
+    private ArrayList<String> stems = new ArrayList<String>();
 
     public TestRussianStem(String name)
     {
@@ -42,8 +42,7 @@ public class TestRussianStem extends LuceneTestCase
      * @see TestCase#setUp()
      */
     @Override
-    protected void setUp() throws Exception
-    {
+    protected void setUp() throws Exception {
        super.setUp();
        //System.out.println(new java.util.Date());
        String str;
@@ -75,15 +74,6 @@ public class TestRussianStem extends LuceneTestCase
        inStems.close();
     }
 
-    /**
-     * @see TestCase#tearDown()
-     */
-    @Override
-    protected void tearDown() throws Exception
-    {
-        super.tearDown();
-    }
-
     public void testStem()
     {
        for (int i = 0; i < words.size(); i++)
@@ -91,7 +81,7 @@ public class TestRussianStem extends LuceneTestCase
            //if ( (i % 100) == 0 ) System.err.println(i);
            String realStem =
                RussianStemmer.stemWord(
-                   (String) words.get(i));
+                   words.get(i));
            assertEquals("unicode", stems.get(i), realStem);
        }
     }

@@ -42,7 +42,6 @@ import org.apache.lucene.search.ScoreDoc;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.RAMDirectory;
-import org.apache.lucene.util.Version;
 
 /**
  * A test class for ShingleAnalyzerWrapper as regards queries and scoring.
@@ -86,7 +85,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
   protected ScoreDoc[] queryParsingTest(Analyzer analyzer, String qs) throws Exception {
     searcher = setUpSearcher(analyzer);
 
-    QueryParser qp = new QueryParser(Version.LUCENE_CURRENT, "content", analyzer);
+    QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, "content", analyzer);
 
     Query q = qp.parse(qs);
 
@@ -106,7 +105,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
    */
   public void testShingleAnalyzerWrapperQueryParsing() throws Exception {
     ScoreDoc[] hits = queryParsingTest(new ShingleAnalyzerWrapper
-        (new WhitespaceAnalyzer(Version.LUCENE_CURRENT), 2),
+        (new WhitespaceAnalyzer(TEST_VERSION_CURRENT), 2),
         "test sentence");
     int[] ranks = new int[] { 1, 2, 0 };
     compareRanks(hits, ranks);
@@ -117,7 +116,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
    */
   public void testShingleAnalyzerWrapperPhraseQueryParsingFails() throws Exception {
     ScoreDoc[] hits = queryParsingTest(new ShingleAnalyzerWrapper
-        (new WhitespaceAnalyzer(Version.LUCENE_CURRENT), 2),
+        (new WhitespaceAnalyzer(TEST_VERSION_CURRENT), 2),
         "\"this sentence\"");
     int[] ranks = new int[] { 0 };
     compareRanks(hits, ranks);
@@ -128,7 +127,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
    */
   public void testShingleAnalyzerWrapperPhraseQueryParsing() throws Exception {
     ScoreDoc[] hits = queryParsingTest(new ShingleAnalyzerWrapper
-        (new WhitespaceAnalyzer(Version.LUCENE_CURRENT), 2),
+        (new WhitespaceAnalyzer(TEST_VERSION_CURRENT), 2),
         "\"test sentence\"");
     int[] ranks = new int[] { 1 };
     compareRanks(hits, ranks);
@@ -139,7 +138,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
    */
   public void testShingleAnalyzerWrapperRequiredQueryParsing() throws Exception {
     ScoreDoc[] hits = queryParsingTest(new ShingleAnalyzerWrapper
-        (new WhitespaceAnalyzer(Version.LUCENE_CURRENT), 2),
+        (new WhitespaceAnalyzer(TEST_VERSION_CURRENT), 2),
         "+test +sentence");
     int[] ranks = new int[] { 1, 2 };
     compareRanks(hits, ranks);
@@ -149,7 +148,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
    * This shows how to construct a phrase query containing shingles.
    */
   public void testShingleAnalyzerWrapperPhraseQuery() throws Exception {
-    Analyzer analyzer = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(Version.LUCENE_CURRENT), 2);
+    Analyzer analyzer = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(TEST_VERSION_CURRENT), 2);
     searcher = setUpSearcher(analyzer);
 
     PhraseQuery q = new PhraseQuery();
@@ -178,7 +177,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
    * in the right order and adjacent to each other.
    */
   public void testShingleAnalyzerWrapperBooleanQuery() throws Exception {
-    Analyzer analyzer = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(Version.LUCENE_CURRENT), 2);
+    Analyzer analyzer = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(TEST_VERSION_CURRENT), 2);
     searcher = setUpSearcher(analyzer);
 
     BooleanQuery q = new BooleanQuery();
@@ -200,7 +199,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
   }
 
   public void testReusableTokenStream() throws Exception {
-    Analyzer a = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(Version.LUCENE_CURRENT), 2);
+    Analyzer a = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(TEST_VERSION_CURRENT), 2);
     assertAnalyzesToReuse(a, "please divide into shingles",
         new String[] { "please", "please divide", "divide", "divide into", "into", "into shingles", "shingles" },
         new int[] { 0, 0, 7, 7, 14, 14, 19 },
@@ -222,9 +221,9 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
     @Override
     public TokenStream tokenStream(String fieldName, Reader reader) {
       if (++invocationCount % 2 == 0)
-        return new WhitespaceTokenizer(Version.LUCENE_CURRENT, reader);
+        return new WhitespaceTokenizer(TEST_VERSION_CURRENT, reader);
       else
-        return new LetterTokenizer(Version.LUCENE_CURRENT, reader);
+        return new LetterTokenizer(TEST_VERSION_CURRENT, reader);
     }
   }
 
@@ -249,7 +248,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
 
   public void testNonDefaultMinShingleSize() throws Exception {
     ShingleAnalyzerWrapper analyzer
-      = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 3, 4);
+      = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(TEST_VERSION_CURRENT), 3, 4);
     assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles",
         new String[] { "please", "please divide this", "please divide this sentence",
                        "divide", "divide this sentence", "divide this sentence into",
@@ -273,7 +272,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
 
   public void testNonDefaultMinAndSameMaxShingleSize() throws Exception {
     ShingleAnalyzerWrapper analyzer
-      = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 3, 3);
+      = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(TEST_VERSION_CURRENT), 3, 3);
     assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles",
         new String[] { "please", "please divide this",
                        "divide", "divide this sentence",
@@ -297,7 +296,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
 
   public void testNoTokenSeparator() throws Exception {
     ShingleAnalyzerWrapper analyzer
-      = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer());
+      = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(TEST_VERSION_CURRENT));
     analyzer.setTokenSeparator("");
     assertAnalyzesToReuse(analyzer, "please divide into shingles",
         new String[] { "please", "pleasedivide",
@@ -319,7 +318,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
 
   public void testNullTokenSeparator() throws Exception {
     ShingleAnalyzerWrapper analyzer
-      = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer());
+      = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(TEST_VERSION_CURRENT));
     analyzer.setTokenSeparator(null);
     assertAnalyzesToReuse(analyzer, "please divide into shingles",
         new String[] { "please", "pleasedivide",
@@ -340,7 +339,7 @@ public class ShingleAnalyzerWrapperTest extends BaseTokenStreamTestCase {
   }
   public void testAltTokenSeparator() throws Exception {
     ShingleAnalyzerWrapper analyzer
-      = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer());
+      = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(TEST_VERSION_CURRENT));
     analyzer.setTokenSeparator("<SEP>");
     assertAnalyzesToReuse(analyzer, "please divide into shingles",
         new String[] { "please", "please<SEP>divide",

@@ -26,7 +26,6 @@ import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
 import org.apache.lucene.analysis.tokenattributes.*;
-import org.apache.lucene.util.Version;
 
 public class ShingleFilterTest extends BaseTokenStreamTestCase {
 
@@ -836,7 +835,7 @@ public class ShingleFilterTest extends BaseTokenStreamTestCase {
 
 
   public void testReset() throws Exception {
-    Tokenizer wsTokenizer = new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader("please divide this sentence"));
+    Tokenizer wsTokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("please divide this sentence"));
     TokenStream filter = new ShingleFilter(wsTokenizer, 2);
     assertTokenStreamContents(filter,
         new String[]{"please","please divide","divide","divide this","this","this sentence","sentence"},

@@ -31,7 +31,6 @@ import org.apache.lucene.analysis.payloads.PayloadHelper;
 import org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix;
 import org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix.Column;
 import org.apache.lucene.analysis.tokenattributes.*;
-import org.apache.lucene.util.Version;
 
 public class TestShingleMatrixFilter extends BaseTokenStreamTestCase {
 
@@ -41,11 +40,11 @@ public class TestShingleMatrixFilter extends BaseTokenStreamTestCase {
 
   public void testIterator() throws IOException {
 
-    WhitespaceTokenizer wst = new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader("one two three four five"));
+    WhitespaceTokenizer wst = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("one two three four five"));
     ShingleMatrixFilter smf = new ShingleMatrixFilter(wst, 2, 2, '_', false, new ShingleMatrixFilter.OneDimensionalNonWeightedTokenSettingsCodec());
 
     int i;
-    for(i=0; smf.incrementToken(); i++);
+    for(i=0; smf.incrementToken(); i++) {}
     assertEquals(4, i);
 
     // call next once more. this should return false again rather than throwing an exception (LUCENE-1939)
@@ -65,11 +64,11 @@ public class TestShingleMatrixFilter extends BaseTokenStreamTestCase {
     assertFalse(ts.incrementToken());
 
     TokenListStream tls;
-    LinkedList tokens;
+    LinkedList<Token> tokens;
 
     // test a plain old token stream with synonyms translated to rows.
 
-    tokens = new LinkedList();
+    tokens = new LinkedList<Token>();
     tokens.add(createToken("please", 0, 6));
     tokens.add(createToken("divide", 7, 13));
     tokens.add(createToken("this", 14, 18));
@@ -101,11 +100,11 @@ public class TestShingleMatrixFilter extends BaseTokenStreamTestCase {
 
     TokenStream ts;
     TokenStream tls;
-    LinkedList tokens;
+    LinkedList<Token> tokens;
 
     // test a plain old token stream with synonyms tranlated to rows.
 
-    tokens = new LinkedList();
+    tokens = new LinkedList<Token>();
     tokens.add(tokenFactory("hello", 1, 0, 4));
     tokens.add(tokenFactory("greetings", 0, 0, 4));
     tokens.add(tokenFactory("world", 1, 5, 10));
@@ -145,7 +144,7 @@ public class TestShingleMatrixFilter extends BaseTokenStreamTestCase {
 
     ShingleMatrixFilter.defaultSettingsCodec = new ShingleMatrixFilter.SimpleThreeDimensionalTokenSettingsCodec();
 
-    tokens = new LinkedList();
+    tokens = new LinkedList<Token>();
     tokens.add(tokenFactory("hello", 1, 1f, 0, 4, ShingleMatrixFilter.TokenPositioner.newColumn));
     tokens.add(tokenFactory("greetings", 0, 1f, 0, 4, ShingleMatrixFilter.TokenPositioner.newRow));
     tokens.add(tokenFactory("world", 1, 1f, 5, 10, ShingleMatrixFilter.TokenPositioner.newColumn));
@@ -286,7 +285,7 @@ public class TestShingleMatrixFilter extends BaseTokenStreamTestCase {
     //
 
 
-    tokens = new LinkedList();
+    tokens = new LinkedList<Token>();
     tokens.add(tokenFactory("hello", 1, 1f, 0, 4, ShingleMatrixFilter.TokenPositioner.newColumn));
     tokens.add(tokenFactory("greetings", 1, 1f, 0, 4, ShingleMatrixFilter.TokenPositioner.newRow));
     tokens.add(tokenFactory("and", 1, 1f, 0, 4, ShingleMatrixFilter.TokenPositioner.sameRow));
@@ -413,11 +412,6 @@ public class TestShingleMatrixFilter extends BaseTokenStreamTestCase {
 
   }
 
-  private Token tokenFactory(String text, int startOffset, int endOffset) {
-    return tokenFactory(text, 1, 1f, startOffset, endOffset);
-  }
-
-
   private Token tokenFactory(String text, int posIncr, int startOffset, int endOffset) {
     Token token = new Token(startOffset, endOffset);
     token.setTermBuffer(text);
@@ -430,10 +424,6 @@ public class TestShingleMatrixFilter extends BaseTokenStreamTestCase {
     return tokenFactory(text, posIncr, 1f, 0, 0);
   }
 
-  private Token tokenFactory(String text, int posIncr, float weight) {
-    return tokenFactory(text, posIncr, weight, 0, 0);
-  }
-
   private Token tokenFactory(String text, int posIncr, float weight, int startOffset, int endOffset) {
     Token token = new Token(startOffset, endOffset);
     token.setTermBuffer(text);
@@ -460,17 +450,6 @@ public class TestShingleMatrixFilter extends BaseTokenStreamTestCase {
     assertEquals(text, termAtt.term());
   }
 
-  private void assertNext(TokenStream ts, String text, int positionIncrement, float boost) throws IOException {
-    TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
-    PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
-    PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
-
-    assertTrue(ts.incrementToken());
-    assertEquals(text, termAtt.term());
-    assertEquals(positionIncrement, posIncrAtt.getPositionIncrement());
-    assertEquals(boost, payloadAtt.getPayload() == null ? 1f : PayloadHelper.decodeFloat(payloadAtt.getPayload().getData()), 0);
-  }
-
   private void assertNext(TokenStream ts, String text, int positionIncrement, float boost, int startOffset, int endOffset) throws IOException {
     TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
     PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
@@ -505,7 +484,7 @@ public class TestShingleMatrixFilter extends BaseTokenStreamTestCase {
 
   public static class TokenListStream extends TokenStream {
 
-    private Collection tokens;
+    private Collection<Token> tokens;
     TermAttribute termAtt;
     PositionIncrementAttribute posIncrAtt;
     PayloadAttribute payloadAtt;
@@ -513,7 +492,7 @@ public class TestShingleMatrixFilter extends BaseTokenStreamTestCase {
     TypeAttribute typeAtt;
     FlagsAttribute flagsAtt;
 
-    public TokenListStream(Collection tokens) {
+    public TokenListStream(Collection<Token> tokens) {
      this.tokens = tokens;
      termAtt = addAttribute(TermAttribute.class);
      posIncrAtt = addAttribute(PositionIncrementAttribute.class);
@@ -523,7 +502,7 @@ public class TestShingleMatrixFilter extends BaseTokenStreamTestCase {
      flagsAtt = addAttribute(FlagsAttribute.class);
     }
 
-    private Iterator iterator;
+    private Iterator<Token> iterator;
 
    @Override
    public boolean incrementToken() throws IOException {
@@ -533,7 +512,7 @@ public class TestShingleMatrixFilter extends BaseTokenStreamTestCase {
      if (!iterator.hasNext()) {
        return false;
      }
-      Token prototype = (Token) iterator.next();
+      Token prototype = iterator.next();
      clearAttributes();
      termAtt.setTermBuffer(prototype.termBuffer(), 0, prototype.termLength());
      posIncrAtt.setPositionIncrement(prototype.getPositionIncrement());

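Besides the version constant, the TestShingleMatrixFilter hunks above silence two other classes of compiler warnings: raw collection types (LinkedList, Collection, Iterator) are parameterized with <Token>, which also removes the (Token) cast on iterator.next(), and the empty for-loop body is written as {} instead of a bare semicolon. A small self-contained sketch of the same cleanup pattern, using plain strings rather than Lucene's Token class:

    import java.util.Iterator;
    import java.util.LinkedList;

    public class GenericsCleanupSketch {
      public static void main(String[] args) {
        // Parameterized type instead of the raw LinkedList: no unchecked
        // warnings, and no cast needed when reading elements back.
        LinkedList<String> tokens = new LinkedList<String>();
        tokens.add("please");
        tokens.add("divide");
        tokens.add("this");

        Iterator<String> iterator = tokens.iterator();
        while (iterator.hasNext()) {
          String token = iterator.next(); // was: (String) iterator.next()
          System.out.println(token);
        }
      }
    }

The for(i=0; smf.incrementToken(); i++) {} change follows the same spirit: an explicit empty block instead of a trailing semicolon, which compilers and IDEs commonly flag as a probable mistake.
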
@@ -25,7 +25,6 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TeeSinkTokenFilter;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
 import org.apache.lucene.analysis.TeeSinkTokenFilter.SinkTokenStream;
-import org.apache.lucene.util.Version;
 
 public class DateRecognizerSinkTokenizerTest extends BaseTokenStreamTestCase {
 
@@ -37,7 +36,7 @@ public class DateRecognizerSinkTokenizerTest extends BaseTokenStreamTestCase {
   public void test() throws IOException {
     DateRecognizerSinkFilter sinkFilter = new DateRecognizerSinkFilter(new SimpleDateFormat("MM/dd/yyyy", Locale.US));
     String test = "The quick red fox jumped over the lazy brown dogs on 7/11/2006 The dogs finally reacted on 7/12/2006";
-    TeeSinkTokenFilter tee = new TeeSinkTokenFilter(new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader(test)));
+    TeeSinkTokenFilter tee = new TeeSinkTokenFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(test)));
     SinkTokenStream sink = tee.newSinkTokenStream(sinkFilter);
     int count = 0;
 

@@ -23,7 +23,6 @@ import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TeeSinkTokenFilter;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
 import org.apache.lucene.analysis.TeeSinkTokenFilter.SinkTokenStream;
-import org.apache.lucene.util.Version;
 
 public class TokenRangeSinkTokenizerTest extends BaseTokenStreamTestCase {
 
@@ -35,7 +34,7 @@ public class TokenRangeSinkTokenizerTest extends BaseTokenStreamTestCase {
   public void test() throws IOException {
     TokenRangeSinkFilter sinkFilter = new TokenRangeSinkFilter(2, 4);
     String test = "The quick red fox jumped over the lazy brown dogs";
-    TeeSinkTokenFilter tee = new TeeSinkTokenFilter(new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader(test)));
+    TeeSinkTokenFilter tee = new TeeSinkTokenFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(test)));
     SinkTokenStream rangeToks = tee.newSinkTokenStream(sinkFilter);
 
     int count = 0;

@@ -27,11 +27,9 @@ import org.apache.lucene.analysis.WhitespaceTokenizer;
 import org.apache.lucene.analysis.TeeSinkTokenFilter.SinkTokenStream;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
-import org.apache.lucene.util.Version;
 
 public class TokenTypeSinkTokenizerTest extends BaseTokenStreamTestCase {
 
-
   public TokenTypeSinkTokenizerTest(String s) {
     super(s);
   }
@@ -40,7 +38,7 @@ public class TokenTypeSinkTokenizerTest extends BaseTokenStreamTestCase {
     TokenTypeSinkFilter sinkFilter = new TokenTypeSinkFilter("D");
     String test = "The quick red fox jumped over the lazy brown dogs";
 
-    TeeSinkTokenFilter ttf = new TeeSinkTokenFilter(new WordTokenFilter(new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader(test))));
+    TeeSinkTokenFilter ttf = new TeeSinkTokenFilter(new WordTokenFilter(new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(test))));
     SinkTokenStream sink = ttf.newSinkTokenStream(sinkFilter);
 
     boolean seenDogs = false;

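The three sink tests above (date recognizer, token range, token type) all wire their streams the same way: a WhitespaceTokenizer feeds a TeeSinkTokenFilter, and a sink filter decides which tokens get copied into the SinkTokenStream returned by newSinkTokenStream(). Below is a condensed sketch of that wiring, built only from calls visible in the hunks; the package names in the imports and the explicit Version constant are assumptions made so the snippet stands alone:

    import java.io.StringReader;

    import org.apache.lucene.analysis.TeeSinkTokenFilter;
    import org.apache.lucene.analysis.TeeSinkTokenFilter.SinkTokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;
    import org.apache.lucene.analysis.sinks.TokenRangeSinkFilter;
    import org.apache.lucene.util.Version;

    public class TeeSinkSketch {
      public static void main(String[] args) throws Exception {
        String test = "The quick red fox jumped over the lazy brown dogs";
        // Copy only a range of token positions (here 2 to 4) into the sink.
        TokenRangeSinkFilter sinkFilter = new TokenRangeSinkFilter(2, 4);
        TeeSinkTokenFilter tee = new TeeSinkTokenFilter(
            new WhitespaceTokenizer(Version.LUCENE_30, new StringReader(test)));
        SinkTokenStream rangeToks = tee.newSinkTokenStream(sinkFilter);

        // Drain the tee so the sink gets populated, then count the sink tokens.
        while (tee.incrementToken()) {}
        int count = 0;
        while (rangeToks.incrementToken()) {
          count++;
        }
        System.out.println(count + " tokens reached the sink");
      }
    }
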
@@ -33,13 +33,13 @@ import org.apache.lucene.util.Version;
 public class TestSnowball extends BaseTokenStreamTestCase {
 
   public void testEnglish() throws Exception {
-    Analyzer a = new SnowballAnalyzer(Version.LUCENE_CURRENT, "English");
+    Analyzer a = new SnowballAnalyzer(TEST_VERSION_CURRENT, "English");
     assertAnalyzesTo(a, "he abhorred accents",
         new String[]{"he", "abhor", "accent"});
   }
 
   public void testStopwords() throws Exception {
-    Analyzer a = new SnowballAnalyzer(Version.LUCENE_CURRENT, "English",
+    Analyzer a = new SnowballAnalyzer(TEST_VERSION_CURRENT, "English",
         StandardAnalyzer.STOP_WORDS_SET);
     assertAnalyzesTo(a, "the quick brown fox jumped",
         new String[]{"quick", "brown", "fox", "jump"});
@@ -50,7 +50,7 @@ public class TestSnowball extends BaseTokenStreamTestCase {
    * we lowercase I correct for non-Turkish languages in either case.
    */
   public void testEnglishLowerCase() throws Exception {
-    Analyzer a = new SnowballAnalyzer(Version.LUCENE_CURRENT, "English");
+    Analyzer a = new SnowballAnalyzer(TEST_VERSION_CURRENT, "English");
     assertAnalyzesTo(a, "cryogenic", new String[] { "cryogen" });
     assertAnalyzesTo(a, "CRYOGENIC", new String[] { "cryogen" });
 
@@ -63,7 +63,7 @@ public class TestSnowball extends BaseTokenStreamTestCase {
    * Test turkish lowercasing
    */
   public void testTurkish() throws Exception {
-    Analyzer a = new SnowballAnalyzer(Version.LUCENE_CURRENT, "Turkish");
+    Analyzer a = new SnowballAnalyzer(TEST_VERSION_CURRENT, "Turkish");
 
     assertAnalyzesTo(a, "ağacı", new String[] { "ağaç" });
     assertAnalyzesTo(a, "AĞACI", new String[] { "ağaç" });
@@ -84,7 +84,7 @@ public class TestSnowball extends BaseTokenStreamTestCase {
 
 
   public void testReusableTokenStream() throws Exception {
-    Analyzer a = new SnowballAnalyzer(Version.LUCENE_CURRENT, "English");
+    Analyzer a = new SnowballAnalyzer(TEST_VERSION_CURRENT, "English");
     assertAnalyzesToReuse(a, "he abhorred accents",
         new String[]{"he", "abhor", "accent"});
     assertAnalyzesToReuse(a, "she abhorred him",

@@ -23,18 +23,17 @@ import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.util.Version;
 
 public class TestSwedishAnalyzer extends BaseTokenStreamTestCase {
   /** This test fails with NPE when the
    * stopwords file is missing in classpath */
   public void testResourcesAvailable() {
-    new SwedishAnalyzer(Version.LUCENE_CURRENT);
+    new SwedishAnalyzer(TEST_VERSION_CURRENT);
   }
 
   /** test stopwords and stemming */
   public void testBasics() throws IOException {
-    Analyzer a = new SwedishAnalyzer(Version.LUCENE_CURRENT);
+    Analyzer a = new SwedishAnalyzer(TEST_VERSION_CURRENT);
     // stemming
     checkOneTermReuse(a, "jaktkarlarne", "jaktkarl");
     checkOneTermReuse(a, "jaktkarlens", "jaktkarl");
@@ -46,7 +45,7 @@ public class TestSwedishAnalyzer extends BaseTokenStreamTestCase {
   public void testExclude() throws IOException {
     Set<String> exclusionSet = new HashSet<String>();
     exclusionSet.add("jaktkarlarne");
-    Analyzer a = new SwedishAnalyzer(Version.LUCENE_CURRENT,
+    Analyzer a = new SwedishAnalyzer(TEST_VERSION_CURRENT,
         SwedishAnalyzer.getDefaultStopSet(), exclusionSet);
     checkOneTermReuse(a, "jaktkarlarne", "jaktkarlarne");
     checkOneTermReuse(a, "jaktkarlens", "jaktkarl");

@@ -18,7 +18,6 @@ package org.apache.lucene.analysis.th;
  */
 
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.util.Version;
 
 /**
  * Test case for ThaiAnalyzer, modified from TestFrenchAnalyzer
@@ -32,7 +31,7 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase {
    * testcase for offsets
    */
   public void testOffsets() throws Exception {
-    assertAnalyzesTo(new ThaiAnalyzer(Version.LUCENE_CURRENT), "เดอะนิวยอร์กไทมส์",
+    assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT), "เดอะนิวยอร์กไทมส์",
         new String[] { "เด", "อะนิว", "ยอ", "ร์ก", "ไทมส์"},
         new int[] { 0, 2, 7, 9, 12 },
         new int[] { 2, 7, 9, 12, 17});
@@ -50,7 +49,7 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase {
    * Instead, allow the definition of alphanum to include relevant categories like nonspacing marks!
    */
   public void testBuggyTokenType() throws Exception {
-    assertAnalyzesTo(new ThaiAnalyzer(Version.LUCENE_CURRENT), "เดอะนิวยอร์กไทมส์ ๑๒๓",
+    assertAnalyzesTo(new ThaiAnalyzer(TEST_VERSION_CURRENT), "เดอะนิวยอร์กไทมส์ ๑๒๓",
         new String[] { "เด", "อะนิว", "ยอ", "ร์ก", "ไทมส์", "๑๒๓" },
         new String[] { "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>" });
   }
@@ -64,7 +63,7 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase {
    */
 
   public void testAnalyzer() throws Exception {
-    ThaiAnalyzer analyzer = new ThaiAnalyzer(Version.LUCENE_CURRENT);
+    ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT);
 
     assertAnalyzesTo(analyzer, "", new String[] {});
 
@@ -89,7 +88,7 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase {
    * Test that position increments are adjusted correctly for stopwords.
    */
   public void testPositionIncrements() throws Exception {
-    ThaiAnalyzer analyzer = new ThaiAnalyzer(Version.LUCENE_CURRENT);
+    ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT);
 
     assertAnalyzesTo(analyzer, "ประโยคว่า the ประโยคว่า",
         new String[] { "ประโยค", "ว่า", "ประโยค", "ว่า" },
@@ -106,7 +105,7 @@ public class TestThaiAnalyzer extends BaseTokenStreamTestCase {
   }
 
   public void testReusableTokenStream() throws Exception {
-    ThaiAnalyzer analyzer = new ThaiAnalyzer(Version.LUCENE_CURRENT);
+    ThaiAnalyzer analyzer = new ThaiAnalyzer(TEST_VERSION_CURRENT);
     assertAnalyzesToReuse(analyzer, "", new String[] {});
 
     assertAnalyzesToReuse(

@@ -23,18 +23,17 @@ import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.util.Version;
 
 public class TestTurkishAnalyzer extends BaseTokenStreamTestCase {
   /** This test fails with NPE when the
    * stopwords file is missing in classpath */
   public void testResourcesAvailable() {
-    new TurkishAnalyzer(Version.LUCENE_CURRENT);
+    new TurkishAnalyzer(TEST_VERSION_CURRENT);
   }
 
   /** test stopwords and stemming */
   public void testBasics() throws IOException {
-    Analyzer a = new TurkishAnalyzer(Version.LUCENE_CURRENT);
+    Analyzer a = new TurkishAnalyzer(TEST_VERSION_CURRENT);
     // stemming
     checkOneTermReuse(a, "ağacı", "ağaç");
     checkOneTermReuse(a, "ağaç", "ağaç");
@@ -46,7 +45,7 @@ public class TestTurkishAnalyzer extends BaseTokenStreamTestCase {
   public void testExclude() throws IOException {
     Set<String> exclusionSet = new HashSet<String>();
     exclusionSet.add("ağacı");
-    Analyzer a = new TurkishAnalyzer(Version.LUCENE_CURRENT,
+    Analyzer a = new TurkishAnalyzer(TEST_VERSION_CURRENT,
         TurkishAnalyzer.getDefaultStopSet(), exclusionSet);
     checkOneTermReuse(a, "ağacı", "ağacı");
     checkOneTermReuse(a, "ağaç", "ağaç");

@@ -22,7 +22,6 @@ import java.io.StringReader;
 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
-import org.apache.lucene.util.Version;
 
 /**
  * Test the Turkish lowercase filter.
@@ -33,7 +32,7 @@ public class TestTurkishLowerCaseFilter extends BaseTokenStreamTestCase {
    * Test composed forms
    */
   public void testTurkishLowerCaseFilter() throws Exception {
-    TokenStream stream = new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader(
+    TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(
         "\u0130STANBUL \u0130ZM\u0130R ISPARTA"));
     TurkishLowerCaseFilter filter = new TurkishLowerCaseFilter(stream);
     assertTokenStreamContents(filter, new String[] {"istanbul", "izmir",
@@ -44,7 +43,7 @@ public class TestTurkishLowerCaseFilter extends BaseTokenStreamTestCase {
    * Test decomposed forms
    */
   public void testDecomposed() throws Exception {
-    TokenStream stream = new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader(
+    TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(
         "\u0049\u0307STANBUL \u0049\u0307ZM\u0049\u0307R ISPARTA"));
     TurkishLowerCaseFilter filter = new TurkishLowerCaseFilter(stream);
     assertTokenStreamContents(filter, new String[] {"istanbul", "izmir",
@@ -57,7 +56,7 @@ public class TestTurkishLowerCaseFilter extends BaseTokenStreamTestCase {
    * to U+0130 + U+0316, and is lowercased the same way.
    */
   public void testDecomposed2() throws Exception {
-    TokenStream stream = new WhitespaceTokenizer(Version.LUCENE_CURRENT, new StringReader(
+    TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(
         "\u0049\u0316\u0307STANBUL \u0049\u0307ZM\u0049\u0307R I\u0316SPARTA"));
     TurkishLowerCaseFilter filter = new TurkishLowerCaseFilter(stream);
     assertTokenStreamContents(filter, new String[] {"i\u0316stanbul", "izmir",

@@ -20,8 +20,6 @@ package org.apache.lucene.ant;
 import java.io.File;
 import java.io.IOException;
 
-import junit.framework.TestCase;
-
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.StopAnalyzer;
 import org.apache.lucene.queryParser.QueryParser;
@@ -31,13 +29,13 @@ import org.apache.lucene.search.Searcher;
 import org.apache.lucene.store.FSDirectory;
 import org.apache.tools.ant.Project;
 import org.apache.tools.ant.types.FileSet;
-import org.apache.lucene.util.Version;
+import org.apache.lucene.util.LuceneTestCase;
 
 /**
  *  Test cases for index task
  *
  */
-public class IndexTaskTest extends TestCase {
+public class IndexTaskTest extends LuceneTestCase {
     private final static String docHandler =
             "org.apache.lucene.ant.FileExtensionDocumentHandler";
 
@@ -55,7 +53,8 @@ public class IndexTaskTest extends TestCase {
      *@exception IOException Description of Exception
      */
     @Override
-    public void setUp() throws Exception {
+    protected void setUp() throws Exception {
+        super.setUp();
         Project project = new Project();
 
         IndexTask task = new IndexTask();
@@ -71,12 +70,12 @@ public class IndexTaskTest extends TestCase {
 
         dir = FSDirectory.open(indexDir);
         searcher = new IndexSearcher(dir, true);
-        analyzer = new StopAnalyzer(Version.LUCENE_CURRENT);
+        analyzer = new StopAnalyzer(TEST_VERSION_CURRENT);
     }
 
 
     public void testSearch() throws Exception {
-        Query query = new QueryParser(Version.LUCENE_CURRENT, "contents",analyzer).parse("test");
+        Query query = new QueryParser(TEST_VERSION_CURRENT, "contents",analyzer).parse("test");
 
         int numHits = searcher.search(query, null, 1000).totalHits;
 
@@ -88,9 +87,10 @@ public class IndexTaskTest extends TestCase {
      * TODO: remove indexDir?
      */
     @Override
-    public void tearDown() throws IOException {
+    protected void tearDown() throws Exception {
         searcher.close();
         dir.close();
+        super.tearDown();
     }
 }
 

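The IndexTaskTest hunks above convert the test from plain junit.framework.TestCase to LuceneTestCase and adopt its lifecycle discipline: setUp() and tearDown() become protected, setUp() calls super.setUp() before building its fixtures, and tearDown() releases them before calling super.tearDown(). A minimal sketch of that pattern; the class name and the fixture are hypothetical:

    import org.apache.lucene.util.LuceneTestCase;

    public class LifecycleSketchTest extends LuceneTestCase {

      private StringBuilder fixture;

      @Override
      protected void setUp() throws Exception {
        super.setUp();                 // let LuceneTestCase do its own setup first
        fixture = new StringBuilder("ready");
      }

      public void testFixtureIsReady() {
        assertEquals("ready", fixture.toString());
      }

      @Override
      protected void tearDown() throws Exception {
        fixture = null;                // release the test's own resources
        super.tearDown();              // then let LuceneTestCase clean up
      }
    }
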
Some files were not shown because too many files have changed in this diff.