mirror of https://github.com/apache/lucene.git
SOLR-1674: Improve analysis tests and cut over to new TokenStream API
git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@892821 13f79535-47bb-0310-9956-ffa450edef68
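The "new TokenStream API" named in the commit message is Lucene's attribute-based API (Lucene 2.9): instead of pulling Token objects out of a stream with next(), a consumer registers attributes once and advances the stream with incrementToken(). The sketch below is not part of the commit; it is a rough illustration of the two consumption styles using the same classes (Token, TermAttribute, OffsetAttribute) that appear in the diff, and the class and method names are invented.

import java.io.IOException;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

/** Illustrative only: contrasts the old and new ways of consuming a TokenStream. */
class TokenStreamApiSketch {

  // Old style (deprecated): pull Token objects from the stream until it returns null.
  static void consumeOldStyle(TokenStream ts) throws IOException {
    for (Token t = ts.next(); t != null; t = ts.next()) {
      System.out.println(new String(t.termBuffer(), 0, t.termLength()));
    }
    ts.close();
  }

  // New style: register attributes once, then advance with incrementToken();
  // the attribute instances are updated in place for every token.
  static void consumeNewStyle(TokenStream ts) throws IOException {
    TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
    OffsetAttribute offsetAtt = (OffsetAttribute) ts.addAttribute(OffsetAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(termAtt.term()
          + " [" + offsetAtt.startOffset() + "," + offsetAtt.endOffset() + "]");
    }
    ts.end();
    ts.close();
  }
}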
commit b105beef66
parent 5be5c31bb0
@@ -175,6 +175,9 @@ Other Changes
* SOLR-1662: Added Javadocs in BufferedTokenStream and fixed incorrect cloning
  in TestBufferedTokenStream (Robert Muir, Uwe Schindler via shalin)

* SOLR-1674: Improve analysis tests and cut over to new TokenStream API.
  (Robert Muir via Mark Miller)

Build
----------------------
@@ -17,19 +17,21 @@

package org.apache.solr.analysis;
import org.apache.solr.core.SolrConfig;
import org.apache.solr.util.AbstractSolrTestCase;
import org.apache.solr.util.TestHarness;
import junit.framework.TestCase;

/**
 *
 */
abstract public class AnalysisTestCase extends TestCase {
abstract public class AnalysisTestCase extends AbstractSolrTestCase {
  protected SolrConfig solrConfig;
  /** Creates a new instance of AnalysisTestCase */
  public AnalysisTestCase() {
  }

  public String getSolrConfigFile() { return "solrconfig.xml"; }
  public String getSchemaFile() { return "schema.xml"; }

  public void setUp() throws Exception {
    // if you override setUp or tearDown, you better call
@ -18,174 +18,134 @@
|
|||
package org.apache.solr.analysis;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
|
||||
/**
|
||||
* General token testing helper functions
|
||||
*/
|
||||
public abstract class BaseTokenTestCase extends AnalysisTestCase
|
||||
{
|
||||
public static String tsToString(TokenStream in) throws IOException {
|
||||
StringBuilder out = new StringBuilder();
|
||||
Token t = in.next();
|
||||
if (null != t)
|
||||
out.append(new String(t.termBuffer(), 0, t.termLength()));
|
||||
// some helpers to test Analyzers and TokenStreams:
|
||||
// these are taken from Lucene's BaseTokenStreamTestCase
|
||||
|
||||
for (t = in.next(); null != t; t = in.next()) {
|
||||
out.append(" ").append(new String(t.termBuffer(), 0, t.termLength()));
|
||||
}
|
||||
in.close();
|
||||
return out.toString();
|
||||
public static void assertTokenStreamContents(TokenStream ts, String[] output,
|
||||
int startOffsets[], int endOffsets[], String types[], int posIncrements[])
|
||||
throws IOException {
|
||||
assertNotNull(output);
|
||||
assertTrue("has TermAttribute", ts.hasAttribute(TermAttribute.class));
|
||||
TermAttribute termAtt = (TermAttribute) ts
|
||||
.getAttribute(TermAttribute.class);
|
||||
|
||||
OffsetAttribute offsetAtt = null;
|
||||
if (startOffsets != null || endOffsets != null) {
|
||||
assertTrue("has OffsetAttribute", ts.hasAttribute(OffsetAttribute.class));
|
||||
offsetAtt = (OffsetAttribute) ts.getAttribute(OffsetAttribute.class);
|
||||
}
|
||||
|
||||
public List<String> tok2str(Iterable<Token> tokLst) {
|
||||
ArrayList<String> lst = new ArrayList<String>();
|
||||
for ( Token t : tokLst ) {
|
||||
lst.add( new String(t.termBuffer(), 0, t.termLength()));
|
||||
}
|
||||
return lst;
|
||||
TypeAttribute typeAtt = null;
|
||||
if (types != null) {
|
||||
assertTrue("has TypeAttribute", ts.hasAttribute(TypeAttribute.class));
|
||||
typeAtt = (TypeAttribute) ts.getAttribute(TypeAttribute.class);
|
||||
}
|
||||
|
||||
|
||||
public void assertTokEqual(List<Token> a, List<Token> b) {
|
||||
assertTokEq(a,b,false);
|
||||
assertTokEq(b,a,false);
|
||||
PositionIncrementAttribute posIncrAtt = null;
|
||||
if (posIncrements != null) {
|
||||
assertTrue("has PositionIncrementAttribute", ts
|
||||
.hasAttribute(PositionIncrementAttribute.class));
|
||||
posIncrAtt = (PositionIncrementAttribute) ts
|
||||
.getAttribute(PositionIncrementAttribute.class);
|
||||
}
|
||||
|
||||
public void assertTokEqualOff(List<Token> a, List<Token> b) {
|
||||
assertTokEq(a,b,true);
|
||||
assertTokEq(b,a,true);
|
||||
ts.reset();
|
||||
for (int i = 0; i < output.length; i++) {
|
||||
// extra safety to enforce, that the state is not preserved and also
|
||||
// assign bogus values
|
||||
ts.clearAttributes();
|
||||
termAtt.setTermBuffer("bogusTerm");
|
||||
if (offsetAtt != null) offsetAtt.setOffset(14584724, 24683243);
|
||||
if (typeAtt != null) typeAtt.setType("bogusType");
|
||||
if (posIncrAtt != null) posIncrAtt.setPositionIncrement(45987657);
|
||||
|
||||
assertTrue("token " + i + " exists", ts.incrementToken());
|
||||
assertEquals("term " + i, output[i], termAtt.term());
|
||||
if (startOffsets != null) assertEquals("startOffset " + i,
|
||||
startOffsets[i], offsetAtt.startOffset());
|
||||
if (endOffsets != null) assertEquals("endOffset " + i, endOffsets[i],
|
||||
offsetAtt.endOffset());
|
||||
if (types != null) assertEquals("type " + i, types[i], typeAtt.type());
|
||||
if (posIncrements != null) assertEquals("posIncrement " + i,
|
||||
posIncrements[i], posIncrAtt.getPositionIncrement());
|
||||
}
|
||||
assertFalse("end of stream", ts.incrementToken());
|
||||
ts.end();
|
||||
ts.close();
|
||||
}
|
||||
|
||||
private void assertTokEq(List<Token> a, List<Token> b, boolean checkOff) {
|
||||
int pos=0;
|
||||
for (Iterator iter = a.iterator(); iter.hasNext();) {
|
||||
Token tok = (Token)iter.next();
|
||||
pos += tok.getPositionIncrement();
|
||||
if (!tokAt(b, new String(tok.termBuffer(), 0, tok.termLength()), pos
|
||||
, checkOff ? tok.startOffset() : -1
|
||||
, checkOff ? tok.endOffset() : -1
|
||||
))
|
||||
{
|
||||
fail(a + "!=" + b);
|
||||
}
|
||||
}
|
||||
public static void assertTokenStreamContents(TokenStream ts, String[] output)
|
||||
throws IOException {
|
||||
assertTokenStreamContents(ts, output, null, null, null, null);
|
||||
}
|
||||
|
||||
public boolean tokAt(List<Token> lst, String val, int tokPos, int startOff, int endOff) {
|
||||
int pos=0;
|
||||
for (Iterator iter = lst.iterator(); iter.hasNext();) {
|
||||
Token tok = (Token)iter.next();
|
||||
pos += tok.getPositionIncrement();
|
||||
if (pos==tokPos && new String(tok.termBuffer(), 0, tok.termLength()).equals(val)
|
||||
&& (startOff==-1 || tok.startOffset()==startOff)
|
||||
&& (endOff ==-1 || tok.endOffset() ==endOff )
|
||||
)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
public static void assertTokenStreamContents(TokenStream ts, String[] output,
|
||||
String[] types) throws IOException {
|
||||
assertTokenStreamContents(ts, output, null, null, types, null);
|
||||
}
|
||||
|
||||
|
||||
/***
|
||||
* Return a list of tokens according to a test string format:
|
||||
* a b c => returns List<Token> [a,b,c]
|
||||
* a/b => tokens a and b share the same spot (b.positionIncrement=0)
|
||||
* a,3/b/c => a,b,c all share same position (a.positionIncrement=3, b.positionIncrement=0, c.positionIncrement=0)
|
||||
* a,1,10,11 => "a" with positionIncrement=1, startOffset=10, endOffset=11
|
||||
*/
|
||||
public List<Token> tokens(String str) {
|
||||
String[] arr = str.split(" ");
|
||||
List<Token> result = new ArrayList<Token>();
|
||||
for (int i=0; i<arr.length; i++) {
|
||||
String[] toks = arr[i].split("/");
|
||||
String[] params = toks[0].split(",");
|
||||
|
||||
int posInc;
|
||||
int start;
|
||||
int end;
|
||||
|
||||
if (params.length > 1) {
|
||||
posInc = Integer.parseInt(params[1]);
|
||||
} else {
|
||||
posInc = 1;
|
||||
public static void assertTokenStreamContents(TokenStream ts, String[] output,
|
||||
int[] posIncrements) throws IOException {
|
||||
assertTokenStreamContents(ts, output, null, null, null, posIncrements);
|
||||
}
|
||||
|
||||
if (params.length > 2) {
|
||||
start = Integer.parseInt(params[2]);
|
||||
} else {
|
||||
start = 0;
|
||||
public static void assertTokenStreamContents(TokenStream ts, String[] output,
|
||||
int startOffsets[], int endOffsets[]) throws IOException {
|
||||
assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, null);
|
||||
}
|
||||
|
||||
if (params.length > 3) {
|
||||
end = Integer.parseInt(params[3]);
|
||||
} else {
|
||||
end = start + params[0].length();
|
||||
public static void assertTokenStreamContents(TokenStream ts, String[] output,
|
||||
int startOffsets[], int endOffsets[], int[] posIncrements)
|
||||
throws IOException {
|
||||
assertTokenStreamContents(ts, output, startOffsets, endOffsets, null,
|
||||
posIncrements);
|
||||
}
|
||||
|
||||
Token t = new Token(params[0],start,end,"TEST");
|
||||
t.setPositionIncrement(posInc);
|
||||
|
||||
result.add(t);
|
||||
for (int j=1; j<toks.length; j++) {
|
||||
t = new Token(toks[j],0,0,"TEST");
|
||||
t.setPositionIncrement(0);
|
||||
result.add(t);
|
||||
}
|
||||
}
|
||||
return result;
|
||||
public static void assertAnalyzesTo(Analyzer a, String input,
|
||||
String[] output, int startOffsets[], int endOffsets[], String types[],
|
||||
int posIncrements[]) throws IOException {
|
||||
assertTokenStreamContents(a.tokenStream("dummy", new StringReader(input)),
|
||||
output, startOffsets, endOffsets, types, posIncrements);
|
||||
}
|
||||
|
||||
//------------------------------------------------------------------------
|
||||
// These may be useful beyond test cases...
|
||||
//------------------------------------------------------------------------
|
||||
|
||||
static List<Token> getTokens(TokenStream tstream) throws IOException {
|
||||
List<Token> tokens = new ArrayList<Token>();
|
||||
while (true) {
|
||||
Token t = tstream.next();
|
||||
if (t==null) break;
|
||||
tokens.add(t);
|
||||
}
|
||||
return tokens;
|
||||
public static void assertAnalyzesTo(Analyzer a, String input, String[] output)
|
||||
throws IOException {
|
||||
assertAnalyzesTo(a, input, output, null, null, null, null);
|
||||
}
|
||||
|
||||
public static class IterTokenStream extends TokenStream {
|
||||
Iterator<Token> toks;
|
||||
public IterTokenStream(Token... toks) {
|
||||
this.toks = Arrays.asList(toks).iterator();
|
||||
public static void assertAnalyzesTo(Analyzer a, String input,
|
||||
String[] output, String[] types) throws IOException {
|
||||
assertAnalyzesTo(a, input, output, null, null, types, null);
|
||||
}
|
||||
public IterTokenStream(Iterable<Token> toks) {
|
||||
this.toks = toks.iterator();
|
||||
|
||||
public static void assertAnalyzesTo(Analyzer a, String input,
|
||||
String[] output, int[] posIncrements) throws IOException {
|
||||
assertAnalyzesTo(a, input, output, null, null, null, posIncrements);
|
||||
}
|
||||
public IterTokenStream(Iterator<Token> toks) {
|
||||
this.toks = toks;
|
||||
}
|
||||
public IterTokenStream(String ... text) {
|
||||
int off = 0;
|
||||
ArrayList<Token> t = new ArrayList<Token>( text.length );
|
||||
for( String txt : text ) {
|
||||
t.add( new Token( txt, off, off+txt.length() ) );
|
||||
off += txt.length() + 2;
|
||||
}
|
||||
this.toks = t.iterator();
|
||||
}
|
||||
@Override
|
||||
public Token next() {
|
||||
if (toks.hasNext()) {
|
||||
return toks.next();
|
||||
}
|
||||
return null;
|
||||
|
||||
public static void assertAnalyzesTo(Analyzer a, String input,
|
||||
String[] output, int startOffsets[], int endOffsets[]) throws IOException {
|
||||
assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null, null);
|
||||
}
|
||||
|
||||
public static void assertAnalyzesTo(Analyzer a, String input,
|
||||
String[] output, int startOffsets[], int endOffsets[], int[] posIncrements)
|
||||
throws IOException {
|
||||
assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null,
|
||||
posIncrements);
|
||||
}
|
||||
}
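As a rough illustration of how the helpers above are meant to be called (not part of the commit; the method name, input text and expected values are made up, and the snippet assumes a subclass of BaseTokenTestCase with Lucene's WhitespaceTokenizer and java.io.StringReader imported):

public void testWhitespaceTokenizer() throws Exception {
  TokenStream ts = new WhitespaceTokenizer(new StringReader("quick brown fox"));
  assertTokenStreamContents(ts,
      new String[] { "quick", "brown", "fox" },  // expected terms
      new int[] { 0, 6, 12 },                    // expected start offsets
      new int[] { 5, 11, 15 },                   // expected end offsets
      new int[] { 1, 1, 1 });                    // expected position increments
}

The helper drives incrementToken() itself, plants bogus attribute values before every step to catch filters that rely on stale state, and finally checks that the stream is exhausted before calling end() and close().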
@ -17,9 +17,13 @@ package org.apache.solr.analysis;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
import org.apache.solr.util.AbstractSolrTestCase;
|
||||
import org.apache.solr.common.ResourceLoader;
|
||||
|
||||
import java.io.StringReader;
|
||||
import java.util.Set;
|
||||
import java.util.Map;
|
||||
import java.util.HashMap;
|
||||
|
@ -29,7 +33,7 @@ import java.util.HashMap;
|
|||
* used by the StopFilterFactoryTest TODO: consider creating separate test files
|
||||
* so this won't break if stop filter test files change
|
||||
**/
|
||||
public class CommonGramsFilterFactoryTest extends AbstractSolrTestCase {
|
||||
public class CommonGramsFilterFactoryTest extends BaseTokenTestCase {
|
||||
public String getSchemaFile() {
|
||||
return "schema-stop-keep.xml";
|
||||
}
|
||||
|
@ -66,4 +70,23 @@ public class CommonGramsFilterFactoryTest extends AbstractSolrTestCase {
|
|||
.isIgnoreCase() == true);
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* If no words are provided, then a set of English default stopwords is used.
|
||||
*/
|
||||
public void testDefaults() throws Exception {
|
||||
ResourceLoader loader = solrConfig.getResourceLoader();
|
||||
assertTrue("loader is null and it shouldn't be", loader != null);
|
||||
CommonGramsFilterFactory factory = new CommonGramsFilterFactory();
|
||||
Map<String, String> args = new HashMap<String, String>();
|
||||
factory.init(args);
|
||||
factory.inform(loader);
|
||||
Set words = factory.getCommonWords();
|
||||
assertTrue("words is null and it shouldn't be", words != null);
|
||||
assertTrue(words.contains("the"));
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader("testing the factory"));
|
||||
TokenStream stream = factory.create(tokenizer);
|
||||
assertTokenStreamContents(stream,
|
||||
new String[] { "testing", "testing_the", "the", "the_factory", "factory" });
|
||||
}
|
||||
}
@ -16,29 +16,20 @@
|
|||
*/
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.StringTokenizer;
|
||||
import java.util.Map.Entry;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.solr.analysis.TestBufferedTokenStream.AB_AAB_Stream;
|
||||
|
||||
/**
|
||||
* Tests CommonGramsQueryFilter
|
||||
*/
|
||||
public class CommonGramsFilterTest extends TestCase {
|
||||
public class CommonGramsFilterTest extends BaseTokenTestCase {
|
||||
private static final String[] commonWords = { "s", "a", "b", "c", "d", "the",
|
||||
"of" };
|
||||
|
||||
|
@ -63,18 +54,6 @@ public class CommonGramsFilterTest extends TestCase {
|
|||
assertEquals("How", term.term());
|
||||
}
|
||||
|
||||
public void testCommonGramsQueryFilter() throws Exception {
|
||||
Set<Map.Entry<String, String>> input2expectedSet = initQueryMap().entrySet();
|
||||
for (Iterator<Entry<String, String>> i = input2expectedSet.iterator(); i
|
||||
.hasNext();) {
|
||||
Map.Entry<String, String> me = i.next();
|
||||
String input = me.getKey();
|
||||
String expected = me.getValue();
|
||||
String message = "message: input value is: " + input;
|
||||
assertEquals(message, expected, testFilter(input, "query"));
|
||||
}
|
||||
}
|
||||
|
||||
public void testQueryReset() throws Exception {
|
||||
final String input = "How the s a brown s cow d like A B thing?";
|
||||
WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
|
||||
|
@ -93,18 +72,6 @@ public class CommonGramsFilterTest extends TestCase {
|
|||
assertEquals("How_the", term.term());
|
||||
}
|
||||
|
||||
public void testCommonGramsFilter() throws Exception {
|
||||
Set<Map.Entry<String, String>> input2expectedSet = initMap().entrySet();
|
||||
for (Iterator<Entry<String, String>> i = input2expectedSet.iterator(); i
|
||||
.hasNext();) {
|
||||
Map.Entry<String, String> me = i.next();
|
||||
String input = me.getKey();
|
||||
String expected = me.getValue();
|
||||
String message = "message: input value is: " + input;
|
||||
assertEquals(message, expected, testFilter(input, "common"));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* This is for testing CommonGramsQueryFilter which outputs a set of tokens
|
||||
* optimized for querying with only one token at each position, either a
|
||||
|
@ -116,150 +83,226 @@ public class CommonGramsFilterTest extends TestCase {
|
|||
*
|
||||
* @return Map<String,String>
|
||||
*/
|
||||
private static Map<String, String> initQueryMap() {
|
||||
Map<String, String> input2expected = new LinkedHashMap<String, String>();
|
||||
public void testCommonGramsQueryFilter() throws Exception {
|
||||
Analyzer a = new Analyzer() {
|
||||
@Override
|
||||
public TokenStream tokenStream(String field, Reader in) {
|
||||
return new CommonGramsQueryFilter(new CommonGramsFilter(
|
||||
new WhitespaceTokenizer(in), commonWords));
|
||||
}
|
||||
};
|
||||
|
||||
// Stop words used below are "of" "the" and "s"
|
||||
|
||||
// two word queries
|
||||
input2expected.put("brown fox", "/brown/fox");
|
||||
input2expected.put("the fox", "/the_fox");
|
||||
input2expected.put("fox of", "/fox_of");
|
||||
input2expected.put("of the", "/of_the");
|
||||
assertAnalyzesTo(a, "brown fox",
|
||||
new String[] { "brown", "fox" });
|
||||
assertAnalyzesTo(a, "the fox",
|
||||
new String[] { "the_fox" });
|
||||
assertAnalyzesTo(a, "fox of",
|
||||
new String[] { "fox_of" });
|
||||
assertAnalyzesTo(a, "of the",
|
||||
new String[] { "of_the" });
|
||||
|
||||
// one word queries
|
||||
input2expected.put("the", "/the");
|
||||
input2expected.put("foo", "/foo");
|
||||
assertAnalyzesTo(a, "the",
|
||||
new String[] { "the" });
|
||||
assertAnalyzesTo(a, "foo",
|
||||
new String[] { "foo" });
|
||||
|
||||
// 3 word combinations s=stopword/common word n=not a stop word
|
||||
input2expected.put("n n n", "/n/n/n");
|
||||
input2expected.put("quick brown fox", "/quick/brown/fox");
|
||||
assertAnalyzesTo(a, "n n n",
|
||||
new String[] { "n", "n", "n" });
|
||||
assertAnalyzesTo(a, "quick brown fox",
|
||||
new String[] { "quick", "brown", "fox" });
|
||||
|
||||
input2expected.put("n n s", "/n/n_s");
|
||||
input2expected.put("quick brown the", "/quick/brown_the");
|
||||
assertAnalyzesTo(a, "n n s",
|
||||
new String[] { "n", "n_s" });
|
||||
assertAnalyzesTo(a, "quick brown the",
|
||||
new String[] { "quick", "brown_the" });
|
||||
|
||||
input2expected.put("n s n", "/n_s/s_n");
|
||||
input2expected.put("quick the brown", "/quick_the/the_brown");
|
||||
assertAnalyzesTo(a, "n s n",
|
||||
new String[] { "n_s", "s_n" });
|
||||
assertAnalyzesTo(a, "quick the brown",
|
||||
new String[] { "quick_the", "the_brown" });
|
||||
|
||||
input2expected.put("n s s", "/n_s/s_s");
|
||||
input2expected.put("fox of the", "/fox_of/of_the");
|
||||
assertAnalyzesTo(a, "n s s",
|
||||
new String[] { "n_s", "s_s" });
|
||||
assertAnalyzesTo(a, "fox of the",
|
||||
new String[] { "fox_of", "of_the" });
|
||||
|
||||
input2expected.put("s n n", "/s_n/n/n");
|
||||
input2expected.put("the quick brown", "/the_quick/quick/brown");
|
||||
assertAnalyzesTo(a, "s n n",
|
||||
new String[] { "s_n", "n", "n" });
|
||||
assertAnalyzesTo(a, "the quick brown",
|
||||
new String[] { "the_quick", "quick", "brown" });
|
||||
|
||||
input2expected.put("s n s", "/s_n/n_s");
|
||||
input2expected.put("the fox of", "/the_fox/fox_of");
|
||||
assertAnalyzesTo(a, "s n s",
|
||||
new String[] { "s_n", "n_s" });
|
||||
assertAnalyzesTo(a, "the fox of",
|
||||
new String[] { "the_fox", "fox_of" });
|
||||
|
||||
input2expected.put("s s n", "/s_s/s_n");
|
||||
input2expected.put("of the fox", "/of_the/the_fox");
|
||||
assertAnalyzesTo(a, "s s n",
|
||||
new String[] { "s_s", "s_n" });
|
||||
assertAnalyzesTo(a, "of the fox",
|
||||
new String[] { "of_the", "the_fox" });
|
||||
|
||||
input2expected.put("s s s", "/s_s/s_s");
|
||||
input2expected.put("of the of", "/of_the/the_of");
|
||||
|
||||
return input2expected;
|
||||
assertAnalyzesTo(a, "s s s",
|
||||
new String[] { "s_s", "s_s" });
|
||||
assertAnalyzesTo(a, "of the of",
|
||||
new String[] { "of_the", "the_of" });
|
||||
}
|
||||
|
||||
private static Map<String, String> initMap() {
|
||||
Map<String, String> input2expected = new HashMap<String, String>();
|
||||
public void testCommonGramsFilter() throws Exception {
|
||||
Analyzer a = new Analyzer() {
|
||||
@Override
|
||||
public TokenStream tokenStream(String field, Reader in) {
|
||||
return new CommonGramsFilter(
|
||||
new WhitespaceTokenizer(in), commonWords);
|
||||
}
|
||||
};
|
||||
|
||||
// Stop words used below are "of" "the" and "s"
|
||||
// one word queries
|
||||
input2expected.put("the", "/the");
|
||||
input2expected.put("foo", "/foo");
|
||||
assertAnalyzesTo(a, "the", new String[] { "the" });
|
||||
assertAnalyzesTo(a, "foo", new String[] { "foo" });
|
||||
|
||||
// two word queries
|
||||
input2expected.put("brown fox", "/brown/fox");
|
||||
input2expected.put("the fox", "/the,the_fox/fox");
|
||||
input2expected.put("fox of", "/fox,fox_of/of");
|
||||
input2expected.put("of the", "/of,of_the/the");
|
||||
assertAnalyzesTo(a, "brown fox",
|
||||
new String[] { "brown", "fox" },
|
||||
new int[] { 1, 1 });
|
||||
assertAnalyzesTo(a, "the fox",
|
||||
new String[] { "the", "the_fox", "fox" },
|
||||
new int[] { 1, 0, 1 });
|
||||
assertAnalyzesTo(a, "fox of",
|
||||
new String[] { "fox", "fox_of", "of" },
|
||||
new int[] { 1, 0, 1 });
|
||||
assertAnalyzesTo(a, "of the",
|
||||
new String[] { "of", "of_the", "the" },
|
||||
new int[] { 1, 0, 1 });
|
||||
|
||||
// 3 word combinations s=stopword/common word n=not a stop word
|
||||
input2expected.put("n n n", "/n/n/n");
|
||||
input2expected.put("quick brown fox", "/quick/brown/fox");
|
||||
assertAnalyzesTo(a, "n n n",
|
||||
new String[] { "n", "n", "n" },
|
||||
new int[] { 1, 1, 1 });
|
||||
assertAnalyzesTo(a, "quick brown fox",
|
||||
new String[] { "quick", "brown", "fox" },
|
||||
new int[] { 1, 1, 1 });
|
||||
|
||||
input2expected.put("n n s", "/n/n,n_s/s");
|
||||
input2expected.put("quick brown the", "/quick/brown,brown_the/the");
|
||||
assertAnalyzesTo(a, "n n s",
|
||||
new String[] { "n", "n", "n_s", "s" },
|
||||
new int[] { 1, 1, 0, 1 });
|
||||
assertAnalyzesTo(a, "quick brown the",
|
||||
new String[] { "quick", "brown", "brown_the", "the" },
|
||||
new int[] { 1, 1, 0, 1 });
|
||||
|
||||
input2expected.put("n s n", "/n,n_s/s,s_n/n");
|
||||
input2expected.put("quick the fox", "/quick,quick_the/the,the_fox/fox");
|
||||
assertAnalyzesTo(a, "n s n",
|
||||
new String[] { "n", "n_s", "s", "s_n", "n" },
|
||||
new int[] { 1, 0, 1, 0, 1 });
|
||||
assertAnalyzesTo(a, "quick the fox",
|
||||
new String[] { "quick", "quick_the", "the", "the_fox", "fox" },
|
||||
new int[] { 1, 0, 1, 0, 1 });
|
||||
|
||||
input2expected.put("n s s", "/n,n_s/s,s_s/s");
|
||||
input2expected.put("fox of the", "/fox,fox_of/of,of_the/the");
|
||||
assertAnalyzesTo(a, "n s s",
|
||||
new String[] { "n", "n_s", "s", "s_s", "s" },
|
||||
new int[] { 1, 0, 1, 0, 1 });
|
||||
assertAnalyzesTo(a, "fox of the",
|
||||
new String[] { "fox", "fox_of", "of", "of_the", "the" },
|
||||
new int[] { 1, 0, 1, 0, 1 });
|
||||
|
||||
input2expected.put("s n n", "/s,s_n/n/n");
|
||||
input2expected.put("the quick brown", "/the,the_quick/quick/brown");
|
||||
assertAnalyzesTo(a, "s n n",
|
||||
new String[] { "s", "s_n", "n", "n" },
|
||||
new int[] { 1, 0, 1, 1 });
|
||||
assertAnalyzesTo(a, "the quick brown",
|
||||
new String[] { "the", "the_quick", "quick", "brown" },
|
||||
new int[] { 1, 0, 1, 1 });
|
||||
|
||||
input2expected.put("s n s", "/s,s_n/n,n_s/s");
|
||||
input2expected.put("the fox of", "/the,the_fox/fox,fox_of/of");
|
||||
assertAnalyzesTo(a, "s n s",
|
||||
new String[] { "s", "s_n", "n", "n_s", "s" },
|
||||
new int[] { 1, 0, 1, 0, 1 });
|
||||
assertAnalyzesTo(a, "the fox of",
|
||||
new String[] { "the", "the_fox", "fox", "fox_of", "of" },
|
||||
new int[] { 1, 0, 1, 0, 1 });
|
||||
|
||||
input2expected.put("s s n", "/s,s_s/s,s_n/n");
|
||||
input2expected.put("of the fox", "/of,of_the/the,the_fox/fox");
|
||||
assertAnalyzesTo(a, "s s n",
|
||||
new String[] { "s", "s_s", "s", "s_n", "n" },
|
||||
new int[] { 1, 0, 1, 0, 1 });
|
||||
assertAnalyzesTo(a, "of the fox",
|
||||
new String[] { "of", "of_the", "the", "the_fox", "fox" },
|
||||
new int[] { 1, 0, 1, 0, 1 });
|
||||
|
||||
input2expected.put("s s s", "/s,s_s/s,s_s/s");
|
||||
input2expected.put("of the of", "/of,of_the/the,the_of/of");
|
||||
|
||||
return input2expected;
|
||||
assertAnalyzesTo(a, "s s s",
|
||||
new String[] { "s", "s_s", "s", "s_s", "s" },
|
||||
new int[] { 1, 0, 1, 0, 1 });
|
||||
assertAnalyzesTo(a, "of the of",
|
||||
new String[] { "of", "of_the", "the", "the_of", "of" },
|
||||
new int[] { 1, 0, 1, 0, 1 });
|
||||
}
|
||||
|
||||
/*
|
||||
* Helper methods copied from CDL XTF BigramsStopFilter.java and slightly
|
||||
* modified to use with CommonGrams http://xtf.wiki.sourceforge.net/
|
||||
*/
|
||||
/**
|
||||
* Very simple tokenizer that breaks up a string into a series of Lucene
|
||||
* {@link Token Token}s.
|
||||
* Test that CommonGramsFilter works correctly in case-sensitive mode
|
||||
*/
|
||||
static class StringTokenStream extends TokenStream {
|
||||
private String str;
|
||||
|
||||
private int prevEnd = 0;
|
||||
|
||||
private StringTokenizer tok;
|
||||
|
||||
private int count = 0;
|
||||
|
||||
public StringTokenStream(String str, String delim) {
|
||||
this.str = str;
|
||||
tok = new StringTokenizer(str, delim);
|
||||
public void testCaseSensitive() throws Exception {
|
||||
final String input = "How The s a brown s cow d like A B thing?";
|
||||
WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
|
||||
Set common = CommonGramsFilter.makeCommonSet(commonWords);
|
||||
TokenFilter cgf = new CommonGramsFilter(wt, common, false);
|
||||
assertTokenStreamContents(cgf, new String[] {"How", "The", "The_s", "s",
|
||||
"s_a", "a", "a_brown", "brown", "brown_s", "s", "s_cow", "cow",
|
||||
"cow_d", "d", "d_like", "like", "A", "B", "thing?"});
|
||||
}
|
||||
|
||||
public Token next() {
|
||||
if (!tok.hasMoreTokens())
|
||||
return null;
|
||||
count++;
|
||||
String term = tok.nextToken();
|
||||
Token t = new Token(term, str.indexOf(term, prevEnd), str.indexOf(term,
|
||||
prevEnd)
|
||||
+ term.length(), "word");
|
||||
prevEnd = t.endOffset();
|
||||
return t;
|
||||
}
|
||||
/**
|
||||
* Test CommonGramsQueryFilter in the case that the last word is a stopword
|
||||
*/
|
||||
public void testLastWordisStopWord() throws Exception {
|
||||
final String input = "dog the";
|
||||
WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
|
||||
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
|
||||
TokenFilter nsf = new CommonGramsQueryFilter(cgf);
|
||||
assertTokenStreamContents(nsf, new String[] { "dog_the" });
|
||||
}
|
||||
|
||||
public static String testFilter(String in, String type) throws IOException {
|
||||
TokenStream nsf;
|
||||
StringTokenStream ts = new StringTokenStream(in, " .");
|
||||
if (type.equals("query")) {
|
||||
CommonGramsFilter cgf = new CommonGramsFilter(ts, commonWords);
|
||||
nsf = new CommonGramsQueryFilter(cgf);
|
||||
} else {
|
||||
nsf = new CommonGramsFilter(ts, commonWords);
|
||||
/**
|
||||
* Test CommonGramsQueryFilter in the case that the first word is a stopword
|
||||
*/
|
||||
public void testFirstWordisStopWord() throws Exception {
|
||||
final String input = "the dog";
|
||||
WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
|
||||
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
|
||||
TokenFilter nsf = new CommonGramsQueryFilter(cgf);
|
||||
assertTokenStreamContents(nsf, new String[] { "the_dog" });
|
||||
}
|
||||
|
||||
StringBuffer outBuf = new StringBuffer();
|
||||
while (true) {
|
||||
Token t = nsf.next();
|
||||
if (t == null)
|
||||
break;
|
||||
for (int i = 0; i < t.getPositionIncrement(); i++)
|
||||
outBuf.append('/');
|
||||
if (t.getPositionIncrement() == 0)
|
||||
outBuf.append(',');
|
||||
outBuf.append(t.term());
|
||||
/**
|
||||
* Test CommonGramsQueryFilter in the case of a single (stop)word query
|
||||
*/
|
||||
public void testOneWordQueryStopWord() throws Exception {
|
||||
final String input = "the";
|
||||
WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
|
||||
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
|
||||
TokenFilter nsf = new CommonGramsQueryFilter(cgf);
|
||||
assertTokenStreamContents(nsf, new String[] { "the" });
|
||||
}
|
||||
|
||||
String out = outBuf.toString();
|
||||
out = out.replaceAll(" ", "");
|
||||
return out;
|
||||
/**
|
||||
* Test CommonGramsQueryFilter in the case of a single word query
|
||||
*/
|
||||
public void testOneWordQuery() throws Exception {
|
||||
final String input = "monster";
|
||||
WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
|
||||
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
|
||||
TokenFilter nsf = new CommonGramsQueryFilter(cgf);
|
||||
assertTokenStreamContents(nsf, new String[] { "monster" });
|
||||
}
|
||||
|
||||
/**
|
||||
* Test CommonGramsQueryFilter when first and last words are stopwords.
|
||||
*/
|
||||
public void testFirstAndLastStopWord() throws Exception {
|
||||
final String input = "the of";
|
||||
WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
|
||||
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
|
||||
TokenFilter nsf = new CommonGramsQueryFilter(cgf);
|
||||
assertTokenStreamContents(nsf, new String[] { "the_of" });
|
||||
}
|
||||
}
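Taken together, the tests above pin down the contract of the two filters: CommonGramsFilter (index time) emits every original token and, at the same position, a bigram joining a common word with its neighbor, while CommonGramsQueryFilter (query time) keeps only one token per position, preferring the bigram. A condensed sketch of that contrast follows; it is not part of the commit, the method name is invented, and it assumes it lives inside CommonGramsFilterTest so that commonWords (which contains "the") and the BaseTokenTestCase helpers are in scope.

public void testIndexVersusQueryVariant() throws Exception {
  // Index-time filter: original tokens plus bigrams with position increment 0.
  TokenStream ts = new WhitespaceTokenizer(new StringReader("the quick fox"));
  assertTokenStreamContents(new CommonGramsFilter(ts, commonWords),
      new String[] { "the", "the_quick", "quick", "fox" },
      new int[] { 1, 0, 1, 1 });

  // Query-time variant: a single token per position, the bigram where one exists.
  ts = new WhitespaceTokenizer(new StringReader("the quick fox"));
  assertTokenStreamContents(
      new CommonGramsQueryFilter(new CommonGramsFilter(ts, commonWords)),
      new String[] { "the_quick", "quick", "fox" });
}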
@ -16,9 +16,12 @@
|
|||
*/
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import org.apache.solr.util.AbstractSolrTestCase;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
import org.apache.solr.common.ResourceLoader;
|
||||
|
||||
import java.io.StringReader;
|
||||
import java.util.Set;
|
||||
import java.util.Map;
|
||||
import java.util.HashMap;
|
||||
|
@ -28,7 +31,7 @@ import java.util.HashMap;
|
|||
* used by the StopFilterFactoryTest TODO: consider creating separate test files
|
||||
* so this won't break if stop filter test files change
|
||||
**/
|
||||
public class CommonGramsQueryFilterFactoryTest extends AbstractSolrTestCase {
|
||||
public class CommonGramsQueryFilterFactoryTest extends BaseTokenTestCase {
|
||||
public String getSchemaFile() {
|
||||
return "schema-stop-keep.xml";
|
||||
}
|
||||
|
@ -65,4 +68,23 @@ public class CommonGramsQueryFilterFactoryTest extends AbstractSolrTestCase {
|
|||
.isIgnoreCase() == true);
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* If no words are provided, then a set of English default stopwords is used.
|
||||
*/
|
||||
public void testDefaults() throws Exception {
|
||||
ResourceLoader loader = solrConfig.getResourceLoader();
|
||||
assertTrue("loader is null and it shouldn't be", loader != null);
|
||||
CommonGramsQueryFilterFactory factory = new CommonGramsQueryFilterFactory();
|
||||
Map<String, String> args = new HashMap<String, String>();
|
||||
factory.init(args);
|
||||
factory.inform(loader);
|
||||
Set words = factory.getCommonWords();
|
||||
assertTrue("words is null and it shouldn't be", words != null);
|
||||
assertTrue(words.contains("the"));
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader("testing the factory"));
|
||||
TokenStream stream = factory.create(tokenizer);
|
||||
assertTokenStreamContents(stream,
|
||||
new String[] { "testing_the", "the_factory" });
|
||||
}
|
||||
}
@ -16,36 +16,24 @@
|
|||
*/
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import java.io.StringReader;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.solr.analysis.BaseTokenTestCase.IterTokenStream;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
|
||||
public class DoubleMetaphoneFilterFactoryTest extends TestCase {
|
||||
public class DoubleMetaphoneFilterFactoryTest extends BaseTokenTestCase {
|
||||
|
||||
public void testDefaults() throws Exception {
|
||||
DoubleMetaphoneFilterFactory factory = new DoubleMetaphoneFilterFactory();
|
||||
factory.init(new HashMap<String, String>());
|
||||
TokenStream inputStream = new IterTokenStream("international");
|
||||
TokenStream inputStream = new WhitespaceTokenizer(new StringReader("international"));
|
||||
|
||||
TokenStream filteredStream = factory.create(inputStream);
|
||||
|
||||
assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass());
|
||||
|
||||
Token token = filteredStream.next(new Token());
|
||||
assertEquals(13, token.termLength());
|
||||
assertEquals("international", new String(token.termBuffer(), 0, token
|
||||
.termLength()));
|
||||
|
||||
token = filteredStream.next(new Token());
|
||||
assertEquals(4, token.termLength());
|
||||
assertEquals("ANTR", new String(token.termBuffer(), 0, token.termLength()));
|
||||
|
||||
assertNull(filteredStream.next(new Token()));
|
||||
assertTokenStreamContents(filteredStream, new String[] { "international", "ANTR" });
|
||||
}
|
||||
|
||||
public void testSettingSizeAndInject() throws Exception {
|
||||
|
@ -55,17 +43,31 @@ public class DoubleMetaphoneFilterFactoryTest extends TestCase {
|
|||
parameters.put("maxCodeLength", "8");
|
||||
factory.init(parameters);
|
||||
|
||||
TokenStream inputStream = new IterTokenStream("international");
|
||||
TokenStream inputStream = new WhitespaceTokenizer(new StringReader("international"));
|
||||
|
||||
TokenStream filteredStream = factory.create(inputStream);
|
||||
assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass());
|
||||
assertTokenStreamContents(filteredStream, new String[] { "ANTRNXNL" });
|
||||
}
|
||||
|
||||
/**
|
||||
* Ensure that reset() removes any state (buffered tokens)
|
||||
*/
|
||||
public void testReset() throws Exception {
|
||||
DoubleMetaphoneFilterFactory factory = new DoubleMetaphoneFilterFactory();
|
||||
factory.init(new HashMap<String, String>());
|
||||
TokenStream inputStream = new WhitespaceTokenizer(new StringReader("international"));
|
||||
|
||||
TokenStream filteredStream = factory.create(inputStream);
|
||||
TermAttribute termAtt = (TermAttribute) filteredStream.addAttribute(TermAttribute.class);
|
||||
assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass());
|
||||
|
||||
Token token = filteredStream.next(new Token());
|
||||
assertEquals(8, token.termLength());
|
||||
assertEquals("ANTRNXNL", new String(token.termBuffer(), 0, token
|
||||
.termLength()));
|
||||
assertTrue(filteredStream.incrementToken());
|
||||
assertEquals(13, termAtt.termLength());
|
||||
assertEquals("international", termAtt.term());
|
||||
filteredStream.reset();
|
||||
|
||||
assertNull(filteredStream.next(new Token()));
|
||||
// ensure there are no more tokens, such as ANTRNXNL
|
||||
assertFalse(filteredStream.incrementToken());
|
||||
}
|
||||
}
@ -16,94 +16,52 @@
|
|||
*/
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import junit.framework.TestCase;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.solr.analysis.BaseTokenTestCase.IterTokenStream;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
|
||||
public class DoubleMetaphoneFilterTest extends TestCase {
|
||||
public class DoubleMetaphoneFilterTest extends BaseTokenTestCase {
|
||||
|
||||
public void testSize4FalseInject() throws Exception {
|
||||
TokenStream stream = new IterTokenStream("international");
|
||||
TokenStream stream = new WhitespaceTokenizer(new StringReader("international"));
|
||||
TokenStream filter = new DoubleMetaphoneFilter(stream, 4, false);
|
||||
|
||||
Token token = filter.next(new Token());
|
||||
assertEquals(4, token.termLength());
|
||||
assertEquals("ANTR", new String(token.termBuffer(), 0, token.termLength()));
|
||||
|
||||
assertNull(filter.next(new Token()));
|
||||
assertTokenStreamContents(filter, new String[] { "ANTR" });
|
||||
}
|
||||
|
||||
public void testSize4TrueInject() throws Exception {
|
||||
TokenStream stream = new IterTokenStream("international");
|
||||
TokenStream stream = new WhitespaceTokenizer(new StringReader("international"));
|
||||
TokenStream filter = new DoubleMetaphoneFilter(stream, 4, true);
|
||||
|
||||
Token token = filter.next(new Token());
|
||||
assertEquals(13, token.termLength());
|
||||
assertEquals("international", new String(token.termBuffer(), 0, token
|
||||
.termLength()));
|
||||
|
||||
token = filter.next(new Token());
|
||||
assertEquals(4, token.termLength());
|
||||
assertEquals("ANTR", new String(token.termBuffer(), 0, token.termLength()));
|
||||
|
||||
assertNull(filter.next(new Token()));
|
||||
assertTokenStreamContents(filter, new String[] { "international", "ANTR" });
|
||||
}
|
||||
|
||||
public void testAlternateInjectFalse() throws Exception {
|
||||
TokenStream stream = new IterTokenStream("Kuczewski");
|
||||
TokenStream stream = new WhitespaceTokenizer(new StringReader("Kuczewski"));
|
||||
TokenStream filter = new DoubleMetaphoneFilter(stream, 4, false);
|
||||
|
||||
Token token = filter.next(new Token());
|
||||
assertEquals(4, token.termLength());
|
||||
assertEquals("KSSK", new String(token.termBuffer(), 0, token.termLength()));
|
||||
|
||||
token = filter.next(new Token());
|
||||
assertEquals(4, token.termLength());
|
||||
assertEquals("KXFS", new String(token.termBuffer(), 0, token.termLength()));
|
||||
assertNull(filter.next(new Token()));
|
||||
assertTokenStreamContents(filter, new String[] { "KSSK", "KXFS" });
|
||||
}
|
||||
|
||||
public void testSize8FalseInject() throws Exception {
|
||||
TokenStream stream = new IterTokenStream("international");
|
||||
TokenStream stream = new WhitespaceTokenizer(new StringReader("international"));
|
||||
TokenStream filter = new DoubleMetaphoneFilter(stream, 8, false);
|
||||
|
||||
Token token = filter.next(new Token());
|
||||
assertEquals(8, token.termLength());
|
||||
assertEquals("ANTRNXNL", new String(token.termBuffer(), 0, token
|
||||
.termLength()));
|
||||
|
||||
assertNull(filter.next(new Token()));
|
||||
assertTokenStreamContents(filter, new String[] { "ANTRNXNL" });
|
||||
}
|
||||
|
||||
public void testNonConvertableStringsWithInject() throws Exception {
|
||||
TokenStream stream = new IterTokenStream(
|
||||
new String[] { "12345", "#$%@#^%&" });
|
||||
TokenStream stream = new WhitespaceTokenizer(new StringReader("12345 #$%@#^%&"));
|
||||
TokenStream filter = new DoubleMetaphoneFilter(stream, 8, true);
|
||||
|
||||
Token token = filter.next(new Token());
|
||||
assertEquals(5, token.termLength());
|
||||
assertEquals("12345", new String(token.termBuffer(), 0, token.termLength()));
|
||||
|
||||
token = filter.next(new Token());
|
||||
assertEquals(8, token.termLength());
|
||||
assertEquals("#$%@#^%&", new String(token.termBuffer(), 0, token
|
||||
.termLength()));
|
||||
assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&" });
|
||||
}
|
||||
|
||||
public void testNonConvertableStringsWithoutInject() throws Exception {
|
||||
TokenStream stream = new IterTokenStream(
|
||||
new String[] { "12345", "#$%@#^%&" });
|
||||
TokenStream stream = new WhitespaceTokenizer(new StringReader("12345 #$%@#^%&"));
|
||||
TokenStream filter = new DoubleMetaphoneFilter(stream, 8, false);
|
||||
|
||||
assertEquals("12345", filter.next(new Token()).term());
|
||||
assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&" });
|
||||
|
||||
// should have something after the stream
|
||||
stream = new IterTokenStream(
|
||||
new String[] { "12345", "#$%@#^%&", "hello" });
|
||||
stream = new WhitespaceTokenizer(new StringReader("12345 #$%@#^%& hello"));
|
||||
filter = new DoubleMetaphoneFilter(stream, 8, false);
|
||||
assertNotNull(filter.next(new Token()));
|
||||
assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&", "HL" });
|
||||
}
|
||||
|
||||
}
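For reference, the DoubleMetaphoneFilter constructor arguments exercised above are (input stream, maximum code length, inject): with inject=true the filter keeps the original token and adds its Double Metaphone encoding after it, with inject=false it emits only the encoding, limited to the given code length. The sketch below merely restates expectations already asserted in this test; it is not part of the commit and assumes it runs inside a BaseTokenTestCase subclass with the same imports as the tests above.

TokenStream ts = new WhitespaceTokenizer(new StringReader("international"));
// inject=true: the original token followed by its 4-character encoding
assertTokenStreamContents(new DoubleMetaphoneFilter(ts, 4, true),
    new String[] { "international", "ANTR" });

ts = new WhitespaceTokenizer(new StringReader("international"));
// inject=false: only the encoding is emitted
assertTokenStreamContents(new DoubleMetaphoneFilter(ts, 4, false),
    new String[] { "ANTR" });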
@ -16,11 +16,17 @@ package org.apache.solr.analysis;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
import org.apache.solr.common.ResourceLoader;
|
||||
import org.apache.solr.common.util.StrUtils;
|
||||
import org.tartarus.snowball.ext.EnglishStemmer;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.StringReader;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
@ -32,11 +38,11 @@ public class EnglishPorterFilterFactoryTest extends BaseTokenTestCase {
|
|||
public void test() throws IOException {
|
||||
EnglishStemmer stemmer = new EnglishStemmer();
|
||||
String[] test = {"The", "fledgling", "banks", "were", "counting", "on", "a", "big", "boom", "in", "banking"};
|
||||
StringBuilder gold = new StringBuilder();
|
||||
String[] gold = new String[test.length];
|
||||
for (int i = 0; i < test.length; i++) {
|
||||
stemmer.setCurrent(test[i]);
|
||||
stemmer.stem();
|
||||
gold.append(stemmer.getCurrent()).append(' ');
|
||||
gold[i] = stemmer.getCurrent();
|
||||
}
|
||||
|
||||
EnglishPorterFilterFactory factory = new EnglishPorterFilterFactory();
|
||||
|
@ -44,21 +50,23 @@ public class EnglishPorterFilterFactoryTest extends BaseTokenTestCase {
|
|||
|
||||
factory.init(args);
|
||||
factory.inform(new LinesMockSolrResourceLoader(new ArrayList<String>()));
|
||||
String out = tsToString(factory.create(new IterTokenStream(test)));
|
||||
assertEquals(gold.toString().trim(), out);
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer(
|
||||
new StringReader(StrUtils.join(Arrays.asList(test), ' ')));
|
||||
TokenStream stream = factory.create(tokenizer);
|
||||
assertTokenStreamContents(stream, gold);
|
||||
}
|
||||
|
||||
public void testProtected() throws Exception {
|
||||
EnglishStemmer stemmer = new EnglishStemmer();
|
||||
String[] test = {"The", "fledgling", "banks", "were", "counting", "on", "a", "big", "boom", "in", "banking"};
|
||||
StringBuilder gold = new StringBuilder();
|
||||
String[] gold = new String[test.length];
|
||||
for (int i = 0; i < test.length; i++) {
|
||||
if (test[i].equals("fledgling") == false && test[i].equals("banks") == false) {
|
||||
stemmer.setCurrent(test[i]);
|
||||
stemmer.stem();
|
||||
gold.append(stemmer.getCurrent()).append(' ');
|
||||
gold[i] = stemmer.getCurrent();
|
||||
} else {
|
||||
gold.append(test[i]).append(' ');
|
||||
gold[i] = test[i];
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -69,8 +77,10 @@ public class EnglishPorterFilterFactoryTest extends BaseTokenTestCase {
|
|||
List<String> lines = new ArrayList<String>();
|
||||
Collections.addAll(lines, "banks", "fledgling");
|
||||
factory.inform(new LinesMockSolrResourceLoader(lines));
|
||||
String out = tsToString(factory.create(new IterTokenStream(test)));
|
||||
assertEquals(gold.toString().trim(), out);
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer(
|
||||
new StringReader(StrUtils.join(Arrays.asList(test), ' ')));
|
||||
TokenStream stream = factory.create(tokenizer);
|
||||
assertTokenStreamContents(stream, gold);
|
||||
}
|
||||
|
||||
class LinesMockSolrResourceLoader implements ResourceLoader {
|
||||
|
|
|
@ -17,9 +17,13 @@ package org.apache.solr.analysis;
|
|||
*/
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
|
||||
public class LengthFilterTest extends BaseTokenTestCase {
|
||||
|
||||
public void test() throws IOException {
|
||||
|
@ -28,9 +32,8 @@ public class LengthFilterTest extends BaseTokenTestCase {
|
|||
args.put(LengthFilterFactory.MIN_KEY, String.valueOf(4));
|
||||
args.put(LengthFilterFactory.MAX_KEY, String.valueOf(10));
|
||||
factory.init(args);
|
||||
String[] test = {"foo", "foobar", "super-duper-trooper"};
|
||||
String gold = "foobar";
|
||||
String out = tsToString(factory.create(new IterTokenStream(test)));
|
||||
assertEquals(gold.toString(), out);
|
||||
String test = "foo foobar super-duper-trooper";
|
||||
TokenStream stream = factory.create(new WhitespaceTokenizer(new StringReader(test)));
|
||||
assertTokenStreamContents(stream, new String[] { "foobar" });
|
||||
}
|
||||
}
|
|
@ -16,11 +16,18 @@ package org.apache.solr.analysis;
|
|||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
import org.apache.solr.common.ResourceLoader;
|
||||
import org.apache.solr.common.util.StrUtils;
|
||||
import org.tartarus.snowball.ext.EnglishStemmer;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
@ -32,11 +39,11 @@ public class SnowballPorterFilterFactoryTest extends BaseTokenTestCase {
|
|||
public void test() throws IOException {
|
||||
EnglishStemmer stemmer = new EnglishStemmer();
|
||||
String[] test = {"The", "fledgling", "banks", "were", "counting", "on", "a", "big", "boom", "in", "banking"};
|
||||
StringBuilder gold = new StringBuilder();
|
||||
for (String aTest : test) {
|
||||
stemmer.setCurrent(aTest);
|
||||
String[] gold = new String[test.length];
|
||||
for (int i = 0; i < test.length; i++) {
|
||||
stemmer.setCurrent(test[i]);
|
||||
stemmer.stem();
|
||||
gold.append(stemmer.getCurrent()).append(' ');
|
||||
gold[i] = stemmer.getCurrent();
|
||||
}
|
||||
|
||||
SnowballPorterFilterFactory factory = new SnowballPorterFilterFactory();
|
||||
|
@ -45,21 +52,27 @@ public class SnowballPorterFilterFactoryTest extends BaseTokenTestCase {
|
|||
|
||||
factory.init(args);
|
||||
factory.inform(new LinesMockSolrResourceLoader(new ArrayList<String>()));
|
||||
String out = tsToString(factory.create(new IterTokenStream(test)));
|
||||
assertEquals(gold.toString().trim(), out);
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer(
|
||||
new StringReader(StrUtils.join(Arrays.asList(test), ' ')));
|
||||
TokenStream stream = factory.create(tokenizer);
|
||||
assertTokenStreamContents(stream, gold);
|
||||
}
|
||||
|
||||
public void testProtected() throws Exception {
|
||||
/**
|
||||
* Tests the protected words mechanism of EnglishPorterFilterFactory
|
||||
*/
|
||||
@Deprecated
|
||||
public void testProtectedOld() throws Exception {
|
||||
EnglishStemmer stemmer = new EnglishStemmer();
|
||||
String[] test = {"The", "fledgling", "banks", "were", "counting", "on", "a", "big", "boom", "in", "banking"};
|
||||
StringBuilder gold = new StringBuilder();
|
||||
String[] gold = new String[test.length];
|
||||
for (int i = 0; i < test.length; i++) {
|
||||
if (test[i].equals("fledgling") == false && test[i].equals("banks") == false) {
|
||||
stemmer.setCurrent(test[i]);
|
||||
stemmer.stem();
|
||||
gold.append(stemmer.getCurrent()).append(' ');
|
||||
gold[i] = stemmer.getCurrent();
|
||||
} else {
|
||||
gold.append(test[i]).append(' ');
|
||||
gold[i] = test[i];
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -70,8 +83,10 @@ public class SnowballPorterFilterFactoryTest extends BaseTokenTestCase {
|
|||
List<String> lines = new ArrayList<String>();
|
||||
Collections.addAll(lines, "banks", "fledgling");
|
||||
factory.inform(new LinesMockSolrResourceLoader(lines));
|
||||
String out = tsToString(factory.create(new IterTokenStream(test)));
|
||||
assertEquals(gold.toString().trim(), out);
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer(
|
||||
new StringReader(StrUtils.join(Arrays.asList(test), ' ')));
|
||||
TokenStream stream = factory.create(tokenizer);
|
||||
assertTokenStreamContents(stream, gold);
|
||||
}
|
||||
|
||||
class LinesMockSolrResourceLoader implements ResourceLoader {
|
||||
|
@ -93,5 +108,22 @@ public class SnowballPorterFilterFactoryTest extends BaseTokenTestCase {
|
|||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Test the protected words mechanism of SnowballPorterFilterFactory
|
||||
*/
|
||||
public void testProtected() throws Exception {
|
||||
SnowballPorterFilterFactory factory = new SnowballPorterFilterFactory();
|
||||
ResourceLoader loader = solrConfig.getResourceLoader();
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("protected", "protwords.txt");
|
||||
args.put("language", "English");
|
||||
factory.init(args);
|
||||
factory.inform(loader);
|
||||
Reader reader = new StringReader("ridding of some stemming");
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
|
||||
TokenStream stream = factory.create(tokenizer);
|
||||
assertTokenStreamContents(stream, new String[] { "ridding", "of", "some", "stem" });
|
||||
}
|
||||
}
@ -0,0 +1,65 @@
|
|||
package org.apache.solr.analysis;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
|
||||
/**
|
||||
* Simple tests to ensure the Arabic filter Factories are working.
|
||||
*/
|
||||
public class TestArabicFilters extends BaseTokenTestCase {
|
||||
/**
|
||||
* Test ArabicLetterTokenizerFactory
|
||||
*/
|
||||
public void testTokenizer() throws Exception {
|
||||
Reader reader = new StringReader("الذين مَلكت أيمانكم");
|
||||
ArabicLetterTokenizerFactory factory = new ArabicLetterTokenizerFactory();
|
||||
Tokenizer stream = factory.create(reader);
|
||||
assertTokenStreamContents(stream, new String[] {"الذين", "مَلكت", "أيمانكم"});
|
||||
}
|
||||
|
||||
/**
|
||||
* Test ArabicNormalizationFilterFactory
|
||||
*/
|
||||
public void testNormalizer() throws Exception {
|
||||
Reader reader = new StringReader("الذين مَلكت أيمانكم");
|
||||
ArabicLetterTokenizerFactory factory = new ArabicLetterTokenizerFactory();
|
||||
ArabicNormalizationFilterFactory filterFactory = new ArabicNormalizationFilterFactory();
|
||||
Tokenizer tokenizer = factory.create(reader);
|
||||
TokenStream stream = filterFactory.create(tokenizer);
|
||||
assertTokenStreamContents(stream, new String[] {"الذين", "ملكت", "ايمانكم"});
|
||||
}
|
||||
|
||||
/**
|
||||
* Test ArabicStemFilterFactory
|
||||
*/
|
||||
public void testStemmer() throws Exception {
|
||||
Reader reader = new StringReader("الذين مَلكت أيمانكم");
|
||||
ArabicLetterTokenizerFactory factory = new ArabicLetterTokenizerFactory();
|
||||
ArabicNormalizationFilterFactory normFactory = new ArabicNormalizationFilterFactory();
|
||||
ArabicStemFilterFactory stemFactory = new ArabicStemFilterFactory();
|
||||
Tokenizer tokenizer = factory.create(reader);
|
||||
TokenStream stream = normFactory.create(tokenizer);
|
||||
stream = stemFactory.create(stream);
|
||||
assertTokenStreamContents(stream, new String[] {"ذين", "ملكت", "ايمانكم"});
|
||||
}
|
||||
}
|
|
@@ -0,0 +1,41 @@
package org.apache.solr.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;

/**
 * Simple tests to ensure the Brazilian stem filter factory is working.
 */
public class TestBrazilianStemFilterFactory extends BaseTokenTestCase {
  /**
   * Ensure the filter actually stems and normalizes text.
   */
  public void testStemming() throws Exception {
    Reader reader = new StringReader("Brasília");
    Tokenizer tokenizer = new WhitespaceTokenizer(reader);
    BrazilianStemFilterFactory factory = new BrazilianStemFilterFactory();
    TokenStream stream = factory.create(tokenizer);
    assertTokenStreamContents(stream, new String[] { "brasil" });
  }
}
@ -60,9 +60,7 @@ public class TestBufferedTokenStream extends BaseTokenTestCase {
|
|||
final String expected = "How now Q B brown A cow B like Q B thing?";
|
||||
TokenStream ts = new AB_Q_Stream
|
||||
(new WhitespaceTokenizer(new StringReader(input)));
|
||||
final String actual = tsToString(ts);
|
||||
//System.out.println(actual);
|
||||
assertEquals(expected, actual);
|
||||
assertTokenStreamContents(ts, expected.split("\\s"));
|
||||
}
|
||||
|
||||
public void testABAAB() throws Exception {
|
||||
|
@ -70,9 +68,7 @@ public class TestBufferedTokenStream extends BaseTokenTestCase {
|
|||
final String expected = "How now A A B brown A cow B like A A B thing?";
|
||||
TokenStream ts = new AB_AAB_Stream
|
||||
(new WhitespaceTokenizer(new StringReader(input)));
|
||||
final String actual = tsToString(ts);
|
||||
//System.out.println(actual);
|
||||
assertEquals(expected, actual);
|
||||
assertTokenStreamContents(ts, expected.split("\\s"));
|
||||
}
|
||||
|
||||
public void testReset() throws Exception {
|
||||
|
|
|
@@ -0,0 +1,38 @@
package org.apache.solr.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;

/**
 * Simple tests to ensure the CJK tokenizer factory is working.
 */
public class TestCJKTokenizerFactory extends BaseTokenTestCase {
  /**
   * Ensure the tokenizer actually tokenizes CJK text correctly
   */
  public void testTokenizer() throws Exception {
    Reader reader = new StringReader("我是中国人");
    CJKTokenizerFactory factory = new CJKTokenizerFactory();
    TokenStream stream = factory.create(reader);
    assertTokenStreamContents(stream, new String[] {"我是", "是中", "中国", "国人"});
  }
}
@@ -17,14 +17,18 @@

package org.apache.solr.analysis;

import junit.framework.TestCase;

import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.KeywordTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;

/**
 * @version $Id$
 *
 */
public class TestCapitalizationFilter extends BaseTokenTestCase {
@@ -64,39 +68,46 @@ public class TestCapitalizationFilter extends BaseTokenTestCase {
    factory.processWord(termBuffer, 0, termBuffer.length, 0 );
    assertEquals( "BIG", new String(termBuffer, 0, termBuffer.length));

    String out = tsToString( factory.create( new IterTokenStream( "Hello thEre my Name is Ryan" ) ) );
    assertEquals( "Hello there my name is ryan", out );
    Tokenizer tokenizer = new KeywordTokenizer(new StringReader("Hello thEre my Name is Ryan"));
    TokenStream stream = factory.create(tokenizer);
    assertTokenStreamContents(stream, new String[] { "Hello there my name is ryan" });

    // now each token
    factory.onlyFirstWord = false;
    out = tsToString( factory.create( new IterTokenStream( "Hello thEre my Name is Ryan" ) ) );
    assertEquals( "Hello There My Name Is Ryan", out );
    tokenizer = new WhitespaceTokenizer(new StringReader("Hello thEre my Name is Ryan"));
    stream = factory.create(tokenizer);
    assertTokenStreamContents(stream, new String[] { "Hello", "There", "My", "Name", "Is", "Ryan" });

    // now only the long words
    factory.minWordLength = 3;
    out = tsToString( factory.create( new IterTokenStream( "Hello thEre my Name is Ryan" ) ) );
    assertEquals( "Hello There my Name is Ryan", out );
    tokenizer = new WhitespaceTokenizer(new StringReader("Hello thEre my Name is Ryan" ));
    stream = factory.create(tokenizer);
    assertTokenStreamContents(stream, new String[] { "Hello", "There", "my", "Name", "is", "Ryan" });

    // without prefix
    out = tsToString( factory.create( new IterTokenStream( "McKinley" ) ) );
    assertEquals( "Mckinley", out );
    tokenizer = new WhitespaceTokenizer(new StringReader("McKinley" ));
    stream = factory.create(tokenizer);
    assertTokenStreamContents(stream, new String[] { "Mckinley" });

    // Now try some prefixes
    factory = new CapitalizationFilterFactory();
    args.put( "okPrefix", "McK" ); // all words
    factory.init( args );
    out = tsToString( factory.create( new IterTokenStream( "McKinley" ) ) );
    assertEquals( "McKinley", out );
    tokenizer = new WhitespaceTokenizer(new StringReader("McKinley" ));
    stream = factory.create(tokenizer);
    assertTokenStreamContents(stream, new String[] { "McKinley" });

    // now try some stuff with numbers
    factory.forceFirstLetter = false;
    factory.onlyFirstWord = false;
    out = tsToString( factory.create( new IterTokenStream( "1st 2nd third" ) ) );
    assertEquals( "1st 2nd Third", out );
    tokenizer = new WhitespaceTokenizer(new StringReader("1st 2nd third" ));
    stream = factory.create(tokenizer);
    assertTokenStreamContents(stream, new String[] { "1st", "2nd", "Third" });

    factory.forceFirstLetter = true;
    out = tsToString( factory.create( new IterTokenStream( "the The the" ) ) );
    assertEquals( "The The the", out );
    tokenizer = new KeywordTokenizer(new StringReader("the The the" ));
    stream = factory.create(tokenizer);
    assertTokenStreamContents(stream, new String[] { "The The the" });
  }

  public void testKeepIgnoreCase() throws Exception {
@ -123,4 +134,80 @@ public class TestCapitalizationFilter extends BaseTokenTestCase {
|
|||
factory.processWord(termBuffer, 0, termBuffer.length, 0 );
|
||||
assertEquals( "Kitten", new String(termBuffer, 0, termBuffer.length));
|
||||
}
|
||||
|
||||
/**
|
||||
* Test CapitalizationFilterFactory's minWordLength option.
|
||||
*
|
||||
* This is very weird when combined with ONLY_FIRST_WORD!!!
|
||||
*/
|
||||
public void testMinWordLength() throws Exception {
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put(CapitalizationFilterFactory.ONLY_FIRST_WORD, "true");
|
||||
args.put(CapitalizationFilterFactory.MIN_WORD_LENGTH, "5");
|
||||
CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
|
||||
factory.init(args);
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader(
|
||||
"helo testing"));
|
||||
TokenStream ts = factory.create(tokenizer);
|
||||
assertTokenStreamContents(ts, new String[] {"helo", "Testing"});
|
||||
}
|
||||
|
||||
/**
|
||||
* Test CapitalizationFilterFactory's maxWordCount option with only words of 1
|
||||
* in each token (it should do nothing)
|
||||
*/
|
||||
public void testMaxWordCount() throws Exception {
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put(CapitalizationFilterFactory.MAX_WORD_COUNT, "2");
|
||||
CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
|
||||
factory.init(args);
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader(
|
||||
"one two three four"));
|
||||
TokenStream ts = factory.create(tokenizer);
|
||||
assertTokenStreamContents(ts, new String[] {"One", "Two", "Three", "Four"});
|
||||
}
|
||||
|
||||
/**
|
||||
* Test CapitalizationFilterFactory's maxWordCount option when exceeded
|
||||
*/
|
||||
public void testMaxWordCount2() throws Exception {
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put(CapitalizationFilterFactory.MAX_WORD_COUNT, "2");
|
||||
CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
|
||||
factory.init(args);
|
||||
Tokenizer tokenizer = new KeywordTokenizer(new StringReader(
|
||||
"one two three four"));
|
||||
TokenStream ts = factory.create(tokenizer);
|
||||
assertTokenStreamContents(ts, new String[] {"one two three four"});
|
||||
}
|
||||
|
||||
/**
|
||||
* Test CapitalizationFilterFactory's maxTokenLength option when exceeded
|
||||
*
|
||||
* This is weird, it is not really a max, but inclusive (look at 'is')
|
||||
*/
|
||||
public void testMaxTokenLength() throws Exception {
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put(CapitalizationFilterFactory.MAX_TOKEN_LENGTH, "2");
|
||||
CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
|
||||
factory.init(args);
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader(
|
||||
"this is a test"));
|
||||
TokenStream ts = factory.create(tokenizer);
|
||||
assertTokenStreamContents(ts, new String[] {"this", "is", "A", "test"});
|
||||
}
|
||||
|
||||
/**
|
||||
* Test CapitalizationFilterFactory's forceFirstLetter option
|
||||
*/
|
||||
public void testForceFirstLetter() throws Exception {
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put(CapitalizationFilterFactory.KEEP, "kitten");
|
||||
args.put(CapitalizationFilterFactory.FORCE_FIRST_LETTER, "true");
|
||||
CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
|
||||
factory.init(args);
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader("kitten"));
|
||||
TokenStream ts = factory.create(tokenizer);
|
||||
assertTokenStreamContents(ts, new String[] {"Kitten"});
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,41 @@
|
|||
package org.apache.solr.analysis;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
|
||||
/**
|
||||
* Simple tests to ensure the Chinese filter factory is working.
|
||||
*/
|
||||
public class TestChineseFilterFactory extends BaseTokenTestCase {
|
||||
/**
|
||||
* Ensure the filter actually normalizes text (numerics, stopwords)
|
||||
*/
|
||||
public void testFiltering() throws Exception {
|
||||
Reader reader = new StringReader("this 1234 Is such a silly filter");
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
|
||||
ChineseFilterFactory factory = new ChineseFilterFactory();
|
||||
TokenStream stream = factory.create(tokenizer);
|
||||
assertTokenStreamContents(stream, new String[] { "Is", "silly", "filter" });
|
||||
}
|
||||
}
|
|
@ -0,0 +1,38 @@
|
|||
package org.apache.solr.analysis;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
|
||||
/**
|
||||
* Simple tests to ensure the Chinese tokenizer factory is working.
|
||||
*/
|
||||
public class TestChineseTokenizerFactory extends BaseTokenTestCase {
|
||||
/**
|
||||
* Ensure the tokenizer actually tokenizes chinese text correctly
|
||||
*/
|
||||
public void testTokenizer() throws Exception {
|
||||
Reader reader = new StringReader("我是中国人");
|
||||
ChineseTokenizerFactory factory = new ChineseTokenizerFactory();
|
||||
TokenStream stream = factory.create(reader);
|
||||
assertTokenStreamContents(stream, new String[] {"我", "是", "中", "国", "人"});
|
||||
}
|
||||
}
|
|
@@ -20,6 +20,7 @@ package org.apache.solr.analysis;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.text.Collator;
import java.text.RuleBasedCollator;
import java.util.HashMap;

@@ -27,7 +28,9 @@ import java.util.List;
import java.util.Locale;
import java.util.Map;

import org.apache.lucene.analysis.KeywordTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.solr.common.ResourceLoader;

public class TestCollationKeyFilterFactory extends BaseTokenTestCase {
@ -39,18 +42,80 @@ public class TestCollationKeyFilterFactory extends BaseTokenTestCase {
|
|||
* Then things will sort and match correctly.
|
||||
*/
|
||||
public void testBasicUsage() throws IOException {
|
||||
String[] turkishUpperCase = { "I", "WİLL", "USE", "TURKİSH", "CASING" };
|
||||
String[] turkishLowerCase = { "ı", "will", "use", "turkish", "casıng" };
|
||||
String turkishUpperCase = "I WİLL USE TURKİSH CASING";
|
||||
String turkishLowerCase = "ı will use turkish casıng";
|
||||
CollationKeyFilterFactory factory = new CollationKeyFilterFactory();
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("language", "tr");
|
||||
args.put("strength", "primary");
|
||||
factory.init(args);
|
||||
factory.inform(new StringMockSolrResourceLoader(""));
|
||||
TokenStream tsUpper = factory.create(new IterTokenStream(turkishUpperCase));
|
||||
TokenStream tsLower = factory.create(new IterTokenStream(turkishLowerCase));
|
||||
assertTokEqual(BaseTokenTestCase.getTokens(tsUpper),
|
||||
BaseTokenTestCase.getTokens(tsLower));
|
||||
TokenStream tsUpper = factory.create(
|
||||
new KeywordTokenizer(new StringReader(turkishUpperCase)));
|
||||
TokenStream tsLower = factory.create(
|
||||
new KeywordTokenizer(new StringReader(turkishLowerCase)));
|
||||
assertCollatesToSame(tsUpper, tsLower);
|
||||
}
|
||||
|
||||
/*
|
||||
* Test usage of the decomposition option for unicode normalization.
|
||||
*/
|
||||
public void testNormalization() throws IOException {
|
||||
String turkishUpperCase = "I W\u0049\u0307LL USE TURKİSH CASING";
|
||||
String turkishLowerCase = "ı will use turkish casıng";
|
||||
CollationKeyFilterFactory factory = new CollationKeyFilterFactory();
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("language", "tr");
|
||||
args.put("strength", "primary");
|
||||
args.put("decomposition", "canonical");
|
||||
factory.init(args);
|
||||
factory.inform(new StringMockSolrResourceLoader(""));
|
||||
TokenStream tsUpper = factory.create(
|
||||
new KeywordTokenizer(new StringReader(turkishUpperCase)));
|
||||
TokenStream tsLower = factory.create(
|
||||
new KeywordTokenizer(new StringReader(turkishLowerCase)));
|
||||
assertCollatesToSame(tsUpper, tsLower);
|
||||
}
|
||||
|
||||
/*
|
||||
* Test usage of the K decomposition option for unicode normalization.
|
||||
* This works even with identical strength.
|
||||
*/
|
||||
public void testFullDecomposition() throws IOException {
|
||||
String fullWidth = "Testing";
|
||||
String halfWidth = "Testing";
|
||||
CollationKeyFilterFactory factory = new CollationKeyFilterFactory();
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("language", "zh");
|
||||
args.put("strength", "identical");
|
||||
args.put("decomposition", "full");
|
||||
factory.init(args);
|
||||
factory.inform(new StringMockSolrResourceLoader(""));
|
||||
TokenStream tsFull = factory.create(
|
||||
new KeywordTokenizer(new StringReader(fullWidth)));
|
||||
TokenStream tsHalf = factory.create(
|
||||
new KeywordTokenizer(new StringReader(halfWidth)));
|
||||
assertCollatesToSame(tsFull, tsHalf);
|
||||
}
|
||||
|
||||
/*
|
||||
* Test secondary strength, for english case is not significant.
|
||||
*/
|
||||
public void testSecondaryStrength() throws IOException {
|
||||
String upperCase = "TESTING";
|
||||
String lowerCase = "testing";
|
||||
CollationKeyFilterFactory factory = new CollationKeyFilterFactory();
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("language", "en");
|
||||
args.put("strength", "secondary");
|
||||
args.put("decomposition", "no");
|
||||
factory.init(args);
|
||||
factory.inform(new StringMockSolrResourceLoader(""));
|
||||
TokenStream tsUpper = factory.create(
|
||||
new KeywordTokenizer(new StringReader(upperCase)));
|
||||
TokenStream tsLower = factory.create(
|
||||
new KeywordTokenizer(new StringReader(lowerCase)));
|
||||
assertCollatesToSame(tsUpper, tsLower);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -74,18 +139,20 @@ public class TestCollationKeyFilterFactory extends BaseTokenTestCase {
|
|||
// at this point, you would save these tailoredRules to a file,
|
||||
// and use the custom parameter.
|
||||
//
|
||||
String[] germanUmlaut = { "Töne" };
|
||||
String[] germanOE = { "Toene" };
|
||||
String germanUmlaut = "Töne";
|
||||
String germanOE = "Toene";
|
||||
CollationKeyFilterFactory factory = new CollationKeyFilterFactory();
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("custom", "rules.txt");
|
||||
args.put("strength", "primary");
|
||||
factory.init(args);
|
||||
factory.inform(new StringMockSolrResourceLoader(tailoredRules));
|
||||
TokenStream tsUmlaut = factory.create(new IterTokenStream(germanUmlaut));
|
||||
TokenStream tsOE = factory.create(new IterTokenStream(germanOE));
|
||||
assertTokEqual(BaseTokenTestCase.getTokens(tsUmlaut),
|
||||
BaseTokenTestCase.getTokens(tsOE));
|
||||
TokenStream tsUmlaut = factory.create(
|
||||
new KeywordTokenizer(new StringReader(germanUmlaut)));
|
||||
TokenStream tsOE = factory.create(
|
||||
new KeywordTokenizer(new StringReader(germanOE)));
|
||||
|
||||
assertCollatesToSame(tsUmlaut, tsOE);
|
||||
}
|
||||
|
||||
private class StringMockSolrResourceLoader implements ResourceLoader {
|
||||
|
@@ -107,4 +174,17 @@ public class TestCollationKeyFilterFactory extends BaseTokenTestCase {
      return new ByteArrayInputStream(text.getBytes("UTF-8"));
    }
  }

  private void assertCollatesToSame(TokenStream stream1, TokenStream stream2)
      throws IOException {
    TermAttribute term1 = (TermAttribute) stream1
        .addAttribute(TermAttribute.class);
    TermAttribute term2 = (TermAttribute) stream2
        .addAttribute(TermAttribute.class);
    assertTrue(stream1.incrementToken());
    assertTrue(stream2.incrementToken());
    assertEquals(term1.term(), term2.term());
    assertFalse(stream1.incrementToken());
    assertFalse(stream2.incrementToken());
  }
}
|
|
@ -0,0 +1,51 @@
|
|||
package org.apache.solr.analysis;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
import org.apache.solr.common.ResourceLoader;
|
||||
|
||||
/**
|
||||
* Simple tests to ensure the Dictionary compound filter factory is working.
|
||||
*/
|
||||
public class TestDictionaryCompoundWordTokenFilterFactory extends BaseTokenTestCase {
|
||||
/**
|
||||
* Ensure the filter actually decompounds text.
|
||||
*/
|
||||
public void testDecompounding() throws Exception {
|
||||
Reader reader = new StringReader("I like to play softball");
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
|
||||
DictionaryCompoundWordTokenFilterFactory factory = new DictionaryCompoundWordTokenFilterFactory();
|
||||
ResourceLoader loader = solrConfig.getResourceLoader();
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("dictionary", "compoundDictionary.txt");
|
||||
factory.init(args);
|
||||
factory.inform(loader);
|
||||
TokenStream stream = factory.create(tokenizer);
|
||||
assertTokenStreamContents(stream,
|
||||
new String[] { "I", "like", "to", "play", "softball", "soft", "ball" });
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,41 @@
|
|||
package org.apache.solr.analysis;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
|
||||
/**
|
||||
* Simple tests to ensure the Dutch stem filter factory is working.
|
||||
*/
|
||||
public class TestDutchStemFilterFactory extends BaseTokenTestCase {
|
||||
/**
|
||||
* Ensure the filter actually stems text.
|
||||
*/
|
||||
public void testStemming() throws Exception {
|
||||
Reader reader = new StringReader("lichamelijkheden");
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
|
||||
DutchStemFilterFactory factory = new DutchStemFilterFactory();
|
||||
TokenStream stream = factory.create(tokenizer);
|
||||
assertTokenStreamContents(stream, new String[] { "licham" });
|
||||
}
|
||||
}
|
|
@ -0,0 +1,50 @@
|
|||
package org.apache.solr.analysis;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
import org.apache.solr.common.ResourceLoader;
|
||||
|
||||
/**
|
||||
* Simple tests to ensure the French elision filter factory is working.
|
||||
*/
|
||||
public class TestElisionFilterFactory extends BaseTokenTestCase {
|
||||
/**
|
||||
* Ensure the filter actually normalizes text.
|
||||
*/
|
||||
public void testElision() throws Exception {
|
||||
Reader reader = new StringReader("l'avion");
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
|
||||
ElisionFilterFactory factory = new ElisionFilterFactory();
|
||||
ResourceLoader loader = solrConfig.getResourceLoader();
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("articles", "frenchArticles.txt");
|
||||
factory.init(args);
|
||||
factory.inform(loader);
|
||||
TokenStream stream = factory.create(tokenizer);
|
||||
assertTokenStreamContents(stream, new String[] { "avion" });
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,41 @@
|
|||
package org.apache.solr.analysis;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
|
||||
/**
|
||||
* Simple tests to ensure the French stem filter factory is working.
|
||||
*/
|
||||
public class TestFrenchStemFilterFactory extends BaseTokenTestCase {
|
||||
/**
|
||||
* Ensure the filter actually stems text.
|
||||
*/
|
||||
public void testStemming() throws Exception {
|
||||
Reader reader = new StringReader("habitable");
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
|
||||
FrenchStemFilterFactory factory = new FrenchStemFilterFactory();
|
||||
TokenStream stream = factory.create(tokenizer);
|
||||
assertTokenStreamContents(stream, new String[] { "habit" });
|
||||
}
|
||||
}
|
|
@ -0,0 +1,41 @@
|
|||
package org.apache.solr.analysis;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
|
||||
/**
|
||||
* Simple tests to ensure the German stem filter factory is working.
|
||||
*/
|
||||
public class TestGermanStemFilterFactory extends BaseTokenTestCase {
|
||||
/**
|
||||
* Ensure the filter actually stems text.
|
||||
*/
|
||||
public void testStemming() throws Exception {
|
||||
Reader reader = new StringReader("Tischen");
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
|
||||
GermanStemFilterFactory factory = new GermanStemFilterFactory();
|
||||
TokenStream stream = factory.create(tokenizer);
|
||||
assertTokenStreamContents(stream, new String[] { "tisch" });
|
||||
}
|
||||
}
|
|
@ -0,0 +1,41 @@
|
|||
package org.apache.solr.analysis;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
|
||||
/**
|
||||
* Simple tests to ensure the Greek lowercase filter factory is working.
|
||||
*/
|
||||
public class TestGreekLowerCaseFilterFactory extends BaseTokenTestCase {
|
||||
/**
|
||||
* Ensure the filter actually lowercases (and a bit more) greek text.
|
||||
*/
|
||||
public void testStemming() throws Exception {
|
||||
Reader reader = new StringReader("Μάϊος ΜΆΪΟΣ");
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
|
||||
GreekLowerCaseFilterFactory factory = new GreekLowerCaseFilterFactory();
|
||||
TokenStream stream = factory.create(tokenizer);
|
||||
assertTokenStreamContents(stream, new String[] { "μαιοσ", "μαιοσ" });
|
||||
}
|
||||
}
|
|
@@ -28,12 +28,24 @@ import org.apache.lucene.analysis.WhitespaceTokenizer;
public class TestHyphenatedWordsFilter extends BaseTokenTestCase {
  public void testHyphenatedWords() throws Exception {
    String input = "ecologi-\r\ncal devel-\r\n\r\nop compre-\u0009hensive-hands-on and ecologi-\ncal";
    String outputAfterHyphenatedWordsFilter = "ecological develop comprehensive-hands-on and ecological";
    // first test
    TokenStream ts = new WhitespaceTokenizer(new StringReader(input));
    ts = new HyphenatedWordsFilter(ts);
    String actual = tsToString(ts);
    assertEquals("Testing HyphenatedWordsFilter",
        outputAfterHyphenatedWordsFilter, actual);
    HyphenatedWordsFilterFactory factory = new HyphenatedWordsFilterFactory();
    ts = factory.create(ts);
    assertTokenStreamContents(ts,
        new String[] { "ecological", "develop", "comprehensive-hands-on", "and", "ecological" });
  }

  /**
   * Test that HyphenatedWordsFilter behaves correctly with a final hyphen
   */
  public void testHyphenAtEnd() throws Exception {
    String input = "ecologi-\r\ncal devel-\r\n\r\nop compre-\u0009hensive-hands-on and ecology-";
    // first test
    TokenStream ts = new WhitespaceTokenizer(new StringReader(input));
    HyphenatedWordsFilterFactory factory = new HyphenatedWordsFilterFactory();
    ts = factory.create(ts);
    assertTokenStreamContents(ts,
        new String[] { "ecological", "develop", "comprehensive-hands-on", "and", "ecology-" });
  }
}
@@ -17,13 +17,14 @@

package org.apache.solr.analysis;

import java.io.StringReader;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;

/**

@@ -37,7 +38,7 @@ public class TestKeepWordFilter extends BaseTokenTestCase {
    words.add( "aaa" );
    words.add( "bbb" );

    List<Token> input = tokens( "aaa BBB ccc ddd EEE" );
    String input = "aaa BBB ccc ddd EEE";
    Map<String,String> args = new HashMap<String, String>();

@@ -47,18 +48,28 @@ public class TestKeepWordFilter extends BaseTokenTestCase {
    factory.init( args );
    factory.inform( solrConfig.getResourceLoader() );
    factory.setWords( words );
    assertTrue(factory.isIgnoreCase());
    TokenStream stream = factory.create(new WhitespaceTokenizer(new StringReader(input)));
    assertTokenStreamContents(stream, new String[] { "aaa", "BBB" });

    List<Token> expect = tokens( "aaa BBB" );
    List<Token> real = getTokens(factory.create( new IterTokenStream(input) ));
    assertTokEqual( expect, real );
    // Test Stopwords (ignoreCase via the setter instead)
    factory = new KeepWordFilterFactory();
    args = new HashMap<String, String>();
    factory.init( args );
    factory.inform( solrConfig.getResourceLoader() );
    factory.setIgnoreCase(true);
    factory.setWords( words );
    assertTrue(factory.isIgnoreCase());
    stream = factory.create(new WhitespaceTokenizer(new StringReader(input)));
    assertTokenStreamContents(stream, new String[] { "aaa", "BBB" });

    // Now force case
    args = new HashMap<String, String>();
    args.put( "ignoreCase", "false" );
    factory.init( args );
    factory.inform( solrConfig.getResourceLoader() );

    expect = tokens( "aaa" );
    real = getTokens(factory.create( new IterTokenStream(input) ));
    assertTokEqual( expect, real );
    assertFalse(factory.isIgnoreCase());
    stream = factory.create(new WhitespaceTokenizer(new StringReader(input)));
    assertTokenStreamContents(stream, new String[] { "aaa" });
  }
}
@@ -1,37 +1,27 @@
package org.apache.solr.analysis;

import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.junit.Assert;
import org.junit.Test;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

/**
 * @version $Id$
 * @since solr 1.4
 */
public class TestMultiWordSynonyms {
public class TestMultiWordSynonyms extends BaseTokenTestCase {

  @Test
  public void testMultiWordSynonmys() throws IOException {
  public void testMultiWordSynonyms() throws IOException {
    List<String> rules = new ArrayList<String>();
    rules.add("a b c,d");
    SynonymMap synMap = new SynonymMap(true);
    SynonymFilterFactory.parseRules(rules, synMap, "=>", ",", true, null);

    SynonymFilter ts = new SynonymFilter(new WhitespaceTokenizer(new StringReader("a e")), synMap);
    TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);

    ts.reset();
    List<String> tokens = new ArrayList<String>();
    while (ts.incrementToken()) tokens.add(termAtt.term());

    // This fails because ["e","e"] is the value of the token stream
    Assert.assertEquals(Arrays.asList("a", "e"), tokens);
    assertTokenStreamContents(ts, new String[] { "a", "e" });
  }
}
@ -0,0 +1,163 @@
|
|||
package org.apache.solr.analysis;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
|
||||
/**
|
||||
* Simple tests to ensure the NGram filter factories are working.
|
||||
*/
|
||||
public class TestNGramFilters extends BaseTokenTestCase {
|
||||
/**
|
||||
* Test NGramTokenizerFactory
|
||||
*/
|
||||
public void testNGramTokenizer() throws Exception {
|
||||
Reader reader = new StringReader("test");
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
NGramTokenizerFactory factory = new NGramTokenizerFactory();
|
||||
factory.init(args);
|
||||
Tokenizer stream = factory.create(reader);
|
||||
assertTokenStreamContents(stream,
|
||||
new String[] { "t", "e", "s", "t", "te", "es", "st" });
|
||||
}
|
||||
/**
|
||||
* Test NGramTokenizerFactory with min and max gram options
|
||||
*/
|
||||
public void testNGramTokenizer2() throws Exception {
|
||||
Reader reader = new StringReader("test");
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("minGramSize", "2");
|
||||
args.put("maxGramSize", "3");
|
||||
NGramTokenizerFactory factory = new NGramTokenizerFactory();
|
||||
factory.init(args);
|
||||
Tokenizer stream = factory.create(reader);
|
||||
assertTokenStreamContents(stream,
|
||||
new String[] { "te", "es", "st", "tes", "est" });
|
||||
}
|
||||
/**
|
||||
* Test the NGramFilterFactory
|
||||
*/
|
||||
public void testNGramFilter() throws Exception {
|
||||
Reader reader = new StringReader("test");
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
NGramFilterFactory factory = new NGramFilterFactory();
|
||||
factory.init(args);
|
||||
TokenStream stream = factory.create(new WhitespaceTokenizer(reader));
|
||||
assertTokenStreamContents(stream,
|
||||
new String[] { "t", "e", "s", "t", "te", "es", "st" });
|
||||
}
|
||||
/**
|
||||
* Test the NGramFilterFactory with min and max gram options
|
||||
*/
|
||||
public void testNGramFilter2() throws Exception {
|
||||
Reader reader = new StringReader("test");
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("minGramSize", "2");
|
||||
args.put("maxGramSize", "3");
|
||||
NGramFilterFactory factory = new NGramFilterFactory();
|
||||
factory.init(args);
|
||||
TokenStream stream = factory.create(new WhitespaceTokenizer(reader));
|
||||
assertTokenStreamContents(stream,
|
||||
new String[] { "te", "es", "st", "tes", "est" });
|
||||
}
|
||||
/**
|
||||
* Test EdgeNGramTokenizerFactory
|
||||
*/
|
||||
public void testEdgeNGramTokenizer() throws Exception {
|
||||
Reader reader = new StringReader("test");
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
EdgeNGramTokenizerFactory factory = new EdgeNGramTokenizerFactory();
|
||||
factory.init(args);
|
||||
Tokenizer stream = factory.create(reader);
|
||||
assertTokenStreamContents(stream,
|
||||
new String[] { "t" });
|
||||
}
|
||||
/**
|
||||
* Test EdgeNGramTokenizerFactory with min and max gram size
|
||||
*/
|
||||
public void testEdgeNGramTokenizer2() throws Exception {
|
||||
Reader reader = new StringReader("test");
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("minGramSize", "1");
|
||||
args.put("maxGramSize", "2");
|
||||
EdgeNGramTokenizerFactory factory = new EdgeNGramTokenizerFactory();
|
||||
factory.init(args);
|
||||
Tokenizer stream = factory.create(reader);
|
||||
assertTokenStreamContents(stream,
|
||||
new String[] { "t", "te" });
|
||||
}
|
||||
/**
|
||||
* Test EdgeNGramTokenizerFactory with side option
|
||||
*/
|
||||
public void testEdgeNGramTokenizer3() throws Exception {
|
||||
Reader reader = new StringReader("ready");
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("side", "back");
|
||||
EdgeNGramTokenizerFactory factory = new EdgeNGramTokenizerFactory();
|
||||
factory.init(args);
|
||||
Tokenizer stream = factory.create(reader);
|
||||
assertTokenStreamContents(stream,
|
||||
new String[] { "y" });
|
||||
}
|
||||
/**
|
||||
* Test EdgeNGramFilterFactory
|
||||
*/
|
||||
public void testEdgeNGramFilter() throws Exception {
|
||||
Reader reader = new StringReader("test");
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
EdgeNGramFilterFactory factory = new EdgeNGramFilterFactory();
|
||||
factory.init(args);
|
||||
TokenStream stream = factory.create(new WhitespaceTokenizer(reader));
|
||||
assertTokenStreamContents(stream,
|
||||
new String[] { "t" });
|
||||
}
|
||||
/**
|
||||
* Test EdgeNGramFilterFactory with min and max gram size
|
||||
*/
|
||||
public void testEdgeNGramFilter2() throws Exception {
|
||||
Reader reader = new StringReader("test");
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("minGramSize", "1");
|
||||
args.put("maxGramSize", "2");
|
||||
EdgeNGramFilterFactory factory = new EdgeNGramFilterFactory();
|
||||
factory.init(args);
|
||||
TokenStream stream = factory.create(new WhitespaceTokenizer(reader));
|
||||
assertTokenStreamContents(stream,
|
||||
new String[] { "t", "te" });
|
||||
}
|
||||
/**
|
||||
* Test EdgeNGramFilterFactory with side option
|
||||
*/
|
||||
public void testEdgeNGramFilter3() throws Exception {
|
||||
Reader reader = new StringReader("ready");
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("side", "back");
|
||||
EdgeNGramFilterFactory factory = new EdgeNGramFilterFactory();
|
||||
factory.init(args);
|
||||
TokenStream stream = factory.create(new WhitespaceTokenizer(reader));
|
||||
assertTokenStreamContents(stream,
|
||||
new String[] { "y" });
|
||||
}
|
||||
}
|
|
@ -19,6 +19,8 @@ package org.apache.solr.analysis;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.CharReader;
|
||||
import org.apache.lucene.analysis.CharStream;
|
||||
|
@ -37,20 +39,33 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
|
|||
// this is test.
|
||||
public void testNothingChange() throws IOException {
|
||||
final String BLOCK = "this is test.";
|
||||
CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)\\s+(cc)", "$1$2$3",
|
||||
PatternReplaceCharFilterFactory factory = new PatternReplaceCharFilterFactory();
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("pattern", "(aa)\\s+(bb)\\s+(cc)");
|
||||
args.put("replacement", "$1$2$3");
|
||||
factory.init(args);
|
||||
CharStream cs = factory.create(
|
||||
CharReader.get( new StringReader( BLOCK ) ) );
|
||||
TokenStream ts = new WhitespaceTokenizer( cs );
|
||||
assertTokEqualOff( tokens( "this,1,0,4 is,1,5,7 test.,1,8,13" ), getTokens( ts ) );
|
||||
assertTokenStreamContents(ts,
|
||||
new String[] { "this", "is", "test." },
|
||||
new int[] { 0, 5, 8 },
|
||||
new int[] { 4, 7, 13 },
|
||||
new int[] { 1, 1, 1 });
|
||||
}
|
||||
|
||||
// 012345678
|
||||
// aa bb cc
|
||||
public void testReplaceByEmpty() throws IOException {
|
||||
final String BLOCK = "aa bb cc";
|
||||
CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)\\s+(cc)", "",
|
||||
PatternReplaceCharFilterFactory factory = new PatternReplaceCharFilterFactory();
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("pattern", "(aa)\\s+(bb)\\s+(cc)");
|
||||
factory.init(args);
|
||||
CharStream cs = factory.create(
|
||||
CharReader.get( new StringReader( BLOCK ) ) );
|
||||
TokenStream ts = new WhitespaceTokenizer( cs );
|
||||
assertEquals( 0, getTokens( ts ).size() );
|
||||
assertFalse(ts.incrementToken());
|
||||
}
|
||||
|
||||
// 012345678
|
||||
|
@ -58,10 +73,19 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
|
|||
// aa#bb#cc
|
||||
public void test1block1matchSameLength() throws IOException {
|
||||
final String BLOCK = "aa bb cc";
|
||||
CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)\\s+(cc)", "$1#$2#$3",
|
||||
PatternReplaceCharFilterFactory factory = new PatternReplaceCharFilterFactory();
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("pattern", "(aa)\\s+(bb)\\s+(cc)");
|
||||
args.put("replacement", "$1#$2#$3");
|
||||
factory.init(args);
|
||||
CharStream cs = factory.create(
|
||||
CharReader.get( new StringReader( BLOCK ) ) );
|
||||
TokenStream ts = new WhitespaceTokenizer( cs );
|
||||
assertTokEqualOff( tokens( "aa#bb#cc,1,0,8" ), getTokens( ts ) );
|
||||
assertTokenStreamContents(ts,
|
||||
new String[] { "aa#bb#cc" },
|
||||
new int[] { 0 },
|
||||
new int[] { 8 },
|
||||
new int[] { 1 });
|
||||
}
|
||||
|
||||
// 11111
|
||||
|
@ -73,7 +97,11 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
|
|||
CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)\\s+(cc)", "$1##$2###$3",
|
||||
CharReader.get( new StringReader( BLOCK ) ) );
|
||||
TokenStream ts = new WhitespaceTokenizer( cs );
|
||||
assertTokEqualOff( tokens( "aa##bb###cc,1,0,8 dd,1,9,11" ), getTokens( ts ) );
|
||||
assertTokenStreamContents(ts,
|
||||
new String[] { "aa##bb###cc", "dd" },
|
||||
new int[] { 0, 9 },
|
||||
new int[] { 8, 11 },
|
||||
new int[] { 1, 1 });
|
||||
}
|
||||
|
||||
// 01234567
|
||||
|
@ -84,7 +112,11 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
|
|||
CharStream cs = new PatternReplaceCharFilter( "a", "aa",
|
||||
CharReader.get( new StringReader( BLOCK ) ) );
|
||||
TokenStream ts = new WhitespaceTokenizer( cs );
|
||||
assertTokEqualOff( tokens( "aa,1,1,2 aa,1,4,5" ), getTokens( ts ) );
|
||||
assertTokenStreamContents(ts,
|
||||
new String[] { "aa", "aa" },
|
||||
new int[] { 1, 4 },
|
||||
new int[] { 2, 5 },
|
||||
new int[] { 1, 1 });
|
||||
}
|
||||
|
||||
// 11111
|
||||
|
@ -96,7 +128,11 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
|
|||
CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)\\s+(cc)", "$1#$2",
|
||||
CharReader.get( new StringReader( BLOCK ) ) );
|
||||
TokenStream ts = new WhitespaceTokenizer( cs );
|
||||
assertTokEqualOff( tokens( "aa#bb,1,0,11 dd,1,12,14" ), getTokens( ts ) );
|
||||
assertTokenStreamContents(ts,
|
||||
new String[] { "aa#bb", "dd" },
|
||||
new int[] { 0, 12 },
|
||||
new int[] { 11, 14 },
|
||||
new int[] { 1, 1 });
|
||||
}
|
||||
|
||||
// 111111111122222222223333
|
||||
|
@ -108,8 +144,11 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
|
|||
CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)\\s+(cc)", "$1 $2 $3",
|
||||
CharReader.get( new StringReader( BLOCK ) ) );
|
||||
TokenStream ts = new WhitespaceTokenizer( cs );
|
||||
assertTokEqualOff( tokens( "aa,1,2,4 bb,1,6,8 cc,1,9,10 ---,1,11,14 aa,1,15,17 bb,1,18,20 aa,1,21,23 bb,1,25,27 cc,1,29,33" ),
|
||||
getTokens( ts ) );
|
||||
assertTokenStreamContents(ts,
|
||||
new String[] { "aa", "bb", "cc", "---", "aa", "bb", "aa", "bb", "cc" },
|
||||
new int[] { 2, 6, 9, 11, 15, 18, 21, 25, 29 },
|
||||
new int[] { 4, 8, 10, 14, 17, 20, 23, 27, 33 },
|
||||
new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1 });
|
||||
}
|
||||
|
||||
// 11111111112222222222333333333
|
||||
|
@ -121,8 +160,11 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
|
|||
CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)", "$1##$2", ".",
|
||||
CharReader.get( new StringReader( BLOCK ) ) );
|
||||
TokenStream ts = new WhitespaceTokenizer( cs );
|
||||
assertTokEqualOff( tokens( "aa##bb,1,2,7 cc,1,8,10 ---,1,11,14 aa##bb,1,15,20 aa.,1,21,24 bb,1,25,27 aa##bb,1,28,35 cc,1,36,38" ),
|
||||
getTokens( ts ) );
|
||||
assertTokenStreamContents(ts,
|
||||
new String[] { "aa##bb", "cc", "---", "aa##bb", "aa.", "bb", "aa##bb", "cc" },
|
||||
new int[] { 2, 8, 11, 15, 21, 25, 28, 36 },
|
||||
new int[] { 7, 10, 14, 20, 24, 27, 35, 38 },
|
||||
new int[] { 1, 1, 1, 1, 1, 1, 1, 1 });
|
||||
}
|
||||
|
||||
// 11111111112222222222333333333
|
||||
|
@ -136,7 +178,10 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
|
|||
cs = new PatternReplaceCharFilter( "bb", "b", ".", cs );
|
||||
cs = new PatternReplaceCharFilter( "ccc", "c", ".", cs );
|
||||
TokenStream ts = new WhitespaceTokenizer( cs );
|
||||
assertTokEqualOff( tokens( "aa,1,1,2 b,1,3,5 -,1,6,7 c,1,8,11 .,1,12,13 ---,1,14,17 b,1,18,20 aa,1,21,22 .,1,23,24 c,1,25,28 c,1,29,32 b,1,33,35" ),
|
||||
getTokens( ts ) );
|
||||
assertTokenStreamContents(ts,
|
||||
new String[] { "aa", "b", "-", "c", ".", "---", "b", "aa", ".", "c", "c", "b" },
|
||||
new int[] { 1, 3, 6, 8, 12, 14, 18, 21, 23, 25, 29, 33 },
|
||||
new int[] { 2, 5, 7, 11, 13, 17, 20, 22, 24, 28, 32, 35 },
|
||||
new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 });
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,7 +17,6 @@
|
|||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
|
||||
|
@ -27,7 +26,7 @@ import java.util.regex.Pattern;
|
|||
/**
|
||||
* @version $Id:$
|
||||
*/
|
||||
public class TestPatternReplaceFilter extends AnalysisTestCase {
|
||||
public class TestPatternReplaceFilter extends BaseTokenTestCase {
|
||||
|
||||
public void testReplaceAll() throws Exception {
|
||||
String input = "aabfooaabfooabfoob ab caaaaaaaaab";
|
||||
|
@ -35,14 +34,8 @@ public class TestPatternReplaceFilter extends AnalysisTestCase {
|
|||
(new WhitespaceTokenizer(new StringReader(input)),
|
||||
Pattern.compile("a*b"),
|
||||
"-", true);
|
||||
Token token = ts.next();
|
||||
assertEquals("-foo-foo-foo-", new String(token.termBuffer(), 0, token.termLength()));
|
||||
token = ts.next();
|
||||
assertEquals("-", new String(token.termBuffer(), 0, token.termLength()));
|
||||
token = ts.next();
|
||||
assertEquals("c-", new String(token.termBuffer(), 0, token.termLength()));
|
||||
token = ts.next();
|
||||
assertNull(token);
|
||||
assertTokenStreamContents(ts,
|
||||
new String[] { "-foo-foo-foo-", "-", "c-" });
|
||||
}
|
||||
|
||||
public void testReplaceFirst() throws Exception {
|
||||
|
@ -51,14 +44,8 @@ public class TestPatternReplaceFilter extends AnalysisTestCase {
|
|||
(new WhitespaceTokenizer(new StringReader(input)),
|
||||
Pattern.compile("a*b"),
|
||||
"-", false);
|
||||
Token token = ts.next();
|
||||
assertEquals("-fooaabfooabfoob", new String(token.termBuffer(), 0, token.termLength()));
|
||||
token = ts.next();
|
||||
assertEquals("-", new String(token.termBuffer(), 0, token.termLength()));
|
||||
token = ts.next();
|
||||
assertEquals("c-", new String(token.termBuffer(), 0, token.termLength()));
|
||||
token = ts.next();
|
||||
assertNull(token);
|
||||
assertTokenStreamContents(ts,
|
||||
new String[] { "-fooaabfooabfoob", "-", "c-" });
|
||||
}
|
||||
|
||||
public void testStripFirst() throws Exception {
|
||||
|
@ -67,14 +54,8 @@ public class TestPatternReplaceFilter extends AnalysisTestCase {
|
|||
(new WhitespaceTokenizer(new StringReader(input)),
|
||||
Pattern.compile("a*b"),
|
||||
null, false);
|
||||
Token token = ts.next();
|
||||
assertEquals("fooaabfooabfoob", new String(token.termBuffer(), 0, token.termLength()));
|
||||
token = ts.next();
|
||||
assertEquals("", new String(token.termBuffer(), 0, token.termLength()));
|
||||
token = ts.next();
|
||||
assertEquals("c", new String(token.termBuffer(), 0, token.termLength()));
|
||||
token = ts.next();
|
||||
assertNull(token);
|
||||
assertTokenStreamContents(ts,
|
||||
new String[] { "fooaabfooabfoob", "", "c" });
|
||||
}
|
||||
|
||||
public void testStripAll() throws Exception {
|
||||
|
@ -83,14 +64,8 @@ public class TestPatternReplaceFilter extends AnalysisTestCase {
|
|||
(new WhitespaceTokenizer(new StringReader(input)),
|
||||
Pattern.compile("a*b"),
|
||||
null, true);
|
||||
Token token = ts.next();
|
||||
assertEquals("foofoofoo", new String(token.termBuffer(), 0, token.termLength()));
|
||||
token = ts.next();
|
||||
assertEquals("", new String(token.termBuffer(), 0, token.termLength()));
|
||||
token = ts.next();
|
||||
assertEquals("c", new String(token.termBuffer(), 0, token.termLength()));
|
||||
token = ts.next();
|
||||
assertNull(token);
|
||||
assertTokenStreamContents(ts,
|
||||
new String[] { "foofoofoo", "", "c" });
|
||||
}
|
||||
|
||||
public void testReplaceAllWithBackRef() throws Exception {
|
||||
|
@ -99,14 +74,8 @@ public class TestPatternReplaceFilter extends AnalysisTestCase {
|
|||
(new WhitespaceTokenizer(new StringReader(input)),
|
||||
Pattern.compile("(a*)b"),
|
||||
"$1\\$", true);
|
||||
Token token = ts.next();
|
||||
assertEquals("aa$fooaa$fooa$foo$", new String(token.termBuffer(), 0, token.termLength()));
|
||||
token = ts.next();
|
||||
assertEquals("a$", new String(token.termBuffer(), 0, token.termLength()));
|
||||
token = ts.next();
|
||||
assertEquals("caaaaaaaaa$", new String(token.termBuffer(), 0, token.termLength()));
|
||||
token = ts.next();
|
||||
assertNull(token);
|
||||
assertTokenStreamContents(ts,
|
||||
new String[] { "aa$fooaa$fooa$foo$", "a$", "caaaaaaaaa$" });
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -17,6 +17,7 @@
|
|||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
|
@ -27,8 +28,8 @@ import org.apache.lucene.analysis.CharReader;
|
|||
import org.apache.lucene.analysis.CharStream;
|
||||
import org.apache.lucene.analysis.MappingCharFilter;
|
||||
import org.apache.lucene.analysis.NormalizeCharMap;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
|
||||
public class TestPatternTokenizerFactory extends BaseTokenTestCase
|
||||
{
|
||||
|
@ -57,7 +58,7 @@ public class TestPatternTokenizerFactory extends BaseTokenTestCase
|
|||
tokenizer.init( args );
|
||||
|
||||
TokenStream stream = tokenizer.create( new StringReader( test[2] ) );
|
||||
String out = TestHyphenatedWordsFilter.tsToString( stream );
|
||||
String out = tsToString( stream );
|
||||
System.out.println( test[2] + " ==> " + out );
|
||||
|
||||
assertEquals("pattern: "+test[1]+" with input: "+test[2], test[3], out );
|
||||
|
@ -93,20 +94,45 @@ public class TestPatternTokenizerFactory extends BaseTokenTestCase
|
|||
PatternTokenizerFactory tokFactory = new PatternTokenizerFactory();
|
||||
tokFactory.init( args );
|
||||
TokenStream stream = tokFactory.create( charStream );
|
||||
assertTokenStreamContents(stream,
|
||||
new String[] { "Günther", "Günther", "is", "here" },
|
||||
new int[] { 0, 13, 26, 29 },
|
||||
new int[] { 12, 25, 28, 33 },
|
||||
new int[] { 1, 1, 1, 1 });
|
||||
|
||||
List<Token> result = getTokens( stream );
|
||||
List<Token> expect = tokens( "Günther,1,0,12 Günther,1,13,25 is,1,26,28 here,1,29,33" );
|
||||
assertTokEqualOff( expect, result );
|
||||
|
||||
charStream.reset();
|
||||
charStream = new MappingCharFilter( normMap, CharReader.get( new StringReader( INPUT ) ) );
|
||||
args.put( PatternTokenizerFactory.PATTERN, "Günther" );
|
||||
args.put( PatternTokenizerFactory.GROUP, "0" );
|
||||
tokFactory = new PatternTokenizerFactory();
|
||||
tokFactory.init( args );
|
||||
stream = tokFactory.create( charStream );
|
||||
assertTokenStreamContents(stream,
|
||||
new String[] { "Günther", "Günther" },
|
||||
new int[] { 0, 13 },
|
||||
new int[] { 12, 25 },
|
||||
new int[] { 1, 1 });
|
||||
}
|
||||
|
||||
result = getTokens( stream );
|
||||
expect = tokens( "Günther,1,0,12 Günther,1,13,25" );
|
||||
assertTokEqualOff( expect, result );
|
||||
/**
|
||||
* TODO: rewrite tests not to use string comparison.
|
||||
* @deprecated only tests TermAttribute!
|
||||
*/
|
||||
private static String tsToString(TokenStream in) throws IOException {
|
||||
StringBuilder out = new StringBuilder();
|
||||
TermAttribute termAtt = (TermAttribute) in.addAttribute(TermAttribute.class);
|
||||
// extra safety to enforce that the state is not preserved and also
|
||||
// assign bogus values
|
||||
in.clearAttributes();
|
||||
termAtt.setTermBuffer("bogusTerm");
|
||||
while (in.incrementToken()) {
|
||||
if (out.length() > 0)
|
||||
out.append(' ');
|
||||
out.append(termAtt.term());
|
||||
in.clearAttributes();
|
||||
termAtt.setTermBuffer("bogusTerm");
|
||||
}
|
||||
|
||||
in.close();
|
||||
return out.toString();
|
||||
}
|
||||
}
|
||||
|
|
|
@@ -0,0 +1,41 @@
|
|||
package org.apache.solr.analysis;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
|
||||
/**
|
||||
* Simple tests to ensure the Persian normalization factory is working.
|
||||
*/
|
||||
public class TestPersianNormalizationFilterFactory extends BaseTokenTestCase {
|
||||
/**
|
||||
* Ensure the filter actually normalizes Persian text.
|
||||
*/
|
||||
public void testNormalization() throws Exception {
|
||||
Reader reader = new StringReader("های");
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
|
||||
PersianNormalizationFilterFactory factory = new PersianNormalizationFilterFactory();
|
||||
TokenStream stream = factory.create(tokenizer);
|
||||
assertTokenStreamContents(stream, new String[] { "هاي" });
|
||||
}
|
||||
}
|
|
@@ -17,16 +17,14 @@
|
|||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.io.StringReader;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.commons.codec.Encoder;
|
||||
import org.apache.commons.codec.language.DoubleMetaphone;
|
||||
import org.apache.commons.codec.language.Metaphone;
|
||||
import org.apache.commons.codec.language.RefinedSoundex;
|
||||
import org.apache.commons.codec.language.Soundex;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
|
||||
|
||||
/**
|
||||
|
@@ -61,50 +59,38 @@ public class TestPhoneticFilter extends BaseTokenTestCase {
|
|||
assertFalse( ff.inject );
|
||||
}
|
||||
|
||||
public void runner( Encoder enc, boolean inject ) throws Exception
|
||||
{
|
||||
String[] input = new String[] {
|
||||
"aaa", "bbb", "ccc", "easgasg"
|
||||
};
|
||||
public void testAlgorithms() throws Exception {
|
||||
assertAlgorithm("Metaphone", "true", "aaa bbb ccc easgasg",
|
||||
new String[] { "A", "aaa", "B", "bbb", "KKK", "ccc", "ESKS", "easgasg" });
|
||||
assertAlgorithm("Metaphone", "false", "aaa bbb ccc easgasg",
|
||||
new String[] { "A", "B", "KKK", "ESKS" });
|
||||
|
||||
ArrayList<Token> stream = new ArrayList<Token>();
|
||||
ArrayList<Token> output = new ArrayList<Token>();
|
||||
for( String s : input ) {
|
||||
stream.add( new Token( s, 0, s.length() ) );
|
||||
assertAlgorithm("DoubleMetaphone", "true", "aaa bbb ccc easgasg",
|
||||
new String[] { "A", "aaa", "PP", "bbb", "KK", "ccc", "ASKS", "easgasg" });
|
||||
assertAlgorithm("DoubleMetaphone", "false", "aaa bbb ccc easgasg",
|
||||
new String[] { "A", "PP", "KK", "ASKS" });
|
||||
|
||||
// phonetic token is added first in the current impl
|
||||
output.add( new Token( enc.encode(s).toString(), 0, s.length() ) );
|
||||
assertAlgorithm("Soundex", "true", "aaa bbb ccc easgasg",
|
||||
new String[] { "A000", "aaa", "B000", "bbb", "C000", "ccc", "E220", "easgasg" });
|
||||
assertAlgorithm("Soundex", "false", "aaa bbb ccc easgasg",
|
||||
new String[] { "A000", "B000", "C000", "E220" });
|
||||
|
||||
// add the original if applicable
|
||||
if( inject ) {
|
||||
output.add( new Token( s, 0, s.length() ) );
|
||||
}
|
||||
assertAlgorithm("RefinedSoundex", "true", "aaa bbb ccc easgasg",
|
||||
new String[] { "A0", "aaa", "B1", "bbb", "C3", "ccc", "E034034", "easgasg" });
|
||||
assertAlgorithm("RefinedSoundex", "false", "aaa bbb ccc easgasg",
|
||||
new String[] { "A0", "B1", "C3", "E034034" });
|
||||
}
|
||||
|
||||
// System.out.println("###stream="+stream);
|
||||
// System.out.println("###output="+output);
|
||||
|
||||
PhoneticFilter filter = new PhoneticFilter(
|
||||
new IterTokenStream(stream.iterator()), enc, "text", inject );
|
||||
|
||||
Token got = new Token();
|
||||
for( Token t : output ) {
|
||||
got = filter.next(got);
|
||||
// System.out.println("##### expect=" + t + " got="+got);
|
||||
assertEquals( t.term(), got.term());
|
||||
}
|
||||
assertNull( filter.next() ); // no more tokens
|
||||
}
|
||||
|
||||
public void testEncodes() throws Exception {
|
||||
runner( new DoubleMetaphone(), true );
|
||||
runner( new Metaphone(), true );
|
||||
runner( new Soundex(), true );
|
||||
runner( new RefinedSoundex(), true );
|
||||
|
||||
runner( new DoubleMetaphone(), false );
|
||||
runner( new Metaphone(), false );
|
||||
runner( new Soundex(), false );
|
||||
runner( new RefinedSoundex(), false );
|
||||
static void assertAlgorithm(String algName, String inject, String input,
|
||||
String[] expected) throws Exception {
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer(
|
||||
new StringReader(input));
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("encoder", algName);
|
||||
args.put("inject", inject);
|
||||
PhoneticFilterFactory factory = new PhoneticFilterFactory();
|
||||
factory.init(args);
|
||||
TokenStream stream = factory.create(tokenizer);
|
||||
assertTokenStreamContents(stream, expected);
|
||||
}
|
||||
}
|
||||
|
|
|
@@ -0,0 +1,41 @@
|
|||
package org.apache.solr.analysis;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
|
||||
/**
|
||||
* Simple tests to ensure the Porter stem filter factory is working.
|
||||
*/
|
||||
public class TestPorterStemFilterFactory extends BaseTokenTestCase {
|
||||
/**
|
||||
* Ensure the filter actually stems text.
|
||||
*/
|
||||
public void testStemming() throws Exception {
|
||||
Reader reader = new StringReader("dogs");
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
|
||||
PorterStemFilterFactory factory = new PorterStemFilterFactory();
|
||||
TokenStream stream = factory.create(tokenizer);
|
||||
assertTokenStreamContents(stream, new String[] { "dog" });
|
||||
}
|
||||
}
|
|
@@ -20,10 +20,14 @@ package org.apache.solr.analysis;
|
|||
import junit.framework.TestCase;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.Arrays;
|
||||
|
||||
public class TestRemoveDuplicatesTokenFilter extends AnalysisTestCase {
|
||||
public class TestRemoveDuplicatesTokenFilter extends BaseTokenTestCase {
|
||||
|
||||
public static Token tok(int pos, String t, int start, int end) {
|
||||
Token tok = new Token(t,start,end);
|
||||
|
@@ -38,15 +42,27 @@ public class TestRemoveDuplicatesTokenFilter extends AnalysisTestCase {
|
|||
throws Exception {
|
||||
|
||||
final Iterator<Token> toks = Arrays.asList(tokens).iterator();
|
||||
|
||||
final TokenStream ts = new RemoveDuplicatesTokenFilter
|
||||
RemoveDuplicatesTokenFilterFactory factory = new RemoveDuplicatesTokenFilterFactory();
|
||||
final TokenStream ts = factory.create
|
||||
(new TokenStream() {
|
||||
public Token next() { return toks.hasNext() ? toks.next() : null; }
|
||||
TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
|
||||
OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
|
||||
PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
|
||||
public boolean incrementToken() {
|
||||
if (toks.hasNext()) {
|
||||
clearAttributes();
|
||||
Token tok = toks.next();
|
||||
termAtt.setTermBuffer(tok.term());
|
||||
offsetAtt.setOffset(tok.startOffset(), tok.endOffset());
|
||||
posIncAtt.setPositionIncrement(tok.getPositionIncrement());
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
final String actual = TestBufferedTokenStream.tsToString(ts);
|
||||
assertEquals(expected + " != " + actual, expected, actual);
|
||||
|
||||
assertTokenStreamContents(ts, expected.split("\\s"));
|
||||
}
|
||||
|
||||
public void testNoDups() throws Exception {
|
||||
|
|
|
@@ -0,0 +1,41 @@
|
|||
package org.apache.solr.analysis;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
|
||||
/**
|
||||
* Simple tests to ensure the Reverse string filter factory is working.
|
||||
*/
|
||||
public class TestReverseStringFilterFactory extends BaseTokenTestCase {
|
||||
/**
|
||||
* Ensure the filter actually reverses text.
|
||||
*/
|
||||
public void testReversing() throws Exception {
|
||||
Reader reader = new StringReader("simple test");
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
|
||||
ReverseStringFilterFactory factory = new ReverseStringFilterFactory();
|
||||
TokenStream stream = factory.create(tokenizer);
|
||||
assertTokenStreamContents(stream, new String[] { "elpmis", "tset" });
|
||||
}
|
||||
}
|
|
@@ -21,11 +21,9 @@ import java.io.IOException;
|
|||
import java.io.StringReader;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
import org.apache.lucene.queryParser.ParseException;
|
||||
|
@@ -53,57 +51,52 @@ public class TestReversedWildcardFilterFactory extends BaseTokenTestCase {
|
|||
|
||||
public void testReversedTokens() throws IOException {
|
||||
String text = "simple text";
|
||||
String expected1 = "simple \u0001elpmis text \u0001txet";
|
||||
String expected2 = "\u0001elpmis \u0001txet";
|
||||
args.put("withOriginal", "true");
|
||||
factory.init(args);
|
||||
TokenStream input = factory.create(new WhitespaceTokenizer(new StringReader(text)));
|
||||
List<Token> realTokens = getTokens(input);
|
||||
List<Token> expectedTokens = tokens(expected1);
|
||||
// set positionIncrements in expected tokens
|
||||
for (int i = 1; i < expectedTokens.size(); i += 2) {
|
||||
expectedTokens.get(i).setPositionIncrement(0);
|
||||
}
|
||||
assertTokEqual(realTokens, expectedTokens);
|
||||
assertTokenStreamContents(input,
|
||||
new String[] { "\u0001elpmis", "simple", "\u0001txet", "text" },
|
||||
new int[] { 1, 0, 1, 0 });
|
||||
|
||||
// now without original tokens
|
||||
args.put("withOriginal", "false");
|
||||
factory.init(args);
|
||||
input = factory.create(new WhitespaceTokenizer(new StringReader(text)));
|
||||
realTokens = getTokens(input);
|
||||
expectedTokens = tokens(expected2);
|
||||
assertTokEqual(realTokens, expectedTokens);
|
||||
assertTokenStreamContents(input,
|
||||
new String[] { "\u0001elpmis", "\u0001txet" },
|
||||
new int[] { 1, 1 });
|
||||
}
|
||||
|
||||
public void testIndexingAnalysis() throws Exception {
|
||||
Analyzer a = schema.getAnalyzer();
|
||||
String text = "one two three si\uD834\uDD1Ex";
|
||||
String expected1 = "one \u0001eno two \u0001owt three \u0001eerht si\uD834\uDD1Ex \u0001x\uD834\uDD1Eis";
|
||||
List<Token> expectedTokens1 = getTokens(
|
||||
new WhitespaceTokenizer(new StringReader(expected1)));
|
||||
// set positionIncrements and offsets in expected tokens
|
||||
for (int i = 1; i < expectedTokens1.size(); i += 2) {
|
||||
Token t = expectedTokens1.get(i);
|
||||
t.setPositionIncrement(0);
|
||||
}
|
||||
String expected2 = "\u0001eno \u0001owt \u0001eerht \u0001x\uD834\uDD1Eis";
|
||||
List<Token> expectedTokens2 = getTokens(
|
||||
new WhitespaceTokenizer(new StringReader(expected2)));
|
||||
String expected3 = "one two three si\uD834\uDD1Ex";
|
||||
List<Token> expectedTokens3 = getTokens(
|
||||
new WhitespaceTokenizer(new StringReader(expected3)));
|
||||
|
||||
// field one
|
||||
TokenStream input = a.tokenStream("one", new StringReader(text));
|
||||
List<Token> realTokens = getTokens(input);
|
||||
assertTokEqual(realTokens, expectedTokens1);
|
||||
assertTokenStreamContents(input,
|
||||
new String[] { "\u0001eno", "one", "\u0001owt", "two",
|
||||
"\u0001eerht", "three", "\u0001x\uD834\uDD1Eis", "si\uD834\uDD1Ex" },
|
||||
new int[] { 0, 0, 4, 4, 8, 8, 14, 14 },
|
||||
new int[] { 3, 3, 7, 7, 13, 13, 19, 19 },
|
||||
new int[] { 1, 0, 1, 0, 1, 0, 1, 0 }
|
||||
);
|
||||
// field two
|
||||
input = a.tokenStream("two", new StringReader(text));
|
||||
realTokens = getTokens(input);
|
||||
assertTokEqual(realTokens, expectedTokens2);
|
||||
assertTokenStreamContents(input,
|
||||
new String[] { "\u0001eno", "\u0001owt",
|
||||
"\u0001eerht", "\u0001x\uD834\uDD1Eis" },
|
||||
new int[] { 0, 4, 8, 14 },
|
||||
new int[] { 3, 7, 13, 19 },
|
||||
new int[] { 1, 1, 1, 1 }
|
||||
);
|
||||
// field three
|
||||
input = a.tokenStream("three", new StringReader(text));
|
||||
realTokens = getTokens(input);
|
||||
assertTokEqual(realTokens, expectedTokens3);
|
||||
assertTokenStreamContents(input,
|
||||
new String[] { "one", "two", "three", "si\uD834\uDD1Ex" },
|
||||
new int[] { 0, 4, 8, 14 },
|
||||
new int[] { 3, 7, 13, 19 },
|
||||
new int[] { 1, 1, 1, 1 }
|
||||
);
|
||||
}
|
||||
|
||||
public void testQueryParsing() throws IOException, ParseException {
|
||||
|
|
|
@@ -0,0 +1,79 @@
|
|||
package org.apache.solr.analysis;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
|
||||
/**
|
||||
* Simple tests to ensure the Russian filter factories are working.
|
||||
*/
|
||||
public class TestRussianFilters extends BaseTokenTestCase {
|
||||
/**
|
||||
* Test RussianLetterTokenizerFactory
|
||||
*/
|
||||
public void testTokenizer() throws Exception {
|
||||
Reader reader = new StringReader("Вместе с тем о силе электромагнитной 100");
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
RussianLetterTokenizerFactory factory = new RussianLetterTokenizerFactory();
|
||||
factory.init(args);
|
||||
Tokenizer stream = factory.create(reader);
|
||||
assertTokenStreamContents(stream, new String[] {"Вместе", "с", "тем", "о",
|
||||
"силе", "электромагнитной", "100"});
|
||||
}
|
||||
|
||||
/**
|
||||
* Test RussianLowerCaseFilterFactory
|
||||
*/
|
||||
public void testLowerCase() throws Exception {
|
||||
Reader reader = new StringReader("Вместе с тем о силе электромагнитной 100");
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
RussianLetterTokenizerFactory factory = new RussianLetterTokenizerFactory();
|
||||
factory.init(args);
|
||||
RussianLowerCaseFilterFactory filterFactory = new RussianLowerCaseFilterFactory();
|
||||
filterFactory.init(args);
|
||||
Tokenizer tokenizer = factory.create(reader);
|
||||
TokenStream stream = filterFactory.create(tokenizer);
|
||||
assertTokenStreamContents(stream, new String[] {"вместе", "с", "тем", "о",
|
||||
"силе", "электромагнитной", "100"});
|
||||
}
|
||||
|
||||
/**
|
||||
* Test RussianStemFilterFactory
|
||||
*/
|
||||
public void testStemmer() throws Exception {
|
||||
Reader reader = new StringReader("Вместе с тем о силе электромагнитной 100");
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
RussianLetterTokenizerFactory factory = new RussianLetterTokenizerFactory();
|
||||
factory.init(args);
|
||||
RussianLowerCaseFilterFactory caseFactory = new RussianLowerCaseFilterFactory();
|
||||
caseFactory.init(args);
|
||||
RussianStemFilterFactory stemFactory = new RussianStemFilterFactory();
|
||||
stemFactory.init(args);
|
||||
Tokenizer tokenizer = factory.create(reader);
|
||||
TokenStream stream = caseFactory.create(tokenizer);
|
||||
stream = stemFactory.create(stream);
|
||||
assertTokenStreamContents(stream, new String[] {"вмест", "с", "тем", "о",
|
||||
"сил", "электромагнитн", "100"});
|
||||
}
|
||||
}
|
|
@@ -0,0 +1,73 @@
|
|||
package org.apache.solr.analysis;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
|
||||
/**
|
||||
* Simple tests to ensure the Shingle filter factory works.
|
||||
*/
|
||||
public class TestShingleFilterFactory extends BaseTokenTestCase {
|
||||
/**
|
||||
* Test the defaults
|
||||
*/
|
||||
public void testDefaults() throws Exception {
|
||||
Reader reader = new StringReader("this is a test");
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
ShingleFilterFactory factory = new ShingleFilterFactory();
|
||||
factory.init(args);
|
||||
TokenStream stream = factory.create(new WhitespaceTokenizer(reader));
|
||||
assertTokenStreamContents(stream, new String[] {"this", "this is", "is",
|
||||
"is a", "a", "a test", "test"});
|
||||
}
|
||||
|
||||
/**
|
||||
* Test with unigrams disabled
|
||||
*/
|
||||
public void testNoUnigrams() throws Exception {
|
||||
Reader reader = new StringReader("this is a test");
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("outputUnigrams", "false");
|
||||
ShingleFilterFactory factory = new ShingleFilterFactory();
|
||||
factory.init(args);
|
||||
TokenStream stream = factory.create(new WhitespaceTokenizer(reader));
|
||||
assertTokenStreamContents(stream,
|
||||
new String[] {"this is", "is a", "a test"});
|
||||
}
|
||||
|
||||
/**
|
||||
* Test with a higher max shingle size
|
||||
*/
|
||||
public void testMaxShingleSize() throws Exception {
|
||||
Reader reader = new StringReader("this is a test");
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("maxShingleSize", "3");
|
||||
ShingleFilterFactory factory = new ShingleFilterFactory();
|
||||
factory.init(args);
|
||||
TokenStream stream = factory.create(new WhitespaceTokenizer(reader));
|
||||
assertTokenStreamContents(stream,
|
||||
new String[] {"this", "this is", "this is a", "is",
|
||||
"is a", "is a test", "a", "a test", "test"});
|
||||
}
|
||||
}
|
|
@@ -0,0 +1,121 @@
|
|||
package org.apache.solr.analysis;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
|
||||
/**
|
||||
* Simple tests to ensure the standard Lucene factories are working.
|
||||
*/
|
||||
public class TestStandardFactories extends BaseTokenTestCase {
|
||||
/**
|
||||
* Test StandardTokenizerFactory
|
||||
*/
|
||||
public void testStandardTokenizer() throws Exception {
|
||||
Reader reader = new StringReader("What's this thing do?");
|
||||
StandardTokenizerFactory factory = new StandardTokenizerFactory();
|
||||
Tokenizer stream = factory.create(reader);
|
||||
assertTokenStreamContents(stream,
|
||||
new String[] {"What's", "this", "thing", "do" });
|
||||
}
|
||||
|
||||
/**
|
||||
* Test StandardFilterFactory
|
||||
*/
|
||||
public void testStandardFilter() throws Exception {
|
||||
Reader reader = new StringReader("What's this thing do?");
|
||||
StandardTokenizerFactory factory = new StandardTokenizerFactory();
|
||||
StandardFilterFactory filterFactory = new StandardFilterFactory();
|
||||
Tokenizer tokenizer = factory.create(reader);
|
||||
TokenStream stream = filterFactory.create(tokenizer);
|
||||
assertTokenStreamContents(stream,
|
||||
new String[] {"What", "this", "thing", "do"});
|
||||
}
|
||||
|
||||
/**
|
||||
* Test KeywordTokenizerFactory
|
||||
*/
|
||||
public void testKeywordTokenizer() throws Exception {
|
||||
Reader reader = new StringReader("What's this thing do?");
|
||||
KeywordTokenizerFactory factory = new KeywordTokenizerFactory();
|
||||
Tokenizer stream = factory.create(reader);
|
||||
assertTokenStreamContents(stream,
|
||||
new String[] {"What's this thing do?"});
|
||||
}
|
||||
|
||||
/**
|
||||
* Test WhitespaceTokenizerFactory
|
||||
*/
|
||||
public void testWhitespaceTokenizer() throws Exception {
|
||||
Reader reader = new StringReader("What's this thing do?");
|
||||
WhitespaceTokenizerFactory factory = new WhitespaceTokenizerFactory();
|
||||
Tokenizer stream = factory.create(reader);
|
||||
assertTokenStreamContents(stream,
|
||||
new String[] {"What's", "this", "thing", "do?"});
|
||||
}
|
||||
|
||||
/**
|
||||
* Test LetterTokenizerFactory
|
||||
*/
|
||||
public void testLetterTokenizer() throws Exception {
|
||||
Reader reader = new StringReader("What's this thing do?");
|
||||
LetterTokenizerFactory factory = new LetterTokenizerFactory();
|
||||
Tokenizer stream = factory.create(reader);
|
||||
assertTokenStreamContents(stream,
|
||||
new String[] {"What", "s", "this", "thing", "do"});
|
||||
}
|
||||
|
||||
/**
|
||||
* Test LowerCaseTokenizerFactory
|
||||
*/
|
||||
public void testLowerCaseTokenizer() throws Exception {
|
||||
Reader reader = new StringReader("What's this thing do?");
|
||||
LowerCaseTokenizerFactory factory = new LowerCaseTokenizerFactory();
|
||||
Tokenizer stream = factory.create(reader);
|
||||
assertTokenStreamContents(stream,
|
||||
new String[] {"what", "s", "this", "thing", "do"});
|
||||
}
|
||||
|
||||
/**
|
||||
* Ensure the ASCIIFoldingFilterFactory works
|
||||
*/
|
||||
public void testASCIIFolding() throws Exception {
|
||||
Reader reader = new StringReader("Česká");
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
|
||||
ASCIIFoldingFilterFactory factory = new ASCIIFoldingFilterFactory();
|
||||
TokenStream stream = factory.create(tokenizer);
|
||||
assertTokenStreamContents(stream, new String[] { "Ceska" });
|
||||
}
|
||||
|
||||
/**
|
||||
* Ensure the ISOLatin1AccentFilterFactory works
|
||||
* (sometimes, at least not uppercase hacek)
|
||||
*/
|
||||
public void testISOLatin1Folding() throws Exception {
|
||||
Reader reader = new StringReader("Česká");
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
|
||||
ISOLatin1AccentFilterFactory factory = new ISOLatin1AccentFilterFactory();
|
||||
TokenStream stream = factory.create(tokenizer);
|
||||
assertTokenStreamContents(stream, new String[] { "Česka" });
|
||||
}
|
||||
}
|
|
@@ -19,11 +19,20 @@ package org.apache.solr.analysis;
|
|||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.StringReader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Iterator;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
|
@@ -31,33 +40,41 @@ import java.util.List;
|
|||
*/
|
||||
public class TestSynonymFilter extends BaseTokenTestCase {
|
||||
|
||||
public List strings(String str) {
|
||||
static List<String> strings(String str) {
|
||||
String[] arr = str.split(" ");
|
||||
return Arrays.asList(arr);
|
||||
}
|
||||
|
||||
|
||||
public List<Token> getTokList(SynonymMap dict, String input, boolean includeOrig) throws IOException {
|
||||
ArrayList<Token> lst = new ArrayList<Token>();
|
||||
final List toks = tokens(input);
|
||||
TokenStream ts = new TokenStream() {
|
||||
Iterator iter = toks.iterator();
|
||||
@Override
|
||||
public Token next() throws IOException {
|
||||
return iter.hasNext() ? (Token)iter.next() : null;
|
||||
}
|
||||
};
|
||||
|
||||
SynonymFilter sf = new SynonymFilter(ts, dict);
|
||||
|
||||
Token target = new Token(); // test with token reuse
|
||||
while(true) {
|
||||
Token t = sf.next(target);
|
||||
if (t==null) return lst;
|
||||
lst.add((Token)t.clone());
|
||||
}
|
||||
static void assertTokenizesTo(SynonymMap dict, String input,
|
||||
String expected[]) throws IOException {
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader(input));
|
||||
SynonymFilter stream = new SynonymFilter(tokenizer, dict);
|
||||
assertTokenStreamContents(stream, expected);
|
||||
}
|
||||
|
||||
static void assertTokenizesTo(SynonymMap dict, String input,
|
||||
String expected[], int posIncs[]) throws IOException {
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader(input));
|
||||
SynonymFilter stream = new SynonymFilter(tokenizer, dict);
|
||||
assertTokenStreamContents(stream, expected, posIncs);
|
||||
}
|
||||
|
||||
static void assertTokenizesTo(SynonymMap dict, List<Token> input,
|
||||
String expected[], int posIncs[])
|
||||
throws IOException {
|
||||
TokenStream tokenizer = new IterTokenStream(input);
|
||||
SynonymFilter stream = new SynonymFilter(tokenizer, dict);
|
||||
assertTokenStreamContents(stream, expected, posIncs);
|
||||
}
|
||||
|
||||
static void assertTokenizesTo(SynonymMap dict, List<Token> input,
|
||||
String expected[], int startOffsets[], int endOffsets[], int posIncs[])
|
||||
throws IOException {
|
||||
TokenStream tokenizer = new IterTokenStream(input);
|
||||
SynonymFilter stream = new SynonymFilter(tokenizer, dict);
|
||||
assertTokenStreamContents(stream, expected, startOffsets, endOffsets,
|
||||
posIncs);
|
||||
}
|
||||
|
||||
public void testMatching() throws IOException {
|
||||
SynonymMap map = new SynonymMap();
|
||||
|
@@ -71,28 +88,29 @@ public class TestSynonymFilter extends BaseTokenTestCase {
|
|||
map.add(strings("z x c v"), tokens("zxcv"), orig, merge);
|
||||
map.add(strings("x c"), tokens("xc"), orig, merge);
|
||||
|
||||
// System.out.println(map);
|
||||
// System.out.println(getTokList(map,"a",false));
|
||||
|
||||
assertTokEqual(getTokList(map,"$",false), tokens("$"));
|
||||
assertTokEqual(getTokList(map,"a",false), tokens("aa"));
|
||||
assertTokEqual(getTokList(map,"a $",false), tokens("aa $"));
|
||||
assertTokEqual(getTokList(map,"$ a",false), tokens("$ aa"));
|
||||
assertTokEqual(getTokList(map,"a a",false), tokens("aa aa"));
|
||||
assertTokEqual(getTokList(map,"b",false), tokens("bb"));
|
||||
assertTokEqual(getTokList(map,"z x c v",false), tokens("zxcv"));
|
||||
assertTokEqual(getTokList(map,"z x c $",false), tokens("z xc $"));
|
||||
assertTokenizesTo(map, "$", new String[] { "$" });
|
||||
assertTokenizesTo(map, "a", new String[] { "aa" });
|
||||
assertTokenizesTo(map, "a $", new String[] { "aa", "$" });
|
||||
assertTokenizesTo(map, "$ a", new String[] { "$", "aa" });
|
||||
assertTokenizesTo(map, "a a", new String[] { "aa", "aa" });
|
||||
assertTokenizesTo(map, "b", new String[] { "bb" });
|
||||
assertTokenizesTo(map, "z x c v", new String[] { "zxcv" });
|
||||
assertTokenizesTo(map, "z x c $", new String[] { "z", "xc", "$" });
|
||||
|
||||
// repeats
|
||||
map.add(strings("a b"), tokens("ab"), orig, merge);
|
||||
map.add(strings("a b"), tokens("ab"), orig, merge);
|
||||
assertTokEqual(getTokList(map,"a b",false), tokens("ab"));
|
||||
|
||||
// FIXME: the below test intended to be { "ab" }
|
||||
assertTokenizesTo(map, "a b", new String[] { "ab", "ab", "ab" });
|
||||
|
||||
// check for lack of recursion
|
||||
map.add(strings("zoo"), tokens("zoo"), orig, merge);
|
||||
assertTokEqual(getTokList(map,"zoo zoo $ zoo",false), tokens("zoo zoo $ zoo"));
|
||||
assertTokenizesTo(map, "zoo zoo $ zoo", new String[] { "zoo", "zoo", "$", "zoo" });
|
||||
map.add(strings("zoo"), tokens("zoo zoo"), orig, merge);
|
||||
assertTokEqual(getTokList(map,"zoo zoo $ zoo",false), tokens("zoo zoo zoo zoo $ zoo zoo"));
|
||||
// FIXME: the below test intended to be { "zoo", "zoo", "zoo", "zoo", "$", "zoo", "zoo" }
|
||||
// maybe this was just a typo in the old test????
|
||||
assertTokenizesTo(map, "zoo zoo $ zoo", new String[] { "zoo", "zoo", "zoo", "zoo", "zoo", "zoo", "$", "zoo", "zoo", "zoo" });
|
||||
}
|
||||
|
||||
public void testIncludeOrig() throws IOException {
|
||||
|
@@ -107,25 +125,48 @@ public class TestSynonymFilter extends BaseTokenTestCase {
|
|||
map.add(strings("z x c v"), tokens("zxcv"), orig, merge);
|
||||
map.add(strings("x c"), tokens("xc"), orig, merge);
|
||||
|
||||
// System.out.println(map);
|
||||
// System.out.println(getTokList(map,"a",false));
|
||||
|
||||
assertTokEqual(getTokList(map,"$",false), tokens("$"));
|
||||
assertTokEqual(getTokList(map,"a",false), tokens("a/aa"));
|
||||
assertTokEqual(getTokList(map,"a",false), tokens("a/aa"));
|
||||
assertTokEqual(getTokList(map,"$ a",false), tokens("$ a/aa"));
|
||||
assertTokEqual(getTokList(map,"a $",false), tokens("a/aa $"));
|
||||
assertTokEqual(getTokList(map,"$ a !",false), tokens("$ a/aa !"));
|
||||
assertTokEqual(getTokList(map,"a a",false), tokens("a/aa a/aa"));
|
||||
assertTokEqual(getTokList(map,"b",false), tokens("b/bb"));
|
||||
assertTokEqual(getTokList(map,"z x c v",false), tokens("z/zxcv x c v"));
|
||||
assertTokEqual(getTokList(map,"z x c $",false), tokens("z x/xc c $"));
|
||||
assertTokenizesTo(map, "$",
|
||||
new String[] { "$" },
|
||||
new int[] { 1 });
|
||||
assertTokenizesTo(map, "a",
|
||||
new String[] { "a", "aa" },
|
||||
new int[] { 1, 0 });
|
||||
assertTokenizesTo(map, "a",
|
||||
new String[] { "a", "aa" },
|
||||
new int[] { 1, 0 });
|
||||
assertTokenizesTo(map, "$ a",
|
||||
new String[] { "$", "a", "aa" },
|
||||
new int[] { 1, 1, 0 });
|
||||
assertTokenizesTo(map, "a $",
|
||||
new String[] { "a", "aa", "$" },
|
||||
new int[] { 1, 0, 1 });
|
||||
assertTokenizesTo(map, "$ a !",
|
||||
new String[] { "$", "a", "aa", "!" },
|
||||
new int[] { 1, 1, 0, 1 });
|
||||
assertTokenizesTo(map, "a a",
|
||||
new String[] { "a", "aa", "a", "aa" },
|
||||
new int[] { 1, 0, 1, 0 });
|
||||
assertTokenizesTo(map, "b",
|
||||
new String[] { "b", "bb" },
|
||||
new int[] { 1, 0 });
|
||||
assertTokenizesTo(map, "z x c v",
|
||||
new String[] { "z", "zxcv", "x", "c", "v" },
|
||||
new int[] { 1, 0, 1, 1, 1 });
|
||||
assertTokenizesTo(map, "z x c $",
|
||||
new String[] { "z", "x", "xc", "c", "$" },
|
||||
new int[] { 1, 1, 0, 1, 1 });
|
||||
|
||||
// check for lack of recursion
|
||||
map.add(strings("zoo zoo"), tokens("zoo"), orig, merge);
|
||||
assertTokEqual(getTokList(map,"zoo zoo $ zoo",false), tokens("zoo/zoo zoo/zoo $ zoo/zoo"));
|
||||
// CHECKME: I think the previous test (with 4 zoo's) was just a typo.
|
||||
assertTokenizesTo(map, "zoo zoo $ zoo",
|
||||
new String[] { "zoo", "zoo", "zoo", "$", "zoo" },
|
||||
new int[] { 1, 0, 1, 1, 1 });
|
||||
|
||||
map.add(strings("zoo"), tokens("zoo zoo"), orig, merge);
|
||||
assertTokEqual(getTokList(map,"zoo zoo $ zoo",false), tokens("zoo/zoo zoo $ zoo/zoo zoo"));
|
||||
assertTokenizesTo(map, "zoo zoo $ zoo",
|
||||
new String[] { "zoo", "zoo", "zoo", "$", "zoo", "zoo", "zoo" },
|
||||
new int[] { 1, 0, 1, 1, 1, 0, 1 });
|
||||
}
|
||||
|
||||
|
||||
|
@@ -136,25 +177,35 @@ public class TestSynonymFilter extends BaseTokenTestCase {
|
|||
boolean merge = true;
|
||||
map.add(strings("a"), tokens("a5,5"), orig, merge);
|
||||
map.add(strings("a"), tokens("a3,3"), orig, merge);
|
||||
// System.out.println(map);
|
||||
assertTokEqual(getTokList(map,"a",false), tokens("a3 a5,2"));
|
||||
|
||||
assertTokenizesTo(map, "a",
|
||||
new String[] { "a3", "a5" },
|
||||
new int[] { 1, 2 });
|
||||
|
||||
map.add(strings("b"), tokens("b3,3"), orig, merge);
|
||||
map.add(strings("b"), tokens("b5,5"), orig, merge);
|
||||
//System.out.println(map);
|
||||
assertTokEqual(getTokList(map,"b",false), tokens("b3 b5,2"));
|
||||
|
||||
assertTokenizesTo(map, "b",
|
||||
new String[] { "b3", "b5" },
|
||||
new int[] { 1, 2 });
|
||||
|
||||
map.add(strings("a"), tokens("A3,3"), orig, merge);
|
||||
map.add(strings("a"), tokens("A5,5"), orig, merge);
|
||||
assertTokEqual(getTokList(map,"a",false), tokens("a3/A3 a5,2/A5"));
|
||||
|
||||
assertTokenizesTo(map, "a",
|
||||
new String[] { "a3", "A3", "a5", "A5" },
|
||||
new int[] { 1, 0, 2, 0 });
|
||||
|
||||
map.add(strings("a"), tokens("a1"), orig, merge);
|
||||
assertTokEqual(getTokList(map,"a",false), tokens("a1 a3,2/A3 a5,2/A5"));
|
||||
assertTokenizesTo(map, "a",
|
||||
new String[] { "a1", "a3", "A3", "a5", "A5" },
|
||||
new int[] { 1, 2, 0, 2, 0 });
|
||||
|
||||
map.add(strings("a"), tokens("a2,2"), orig, merge);
|
||||
map.add(strings("a"), tokens("a4,4 a6,2"), orig, merge);
|
||||
assertTokEqual(getTokList(map,"a",false), tokens("a1 a2 a3/A3 a4 a5/A5 a6"));
|
||||
assertTokenizesTo(map, "a",
|
||||
new String[] { "a1", "a2", "a3", "A3", "a4", "a5", "A5", "a6" },
|
||||
new int[] { 1, 1, 1, 0, 1, 1, 0, 1 });
|
||||
}
|
||||
|
||||
|
||||
|
@@ -167,41 +218,56 @@ public class TestSynonymFilter extends BaseTokenTestCase {
|
|||
map.add(strings("qwe"), tokens("xx"), orig, merge);
|
||||
map.add(strings("qwe"), tokens("yy"), orig, merge);
|
||||
map.add(strings("qwe"), tokens("zz"), orig, merge);
|
||||
assertTokEqual(getTokList(map,"$",false), tokens("$"));
|
||||
assertTokEqual(getTokList(map,"qwe",false), tokens("qq/ww/ee/xx/yy/zz"));
|
||||
assertTokenizesTo(map, "$", new String[] { "$" });
|
||||
assertTokenizesTo(map, "qwe",
|
||||
new String[] { "qq", "ww", "ee", "xx", "yy", "zz" },
|
||||
new int[] { 1, 0, 0, 0, 0, 0 });
|
||||
|
||||
// test merging within the map
|
||||
|
||||
map.add(strings("a"), tokens("a5,5 a8,3 a10,2"), orig, merge);
|
||||
map.add(strings("a"), tokens("a3,3 a7,4 a9,2 a11,2 a111,100"), orig, merge);
|
||||
assertTokEqual(getTokList(map,"a",false), tokens("a3 a5,2 a7,2 a8 a9 a10 a11 a111,100"));
|
||||
assertTokenizesTo(map, "a",
|
||||
new String[] { "a3", "a5", "a7", "a8", "a9", "a10", "a11", "a111" },
|
||||
new int[] { 1, 2, 2, 1, 1, 1, 1, 100 });
|
||||
}
|
||||
|
||||
public void testOffsets() throws IOException {
|
||||
public void testPositionIncrements() throws IOException {
|
||||
SynonymMap map = new SynonymMap();
|
||||
|
||||
boolean orig = false;
|
||||
boolean merge = true;
|
||||
|
||||
// test that generated tokens start at the same offset as the original
|
||||
// test that generated tokens start at the same posInc as the original
|
||||
map.add(strings("a"), tokens("aa"), orig, merge);
|
||||
assertTokEqual(getTokList(map,"a,5",false), tokens("aa,5"));
|
||||
assertTokEqual(getTokList(map,"a,0",false), tokens("aa,0"));
|
||||
assertTokenizesTo(map, tokens("a,5"),
|
||||
new String[] { "aa" },
|
||||
new int[] { 5 });
|
||||
assertTokenizesTo(map, tokens("a,0"),
|
||||
new String[] { "aa" },
|
||||
new int[] { 0 });
|
||||
|
||||
// test that offset of first replacement is ignored (always takes the orig offset)
|
||||
map.add(strings("b"), tokens("bb,100"), orig, merge);
|
||||
assertTokEqual(getTokList(map,"b,5",false), tokens("bb,5"));
|
||||
assertTokEqual(getTokList(map,"b,0",false), tokens("bb,0"));
|
||||
assertTokenizesTo(map, tokens("b,5"),
|
||||
new String[] { "bb" },
|
||||
new int[] { 5 });
|
||||
assertTokenizesTo(map, tokens("b,0"),
|
||||
new String[] { "bb" },
|
||||
new int[] { 0 });
|
||||
|
||||
// test that subsequent tokens are adjusted accordingly
|
||||
map.add(strings("c"), tokens("cc,100 c2,2"), orig, merge);
|
||||
assertTokEqual(getTokList(map,"c,5",false), tokens("cc,5 c2,2"));
|
||||
assertTokEqual(getTokList(map,"c,0",false), tokens("cc,0 c2,2"));
|
||||
|
||||
assertTokenizesTo(map, tokens("c,5"),
|
||||
new String[] { "cc", "c2" },
|
||||
new int[] { 5, 2 });
|
||||
assertTokenizesTo(map, tokens("c,0"),
|
||||
new String[] { "cc", "c2" },
|
||||
new int[] { 0, 2 });
|
||||
}
|
||||
|
||||
|
||||
public void testOffsetsWithOrig() throws IOException {
|
||||
public void testPositionIncrementsWithOrig() throws IOException {
|
||||
SynonymMap map = new SynonymMap();
|
||||
|
||||
boolean orig = true;
|
||||
|
@@ -209,18 +275,30 @@ public class TestSynonymFilter extends BaseTokenTestCase {
|
|||
|
||||
// test that generated tokens start at the same offset as the original
|
||||
map.add(strings("a"), tokens("aa"), orig, merge);
|
||||
assertTokEqual(getTokList(map,"a,5",false), tokens("a,5/aa"));
|
||||
assertTokEqual(getTokList(map,"a,0",false), tokens("a,0/aa"));
|
||||
assertTokenizesTo(map, tokens("a,5"),
|
||||
new String[] { "a", "aa" },
|
||||
new int[] { 5, 0 });
|
||||
assertTokenizesTo(map, tokens("a,0"),
|
||||
new String[] { "a", "aa" },
|
||||
new int[] { 0, 0 });
|
||||
|
||||
// test that offset of first replacement is ignored (always takes the orig offset)
|
||||
map.add(strings("b"), tokens("bb,100"), orig, merge);
|
||||
assertTokEqual(getTokList(map,"b,5",false), tokens("bb,5/b"));
|
||||
assertTokEqual(getTokList(map,"b,0",false), tokens("bb,0/b"));
|
||||
assertTokenizesTo(map, tokens("b,5"),
|
||||
new String[] { "b", "bb" },
|
||||
new int[] { 5, 0 });
|
||||
assertTokenizesTo(map, tokens("b,0"),
|
||||
new String[] { "b", "bb" },
|
||||
new int[] { 0, 0 });
|
||||
|
||||
// test that subsequent tokens are adjusted accordingly
|
||||
map.add(strings("c"), tokens("cc,100 c2,2"), orig, merge);
|
||||
assertTokEqual(getTokList(map,"c,5",false), tokens("cc,5/c c2,2"));
|
||||
assertTokEqual(getTokList(map,"c,0",false), tokens("cc,0/c c2,2"));
|
||||
assertTokenizesTo(map, tokens("c,5"),
|
||||
new String[] { "c", "cc", "c2" },
|
||||
new int[] { 5, 0, 2 });
|
||||
assertTokenizesTo(map, tokens("c,0"),
|
||||
new String[] { "c", "cc", "c2" },
|
||||
new int[] { 0, 0, 2 });
|
||||
}
|
||||
|
||||
|
||||
|
@@ -238,10 +316,101 @@ public class TestSynonymFilter extends BaseTokenTestCase {
|
|||
map.add(strings("a a"), tokens("b"), orig, merge);
|
||||
map.add(strings("x"), tokens("y"), orig, merge);
|
||||
|
||||
System.out.println(getTokList(map,"a,1,0,1 a,1,2,3 x,1,4,5",false));
|
||||
|
||||
// "a a x" => "b y"
|
||||
assertTokEqualOff(getTokList(map,"a,1,0,1 a,1,2,3 x,1,4,5",false), tokens("b,1,0,3 y,1,4,5"));
|
||||
assertTokenizesTo(map, tokens("a,1,0,1 a,1,2,3 x,1,4,5"),
|
||||
new String[] { "b", "y" },
|
||||
new int[] { 0, 4 },
|
||||
new int[] { 3, 5 },
|
||||
new int[] { 1, 1 });
|
||||
}
|
||||
|
||||
|
||||
/***
|
||||
* Return a list of tokens according to a test string format:
|
||||
* a b c => returns List<Token> [a,b,c]
|
||||
* a/b => tokens a and b share the same spot (b.positionIncrement=0)
|
||||
* a,3/b/c => a,b,c all share same position (a.positionIncrement=3, b.positionIncrement=0, c.positionIncrement=0)
|
||||
* a,1,10,11 => "a" with positionIncrement=1, startOffset=10, endOffset=11
|
||||
* @deprecated does not support attributes api
|
||||
*/
|
||||
private List<Token> tokens(String str) {
|
||||
String[] arr = str.split(" ");
|
||||
List<Token> result = new ArrayList<Token>();
|
||||
for (int i=0; i<arr.length; i++) {
|
||||
String[] toks = arr[i].split("/");
|
||||
String[] params = toks[0].split(",");
|
||||
|
||||
int posInc;
|
||||
int start;
|
||||
int end;
|
||||
|
||||
if (params.length > 1) {
|
||||
posInc = Integer.parseInt(params[1]);
|
||||
} else {
|
||||
posInc = 1;
|
||||
}
|
||||
|
||||
if (params.length > 2) {
|
||||
start = Integer.parseInt(params[2]);
|
||||
} else {
|
||||
start = 0;
|
||||
}
|
||||
|
||||
if (params.length > 3) {
|
||||
end = Integer.parseInt(params[3]);
|
||||
} else {
|
||||
end = start + params[0].length();
|
||||
}
|
||||
|
||||
Token t = new Token(params[0],start,end,"TEST");
|
||||
t.setPositionIncrement(posInc);
|
||||
|
||||
result.add(t);
|
||||
for (int j=1; j<toks.length; j++) {
|
||||
t = new Token(toks[j],0,0,"TEST");
|
||||
t.setPositionIncrement(0);
|
||||
result.add(t);
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* @deprecated does not support custom attributes
|
||||
*/
|
||||
private static class IterTokenStream extends TokenStream {
|
||||
final Token tokens[];
|
||||
int index = 0;
|
||||
TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
|
||||
OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
|
||||
PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
|
||||
FlagsAttribute flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class);
|
||||
TypeAttribute typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
|
||||
PayloadAttribute payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
|
||||
|
||||
public IterTokenStream(Token... tokens) {
|
||||
super();
|
||||
this.tokens = tokens;
|
||||
}
|
||||
|
||||
public IterTokenStream(Collection<Token> tokens) {
|
||||
this(tokens.toArray(new Token[tokens.size()]));
|
||||
}
|
||||
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (index >= tokens.length)
|
||||
return false;
|
||||
else {
|
||||
clearAttributes();
|
||||
Token token = tokens[index++];
|
||||
termAtt.setTermBuffer(token.term());
|
||||
offsetAtt.setOffset(token.startOffset(), token.endOffset());
|
||||
posIncAtt.setPositionIncrement(token.getPositionIncrement());
|
||||
flagsAtt.setFlags(token.getFlags());
|
||||
typeAtt.setType(token.type());
|
||||
payloadAtt.setPayload(token.getPayload());
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@@ -0,0 +1,42 @@
|
|||
package org.apache.solr.analysis;
|
||||
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
* contributor license agreements. See the NOTICE file distributed with
|
||||
* this work for additional information regarding copyright ownership.
|
||||
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
* (the "License"); you may not use this file except in compliance with
|
||||
* the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Tokenizer;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
|
||||
/**
|
||||
* Simple tests to ensure the Thai word filter factory is working.
|
||||
*/
|
||||
public class TestThaiWordFilterFactory extends BaseTokenTestCase {
|
||||
/**
|
||||
* Ensure the filter actually decomposes text.
|
||||
*/
|
||||
public void testWordBreak() throws Exception {
|
||||
Reader reader = new StringReader("การที่ได้ต้องแสดงว่างานดี");
|
||||
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
|
||||
ThaiWordFilterFactory factory = new ThaiWordFilterFactory();
|
||||
TokenStream stream = factory.create(tokenizer);
|
||||
assertTokenStreamContents(stream, new String[] {"การ", "ที่", "ได้",
|
||||
"ต้อง", "แสดง", "ว่า", "งาน", "ดี"});
|
||||
}
|
||||
}
|
|
@@ -17,12 +17,19 @@
|
|||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
|
||||
|
||||
/**
|
||||
* @version $Id:$
|
||||
|
@ -35,46 +42,75 @@ public class TestTrimFilter extends BaseTokenTestCase {
|
|||
char[] ccc = "cCc".toCharArray();
|
||||
char[] whitespace = " ".toCharArray();
|
||||
char[] empty = "".toCharArray();
|
||||
TokenStream ts = new TrimFilter
|
||||
(new IterTokenStream(new Token(a, 0, a.length, 1, 5),
|
||||
TrimFilterFactory factory = new TrimFilterFactory();
|
||||
Map<String,String> args = new HashMap<String,String>();
|
||||
args.put("updateOffsets", "false");
|
||||
factory.init(args);
|
||||
TokenStream ts = factory.create(new IterTokenStream(new Token(a, 0, a.length, 1, 5),
|
||||
new Token(b, 0, b.length, 6, 10),
|
||||
new Token(ccc, 0, ccc.length, 11, 15),
|
||||
new Token(whitespace, 0, whitespace.length, 16, 20),
|
||||
new Token(empty, 0, empty.length, 21, 21)), false);
|
||||
new Token(empty, 0, empty.length, 21, 21)));
|
||||
|
||||
TermAttribute token;
|
||||
assertTrue(ts.incrementToken());
|
||||
token = (TermAttribute) ts.getAttribute(TermAttribute.class);
|
||||
assertEquals("a", new String(token.termBuffer(), 0, token.termLength()));
|
||||
assertTrue(ts.incrementToken());
|
||||
assertEquals("b", new String(token.termBuffer(), 0, token.termLength()));
|
||||
assertTrue(ts.incrementToken());
|
||||
assertEquals("cCc", new String(token.termBuffer(), 0, token.termLength()));
|
||||
assertTrue(ts.incrementToken());
|
||||
assertEquals("", new String(token.termBuffer(), 0, token.termLength()));
|
||||
assertTrue(ts.incrementToken());
|
||||
assertEquals("", new String(token.termBuffer(), 0, token.termLength()));
|
||||
assertFalse(ts.incrementToken());
|
||||
assertTokenStreamContents(ts, new String[] { "a", "b", "cCc", "", ""});
|
||||
|
||||
a = " a".toCharArray();
|
||||
b = "b ".toCharArray();
|
||||
ccc = " c ".toCharArray();
|
||||
whitespace = " ".toCharArray();
|
||||
ts = new TrimFilter(new IterTokenStream(
|
||||
factory = new TrimFilterFactory();
|
||||
args = new HashMap<String,String>();
|
||||
args.put("updateOffsets", "true");
|
||||
factory.init(args);
|
||||
ts = factory.create(new IterTokenStream(
|
||||
new Token(a, 0, a.length, 0, 2),
|
||||
new Token(b, 0, b.length, 0, 2),
|
||||
new Token(ccc, 0, ccc.length, 0, 3),
|
||||
new Token(whitespace, 0, whitespace.length, 0, 3)), true);
|
||||
new Token(whitespace, 0, whitespace.length, 0, 3)));
|
||||
|
||||
List<Token> expect = tokens("a,1,1,2 b,1,0,1 c,1,1,2 ,1,3,3");
|
||||
List<Token> real = getTokens(ts);
|
||||
for (Token t : expect) {
|
||||
System.out.println("TEST:" + t);
|
||||
}
|
||||
for (Token t : real) {
|
||||
System.out.println("REAL:" + t);
|
||||
}
|
||||
assertTokEqualOff(expect, real);
|
||||
assertTokenStreamContents(ts,
|
||||
new String[] { "a", "b", "c", "" },
|
||||
new int[] { 1, 0, 1, 3 },
|
||||
new int[] { 2, 1, 2, 3 },
|
||||
new int[] { 1, 1, 1, 1 });
|
||||
}
|
||||
|
||||
/**
|
||||
* @deprecated does not support custom attributes
|
||||
*/
|
||||
private static class IterTokenStream extends TokenStream {
|
||||
final Token tokens[];
|
||||
int index = 0;
|
||||
TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
|
||||
OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
|
||||
PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
|
||||
FlagsAttribute flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class);
|
||||
TypeAttribute typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
|
||||
PayloadAttribute payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
|
||||
|
||||
public IterTokenStream(Token... tokens) {
|
||||
super();
|
||||
this.tokens = tokens;
|
||||
}
|
||||
|
||||
public IterTokenStream(Collection<Token> tokens) {
|
||||
this(tokens.toArray(new Token[tokens.size()]));
|
||||
}
|
||||
|
||||
public boolean incrementToken() throws IOException {
|
||||
if (index >= tokens.length)
|
||||
return false;
|
||||
else {
|
||||
clearAttributes();
|
||||
Token token = tokens[index++];
|
||||
termAtt.setTermBuffer(token.term());
|
||||
offsetAtt.setOffset(token.startOffset(), token.endOffset());
|
||||
posIncAtt.setPositionIncrement(token.getPositionIncrement());
|
||||
flagsAtt.setFlags(token.getFlags());
|
||||
typeAtt.setType(token.type());
|
||||
payloadAtt.setPayload(token.getPayload());
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@@ -17,14 +17,14 @@
|
|||
|
||||
package org.apache.solr.analysis;
|
||||
|
||||
import org.apache.solr.util.AbstractSolrTestCase;
|
||||
import org.apache.lucene.analysis.Analyzer;
|
||||
import org.apache.lucene.analysis.CharArraySet;
|
||||
import org.apache.lucene.analysis.KeywordTokenizer;
|
||||
import org.apache.lucene.analysis.TokenFilter;
|
||||
import org.apache.lucene.analysis.TokenStream;
|
||||
import org.apache.lucene.analysis.Token;
|
||||
import org.apache.lucene.analysis.WhitespaceTokenizer;
|
||||
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
|
||||
import org.apache.lucene.analysis.miscellaneous.SingleTokenTokenStream;
|
||||
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
|
||||
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
|
||||
|
||||
|
@ -37,7 +37,7 @@ import java.util.HashSet;
|
|||
/**
|
||||
* New WordDelimiterFilter tests... most of the tests are in ConvertedLegacyTest
|
||||
*/
|
||||
public class TestWordDelimiterFilter extends AbstractSolrTestCase {
|
||||
public class TestWordDelimiterFilter extends BaseTokenTestCase {
|
||||
public String getSchemaFile() { return "solr/conf/schema.xml"; }
|
||||
public String getSolrConfigFile() { return "solr/conf/solrconfig.xml"; }
|
||||
|
||||
|
@ -144,148 +144,74 @@ public class TestWordDelimiterFilter extends AbstractSolrTestCase {
|
|||
// test that subwords and catenated subwords have
|
||||
// the correct offsets.
|
||||
WordDelimiterFilter wdf = new WordDelimiterFilter(
|
||||
new TokenStream() {
|
||||
Token t;
|
||||
public Token next() throws IOException {
|
||||
if (t!=null) return null;
|
||||
t = new Token("foo-bar", 5, 12); // actual
|
||||
return t;
|
||||
}
|
||||
},
|
||||
new SingleTokenTokenStream(new Token("foo-bar", 5, 12)),
|
||||
1,1,0,0,1,1,0);
|
||||
|
||||
int i=0;
|
||||
for(Token t; (t=wdf.next())!=null;) {
|
||||
String termText = new String(t.termBuffer(), 0, t.termLength());
|
||||
if (termText.equals("foo")) {
|
||||
assertEquals(5, t.startOffset());
|
||||
assertEquals(8, t.endOffset());
|
||||
i++;
|
||||
}
|
||||
if (termText.equals("bar")) {
|
||||
assertEquals(9, t.startOffset());
|
||||
assertEquals(12, t.endOffset());
|
||||
i++;
|
||||
}
|
||||
if (termText.equals("foobar")) {
|
||||
assertEquals(5, t.startOffset());
|
||||
assertEquals(12, t.endOffset());
|
||||
i++;
|
||||
}
|
||||
}
|
||||
assertEquals(3,i); // make sure all 3 tokens were generated
|
||||
assertTokenStreamContents(wdf,
|
||||
new String[] { "foo", "bar", "foobar" },
|
||||
new int[] { 5, 9, 5 },
|
||||
new int[] { 8, 12, 12 });
|
||||
|
||||
// test that if splitting or catenating a synonym, that the offsets
|
||||
// are not altered (they would be incorrect).
|
||||
wdf = new WordDelimiterFilter(
|
||||
new TokenStream() {
|
||||
Token t;
|
||||
public Token next() throws IOException {
|
||||
if (t!=null) return null;
|
||||
t = new Token("foo-bar", 5, 6); // a synonym
|
||||
return t;
|
||||
}
|
||||
},
|
||||
new SingleTokenTokenStream(new Token("foo-bar", 5, 6)),
|
||||
1,1,0,0,1,1,0);
|
||||
for(Token t; (t=wdf.next())!=null;) {
|
||||
assertEquals(5, t.startOffset());
|
||||
assertEquals(6, t.endOffset());
|
||||
}
|
||||
|
||||
assertTokenStreamContents(wdf,
|
||||
new String[] { "foo", "bar", "foobar" },
|
||||
new int[] { 5, 5, 5 },
|
||||
new int[] { 6, 6, 6 });
|
||||
}
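The anonymous single-token streams removed above are replaced by Lucene's SingleTokenTokenStream, which simply plays back one preconstructed Token. A minimal sketch of that replacement together with the new-style assertion (illustrative values, not part of the patch):

// one token "foo-bar" covering offsets 5..12, after which the stream is exhausted
TokenStream source = new SingleTokenTokenStream(new Token("foo-bar", 5, 12));
assertTokenStreamContents(source,
    new String[] { "foo-bar" },
    new int[] { 5 },
    new int[] { 12 });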
|
||||
|
||||
public void testOffsetChange() throws Exception
|
||||
{
|
||||
WordDelimiterFilter wdf = new WordDelimiterFilter(
|
||||
new TokenStream() {
|
||||
Token t;
|
||||
public Token next() {
|
||||
if (t != null) return null;
|
||||
t = new Token("übelkeit)", 7, 16);
|
||||
return t;
|
||||
}
|
||||
},
|
||||
new SingleTokenTokenStream(new Token("übelkeit)", 7, 16)),
|
||||
1,1,0,0,1,1,0
|
||||
);
|
||||
|
||||
Token t = wdf.next();
|
||||
|
||||
assertNotNull(t);
|
||||
assertEquals("übelkeit", t.term());
|
||||
assertEquals(7, t.startOffset());
|
||||
assertEquals(15, t.endOffset());
|
||||
assertTokenStreamContents(wdf,
|
||||
new String[] { "übelkeit" },
|
||||
new int[] { 7 },
|
||||
new int[] { 15 });
|
||||
}
|
||||
|
||||
public void testOffsetChange2() throws Exception
|
||||
{
|
||||
WordDelimiterFilter wdf = new WordDelimiterFilter(
|
||||
new TokenStream() {
|
||||
Token t;
|
||||
public Token next() {
|
||||
if (t != null) return null;
|
||||
t = new Token("(übelkeit", 7, 17);
|
||||
return t;
|
||||
}
|
||||
},
|
||||
new SingleTokenTokenStream(new Token("(übelkeit", 7, 17)),
|
||||
1,1,0,0,1,1,0
|
||||
);
|
||||
|
||||
Token t = wdf.next();
|
||||
|
||||
assertNotNull(t);
|
||||
assertEquals("übelkeit", t.term());
|
||||
assertEquals(8, t.startOffset());
|
||||
assertEquals(17, t.endOffset());
|
||||
assertTokenStreamContents(wdf,
|
||||
new String[] { "übelkeit" },
|
||||
new int[] { 8 },
|
||||
new int[] { 17 });
|
||||
}
|
||||
|
||||
public void testOffsetChange3() throws Exception
|
||||
{
|
||||
WordDelimiterFilter wdf = new WordDelimiterFilter(
|
||||
new TokenStream() {
|
||||
Token t;
|
||||
public Token next() {
|
||||
if (t != null) return null;
|
||||
t = new Token("(übelkeit", 7, 16);
|
||||
return t;
|
||||
}
|
||||
},
|
||||
new SingleTokenTokenStream(new Token("(übelkeit", 7, 16)),
|
||||
1,1,0,0,1,1,0
|
||||
);
|
||||
|
||||
Token t = wdf.next();
|
||||
|
||||
assertNotNull(t);
|
||||
assertEquals("übelkeit", t.term());
|
||||
assertEquals(8, t.startOffset());
|
||||
assertEquals(16, t.endOffset());
|
||||
assertTokenStreamContents(wdf,
|
||||
new String[] { "übelkeit" },
|
||||
new int[] { 8 },
|
||||
new int[] { 16 });
|
||||
}
|
||||
|
||||
public void testOffsetChange4() throws Exception
|
||||
{
|
||||
WordDelimiterFilter wdf = new WordDelimiterFilter(
|
||||
new TokenStream() {
|
||||
private Token t;
|
||||
public Token next() {
|
||||
if (t != null) return null;
|
||||
t = new Token("(foo,bar)", 7, 16);
|
||||
return t;
|
||||
}
|
||||
},
|
||||
new SingleTokenTokenStream(new Token("(foo,bar)", 7, 16)),
|
||||
1,1,0,0,1,1,0
|
||||
);
|
||||
|
||||
Token t = wdf.next();
|
||||
|
||||
assertNotNull(t);
|
||||
assertEquals("foo", t.term());
|
||||
assertEquals(8, t.startOffset());
|
||||
assertEquals(11, t.endOffset());
|
||||
|
||||
t = wdf.next();
|
||||
|
||||
assertNotNull(t);
|
||||
assertEquals("bar", t.term());
|
||||
assertEquals(12, t.startOffset());
|
||||
assertEquals(15, t.endOffset());
|
||||
assertTokenStreamContents(wdf,
|
||||
new String[] { "foo", "bar", "foobar"},
|
||||
new int[] { 8, 12, 8 },
|
||||
new int[] { 11, 15, 15 });
|
||||
}
|
||||
|
||||
public void testAlphaNumericWords(){
|
||||
|
@ -338,24 +264,10 @@ public class TestWordDelimiterFilter extends AbstractSolrTestCase {
|
|||
|
||||
|
||||
public void doSplit(final String input, String... output) throws Exception {
|
||||
WordDelimiterFilter wdf = new WordDelimiterFilter(new TokenStream() {
|
||||
boolean done=false;
|
||||
@Override
|
||||
public Token next() throws IOException {
|
||||
if (done) return null;
|
||||
done = true;
|
||||
return new Token(input,0,input.length());
|
||||
}
|
||||
}
|
||||
,1,1,0,0,0
|
||||
);
|
||||
WordDelimiterFilter wdf = new WordDelimiterFilter(new KeywordTokenizer(
|
||||
new StringReader(input)), 1, 1, 0, 0, 0);
|
||||
|
||||
for(String expected : output) {
|
||||
Token t = wdf.next();
|
||||
assertEquals(expected, t.term());
|
||||
}
|
||||
|
||||
assertEquals(null, wdf.next());
|
||||
assertTokenStreamContents(wdf, output);
|
||||
}
|
||||
|
||||
public void testSplits() throws Exception {
|
||||
|
@ -365,29 +277,38 @@ public class TestWordDelimiterFilter extends AbstractSolrTestCase {
|
|||
// non-space marking symbol shouldn't cause split
|
||||
// this is an example in Thai
|
||||
doSplit("\u0e1a\u0e49\u0e32\u0e19","\u0e1a\u0e49\u0e32\u0e19");
|
||||
// possessive followed by delimiter
|
||||
doSplit("test's'", "test");
|
||||
|
||||
// some russian upper and lowercase
|
||||
doSplit("Роберт", "Роберт");
|
||||
// now cause a split (russian camelCase)
|
||||
doSplit("РобЕрт", "Роб", "Ерт");
|
||||
|
||||
// a composed titlecase character, don't split
|
||||
doSplit("aDžungla", "aDžungla");
|
||||
|
||||
// a modifier letter, don't split
|
||||
doSplit("ســـــــــــــــــلام", "ســـــــــــــــــلام");
|
||||
|
||||
// enclosing mark, don't split
|
||||
doSplit("۞test", "۞test");
|
||||
|
||||
// combining spacing mark (the virama), don't split
|
||||
doSplit("हिन्दी", "हिन्दी");
|
||||
|
||||
// don't split non-ascii digits
|
||||
doSplit("١٢٣٤", "١٢٣٤");
|
||||
|
||||
// don't split supplementaries into unpaired surrogates
|
||||
doSplit("𠀀𠀀", "𠀀𠀀");
|
||||
}
|
||||
|
||||
public void doSplitPossessive(int stemPossessive, final String input, final String... output) throws Exception {
|
||||
WordDelimiterFilter wdf = new WordDelimiterFilter(new TokenStream() {
|
||||
boolean done=false;
|
||||
@Override
|
||||
public Token next() throws IOException {
|
||||
if (done) return null;
|
||||
done = true;
|
||||
return new Token(input,0,input.length());
|
||||
}
|
||||
}
|
||||
,1,1,0,0,0,1,0,1,stemPossessive,null
|
||||
);
|
||||
WordDelimiterFilter wdf = new WordDelimiterFilter(new KeywordTokenizer(
|
||||
new StringReader(input)), 1,1,0,0,0,1,0,1,stemPossessive, null);
|
||||
|
||||
for(String expected : output) {
|
||||
Token t = wdf.next();
|
||||
assertEquals(expected, t.term());
|
||||
}
|
||||
|
||||
assertEquals(null, wdf.next());
|
||||
assertTokenStreamContents(wdf, output);
|
||||
}
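Both doSplit and doSplitPossessive above now build their input with KeywordTokenizer, which returns the entire reader contents as a single token, so it is a drop-in source for these single-input tests. A small sketch of that behaviour on its own (not part of the patch):

// the whole input string comes back as exactly one token
TokenStream single = new KeywordTokenizer(new StringReader("test's'"));
assertTokenStreamContents(single, new String[] { "test's'" });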
|
||||
|
||||
/*
|
||||
|
@ -485,25 +406,4 @@ public class TestWordDelimiterFilter extends AbstractSolrTestCase {
|
|||
new int[] { 6, 14, 19 },
|
||||
new int[] { 1, 11, 1 });
|
||||
}
|
||||
|
||||
private void assertAnalyzesTo(Analyzer a, String input, String[] output,
|
||||
int startOffsets[], int endOffsets[], int posIncs[]) throws Exception {
|
||||
|
||||
TokenStream ts = a.tokenStream("dummy", new StringReader(input));
|
||||
TermAttribute termAtt = (TermAttribute) ts
|
||||
.getAttribute(TermAttribute.class);
|
||||
OffsetAttribute offsetAtt = (OffsetAttribute) ts
|
||||
.getAttribute(OffsetAttribute.class);
|
||||
PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) ts
|
||||
.getAttribute(PositionIncrementAttribute.class);
|
||||
for (int i = 0; i < output.length; i++) {
|
||||
assertTrue(ts.incrementToken());
|
||||
assertEquals(output[i], termAtt.term());
|
||||
assertEquals(startOffsets[i], offsetAtt.startOffset());
|
||||
assertEquals(endOffsets[i], offsetAtt.endOffset());
|
||||
assertEquals(posIncs[i], posIncAtt.getPositionIncrement());
|
||||
}
|
||||
assertFalse(ts.incrementToken());
|
||||
ts.close();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,19 @@
|
|||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
# A set of words for testing the DictionaryCompound factory
|
||||
soft
|
||||
ball
|
||||
team
|
|
@ -0,0 +1,24 @@
|
|||
# Licensed to the Apache Software Foundation (ASF) under one or more
|
||||
# contributor license agreements. See the NOTICE file distributed with
|
||||
# this work for additional information regarding copyright ownership.
|
||||
# The ASF licenses this file to You under the Apache License, Version 2.0
|
||||
# (the "License"); you may not use this file except in compliance with
|
||||
# the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
# A set of articles for testing the French Elision filter.
|
||||
# Requiring a text file is a bit weird here...
|
||||
l
|
||||
m
|
||||
t
|
||||
qu
|
||||
n
|
||||
s
|
||||
j
|