SOLR-1674: Improve analysis tests and cut over to new TokenStream API

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@892821 13f79535-47bb-0310-9956-ffa450edef68
Mark Robert Miller 2009-12-21 13:53:50 +00:00
parent 5be5c31bb0
commit b105beef66
47 changed files with 2418 additions and 912 deletions
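The heart of the change is the migration from the deprecated Token-based consumption (Token next()) to the attribute-based API (incrementToken() plus TermAttribute and friends) that the rewritten test helpers below rely on. The following is a minimal sketch of the two consumption styles, assuming the Lucene 2.9-era classes referenced throughout this diff; the class name TokenStreamApiSketch and the println consumers are illustrative only and not part of the commit.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

public class TokenStreamApiSketch {

  // Old style (removed by this commit): pull Token objects until next() returns null.
  static void consumeOldApi(TokenStream ts) throws IOException {
    for (Token t = ts.next(); t != null; t = ts.next()) {
      System.out.println(new String(t.termBuffer(), 0, t.termLength()));
    }
    ts.close();
  }

  // New style (adopted by this commit): look up attributes once,
  // then advance the stream with incrementToken().
  static void consumeNewApi(TokenStream ts) throws IOException {
    TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      System.out.println(termAtt.term());
    }
    ts.end();
    ts.close();
  }

  public static void main(String[] args) throws IOException {
    consumeNewApi(new WhitespaceTokenizer(new StringReader("cut over to the new api")));
  }
}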

View File

@ -175,6 +175,9 @@ Other Changes
* SOLR-1662: Added Javadocs in BufferedTokenStream and fixed incorrect cloning
in TestBufferedTokenStream (Robert Muir, Uwe Schindler via shalin)
* SOLR-1674: Improve analysis tests and cut over to new TokenStream API.
(Robert Muir via Mark Miller)
Build
----------------------

View File

@ -17,19 +17,21 @@
package org.apache.solr.analysis;
import org.apache.solr.core.SolrConfig;
import org.apache.solr.util.AbstractSolrTestCase;
import org.apache.solr.util.TestHarness;
import junit.framework.TestCase;
/**
*
*/
abstract public class AnalysisTestCase extends TestCase {
abstract public class AnalysisTestCase extends AbstractSolrTestCase {
protected SolrConfig solrConfig;
/** Creates a new instance of AnalysisTestCase */
public AnalysisTestCase() {
}
public String getSolrConfigFile() { return "solrconfig.xml"; }
public String getSchemaFile() { return "schema.xml"; }
public void setUp() throws Exception {
// if you override setUp or tearDown, you better call

View File

@ -18,174 +18,134 @@
package org.apache.solr.analysis;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.io.StringReader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import junit.framework.TestCase;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
/**
* General token testing helper functions
*/
public abstract class BaseTokenTestCase extends AnalysisTestCase
{
public static String tsToString(TokenStream in) throws IOException {
StringBuilder out = new StringBuilder();
Token t = in.next();
if (null != t)
out.append(new String(t.termBuffer(), 0, t.termLength()));
// some helpers to test Analyzers and TokenStreams:
// these are taken from Lucene's BaseTokenStreamTestCase
for (t = in.next(); null != t; t = in.next()) {
out.append(" ").append(new String(t.termBuffer(), 0, t.termLength()));
}
in.close();
return out.toString();
public static void assertTokenStreamContents(TokenStream ts, String[] output,
int startOffsets[], int endOffsets[], String types[], int posIncrements[])
throws IOException {
assertNotNull(output);
assertTrue("has TermAttribute", ts.hasAttribute(TermAttribute.class));
TermAttribute termAtt = (TermAttribute) ts
.getAttribute(TermAttribute.class);
OffsetAttribute offsetAtt = null;
if (startOffsets != null || endOffsets != null) {
assertTrue("has OffsetAttribute", ts.hasAttribute(OffsetAttribute.class));
offsetAtt = (OffsetAttribute) ts.getAttribute(OffsetAttribute.class);
}
public List<String> tok2str(Iterable<Token> tokLst) {
ArrayList<String> lst = new ArrayList<String>();
for ( Token t : tokLst ) {
lst.add( new String(t.termBuffer(), 0, t.termLength()));
}
return lst;
TypeAttribute typeAtt = null;
if (types != null) {
assertTrue("has TypeAttribute", ts.hasAttribute(TypeAttribute.class));
typeAtt = (TypeAttribute) ts.getAttribute(TypeAttribute.class);
}
public void assertTokEqual(List<Token> a, List<Token> b) {
assertTokEq(a,b,false);
assertTokEq(b,a,false);
PositionIncrementAttribute posIncrAtt = null;
if (posIncrements != null) {
assertTrue("has PositionIncrementAttribute", ts
.hasAttribute(PositionIncrementAttribute.class));
posIncrAtt = (PositionIncrementAttribute) ts
.getAttribute(PositionIncrementAttribute.class);
}
public void assertTokEqualOff(List<Token> a, List<Token> b) {
assertTokEq(a,b,true);
assertTokEq(b,a,true);
ts.reset();
for (int i = 0; i < output.length; i++) {
// extra safety to enforce that the state is not preserved and also
// assign bogus values
ts.clearAttributes();
termAtt.setTermBuffer("bogusTerm");
if (offsetAtt != null) offsetAtt.setOffset(14584724, 24683243);
if (typeAtt != null) typeAtt.setType("bogusType");
if (posIncrAtt != null) posIncrAtt.setPositionIncrement(45987657);
assertTrue("token " + i + " exists", ts.incrementToken());
assertEquals("term " + i, output[i], termAtt.term());
if (startOffsets != null) assertEquals("startOffset " + i,
startOffsets[i], offsetAtt.startOffset());
if (endOffsets != null) assertEquals("endOffset " + i, endOffsets[i],
offsetAtt.endOffset());
if (types != null) assertEquals("type " + i, types[i], typeAtt.type());
if (posIncrements != null) assertEquals("posIncrement " + i,
posIncrements[i], posIncrAtt.getPositionIncrement());
}
assertFalse("end of stream", ts.incrementToken());
ts.end();
ts.close();
}
private void assertTokEq(List<Token> a, List<Token> b, boolean checkOff) {
int pos=0;
for (Iterator iter = a.iterator(); iter.hasNext();) {
Token tok = (Token)iter.next();
pos += tok.getPositionIncrement();
if (!tokAt(b, new String(tok.termBuffer(), 0, tok.termLength()), pos
, checkOff ? tok.startOffset() : -1
, checkOff ? tok.endOffset() : -1
))
{
fail(a + "!=" + b);
}
}
public static void assertTokenStreamContents(TokenStream ts, String[] output)
throws IOException {
assertTokenStreamContents(ts, output, null, null, null, null);
}
public boolean tokAt(List<Token> lst, String val, int tokPos, int startOff, int endOff) {
int pos=0;
for (Iterator iter = lst.iterator(); iter.hasNext();) {
Token tok = (Token)iter.next();
pos += tok.getPositionIncrement();
if (pos==tokPos && new String(tok.termBuffer(), 0, tok.termLength()).equals(val)
&& (startOff==-1 || tok.startOffset()==startOff)
&& (endOff ==-1 || tok.endOffset() ==endOff )
)
{
return true;
}
}
return false;
public static void assertTokenStreamContents(TokenStream ts, String[] output,
String[] types) throws IOException {
assertTokenStreamContents(ts, output, null, null, types, null);
}
/***
* Return a list of tokens according to a test string format:
* a b c => returns List<Token> [a,b,c]
* a/b => tokens a and b share the same spot (b.positionIncrement=0)
* a,3/b/c => a,b,c all share same position (a.positionIncrement=3, b.positionIncrement=0, c.positionIncrement=0)
* a,1,10,11 => "a" with positionIncrement=1, startOffset=10, endOffset=11
*/
public List<Token> tokens(String str) {
String[] arr = str.split(" ");
List<Token> result = new ArrayList<Token>();
for (int i=0; i<arr.length; i++) {
String[] toks = arr[i].split("/");
String[] params = toks[0].split(",");
int posInc;
int start;
int end;
if (params.length > 1) {
posInc = Integer.parseInt(params[1]);
} else {
posInc = 1;
public static void assertTokenStreamContents(TokenStream ts, String[] output,
int[] posIncrements) throws IOException {
assertTokenStreamContents(ts, output, null, null, null, posIncrements);
}
if (params.length > 2) {
start = Integer.parseInt(params[2]);
} else {
start = 0;
public static void assertTokenStreamContents(TokenStream ts, String[] output,
int startOffsets[], int endOffsets[]) throws IOException {
assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, null);
}
if (params.length > 3) {
end = Integer.parseInt(params[3]);
} else {
end = start + params[0].length();
public static void assertTokenStreamContents(TokenStream ts, String[] output,
int startOffsets[], int endOffsets[], int[] posIncrements)
throws IOException {
assertTokenStreamContents(ts, output, startOffsets, endOffsets, null,
posIncrements);
}
Token t = new Token(params[0],start,end,"TEST");
t.setPositionIncrement(posInc);
result.add(t);
for (int j=1; j<toks.length; j++) {
t = new Token(toks[j],0,0,"TEST");
t.setPositionIncrement(0);
result.add(t);
}
}
return result;
public static void assertAnalyzesTo(Analyzer a, String input,
String[] output, int startOffsets[], int endOffsets[], String types[],
int posIncrements[]) throws IOException {
assertTokenStreamContents(a.tokenStream("dummy", new StringReader(input)),
output, startOffsets, endOffsets, types, posIncrements);
}
//------------------------------------------------------------------------
// These may be useful beyond test cases...
//------------------------------------------------------------------------
static List<Token> getTokens(TokenStream tstream) throws IOException {
List<Token> tokens = new ArrayList<Token>();
while (true) {
Token t = tstream.next();
if (t==null) break;
tokens.add(t);
}
return tokens;
public static void assertAnalyzesTo(Analyzer a, String input, String[] output)
throws IOException {
assertAnalyzesTo(a, input, output, null, null, null, null);
}
public static class IterTokenStream extends TokenStream {
Iterator<Token> toks;
public IterTokenStream(Token... toks) {
this.toks = Arrays.asList(toks).iterator();
public static void assertAnalyzesTo(Analyzer a, String input,
String[] output, String[] types) throws IOException {
assertAnalyzesTo(a, input, output, null, null, types, null);
}
public IterTokenStream(Iterable<Token> toks) {
this.toks = toks.iterator();
public static void assertAnalyzesTo(Analyzer a, String input,
String[] output, int[] posIncrements) throws IOException {
assertAnalyzesTo(a, input, output, null, null, null, posIncrements);
}
public IterTokenStream(Iterator<Token> toks) {
this.toks = toks;
}
public IterTokenStream(String ... text) {
int off = 0;
ArrayList<Token> t = new ArrayList<Token>( text.length );
for( String txt : text ) {
t.add( new Token( txt, off, off+txt.length() ) );
off += txt.length() + 2;
}
this.toks = t.iterator();
}
@Override
public Token next() {
if (toks.hasNext()) {
return toks.next();
}
return null;
public static void assertAnalyzesTo(Analyzer a, String input,
String[] output, int startOffsets[], int endOffsets[]) throws IOException {
assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null, null);
}
public static void assertAnalyzesTo(Analyzer a, String input,
String[] output, int startOffsets[], int endOffsets[], int[] posIncrements)
throws IOException {
assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null,
posIncrements);
}
}
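As a usage note: a test built on these helpers simply constructs a TokenStream and passes the expected terms (and optionally offsets, types, or position increments) to the assertions. The subclass below is a hypothetical illustration only, assuming the solrconfig.xml/schema.xml defaults inherited from AnalysisTestCase are available on the test classpath; it is not part of this commit.

package org.apache.solr.analysis;

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;

// Hypothetical example of using the new assertion helpers; not part of this commit.
public class ExampleTokenTest extends BaseTokenTestCase {

  public void testWhitespaceTokens() throws Exception {
    // Terms only.
    TokenStream ts = new WhitespaceTokenizer(new StringReader("quick brown fox"));
    assertTokenStreamContents(ts, new String[] { "quick", "brown", "fox" });

    // Terms plus start/end offsets.
    ts = new WhitespaceTokenizer(new StringReader("quick brown fox"));
    assertTokenStreamContents(ts,
        new String[] { "quick", "brown", "fox" },
        new int[] { 0, 6, 12 },
        new int[] { 5, 11, 15 });
  }
}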

View File

@ -17,9 +17,13 @@ package org.apache.solr.analysis;
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.solr.util.AbstractSolrTestCase;
import org.apache.solr.common.ResourceLoader;
import java.io.StringReader;
import java.util.Set;
import java.util.Map;
import java.util.HashMap;
@ -29,7 +33,7 @@ import java.util.HashMap;
* used by the StopFilterFactoryTest TODO: consider creating separate test files
* so this won't break if stop filter test files change
**/
public class CommonGramsFilterFactoryTest extends AbstractSolrTestCase {
public class CommonGramsFilterFactoryTest extends BaseTokenTestCase {
public String getSchemaFile() {
return "schema-stop-keep.xml";
}
@ -66,4 +70,23 @@ public class CommonGramsFilterFactoryTest extends AbstractSolrTestCase {
.isIgnoreCase() == true);
}
/**
* If no words are provided, then a set of English default stopwords is used.
*/
public void testDefaults() throws Exception {
ResourceLoader loader = solrConfig.getResourceLoader();
assertTrue("loader is null and it shouldn't be", loader != null);
CommonGramsFilterFactory factory = new CommonGramsFilterFactory();
Map<String, String> args = new HashMap<String, String>();
factory.init(args);
factory.inform(loader);
Set words = factory.getCommonWords();
assertTrue("words is null and it shouldn't be", words != null);
assertTrue(words.contains("the"));
Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader("testing the factory"));
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream,
new String[] { "testing", "testing_the", "the", "the_factory", "factory" });
}
}

View File

@ -16,29 +16,20 @@
*/
package org.apache.solr.analysis;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.Map.Entry;
import junit.framework.TestCase;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.solr.analysis.TestBufferedTokenStream.AB_AAB_Stream;
/**
* Tests CommonGramsQueryFilter
*/
public class CommonGramsFilterTest extends TestCase {
public class CommonGramsFilterTest extends BaseTokenTestCase {
private static final String[] commonWords = { "s", "a", "b", "c", "d", "the",
"of" };
@ -63,18 +54,6 @@ public class CommonGramsFilterTest extends TestCase {
assertEquals("How", term.term());
}
public void testCommonGramsQueryFilter() throws Exception {
Set<Map.Entry<String, String>> input2expectedSet = initQueryMap().entrySet();
for (Iterator<Entry<String, String>> i = input2expectedSet.iterator(); i
.hasNext();) {
Map.Entry<String, String> me = i.next();
String input = me.getKey();
String expected = me.getValue();
String message = "message: input value is: " + input;
assertEquals(message, expected, testFilter(input, "query"));
}
}
public void testQueryReset() throws Exception {
final String input = "How the s a brown s cow d like A B thing?";
WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
@ -93,18 +72,6 @@ public class CommonGramsFilterTest extends TestCase {
assertEquals("How_the", term.term());
}
public void testCommonGramsFilter() throws Exception {
Set<Map.Entry<String, String>> input2expectedSet = initMap().entrySet();
for (Iterator<Entry<String, String>> i = input2expectedSet.iterator(); i
.hasNext();) {
Map.Entry<String, String> me = i.next();
String input = me.getKey();
String expected = me.getValue();
String message = "message: input value is: " + input;
assertEquals(message, expected, testFilter(input, "common"));
}
}
/**
* This is for testing CommonGramsQueryFilter which outputs a set of tokens
* optimized for querying with only one token at each position, either a
@ -116,150 +83,226 @@ public class CommonGramsFilterTest extends TestCase {
*
* @return Map<String,String>
*/
private static Map<String, String> initQueryMap() {
Map<String, String> input2expected = new LinkedHashMap<String, String>();
public void testCommonGramsQueryFilter() throws Exception {
Analyzer a = new Analyzer() {
@Override
public TokenStream tokenStream(String field, Reader in) {
return new CommonGramsQueryFilter(new CommonGramsFilter(
new WhitespaceTokenizer(in), commonWords));
}
};
// Stop words used below are "of" "the" and "s"
// two word queries
input2expected.put("brown fox", "/brown/fox");
input2expected.put("the fox", "/the_fox");
input2expected.put("fox of", "/fox_of");
input2expected.put("of the", "/of_the");
assertAnalyzesTo(a, "brown fox",
new String[] { "brown", "fox" });
assertAnalyzesTo(a, "the fox",
new String[] { "the_fox" });
assertAnalyzesTo(a, "fox of",
new String[] { "fox_of" });
assertAnalyzesTo(a, "of the",
new String[] { "of_the" });
// one word queries
input2expected.put("the", "/the");
input2expected.put("foo", "/foo");
assertAnalyzesTo(a, "the",
new String[] { "the" });
assertAnalyzesTo(a, "foo",
new String[] { "foo" });
// 3 word combinations s=stopword/common word n=not a stop word
input2expected.put("n n n", "/n/n/n");
input2expected.put("quick brown fox", "/quick/brown/fox");
assertAnalyzesTo(a, "n n n",
new String[] { "n", "n", "n" });
assertAnalyzesTo(a, "quick brown fox",
new String[] { "quick", "brown", "fox" });
input2expected.put("n n s", "/n/n_s");
input2expected.put("quick brown the", "/quick/brown_the");
assertAnalyzesTo(a, "n n s",
new String[] { "n", "n_s" });
assertAnalyzesTo(a, "quick brown the",
new String[] { "quick", "brown_the" });
input2expected.put("n s n", "/n_s/s_n");
input2expected.put("quick the brown", "/quick_the/the_brown");
assertAnalyzesTo(a, "n s n",
new String[] { "n_s", "s_n" });
assertAnalyzesTo(a, "quick the brown",
new String[] { "quick_the", "the_brown" });
input2expected.put("n s s", "/n_s/s_s");
input2expected.put("fox of the", "/fox_of/of_the");
assertAnalyzesTo(a, "n s s",
new String[] { "n_s", "s_s" });
assertAnalyzesTo(a, "fox of the",
new String[] { "fox_of", "of_the" });
input2expected.put("s n n", "/s_n/n/n");
input2expected.put("the quick brown", "/the_quick/quick/brown");
assertAnalyzesTo(a, "s n n",
new String[] { "s_n", "n", "n" });
assertAnalyzesTo(a, "the quick brown",
new String[] { "the_quick", "quick", "brown" });
input2expected.put("s n s", "/s_n/n_s");
input2expected.put("the fox of", "/the_fox/fox_of");
assertAnalyzesTo(a, "s n s",
new String[] { "s_n", "n_s" });
assertAnalyzesTo(a, "the fox of",
new String[] { "the_fox", "fox_of" });
input2expected.put("s s n", "/s_s/s_n");
input2expected.put("of the fox", "/of_the/the_fox");
assertAnalyzesTo(a, "s s n",
new String[] { "s_s", "s_n" });
assertAnalyzesTo(a, "of the fox",
new String[] { "of_the", "the_fox" });
input2expected.put("s s s", "/s_s/s_s");
input2expected.put("of the of", "/of_the/the_of");
return input2expected;
assertAnalyzesTo(a, "s s s",
new String[] { "s_s", "s_s" });
assertAnalyzesTo(a, "of the of",
new String[] { "of_the", "the_of" });
}
private static Map<String, String> initMap() {
Map<String, String> input2expected = new HashMap<String, String>();
public void testCommonGramsFilter() throws Exception {
Analyzer a = new Analyzer() {
@Override
public TokenStream tokenStream(String field, Reader in) {
return new CommonGramsFilter(
new WhitespaceTokenizer(in), commonWords);
}
};
// Stop words used below are "of" "the" and "s"
// one word queries
input2expected.put("the", "/the");
input2expected.put("foo", "/foo");
assertAnalyzesTo(a, "the", new String[] { "the" });
assertAnalyzesTo(a, "foo", new String[] { "foo" });
// two word queries
input2expected.put("brown fox", "/brown/fox");
input2expected.put("the fox", "/the,the_fox/fox");
input2expected.put("fox of", "/fox,fox_of/of");
input2expected.put("of the", "/of,of_the/the");
assertAnalyzesTo(a, "brown fox",
new String[] { "brown", "fox" },
new int[] { 1, 1 });
assertAnalyzesTo(a, "the fox",
new String[] { "the", "the_fox", "fox" },
new int[] { 1, 0, 1 });
assertAnalyzesTo(a, "fox of",
new String[] { "fox", "fox_of", "of" },
new int[] { 1, 0, 1 });
assertAnalyzesTo(a, "of the",
new String[] { "of", "of_the", "the" },
new int[] { 1, 0, 1 });
// 3 word combinations s=stopword/common word n=not a stop word
input2expected.put("n n n", "/n/n/n");
input2expected.put("quick brown fox", "/quick/brown/fox");
assertAnalyzesTo(a, "n n n",
new String[] { "n", "n", "n" },
new int[] { 1, 1, 1 });
assertAnalyzesTo(a, "quick brown fox",
new String[] { "quick", "brown", "fox" },
new int[] { 1, 1, 1 });
input2expected.put("n n s", "/n/n,n_s/s");
input2expected.put("quick brown the", "/quick/brown,brown_the/the");
assertAnalyzesTo(a, "n n s",
new String[] { "n", "n", "n_s", "s" },
new int[] { 1, 1, 0, 1 });
assertAnalyzesTo(a, "quick brown the",
new String[] { "quick", "brown", "brown_the", "the" },
new int[] { 1, 1, 0, 1 });
input2expected.put("n s n", "/n,n_s/s,s_n/n");
input2expected.put("quick the fox", "/quick,quick_the/the,the_fox/fox");
assertAnalyzesTo(a, "n s n",
new String[] { "n", "n_s", "s", "s_n", "n" },
new int[] { 1, 0, 1, 0, 1 });
assertAnalyzesTo(a, "quick the fox",
new String[] { "quick", "quick_the", "the", "the_fox", "fox" },
new int[] { 1, 0, 1, 0, 1 });
input2expected.put("n s s", "/n,n_s/s,s_s/s");
input2expected.put("fox of the", "/fox,fox_of/of,of_the/the");
assertAnalyzesTo(a, "n s s",
new String[] { "n", "n_s", "s", "s_s", "s" },
new int[] { 1, 0, 1, 0, 1 });
assertAnalyzesTo(a, "fox of the",
new String[] { "fox", "fox_of", "of", "of_the", "the" },
new int[] { 1, 0, 1, 0, 1 });
input2expected.put("s n n", "/s,s_n/n/n");
input2expected.put("the quick brown", "/the,the_quick/quick/brown");
assertAnalyzesTo(a, "s n n",
new String[] { "s", "s_n", "n", "n" },
new int[] { 1, 0, 1, 1 });
assertAnalyzesTo(a, "the quick brown",
new String[] { "the", "the_quick", "quick", "brown" },
new int[] { 1, 0, 1, 1 });
input2expected.put("s n s", "/s,s_n/n,n_s/s");
input2expected.put("the fox of", "/the,the_fox/fox,fox_of/of");
assertAnalyzesTo(a, "s n s",
new String[] { "s", "s_n", "n", "n_s", "s" },
new int[] { 1, 0, 1, 0, 1 });
assertAnalyzesTo(a, "the fox of",
new String[] { "the", "the_fox", "fox", "fox_of", "of" },
new int[] { 1, 0, 1, 0, 1 });
input2expected.put("s s n", "/s,s_s/s,s_n/n");
input2expected.put("of the fox", "/of,of_the/the,the_fox/fox");
assertAnalyzesTo(a, "s s n",
new String[] { "s", "s_s", "s", "s_n", "n" },
new int[] { 1, 0, 1, 0, 1 });
assertAnalyzesTo(a, "of the fox",
new String[] { "of", "of_the", "the", "the_fox", "fox" },
new int[] { 1, 0, 1, 0, 1 });
input2expected.put("s s s", "/s,s_s/s,s_s/s");
input2expected.put("of the of", "/of,of_the/the,the_of/of");
return input2expected;
assertAnalyzesTo(a, "s s s",
new String[] { "s", "s_s", "s", "s_s", "s" },
new int[] { 1, 0, 1, 0, 1 });
assertAnalyzesTo(a, "of the of",
new String[] { "of", "of_the", "the", "the_of", "of" },
new int[] { 1, 0, 1, 0, 1 });
}
/*
* Helper methods copied from CDL XTF BigramsStopFilter.java and slightly
* modified to use with CommonGrams http://xtf.wiki.sourceforge.net/
*/
/**
* Very simple tokenizer that breaks up a string into a series of Lucene
* {@link Token Token}s.
* Test that CommonGramsFilter works correctly in case-insensitive mode
*/
static class StringTokenStream extends TokenStream {
private String str;
private int prevEnd = 0;
private StringTokenizer tok;
private int count = 0;
public StringTokenStream(String str, String delim) {
this.str = str;
tok = new StringTokenizer(str, delim);
public void testCaseSensitive() throws Exception {
final String input = "How The s a brown s cow d like A B thing?";
WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
Set common = CommonGramsFilter.makeCommonSet(commonWords);
TokenFilter cgf = new CommonGramsFilter(wt, common, false);
assertTokenStreamContents(cgf, new String[] {"How", "The", "The_s", "s",
"s_a", "a", "a_brown", "brown", "brown_s", "s", "s_cow", "cow",
"cow_d", "d", "d_like", "like", "A", "B", "thing?"});
}
public Token next() {
if (!tok.hasMoreTokens())
return null;
count++;
String term = tok.nextToken();
Token t = new Token(term, str.indexOf(term, prevEnd), str.indexOf(term,
prevEnd)
+ term.length(), "word");
prevEnd = t.endOffset();
return t;
}
/**
* Test CommonGramsQueryFilter in the case that the last word is a stopword
*/
public void testLastWordisStopWord() throws Exception {
final String input = "dog the";
WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
TokenFilter nsf = new CommonGramsQueryFilter(cgf);
assertTokenStreamContents(nsf, new String[] { "dog_the" });
}
public static String testFilter(String in, String type) throws IOException {
TokenStream nsf;
StringTokenStream ts = new StringTokenStream(in, " .");
if (type.equals("query")) {
CommonGramsFilter cgf = new CommonGramsFilter(ts, commonWords);
nsf = new CommonGramsQueryFilter(cgf);
} else {
nsf = new CommonGramsFilter(ts, commonWords);
/**
* Test CommonGramsQueryFilter in the case that the first word is a stopword
*/
public void testFirstWordisStopWord() throws Exception {
final String input = "the dog";
WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
TokenFilter nsf = new CommonGramsQueryFilter(cgf);
assertTokenStreamContents(nsf, new String[] { "the_dog" });
}
StringBuffer outBuf = new StringBuffer();
while (true) {
Token t = nsf.next();
if (t == null)
break;
for (int i = 0; i < t.getPositionIncrement(); i++)
outBuf.append('/');
if (t.getPositionIncrement() == 0)
outBuf.append(',');
outBuf.append(t.term());
/**
* Test CommonGramsQueryFilter in the case of a single (stop)word query
*/
public void testOneWordQueryStopWord() throws Exception {
final String input = "the";
WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
TokenFilter nsf = new CommonGramsQueryFilter(cgf);
assertTokenStreamContents(nsf, new String[] { "the" });
}
String out = outBuf.toString();
out = out.replaceAll(" ", "");
return out;
/**
* Test CommonGramsQueryFilter in the case of a single word query
*/
public void testOneWordQuery() throws Exception {
final String input = "monster";
WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
TokenFilter nsf = new CommonGramsQueryFilter(cgf);
assertTokenStreamContents(nsf, new String[] { "monster" });
}
/**
* Test CommonGramsQueryFilter when first and last words are stopwords.
*/
public void testFirstAndLastStopWord() throws Exception {
final String input = "the of";
WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
TokenFilter nsf = new CommonGramsQueryFilter(cgf);
assertTokenStreamContents(nsf, new String[] { "the_of" });
}
}

View File

@ -16,9 +16,12 @@
*/
package org.apache.solr.analysis;
import org.apache.solr.util.AbstractSolrTestCase;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.solr.common.ResourceLoader;
import java.io.StringReader;
import java.util.Set;
import java.util.Map;
import java.util.HashMap;
@ -28,7 +31,7 @@ import java.util.HashMap;
* used by the StopFilterFactoryTest TODO: consider creating separate test files
* so this won't break if stop filter test files change
**/
public class CommonGramsQueryFilterFactoryTest extends AbstractSolrTestCase {
public class CommonGramsQueryFilterFactoryTest extends BaseTokenTestCase {
public String getSchemaFile() {
return "schema-stop-keep.xml";
}
@ -65,4 +68,23 @@ public class CommonGramsQueryFilterFactoryTest extends AbstractSolrTestCase {
.isIgnoreCase() == true);
}
/**
* If no words are provided, then a set of English default stopwords is used.
*/
public void testDefaults() throws Exception {
ResourceLoader loader = solrConfig.getResourceLoader();
assertTrue("loader is null and it shouldn't be", loader != null);
CommonGramsQueryFilterFactory factory = new CommonGramsQueryFilterFactory();
Map<String, String> args = new HashMap<String, String>();
factory.init(args);
factory.inform(loader);
Set words = factory.getCommonWords();
assertTrue("words is null and it shouldn't be", words != null);
assertTrue(words.contains("the"));
Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader("testing the factory"));
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream,
new String[] { "testing_the", "the_factory" });
}
}

View File

@ -16,36 +16,24 @@
*/
package org.apache.solr.analysis;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import junit.framework.TestCase;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.solr.analysis.BaseTokenTestCase.IterTokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
public class DoubleMetaphoneFilterFactoryTest extends TestCase {
public class DoubleMetaphoneFilterFactoryTest extends BaseTokenTestCase {
public void testDefaults() throws Exception {
DoubleMetaphoneFilterFactory factory = new DoubleMetaphoneFilterFactory();
factory.init(new HashMap<String, String>());
TokenStream inputStream = new IterTokenStream("international");
TokenStream inputStream = new WhitespaceTokenizer(new StringReader("international"));
TokenStream filteredStream = factory.create(inputStream);
assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass());
Token token = filteredStream.next(new Token());
assertEquals(13, token.termLength());
assertEquals("international", new String(token.termBuffer(), 0, token
.termLength()));
token = filteredStream.next(new Token());
assertEquals(4, token.termLength());
assertEquals("ANTR", new String(token.termBuffer(), 0, token.termLength()));
assertNull(filteredStream.next(new Token()));
assertTokenStreamContents(filteredStream, new String[] { "international", "ANTR" });
}
public void testSettingSizeAndInject() throws Exception {
@ -55,17 +43,31 @@ public class DoubleMetaphoneFilterFactoryTest extends TestCase {
parameters.put("maxCodeLength", "8");
factory.init(parameters);
TokenStream inputStream = new IterTokenStream("international");
TokenStream inputStream = new WhitespaceTokenizer(new StringReader("international"));
TokenStream filteredStream = factory.create(inputStream);
assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass());
assertTokenStreamContents(filteredStream, new String[] { "ANTRNXNL" });
}
/**
* Ensure that reset() removes any state (buffered tokens)
*/
public void testReset() throws Exception {
DoubleMetaphoneFilterFactory factory = new DoubleMetaphoneFilterFactory();
factory.init(new HashMap<String, String>());
TokenStream inputStream = new WhitespaceTokenizer(new StringReader("international"));
TokenStream filteredStream = factory.create(inputStream);
TermAttribute termAtt = (TermAttribute) filteredStream.addAttribute(TermAttribute.class);
assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass());
Token token = filteredStream.next(new Token());
assertEquals(8, token.termLength());
assertEquals("ANTRNXNL", new String(token.termBuffer(), 0, token
.termLength()));
assertTrue(filteredStream.incrementToken());
assertEquals(13, termAtt.termLength());
assertEquals("international", termAtt.term());
filteredStream.reset();
assertNull(filteredStream.next(new Token()));
// ensure there are no more tokens, such as ANTRNXNL
assertFalse(filteredStream.incrementToken());
}
}

View File

@ -16,94 +16,52 @@
*/
package org.apache.solr.analysis;
import junit.framework.TestCase;
import java.io.StringReader;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.solr.analysis.BaseTokenTestCase.IterTokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
public class DoubleMetaphoneFilterTest extends TestCase {
public class DoubleMetaphoneFilterTest extends BaseTokenTestCase {
public void testSize4FalseInject() throws Exception {
TokenStream stream = new IterTokenStream("international");
TokenStream stream = new WhitespaceTokenizer(new StringReader("international"));
TokenStream filter = new DoubleMetaphoneFilter(stream, 4, false);
Token token = filter.next(new Token());
assertEquals(4, token.termLength());
assertEquals("ANTR", new String(token.termBuffer(), 0, token.termLength()));
assertNull(filter.next(new Token()));
assertTokenStreamContents(filter, new String[] { "ANTR" });
}
public void testSize4TrueInject() throws Exception {
TokenStream stream = new IterTokenStream("international");
TokenStream stream = new WhitespaceTokenizer(new StringReader("international"));
TokenStream filter = new DoubleMetaphoneFilter(stream, 4, true);
Token token = filter.next(new Token());
assertEquals(13, token.termLength());
assertEquals("international", new String(token.termBuffer(), 0, token
.termLength()));
token = filter.next(new Token());
assertEquals(4, token.termLength());
assertEquals("ANTR", new String(token.termBuffer(), 0, token.termLength()));
assertNull(filter.next(new Token()));
assertTokenStreamContents(filter, new String[] { "international", "ANTR" });
}
public void testAlternateInjectFalse() throws Exception {
TokenStream stream = new IterTokenStream("Kuczewski");
TokenStream stream = new WhitespaceTokenizer(new StringReader("Kuczewski"));
TokenStream filter = new DoubleMetaphoneFilter(stream, 4, false);
Token token = filter.next(new Token());
assertEquals(4, token.termLength());
assertEquals("KSSK", new String(token.termBuffer(), 0, token.termLength()));
token = filter.next(new Token());
assertEquals(4, token.termLength());
assertEquals("KXFS", new String(token.termBuffer(), 0, token.termLength()));
assertNull(filter.next(new Token()));
assertTokenStreamContents(filter, new String[] { "KSSK", "KXFS" });
}
public void testSize8FalseInject() throws Exception {
TokenStream stream = new IterTokenStream("international");
TokenStream stream = new WhitespaceTokenizer(new StringReader("international"));
TokenStream filter = new DoubleMetaphoneFilter(stream, 8, false);
Token token = filter.next(new Token());
assertEquals(8, token.termLength());
assertEquals("ANTRNXNL", new String(token.termBuffer(), 0, token
.termLength()));
assertNull(filter.next(new Token()));
assertTokenStreamContents(filter, new String[] { "ANTRNXNL" });
}
public void testNonConvertableStringsWithInject() throws Exception {
TokenStream stream = new IterTokenStream(
new String[] { "12345", "#$%@#^%&" });
TokenStream stream = new WhitespaceTokenizer(new StringReader("12345 #$%@#^%&"));
TokenStream filter = new DoubleMetaphoneFilter(stream, 8, true);
Token token = filter.next(new Token());
assertEquals(5, token.termLength());
assertEquals("12345", new String(token.termBuffer(), 0, token.termLength()));
token = filter.next(new Token());
assertEquals(8, token.termLength());
assertEquals("#$%@#^%&", new String(token.termBuffer(), 0, token
.termLength()));
assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&" });
}
public void testNonConvertableStringsWithoutInject() throws Exception {
TokenStream stream = new IterTokenStream(
new String[] { "12345", "#$%@#^%&" });
TokenStream stream = new WhitespaceTokenizer(new StringReader("12345 #$%@#^%&"));
TokenStream filter = new DoubleMetaphoneFilter(stream, 8, false);
assertEquals("12345", filter.next(new Token()).term());
assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&" });
// should have something after the stream
stream = new IterTokenStream(
new String[] { "12345", "#$%@#^%&", "hello" });
stream = new WhitespaceTokenizer(new StringReader("12345 #$%@#^%& hello"));
filter = new DoubleMetaphoneFilter(stream, 8, false);
assertNotNull(filter.next(new Token()));
assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&", "HL" });
}
}

View File

@ -16,11 +16,17 @@ package org.apache.solr.analysis;
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.util.StrUtils;
import org.tartarus.snowball.ext.EnglishStemmer;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@ -32,11 +38,11 @@ public class EnglishPorterFilterFactoryTest extends BaseTokenTestCase {
public void test() throws IOException {
EnglishStemmer stemmer = new EnglishStemmer();
String[] test = {"The", "fledgling", "banks", "were", "counting", "on", "a", "big", "boom", "in", "banking"};
StringBuilder gold = new StringBuilder();
String[] gold = new String[test.length];
for (int i = 0; i < test.length; i++) {
stemmer.setCurrent(test[i]);
stemmer.stem();
gold.append(stemmer.getCurrent()).append(' ');
gold[i] = stemmer.getCurrent();
}
EnglishPorterFilterFactory factory = new EnglishPorterFilterFactory();
@ -44,21 +50,23 @@ public class EnglishPorterFilterFactoryTest extends BaseTokenTestCase {
factory.init(args);
factory.inform(new LinesMockSolrResourceLoader(new ArrayList<String>()));
String out = tsToString(factory.create(new IterTokenStream(test)));
assertEquals(gold.toString().trim(), out);
Tokenizer tokenizer = new WhitespaceTokenizer(
new StringReader(StrUtils.join(Arrays.asList(test), ' ')));
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream, gold);
}
public void testProtected() throws Exception {
EnglishStemmer stemmer = new EnglishStemmer();
String[] test = {"The", "fledgling", "banks", "were", "counting", "on", "a", "big", "boom", "in", "banking"};
StringBuilder gold = new StringBuilder();
String[] gold = new String[test.length];
for (int i = 0; i < test.length; i++) {
if (test[i].equals("fledgling") == false && test[i].equals("banks") == false) {
stemmer.setCurrent(test[i]);
stemmer.stem();
gold.append(stemmer.getCurrent()).append(' ');
gold[i] = stemmer.getCurrent();
} else {
gold.append(test[i]).append(' ');
gold[i] = test[i];
}
}
@ -69,8 +77,10 @@ public class EnglishPorterFilterFactoryTest extends BaseTokenTestCase {
List<String> lines = new ArrayList<String>();
Collections.addAll(lines, "banks", "fledgling");
factory.inform(new LinesMockSolrResourceLoader(lines));
String out = tsToString(factory.create(new IterTokenStream(test)));
assertEquals(gold.toString().trim(), out);
Tokenizer tokenizer = new WhitespaceTokenizer(
new StringReader(StrUtils.join(Arrays.asList(test), ' ')));
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream, gold);
}
class LinesMockSolrResourceLoader implements ResourceLoader {

View File

@ -17,9 +17,13 @@ package org.apache.solr.analysis;
*/
import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
public class LengthFilterTest extends BaseTokenTestCase {
public void test() throws IOException {
@ -28,9 +32,8 @@ public class LengthFilterTest extends BaseTokenTestCase {
args.put(LengthFilterFactory.MIN_KEY, String.valueOf(4));
args.put(LengthFilterFactory.MAX_KEY, String.valueOf(10));
factory.init(args);
String[] test = {"foo", "foobar", "super-duper-trooper"};
String gold = "foobar";
String out = tsToString(factory.create(new IterTokenStream(test)));
assertEquals(gold.toString(), out);
String test = "foo foobar super-duper-trooper";
TokenStream stream = factory.create(new WhitespaceTokenizer(new StringReader(test)));
assertTokenStreamContents(stream, new String[] { "foobar" });
}
}

View File

@ -16,11 +16,18 @@ package org.apache.solr.analysis;
* limitations under the License.
*/
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.solr.common.ResourceLoader;
import org.apache.solr.common.util.StrUtils;
import org.tartarus.snowball.ext.EnglishStemmer;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@ -32,11 +39,11 @@ public class SnowballPorterFilterFactoryTest extends BaseTokenTestCase {
public void test() throws IOException {
EnglishStemmer stemmer = new EnglishStemmer();
String[] test = {"The", "fledgling", "banks", "were", "counting", "on", "a", "big", "boom", "in", "banking"};
StringBuilder gold = new StringBuilder();
for (String aTest : test) {
stemmer.setCurrent(aTest);
String[] gold = new String[test.length];
for (int i = 0; i < test.length; i++) {
stemmer.setCurrent(test[i]);
stemmer.stem();
gold.append(stemmer.getCurrent()).append(' ');
gold[i] = stemmer.getCurrent();
}
SnowballPorterFilterFactory factory = new SnowballPorterFilterFactory();
@ -45,21 +52,27 @@ public class SnowballPorterFilterFactoryTest extends BaseTokenTestCase {
factory.init(args);
factory.inform(new LinesMockSolrResourceLoader(new ArrayList<String>()));
String out = tsToString(factory.create(new IterTokenStream(test)));
assertEquals(gold.toString().trim(), out);
Tokenizer tokenizer = new WhitespaceTokenizer(
new StringReader(StrUtils.join(Arrays.asList(test), ' ')));
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream, gold);
}
public void testProtected() throws Exception {
/**
* Tests the protected words mechanism of EnglishPorterFilterFactory
*/
@Deprecated
public void testProtectedOld() throws Exception {
EnglishStemmer stemmer = new EnglishStemmer();
String[] test = {"The", "fledgling", "banks", "were", "counting", "on", "a", "big", "boom", "in", "banking"};
StringBuilder gold = new StringBuilder();
String[] gold = new String[test.length];
for (int i = 0; i < test.length; i++) {
if (test[i].equals("fledgling") == false && test[i].equals("banks") == false) {
stemmer.setCurrent(test[i]);
stemmer.stem();
gold.append(stemmer.getCurrent()).append(' ');
gold[i] = stemmer.getCurrent();
} else {
gold.append(test[i]).append(' ');
gold[i] = test[i];
}
}
@ -70,8 +83,10 @@ public class SnowballPorterFilterFactoryTest extends BaseTokenTestCase {
List<String> lines = new ArrayList<String>();
Collections.addAll(lines, "banks", "fledgling");
factory.inform(new LinesMockSolrResourceLoader(lines));
String out = tsToString(factory.create(new IterTokenStream(test)));
assertEquals(gold.toString().trim(), out);
Tokenizer tokenizer = new WhitespaceTokenizer(
new StringReader(StrUtils.join(Arrays.asList(test), ' ')));
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream, gold);
}
class LinesMockSolrResourceLoader implements ResourceLoader {
@ -93,5 +108,22 @@ public class SnowballPorterFilterFactoryTest extends BaseTokenTestCase {
return null;
}
}
/**
* Test the protected words mechanism of SnowballPorterFilterFactory
*/
public void testProtected() throws Exception {
SnowballPorterFilterFactory factory = new SnowballPorterFilterFactory();
ResourceLoader loader = solrConfig.getResourceLoader();
Map<String,String> args = new HashMap<String,String>();
args.put("protected", "protwords.txt");
args.put("language", "English");
factory.init(args);
factory.inform(loader);
Reader reader = new StringReader("ridding of some stemming");
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "ridding", "of", "some", "stem" });
}
}

View File

@ -0,0 +1,65 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
/**
* Simple tests to ensure the Arabic filter Factories are working.
*/
public class TestArabicFilters extends BaseTokenTestCase {
/**
* Test ArabicLetterTokenizerFactory
*/
public void testTokenizer() throws Exception {
Reader reader = new StringReader("الذين مَلكت أيمانكم");
ArabicLetterTokenizerFactory factory = new ArabicLetterTokenizerFactory();
Tokenizer stream = factory.create(reader);
assertTokenStreamContents(stream, new String[] {"الذين", "مَلكت", "أيمانكم"});
}
/**
* Test ArabicNormalizationFilterFactory
*/
public void testNormalizer() throws Exception {
Reader reader = new StringReader("الذين مَلكت أيمانكم");
ArabicLetterTokenizerFactory factory = new ArabicLetterTokenizerFactory();
ArabicNormalizationFilterFactory filterFactory = new ArabicNormalizationFilterFactory();
Tokenizer tokenizer = factory.create(reader);
TokenStream stream = filterFactory.create(tokenizer);
assertTokenStreamContents(stream, new String[] {"الذين", "ملكت", "ايمانكم"});
}
/**
* Test ArabicStemFilterFactory
*/
public void testStemmer() throws Exception {
Reader reader = new StringReader("الذين مَلكت أيمانكم");
ArabicLetterTokenizerFactory factory = new ArabicLetterTokenizerFactory();
ArabicNormalizationFilterFactory normFactory = new ArabicNormalizationFilterFactory();
ArabicStemFilterFactory stemFactory = new ArabicStemFilterFactory();
Tokenizer tokenizer = factory.create(reader);
TokenStream stream = normFactory.create(tokenizer);
stream = stemFactory.create(stream);
assertTokenStreamContents(stream, new String[] {"ذين", "ملكت", "ايمانكم"});
}
}

View File

@ -0,0 +1,41 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
/**
* Simple tests to ensure the Brazilian stem filter factory is working.
*/
public class TestBrazilianStemFilterFactory extends BaseTokenTestCase {
/**
* Ensure the filter actually stems and normalizes text.
*/
public void testStemming() throws Exception {
Reader reader = new StringReader("Brasília");
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
BrazilianStemFilterFactory factory = new BrazilianStemFilterFactory();
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "brasil" });
}
}

View File

@ -60,9 +60,7 @@ public class TestBufferedTokenStream extends BaseTokenTestCase {
final String expected = "How now Q B brown A cow B like Q B thing?";
TokenStream ts = new AB_Q_Stream
(new WhitespaceTokenizer(new StringReader(input)));
final String actual = tsToString(ts);
//System.out.println(actual);
assertEquals(expected, actual);
assertTokenStreamContents(ts, expected.split("\\s"));
}
public void testABAAB() throws Exception {
@ -70,9 +68,7 @@ public class TestBufferedTokenStream extends BaseTokenTestCase {
final String expected = "How now A A B brown A cow B like A A B thing?";
TokenStream ts = new AB_AAB_Stream
(new WhitespaceTokenizer(new StringReader(input)));
final String actual = tsToString(ts);
//System.out.println(actual);
assertEquals(expected, actual);
assertTokenStreamContents(ts, expected.split("\\s"));
}
public void testReset() throws Exception {

View File

@ -0,0 +1,38 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
/**
* Simple tests to ensure the CJK tokenizer factory is working.
*/
public class TestCJKTokenizerFactory extends BaseTokenTestCase {
/**
* Ensure the tokenizer actually tokenizes CJK text correctly
*/
public void testTokenizer() throws Exception {
Reader reader = new StringReader("我是中国人");
CJKTokenizerFactory factory = new CJKTokenizerFactory();
TokenStream stream = factory.create(reader);
assertTokenStreamContents(stream, new String[] {"我是", "是中", "中国", "国人"});
}
}

View File

@ -17,14 +17,18 @@
package org.apache.solr.analysis;
import junit.framework.TestCase;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.KeywordTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
/**
* @version $Id$
*
*/
public class TestCapitalizationFilter extends BaseTokenTestCase {
@ -64,39 +68,46 @@ public class TestCapitalizationFilter extends BaseTokenTestCase {
factory.processWord(termBuffer, 0, termBuffer.length, 0 );
assertEquals( "BIG", new String(termBuffer, 0, termBuffer.length));
String out = tsToString( factory.create( new IterTokenStream( "Hello thEre my Name is Ryan" ) ) );
assertEquals( "Hello there my name is ryan", out );
Tokenizer tokenizer = new KeywordTokenizer(new StringReader("Hello thEre my Name is Ryan"));
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "Hello there my name is ryan" });
// now each token
factory.onlyFirstWord = false;
out = tsToString( factory.create( new IterTokenStream( "Hello thEre my Name is Ryan" ) ) );
assertEquals( "Hello There My Name Is Ryan", out );
tokenizer = new WhitespaceTokenizer(new StringReader("Hello thEre my Name is Ryan"));
stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "Hello", "There", "My", "Name", "Is", "Ryan" });
// now only the long words
factory.minWordLength = 3;
out = tsToString( factory.create( new IterTokenStream( "Hello thEre my Name is Ryan" ) ) );
assertEquals( "Hello There my Name is Ryan", out );
tokenizer = new WhitespaceTokenizer(new StringReader("Hello thEre my Name is Ryan" ));
stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "Hello", "There", "my", "Name", "is", "Ryan" });
// without prefix
out = tsToString( factory.create( new IterTokenStream( "McKinley" ) ) );
assertEquals( "Mckinley", out );
tokenizer = new WhitespaceTokenizer(new StringReader("McKinley" ));
stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "Mckinley" });
// Now try some prefixes
factory = new CapitalizationFilterFactory();
args.put( "okPrefix", "McK" ); // all words
factory.init( args );
out = tsToString( factory.create( new IterTokenStream( "McKinley" ) ) );
assertEquals( "McKinley", out );
tokenizer = new WhitespaceTokenizer(new StringReader("McKinley" ));
stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "McKinley" });
// now try some stuff with numbers
factory.forceFirstLetter = false;
factory.onlyFirstWord = false;
out = tsToString( factory.create( new IterTokenStream( "1st 2nd third" ) ) );
assertEquals( "1st 2nd Third", out );
tokenizer = new WhitespaceTokenizer(new StringReader("1st 2nd third" ));
stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "1st", "2nd", "Third" });
factory.forceFirstLetter = true;
out = tsToString( factory.create( new IterTokenStream( "the The the" ) ) );
assertEquals( "The The the", out );
tokenizer = new KeywordTokenizer(new StringReader("the The the" ));
stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "The The the" });
}
public void testKeepIgnoreCase() throws Exception {
@ -123,4 +134,80 @@ public class TestCapitalizationFilter extends BaseTokenTestCase {
factory.processWord(termBuffer, 0, termBuffer.length, 0 );
assertEquals( "Kitten", new String(termBuffer, 0, termBuffer.length));
}
/**
* Test CapitalizationFilterFactory's minWordLength option.
*
* This is very weird when combined with ONLY_FIRST_WORD!!!
*/
public void testMinWordLength() throws Exception {
Map<String,String> args = new HashMap<String,String>();
args.put(CapitalizationFilterFactory.ONLY_FIRST_WORD, "true");
args.put(CapitalizationFilterFactory.MIN_WORD_LENGTH, "5");
CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
factory.init(args);
Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader(
"helo testing"));
TokenStream ts = factory.create(tokenizer);
assertTokenStreamContents(ts, new String[] {"helo", "Testing"});
}
/**
* Test CapitalizationFilterFactory's maxWordCount option with only words of 1
* in each token (it should do nothing)
*/
public void testMaxWordCount() throws Exception {
Map<String,String> args = new HashMap<String,String>();
args.put(CapitalizationFilterFactory.MAX_WORD_COUNT, "2");
CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
factory.init(args);
Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader(
"one two three four"));
TokenStream ts = factory.create(tokenizer);
assertTokenStreamContents(ts, new String[] {"One", "Two", "Three", "Four"});
}
/**
* Test CapitalizationFilterFactory's maxWordCount option when exceeded
*/
public void testMaxWordCount2() throws Exception {
Map<String,String> args = new HashMap<String,String>();
args.put(CapitalizationFilterFactory.MAX_WORD_COUNT, "2");
CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
factory.init(args);
Tokenizer tokenizer = new KeywordTokenizer(new StringReader(
"one two three four"));
TokenStream ts = factory.create(tokenizer);
assertTokenStreamContents(ts, new String[] {"one two three four"});
}
/**
* Test CapitalizationFilterFactory's maxTokenLength option when exceeded
*
* This is weird, it is not really a max, but inclusive (look at 'is')
*/
public void testMaxTokenLength() throws Exception {
Map<String,String> args = new HashMap<String,String>();
args.put(CapitalizationFilterFactory.MAX_TOKEN_LENGTH, "2");
CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
factory.init(args);
Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader(
"this is a test"));
TokenStream ts = factory.create(tokenizer);
assertTokenStreamContents(ts, new String[] {"this", "is", "A", "test"});
}
/**
* Test CapitalizationFilterFactory's forceFirstLetter option
*/
public void testForceFirstLetter() throws Exception {
Map<String,String> args = new HashMap<String,String>();
args.put(CapitalizationFilterFactory.KEEP, "kitten");
args.put(CapitalizationFilterFactory.FORCE_FIRST_LETTER, "true");
CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
factory.init(args);
Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader("kitten"));
TokenStream ts = factory.create(tokenizer);
assertTokenStreamContents(ts, new String[] {"Kitten"});
}
}

View File

@ -0,0 +1,41 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
/**
* Simple tests to ensure the Chinese filter factory is working.
*/
public class TestChineseFilterFactory extends BaseTokenTestCase {
/**
* Ensure the filter actually normalizes text (numerics, stopwords)
*/
public void testFiltering() throws Exception {
Reader reader = new StringReader("this 1234 Is such a silly filter");
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
ChineseFilterFactory factory = new ChineseFilterFactory();
TokenStream stream = factory.create(tokenizer);
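    // English stop words ('this', 'such', 'a') and the numeric token are removed;
    // 'Is' survives because the stop list check is case sensitive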
assertTokenStreamContents(stream, new String[] { "Is", "silly", "filter" });
}
}

View File

@ -0,0 +1,38 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
/**
* Simple tests to ensure the Chinese tokenizer factory is working.
*/
public class TestChineseTokenizerFactory extends BaseTokenTestCase {
/**
 * Ensure the tokenizer actually tokenizes Chinese text correctly.
*/
public void testTokenizer() throws Exception {
Reader reader = new StringReader("我是中国人");
ChineseTokenizerFactory factory = new ChineseTokenizerFactory();
TokenStream stream = factory.create(reader);
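    // ChineseTokenizer emits one token per Chinese character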
    assertTokenStreamContents(stream, new String[] {"我", "是", "中", "国", "人"});
}
}

View File

@ -20,6 +20,7 @@ package org.apache.solr.analysis;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.text.Collator;
import java.text.RuleBasedCollator;
import java.util.HashMap;
@ -27,7 +28,9 @@ import java.util.List;
import java.util.Locale;
import java.util.Map;
import org.apache.lucene.analysis.KeywordTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.solr.common.ResourceLoader;
public class TestCollationKeyFilterFactory extends BaseTokenTestCase {
@ -39,18 +42,80 @@ public class TestCollationKeyFilterFactory extends BaseTokenTestCase {
* Then things will sort and match correctly.
*/
public void testBasicUsage() throws IOException {
String[] turkishUpperCase = { "I", "WİLL", "USE", "TURKİSH", "CASING" };
String[] turkishLowerCase = { "ı", "will", "use", "turkish", "casıng" };
String turkishUpperCase = "I WİLL USE TURKİSH CASING";
String turkishLowerCase = "ı will use turkish casıng";
CollationKeyFilterFactory factory = new CollationKeyFilterFactory();
Map<String,String> args = new HashMap<String,String>();
args.put("language", "tr");
args.put("strength", "primary");
factory.init(args);
factory.inform(new StringMockSolrResourceLoader(""));
TokenStream tsUpper = factory.create(new IterTokenStream(turkishUpperCase));
TokenStream tsLower = factory.create(new IterTokenStream(turkishLowerCase));
assertTokEqual(BaseTokenTestCase.getTokens(tsUpper),
BaseTokenTestCase.getTokens(tsLower));
TokenStream tsUpper = factory.create(
new KeywordTokenizer(new StringReader(turkishUpperCase)));
TokenStream tsLower = factory.create(
new KeywordTokenizer(new StringReader(turkishLowerCase)));
assertCollatesToSame(tsUpper, tsLower);
}
/*
* Test usage of the decomposition option for unicode normalization.
*/
public void testNormalization() throws IOException {
String turkishUpperCase = "I W\u0049\u0307LL USE TURKİSH CASING";
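    // \u0049\u0307 is 'I' followed by COMBINING DOT ABOVE; with canonical decomposition
    // it collates the same as the precomposed dotted form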
String turkishLowerCase = "ı will use turkish casıng";
CollationKeyFilterFactory factory = new CollationKeyFilterFactory();
Map<String,String> args = new HashMap<String,String>();
args.put("language", "tr");
args.put("strength", "primary");
args.put("decomposition", "canonical");
factory.init(args);
factory.inform(new StringMockSolrResourceLoader(""));
TokenStream tsUpper = factory.create(
new KeywordTokenizer(new StringReader(turkishUpperCase)));
TokenStream tsLower = factory.create(
new KeywordTokenizer(new StringReader(turkishLowerCase)));
assertCollatesToSame(tsUpper, tsLower);
}
/*
* Test usage of the K decomposition option for unicode normalization.
* This works even with identical strength.
*/
public void testFullDecomposition() throws IOException {
    String fullWidth = "Ｔｅｓｔｉｎｇ"; // full-width form of "Testing"
String halfWidth = "Testing";
CollationKeyFilterFactory factory = new CollationKeyFilterFactory();
Map<String,String> args = new HashMap<String,String>();
args.put("language", "zh");
args.put("strength", "identical");
args.put("decomposition", "full");
factory.init(args);
factory.inform(new StringMockSolrResourceLoader(""));
TokenStream tsFull = factory.create(
new KeywordTokenizer(new StringReader(fullWidth)));
TokenStream tsHalf = factory.create(
new KeywordTokenizer(new StringReader(halfWidth)));
assertCollatesToSame(tsFull, tsHalf);
}
/*
   * Test secondary strength; for English, case is not significant.
*/
public void testSecondaryStrength() throws IOException {
String upperCase = "TESTING";
String lowerCase = "testing";
CollationKeyFilterFactory factory = new CollationKeyFilterFactory();
Map<String,String> args = new HashMap<String,String>();
args.put("language", "en");
args.put("strength", "secondary");
args.put("decomposition", "no");
factory.init(args);
factory.inform(new StringMockSolrResourceLoader(""));
TokenStream tsUpper = factory.create(
new KeywordTokenizer(new StringReader(upperCase)));
TokenStream tsLower = factory.create(
new KeywordTokenizer(new StringReader(lowerCase)));
assertCollatesToSame(tsUpper, tsLower);
}
/*
@ -74,18 +139,20 @@ public class TestCollationKeyFilterFactory extends BaseTokenTestCase {
// at this point, you would save these tailoredRules to a file,
// and use the custom parameter.
//
String[] germanUmlaut = { "Töne" };
String[] germanOE = { "Toene" };
String germanUmlaut = "Töne";
String germanOE = "Toene";
CollationKeyFilterFactory factory = new CollationKeyFilterFactory();
Map<String,String> args = new HashMap<String,String>();
args.put("custom", "rules.txt");
args.put("strength", "primary");
factory.init(args);
factory.inform(new StringMockSolrResourceLoader(tailoredRules));
TokenStream tsUmlaut = factory.create(new IterTokenStream(germanUmlaut));
TokenStream tsOE = factory.create(new IterTokenStream(germanOE));
assertTokEqual(BaseTokenTestCase.getTokens(tsUmlaut),
BaseTokenTestCase.getTokens(tsOE));
TokenStream tsUmlaut = factory.create(
new KeywordTokenizer(new StringReader(germanUmlaut)));
TokenStream tsOE = factory.create(
new KeywordTokenizer(new StringReader(germanOE)));
assertCollatesToSame(tsUmlaut, tsOE);
}
private class StringMockSolrResourceLoader implements ResourceLoader {
@ -107,4 +174,17 @@ public class TestCollationKeyFilterFactory extends BaseTokenTestCase {
return new ByteArrayInputStream(text.getBytes("UTF-8"));
}
}
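  /**
   * Both streams must produce exactly one token, and the encoded collation keys
   * (carried in the term attribute) must be identical.
   */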
private void assertCollatesToSame(TokenStream stream1, TokenStream stream2)
throws IOException {
TermAttribute term1 = (TermAttribute) stream1
.addAttribute(TermAttribute.class);
TermAttribute term2 = (TermAttribute) stream2
.addAttribute(TermAttribute.class);
assertTrue(stream1.incrementToken());
assertTrue(stream2.incrementToken());
assertEquals(term1.term(), term2.term());
assertFalse(stream1.incrementToken());
assertFalse(stream2.incrementToken());
}
}

View File

@ -0,0 +1,51 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.solr.common.ResourceLoader;
/**
* Simple tests to ensure the Dictionary compound filter factory is working.
*/
public class TestDictionaryCompoundWordTokenFilterFactory extends BaseTokenTestCase {
/**
* Ensure the filter actually decompounds text.
*/
public void testDecompounding() throws Exception {
Reader reader = new StringReader("I like to play softball");
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
DictionaryCompoundWordTokenFilterFactory factory = new DictionaryCompoundWordTokenFilterFactory();
ResourceLoader loader = solrConfig.getResourceLoader();
Map<String,String> args = new HashMap<String,String>();
args.put("dictionary", "compoundDictionary.txt");
factory.init(args);
factory.inform(loader);
TokenStream stream = factory.create(tokenizer);
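    // the original token is kept and the dictionary sub-words 'soft' and 'ball' are added after it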
assertTokenStreamContents(stream,
new String[] { "I", "like", "to", "play", "softball", "soft", "ball" });
}
}

View File

@ -0,0 +1,41 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
/**
* Simple tests to ensure the Dutch stem filter factory is working.
*/
public class TestDutchStemFilterFactory extends BaseTokenTestCase {
/**
* Ensure the filter actually stems text.
*/
public void testStemming() throws Exception {
Reader reader = new StringReader("lichamelijkheden");
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
DutchStemFilterFactory factory = new DutchStemFilterFactory();
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "licham" });
}
}

View File

@ -0,0 +1,50 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.solr.common.ResourceLoader;
/**
* Simple tests to ensure the French elision filter factory is working.
*/
public class TestElisionFilterFactory extends BaseTokenTestCase {
/**
* Ensure the filter actually normalizes text.
*/
public void testElision() throws Exception {
Reader reader = new StringReader("l'avion");
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
ElisionFilterFactory factory = new ElisionFilterFactory();
ResourceLoader loader = solrConfig.getResourceLoader();
Map<String,String> args = new HashMap<String,String>();
args.put("articles", "frenchArticles.txt");
factory.init(args);
factory.inform(loader);
TokenStream stream = factory.create(tokenizer);
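    // the leading article l' (listed in frenchArticles.txt) is stripped, leaving 'avion'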
assertTokenStreamContents(stream, new String[] { "avion" });
}
}

View File

@ -0,0 +1,41 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
/**
* Simple tests to ensure the French stem filter factory is working.
*/
public class TestFrenchStemFilterFactory extends BaseTokenTestCase {
/**
* Ensure the filter actually stems text.
*/
public void testStemming() throws Exception {
Reader reader = new StringReader("habitable");
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
FrenchStemFilterFactory factory = new FrenchStemFilterFactory();
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "habit" });
}
}

View File

@ -0,0 +1,41 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
/**
* Simple tests to ensure the German stem filter factory is working.
*/
public class TestGermanStemFilterFactory extends BaseTokenTestCase {
/**
* Ensure the filter actually stems text.
*/
public void testStemming() throws Exception {
Reader reader = new StringReader("Tischen");
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
GermanStemFilterFactory factory = new GermanStemFilterFactory();
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "tisch" });
}
}

View File

@ -0,0 +1,41 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
/**
* Simple tests to ensure the Greek lowercase filter factory is working.
*/
public class TestGreekLowerCaseFilterFactory extends BaseTokenTestCase {
/**
   * Ensure the filter actually lowercases (and a bit more) Greek text.
*/
public void testStemming() throws Exception {
Reader reader = new StringReader("Μάϊος ΜΆΪΟΣ");
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
GreekLowerCaseFilterFactory factory = new GreekLowerCaseFilterFactory();
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "μαιοσ", "μαιοσ" });
}
}

View File

@ -28,12 +28,24 @@ import org.apache.lucene.analysis.WhitespaceTokenizer;
public class TestHyphenatedWordsFilter extends BaseTokenTestCase {
public void testHyphenatedWords() throws Exception {
String input = "ecologi-\r\ncal devel-\r\n\r\nop compre-\u0009hensive-hands-on and ecologi-\ncal";
String outputAfterHyphenatedWordsFilter = "ecological develop comprehensive-hands-on and ecological";
// first test
TokenStream ts = new WhitespaceTokenizer(new StringReader(input));
ts = new HyphenatedWordsFilter(ts);
String actual = tsToString(ts);
assertEquals("Testing HyphenatedWordsFilter",
outputAfterHyphenatedWordsFilter, actual);
HyphenatedWordsFilterFactory factory = new HyphenatedWordsFilterFactory();
ts = factory.create(ts);
assertTokenStreamContents(ts,
new String[] { "ecological", "develop", "comprehensive-hands-on", "and", "ecological" });
}
/**
* Test that HyphenatedWordsFilter behaves correctly with a final hyphen
*/
public void testHyphenAtEnd() throws Exception {
String input = "ecologi-\r\ncal devel-\r\n\r\nop compre-\u0009hensive-hands-on and ecology-";
// first test
TokenStream ts = new WhitespaceTokenizer(new StringReader(input));
HyphenatedWordsFilterFactory factory = new HyphenatedWordsFilterFactory();
ts = factory.create(ts);
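    // a trailing hyphen with no continuation is left on the final token ("ecology-")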
assertTokenStreamContents(ts,
new String[] { "ecological", "develop", "comprehensive-hands-on", "and", "ecology-" });
}
}

View File

@ -17,13 +17,14 @@
package org.apache.solr.analysis;
import java.io.StringReader;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
/**
@ -37,7 +38,7 @@ public class TestKeepWordFilter extends BaseTokenTestCase {
words.add( "aaa" );
words.add( "bbb" );
List<Token> input = tokens( "aaa BBB ccc ddd EEE" );
String input = "aaa BBB ccc ddd EEE";
Map<String,String> args = new HashMap<String, String>();
@ -47,18 +48,28 @@ public class TestKeepWordFilter extends BaseTokenTestCase {
factory.init( args );
factory.inform( solrConfig.getResourceLoader() );
factory.setWords( words );
assertTrue(factory.isIgnoreCase());
TokenStream stream = factory.create(new WhitespaceTokenizer(new StringReader(input)));
assertTokenStreamContents(stream, new String[] { "aaa", "BBB" });
List<Token> expect = tokens( "aaa BBB" );
List<Token> real = getTokens(factory.create( new IterTokenStream(input) ));
assertTokEqual( expect, real );
// Test Stopwords (ignoreCase via the setter instead)
factory = new KeepWordFilterFactory();
args = new HashMap<String, String>();
factory.init( args );
factory.inform( solrConfig.getResourceLoader() );
factory.setIgnoreCase(true);
factory.setWords( words );
assertTrue(factory.isIgnoreCase());
stream = factory.create(new WhitespaceTokenizer(new StringReader(input)));
assertTokenStreamContents(stream, new String[] { "aaa", "BBB" });
// Now force case
args = new HashMap<String, String>();
args.put( "ignoreCase", "false" );
factory.init( args );
factory.inform( solrConfig.getResourceLoader() );
expect = tokens( "aaa" );
real = getTokens(factory.create( new IterTokenStream(input) ));
assertTokEqual( expect, real );
assertFalse(factory.isIgnoreCase());
stream = factory.create(new WhitespaceTokenizer(new StringReader(input)));
assertTokenStreamContents(stream, new String[] { "aaa" });
}
}

View File

@ -1,37 +1,27 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.junit.Assert;
import org.junit.Test;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
* @version $Id$
* @since solr 1.4
*/
public class TestMultiWordSynonyms {
public class TestMultiWordSynonyms extends BaseTokenTestCase {
@Test
public void testMultiWordSynonmys() throws IOException {
public void testMultiWordSynonyms() throws IOException {
List<String> rules = new ArrayList<String>();
rules.add("a b c,d");
SynonymMap synMap = new SynonymMap(true);
SynonymFilterFactory.parseRules(rules, synMap, "=>", ",", true, null);
SynonymFilter ts = new SynonymFilter(new WhitespaceTokenizer(new StringReader("a e")), synMap);
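    // the rule "a b c,d" never fully matches the input "a e", so the tokens pass through unchanged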
TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
ts.reset();
List<String> tokens = new ArrayList<String>();
while (ts.incrementToken()) tokens.add(termAtt.term());
// This fails because ["e","e"] is the value of the token stream
Assert.assertEquals(Arrays.asList("a", "e"), tokens);
assertTokenStreamContents(ts, new String[] { "a", "e" });
}
}

View File

@ -0,0 +1,163 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
/**
* Simple tests to ensure the NGram filter factories are working.
*/
public class TestNGramFilters extends BaseTokenTestCase {
/**
* Test NGramTokenizerFactory
*/
public void testNGramTokenizer() throws Exception {
Reader reader = new StringReader("test");
Map<String,String> args = new HashMap<String,String>();
NGramTokenizerFactory factory = new NGramTokenizerFactory();
factory.init(args);
Tokenizer stream = factory.create(reader);
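    // default gram sizes are 1 to 2: the unigrams come first, then the bigrams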
assertTokenStreamContents(stream,
new String[] { "t", "e", "s", "t", "te", "es", "st" });
}
/**
* Test NGramTokenizerFactory with min and max gram options
*/
public void testNGramTokenizer2() throws Exception {
Reader reader = new StringReader("test");
Map<String,String> args = new HashMap<String,String>();
args.put("minGramSize", "2");
args.put("maxGramSize", "3");
NGramTokenizerFactory factory = new NGramTokenizerFactory();
factory.init(args);
Tokenizer stream = factory.create(reader);
assertTokenStreamContents(stream,
new String[] { "te", "es", "st", "tes", "est" });
}
/**
* Test the NGramFilterFactory
*/
public void testNGramFilter() throws Exception {
Reader reader = new StringReader("test");
Map<String,String> args = new HashMap<String,String>();
NGramFilterFactory factory = new NGramFilterFactory();
factory.init(args);
TokenStream stream = factory.create(new WhitespaceTokenizer(reader));
assertTokenStreamContents(stream,
new String[] { "t", "e", "s", "t", "te", "es", "st" });
}
/**
* Test the NGramFilterFactory with min and max gram options
*/
public void testNGramFilter2() throws Exception {
Reader reader = new StringReader("test");
Map<String,String> args = new HashMap<String,String>();
args.put("minGramSize", "2");
args.put("maxGramSize", "3");
NGramFilterFactory factory = new NGramFilterFactory();
factory.init(args);
TokenStream stream = factory.create(new WhitespaceTokenizer(reader));
assertTokenStreamContents(stream,
new String[] { "te", "es", "st", "tes", "est" });
}
/**
* Test EdgeNGramTokenizerFactory
*/
public void testEdgeNGramTokenizer() throws Exception {
Reader reader = new StringReader("test");
Map<String,String> args = new HashMap<String,String>();
EdgeNGramTokenizerFactory factory = new EdgeNGramTokenizerFactory();
factory.init(args);
Tokenizer stream = factory.create(reader);
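    // with no options the default edge gram size is 1, so only the leading character is emitted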
assertTokenStreamContents(stream,
new String[] { "t" });
}
/**
* Test EdgeNGramTokenizerFactory with min and max gram size
*/
public void testEdgeNGramTokenizer2() throws Exception {
Reader reader = new StringReader("test");
Map<String,String> args = new HashMap<String,String>();
args.put("minGramSize", "1");
args.put("maxGramSize", "2");
EdgeNGramTokenizerFactory factory = new EdgeNGramTokenizerFactory();
factory.init(args);
Tokenizer stream = factory.create(reader);
assertTokenStreamContents(stream,
new String[] { "t", "te" });
}
/**
* Test EdgeNGramTokenizerFactory with side option
*/
public void testEdgeNGramTokenizer3() throws Exception {
Reader reader = new StringReader("ready");
Map<String,String> args = new HashMap<String,String>();
args.put("side", "back");
EdgeNGramTokenizerFactory factory = new EdgeNGramTokenizerFactory();
factory.init(args);
Tokenizer stream = factory.create(reader);
assertTokenStreamContents(stream,
new String[] { "y" });
}
/**
* Test EdgeNGramFilterFactory
*/
public void testEdgeNGramFilter() throws Exception {
Reader reader = new StringReader("test");
Map<String,String> args = new HashMap<String,String>();
EdgeNGramFilterFactory factory = new EdgeNGramFilterFactory();
factory.init(args);
TokenStream stream = factory.create(new WhitespaceTokenizer(reader));
assertTokenStreamContents(stream,
new String[] { "t" });
}
/**
* Test EdgeNGramFilterFactory with min and max gram size
*/
public void testEdgeNGramFilter2() throws Exception {
Reader reader = new StringReader("test");
Map<String,String> args = new HashMap<String,String>();
args.put("minGramSize", "1");
args.put("maxGramSize", "2");
EdgeNGramFilterFactory factory = new EdgeNGramFilterFactory();
factory.init(args);
TokenStream stream = factory.create(new WhitespaceTokenizer(reader));
assertTokenStreamContents(stream,
new String[] { "t", "te" });
}
/**
* Test EdgeNGramFilterFactory with side option
*/
public void testEdgeNGramFilter3() throws Exception {
Reader reader = new StringReader("ready");
Map<String,String> args = new HashMap<String,String>();
args.put("side", "back");
EdgeNGramFilterFactory factory = new EdgeNGramFilterFactory();
factory.init(args);
TokenStream stream = factory.create(new WhitespaceTokenizer(reader));
assertTokenStreamContents(stream,
new String[] { "y" });
}
}

View File

@ -19,6 +19,8 @@ package org.apache.solr.analysis;
import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.CharStream;
@ -37,20 +39,33 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
// this is test.
public void testNothingChange() throws IOException {
final String BLOCK = "this is test.";
CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)\\s+(cc)", "$1$2$3",
PatternReplaceCharFilterFactory factory = new PatternReplaceCharFilterFactory();
Map<String,String> args = new HashMap<String,String>();
args.put("pattern", "(aa)\\s+(bb)\\s+(cc)");
args.put("replacement", "$1$2$3");
factory.init(args);
CharStream cs = factory.create(
CharReader.get( new StringReader( BLOCK ) ) );
TokenStream ts = new WhitespaceTokenizer( cs );
assertTokEqualOff( tokens( "this,1,0,4 is,1,5,7 test.,1,8,13" ), getTokens( ts ) );
assertTokenStreamContents(ts,
new String[] { "this", "is", "test." },
new int[] { 0, 5, 8 },
new int[] { 4, 7, 13 },
new int[] { 1, 1, 1 });
}
// 012345678
// aa bb cc
public void testReplaceByEmpty() throws IOException {
final String BLOCK = "aa bb cc";
CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)\\s+(cc)", "",
PatternReplaceCharFilterFactory factory = new PatternReplaceCharFilterFactory();
Map<String,String> args = new HashMap<String,String>();
args.put("pattern", "(aa)\\s+(bb)\\s+(cc)");
factory.init(args);
CharStream cs = factory.create(
CharReader.get( new StringReader( BLOCK ) ) );
TokenStream ts = new WhitespaceTokenizer( cs );
assertEquals( 0, getTokens( ts ).size() );
assertFalse(ts.incrementToken());
}
// 012345678
@ -58,10 +73,19 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
// aa#bb#cc
public void test1block1matchSameLength() throws IOException {
final String BLOCK = "aa bb cc";
CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)\\s+(cc)", "$1#$2#$3",
PatternReplaceCharFilterFactory factory = new PatternReplaceCharFilterFactory();
Map<String,String> args = new HashMap<String,String>();
args.put("pattern", "(aa)\\s+(bb)\\s+(cc)");
args.put("replacement", "$1#$2#$3");
factory.init(args);
CharStream cs = factory.create(
CharReader.get( new StringReader( BLOCK ) ) );
TokenStream ts = new WhitespaceTokenizer( cs );
assertTokEqualOff( tokens( "aa#bb#cc,1,0,8" ), getTokens( ts ) );
assertTokenStreamContents(ts,
new String[] { "aa#bb#cc" },
new int[] { 0 },
new int[] { 8 },
new int[] { 1 });
}
// 11111
@ -73,7 +97,11 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)\\s+(cc)", "$1##$2###$3",
CharReader.get( new StringReader( BLOCK ) ) );
TokenStream ts = new WhitespaceTokenizer( cs );
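    // the replacement is longer than the match, but offsets are corrected back to
    // positions in the original text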
assertTokEqualOff( tokens( "aa##bb###cc,1,0,8 dd,1,9,11" ), getTokens( ts ) );
assertTokenStreamContents(ts,
new String[] { "aa##bb###cc", "dd" },
new int[] { 0, 9 },
new int[] { 8, 11 },
new int[] { 1, 1 });
}
// 01234567
@ -84,7 +112,11 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
CharStream cs = new PatternReplaceCharFilter( "a", "aa",
CharReader.get( new StringReader( BLOCK ) ) );
TokenStream ts = new WhitespaceTokenizer( cs );
assertTokEqualOff( tokens( "aa,1,1,2 aa,1,4,5" ), getTokens( ts ) );
assertTokenStreamContents(ts,
new String[] { "aa", "aa" },
new int[] { 1, 4 },
new int[] { 2, 5 },
new int[] { 1, 1 });
}
// 11111
@ -96,7 +128,11 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)\\s+(cc)", "$1#$2",
CharReader.get( new StringReader( BLOCK ) ) );
TokenStream ts = new WhitespaceTokenizer( cs );
assertTokEqualOff( tokens( "aa#bb,1,0,11 dd,1,12,14" ), getTokens( ts ) );
assertTokenStreamContents(ts,
new String[] { "aa#bb", "dd" },
new int[] { 0, 12 },
new int[] { 11, 14 },
new int[] { 1, 1 });
}
// 111111111122222222223333
@ -108,8 +144,11 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)\\s+(cc)", "$1 $2 $3",
CharReader.get( new StringReader( BLOCK ) ) );
TokenStream ts = new WhitespaceTokenizer( cs );
assertTokEqualOff( tokens( "aa,1,2,4 bb,1,6,8 cc,1,9,10 ---,1,11,14 aa,1,15,17 bb,1,18,20 aa,1,21,23 bb,1,25,27 cc,1,29,33" ),
getTokens( ts ) );
assertTokenStreamContents(ts,
new String[] { "aa", "bb", "cc", "---", "aa", "bb", "aa", "bb", "cc" },
new int[] { 2, 6, 9, 11, 15, 18, 21, 25, 29 },
new int[] { 4, 8, 10, 14, 17, 20, 23, 27, 33 },
new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1 });
}
// 11111111112222222222333333333
@ -121,8 +160,11 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)", "$1##$2", ".",
CharReader.get( new StringReader( BLOCK ) ) );
TokenStream ts = new WhitespaceTokenizer( cs );
assertTokEqualOff( tokens( "aa##bb,1,2,7 cc,1,8,10 ---,1,11,14 aa##bb,1,15,20 aa.,1,21,24 bb,1,25,27 aa##bb,1,28,35 cc,1,36,38" ),
getTokens( ts ) );
assertTokenStreamContents(ts,
new String[] { "aa##bb", "cc", "---", "aa##bb", "aa.", "bb", "aa##bb", "cc" },
new int[] { 2, 8, 11, 15, 21, 25, 28, 36 },
new int[] { 7, 10, 14, 20, 24, 27, 35, 38 },
new int[] { 1, 1, 1, 1, 1, 1, 1, 1 });
}
// 11111111112222222222333333333
@ -136,7 +178,10 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
cs = new PatternReplaceCharFilter( "bb", "b", ".", cs );
cs = new PatternReplaceCharFilter( "ccc", "c", ".", cs );
TokenStream ts = new WhitespaceTokenizer( cs );
assertTokEqualOff( tokens( "aa,1,1,2 b,1,3,5 -,1,6,7 c,1,8,11 .,1,12,13 ---,1,14,17 b,1,18,20 aa,1,21,22 .,1,23,24 c,1,25,28 c,1,29,32 b,1,33,35" ),
getTokens( ts ) );
assertTokenStreamContents(ts,
new String[] { "aa", "b", "-", "c", ".", "---", "b", "aa", ".", "c", "c", "b" },
new int[] { 1, 3, 6, 8, 12, 14, 18, 21, 23, 25, 29, 33 },
new int[] { 2, 5, 7, 11, 13, 17, 20, 22, 24, 28, 32, 35 },
new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 });
}
}

View File

@ -17,7 +17,6 @@
package org.apache.solr.analysis;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
@ -27,7 +26,7 @@ import java.util.regex.Pattern;
/**
* @version $Id:$
*/
public class TestPatternReplaceFilter extends AnalysisTestCase {
public class TestPatternReplaceFilter extends BaseTokenTestCase {
public void testReplaceAll() throws Exception {
String input = "aabfooaabfooabfoob ab caaaaaaaaab";
@ -35,14 +34,8 @@ public class TestPatternReplaceFilter extends AnalysisTestCase {
(new WhitespaceTokenizer(new StringReader(input)),
Pattern.compile("a*b"),
"-", true);
Token token = ts.next();
assertEquals("-foo-foo-foo-", new String(token.termBuffer(), 0, token.termLength()));
token = ts.next();
assertEquals("-", new String(token.termBuffer(), 0, token.termLength()));
token = ts.next();
assertEquals("c-", new String(token.termBuffer(), 0, token.termLength()));
token = ts.next();
assertNull(token);
assertTokenStreamContents(ts,
new String[] { "-foo-foo-foo-", "-", "c-" });
}
public void testReplaceFirst() throws Exception {
@ -51,14 +44,8 @@ public class TestPatternReplaceFilter extends AnalysisTestCase {
(new WhitespaceTokenizer(new StringReader(input)),
Pattern.compile("a*b"),
"-", false);
Token token = ts.next();
assertEquals("-fooaabfooabfoob", new String(token.termBuffer(), 0, token.termLength()));
token = ts.next();
assertEquals("-", new String(token.termBuffer(), 0, token.termLength()));
token = ts.next();
assertEquals("c-", new String(token.termBuffer(), 0, token.termLength()));
token = ts.next();
assertNull(token);
assertTokenStreamContents(ts,
new String[] { "-fooaabfooabfoob", "-", "c-" });
}
public void testStripFirst() throws Exception {
@ -67,14 +54,8 @@ public class TestPatternReplaceFilter extends AnalysisTestCase {
(new WhitespaceTokenizer(new StringReader(input)),
Pattern.compile("a*b"),
null, false);
Token token = ts.next();
assertEquals("fooaabfooabfoob", new String(token.termBuffer(), 0, token.termLength()));
token = ts.next();
assertEquals("", new String(token.termBuffer(), 0, token.termLength()));
token = ts.next();
assertEquals("c", new String(token.termBuffer(), 0, token.termLength()));
token = ts.next();
assertNull(token);
assertTokenStreamContents(ts,
new String[] { "fooaabfooabfoob", "", "c" });
}
public void testStripAll() throws Exception {
@ -83,14 +64,8 @@ public class TestPatternReplaceFilter extends AnalysisTestCase {
(new WhitespaceTokenizer(new StringReader(input)),
Pattern.compile("a*b"),
null, true);
Token token = ts.next();
assertEquals("foofoofoo", new String(token.termBuffer(), 0, token.termLength()));
token = ts.next();
assertEquals("", new String(token.termBuffer(), 0, token.termLength()));
token = ts.next();
assertEquals("c", new String(token.termBuffer(), 0, token.termLength()));
token = ts.next();
assertNull(token);
assertTokenStreamContents(ts,
new String[] { "foofoofoo", "", "c" });
}
public void testReplaceAllWithBackRef() throws Exception {
@ -99,14 +74,8 @@ public class TestPatternReplaceFilter extends AnalysisTestCase {
(new WhitespaceTokenizer(new StringReader(input)),
Pattern.compile("(a*)b"),
"$1\\$", true);
Token token = ts.next();
assertEquals("aa$fooaa$fooa$foo$", new String(token.termBuffer(), 0, token.termLength()));
token = ts.next();
assertEquals("a$", new String(token.termBuffer(), 0, token.termLength()));
token = ts.next();
assertEquals("caaaaaaaaa$", new String(token.termBuffer(), 0, token.termLength()));
token = ts.next();
assertNull(token);
assertTokenStreamContents(ts,
new String[] { "aa$fooaa$fooa$foo$", "a$", "caaaaaaaaa$" });
}
}

View File

@ -17,6 +17,7 @@
package org.apache.solr.analysis;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
@ -27,8 +28,8 @@ import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.MappingCharFilter;
import org.apache.lucene.analysis.NormalizeCharMap;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
public class TestPatternTokenizerFactory extends BaseTokenTestCase
{
@ -57,7 +58,7 @@ public class TestPatternTokenizerFactory extends BaseTokenTestCase
tokenizer.init( args );
TokenStream stream = tokenizer.create( new StringReader( test[2] ) );
String out = TestHyphenatedWordsFilter.tsToString( stream );
String out = tsToString( stream );
System.out.println( test[2] + " ==> " + out );
assertEquals("pattern: "+test[1]+" with input: "+test[2], test[3], out );
@ -93,20 +94,45 @@ public class TestPatternTokenizerFactory extends BaseTokenTestCase
PatternTokenizerFactory tokFactory = new PatternTokenizerFactory();
tokFactory.init( args );
TokenStream stream = tokFactory.create( charStream );
assertTokenStreamContents(stream,
new String[] { "Günther", "Günther", "is", "here" },
new int[] { 0, 13, 26, 29 },
new int[] { 12, 25, 28, 33 },
new int[] { 1, 1, 1, 1 });
List<Token> result = getTokens( stream );
List<Token> expect = tokens( "Günther,1,0,12 Günther,1,13,25 is,1,26,28 here,1,29,33" );
assertTokEqualOff( expect, result );
charStream.reset();
charStream = new MappingCharFilter( normMap, CharReader.get( new StringReader( INPUT ) ) );
args.put( PatternTokenizerFactory.PATTERN, "Günther" );
args.put( PatternTokenizerFactory.GROUP, "0" );
tokFactory = new PatternTokenizerFactory();
tokFactory.init( args );
stream = tokFactory.create( charStream );
assertTokenStreamContents(stream,
new String[] { "Günther", "Günther" },
new int[] { 0, 13 },
new int[] { 12, 25 },
new int[] { 1, 1 });
}
result = getTokens( stream );
expect = tokens( "Günther,1,0,12 Günther,1,13,25" );
assertTokEqualOff( expect, result );
/**
* TODO: rewrite tests not to use string comparison.
* @deprecated only tests TermAttribute!
*/
private static String tsToString(TokenStream in) throws IOException {
StringBuilder out = new StringBuilder();
TermAttribute termAtt = (TermAttribute) in.addAttribute(TermAttribute.class);
    // extra safety: enforce that no state is carried over between tokens by
    // clearing attributes and assigning bogus values
in.clearAttributes();
termAtt.setTermBuffer("bogusTerm");
while (in.incrementToken()) {
if (out.length() > 0)
out.append(' ');
out.append(termAtt.term());
in.clearAttributes();
termAtt.setTermBuffer("bogusTerm");
}
in.close();
return out.toString();
}
}

View File

@ -0,0 +1,41 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
/**
* Simple tests to ensure the Persian normalization factory is working.
*/
public class TestPersianNormalizationFilterFactory extends BaseTokenTestCase {
/**
   * Ensure the filter actually normalizes Persian text.
*/
public void testNormalization() throws Exception {
Reader reader = new StringReader("های");
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
PersianNormalizationFilterFactory factory = new PersianNormalizationFilterFactory();
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "هاي" });
}
}

View File

@ -17,16 +17,14 @@
package org.apache.solr.analysis;
import java.util.ArrayList;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.commons.codec.Encoder;
import org.apache.commons.codec.language.DoubleMetaphone;
import org.apache.commons.codec.language.Metaphone;
import org.apache.commons.codec.language.RefinedSoundex;
import org.apache.commons.codec.language.Soundex;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
/**
@ -61,50 +59,38 @@ public class TestPhoneticFilter extends BaseTokenTestCase {
assertFalse( ff.inject );
}
public void runner( Encoder enc, boolean inject ) throws Exception
{
String[] input = new String[] {
"aaa", "bbb", "ccc", "easgasg"
};
public void testAlgorithms() throws Exception {
assertAlgorithm("Metaphone", "true", "aaa bbb ccc easgasg",
new String[] { "A", "aaa", "B", "bbb", "KKK", "ccc", "ESKS", "easgasg" });
assertAlgorithm("Metaphone", "false", "aaa bbb ccc easgasg",
new String[] { "A", "B", "KKK", "ESKS" });
ArrayList<Token> stream = new ArrayList<Token>();
ArrayList<Token> output = new ArrayList<Token>();
for( String s : input ) {
stream.add( new Token( s, 0, s.length() ) );
assertAlgorithm("DoubleMetaphone", "true", "aaa bbb ccc easgasg",
new String[] { "A", "aaa", "PP", "bbb", "KK", "ccc", "ASKS", "easgasg" });
assertAlgorithm("DoubleMetaphone", "false", "aaa bbb ccc easgasg",
new String[] { "A", "PP", "KK", "ASKS" });
// phonetic token is added first in the current impl
output.add( new Token( enc.encode(s).toString(), 0, s.length() ) );
assertAlgorithm("Soundex", "true", "aaa bbb ccc easgasg",
new String[] { "A000", "aaa", "B000", "bbb", "C000", "ccc", "E220", "easgasg" });
assertAlgorithm("Soundex", "false", "aaa bbb ccc easgasg",
new String[] { "A000", "B000", "C000", "E220" });
// add the original if applicable
if( inject ) {
output.add( new Token( s, 0, s.length() ) );
}
assertAlgorithm("RefinedSoundex", "true", "aaa bbb ccc easgasg",
new String[] { "A0", "aaa", "B1", "bbb", "C3", "ccc", "E034034", "easgasg" });
assertAlgorithm("RefinedSoundex", "false", "aaa bbb ccc easgasg",
new String[] { "A0", "B1", "C3", "E034034" });
}
// System.out.println("###stream="+stream);
// System.out.println("###output="+output);
PhoneticFilter filter = new PhoneticFilter(
new IterTokenStream(stream.iterator()), enc, "text", inject );
Token got = new Token();
for( Token t : output ) {
got = filter.next(got);
// System.out.println("##### expect=" + t + " got="+got);
assertEquals( t.term(), got.term());
}
assertNull( filter.next() ); // no more tokens
}
public void testEncodes() throws Exception {
runner( new DoubleMetaphone(), true );
runner( new Metaphone(), true );
runner( new Soundex(), true );
runner( new RefinedSoundex(), true );
runner( new DoubleMetaphone(), false );
runner( new Metaphone(), false );
runner( new Soundex(), false );
runner( new RefinedSoundex(), false );
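  /**
   * Builds a PhoneticFilterFactory for the given encoder name and inject setting,
   * runs the input through it, and verifies the resulting tokens.
   */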
static void assertAlgorithm(String algName, String inject, String input,
String[] expected) throws Exception {
Tokenizer tokenizer = new WhitespaceTokenizer(
new StringReader(input));
Map<String,String> args = new HashMap<String,String>();
args.put("encoder", algName);
args.put("inject", inject);
PhoneticFilterFactory factory = new PhoneticFilterFactory();
factory.init(args);
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream, expected);
}
}

View File

@ -0,0 +1,41 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
/**
* Simple tests to ensure the Porter stem filter factory is working.
*/
public class TestPorterStemFilterFactory extends BaseTokenTestCase {
/**
* Ensure the filter actually stems text.
*/
public void testStemming() throws Exception {
Reader reader = new StringReader("dogs");
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
PorterStemFilterFactory factory = new PorterStemFilterFactory();
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "dog" });
}
}

View File

@ -20,10 +20,14 @@ package org.apache.solr.analysis;
import junit.framework.TestCase;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import java.util.Iterator;
import java.util.Arrays;
public class TestRemoveDuplicatesTokenFilter extends AnalysisTestCase {
public class TestRemoveDuplicatesTokenFilter extends BaseTokenTestCase {
public static Token tok(int pos, String t, int start, int end) {
Token tok = new Token(t,start,end);
@ -38,15 +42,27 @@ public class TestRemoveDuplicatesTokenFilter extends AnalysisTestCase {
throws Exception {
final Iterator<Token> toks = Arrays.asList(tokens).iterator();
final TokenStream ts = new RemoveDuplicatesTokenFilter
RemoveDuplicatesTokenFilterFactory factory = new RemoveDuplicatesTokenFilterFactory();
final TokenStream ts = factory.create
(new TokenStream() {
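        // minimal TokenStream that replays the fixed Token list through the attribute-based API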
public Token next() { return toks.hasNext() ? toks.next() : null; }
TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
public boolean incrementToken() {
if (toks.hasNext()) {
clearAttributes();
Token tok = toks.next();
termAtt.setTermBuffer(tok.term());
offsetAtt.setOffset(tok.startOffset(), tok.endOffset());
posIncAtt.setPositionIncrement(tok.getPositionIncrement());
return true;
} else {
return false;
}
}
});
final String actual = TestBufferedTokenStream.tsToString(ts);
assertEquals(expected + " != " + actual, expected, actual);
assertTokenStreamContents(ts, expected.split("\\s"));
}
public void testNoDups() throws Exception {

View File

@ -0,0 +1,41 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
/**
* Simple tests to ensure the Reverse string filter factory is working.
*/
public class TestReverseStringFilterFactory extends BaseTokenTestCase {
/**
* Ensure the filter actually reverses text.
*/
public void testReversing() throws Exception {
Reader reader = new StringReader("simple test");
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
ReverseStringFilterFactory factory = new ReverseStringFilterFactory();
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "elpmis", "tset" });
}
}

View File

@ -21,11 +21,9 @@ import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.queryParser.ParseException;
@ -53,57 +51,52 @@ public class TestReversedWildcardFilterFactory extends BaseTokenTestCase {
public void testReversedTokens() throws IOException {
String text = "simple text";
String expected1 = "simple \u0001elpmis text \u0001txet";
String expected2 = "\u0001elpmis \u0001txet";
args.put("withOriginal", "true");
factory.init(args);
TokenStream input = factory.create(new WhitespaceTokenizer(new StringReader(text)));
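    // with withOriginal=true the reversed token (marked with \u0001) comes first and the
    // original follows at the same position (increment 0)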
List<Token> realTokens = getTokens(input);
List<Token> expectedTokens = tokens(expected1);
// set positionIncrements in expected tokens
for (int i = 1; i < expectedTokens.size(); i += 2) {
expectedTokens.get(i).setPositionIncrement(0);
}
assertTokEqual(realTokens, expectedTokens);
assertTokenStreamContents(input,
new String[] { "\u0001elpmis", "simple", "\u0001txet", "text" },
new int[] { 1, 0, 1, 0 });
// now without original tokens
args.put("withOriginal", "false");
factory.init(args);
input = factory.create(new WhitespaceTokenizer(new StringReader(text)));
realTokens = getTokens(input);
expectedTokens = tokens(expected2);
assertTokEqual(realTokens, expectedTokens);
assertTokenStreamContents(input,
new String[] { "\u0001elpmis", "\u0001txet" },
new int[] { 1, 1 });
}
public void testIndexingAnalysis() throws Exception {
Analyzer a = schema.getAnalyzer();
String text = "one two three si\uD834\uDD1Ex";
String expected1 = "one \u0001eno two \u0001owt three \u0001eerht si\uD834\uDD1Ex \u0001x\uD834\uDD1Eis";
List<Token> expectedTokens1 = getTokens(
new WhitespaceTokenizer(new StringReader(expected1)));
// set positionIncrements and offsets in expected tokens
for (int i = 1; i < expectedTokens1.size(); i += 2) {
Token t = expectedTokens1.get(i);
t.setPositionIncrement(0);
}
String expected2 = "\u0001eno \u0001owt \u0001eerht \u0001x\uD834\uDD1Eis";
List<Token> expectedTokens2 = getTokens(
new WhitespaceTokenizer(new StringReader(expected2)));
String expected3 = "one two three si\uD834\uDD1Ex";
List<Token> expectedTokens3 = getTokens(
new WhitespaceTokenizer(new StringReader(expected3)));
// field one
TokenStream input = a.tokenStream("one", new StringReader(text));
List<Token> realTokens = getTokens(input);
assertTokEqual(realTokens, expectedTokens1);
assertTokenStreamContents(input,
new String[] { "\u0001eno", "one", "\u0001owt", "two",
"\u0001eerht", "three", "\u0001x\uD834\uDD1Eis", "si\uD834\uDD1Ex" },
new int[] { 0, 0, 4, 4, 8, 8, 14, 14 },
new int[] { 3, 3, 7, 7, 13, 13, 19, 19 },
new int[] { 1, 0, 1, 0, 1, 0, 1, 0 }
);
// field two
input = a.tokenStream("two", new StringReader(text));
realTokens = getTokens(input);
assertTokEqual(realTokens, expectedTokens2);
assertTokenStreamContents(input,
new String[] { "\u0001eno", "\u0001owt",
"\u0001eerht", "\u0001x\uD834\uDD1Eis" },
new int[] { 0, 4, 8, 14 },
new int[] { 3, 7, 13, 19 },
new int[] { 1, 1, 1, 1 }
);
// field three
input = a.tokenStream("three", new StringReader(text));
realTokens = getTokens(input);
assertTokEqual(realTokens, expectedTokens3);
assertTokenStreamContents(input,
new String[] { "one", "two", "three", "si\uD834\uDD1Ex" },
new int[] { 0, 4, 8, 14 },
new int[] { 3, 7, 13, 19 },
new int[] { 1, 1, 1, 1 }
);
}
public void testQueryParsing() throws IOException, ParseException {

View File

@ -0,0 +1,79 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
/**
* Simple tests to ensure the Russian filter factories are working.
*/
public class TestRussianFilters extends BaseTokenTestCase {
/**
* Test RussianLetterTokenizerFactory
*/
public void testTokenizer() throws Exception {
Reader reader = new StringReader("Вместе с тем о силе электромагнитной 100");
Map<String,String> args = new HashMap<String,String>();
RussianLetterTokenizerFactory factory = new RussianLetterTokenizerFactory();
factory.init(args);
Tokenizer stream = factory.create(reader);
assertTokenStreamContents(stream, new String[] {"Вместе", "с", "тем", "о",
"силе", "электромагнитной", "100"});
}
/**
* Test RussianLowerCaseFilterFactory
*/
public void testLowerCase() throws Exception {
Reader reader = new StringReader("Вместе с тем о силе электромагнитной 100");
Map<String,String> args = new HashMap<String,String>();
RussianLetterTokenizerFactory factory = new RussianLetterTokenizerFactory();
factory.init(args);
RussianLowerCaseFilterFactory filterFactory = new RussianLowerCaseFilterFactory();
filterFactory.init(args);
Tokenizer tokenizer = factory.create(reader);
TokenStream stream = filterFactory.create(tokenizer);
assertTokenStreamContents(stream, new String[] {"вместе", "с", "тем", "о",
"силе", "электромагнитной", "100"});
}
/**
* Test RussianStemFilterFactory
*/
public void testStemmer() throws Exception {
Reader reader = new StringReader("Вместе с тем о силе электромагнитной 100");
Map<String,String> args = new HashMap<String,String>();
RussianLetterTokenizerFactory factory = new RussianLetterTokenizerFactory();
factory.init(args);
RussianLowerCaseFilterFactory caseFactory = new RussianLowerCaseFilterFactory();
caseFactory.init(args);
RussianStemFilterFactory stemFactory = new RussianStemFilterFactory();
stemFactory.init(args);
Tokenizer tokenizer = factory.create(reader);
TokenStream stream = caseFactory.create(tokenizer);
stream = stemFactory.create(stream);
assertTokenStreamContents(stream, new String[] {"вмест", "с", "тем", "о",
"сил", "электромагнитн", "100"});
}
}

View File

@ -0,0 +1,73 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
/**
* Simple tests to ensure the Shingle filter factory works.
*/
public class TestShingleFilterFactory extends BaseTokenTestCase {
/**
* Test the defaults
*/
public void testDefaults() throws Exception {
Reader reader = new StringReader("this is a test");
Map<String,String> args = new HashMap<String,String>();
ShingleFilterFactory factory = new ShingleFilterFactory();
factory.init(args);
TokenStream stream = factory.create(new WhitespaceTokenizer(reader));
assertTokenStreamContents(stream, new String[] {"this", "this is", "is",
"is a", "a", "a test", "test"});
}
/**
* Test with unigrams disabled
*/
public void testNoUnigrams() throws Exception {
Reader reader = new StringReader("this is a test");
Map<String,String> args = new HashMap<String,String>();
args.put("outputUnigrams", "false");
ShingleFilterFactory factory = new ShingleFilterFactory();
factory.init(args);
TokenStream stream = factory.create(new WhitespaceTokenizer(reader));
assertTokenStreamContents(stream,
new String[] {"this is", "is a", "a test"});
}
/**
* Test with a higher max shingle size
*/
public void testMaxShingleSize() throws Exception {
Reader reader = new StringReader("this is a test");
Map<String,String> args = new HashMap<String,String>();
args.put("maxShingleSize", "3");
ShingleFilterFactory factory = new ShingleFilterFactory();
factory.init(args);
TokenStream stream = factory.create(new WhitespaceTokenizer(reader));
assertTokenStreamContents(stream,
new String[] {"this", "this is", "this is a", "is",
"is a", "is a test", "a", "a test", "test"});
}
}

View File

@ -0,0 +1,121 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
/**
 * Simple tests to ensure the standard Lucene factories are working.
*/
public class TestStandardFactories extends BaseTokenTestCase {
/**
* Test StandardTokenizerFactory
*/
public void testStandardTokenizer() throws Exception {
Reader reader = new StringReader("What's this thing do?");
StandardTokenizerFactory factory = new StandardTokenizerFactory();
Tokenizer stream = factory.create(reader);
assertTokenStreamContents(stream,
new String[] {"What's", "this", "thing", "do" });
}
/**
* Test StandardFilterFactory
*/
public void testStandardFilter() throws Exception {
Reader reader = new StringReader("What's this thing do?");
StandardTokenizerFactory factory = new StandardTokenizerFactory();
StandardFilterFactory filterFactory = new StandardFilterFactory();
Tokenizer tokenizer = factory.create(reader);
TokenStream stream = filterFactory.create(tokenizer);
assertTokenStreamContents(stream,
new String[] {"What", "this", "thing", "do"});
}
/**
* Test KeywordTokenizerFactory
*/
public void testKeywordTokenizer() throws Exception {
Reader reader = new StringReader("What's this thing do?");
KeywordTokenizerFactory factory = new KeywordTokenizerFactory();
Tokenizer stream = factory.create(reader);
assertTokenStreamContents(stream,
new String[] {"What's this thing do?"});
}
/**
* Test WhitespaceTokenizerFactory
*/
public void testWhitespaceTokenizer() throws Exception {
Reader reader = new StringReader("What's this thing do?");
WhitespaceTokenizerFactory factory = new WhitespaceTokenizerFactory();
Tokenizer stream = factory.create(reader);
assertTokenStreamContents(stream,
new String[] {"What's", "this", "thing", "do?"});
}
/**
* Test LetterTokenizerFactory
*/
public void testLetterTokenizer() throws Exception {
Reader reader = new StringReader("What's this thing do?");
LetterTokenizerFactory factory = new LetterTokenizerFactory();
Tokenizer stream = factory.create(reader);
assertTokenStreamContents(stream,
new String[] {"What", "s", "this", "thing", "do"});
}
/**
* Test LowerCaseTokenizerFactory
*/
public void testLowerCaseTokenizer() throws Exception {
Reader reader = new StringReader("What's this thing do?");
LowerCaseTokenizerFactory factory = new LowerCaseTokenizerFactory();
Tokenizer stream = factory.create(reader);
assertTokenStreamContents(stream,
new String[] {"what", "s", "this", "thing", "do"});
}
/**
* Ensure the ASCIIFoldingFilterFactory works
*/
public void testASCIIFolding() throws Exception {
Reader reader = new StringReader("Česká");
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
ASCIIFoldingFilterFactory factory = new ASCIIFoldingFilterFactory();
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "Ceska" });
}
/**
* Ensure the ISOLatin1AccentFilterFactory works
 * (it folds some accents, but not the uppercase hacek)
*/
public void testISOLatin1Folding() throws Exception {
Reader reader = new StringReader("Česká");
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
ISOLatin1AccentFilterFactory factory = new ISOLatin1AccentFilterFactory();
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "Česka" });
}
}

View File

@ -19,11 +19,20 @@ package org.apache.solr.analysis;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.Collection;
import java.util.List;
/**
@ -31,33 +40,41 @@ import java.util.List;
*/
public class TestSynonymFilter extends BaseTokenTestCase {
public List strings(String str) {
static List<String> strings(String str) {
String[] arr = str.split(" ");
return Arrays.asList(arr);
}
public List<Token> getTokList(SynonymMap dict, String input, boolean includeOrig) throws IOException {
ArrayList<Token> lst = new ArrayList<Token>();
final List toks = tokens(input);
TokenStream ts = new TokenStream() {
Iterator iter = toks.iterator();
@Override
public Token next() throws IOException {
return iter.hasNext() ? (Token)iter.next() : null;
}
};
SynonymFilter sf = new SynonymFilter(ts, dict);
Token target = new Token(); // test with token reuse
while(true) {
Token t = sf.next(target);
if (t==null) return lst;
lst.add((Token)t.clone());
}
static void assertTokenizesTo(SynonymMap dict, String input,
String expected[]) throws IOException {
Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader(input));
SynonymFilter stream = new SynonymFilter(tokenizer, dict);
assertTokenStreamContents(stream, expected);
}
static void assertTokenizesTo(SynonymMap dict, String input,
String expected[], int posIncs[]) throws IOException {
Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader(input));
SynonymFilter stream = new SynonymFilter(tokenizer, dict);
assertTokenStreamContents(stream, expected, posIncs);
}
static void assertTokenizesTo(SynonymMap dict, List<Token> input,
String expected[], int posIncs[])
throws IOException {
TokenStream tokenizer = new IterTokenStream(input);
SynonymFilter stream = new SynonymFilter(tokenizer, dict);
assertTokenStreamContents(stream, expected, posIncs);
}
static void assertTokenizesTo(SynonymMap dict, List<Token> input,
String expected[], int startOffsets[], int endOffsets[], int posIncs[])
throws IOException {
TokenStream tokenizer = new IterTokenStream(input);
SynonymFilter stream = new SynonymFilter(tokenizer, dict);
assertTokenStreamContents(stream, expected, startOffsets, endOffsets,
posIncs);
}
public void testMatching() throws IOException {
SynonymMap map = new SynonymMap();
@ -71,28 +88,29 @@ public class TestSynonymFilter extends BaseTokenTestCase {
map.add(strings("z x c v"), tokens("zxcv"), orig, merge);
map.add(strings("x c"), tokens("xc"), orig, merge);
// System.out.println(map);
// System.out.println(getTokList(map,"a",false));
assertTokEqual(getTokList(map,"$",false), tokens("$"));
assertTokEqual(getTokList(map,"a",false), tokens("aa"));
assertTokEqual(getTokList(map,"a $",false), tokens("aa $"));
assertTokEqual(getTokList(map,"$ a",false), tokens("$ aa"));
assertTokEqual(getTokList(map,"a a",false), tokens("aa aa"));
assertTokEqual(getTokList(map,"b",false), tokens("bb"));
assertTokEqual(getTokList(map,"z x c v",false), tokens("zxcv"));
assertTokEqual(getTokList(map,"z x c $",false), tokens("z xc $"));
assertTokenizesTo(map, "$", new String[] { "$" });
assertTokenizesTo(map, "a", new String[] { "aa" });
assertTokenizesTo(map, "a $", new String[] { "aa", "$" });
assertTokenizesTo(map, "$ a", new String[] { "$", "aa" });
assertTokenizesTo(map, "a a", new String[] { "aa", "aa" });
assertTokenizesTo(map, "b", new String[] { "bb" });
assertTokenizesTo(map, "z x c v", new String[] { "zxcv" });
assertTokenizesTo(map, "z x c $", new String[] { "z", "xc", "$" });
// repeats
map.add(strings("a b"), tokens("ab"), orig, merge);
map.add(strings("a b"), tokens("ab"), orig, merge);
assertTokEqual(getTokList(map,"a b",false), tokens("ab"));
// FIXME: the test below was intended to be { "ab" }
assertTokenizesTo(map, "a b", new String[] { "ab", "ab", "ab" });
// check for lack of recursion
map.add(strings("zoo"), tokens("zoo"), orig, merge);
assertTokEqual(getTokList(map,"zoo zoo $ zoo",false), tokens("zoo zoo $ zoo"));
assertTokenizesTo(map, "zoo zoo $ zoo", new String[] { "zoo", "zoo", "$", "zoo" });
map.add(strings("zoo"), tokens("zoo zoo"), orig, merge);
assertTokEqual(getTokList(map,"zoo zoo $ zoo",false), tokens("zoo zoo zoo zoo $ zoo zoo"));
// FIXME: the below test intended to be { "zoo", "zoo", "zoo", "zoo", "$", "zoo", "zoo" }
// maybe this was just a typo in the old test????
assertTokenizesTo(map, "zoo zoo $ zoo", new String[] { "zoo", "zoo", "zoo", "zoo", "zoo", "zoo", "$", "zoo", "zoo", "zoo" });
}
public void testIncludeOrig() throws IOException {
@ -107,25 +125,48 @@ public class TestSynonymFilter extends BaseTokenTestCase {
map.add(strings("z x c v"), tokens("zxcv"), orig, merge);
map.add(strings("x c"), tokens("xc"), orig, merge);
// System.out.println(map);
// System.out.println(getTokList(map,"a",false));
assertTokEqual(getTokList(map,"$",false), tokens("$"));
assertTokEqual(getTokList(map,"a",false), tokens("a/aa"));
assertTokEqual(getTokList(map,"a",false), tokens("a/aa"));
assertTokEqual(getTokList(map,"$ a",false), tokens("$ a/aa"));
assertTokEqual(getTokList(map,"a $",false), tokens("a/aa $"));
assertTokEqual(getTokList(map,"$ a !",false), tokens("$ a/aa !"));
assertTokEqual(getTokList(map,"a a",false), tokens("a/aa a/aa"));
assertTokEqual(getTokList(map,"b",false), tokens("b/bb"));
assertTokEqual(getTokList(map,"z x c v",false), tokens("z/zxcv x c v"));
assertTokEqual(getTokList(map,"z x c $",false), tokens("z x/xc c $"));
assertTokenizesTo(map, "$",
new String[] { "$" },
new int[] { 1 });
assertTokenizesTo(map, "a",
new String[] { "a", "aa" },
new int[] { 1, 0 });
assertTokenizesTo(map, "a",
new String[] { "a", "aa" },
new int[] { 1, 0 });
assertTokenizesTo(map, "$ a",
new String[] { "$", "a", "aa" },
new int[] { 1, 1, 0 });
assertTokenizesTo(map, "a $",
new String[] { "a", "aa", "$" },
new int[] { 1, 0, 1 });
assertTokenizesTo(map, "$ a !",
new String[] { "$", "a", "aa", "!" },
new int[] { 1, 1, 0, 1 });
assertTokenizesTo(map, "a a",
new String[] { "a", "aa", "a", "aa" },
new int[] { 1, 0, 1, 0 });
assertTokenizesTo(map, "b",
new String[] { "b", "bb" },
new int[] { 1, 0 });
assertTokenizesTo(map, "z x c v",
new String[] { "z", "zxcv", "x", "c", "v" },
new int[] { 1, 0, 1, 1, 1 });
assertTokenizesTo(map, "z x c $",
new String[] { "z", "x", "xc", "c", "$" },
new int[] { 1, 1, 0, 1, 1 });
// check for lack of recursion
map.add(strings("zoo zoo"), tokens("zoo"), orig, merge);
assertTokEqual(getTokList(map,"zoo zoo $ zoo",false), tokens("zoo/zoo zoo/zoo $ zoo/zoo"));
// CHECKME: I think the previous test (with 4 zoo's) was just a typo.
assertTokenizesTo(map, "zoo zoo $ zoo",
new String[] { "zoo", "zoo", "zoo", "$", "zoo" },
new int[] { 1, 0, 1, 1, 1 });
map.add(strings("zoo"), tokens("zoo zoo"), orig, merge);
assertTokEqual(getTokList(map,"zoo zoo $ zoo",false), tokens("zoo/zoo zoo $ zoo/zoo zoo"));
assertTokenizesTo(map, "zoo zoo $ zoo",
new String[] { "zoo", "zoo", "zoo", "$", "zoo", "zoo", "zoo" },
new int[] { 1, 0, 1, 1, 1, 0, 1 });
}
@ -136,25 +177,35 @@ public class TestSynonymFilter extends BaseTokenTestCase {
boolean merge = true;
map.add(strings("a"), tokens("a5,5"), orig, merge);
map.add(strings("a"), tokens("a3,3"), orig, merge);
// System.out.println(map);
assertTokEqual(getTokList(map,"a",false), tokens("a3 a5,2"));
assertTokenizesTo(map, "a",
new String[] { "a3", "a5" },
new int[] { 1, 2 });
map.add(strings("b"), tokens("b3,3"), orig, merge);
map.add(strings("b"), tokens("b5,5"), orig, merge);
//System.out.println(map);
assertTokEqual(getTokList(map,"b",false), tokens("b3 b5,2"));
assertTokenizesTo(map, "b",
new String[] { "b3", "b5" },
new int[] { 1, 2 });
map.add(strings("a"), tokens("A3,3"), orig, merge);
map.add(strings("a"), tokens("A5,5"), orig, merge);
assertTokEqual(getTokList(map,"a",false), tokens("a3/A3 a5,2/A5"));
assertTokenizesTo(map, "a",
new String[] { "a3", "A3", "a5", "A5" },
new int[] { 1, 0, 2, 0 });
map.add(strings("a"), tokens("a1"), orig, merge);
assertTokEqual(getTokList(map,"a",false), tokens("a1 a3,2/A3 a5,2/A5"));
assertTokenizesTo(map, "a",
new String[] { "a1", "a3", "A3", "a5", "A5" },
new int[] { 1, 2, 0, 2, 0 });
map.add(strings("a"), tokens("a2,2"), orig, merge);
map.add(strings("a"), tokens("a4,4 a6,2"), orig, merge);
assertTokEqual(getTokList(map,"a",false), tokens("a1 a2 a3/A3 a4 a5/A5 a6"));
assertTokenizesTo(map, "a",
new String[] { "a1", "a2", "a3", "A3", "a4", "a5", "A5", "a6" },
new int[] { 1, 1, 1, 0, 1, 1, 0, 1 });
}
@ -167,41 +218,56 @@ public class TestSynonymFilter extends BaseTokenTestCase {
map.add(strings("qwe"), tokens("xx"), orig, merge);
map.add(strings("qwe"), tokens("yy"), orig, merge);
map.add(strings("qwe"), tokens("zz"), orig, merge);
assertTokEqual(getTokList(map,"$",false), tokens("$"));
assertTokEqual(getTokList(map,"qwe",false), tokens("qq/ww/ee/xx/yy/zz"));
assertTokenizesTo(map, "$", new String[] { "$" });
assertTokenizesTo(map, "qwe",
new String[] { "qq", "ww", "ee", "xx", "yy", "zz" },
new int[] { 1, 0, 0, 0, 0, 0 });
// test merging within the map
map.add(strings("a"), tokens("a5,5 a8,3 a10,2"), orig, merge);
map.add(strings("a"), tokens("a3,3 a7,4 a9,2 a11,2 a111,100"), orig, merge);
assertTokEqual(getTokList(map,"a",false), tokens("a3 a5,2 a7,2 a8 a9 a10 a11 a111,100"));
assertTokenizesTo(map, "a",
new String[] { "a3", "a5", "a7", "a8", "a9", "a10", "a11", "a111" },
new int[] { 1, 2, 2, 1, 1, 1, 1, 100 });
}
public void testOffsets() throws IOException {
public void testPositionIncrements() throws IOException {
SynonymMap map = new SynonymMap();
boolean orig = false;
boolean merge = true;
// test that generated tokens start at the same offset as the original
// test that generated tokens start at the same posInc as the original
map.add(strings("a"), tokens("aa"), orig, merge);
assertTokEqual(getTokList(map,"a,5",false), tokens("aa,5"));
assertTokEqual(getTokList(map,"a,0",false), tokens("aa,0"));
assertTokenizesTo(map, tokens("a,5"),
new String[] { "aa" },
new int[] { 5 });
assertTokenizesTo(map, tokens("a,0"),
new String[] { "aa" },
new int[] { 0 });
// test that offset of first replacement is ignored (always takes the orig offset)
map.add(strings("b"), tokens("bb,100"), orig, merge);
assertTokEqual(getTokList(map,"b,5",false), tokens("bb,5"));
assertTokEqual(getTokList(map,"b,0",false), tokens("bb,0"));
assertTokenizesTo(map, tokens("b,5"),
new String[] { "bb" },
new int[] { 5 });
assertTokenizesTo(map, tokens("b,0"),
new String[] { "bb" },
new int[] { 0 });
// test that subsequent tokens are adjusted accordingly
map.add(strings("c"), tokens("cc,100 c2,2"), orig, merge);
assertTokEqual(getTokList(map,"c,5",false), tokens("cc,5 c2,2"));
assertTokEqual(getTokList(map,"c,0",false), tokens("cc,0 c2,2"));
assertTokenizesTo(map, tokens("c,5"),
new String[] { "cc", "c2" },
new int[] { 5, 2 });
assertTokenizesTo(map, tokens("c,0"),
new String[] { "cc", "c2" },
new int[] { 0, 2 });
}
public void testOffsetsWithOrig() throws IOException {
public void testPositionIncrementsWithOrig() throws IOException {
SynonymMap map = new SynonymMap();
boolean orig = true;
@ -209,18 +275,30 @@ public class TestSynonymFilter extends BaseTokenTestCase {
// test that generated tokens start at the same offset as the original
map.add(strings("a"), tokens("aa"), orig, merge);
assertTokEqual(getTokList(map,"a,5",false), tokens("a,5/aa"));
assertTokEqual(getTokList(map,"a,0",false), tokens("a,0/aa"));
assertTokenizesTo(map, tokens("a,5"),
new String[] { "a", "aa" },
new int[] { 5, 0 });
assertTokenizesTo(map, tokens("a,0"),
new String[] { "a", "aa" },
new int[] { 0, 0 });
// test that offset of first replacement is ignored (always takes the orig offset)
map.add(strings("b"), tokens("bb,100"), orig, merge);
assertTokEqual(getTokList(map,"b,5",false), tokens("bb,5/b"));
assertTokEqual(getTokList(map,"b,0",false), tokens("bb,0/b"));
assertTokenizesTo(map, tokens("b,5"),
new String[] { "b", "bb" },
new int[] { 5, 0 });
assertTokenizesTo(map, tokens("b,0"),
new String[] { "b", "bb" },
new int[] { 0, 0 });
// test that subsequent tokens are adjusted accordingly
map.add(strings("c"), tokens("cc,100 c2,2"), orig, merge);
assertTokEqual(getTokList(map,"c,5",false), tokens("cc,5/c c2,2"));
assertTokEqual(getTokList(map,"c,0",false), tokens("cc,0/c c2,2"));
assertTokenizesTo(map, tokens("c,5"),
new String[] { "c", "cc", "c2" },
new int[] { 5, 0, 2 });
assertTokenizesTo(map, tokens("c,0"),
new String[] { "c", "cc", "c2" },
new int[] { 0, 0, 2 });
}
@ -238,10 +316,101 @@ public class TestSynonymFilter extends BaseTokenTestCase {
map.add(strings("a a"), tokens("b"), orig, merge);
map.add(strings("x"), tokens("y"), orig, merge);
System.out.println(getTokList(map,"a,1,0,1 a,1,2,3 x,1,4,5",false));
// "a a x" => "b y"
assertTokEqualOff(getTokList(map,"a,1,0,1 a,1,2,3 x,1,4,5",false), tokens("b,1,0,3 y,1,4,5"));
assertTokenizesTo(map, tokens("a,1,0,1 a,1,2,3 x,1,4,5"),
new String[] { "b", "y" },
new int[] { 0, 4 },
new int[] { 3, 5 },
new int[] { 1, 1 });
}
/***
* Return a list of tokens according to a test string format:
* a b c => returns List<Token> [a,b,c]
* a/b => tokens a and b share the same spot (b.positionIncrement=0)
* a,3/b/c => a,b,c all share same position (a.positionIncrement=3, b.positionIncrement=0, c.positionIncrement=0)
* a,1,10,11 => "a" with positionIncrement=1, startOffset=10, endOffset=11
 * @deprecated does not support the attributes API
*/
private List<Token> tokens(String str) {
String[] arr = str.split(" ");
List<Token> result = new ArrayList<Token>();
for (int i=0; i<arr.length; i++) {
String[] toks = arr[i].split("/");
String[] params = toks[0].split(",");
int posInc;
int start;
int end;
if (params.length > 1) {
posInc = Integer.parseInt(params[1]);
} else {
posInc = 1;
}
if (params.length > 2) {
start = Integer.parseInt(params[2]);
} else {
start = 0;
}
if (params.length > 3) {
end = Integer.parseInt(params[3]);
} else {
end = start + params[0].length();
}
Token t = new Token(params[0],start,end,"TEST");
t.setPositionIncrement(posInc);
result.add(t);
for (int j=1; j<toks.length; j++) {
t = new Token(toks[j],0,0,"TEST");
t.setPositionIncrement(0);
result.add(t);
}
}
return result;
}
/**
* @deprecated does not support custom attributes
*/
private static class IterTokenStream extends TokenStream {
final Token tokens[];
int index = 0;
TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
FlagsAttribute flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class);
TypeAttribute typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
PayloadAttribute payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
public IterTokenStream(Token... tokens) {
super();
this.tokens = tokens;
}
public IterTokenStream(Collection<Token> tokens) {
this(tokens.toArray(new Token[tokens.size()]));
}
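/** Copy the state of the next pre-built Token into this stream's attributes. */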
public boolean incrementToken() throws IOException {
if (index >= tokens.length)
return false;
else {
clearAttributes();
Token token = tokens[index++];
termAtt.setTermBuffer(token.term());
offsetAtt.setOffset(token.startOffset(), token.endOffset());
posIncAtt.setPositionIncrement(token.getPositionIncrement());
flagsAtt.setFlags(token.getFlags());
typeAtt.setType(token.type());
payloadAtt.setPayload(token.getPayload());
return true;
}
}
}
}

View File

@ -0,0 +1,42 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
/**
* Simple tests to ensure the Thai word filter factory is working.
*/
public class TestThaiWordFilterFactory extends BaseTokenTestCase {
/**
* Ensure the filter actually decomposes text.
*/
public void testWordBreak() throws Exception {
Reader reader = new StringReader("การที่ได้ต้องแสดงว่างานดี");
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
ThaiWordFilterFactory factory = new ThaiWordFilterFactory();
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] {"การ", "ที่", "ได้",
"ต้อง", "แสดง", "ว่า", "งาน", "ดี"});
}
}

View File

@ -17,12 +17,19 @@
package org.apache.solr.analysis;
import java.io.IOException;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import java.util.List;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
/**
* @version $Id:$
@ -35,46 +42,75 @@ public class TestTrimFilter extends BaseTokenTestCase {
char[] ccc = "cCc".toCharArray();
char[] whitespace = " ".toCharArray();
char[] empty = "".toCharArray();
TokenStream ts = new TrimFilter
(new IterTokenStream(new Token(a, 0, a.length, 1, 5),
TrimFilterFactory factory = new TrimFilterFactory();
Map<String,String> args = new HashMap<String,String>();
args.put("updateOffsets", "false");
factory.init(args);
TokenStream ts = factory.create(new IterTokenStream(new Token(a, 0, a.length, 1, 5),
new Token(b, 0, b.length, 6, 10),
new Token(ccc, 0, ccc.length, 11, 15),
new Token(whitespace, 0, whitespace.length, 16, 20),
new Token(empty, 0, empty.length, 21, 21)), false);
new Token(empty, 0, empty.length, 21, 21)));
TermAttribute token;
assertTrue(ts.incrementToken());
token = (TermAttribute) ts.getAttribute(TermAttribute.class);
assertEquals("a", new String(token.termBuffer(), 0, token.termLength()));
assertTrue(ts.incrementToken());
assertEquals("b", new String(token.termBuffer(), 0, token.termLength()));
assertTrue(ts.incrementToken());
assertEquals("cCc", new String(token.termBuffer(), 0, token.termLength()));
assertTrue(ts.incrementToken());
assertEquals("", new String(token.termBuffer(), 0, token.termLength()));
assertTrue(ts.incrementToken());
assertEquals("", new String(token.termBuffer(), 0, token.termLength()));
assertFalse(ts.incrementToken());
assertTokenStreamContents(ts, new String[] { "a", "b", "cCc", "", ""});
a = " a".toCharArray();
b = "b ".toCharArray();
ccc = " c ".toCharArray();
whitespace = " ".toCharArray();
ts = new TrimFilter(new IterTokenStream(
factory = new TrimFilterFactory();
args = new HashMap<String,String>();
args.put("updateOffsets", "true");
factory.init(args);
ts = factory.create(new IterTokenStream(
new Token(a, 0, a.length, 0, 2),
new Token(b, 0, b.length, 0, 2),
new Token(ccc, 0, ccc.length, 0, 3),
new Token(whitespace, 0, whitespace.length, 0, 3)), true);
new Token(whitespace, 0, whitespace.length, 0, 3)));
List<Token> expect = tokens("a,1,1,2 b,1,0,1 c,1,1,2 ,1,3,3");
List<Token> real = getTokens(ts);
for (Token t : expect) {
System.out.println("TEST:" + t);
}
for (Token t : real) {
System.out.println("REAL:" + t);
}
assertTokEqualOff(expect, real);
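// with updateOffsets=true: terms, start offsets, end offsets, position increments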
assertTokenStreamContents(ts,
new String[] { "a", "b", "c", "" },
new int[] { 1, 0, 1, 3 },
new int[] { 2, 1, 2, 3 },
new int[] { 1, 1, 1, 1 });
}
/**
* @deprecated does not support custom attributes
*/
private static class IterTokenStream extends TokenStream {
final Token tokens[];
int index = 0;
TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
FlagsAttribute flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class);
TypeAttribute typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
PayloadAttribute payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
public IterTokenStream(Token... tokens) {
super();
this.tokens = tokens;
}
public IterTokenStream(Collection<Token> tokens) {
this(tokens.toArray(new Token[tokens.size()]));
}
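/** Expose each stored Token through the new attribute-based API. */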
public boolean incrementToken() throws IOException {
if (index >= tokens.length)
return false;
else {
clearAttributes();
Token token = tokens[index++];
termAtt.setTermBuffer(token.term());
offsetAtt.setOffset(token.startOffset(), token.endOffset());
posIncAtt.setPositionIncrement(token.getPositionIncrement());
flagsAtt.setFlags(token.getFlags());
typeAtt.setType(token.type());
payloadAtt.setPayload(token.getPayload());
return true;
}
}
}
}

View File

@ -17,14 +17,14 @@
package org.apache.solr.analysis;
import org.apache.solr.util.AbstractSolrTestCase;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.KeywordTokenizer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.miscellaneous.SingleTokenTokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
@ -37,7 +37,7 @@ import java.util.HashSet;
/**
* New WordDelimiterFilter tests... most of the tests are in ConvertedLegacyTest
*/
public class TestWordDelimiterFilter extends AbstractSolrTestCase {
public class TestWordDelimiterFilter extends BaseTokenTestCase {
public String getSchemaFile() { return "solr/conf/schema.xml"; }
public String getSolrConfigFile() { return "solr/conf/solrconfig.xml"; }
@ -144,148 +144,74 @@ public class TestWordDelimiterFilter extends AbstractSolrTestCase {
// test that subwords and catenated subwords have
// the correct offsets.
WordDelimiterFilter wdf = new WordDelimiterFilter(
new TokenStream() {
Token t;
public Token next() throws IOException {
if (t!=null) return null;
t = new Token("foo-bar", 5, 12); // actual
return t;
}
},
new SingleTokenTokenStream(new Token("foo-bar", 5, 12)),
1,1,0,0,1,1,0);
int i=0;
for(Token t; (t=wdf.next())!=null;) {
String termText = new String(t.termBuffer(), 0, t.termLength());
if (termText.equals("foo")) {
assertEquals(5, t.startOffset());
assertEquals(8, t.endOffset());
i++;
}
if (termText.equals("bar")) {
assertEquals(9, t.startOffset());
assertEquals(12, t.endOffset());
i++;
}
if (termText.equals("foobar")) {
assertEquals(5, t.startOffset());
assertEquals(12, t.endOffset());
i++;
}
}
assertEquals(3,i); // make sure all 3 tokens were generated
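// expected subwords and the catenated word, with their start and end offsets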
assertTokenStreamContents(wdf,
new String[] { "foo", "bar", "foobar" },
new int[] { 5, 9, 5 },
new int[] { 8, 12, 12 });
// test that if splitting or catenating a synonym, that the offsets
// are not altered (they would be incorrect).
wdf = new WordDelimiterFilter(
new TokenStream() {
Token t;
public Token next() throws IOException {
if (t!=null) return null;
t = new Token("foo-bar", 5, 6); // a synonym
return t;
}
},
new SingleTokenTokenStream(new Token("foo-bar", 5, 6)),
1,1,0,0,1,1,0);
for(Token t; (t=wdf.next())!=null;) {
assertEquals(5, t.startOffset());
assertEquals(6, t.endOffset());
}
assertTokenStreamContents(wdf,
new String[] { "foo", "bar", "foobar" },
new int[] { 5, 5, 5 },
new int[] { 6, 6, 6 });
}
public void testOffsetChange() throws Exception
{
WordDelimiterFilter wdf = new WordDelimiterFilter(
new TokenStream() {
Token t;
public Token next() {
if (t != null) return null;
t = new Token("übelkeit)", 7, 16);
return t;
}
},
new SingleTokenTokenStream(new Token("übelkeit)", 7, 16)),
1,1,0,0,1,1,0
);
Token t = wdf.next();
assertNotNull(t);
assertEquals("übelkeit", t.term());
assertEquals(7, t.startOffset());
assertEquals(15, t.endOffset());
assertTokenStreamContents(wdf,
new String[] { "übelkeit" },
new int[] { 7 },
new int[] { 15 });
}
public void testOffsetChange2() throws Exception
{
WordDelimiterFilter wdf = new WordDelimiterFilter(
new TokenStream() {
Token t;
public Token next() {
if (t != null) return null;
t = new Token("(übelkeit", 7, 17);
return t;
}
},
new SingleTokenTokenStream(new Token("(übelkeit", 7, 17)),
1,1,0,0,1,1,0
);
Token t = wdf.next();
assertNotNull(t);
assertEquals("übelkeit", t.term());
assertEquals(8, t.startOffset());
assertEquals(17, t.endOffset());
assertTokenStreamContents(wdf,
new String[] { "übelkeit" },
new int[] { 8 },
new int[] { 17 });
}
public void testOffsetChange3() throws Exception
{
WordDelimiterFilter wdf = new WordDelimiterFilter(
new TokenStream() {
Token t;
public Token next() {
if (t != null) return null;
t = new Token("(übelkeit", 7, 16);
return t;
}
},
new SingleTokenTokenStream(new Token("(übelkeit", 7, 16)),
1,1,0,0,1,1,0
);
Token t = wdf.next();
assertNotNull(t);
assertEquals("übelkeit", t.term());
assertEquals(8, t.startOffset());
assertEquals(16, t.endOffset());
assertTokenStreamContents(wdf,
new String[] { "übelkeit" },
new int[] { 8 },
new int[] { 16 });
}
public void testOffsetChange4() throws Exception
{
WordDelimiterFilter wdf = new WordDelimiterFilter(
new TokenStream() {
private Token t;
public Token next() {
if (t != null) return null;
t = new Token("(foo,bar)", 7, 16);
return t;
}
},
new SingleTokenTokenStream(new Token("(foo,bar)", 7, 16)),
1,1,0,0,1,1,0
);
Token t = wdf.next();
assertNotNull(t);
assertEquals("foo", t.term());
assertEquals(8, t.startOffset());
assertEquals(11, t.endOffset());
t = wdf.next();
assertNotNull(t);
assertEquals("bar", t.term());
assertEquals(12, t.startOffset());
assertEquals(15, t.endOffset());
assertTokenStreamContents(wdf,
new String[] { "foo", "bar", "foobar"},
new int[] { 8, 12, 8 },
new int[] { 11, 15, 15 });
}
public void testAlphaNumericWords(){
@ -338,24 +264,10 @@ public class TestWordDelimiterFilter extends AbstractSolrTestCase {
public void doSplit(final String input, String... output) throws Exception {
WordDelimiterFilter wdf = new WordDelimiterFilter(new TokenStream() {
boolean done=false;
@Override
public Token next() throws IOException {
if (done) return null;
done = true;
return new Token(input,0,input.length());
}
}
,1,1,0,0,0
);
WordDelimiterFilter wdf = new WordDelimiterFilter(new KeywordTokenizer(
new StringReader(input)), 1, 1, 0, 0, 0);
for(String expected : output) {
Token t = wdf.next();
assertEquals(expected, t.term());
}
assertEquals(null, wdf.next());
assertTokenStreamContents(wdf, output);
}
public void testSplits() throws Exception {
@ -365,29 +277,38 @@ public class TestWordDelimiterFilter extends AbstractSolrTestCase {
// non-space marking symbol shouldn't cause split
// this is an example in Thai
doSplit("\u0e1a\u0e49\u0e32\u0e19","\u0e1a\u0e49\u0e32\u0e19");
// possessive followed by delimiter
doSplit("test's'", "test");
// some russian upper and lowercase
doSplit("Роберт", "Роберт");
// now cause a split (russian camelCase)
doSplit("РобЕрт", "Роб", "Ерт");
// a composed titlecase character, don't split
doSplit("aDžungla", "aDžungla");
// a modifier letter, don't split
doSplit("ســـــــــــــــــلام", "ســـــــــــــــــلام");
// enclosing mark, don't split
doSplit("۞test", "۞test");
// combining spacing mark (the virama), don't split
doSplit("हिन्दी", "हिन्दी");
// don't split non-ascii digits
doSplit("١٢٣٤", "١٢٣٤");
// don't split supplementaries into unpaired surrogates
doSplit("𠀀𠀀", "𠀀𠀀");
}
public void doSplitPossessive(int stemPossessive, final String input, final String... output) throws Exception {
WordDelimiterFilter wdf = new WordDelimiterFilter(new TokenStream() {
boolean done=false;
@Override
public Token next() throws IOException {
if (done) return null;
done = true;
return new Token(input,0,input.length());
}
}
,1,1,0,0,0,1,0,1,stemPossessive,null
);
WordDelimiterFilter wdf = new WordDelimiterFilter(new KeywordTokenizer(
new StringReader(input)), 1,1,0,0,0,1,0,1,stemPossessive, null);
for(String expected : output) {
Token t = wdf.next();
assertEquals(expected, t.term());
}
assertEquals(null, wdf.next());
assertTokenStreamContents(wdf, output);
}
/*
@ -485,25 +406,4 @@ public class TestWordDelimiterFilter extends AbstractSolrTestCase {
new int[] { 6, 14, 19 },
new int[] { 1, 11, 1 });
}
private void assertAnalyzesTo(Analyzer a, String input, String[] output,
int startOffsets[], int endOffsets[], int posIncs[]) throws Exception {
TokenStream ts = a.tokenStream("dummy", new StringReader(input));
TermAttribute termAtt = (TermAttribute) ts
.getAttribute(TermAttribute.class);
OffsetAttribute offsetAtt = (OffsetAttribute) ts
.getAttribute(OffsetAttribute.class);
PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) ts
.getAttribute(PositionIncrementAttribute.class);
for (int i = 0; i < output.length; i++) {
assertTrue(ts.incrementToken());
assertEquals(output[i], termAtt.term());
assertEquals(startOffsets[i], offsetAtt.startOffset());
assertEquals(endOffsets[i], offsetAtt.endOffset());
assertEquals(posIncs[i], posIncAtt.getPositionIncrement());
}
assertFalse(ts.incrementToken());
ts.close();
}
}

View File

@ -0,0 +1,19 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# A set of words for testing the DictionaryCompound factory
soft
ball
team

View File

@ -0,0 +1,24 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# A set of articles for testing the French Elision filter.
# Requiring a text file is a bit weird here...
l
m
t
qu
n
s
j