SOLR-1674: Improve analysis tests and cut over to new TokenStream API

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@892821 13f79535-47bb-0310-9956-ffa450edef68
Mark Robert Miller 2009-12-21 13:53:50 +00:00
parent 5be5c31bb0
commit b105beef66
47 changed files with 2418 additions and 912 deletions

CHANGES.txt

@@ -174,6 +174,9 @@ Other Changes
 * SOLR-1662: Added Javadocs in BufferedTokenStream and fixed incorrect cloning
   in TestBufferedTokenStream (Robert Muir, Uwe Schindler via shalin)
 
+* SOLR-1674: Improve analysis tests and cut over to new TokenStream API.
+  (Robert Muir via Mark Miller)
+
 Build
 ----------------------
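Background for the entry above, as a rough sketch rather than code from this commit: the older tests pulled Token objects out of a stream with TokenStream.next(), while the attribute-based API these tests now target asks the stream for attribute instances once and then advances with incrementToken(). The class name below is illustrative only.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

// Illustrative sketch (not part of this commit) of the new consumption pattern.
public class NewApiExample {
  public static void main(String[] args) throws IOException {
    TokenStream ts = new WhitespaceTokenizer(new StringReader("to be or not to be"));
    // Attributes are requested once, up front; incrementToken() refills them in place.
    TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
    OffsetAttribute offsetAtt = (OffsetAttribute) ts.addAttribute(OffsetAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {  // replaces the old "Token t = ts.next()" loop
      System.out.println(termAtt.term()
          + " [" + offsetAtt.startOffset() + "," + offsetAtt.endOffset() + "]");
    }
    ts.end();
    ts.close();
  }
}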

AnalysisTestCase.java

@@ -17,19 +17,21 @@
 package org.apache.solr.analysis;
 
 import org.apache.solr.core.SolrConfig;
+import org.apache.solr.util.AbstractSolrTestCase;
 import org.apache.solr.util.TestHarness;
 
 import junit.framework.TestCase;
 
 /**
  *
  */
-abstract public class AnalysisTestCase extends TestCase {
+abstract public class AnalysisTestCase extends AbstractSolrTestCase {
   protected SolrConfig solrConfig;
   /** Creates a new instance of AnalysisTestCase */
   public AnalysisTestCase() {
   }
 
   public String getSolrConfigFile() { return "solrconfig.xml"; }
+  public String getSchemaFile() { return "schema.xml"; }
 
   public void setUp() throws Exception {
     // if you override setUp or tearDown, you better call

BaseTokenTestCase.java

@@ -18,174 +18,134 @@
 package org.apache.solr.analysis;
 
 import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Iterator;
-import java.util.List;
+import java.io.StringReader;
 
-import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
 
-import junit.framework.TestCase;
-
 /**
  * General token testing helper functions
  */
 public abstract class BaseTokenTestCase extends AnalysisTestCase
 {
-  public static String tsToString(TokenStream in) throws IOException {
-    StringBuilder out = new StringBuilder();
-    Token t = in.next();
-    if (null != t)
-      out.append(new String(t.termBuffer(), 0, t.termLength()));
-
-    for (t = in.next(); null != t; t = in.next()) {
-      out.append(" ").append(new String(t.termBuffer(), 0, t.termLength()));
-    }
-    in.close();
-    return out.toString();
-  }
-
-  public List<String> tok2str(Iterable<Token> tokLst) {
-    ArrayList<String> lst = new ArrayList<String>();
-    for ( Token t : tokLst ) {
-      lst.add( new String(t.termBuffer(), 0, t.termLength()));
-    }
-    return lst;
-  }
-
-  public void assertTokEqual(List<Token> a, List<Token> b) {
-    assertTokEq(a,b,false);
-    assertTokEq(b,a,false);
-  }
-
-  public void assertTokEqualOff(List<Token> a, List<Token> b) {
-    assertTokEq(a,b,true);
-    assertTokEq(b,a,true);
-  }
-
-  private void assertTokEq(List<Token> a, List<Token> b, boolean checkOff) {
-    int pos=0;
-    for (Iterator iter = a.iterator(); iter.hasNext();) {
-      Token tok = (Token)iter.next();
-      pos += tok.getPositionIncrement();
-      if (!tokAt(b, new String(tok.termBuffer(), 0, tok.termLength()), pos
-              , checkOff ? tok.startOffset() : -1
-              , checkOff ? tok.endOffset() : -1
-              ))
-      {
-        fail(a + "!=" + b);
-      }
-    }
-  }
-
-  public boolean tokAt(List<Token> lst, String val, int tokPos, int startOff, int endOff) {
-    int pos=0;
-    for (Iterator iter = lst.iterator(); iter.hasNext();) {
-      Token tok = (Token)iter.next();
-      pos += tok.getPositionIncrement();
-      if (pos==tokPos && new String(tok.termBuffer(), 0, tok.termLength()).equals(val)
-          && (startOff==-1 || tok.startOffset()==startOff)
-          && (endOff ==-1 || tok.endOffset() ==endOff )
-           )
-      {
-        return true;
-      }
-    }
-    return false;
-  }
-
-  /***
-   * Return a list of tokens according to a test string format:
-   * a b c  =>  returns List<Token> [a,b,c]
-   * a/b    => tokens a and b share the same spot (b.positionIncrement=0)
-   * a,3/b/c => a,b,c all share same position (a.positionIncrement=3, b.positionIncrement=0, c.positionIncrement=0)
-   * a,1,10,11  => "a" with positionIncrement=1, startOffset=10, endOffset=11
-   */
-  public List<Token> tokens(String str) {
-    String[] arr = str.split(" ");
-    List<Token> result = new ArrayList<Token>();
-    for (int i=0; i<arr.length; i++) {
-      String[] toks = arr[i].split("/");
-      String[] params = toks[0].split(",");
-
-      int posInc;
-      int start;
-      int end;
-
-      if (params.length > 1) {
-        posInc = Integer.parseInt(params[1]);
-      } else {
-        posInc = 1;
-      }
-
-      if (params.length > 2) {
-        start = Integer.parseInt(params[2]);
-      } else {
-        start = 0;
-      }
-
-      if (params.length > 3) {
-        end = Integer.parseInt(params[3]);
-      } else {
-        end = start + params[0].length();
-      }
-
-      Token t = new Token(params[0],start,end,"TEST");
-      t.setPositionIncrement(posInc);
-
-      result.add(t);
-      for (int j=1; j<toks.length; j++) {
-        t = new Token(toks[j],0,0,"TEST");
-        t.setPositionIncrement(0);
-        result.add(t);
-      }
-    }
-    return result;
-  }
-
-  //------------------------------------------------------------------------
-  // These may be useful beyond test cases...
-  //------------------------------------------------------------------------
-
-  static List<Token> getTokens(TokenStream tstream) throws IOException {
-    List<Token> tokens = new ArrayList<Token>();
-    while (true) {
-      Token t = tstream.next();
-      if (t==null) break;
-      tokens.add(t);
-    }
-    return tokens;
-  }
-
-  public static class IterTokenStream extends TokenStream {
-    Iterator<Token> toks;
-    public IterTokenStream(Token... toks) {
-      this.toks = Arrays.asList(toks).iterator();
-    }
-    public IterTokenStream(Iterable<Token> toks) {
-      this.toks = toks.iterator();
-    }
-    public IterTokenStream(Iterator<Token> toks) {
-      this.toks = toks;
-    }
-    public IterTokenStream(String ... text) {
-      int off = 0;
-      ArrayList<Token> t = new ArrayList<Token>( text.length );
-      for( String txt : text ) {
-        t.add( new Token( txt, off, off+txt.length() ) );
-        off += txt.length() + 2;
-      }
-      this.toks = t.iterator();
-    }
-    @Override
-    public Token next() {
-      if (toks.hasNext()) {
-        return toks.next();
-      }
-      return null;
-    }
-  }
+  // some helpers to test Analyzers and TokenStreams:
+  // these are taken from Lucene's BaseTokenStreamTestCase
+
+  public static void assertTokenStreamContents(TokenStream ts, String[] output,
+      int startOffsets[], int endOffsets[], String types[], int posIncrements[])
+      throws IOException {
+    assertNotNull(output);
+    assertTrue("has TermAttribute", ts.hasAttribute(TermAttribute.class));
+    TermAttribute termAtt = (TermAttribute) ts
+        .getAttribute(TermAttribute.class);
+
+    OffsetAttribute offsetAtt = null;
+    if (startOffsets != null || endOffsets != null) {
+      assertTrue("has OffsetAttribute", ts.hasAttribute(OffsetAttribute.class));
+      offsetAtt = (OffsetAttribute) ts.getAttribute(OffsetAttribute.class);
+    }
+
+    TypeAttribute typeAtt = null;
+    if (types != null) {
+      assertTrue("has TypeAttribute", ts.hasAttribute(TypeAttribute.class));
+      typeAtt = (TypeAttribute) ts.getAttribute(TypeAttribute.class);
+    }
+
+    PositionIncrementAttribute posIncrAtt = null;
+    if (posIncrements != null) {
+      assertTrue("has PositionIncrementAttribute", ts
+          .hasAttribute(PositionIncrementAttribute.class));
+      posIncrAtt = (PositionIncrementAttribute) ts
+          .getAttribute(PositionIncrementAttribute.class);
+    }
+
+    ts.reset();
+    for (int i = 0; i < output.length; i++) {
+      // extra safety to enforce, that the state is not preserved and also
+      // assign bogus values
+      ts.clearAttributes();
+      termAtt.setTermBuffer("bogusTerm");
+      if (offsetAtt != null) offsetAtt.setOffset(14584724, 24683243);
+      if (typeAtt != null) typeAtt.setType("bogusType");
+      if (posIncrAtt != null) posIncrAtt.setPositionIncrement(45987657);
+
+      assertTrue("token " + i + " exists", ts.incrementToken());
+      assertEquals("term " + i, output[i], termAtt.term());
+      if (startOffsets != null) assertEquals("startOffset " + i,
+          startOffsets[i], offsetAtt.startOffset());
+      if (endOffsets != null) assertEquals("endOffset " + i, endOffsets[i],
+          offsetAtt.endOffset());
+      if (types != null) assertEquals("type " + i, types[i], typeAtt.type());
+      if (posIncrements != null) assertEquals("posIncrement " + i,
+          posIncrements[i], posIncrAtt.getPositionIncrement());
+    }
+    assertFalse("end of stream", ts.incrementToken());
+    ts.end();
+    ts.close();
+  }
+
+  public static void assertTokenStreamContents(TokenStream ts, String[] output)
+      throws IOException {
+    assertTokenStreamContents(ts, output, null, null, null, null);
+  }
+
+  public static void assertTokenStreamContents(TokenStream ts, String[] output,
+      String[] types) throws IOException {
+    assertTokenStreamContents(ts, output, null, null, types, null);
+  }
+
+  public static void assertTokenStreamContents(TokenStream ts, String[] output,
+      int[] posIncrements) throws IOException {
+    assertTokenStreamContents(ts, output, null, null, null, posIncrements);
+  }
+
+  public static void assertTokenStreamContents(TokenStream ts, String[] output,
+      int startOffsets[], int endOffsets[]) throws IOException {
+    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null, null);
+  }
+
+  public static void assertTokenStreamContents(TokenStream ts, String[] output,
+      int startOffsets[], int endOffsets[], int[] posIncrements)
+      throws IOException {
+    assertTokenStreamContents(ts, output, startOffsets, endOffsets, null,
+        posIncrements);
+  }
+
+  public static void assertAnalyzesTo(Analyzer a, String input,
+      String[] output, int startOffsets[], int endOffsets[], String types[],
+      int posIncrements[]) throws IOException {
+    assertTokenStreamContents(a.tokenStream("dummy", new StringReader(input)),
+        output, startOffsets, endOffsets, types, posIncrements);
+  }
+
+  public static void assertAnalyzesTo(Analyzer a, String input, String[] output)
+      throws IOException {
+    assertAnalyzesTo(a, input, output, null, null, null, null);
+  }
+
+  public static void assertAnalyzesTo(Analyzer a, String input,
+      String[] output, String[] types) throws IOException {
+    assertAnalyzesTo(a, input, output, null, null, types, null);
+  }
+
+  public static void assertAnalyzesTo(Analyzer a, String input,
+      String[] output, int[] posIncrements) throws IOException {
+    assertAnalyzesTo(a, input, output, null, null, null, posIncrements);
+  }
+
+  public static void assertAnalyzesTo(Analyzer a, String input,
+      String[] output, int startOffsets[], int endOffsets[]) throws IOException {
+    assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null, null);
+  }
+
+  public static void assertAnalyzesTo(Analyzer a, String input,
+      String[] output, int startOffsets[], int endOffsets[], int[] posIncrements)
+      throws IOException {
+    assertAnalyzesTo(a, input, output, startOffsets, endOffsets, null,
+        posIncrements);
+  }
 }
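For orientation, a hypothetical test written against the helpers above might look like this; the class is not part of the commit and only illustrates how assertTokenStreamContents is meant to be called from a factory test:

package org.apache.solr.analysis;

import java.io.StringReader;
import java.util.HashMap;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;

// Hypothetical example (not part of this commit): a factory test built on the
// helpers above, checking terms and position increments in a single call.
public class ExampleLowerCaseFilterTest extends BaseTokenTestCase {
  public void testLowerCasing() throws Exception {
    LowerCaseFilterFactory factory = new LowerCaseFilterFactory();
    factory.init(new HashMap<String, String>());
    TokenStream stream = factory.create(
        new WhitespaceTokenizer(new StringReader("Quick Brown Fox")));
    assertTokenStreamContents(stream,
        new String[] { "quick", "brown", "fox" }, // expected terms, in order
        new int[] { 1, 1, 1 });                   // expected position increments
  }
}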

CommonGramsFilterFactoryTest.java

@@ -17,9 +17,13 @@ package org.apache.solr.analysis;
  * limitations under the License.
  */
 
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
 import org.apache.solr.util.AbstractSolrTestCase;
 import org.apache.solr.common.ResourceLoader;
+
+import java.io.StringReader;
 import java.util.Set;
 import java.util.Map;
 import java.util.HashMap;
@@ -29,7 +33,7 @@ import java.util.HashMap;
  * used by the StopFilterFactoryTest TODO: consider creating separate test files
  * so this won't break if stop filter test files change
  **/
-public class CommonGramsFilterFactoryTest extends AbstractSolrTestCase {
+public class CommonGramsFilterFactoryTest extends BaseTokenTestCase {
   public String getSchemaFile() {
     return "schema-stop-keep.xml";
   }
@@ -66,4 +70,23 @@ public class CommonGramsFilterFactoryTest extends AbstractSolrTestCase {
         .isIgnoreCase() == true);
   }
+
+  /**
+   * If no words are provided, then a set of english default stopwords is used.
+   */
+  public void testDefaults() throws Exception {
+    ResourceLoader loader = solrConfig.getResourceLoader();
+    assertTrue("loader is null and it shouldn't be", loader != null);
+    CommonGramsFilterFactory factory = new CommonGramsFilterFactory();
+    Map<String, String> args = new HashMap<String, String>();
+    factory.init(args);
+    factory.inform(loader);
+    Set words = factory.getCommonWords();
+    assertTrue("words is null and it shouldn't be", words != null);
+    assertTrue(words.contains("the"));
+    Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader("testing the factory"));
+    TokenStream stream = factory.create(tokenizer);
+    assertTokenStreamContents(stream,
+        new String[] { "testing", "testing_the", "the", "the_factory", "factory" });
+  }
 }

CommonGramsFilterTest.java

@@ -16,29 +16,20 @@
  */
 package org.apache.solr.analysis;
 
-import java.io.IOException;
+import java.io.Reader;
 import java.io.StringReader;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.LinkedHashMap;
-import java.util.Map;
 import java.util.Set;
-import java.util.StringTokenizer;
-import java.util.Map.Entry;
 
-import junit.framework.TestCase;
-
-import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.solr.analysis.TestBufferedTokenStream.AB_AAB_Stream;
 
 /**
  * Tests CommonGramsQueryFilter
  */
-public class CommonGramsFilterTest extends TestCase {
+public class CommonGramsFilterTest extends BaseTokenTestCase {
   private static final String[] commonWords = { "s", "a", "b", "c", "d", "the",
       "of" };
@@ -63,18 +54,6 @@ public class CommonGramsFilterTest extends TestCase {
     assertEquals("How", term.term());
   }
 
-  public void testCommonGramsQueryFilter() throws Exception {
-    Set<Map.Entry<String, String>> input2expectedSet = initQueryMap().entrySet();
-    for (Iterator<Entry<String, String>> i = input2expectedSet.iterator(); i
-        .hasNext();) {
-      Map.Entry<String, String> me = i.next();
-      String input = me.getKey();
-      String expected = me.getValue();
-      String message = "message: input value is: " + input;
-      assertEquals(message, expected, testFilter(input, "query"));
-    }
-  }
-
   public void testQueryReset() throws Exception {
     final String input = "How the s a brown s cow d like A B thing?";
     WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
@@ -93,18 +72,6 @@ public class CommonGramsFilterTest extends TestCase {
     assertEquals("How_the", term.term());
   }
 
-  public void testCommonGramsFilter() throws Exception {
-    Set<Map.Entry<String, String>> input2expectedSet = initMap().entrySet();
-    for (Iterator<Entry<String, String>> i = input2expectedSet.iterator(); i
-        .hasNext();) {
-      Map.Entry<String, String> me = i.next();
-      String input = me.getKey();
-      String expected = me.getValue();
-      String message = "message: input value is: " + input;
-      assertEquals(message, expected, testFilter(input, "common"));
-    }
-  }
-
   /**
    * This is for testing CommonGramsQueryFilter which outputs a set of tokens
    * optimized for querying with only one token at each position, either a
@@ -116,150 +83,226 @@
    *
    * @return Map<String,String>
    */
-  private static Map<String, String> initQueryMap() {
-    Map<String, String> input2expected = new LinkedHashMap<String, String>();
-
-    // Stop words used below are "of" "the" and "s"
-    // two word queries
-    input2expected.put("brown fox", "/brown/fox");
-    input2expected.put("the fox", "/the_fox");
-    input2expected.put("fox of", "/fox_of");
-    input2expected.put("of the", "/of_the");
-
-    // one word queries
-    input2expected.put("the", "/the");
-    input2expected.put("foo", "/foo");
-
-    // 3 word combinations s=stopword/common word n=not a stop word
-    input2expected.put("n n n", "/n/n/n");
-    input2expected.put("quick brown fox", "/quick/brown/fox");
-
-    input2expected.put("n n s", "/n/n_s");
-    input2expected.put("quick brown the", "/quick/brown_the");
-
-    input2expected.put("n s n", "/n_s/s_n");
-    input2expected.put("quick the brown", "/quick_the/the_brown");
-
-    input2expected.put("n s s", "/n_s/s_s");
-    input2expected.put("fox of the", "/fox_of/of_the");
-
-    input2expected.put("s n n", "/s_n/n/n");
-    input2expected.put("the quick brown", "/the_quick/quick/brown");
-
-    input2expected.put("s n s", "/s_n/n_s");
-    input2expected.put("the fox of", "/the_fox/fox_of");
-
-    input2expected.put("s s n", "/s_s/s_n");
-    input2expected.put("of the fox", "/of_the/the_fox");
-
-    input2expected.put("s s s", "/s_s/s_s");
-    input2expected.put("of the of", "/of_the/the_of");
-
-    return input2expected;
-  }
-
-  private static Map<String, String> initMap() {
-    Map<String, String> input2expected = new HashMap<String, String>();
-
-    // Stop words used below are "of" "the" and "s"
-    // one word queries
-    input2expected.put("the", "/the");
-    input2expected.put("foo", "/foo");
-
-    // two word queries
-    input2expected.put("brown fox", "/brown/fox");
-    input2expected.put("the fox", "/the,the_fox/fox");
-    input2expected.put("fox of", "/fox,fox_of/of");
-    input2expected.put("of the", "/of,of_the/the");
-
-    // 3 word combinations s=stopword/common word n=not a stop word
-    input2expected.put("n n n", "/n/n/n");
-    input2expected.put("quick brown fox", "/quick/brown/fox");
-
-    input2expected.put("n n s", "/n/n,n_s/s");
-    input2expected.put("quick brown the", "/quick/brown,brown_the/the");
-
-    input2expected.put("n s n", "/n,n_s/s,s_n/n");
-    input2expected.put("quick the fox", "/quick,quick_the/the,the_fox/fox");
-
-    input2expected.put("n s s", "/n,n_s/s,s_s/s");
-    input2expected.put("fox of the", "/fox,fox_of/of,of_the/the");
-
-    input2expected.put("s n n", "/s,s_n/n/n");
-    input2expected.put("the quick brown", "/the,the_quick/quick/brown");
-
-    input2expected.put("s n s", "/s,s_n/n,n_s/s");
-    input2expected.put("the fox of", "/the,the_fox/fox,fox_of/of");
-
-    input2expected.put("s s n", "/s,s_s/s,s_n/n");
-    input2expected.put("of the fox", "/of,of_the/the,the_fox/fox");
-
-    input2expected.put("s s s", "/s,s_s/s,s_s/s");
-    input2expected.put("of the of", "/of,of_the/the,the_of/of");
-
-    return input2expected;
-  }
-
-  /*
-   * Helper methodsCopied and from CDL XTF BigramsStopFilter.java and slightly
-   * modified to use with CommonGrams http://xtf.wiki.sourceforge.net/
-   */
-  /**
-   * Very simple tokenizer that breaks up a string into a series of Lucene
-   * {@link Token Token}s.
-   */
-  static class StringTokenStream extends TokenStream {
-    private String str;
-
-    private int prevEnd = 0;
-
-    private StringTokenizer tok;
-
-    private int count = 0;
-
-    public StringTokenStream(String str, String delim) {
-      this.str = str;
-      tok = new StringTokenizer(str, delim);
-    }
-
-    public Token next() {
-      if (!tok.hasMoreTokens())
-        return null;
-      count++;
-      String term = tok.nextToken();
-      Token t = new Token(term, str.indexOf(term, prevEnd), str.indexOf(term,
-          prevEnd)
-          + term.length(), "word");
-      prevEnd = t.endOffset();
-      return t;
-    }
-  }
-
-  public static String testFilter(String in, String type) throws IOException {
-    TokenStream nsf;
-    StringTokenStream ts = new StringTokenStream(in, " .");
-    if (type.equals("query")) {
-      CommonGramsFilter cgf = new CommonGramsFilter(ts, commonWords);
-      nsf = new CommonGramsQueryFilter(cgf);
-    } else {
-      nsf = new CommonGramsFilter(ts, commonWords);
-    }
-
-    StringBuffer outBuf = new StringBuffer();
-    while (true) {
-      Token t = nsf.next();
-      if (t == null)
-        break;
-      for (int i = 0; i < t.getPositionIncrement(); i++)
-        outBuf.append('/');
-      if (t.getPositionIncrement() == 0)
-        outBuf.append(',');
-      outBuf.append(t.term());
-    }
-
-    String out = outBuf.toString();
-    out = out.replaceAll(" ", "");
-    return out;
-  }
+  public void testCommonGramsQueryFilter() throws Exception {
+    Analyzer a = new Analyzer() {
+      @Override
+      public TokenStream tokenStream(String field, Reader in) {
+        return new CommonGramsQueryFilter(new CommonGramsFilter(
+            new WhitespaceTokenizer(in), commonWords));
+      }
+    };
+
+    // Stop words used below are "of" "the" and "s"
+
+    // two word queries
+    assertAnalyzesTo(a, "brown fox", new String[] { "brown", "fox" });
+    assertAnalyzesTo(a, "the fox", new String[] { "the_fox" });
+    assertAnalyzesTo(a, "fox of", new String[] { "fox_of" });
+    assertAnalyzesTo(a, "of the", new String[] { "of_the" });
+
+    // one word queries
+    assertAnalyzesTo(a, "the", new String[] { "the" });
+    assertAnalyzesTo(a, "foo", new String[] { "foo" });
+
+    // 3 word combinations s=stopword/common word n=not a stop word
+    assertAnalyzesTo(a, "n n n", new String[] { "n", "n", "n" });
+    assertAnalyzesTo(a, "quick brown fox", new String[] { "quick", "brown", "fox" });
+
+    assertAnalyzesTo(a, "n n s", new String[] { "n", "n_s" });
+    assertAnalyzesTo(a, "quick brown the", new String[] { "quick", "brown_the" });
+
+    assertAnalyzesTo(a, "n s n", new String[] { "n_s", "s_n" });
+    assertAnalyzesTo(a, "quick the brown", new String[] { "quick_the", "the_brown" });
+
+    assertAnalyzesTo(a, "n s s", new String[] { "n_s", "s_s" });
+    assertAnalyzesTo(a, "fox of the", new String[] { "fox_of", "of_the" });
+
+    assertAnalyzesTo(a, "s n n", new String[] { "s_n", "n", "n" });
+    assertAnalyzesTo(a, "the quick brown", new String[] { "the_quick", "quick", "brown" });
+
+    assertAnalyzesTo(a, "s n s", new String[] { "s_n", "n_s" });
+    assertAnalyzesTo(a, "the fox of", new String[] { "the_fox", "fox_of" });
+
+    assertAnalyzesTo(a, "s s n", new String[] { "s_s", "s_n" });
+    assertAnalyzesTo(a, "of the fox", new String[] { "of_the", "the_fox" });
+
+    assertAnalyzesTo(a, "s s s", new String[] { "s_s", "s_s" });
+    assertAnalyzesTo(a, "of the of", new String[] { "of_the", "the_of" });
+  }
+
+  public void testCommonGramsFilter() throws Exception {
+    Analyzer a = new Analyzer() {
+      @Override
+      public TokenStream tokenStream(String field, Reader in) {
+        return new CommonGramsFilter(
+            new WhitespaceTokenizer(in), commonWords);
+      }
+    };
+
+    // Stop words used below are "of" "the" and "s"
+    // one word queries
+    assertAnalyzesTo(a, "the", new String[] { "the" });
+    assertAnalyzesTo(a, "foo", new String[] { "foo" });
+
+    // two word queries
+    assertAnalyzesTo(a, "brown fox",
+        new String[] { "brown", "fox" }, new int[] { 1, 1 });
+    assertAnalyzesTo(a, "the fox",
+        new String[] { "the", "the_fox", "fox" }, new int[] { 1, 0, 1 });
+    assertAnalyzesTo(a, "fox of",
+        new String[] { "fox", "fox_of", "of" }, new int[] { 1, 0, 1 });
+    assertAnalyzesTo(a, "of the",
+        new String[] { "of", "of_the", "the" }, new int[] { 1, 0, 1 });
+
+    // 3 word combinations s=stopword/common word n=not a stop word
+    assertAnalyzesTo(a, "n n n",
+        new String[] { "n", "n", "n" }, new int[] { 1, 1, 1 });
+    assertAnalyzesTo(a, "quick brown fox",
+        new String[] { "quick", "brown", "fox" }, new int[] { 1, 1, 1 });
+
+    assertAnalyzesTo(a, "n n s",
+        new String[] { "n", "n", "n_s", "s" }, new int[] { 1, 1, 0, 1 });
+    assertAnalyzesTo(a, "quick brown the",
+        new String[] { "quick", "brown", "brown_the", "the" }, new int[] { 1, 1, 0, 1 });
+
+    assertAnalyzesTo(a, "n s n",
+        new String[] { "n", "n_s", "s", "s_n", "n" }, new int[] { 1, 0, 1, 0, 1 });
+    assertAnalyzesTo(a, "quick the fox",
+        new String[] { "quick", "quick_the", "the", "the_fox", "fox" }, new int[] { 1, 0, 1, 0, 1 });
+
+    assertAnalyzesTo(a, "n s s",
+        new String[] { "n", "n_s", "s", "s_s", "s" }, new int[] { 1, 0, 1, 0, 1 });
+    assertAnalyzesTo(a, "fox of the",
+        new String[] { "fox", "fox_of", "of", "of_the", "the" }, new int[] { 1, 0, 1, 0, 1 });
+
+    assertAnalyzesTo(a, "s n n",
+        new String[] { "s", "s_n", "n", "n" }, new int[] { 1, 0, 1, 1 });
+    assertAnalyzesTo(a, "the quick brown",
+        new String[] { "the", "the_quick", "quick", "brown" }, new int[] { 1, 0, 1, 1 });
+
+    assertAnalyzesTo(a, "s n s",
+        new String[] { "s", "s_n", "n", "n_s", "s" }, new int[] { 1, 0, 1, 0, 1 });
+    assertAnalyzesTo(a, "the fox of",
+        new String[] { "the", "the_fox", "fox", "fox_of", "of" }, new int[] { 1, 0, 1, 0, 1 });
+
+    assertAnalyzesTo(a, "s s n",
+        new String[] { "s", "s_s", "s", "s_n", "n" }, new int[] { 1, 0, 1, 0, 1 });
+    assertAnalyzesTo(a, "of the fox",
+        new String[] { "of", "of_the", "the", "the_fox", "fox" }, new int[] { 1, 0, 1, 0, 1 });
+
+    assertAnalyzesTo(a, "s s s",
+        new String[] { "s", "s_s", "s", "s_s", "s" }, new int[] { 1, 0, 1, 0, 1 });
+    assertAnalyzesTo(a, "of the of",
+        new String[] { "of", "of_the", "the", "the_of", "of" }, new int[] { 1, 0, 1, 0, 1 });
+  }
+
+  /**
+   * Test that CommonGramsFilter works correctly in case-insensitive mode
+   */
+  public void testCaseSensitive() throws Exception {
+    final String input = "How The s a brown s cow d like A B thing?";
+    WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
+    Set common = CommonGramsFilter.makeCommonSet(commonWords);
+    TokenFilter cgf = new CommonGramsFilter(wt, common, false);
+    assertTokenStreamContents(cgf, new String[] {"How", "The", "The_s", "s",
+        "s_a", "a", "a_brown", "brown", "brown_s", "s", "s_cow", "cow",
+        "cow_d", "d", "d_like", "like", "A", "B", "thing?"});
+  }
+
+  /**
+   * Test CommonGramsQueryFilter in the case that the last word is a stopword
+   */
+  public void testLastWordisStopWord() throws Exception {
+    final String input = "dog the";
+    WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
+    CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
+    TokenFilter nsf = new CommonGramsQueryFilter(cgf);
+    assertTokenStreamContents(nsf, new String[] { "dog_the" });
+  }
+
+  /**
+   * Test CommonGramsQueryFilter in the case that the first word is a stopword
+   */
+  public void testFirstWordisStopWord() throws Exception {
+    final String input = "the dog";
+    WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
+    CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
+    TokenFilter nsf = new CommonGramsQueryFilter(cgf);
+    assertTokenStreamContents(nsf, new String[] { "the_dog" });
+  }
+
+  /**
+   * Test CommonGramsQueryFilter in the case of a single (stop)word query
+   */
+  public void testOneWordQueryStopWord() throws Exception {
+    final String input = "the";
+    WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
+    CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
+    TokenFilter nsf = new CommonGramsQueryFilter(cgf);
+    assertTokenStreamContents(nsf, new String[] { "the" });
+  }
+
+  /**
+   * Test CommonGramsQueryFilter in the case of a single word query
+   */
+  public void testOneWordQuery() throws Exception {
+    final String input = "monster";
+    WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
+    CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
+    TokenFilter nsf = new CommonGramsQueryFilter(cgf);
+    assertTokenStreamContents(nsf, new String[] { "monster" });
+  }
+
+  /**
+   * Test CommonGramsQueryFilter when first and last words are stopwords.
+   */
+  public void TestFirstAndLastStopWord() throws Exception {
+    final String input = "the of";
+    WhitespaceTokenizer wt = new WhitespaceTokenizer(new StringReader(input));
+    CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
+    TokenFilter nsf = new CommonGramsQueryFilter(cgf);
+    assertTokenStreamContents(nsf, new String[] { "the_of" });
+  }
 }

CommonGramsQueryFilterFactoryTest.java

@@ -16,9 +16,12 @@
  */
 package org.apache.solr.analysis;
 
-import org.apache.solr.util.AbstractSolrTestCase;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
 import org.apache.solr.common.ResourceLoader;
+
+import java.io.StringReader;
 import java.util.Set;
 import java.util.Map;
 import java.util.HashMap;
@@ -28,7 +31,7 @@ import java.util.HashMap;
  * used by the StopFilterFactoryTest TODO: consider creating separate test files
  * so this won't break if stop filter test files change
  **/
-public class CommonGramsQueryFilterFactoryTest extends AbstractSolrTestCase {
+public class CommonGramsQueryFilterFactoryTest extends BaseTokenTestCase {
   public String getSchemaFile() {
     return "schema-stop-keep.xml";
   }
@@ -65,4 +68,23 @@ public class CommonGramsQueryFilterFactoryTest extends AbstractSolrTestCase {
         .isIgnoreCase() == true);
   }
+
+  /**
+   * If no words are provided, then a set of english default stopwords is used.
+   */
+  public void testDefaults() throws Exception {
+    ResourceLoader loader = solrConfig.getResourceLoader();
+    assertTrue("loader is null and it shouldn't be", loader != null);
+    CommonGramsQueryFilterFactory factory = new CommonGramsQueryFilterFactory();
+    Map<String, String> args = new HashMap<String, String>();
+    factory.init(args);
+    factory.inform(loader);
+    Set words = factory.getCommonWords();
+    assertTrue("words is null and it shouldn't be", words != null);
+    assertTrue(words.contains("the"));
+    Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader("testing the factory"));
+    TokenStream stream = factory.create(tokenizer);
+    assertTokenStreamContents(stream,
+        new String[] { "testing_the", "the_factory" });
+  }
 }

DoubleMetaphoneFilterFactoryTest.java

@@ -16,36 +16,24 @@
  */
 package org.apache.solr.analysis;
 
+import java.io.StringReader;
 import java.util.HashMap;
 import java.util.Map;
 
-import junit.framework.TestCase;
-
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.solr.analysis.BaseTokenTestCase.IterTokenStream;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
-public class DoubleMetaphoneFilterFactoryTest extends TestCase {
+public class DoubleMetaphoneFilterFactoryTest extends BaseTokenTestCase {
 
   public void testDefaults() throws Exception {
     DoubleMetaphoneFilterFactory factory = new DoubleMetaphoneFilterFactory();
     factory.init(new HashMap<String, String>());
 
-    TokenStream inputStream = new IterTokenStream("international");
+    TokenStream inputStream = new WhitespaceTokenizer(new StringReader("international"));
 
     TokenStream filteredStream = factory.create(inputStream);
     assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass());
-
-    Token token = filteredStream.next(new Token());
-    assertEquals(13, token.termLength());
-    assertEquals("international", new String(token.termBuffer(), 0, token
-        .termLength()));
-
-    token = filteredStream.next(new Token());
-    assertEquals(4, token.termLength());
-    assertEquals("ANTR", new String(token.termBuffer(), 0, token.termLength()));
-
-    assertNull(filteredStream.next(new Token()));
+    assertTokenStreamContents(filteredStream, new String[] { "international", "ANTR" });
   }
 
   public void testSettingSizeAndInject() throws Exception {
@@ -55,17 +43,31 @@ public class DoubleMetaphoneFilterFactoryTest extends TestCase {
     parameters.put("maxCodeLength", "8");
     factory.init(parameters);
 
-    TokenStream inputStream = new IterTokenStream("international");
+    TokenStream inputStream = new WhitespaceTokenizer(new StringReader("international"));
 
     TokenStream filteredStream = factory.create(inputStream);
     assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass());
+    assertTokenStreamContents(filteredStream, new String[] { "ANTRNXNL" });
+  }
+
+  /**
+   * Ensure that reset() removes any state (buffered tokens)
+   */
+  public void testReset() throws Exception {
+    DoubleMetaphoneFilterFactory factory = new DoubleMetaphoneFilterFactory();
+    factory.init(new HashMap<String, String>());
+    TokenStream inputStream = new WhitespaceTokenizer(new StringReader("international"));
 
-    Token token = filteredStream.next(new Token());
-    assertEquals(8, token.termLength());
-    assertEquals("ANTRNXNL", new String(token.termBuffer(), 0, token
-        .termLength()));
-
-    assertNull(filteredStream.next(new Token()));
+    TokenStream filteredStream = factory.create(inputStream);
+    TermAttribute termAtt = (TermAttribute) filteredStream.addAttribute(TermAttribute.class);
+    assertEquals(DoubleMetaphoneFilter.class, filteredStream.getClass());
+
+    assertTrue(filteredStream.incrementToken());
+    assertEquals(13, termAtt.termLength());
+    assertEquals("international", termAtt.term());
+    filteredStream.reset();
+    // ensure there are no more tokens, such as ANTRNXNL
+    assertFalse(filteredStream.incrementToken());
   }
 }

DoubleMetaphoneFilterTest.java

@@ -16,94 +16,52 @@
  */
 package org.apache.solr.analysis;
 
-import junit.framework.TestCase;
-
-import org.apache.lucene.analysis.Token;
+import java.io.StringReader;
+
 import org.apache.lucene.analysis.TokenStream;
-import org.apache.solr.analysis.BaseTokenTestCase.IterTokenStream;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
 
-public class DoubleMetaphoneFilterTest extends TestCase {
+public class DoubleMetaphoneFilterTest extends BaseTokenTestCase {
 
   public void testSize4FalseInject() throws Exception {
-    TokenStream stream = new IterTokenStream("international");
+    TokenStream stream = new WhitespaceTokenizer(new StringReader("international"));
     TokenStream filter = new DoubleMetaphoneFilter(stream, 4, false);
-
-    Token token = filter.next(new Token());
-    assertEquals(4, token.termLength());
-    assertEquals("ANTR", new String(token.termBuffer(), 0, token.termLength()));
-
-    assertNull(filter.next(new Token()));
+    assertTokenStreamContents(filter, new String[] { "ANTR" });
  }
 
   public void testSize4TrueInject() throws Exception {
-    TokenStream stream = new IterTokenStream("international");
+    TokenStream stream = new WhitespaceTokenizer(new StringReader("international"));
     TokenStream filter = new DoubleMetaphoneFilter(stream, 4, true);
-
-    Token token = filter.next(new Token());
-    assertEquals(13, token.termLength());
-    assertEquals("international", new String(token.termBuffer(), 0, token
-        .termLength()));
-
-    token = filter.next(new Token());
-    assertEquals(4, token.termLength());
-    assertEquals("ANTR", new String(token.termBuffer(), 0, token.termLength()));
-
-    assertNull(filter.next(new Token()));
+    assertTokenStreamContents(filter, new String[] { "international", "ANTR" });
   }
 
   public void testAlternateInjectFalse() throws Exception {
-    TokenStream stream = new IterTokenStream("Kuczewski");
+    TokenStream stream = new WhitespaceTokenizer(new StringReader("Kuczewski"));
     TokenStream filter = new DoubleMetaphoneFilter(stream, 4, false);
-
-    Token token = filter.next(new Token());
-    assertEquals(4, token.termLength());
-    assertEquals("KSSK", new String(token.termBuffer(), 0, token.termLength()));
-
-    token = filter.next(new Token());
-    assertEquals(4, token.termLength());
-    assertEquals("KXFS", new String(token.termBuffer(), 0, token.termLength()));
-
-    assertNull(filter.next(new Token()));
+    assertTokenStreamContents(filter, new String[] { "KSSK", "KXFS" });
   }
 
   public void testSize8FalseInject() throws Exception {
-    TokenStream stream = new IterTokenStream("international");
+    TokenStream stream = new WhitespaceTokenizer(new StringReader("international"));
     TokenStream filter = new DoubleMetaphoneFilter(stream, 8, false);
-
-    Token token = filter.next(new Token());
-    assertEquals(8, token.termLength());
-    assertEquals("ANTRNXNL", new String(token.termBuffer(), 0, token
-        .termLength()));
-
-    assertNull(filter.next(new Token()));
+    assertTokenStreamContents(filter, new String[] { "ANTRNXNL" });
   }
 
   public void testNonConvertableStringsWithInject() throws Exception {
-    TokenStream stream = new IterTokenStream(
-        new String[] { "12345", "#$%@#^%&" });
+    TokenStream stream = new WhitespaceTokenizer(new StringReader("12345 #$%@#^%&"));
     TokenStream filter = new DoubleMetaphoneFilter(stream, 8, true);
-
-    Token token = filter.next(new Token());
-    assertEquals(5, token.termLength());
-    assertEquals("12345", new String(token.termBuffer(), 0, token.termLength()));
-
-    token = filter.next(new Token());
-    assertEquals(8, token.termLength());
-    assertEquals("#$%@#^%&", new String(token.termBuffer(), 0, token
-        .termLength()));
+    assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&" });
   }
 
   public void testNonConvertableStringsWithoutInject() throws Exception {
-    TokenStream stream = new IterTokenStream(
-        new String[] { "12345", "#$%@#^%&" });
+    TokenStream stream = new WhitespaceTokenizer(new StringReader("12345 #$%@#^%&"));
     TokenStream filter = new DoubleMetaphoneFilter(stream, 8, false);
-
-    assertEquals("12345", filter.next(new Token()).term());
+    assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&" });
 
     // should have something after the stream
-    stream = new IterTokenStream(
-        new String[] { "12345", "#$%@#^%&", "hello" });
+    stream = new WhitespaceTokenizer(new StringReader("12345 #$%@#^%& hello"));
     filter = new DoubleMetaphoneFilter(stream, 8, false);
-    assertNotNull(filter.next(new Token()));
+    assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&", "HL" });
   }
 }

EnglishPorterFilterFactoryTest.java

@@ -16,11 +16,17 @@ package org.apache.solr.analysis;
  * limitations under the License.
  */
 
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
 import org.apache.solr.common.ResourceLoader;
+import org.apache.solr.common.util.StrUtils;
 import org.tartarus.snowball.ext.EnglishStemmer;
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.StringReader;
+import java.util.Arrays;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@@ -32,11 +38,11 @@ public class EnglishPorterFilterFactoryTest extends BaseTokenTestCase {
   public void test() throws IOException {
     EnglishStemmer stemmer = new EnglishStemmer();
     String[] test = {"The", "fledgling", "banks", "were", "counting", "on", "a", "big", "boom", "in", "banking"};
-    StringBuilder gold = new StringBuilder();
+    String[] gold = new String[test.length];
     for (int i = 0; i < test.length; i++) {
       stemmer.setCurrent(test[i]);
       stemmer.stem();
-      gold.append(stemmer.getCurrent()).append(' ');
+      gold[i] = stemmer.getCurrent();
     }
 
     EnglishPorterFilterFactory factory = new EnglishPorterFilterFactory();
@@ -44,21 +50,23 @@ public class EnglishPorterFilterFactoryTest extends BaseTokenTestCase {
     factory.init(args);
     factory.inform(new LinesMockSolrResourceLoader(new ArrayList<String>()));
-    String out = tsToString(factory.create(new IterTokenStream(test)));
-    assertEquals(gold.toString().trim(), out);
+    Tokenizer tokenizer = new WhitespaceTokenizer(
+        new StringReader(StrUtils.join(Arrays.asList(test), ' ')));
+    TokenStream stream = factory.create(tokenizer);
+    assertTokenStreamContents(stream, gold);
   }
 
   public void testProtected() throws Exception {
     EnglishStemmer stemmer = new EnglishStemmer();
     String[] test = {"The", "fledgling", "banks", "were", "counting", "on", "a", "big", "boom", "in", "banking"};
-    StringBuilder gold = new StringBuilder();
+    String[] gold = new String[test.length];
     for (int i = 0; i < test.length; i++) {
       if (test[i].equals("fledgling") == false && test[i].equals("banks") == false) {
         stemmer.setCurrent(test[i]);
         stemmer.stem();
-        gold.append(stemmer.getCurrent()).append(' ');
+        gold[i] = stemmer.getCurrent();
       } else {
-        gold.append(test[i]).append(' ');
+        gold[i] = test[i];
       }
     }
@@ -69,8 +77,10 @@ public class EnglishPorterFilterFactoryTest extends BaseTokenTestCase {
     List<String> lines = new ArrayList<String>();
     Collections.addAll(lines, "banks", "fledgling");
     factory.inform(new LinesMockSolrResourceLoader(lines));
-    String out = tsToString(factory.create(new IterTokenStream(test)));
-    assertEquals(gold.toString().trim(), out);
+    Tokenizer tokenizer = new WhitespaceTokenizer(
+        new StringReader(StrUtils.join(Arrays.asList(test), ' ')));
+    TokenStream stream = factory.create(tokenizer);
+    assertTokenStreamContents(stream, gold);
   }
 
   class LinesMockSolrResourceLoader implements ResourceLoader {

LengthFilterTest.java

@@ -17,9 +17,13 @@ package org.apache.solr.analysis;
  */
 
 import java.io.IOException;
+import java.io.StringReader;
 import java.util.HashMap;
 import java.util.Map;
 
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
+
 public class LengthFilterTest extends BaseTokenTestCase {
 
   public void test() throws IOException {
@@ -28,9 +32,8 @@ public class LengthFilterTest extends BaseTokenTestCase {
     args.put(LengthFilterFactory.MIN_KEY, String.valueOf(4));
     args.put(LengthFilterFactory.MAX_KEY, String.valueOf(10));
     factory.init(args);
-    String[] test = {"foo", "foobar", "super-duper-trooper"};
-    String gold = "foobar";
-    String out = tsToString(factory.create(new IterTokenStream(test)));
-    assertEquals(gold.toString(), out);
+    String test = "foo foobar super-duper-trooper";
+    TokenStream stream = factory.create(new WhitespaceTokenizer(new StringReader(test)));
+    assertTokenStreamContents(stream, new String[] { "foobar" });
   }
 }

SnowballPorterFilterFactoryTest.java

@@ -16,11 +16,18 @@ package org.apache.solr.analysis;
  * limitations under the License.
  */
 
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
 import org.apache.solr.common.ResourceLoader;
+import org.apache.solr.common.util.StrUtils;
 import org.tartarus.snowball.ext.EnglishStemmer;
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.Arrays;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@@ -32,11 +39,11 @@ public class SnowballPorterFilterFactoryTest extends BaseTokenTestCase {
   public void test() throws IOException {
     EnglishStemmer stemmer = new EnglishStemmer();
     String[] test = {"The", "fledgling", "banks", "were", "counting", "on", "a", "big", "boom", "in", "banking"};
-    StringBuilder gold = new StringBuilder();
-    for (String aTest : test) {
-      stemmer.setCurrent(aTest);
+    String[] gold = new String[test.length];
+    for (int i = 0; i < test.length; i++) {
+      stemmer.setCurrent(test[i]);
       stemmer.stem();
-      gold.append(stemmer.getCurrent()).append(' ');
+      gold[i] = stemmer.getCurrent();
     }
 
     SnowballPorterFilterFactory factory = new SnowballPorterFilterFactory();
@@ -45,21 +52,27 @@ public class SnowballPorterFilterFactoryTest extends BaseTokenTestCase {
     factory.init(args);
     factory.inform(new LinesMockSolrResourceLoader(new ArrayList<String>()));
-    String out = tsToString(factory.create(new IterTokenStream(test)));
-    assertEquals(gold.toString().trim(), out);
+    Tokenizer tokenizer = new WhitespaceTokenizer(
+        new StringReader(StrUtils.join(Arrays.asList(test), ' ')));
+    TokenStream stream = factory.create(tokenizer);
+    assertTokenStreamContents(stream, gold);
   }
 
-  public void testProtected() throws Exception {
+  /**
+   * Tests the protected words mechanism of EnglishPorterFilterFactory
+   */
+  @Deprecated
+  public void testProtectedOld() throws Exception {
     EnglishStemmer stemmer = new EnglishStemmer();
     String[] test = {"The", "fledgling", "banks", "were", "counting", "on", "a", "big", "boom", "in", "banking"};
-    StringBuilder gold = new StringBuilder();
+    String[] gold = new String[test.length];
     for (int i = 0; i < test.length; i++) {
       if (test[i].equals("fledgling") == false && test[i].equals("banks") == false) {
         stemmer.setCurrent(test[i]);
         stemmer.stem();
-        gold.append(stemmer.getCurrent()).append(' ');
+        gold[i] = stemmer.getCurrent();
       } else {
-        gold.append(test[i]).append(' ');
+        gold[i] = test[i];
      }
     }
@@ -70,8 +83,10 @@ public class SnowballPorterFilterFactoryTest extends BaseTokenTestCase {
     List<String> lines = new ArrayList<String>();
     Collections.addAll(lines, "banks", "fledgling");
     factory.inform(new LinesMockSolrResourceLoader(lines));
-    String out = tsToString(factory.create(new IterTokenStream(test)));
-    assertEquals(gold.toString().trim(), out);
+    Tokenizer tokenizer = new WhitespaceTokenizer(
+        new StringReader(StrUtils.join(Arrays.asList(test), ' ')));
+    TokenStream stream = factory.create(tokenizer);
+    assertTokenStreamContents(stream, gold);
   }
 
   class LinesMockSolrResourceLoader implements ResourceLoader {
@@ -93,5 +108,22 @@ public class SnowballPorterFilterFactoryTest extends BaseTokenTestCase {
       return null;
     }
   }
+
+  /**
+   * Test the protected words mechanism of SnowballPorterFilterFactory
+   */
+  public void testProtected() throws Exception {
+    SnowballPorterFilterFactory factory = new SnowballPorterFilterFactory();
+    ResourceLoader loader = solrConfig.getResourceLoader();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("protected", "protwords.txt");
+    args.put("language", "English");
+    factory.init(args);
+    factory.inform(loader);
+    Reader reader = new StringReader("ridding of some stemming");
+    Tokenizer tokenizer = new WhitespaceTokenizer(reader);
+    TokenStream stream = factory.create(tokenizer);
+    assertTokenStreamContents(stream, new String[] { "ridding", "of", "some", "stem" });
+  }
 }

TestArabicFilters.java

@@ -0,0 +1,65 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
/**
* Simple tests to ensure the Arabic filter Factories are working.
*/
public class TestArabicFilters extends BaseTokenTestCase {
/**
* Test ArabicLetterTokenizerFactory
*/
public void testTokenizer() throws Exception {
Reader reader = new StringReader("الذين مَلكت أيمانكم");
ArabicLetterTokenizerFactory factory = new ArabicLetterTokenizerFactory();
Tokenizer stream = factory.create(reader);
assertTokenStreamContents(stream, new String[] {"الذين", "مَلكت", "أيمانكم"});
}
/**
* Test ArabicNormalizationFilterFactory
*/
public void testNormalizer() throws Exception {
Reader reader = new StringReader("الذين مَلكت أيمانكم");
ArabicLetterTokenizerFactory factory = new ArabicLetterTokenizerFactory();
ArabicNormalizationFilterFactory filterFactory = new ArabicNormalizationFilterFactory();
Tokenizer tokenizer = factory.create(reader);
TokenStream stream = filterFactory.create(tokenizer);
assertTokenStreamContents(stream, new String[] {"الذين", "ملكت", "ايمانكم"});
}
/**
* Test ArabicStemFilterFactory
*/
public void testStemmer() throws Exception {
Reader reader = new StringReader("الذين مَلكت أيمانكم");
ArabicLetterTokenizerFactory factory = new ArabicLetterTokenizerFactory();
ArabicNormalizationFilterFactory normFactory = new ArabicNormalizationFilterFactory();
ArabicStemFilterFactory stemFactory = new ArabicStemFilterFactory();
Tokenizer tokenizer = factory.create(reader);
TokenStream stream = normFactory.create(tokenizer);
stream = stemFactory.create(stream);
assertTokenStreamContents(stream, new String[] {"ذين", "ملكت", "ايمانكم"});
}
}

TestBrazilianStemFilterFactory.java

@@ -0,0 +1,41 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
/**
* Simple tests to ensure the Brazilian stem filter factory is working.
*/
public class TestBrazilianStemFilterFactory extends BaseTokenTestCase {
/**
* Ensure the filter actually stems and normalizes text.
*/
public void testStemming() throws Exception {
Reader reader = new StringReader("Brasília");
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
BrazilianStemFilterFactory factory = new BrazilianStemFilterFactory();
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "brasil" });
}
}

TestBufferedTokenStream.java

@@ -60,9 +60,7 @@ public class TestBufferedTokenStream extends BaseTokenTestCase {
     final String expected = "How now Q B brown A cow B like Q B thing?";
     TokenStream ts = new AB_Q_Stream
       (new WhitespaceTokenizer(new StringReader(input)));
-    final String actual = tsToString(ts);
-    //System.out.println(actual);
-    assertEquals(expected, actual);
+    assertTokenStreamContents(ts, expected.split("\\s"));
   }
 
   public void testABAAB() throws Exception {
@@ -70,9 +68,7 @@ public class TestBufferedTokenStream extends BaseTokenTestCase {
     final String expected = "How now A A B brown A cow B like A A B thing?";
     TokenStream ts = new AB_AAB_Stream
       (new WhitespaceTokenizer(new StringReader(input)));
-    final String actual = tsToString(ts);
-    //System.out.println(actual);
-    assertEquals(expected, actual);
+    assertTokenStreamContents(ts, expected.split("\\s"));
   }
 
   public void testReset() throws Exception {

TestCJKTokenizerFactory.java

@@ -0,0 +1,38 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
/**
* Simple tests to ensure the CJK tokenizer factory is working.
*/
public class TestCJKTokenizerFactory extends BaseTokenTestCase {
/**
* Ensure the tokenizer actually tokenizes CJK text correctly
*/
public void testTokenizer() throws Exception {
Reader reader = new StringReader("我是中国人");
CJKTokenizerFactory factory = new CJKTokenizerFactory();
TokenStream stream = factory.create(reader);
assertTokenStreamContents(stream, new String[] {"我是", "是中", "中国", "国人"});
}
}

TestCapitalizationFilter.java

@ -17,14 +17,18 @@
 package org.apache.solr.analysis;
-import junit.framework.TestCase;
+import java.io.StringReader;
 import java.util.HashMap;
 import java.util.Map;
+import org.apache.lucene.analysis.KeywordTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
 /**
- * @version $Id$
+ *
  */
 public class TestCapitalizationFilter extends BaseTokenTestCase {
@ -64,39 +68,46 @@ public class TestCapitalizationFilter extends BaseTokenTestCase {
     factory.processWord(termBuffer, 0, termBuffer.length, 0 );
     assertEquals( "BIG", new String(termBuffer, 0, termBuffer.length));
-    String out = tsToString( factory.create( new IterTokenStream( "Hello thEre my Name is Ryan" ) ) );
-    assertEquals( "Hello there my name is ryan", out );
+    Tokenizer tokenizer = new KeywordTokenizer(new StringReader("Hello thEre my Name is Ryan"));
+    TokenStream stream = factory.create(tokenizer);
+    assertTokenStreamContents(stream, new String[] { "Hello there my name is ryan" });
     // now each token
     factory.onlyFirstWord = false;
-    out = tsToString( factory.create( new IterTokenStream( "Hello thEre my Name is Ryan" ) ) );
-    assertEquals( "Hello There My Name Is Ryan", out );
+    tokenizer = new WhitespaceTokenizer(new StringReader("Hello thEre my Name is Ryan"));
+    stream = factory.create(tokenizer);
+    assertTokenStreamContents(stream, new String[] { "Hello", "There", "My", "Name", "Is", "Ryan" });
     // now only the long words
     factory.minWordLength = 3;
-    out = tsToString( factory.create( new IterTokenStream( "Hello thEre my Name is Ryan" ) ) );
-    assertEquals( "Hello There my Name is Ryan", out );
+    tokenizer = new WhitespaceTokenizer(new StringReader("Hello thEre my Name is Ryan" ));
+    stream = factory.create(tokenizer);
+    assertTokenStreamContents(stream, new String[] { "Hello", "There", "my", "Name", "is", "Ryan" });
     // without prefix
-    out = tsToString( factory.create( new IterTokenStream( "McKinley" ) ) );
-    assertEquals( "Mckinley", out );
+    tokenizer = new WhitespaceTokenizer(new StringReader("McKinley" ));
+    stream = factory.create(tokenizer);
+    assertTokenStreamContents(stream, new String[] { "Mckinley" });
     // Now try some prefixes
     factory = new CapitalizationFilterFactory();
     args.put( "okPrefix", "McK" ); // all words
     factory.init( args );
-    out = tsToString( factory.create( new IterTokenStream( "McKinley" ) ) );
-    assertEquals( "McKinley", out );
+    tokenizer = new WhitespaceTokenizer(new StringReader("McKinley" ));
+    stream = factory.create(tokenizer);
+    assertTokenStreamContents(stream, new String[] { "McKinley" });
     // now try some stuff with numbers
     factory.forceFirstLetter = false;
     factory.onlyFirstWord = false;
-    out = tsToString( factory.create( new IterTokenStream( "1st 2nd third" ) ) );
-    assertEquals( "1st 2nd Third", out );
+    tokenizer = new WhitespaceTokenizer(new StringReader("1st 2nd third" ));
+    stream = factory.create(tokenizer);
+    assertTokenStreamContents(stream, new String[] { "1st", "2nd", "Third" });
     factory.forceFirstLetter = true;
-    out = tsToString( factory.create( new IterTokenStream( "the The the" ) ) );
-    assertEquals( "The The the", out );
+    tokenizer = new KeywordTokenizer(new StringReader("the The the" ));
+    stream = factory.create(tokenizer);
+    assertTokenStreamContents(stream, new String[] { "The The the" });
   }
   public void testKeepIgnoreCase() throws Exception {
@ -123,4 +134,80 @@ public class TestCapitalizationFilter extends BaseTokenTestCase {
     factory.processWord(termBuffer, 0, termBuffer.length, 0 );
     assertEquals( "Kitten", new String(termBuffer, 0, termBuffer.length));
   }
/**
* Test CapitalizationFilterFactory's minWordLength option.
*
* This is very weird when combined with ONLY_FIRST_WORD!!!
*/
public void testMinWordLength() throws Exception {
Map<String,String> args = new HashMap<String,String>();
args.put(CapitalizationFilterFactory.ONLY_FIRST_WORD, "true");
args.put(CapitalizationFilterFactory.MIN_WORD_LENGTH, "5");
CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
factory.init(args);
Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader(
"helo testing"));
TokenStream ts = factory.create(tokenizer);
assertTokenStreamContents(ts, new String[] {"helo", "Testing"});
}
/**
* Test CapitalizationFilterFactory's maxWordCount option with only words of 1
* in each token (it should do nothing)
*/
public void testMaxWordCount() throws Exception {
Map<String,String> args = new HashMap<String,String>();
args.put(CapitalizationFilterFactory.MAX_WORD_COUNT, "2");
CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
factory.init(args);
Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader(
"one two three four"));
TokenStream ts = factory.create(tokenizer);
assertTokenStreamContents(ts, new String[] {"One", "Two", "Three", "Four"});
}
/**
* Test CapitalizationFilterFactory's maxWordCount option when exceeded
*/
public void testMaxWordCount2() throws Exception {
Map<String,String> args = new HashMap<String,String>();
args.put(CapitalizationFilterFactory.MAX_WORD_COUNT, "2");
CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
factory.init(args);
Tokenizer tokenizer = new KeywordTokenizer(new StringReader(
"one two three four"));
TokenStream ts = factory.create(tokenizer);
assertTokenStreamContents(ts, new String[] {"one two three four"});
}
/**
* Test CapitalizationFilterFactory's maxTokenLength option when exceeded
*
* This is weird, it is not really a max, but inclusive (look at 'is')
*/
public void testMaxTokenLength() throws Exception {
Map<String,String> args = new HashMap<String,String>();
args.put(CapitalizationFilterFactory.MAX_TOKEN_LENGTH, "2");
CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
factory.init(args);
Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader(
"this is a test"));
TokenStream ts = factory.create(tokenizer);
assertTokenStreamContents(ts, new String[] {"this", "is", "A", "test"});
}
/**
* Test CapitalizationFilterFactory's forceFirstLetter option
*/
public void testForceFirstLetter() throws Exception {
Map<String,String> args = new HashMap<String,String>();
args.put(CapitalizationFilterFactory.KEEP, "kitten");
args.put(CapitalizationFilterFactory.FORCE_FIRST_LETTER, "true");
CapitalizationFilterFactory factory = new CapitalizationFilterFactory();
factory.init(args);
Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader("kitten"));
TokenStream ts = factory.create(tokenizer);
assertTokenStreamContents(ts, new String[] {"Kitten"});
}
}
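
(A detail worth calling out in the rewritten tests above, not stated in the commit itself: KeywordTokenizer emits the whole reader contents as a single token, while WhitespaceTokenizer emits one token per whitespace-separated word. That is why the same factory yields one "Hello there my name is ryan" token in the first case and six separate tokens in the others. A small illustrative sketch of the difference, using only plain Lucene classes:)

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.KeywordTokenizer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

class TokenizerComparison {
  static void dump(String label, TokenStream ts) throws IOException {
    TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
    System.out.print(label + ":");
    while (ts.incrementToken()) {
      System.out.print(" [" + termAtt.term() + "]");   // one bracket pair per emitted token
    }
    System.out.println();
    ts.close();
  }

  public static void main(String[] args) throws IOException {
    String text = "Hello thEre my Name is Ryan";
    // whole input as a single token -> the filter sees one long "word sequence"
    dump("keyword", new KeywordTokenizer(new StringReader(text)));
    // one token per word -> the filter sees six separate words
    dump("whitespace", new WhitespaceTokenizer(new StringReader(text)));
  }
}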

View File

@ -0,0 +1,41 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
/**
* Simple tests to ensure the Chinese filter factory is working.
*/
public class TestChineseFilterFactory extends BaseTokenTestCase {
/**
* Ensure the filter actually normalizes text (numerics, stopwords)
*/
public void testFiltering() throws Exception {
Reader reader = new StringReader("this 1234 Is such a silly filter");
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
ChineseFilterFactory factory = new ChineseFilterFactory();
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "Is", "silly", "filter" });
}
}

View File

@ -0,0 +1,38 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
/**
* Simple tests to ensure the Chinese tokenizer factory is working.
*/
public class TestChineseTokenizerFactory extends BaseTokenTestCase {
/**
* Ensure the tokenizer actually tokenizes chinese text correctly
*/
public void testTokenizer() throws Exception {
Reader reader = new StringReader("我是中国人");
ChineseTokenizerFactory factory = new ChineseTokenizerFactory();
TokenStream stream = factory.create(reader);
    assertTokenStreamContents(stream, new String[] {"我", "是", "中", "国", "人"});
}
}

View File

@ -20,6 +20,7 @@ package org.apache.solr.analysis;
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.StringReader;
 import java.text.Collator;
 import java.text.RuleBasedCollator;
 import java.util.HashMap;
@ -27,7 +28,9 @@ import java.util.List;
 import java.util.Locale;
 import java.util.Map;
+import org.apache.lucene.analysis.KeywordTokenizer;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.solr.common.ResourceLoader;
 public class TestCollationKeyFilterFactory extends BaseTokenTestCase {
@ -39,18 +42,80 @@ public class TestCollationKeyFilterFactory extends BaseTokenTestCase {
    * Then things will sort and match correctly.
    */
   public void testBasicUsage() throws IOException {
-    String[] turkishUpperCase = { "I", "WİLL", "USE", "TURKİSH", "CASING" };
-    String[] turkishLowerCase = { "ı", "will", "use", "turkish", "casıng" };
+    String turkishUpperCase = "I WİLL USE TURKİSH CASING";
+    String turkishLowerCase = "ı will use turkish casıng";
     CollationKeyFilterFactory factory = new CollationKeyFilterFactory();
     Map<String,String> args = new HashMap<String,String>();
     args.put("language", "tr");
     args.put("strength", "primary");
     factory.init(args);
     factory.inform(new StringMockSolrResourceLoader(""));
-    TokenStream tsUpper = factory.create(new IterTokenStream(turkishUpperCase));
-    TokenStream tsLower = factory.create(new IterTokenStream(turkishLowerCase));
-    assertTokEqual(BaseTokenTestCase.getTokens(tsUpper),
-        BaseTokenTestCase.getTokens(tsLower));
+    TokenStream tsUpper = factory.create(
+        new KeywordTokenizer(new StringReader(turkishUpperCase)));
+    TokenStream tsLower = factory.create(
+        new KeywordTokenizer(new StringReader(turkishLowerCase)));
+    assertCollatesToSame(tsUpper, tsLower);
+  }
/*
* Test usage of the decomposition option for unicode normalization.
*/
public void testNormalization() throws IOException {
String turkishUpperCase = "I W\u0049\u0307LL USE TURKİSH CASING";
String turkishLowerCase = "ı will use turkish casıng";
CollationKeyFilterFactory factory = new CollationKeyFilterFactory();
Map<String,String> args = new HashMap<String,String>();
args.put("language", "tr");
args.put("strength", "primary");
args.put("decomposition", "canonical");
factory.init(args);
factory.inform(new StringMockSolrResourceLoader(""));
TokenStream tsUpper = factory.create(
new KeywordTokenizer(new StringReader(turkishUpperCase)));
TokenStream tsLower = factory.create(
new KeywordTokenizer(new StringReader(turkishLowerCase)));
assertCollatesToSame(tsUpper, tsLower);
}
/*
* Test usage of the K decomposition option for unicode normalization.
* This works even with identical strength.
*/
public void testFullDecomposition() throws IOException {
    String fullWidth = "Ｔｅｓｔｉｎｇ";
String halfWidth = "Testing";
CollationKeyFilterFactory factory = new CollationKeyFilterFactory();
Map<String,String> args = new HashMap<String,String>();
args.put("language", "zh");
args.put("strength", "identical");
args.put("decomposition", "full");
factory.init(args);
factory.inform(new StringMockSolrResourceLoader(""));
TokenStream tsFull = factory.create(
new KeywordTokenizer(new StringReader(fullWidth)));
TokenStream tsHalf = factory.create(
new KeywordTokenizer(new StringReader(halfWidth)));
assertCollatesToSame(tsFull, tsHalf);
}
/*
* Test secondary strength, for english case is not significant.
*/
public void testSecondaryStrength() throws IOException {
String upperCase = "TESTING";
String lowerCase = "testing";
CollationKeyFilterFactory factory = new CollationKeyFilterFactory();
Map<String,String> args = new HashMap<String,String>();
args.put("language", "en");
args.put("strength", "secondary");
args.put("decomposition", "no");
factory.init(args);
factory.inform(new StringMockSolrResourceLoader(""));
TokenStream tsUpper = factory.create(
new KeywordTokenizer(new StringReader(upperCase)));
TokenStream tsLower = factory.create(
new KeywordTokenizer(new StringReader(lowerCase)));
assertCollatesToSame(tsUpper, tsLower);
   }
 /*
@ -74,20 +139,22 @@ public class TestCollationKeyFilterFactory extends BaseTokenTestCase {
     // at this point, you would save these tailoredRules to a file,
     // and use the custom parameter.
     //
-    String[] germanUmlaut = { "Töne" };
-    String[] germanOE = { "Toene" };
+    String germanUmlaut = "Töne";
+    String germanOE = "Toene";
     CollationKeyFilterFactory factory = new CollationKeyFilterFactory();
     Map<String,String> args = new HashMap<String,String>();
     args.put("custom", "rules.txt");
     args.put("strength", "primary");
     factory.init(args);
     factory.inform(new StringMockSolrResourceLoader(tailoredRules));
-    TokenStream tsUmlaut = factory.create(new IterTokenStream(germanUmlaut));
-    TokenStream tsOE = factory.create(new IterTokenStream(germanOE));
-    assertTokEqual(BaseTokenTestCase.getTokens(tsUmlaut),
-        BaseTokenTestCase.getTokens(tsOE));
-  }
+    TokenStream tsUmlaut = factory.create(
+        new KeywordTokenizer(new StringReader(germanUmlaut)));
+    TokenStream tsOE = factory.create(
+        new KeywordTokenizer(new StringReader(germanOE)));
+    assertCollatesToSame(tsUmlaut, tsOE);
+  }
   private class StringMockSolrResourceLoader implements ResourceLoader {
     String text;
@ -107,4 +174,17 @@ public class TestCollationKeyFilterFactory extends BaseTokenTestCase {
       return new ByteArrayInputStream(text.getBytes("UTF-8"));
     }
   }
private void assertCollatesToSame(TokenStream stream1, TokenStream stream2)
throws IOException {
TermAttribute term1 = (TermAttribute) stream1
.addAttribute(TermAttribute.class);
TermAttribute term2 = (TermAttribute) stream2
.addAttribute(TermAttribute.class);
assertTrue(stream1.incrementToken());
assertTrue(stream2.incrementToken());
assertEquals(term1.term(), term2.term());
assertFalse(stream1.incrementToken());
assertFalse(stream2.incrementToken());
}
}
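
(The assertCollatesToSame checks above rely on a property of the underlying collator rather than of Solr: at primary strength, a Turkish-locale collator treats case differences, including dotted/dotless i, as insignificant, so both strings map to the same sort key. A hedged sketch of that java.text behaviour, independent of the factory; the printed result mirrors what the test asserts:)

import java.text.Collator;
import java.util.Locale;

class TurkishCollationDemo {
  public static void main(String[] args) {
    Collator collator = Collator.getInstance(new Locale("tr"));
    collator.setStrength(Collator.PRIMARY);                    // ignore case and accents
    collator.setDecomposition(Collator.CANONICAL_DECOMPOSITION);

    String upper = "I WİLL USE TURKİSH CASING";
    String lower = "ı will use turkish casıng";

    // compare() == 0 means both strings produce the same collation key,
    // which is exactly the value CollationKeyFilter would index.
    System.out.println(collator.compare(upper, lower) == 0);   // expected: true
  }
}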

View File

@ -0,0 +1,51 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.solr.common.ResourceLoader;
/**
* Simple tests to ensure the Dictionary compound filter factory is working.
*/
public class TestDictionaryCompoundWordTokenFilterFactory extends BaseTokenTestCase {
/**
* Ensure the filter actually decompounds text.
*/
public void testDecompounding() throws Exception {
Reader reader = new StringReader("I like to play softball");
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
DictionaryCompoundWordTokenFilterFactory factory = new DictionaryCompoundWordTokenFilterFactory();
ResourceLoader loader = solrConfig.getResourceLoader();
Map<String,String> args = new HashMap<String,String>();
args.put("dictionary", "compoundDictionary.txt");
factory.init(args);
factory.inform(loader);
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream,
new String[] { "I", "like", "to", "play", "softball", "soft", "ball" });
}
}
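
(The test above expects the original token to be kept and its parts appended after it, and it reads its word list from compoundDictionary.txt in the Solr test config. As an illustration only, here is a sketch that drives the underlying Lucene filter directly with an in-memory dictionary; the two-argument (stream, dictionary) constructor and the dictionary contents are assumptions about the contrib module in this Lucene version, not something the commit shows.)

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

class DecompoundSketch {
  public static void main(String[] args) throws IOException {
    // Assumed dictionary entries; the real test loads them from compoundDictionary.txt.
    String[] dictionary = { "soft", "ball" };
    TokenStream ts = new DictionaryCompoundWordTokenFilter(
        new WhitespaceTokenizer(new StringReader("I like to play softball")),
        dictionary);
    TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
      // Prints each original token; "soft" and "ball" appear after "softball".
      System.out.println(termAtt.term());
    }
    ts.close();
  }
}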

View File

@ -0,0 +1,41 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
/**
* Simple tests to ensure the Dutch stem filter factory is working.
*/
public class TestDutchStemFilterFactory extends BaseTokenTestCase {
/**
* Ensure the filter actually stems text.
*/
public void testStemming() throws Exception {
Reader reader = new StringReader("lichamelijkheden");
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
DutchStemFilterFactory factory = new DutchStemFilterFactory();
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "licham" });
}
}

View File

@ -0,0 +1,50 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.solr.common.ResourceLoader;
/**
* Simple tests to ensure the French elision filter factory is working.
*/
public class TestElisionFilterFactory extends BaseTokenTestCase {
/**
* Ensure the filter actually normalizes text.
*/
public void testElision() throws Exception {
Reader reader = new StringReader("l'avion");
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
ElisionFilterFactory factory = new ElisionFilterFactory();
ResourceLoader loader = solrConfig.getResourceLoader();
Map<String,String> args = new HashMap<String,String>();
args.put("articles", "frenchArticles.txt");
factory.init(args);
factory.inform(loader);
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "avion" });
}
}

View File

@ -0,0 +1,41 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
/**
* Simple tests to ensure the French stem filter factory is working.
*/
public class TestFrenchStemFilterFactory extends BaseTokenTestCase {
/**
* Ensure the filter actually stems text.
*/
public void testStemming() throws Exception {
Reader reader = new StringReader("habitable");
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
FrenchStemFilterFactory factory = new FrenchStemFilterFactory();
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "habit" });
}
}

View File

@ -0,0 +1,41 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
/**
* Simple tests to ensure the German stem filter factory is working.
*/
public class TestGermanStemFilterFactory extends BaseTokenTestCase {
/**
* Ensure the filter actually stems text.
*/
public void testStemming() throws Exception {
Reader reader = new StringReader("Tischen");
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
GermanStemFilterFactory factory = new GermanStemFilterFactory();
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "tisch" });
}
}

View File

@ -0,0 +1,41 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
/**
* Simple tests to ensure the Greek lowercase filter factory is working.
*/
public class TestGreekLowerCaseFilterFactory extends BaseTokenTestCase {
/**
* Ensure the filter actually lowercases (and a bit more) greek text.
*/
public void testStemming() throws Exception {
Reader reader = new StringReader("Μάϊος ΜΆΪΟΣ");
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
GreekLowerCaseFilterFactory factory = new GreekLowerCaseFilterFactory();
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "μαιοσ", "μαιοσ" });
}
}

View File

@ -28,12 +28,24 @@ import org.apache.lucene.analysis.WhitespaceTokenizer;
 public class TestHyphenatedWordsFilter extends BaseTokenTestCase {
   public void testHyphenatedWords() throws Exception {
     String input = "ecologi-\r\ncal devel-\r\n\r\nop compre-\u0009hensive-hands-on and ecologi-\ncal";
-    String outputAfterHyphenatedWordsFilter = "ecological develop comprehensive-hands-on and ecological";
     // first test
     TokenStream ts = new WhitespaceTokenizer(new StringReader(input));
-    ts = new HyphenatedWordsFilter(ts);
-    String actual = tsToString(ts);
-    assertEquals("Testing HyphenatedWordsFilter",
-        outputAfterHyphenatedWordsFilter, actual);
+    HyphenatedWordsFilterFactory factory = new HyphenatedWordsFilterFactory();
+    ts = factory.create(ts);
+    assertTokenStreamContents(ts,
+        new String[] { "ecological", "develop", "comprehensive-hands-on", "and", "ecological" });
   }
/**
* Test that HyphenatedWordsFilter behaves correctly with a final hyphen
*/
public void testHyphenAtEnd() throws Exception {
String input = "ecologi-\r\ncal devel-\r\n\r\nop compre-\u0009hensive-hands-on and ecology-";
// first test
TokenStream ts = new WhitespaceTokenizer(new StringReader(input));
HyphenatedWordsFilterFactory factory = new HyphenatedWordsFilterFactory();
ts = factory.create(ts);
assertTokenStreamContents(ts,
new String[] { "ecological", "develop", "comprehensive-hands-on", "and", "ecology-" });
}
}

View File

@ -17,13 +17,14 @@
 package org.apache.solr.analysis;
+import java.io.StringReader;
 import java.util.HashMap;
 import java.util.HashSet;
-import java.util.List;
 import java.util.Map;
 import java.util.Set;
-import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
 /**
@ -37,7 +38,7 @@ public class TestKeepWordFilter extends BaseTokenTestCase {
     words.add( "aaa" );
     words.add( "bbb" );
-    List<Token> input = tokens( "aaa BBB ccc ddd EEE" );
+    String input = "aaa BBB ccc ddd EEE";
     Map<String,String> args = new HashMap<String, String>();
@ -47,18 +48,28 @@ public class TestKeepWordFilter extends BaseTokenTestCase {
     factory.init( args );
     factory.inform( solrConfig.getResourceLoader() );
     factory.setWords( words );
+    assertTrue(factory.isIgnoreCase());
+    TokenStream stream = factory.create(new WhitespaceTokenizer(new StringReader(input)));
+    assertTokenStreamContents(stream, new String[] { "aaa", "BBB" });
-    List<Token> expect = tokens( "aaa BBB" );
-    List<Token> real = getTokens(factory.create( new IterTokenStream(input) ));
-    assertTokEqual( expect, real );
+    // Test Stopwords (ignoreCase via the setter instead)
+    factory = new KeepWordFilterFactory();
+    args = new HashMap<String, String>();
+    factory.init( args );
+    factory.inform( solrConfig.getResourceLoader() );
+    factory.setIgnoreCase(true);
+    factory.setWords( words );
+    assertTrue(factory.isIgnoreCase());
+    stream = factory.create(new WhitespaceTokenizer(new StringReader(input)));
+    assertTokenStreamContents(stream, new String[] { "aaa", "BBB" });
     // Now force case
+    args = new HashMap<String, String>();
     args.put( "ignoreCase", "false" );
     factory.init( args );
     factory.inform( solrConfig.getResourceLoader() );
+    assertFalse(factory.isIgnoreCase());
-    expect = tokens( "aaa" );
-    real = getTokens(factory.create( new IterTokenStream(input) ));
-    assertTokEqual( expect, real );
+    stream = factory.create(new WhitespaceTokenizer(new StringReader(input)));
+    assertTokenStreamContents(stream, new String[] { "aaa" });
   }
 }

View File

@ -1,37 +1,27 @@
 package org.apache.solr.analysis;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
-import org.junit.Assert;
 import org.junit.Test;
 import java.io.IOException;
 import java.io.StringReader;
 import java.util.ArrayList;
-import java.util.Arrays;
 import java.util.List;
 /**
- * @version $Id$
  * @since solr 1.4
  */
-public class TestMultiWordSynonyms {
+public class TestMultiWordSynonyms extends BaseTokenTestCase {
   @Test
-  public void testMultiWordSynonmys() throws IOException {
+  public void testMultiWordSynonyms() throws IOException {
     List<String> rules = new ArrayList<String>();
     rules.add("a b c,d");
     SynonymMap synMap = new SynonymMap(true);
     SynonymFilterFactory.parseRules(rules, synMap, "=>", ",", true, null);
     SynonymFilter ts = new SynonymFilter(new WhitespaceTokenizer(new StringReader("a e")), synMap);
-    TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);
-    ts.reset();
-    List<String> tokens = new ArrayList<String>();
-    while (ts.incrementToken()) tokens.add(termAtt.term());
     // This fails because ["e","e"] is the value of the token stream
-    Assert.assertEquals(Arrays.asList("a", "e"), tokens);
+    assertTokenStreamContents(ts, new String[] { "a", "e" });
   }
 }

View File

@ -0,0 +1,163 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
/**
* Simple tests to ensure the NGram filter factories are working.
*/
public class TestNGramFilters extends BaseTokenTestCase {
/**
* Test NGramTokenizerFactory
*/
public void testNGramTokenizer() throws Exception {
Reader reader = new StringReader("test");
Map<String,String> args = new HashMap<String,String>();
NGramTokenizerFactory factory = new NGramTokenizerFactory();
factory.init(args);
Tokenizer stream = factory.create(reader);
assertTokenStreamContents(stream,
new String[] { "t", "e", "s", "t", "te", "es", "st" });
}
/**
* Test NGramTokenizerFactory with min and max gram options
*/
public void testNGramTokenizer2() throws Exception {
Reader reader = new StringReader("test");
Map<String,String> args = new HashMap<String,String>();
args.put("minGramSize", "2");
args.put("maxGramSize", "3");
NGramTokenizerFactory factory = new NGramTokenizerFactory();
factory.init(args);
Tokenizer stream = factory.create(reader);
assertTokenStreamContents(stream,
new String[] { "te", "es", "st", "tes", "est" });
}
/**
* Test the NGramFilterFactory
*/
public void testNGramFilter() throws Exception {
Reader reader = new StringReader("test");
Map<String,String> args = new HashMap<String,String>();
NGramFilterFactory factory = new NGramFilterFactory();
factory.init(args);
TokenStream stream = factory.create(new WhitespaceTokenizer(reader));
assertTokenStreamContents(stream,
new String[] { "t", "e", "s", "t", "te", "es", "st" });
}
/**
* Test the NGramFilterFactory with min and max gram options
*/
public void testNGramFilter2() throws Exception {
Reader reader = new StringReader("test");
Map<String,String> args = new HashMap<String,String>();
args.put("minGramSize", "2");
args.put("maxGramSize", "3");
NGramFilterFactory factory = new NGramFilterFactory();
factory.init(args);
TokenStream stream = factory.create(new WhitespaceTokenizer(reader));
assertTokenStreamContents(stream,
new String[] { "te", "es", "st", "tes", "est" });
}
/**
* Test EdgeNGramTokenizerFactory
*/
public void testEdgeNGramTokenizer() throws Exception {
Reader reader = new StringReader("test");
Map<String,String> args = new HashMap<String,String>();
EdgeNGramTokenizerFactory factory = new EdgeNGramTokenizerFactory();
factory.init(args);
Tokenizer stream = factory.create(reader);
assertTokenStreamContents(stream,
new String[] { "t" });
}
/**
* Test EdgeNGramTokenizerFactory with min and max gram size
*/
public void testEdgeNGramTokenizer2() throws Exception {
Reader reader = new StringReader("test");
Map<String,String> args = new HashMap<String,String>();
args.put("minGramSize", "1");
args.put("maxGramSize", "2");
EdgeNGramTokenizerFactory factory = new EdgeNGramTokenizerFactory();
factory.init(args);
Tokenizer stream = factory.create(reader);
assertTokenStreamContents(stream,
new String[] { "t", "te" });
}
/**
* Test EdgeNGramTokenizerFactory with side option
*/
public void testEdgeNGramTokenizer3() throws Exception {
Reader reader = new StringReader("ready");
Map<String,String> args = new HashMap<String,String>();
args.put("side", "back");
EdgeNGramTokenizerFactory factory = new EdgeNGramTokenizerFactory();
factory.init(args);
Tokenizer stream = factory.create(reader);
assertTokenStreamContents(stream,
new String[] { "y" });
}
/**
* Test EdgeNGramFilterFactory
*/
public void testEdgeNGramFilter() throws Exception {
Reader reader = new StringReader("test");
Map<String,String> args = new HashMap<String,String>();
EdgeNGramFilterFactory factory = new EdgeNGramFilterFactory();
factory.init(args);
TokenStream stream = factory.create(new WhitespaceTokenizer(reader));
assertTokenStreamContents(stream,
new String[] { "t" });
}
/**
* Test EdgeNGramFilterFactory with min and max gram size
*/
public void testEdgeNGramFilter2() throws Exception {
Reader reader = new StringReader("test");
Map<String,String> args = new HashMap<String,String>();
args.put("minGramSize", "1");
args.put("maxGramSize", "2");
EdgeNGramFilterFactory factory = new EdgeNGramFilterFactory();
factory.init(args);
TokenStream stream = factory.create(new WhitespaceTokenizer(reader));
assertTokenStreamContents(stream,
new String[] { "t", "te" });
}
/**
* Test EdgeNGramFilterFactory with side option
*/
public void testEdgeNGramFilter3() throws Exception {
Reader reader = new StringReader("ready");
Map<String,String> args = new HashMap<String,String>();
args.put("side", "back");
EdgeNGramFilterFactory factory = new EdgeNGramFilterFactory();
factory.init(args);
TokenStream stream = factory.create(new WhitespaceTokenizer(reader));
assertTokenStreamContents(stream,
new String[] { "y" });
}
}
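
(One non-obvious point these expectations encode: the n-gram tokenizer and filter emit every gram of the smallest size before moving to the next size, rather than walking position by position, which is why min=2/max=3 over "test" yields "te", "es", "st", "tes", "est". A hedged sketch driving the Lucene tokenizer directly, with the constructor arguments assumed from this Lucene version:)

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

class NGramOrderDemo {
  public static void main(String[] args) throws IOException {
    // minGram = 2, maxGram = 3 over the input "test"
    NGramTokenizer ngrams = new NGramTokenizer(new StringReader("test"), 2, 3);
    TermAttribute termAtt = (TermAttribute) ngrams.addAttribute(TermAttribute.class);
    while (ngrams.incrementToken()) {
      System.out.print(termAtt.term() + " ");   // expected: te es st tes est
    }
    System.out.println();
    ngrams.close();
  }
}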

View File

@ -19,6 +19,8 @@ package org.apache.solr.analysis;
 import java.io.IOException;
 import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Map;
 import org.apache.lucene.analysis.CharReader;
 import org.apache.lucene.analysis.CharStream;
@ -37,20 +39,33 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
   // this is test.
   public void testNothingChange() throws IOException {
     final String BLOCK = "this is test.";
-    CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)\\s+(cc)", "$1$2$3",
+    PatternReplaceCharFilterFactory factory = new PatternReplaceCharFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("pattern", "(aa)\\s+(bb)\\s+(cc)");
+    args.put("replacement", "$1$2$3");
+    factory.init(args);
+    CharStream cs = factory.create(
         CharReader.get( new StringReader( BLOCK ) ) );
     TokenStream ts = new WhitespaceTokenizer( cs );
-    assertTokEqualOff( tokens( "this,1,0,4 is,1,5,7 test.,1,8,13" ), getTokens( ts ) );
+    assertTokenStreamContents(ts,
+        new String[] { "this", "is", "test." },
+        new int[] { 0, 5, 8 },
+        new int[] { 4, 7, 13 },
+        new int[] { 1, 1, 1 });
   }
   // 012345678
   // aa bb cc
   public void testReplaceByEmpty() throws IOException {
     final String BLOCK = "aa bb cc";
-    CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)\\s+(cc)", "",
+    PatternReplaceCharFilterFactory factory = new PatternReplaceCharFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("pattern", "(aa)\\s+(bb)\\s+(cc)");
+    factory.init(args);
+    CharStream cs = factory.create(
         CharReader.get( new StringReader( BLOCK ) ) );
     TokenStream ts = new WhitespaceTokenizer( cs );
-    assertEquals( 0, getTokens( ts ).size() );
+    assertFalse(ts.incrementToken());
   }
   // 012345678
@ -58,10 +73,19 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
   // aa#bb#cc
   public void test1block1matchSameLength() throws IOException {
     final String BLOCK = "aa bb cc";
-    CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)\\s+(cc)", "$1#$2#$3",
+    PatternReplaceCharFilterFactory factory = new PatternReplaceCharFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("pattern", "(aa)\\s+(bb)\\s+(cc)");
+    args.put("replacement", "$1#$2#$3");
+    factory.init(args);
+    CharStream cs = factory.create(
         CharReader.get( new StringReader( BLOCK ) ) );
     TokenStream ts = new WhitespaceTokenizer( cs );
-    assertTokEqualOff( tokens( "aa#bb#cc,1,0,8" ), getTokens( ts ) );
+    assertTokenStreamContents(ts,
+        new String[] { "aa#bb#cc" },
+        new int[] { 0 },
+        new int[] { 8 },
+        new int[] { 1 });
   }
   // 11111
@ -73,7 +97,11 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
     CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)\\s+(cc)", "$1##$2###$3",
         CharReader.get( new StringReader( BLOCK ) ) );
     TokenStream ts = new WhitespaceTokenizer( cs );
-    assertTokEqualOff( tokens( "aa##bb###cc,1,0,8 dd,1,9,11" ), getTokens( ts ) );
+    assertTokenStreamContents(ts,
+        new String[] { "aa##bb###cc", "dd" },
+        new int[] { 0, 9 },
+        new int[] { 8, 11 },
+        new int[] { 1, 1 });
   }
   // 01234567
@ -84,7 +112,11 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
     CharStream cs = new PatternReplaceCharFilter( "a", "aa",
         CharReader.get( new StringReader( BLOCK ) ) );
     TokenStream ts = new WhitespaceTokenizer( cs );
-    assertTokEqualOff( tokens( "aa,1,1,2 aa,1,4,5" ), getTokens( ts ) );
+    assertTokenStreamContents(ts,
+        new String[] { "aa", "aa" },
+        new int[] { 1, 4 },
+        new int[] { 2, 5 },
+        new int[] { 1, 1 });
   }
   // 11111
@ -96,7 +128,11 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
     CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)\\s+(cc)", "$1#$2",
         CharReader.get( new StringReader( BLOCK ) ) );
     TokenStream ts = new WhitespaceTokenizer( cs );
-    assertTokEqualOff( tokens( "aa#bb,1,0,11 dd,1,12,14" ), getTokens( ts ) );
+    assertTokenStreamContents(ts,
+        new String[] { "aa#bb", "dd" },
+        new int[] { 0, 12 },
+        new int[] { 11, 14 },
+        new int[] { 1, 1 });
   }
   // 111111111122222222223333
@ -108,8 +144,11 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
     CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)\\s+(cc)", "$1 $2 $3",
         CharReader.get( new StringReader( BLOCK ) ) );
     TokenStream ts = new WhitespaceTokenizer( cs );
-    assertTokEqualOff( tokens( "aa,1,2,4 bb,1,6,8 cc,1,9,10 ---,1,11,14 aa,1,15,17 bb,1,18,20 aa,1,21,23 bb,1,25,27 cc,1,29,33" ),
-        getTokens( ts ) );
+    assertTokenStreamContents(ts,
+        new String[] { "aa", "bb", "cc", "---", "aa", "bb", "aa", "bb", "cc" },
+        new int[] { 2, 6, 9, 11, 15, 18, 21, 25, 29 },
+        new int[] { 4, 8, 10, 14, 17, 20, 23, 27, 33 },
+        new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1 });
   }
   // 11111111112222222222333333333
@ -121,8 +160,11 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
     CharStream cs = new PatternReplaceCharFilter( "(aa)\\s+(bb)", "$1##$2", ".",
         CharReader.get( new StringReader( BLOCK ) ) );
     TokenStream ts = new WhitespaceTokenizer( cs );
-    assertTokEqualOff( tokens( "aa##bb,1,2,7 cc,1,8,10 ---,1,11,14 aa##bb,1,15,20 aa.,1,21,24 bb,1,25,27 aa##bb,1,28,35 cc,1,36,38" ),
-        getTokens( ts ) );
+    assertTokenStreamContents(ts,
+        new String[] { "aa##bb", "cc", "---", "aa##bb", "aa.", "bb", "aa##bb", "cc" },
+        new int[] { 2, 8, 11, 15, 21, 25, 28, 36 },
+        new int[] { 7, 10, 14, 20, 24, 27, 35, 38 },
+        new int[] { 1, 1, 1, 1, 1, 1, 1, 1 });
   }
   // 11111111112222222222333333333
@ -136,7 +178,10 @@ public class TestPatternReplaceCharFilter extends BaseTokenTestCase {
     cs = new PatternReplaceCharFilter( "bb", "b", ".", cs );
     cs = new PatternReplaceCharFilter( "ccc", "c", ".", cs );
     TokenStream ts = new WhitespaceTokenizer( cs );
-    assertTokEqualOff( tokens( "aa,1,1,2 b,1,3,5 -,1,6,7 c,1,8,11 .,1,12,13 ---,1,14,17 b,1,18,20 aa,1,21,22 .,1,23,24 c,1,25,28 c,1,29,32 b,1,33,35" ),
-        getTokens( ts ) );
+    assertTokenStreamContents(ts,
+        new String[] { "aa", "b", "-", "c", ".", "---", "b", "aa", ".", "c", "c", "b" },
+        new int[] { 1, 3, 6, 8, 12, 14, 18, 21, 23, 25, 29, 33 },
+        new int[] { 2, 5, 7, 11, 13, 17, 20, 22, 24, 28, 32, 35 },
+        new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 });
   }
 }
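
(The offset arrays in these assertions are the whole point of the char-filter tests: PatternReplaceCharFilter rewrites the character stream before tokenization but keeps offsets pointing at the original text, so a highlighter can still map tokens back to what was indexed. A rough sketch of inspecting those corrected offsets with the new attribute API; the stream construction mirrors the tests above, and the printing loop is only for illustration:)

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.CharReader;
import org.apache.lucene.analysis.CharStream;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.solr.analysis.PatternReplaceCharFilter;

class CharFilterOffsetDemo {
  public static void main(String[] args) throws IOException {
    String original = "aa bb cc";
    // Same construction as the tests: collapse "aa bb cc" to "aa#bb#cc" before tokenizing.
    CharStream cs = new PatternReplaceCharFilter("(aa)\\s+(bb)\\s+(cc)", "$1#$2#$3",
        CharReader.get(new StringReader(original)));
    TokenStream ts = new WhitespaceTokenizer(cs);
    TermAttribute termAtt = (TermAttribute) ts.addAttribute(TermAttribute.class);
    OffsetAttribute offsetAtt = (OffsetAttribute) ts.addAttribute(OffsetAttribute.class);
    while (ts.incrementToken()) {
      // Offsets refer to positions in the original string, e.g. 0..8 for "aa#bb#cc".
      System.out.println(termAtt.term() + " -> [" + offsetAtt.startOffset()
          + "," + offsetAtt.endOffset() + ")");
    }
    ts.close();
  }
}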

View File

@ -17,7 +17,6 @@
 package org.apache.solr.analysis;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.WhitespaceTokenizer;
@ -27,7 +26,7 @@ import java.util.regex.Pattern;
 /**
  * @version $Id:$
  */
-public class TestPatternReplaceFilter extends AnalysisTestCase {
+public class TestPatternReplaceFilter extends BaseTokenTestCase {
   public void testReplaceAll() throws Exception {
     String input = "aabfooaabfooabfoob ab caaaaaaaaab";
@ -35,14 +34,8 @@ public class TestPatternReplaceFilter extends AnalysisTestCase {
         (new WhitespaceTokenizer(new StringReader(input)),
             Pattern.compile("a*b"),
             "-", true);
-    Token token = ts.next();
-    assertEquals("-foo-foo-foo-", new String(token.termBuffer(), 0, token.termLength()));
-    token = ts.next();
-    assertEquals("-", new String(token.termBuffer(), 0, token.termLength()));
-    token = ts.next();
-    assertEquals("c-", new String(token.termBuffer(), 0, token.termLength()));
-    token = ts.next();
-    assertNull(token);
+    assertTokenStreamContents(ts,
+        new String[] { "-foo-foo-foo-", "-", "c-" });
   }
   public void testReplaceFirst() throws Exception {
@ -51,14 +44,8 @@ public class TestPatternReplaceFilter extends AnalysisTestCase {
         (new WhitespaceTokenizer(new StringReader(input)),
             Pattern.compile("a*b"),
             "-", false);
-    Token token = ts.next();
-    assertEquals("-fooaabfooabfoob", new String(token.termBuffer(), 0, token.termLength()));
-    token = ts.next();
-    assertEquals("-", new String(token.termBuffer(), 0, token.termLength()));
-    token = ts.next();
-    assertEquals("c-", new String(token.termBuffer(), 0, token.termLength()));
-    token = ts.next();
-    assertNull(token);
+    assertTokenStreamContents(ts,
+        new String[] { "-fooaabfooabfoob", "-", "c-" });
   }
   public void testStripFirst() throws Exception {
@ -67,14 +54,8 @@ public class TestPatternReplaceFilter extends AnalysisTestCase {
         (new WhitespaceTokenizer(new StringReader(input)),
             Pattern.compile("a*b"),
             null, false);
-    Token token = ts.next();
-    assertEquals("fooaabfooabfoob", new String(token.termBuffer(), 0, token.termLength()));
-    token = ts.next();
-    assertEquals("", new String(token.termBuffer(), 0, token.termLength()));
-    token = ts.next();
-    assertEquals("c", new String(token.termBuffer(), 0, token.termLength()));
-    token = ts.next();
-    assertNull(token);
+    assertTokenStreamContents(ts,
+        new String[] { "fooaabfooabfoob", "", "c" });
   }
   public void testStripAll() throws Exception {
@ -83,14 +64,8 @@ public class TestPatternReplaceFilter extends AnalysisTestCase {
         (new WhitespaceTokenizer(new StringReader(input)),
             Pattern.compile("a*b"),
             null, true);
-    Token token = ts.next();
-    assertEquals("foofoofoo", new String(token.termBuffer(), 0, token.termLength()));
-    token = ts.next();
-    assertEquals("", new String(token.termBuffer(), 0, token.termLength()));
-    token = ts.next();
-    assertEquals("c", new String(token.termBuffer(), 0, token.termLength()));
-    token = ts.next();
-    assertNull(token);
+    assertTokenStreamContents(ts,
+        new String[] { "foofoofoo", "", "c" });
   }
   public void testReplaceAllWithBackRef() throws Exception {
@ -99,14 +74,8 @@ public class TestPatternReplaceFilter extends AnalysisTestCase {
         (new WhitespaceTokenizer(new StringReader(input)),
             Pattern.compile("(a*)b"),
             "$1\\$", true);
-    Token token = ts.next();
-    assertEquals("aa$fooaa$fooa$foo$", new String(token.termBuffer(), 0, token.termLength()));
-    token = ts.next();
-    assertEquals("a$", new String(token.termBuffer(), 0, token.termLength()));
-    token = ts.next();
-    assertEquals("caaaaaaaaa$", new String(token.termBuffer(), 0, token.termLength()));
-    token = ts.next();
-    assertNull(token);
+    assertTokenStreamContents(ts,
+        new String[] { "aa$fooaa$fooa$foo$", "a$", "caaaaaaaaa$" });
   }
 }

View File

@ -17,6 +17,7 @@
 package org.apache.solr.analysis;
+import java.io.IOException;
 import java.io.StringReader;
 import java.util.ArrayList;
 import java.util.HashMap;
@ -27,8 +28,8 @@ import org.apache.lucene.analysis.CharReader;
 import org.apache.lucene.analysis.CharStream;
 import org.apache.lucene.analysis.MappingCharFilter;
 import org.apache.lucene.analysis.NormalizeCharMap;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 public class TestPatternTokenizerFactory extends BaseTokenTestCase
 {
@ -57,7 +58,7 @@ public class TestPatternTokenizerFactory extends BaseTokenTestCase
       tokenizer.init( args );
       TokenStream stream = tokenizer.create( new StringReader( test[2] ) );
-      String out = TestHyphenatedWordsFilter.tsToString( stream );
+      String out = tsToString( stream );
       System.out.println( test[2] + " ==> " + out );
       assertEquals("pattern: "+test[1]+" with input: "+test[2], test[3], out );
@ -93,20 +94,45 @@ public class TestPatternTokenizerFactory extends BaseTokenTestCase
     PatternTokenizerFactory tokFactory = new PatternTokenizerFactory();
     tokFactory.init( args );
     TokenStream stream = tokFactory.create( charStream );
-    List<Token> result = getTokens( stream );
-    List<Token> expect = tokens( "Günther,1,0,12 Günther,1,13,25 is,1,26,28 here,1,29,33" );
-    assertTokEqualOff( expect, result );
+    assertTokenStreamContents(stream,
+        new String[] { "Günther", "Günther", "is", "here" },
+        new int[] { 0, 13, 26, 29 },
+        new int[] { 12, 25, 28, 33 },
+        new int[] { 1, 1, 1, 1 });
-    charStream.reset();
+    charStream = new MappingCharFilter( normMap, CharReader.get( new StringReader( INPUT ) ) );
     args.put( PatternTokenizerFactory.PATTERN, "Günther" );
     args.put( PatternTokenizerFactory.GROUP, "0" );
     tokFactory = new PatternTokenizerFactory();
     tokFactory.init( args );
     stream = tokFactory.create( charStream );
-    result = getTokens( stream );
-    expect = tokens( "Günther,1,0,12 Günther,1,13,25" );
-    assertTokEqualOff( expect, result );
+    assertTokenStreamContents(stream,
+        new String[] { "Günther", "Günther" },
+        new int[] { 0, 13 },
+        new int[] { 12, 25 },
+        new int[] { 1, 1 });
+  }
+  /**
+   * TODO: rewrite tests not to use string comparison.
+   * @deprecated only tests TermAttribute!
+   */
+  private static String tsToString(TokenStream in) throws IOException {
+    StringBuilder out = new StringBuilder();
+    TermAttribute termAtt = (TermAttribute) in.addAttribute(TermAttribute.class);
+    // extra safety to enforce, that the state is not preserved and also
+    // assign bogus values
+    in.clearAttributes();
+    termAtt.setTermBuffer("bogusTerm");
+    while (in.incrementToken()) {
+      if (out.length() > 0)
+        out.append(' ');
+      out.append(termAtt.term());
+      in.clearAttributes();
+      termAtt.setTermBuffer("bogusTerm");
+    }
+    in.close();
+    return out.toString();
   }
 }

View File

@ -0,0 +1,41 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
/**
* Simple tests to ensure the Persian normalization factory is working.
*/
public class TestPersianNormalizationFilterFactory extends BaseTokenTestCase {
/**
* Ensure the filter actually normalizes persian text.
*/
public void testNormalization() throws Exception {
Reader reader = new StringReader("های");
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
PersianNormalizationFilterFactory factory = new PersianNormalizationFilterFactory();
TokenStream stream = factory.create(tokenizer);
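    // the input ends in Farsi Yeh (U+06CC); normalization should rewrite it to Arabic Yeh (U+064A)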
assertTokenStreamContents(stream, new String[] { "هاي" });
}
}


@ -17,16 +17,14 @@
package org.apache.solr.analysis;

import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.codec.language.Metaphone;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;

/**
@ -61,50 +59,38 @@ public class TestPhoneticFilter extends BaseTokenTestCase {
    assertFalse( ff.inject );
  }

  public void testAlgorithms() throws Exception {
    assertAlgorithm("Metaphone", "true", "aaa bbb ccc easgasg",
        new String[] { "A", "aaa", "B", "bbb", "KKK", "ccc", "ESKS", "easgasg" });
    assertAlgorithm("Metaphone", "false", "aaa bbb ccc easgasg",
        new String[] { "A", "B", "KKK", "ESKS" });

    assertAlgorithm("DoubleMetaphone", "true", "aaa bbb ccc easgasg",
        new String[] { "A", "aaa", "PP", "bbb", "KK", "ccc", "ASKS", "easgasg" });
    assertAlgorithm("DoubleMetaphone", "false", "aaa bbb ccc easgasg",
        new String[] { "A", "PP", "KK", "ASKS" });

    assertAlgorithm("Soundex", "true", "aaa bbb ccc easgasg",
        new String[] { "A000", "aaa", "B000", "bbb", "C000", "ccc", "E220", "easgasg" });
    assertAlgorithm("Soundex", "false", "aaa bbb ccc easgasg",
        new String[] { "A000", "B000", "C000", "E220" });

    assertAlgorithm("RefinedSoundex", "true", "aaa bbb ccc easgasg",
        new String[] { "A0", "aaa", "B1", "bbb", "C3", "ccc", "E034034", "easgasg" });
    assertAlgorithm("RefinedSoundex", "false", "aaa bbb ccc easgasg",
        new String[] { "A0", "B1", "C3", "E034034" });
  }

  static void assertAlgorithm(String algName, String inject, String input,
      String[] expected) throws Exception {
    Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader(input));
    Map<String,String> args = new HashMap<String,String>();
    args.put("encoder", algName);
    args.put("inject", inject);
    PhoneticFilterFactory factory = new PhoneticFilterFactory();
    factory.init(args);
    TokenStream stream = factory.create(tokenizer);
    assertTokenStreamContents(stream, expected);
  }
}
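For comparison with the factory-driven assertAlgorithm above: the "encoder"/"inject" arguments select a commons-codec encoder, which the replaced runner() wired up directly. A hedged sketch of that direct construction (the input string and expected terms are illustrative; "text" is only a field name, and the constructor shape follows the old code above):

  Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader("aaa bbb"));
  TokenStream stream = new PhoneticFilter(tokenizer, new Metaphone(), "text", false);
  assertTokenStreamContents(stream, new String[] { "A", "B" });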


@ -0,0 +1,41 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
/**
* Simple tests to ensure the Porter stem filter factory is working.
*/
public class TestPorterStemFilterFactory extends BaseTokenTestCase {
/**
* Ensure the filter actually stems text.
*/
public void testStemming() throws Exception {
Reader reader = new StringReader("dogs");
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
PorterStemFilterFactory factory = new PorterStemFilterFactory();
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "dog" });
}
}


@ -20,10 +20,14 @@ package org.apache.solr.analysis;
import junit.framework.TestCase;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;

import java.util.Iterator;
import java.util.Arrays;

public class TestRemoveDuplicatesTokenFilter extends BaseTokenTestCase {

  public static Token tok(int pos, String t, int start, int end) {
    Token tok = new Token(t,start,end);
@ -38,15 +42,27 @@ public class TestRemoveDuplicatesTokenFilter extends AnalysisTestCase {
       throws Exception {
    final Iterator<Token> toks = Arrays.asList(tokens).iterator();
    RemoveDuplicatesTokenFilterFactory factory = new RemoveDuplicatesTokenFilterFactory();
    final TokenStream ts = factory.create
      (new TokenStream() {
          TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
          OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
          PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
          public boolean incrementToken() {
            if (toks.hasNext()) {
              clearAttributes();
              Token tok = toks.next();
              termAtt.setTermBuffer(tok.term());
              offsetAtt.setOffset(tok.startOffset(), tok.endOffset());
              posIncAtt.setPositionIncrement(tok.getPositionIncrement());
              return true;
            } else {
              return false;
            }
          }
        });

    assertTokenStreamContents(ts, expected.split("\\s"));
  }

  public void testNoDups() throws Exception {


@ -0,0 +1,41 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
/**
* Simple tests to ensure the Reverse string filter factory is working.
*/
public class TestReverseStringFilterFactory extends BaseTokenTestCase {
/**
* Ensure the filter actually reverses text.
*/
public void testReversing() throws Exception {
Reader reader = new StringReader("simple test");
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
ReverseStringFilterFactory factory = new ReverseStringFilterFactory();
TokenStream stream = factory.create(tokenizer);
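    // each whitespace-delimited token is reversed independently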
assertTokenStreamContents(stream, new String[] { "elpmis", "tset" });
}
}


@ -21,11 +21,9 @@ import java.io.IOException;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.queryParser.ParseException;
@ -53,57 +51,52 @@ public class TestReversedWildcardFilterFactory extends BaseTokenTestCase {
  public void testReversedTokens() throws IOException {
    String text = "simple text";
    args.put("withOriginal", "true");
    factory.init(args);
    TokenStream input = factory.create(new WhitespaceTokenizer(new StringReader(text)));
    assertTokenStreamContents(input,
        new String[] { "\u0001elpmis", "simple", "\u0001txet", "text" },
        new int[] { 1, 0, 1, 0 });

    // now without original tokens
    args.put("withOriginal", "false");
    factory.init(args);
    input = factory.create(new WhitespaceTokenizer(new StringReader(text)));
    assertTokenStreamContents(input,
        new String[] { "\u0001elpmis", "\u0001txet" },
        new int[] { 1, 1 });
  }

  public void testIndexingAnalysis() throws Exception {
    Analyzer a = schema.getAnalyzer();
    String text = "one two three si\uD834\uDD1Ex";

    // field one
    TokenStream input = a.tokenStream("one", new StringReader(text));
    assertTokenStreamContents(input,
        new String[] { "\u0001eno", "one", "\u0001owt", "two",
            "\u0001eerht", "three", "\u0001x\uD834\uDD1Eis", "si\uD834\uDD1Ex" },
        new int[] { 0, 0, 4, 4, 8, 8, 14, 14 },
        new int[] { 3, 3, 7, 7, 13, 13, 19, 19 },
        new int[] { 1, 0, 1, 0, 1, 0, 1, 0 });
    // field two
    input = a.tokenStream("two", new StringReader(text));
    assertTokenStreamContents(input,
        new String[] { "\u0001eno", "\u0001owt",
            "\u0001eerht", "\u0001x\uD834\uDD1Eis" },
        new int[] { 0, 4, 8, 14 },
        new int[] { 3, 7, 13, 19 },
        new int[] { 1, 1, 1, 1 });
    // field three
    input = a.tokenStream("three", new StringReader(text));
    assertTokenStreamContents(input,
        new String[] { "one", "two", "three", "si\uD834\uDD1Ex" },
        new int[] { 0, 4, 8, 14 },
        new int[] { 3, 7, 13, 19 },
        new int[] { 1, 1, 1, 1 });
  }

  public void testQueryParsing() throws IOException, ParseException {


@ -0,0 +1,79 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
/**
* Simple tests to ensure the Russian filter factories are working.
*/
public class TestRussianFilters extends BaseTokenTestCase {
/**
* Test RussianLetterTokenizerFactory
*/
public void testTokenizer() throws Exception {
Reader reader = new StringReader("Вместе с тем о силе электромагнитной 100");
Map<String,String> args = new HashMap<String,String>();
RussianLetterTokenizerFactory factory = new RussianLetterTokenizerFactory();
factory.init(args);
Tokenizer stream = factory.create(reader);
assertTokenStreamContents(stream, new String[] {"Вместе", "с", "тем", "о",
"силе", "электромагнитной", "100"});
}
/**
* Test RussianLowerCaseFilterFactory
*/
public void testLowerCase() throws Exception {
Reader reader = new StringReader("Вместе с тем о силе электромагнитной 100");
Map<String,String> args = new HashMap<String,String>();
RussianLetterTokenizerFactory factory = new RussianLetterTokenizerFactory();
factory.init(args);
RussianLowerCaseFilterFactory filterFactory = new RussianLowerCaseFilterFactory();
filterFactory.init(args);
Tokenizer tokenizer = factory.create(reader);
TokenStream stream = filterFactory.create(tokenizer);
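    // lower-casing only: the Cyrillic terms are otherwise left intact (contrast with testStemmer below)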
assertTokenStreamContents(stream, new String[] {"вместе", "с", "тем", "о",
"силе", "электромагнитной", "100"});
}
/**
* Test RussianStemFilterFactory
*/
public void testStemmer() throws Exception {
Reader reader = new StringReader("Вместе с тем о силе электромагнитной 100");
Map<String,String> args = new HashMap<String,String>();
RussianLetterTokenizerFactory factory = new RussianLetterTokenizerFactory();
factory.init(args);
RussianLowerCaseFilterFactory caseFactory = new RussianLowerCaseFilterFactory();
caseFactory.init(args);
RussianStemFilterFactory stemFactory = new RussianStemFilterFactory();
stemFactory.init(args);
Tokenizer tokenizer = factory.create(reader);
TokenStream stream = caseFactory.create(tokenizer);
stream = stemFactory.create(stream);
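    // full chain: tokenize, lower-case, then stem (roughly how these factories would be stacked in a field type)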
assertTokenStreamContents(stream, new String[] {"вмест", "с", "тем", "о",
"сил", "электромагнитн", "100"});
}
}


@ -0,0 +1,73 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
/**
* Simple tests to ensure the Shingle filter factory works.
*/
public class TestShingleFilterFactory extends BaseTokenTestCase {
/**
* Test the defaults
*/
public void testDefaults() throws Exception {
Reader reader = new StringReader("this is a test");
Map<String,String> args = new HashMap<String,String>();
ShingleFilterFactory factory = new ShingleFilterFactory();
factory.init(args);
TokenStream stream = factory.create(new WhitespaceTokenizer(reader));
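    // outputUnigrams defaults to true, so both the single terms and the two-word shingles are expected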
assertTokenStreamContents(stream, new String[] {"this", "this is", "is",
"is a", "a", "a test", "test"});
}
/**
* Test with unigrams disabled
*/
public void testNoUnigrams() throws Exception {
Reader reader = new StringReader("this is a test");
Map<String,String> args = new HashMap<String,String>();
args.put("outputUnigrams", "false");
ShingleFilterFactory factory = new ShingleFilterFactory();
factory.init(args);
TokenStream stream = factory.create(new WhitespaceTokenizer(reader));
assertTokenStreamContents(stream,
new String[] {"this is", "is a", "a test"});
}
/**
* Test with a higher max shingle size
*/
public void testMaxShingleSize() throws Exception {
Reader reader = new StringReader("this is a test");
Map<String,String> args = new HashMap<String,String>();
args.put("maxShingleSize", "3");
ShingleFilterFactory factory = new ShingleFilterFactory();
factory.init(args);
TokenStream stream = factory.create(new WhitespaceTokenizer(reader));
assertTokenStreamContents(stream,
new String[] {"this", "this is", "this is a", "is",
"is a", "is a test", "a", "a test", "test"});
}
}


@ -0,0 +1,121 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
/**
* Simple tests to ensure the standard lucene factories are working.
*/
public class TestStandardFactories extends BaseTokenTestCase {
/**
* Test StandardTokenizerFactory
*/
public void testStandardTokenizer() throws Exception {
Reader reader = new StringReader("What's this thing do?");
StandardTokenizerFactory factory = new StandardTokenizerFactory();
Tokenizer stream = factory.create(reader);
assertTokenStreamContents(stream,
new String[] {"What's", "this", "thing", "do" });
}
/**
* Test StandardFilterFactory
*/
public void testStandardFilter() throws Exception {
Reader reader = new StringReader("What's this thing do?");
StandardTokenizerFactory factory = new StandardTokenizerFactory();
StandardFilterFactory filterFactory = new StandardFilterFactory();
Tokenizer tokenizer = factory.create(reader);
TokenStream stream = filterFactory.create(tokenizer);
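    // StandardFilter strips the possessive 's left by StandardTokenizer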
assertTokenStreamContents(stream,
new String[] {"What", "this", "thing", "do"});
}
/**
* Test KeywordTokenizerFactory
*/
public void testKeywordTokenizer() throws Exception {
Reader reader = new StringReader("What's this thing do?");
KeywordTokenizerFactory factory = new KeywordTokenizerFactory();
Tokenizer stream = factory.create(reader);
assertTokenStreamContents(stream,
new String[] {"What's this thing do?"});
}
/**
* Test WhitespaceTokenizerFactory
*/
public void testWhitespaceTokenizer() throws Exception {
Reader reader = new StringReader("What's this thing do?");
WhitespaceTokenizerFactory factory = new WhitespaceTokenizerFactory();
Tokenizer stream = factory.create(reader);
assertTokenStreamContents(stream,
new String[] {"What's", "this", "thing", "do?"});
}
/**
* Test LetterTokenizerFactory
*/
public void testLetterTokenizer() throws Exception {
Reader reader = new StringReader("What's this thing do?");
LetterTokenizerFactory factory = new LetterTokenizerFactory();
Tokenizer stream = factory.create(reader);
assertTokenStreamContents(stream,
new String[] {"What", "s", "this", "thing", "do"});
}
/**
* Test LowerCaseTokenizerFactory
*/
public void testLowerCaseTokenizer() throws Exception {
Reader reader = new StringReader("What's this thing do?");
LowerCaseTokenizerFactory factory = new LowerCaseTokenizerFactory();
Tokenizer stream = factory.create(reader);
assertTokenStreamContents(stream,
new String[] {"what", "s", "this", "thing", "do"});
}
/**
* Ensure the ASCIIFoldingFilterFactory works
*/
public void testASCIIFolding() throws Exception {
Reader reader = new StringReader("Česká");
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
ASCIIFoldingFilterFactory factory = new ASCIIFoldingFilterFactory();
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "Ceska" });
}
/**
* Ensure the ISOLatin1AccentFilterFactory works
* (at least partially: it does not fold the uppercase hacek, as the assertion below shows)
*/
public void testISOLatin1Folding() throws Exception {
Reader reader = new StringReader("Česká");
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
ISOLatin1AccentFilterFactory factory = new ISOLatin1AccentFilterFactory();
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] { "Česka" });
}
}


@ -19,11 +19,20 @@ package org.apache.solr.analysis;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;

/**
@ -31,34 +40,42 @@ import java.util.List;
 */
public class TestSynonymFilter extends BaseTokenTestCase {

  static List<String> strings(String str) {
    String[] arr = str.split(" ");
    return Arrays.asList(arr);
  }

  static void assertTokenizesTo(SynonymMap dict, String input,
      String expected[]) throws IOException {
    Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader(input));
    SynonymFilter stream = new SynonymFilter(tokenizer, dict);
    assertTokenStreamContents(stream, expected);
  }

  static void assertTokenizesTo(SynonymMap dict, String input,
      String expected[], int posIncs[]) throws IOException {
    Tokenizer tokenizer = new WhitespaceTokenizer(new StringReader(input));
    SynonymFilter stream = new SynonymFilter(tokenizer, dict);
    assertTokenStreamContents(stream, expected, posIncs);
  }

  static void assertTokenizesTo(SynonymMap dict, List<Token> input,
      String expected[], int posIncs[]) throws IOException {
    TokenStream tokenizer = new IterTokenStream(input);
    SynonymFilter stream = new SynonymFilter(tokenizer, dict);
    assertTokenStreamContents(stream, expected, posIncs);
  }

  static void assertTokenizesTo(SynonymMap dict, List<Token> input,
      String expected[], int startOffsets[], int endOffsets[], int posIncs[])
      throws IOException {
    TokenStream tokenizer = new IterTokenStream(input);
    SynonymFilter stream = new SynonymFilter(tokenizer, dict);
    assertTokenStreamContents(stream, expected, startOffsets, endOffsets,
        posIncs);
  }

  public void testMatching() throws IOException {
    SynonymMap map = new SynonymMap();
@ -71,28 +88,29 @@ public class TestSynonymFilter extends BaseTokenTestCase {
    map.add(strings("z x c v"), tokens("zxcv"), orig, merge);
    map.add(strings("x c"), tokens("xc"), orig, merge);

    assertTokenizesTo(map, "$", new String[] { "$" });
    assertTokenizesTo(map, "a", new String[] { "aa" });
    assertTokenizesTo(map, "a $", new String[] { "aa", "$" });
    assertTokenizesTo(map, "$ a", new String[] { "$", "aa" });
    assertTokenizesTo(map, "a a", new String[] { "aa", "aa" });
    assertTokenizesTo(map, "b", new String[] { "bb" });
    assertTokenizesTo(map, "z x c v", new String[] { "zxcv" });
    assertTokenizesTo(map, "z x c $", new String[] { "z", "xc", "$" });

    // repeats
    map.add(strings("a b"), tokens("ab"), orig, merge);
    map.add(strings("a b"), tokens("ab"), orig, merge);

    // FIXME: the below test intended to be { "ab" }
    assertTokenizesTo(map, "a b", new String[] { "ab", "ab", "ab" });

    // check for lack of recursion
    map.add(strings("zoo"), tokens("zoo"), orig, merge);
    assertTokenizesTo(map, "zoo zoo $ zoo", new String[] { "zoo", "zoo", "$", "zoo" });
    map.add(strings("zoo"), tokens("zoo zoo"), orig, merge);
    // FIXME: the below test intended to be { "zoo", "zoo", "zoo", "zoo", "$", "zoo", "zoo" }
    // maybe this was just a typo in the old test????
    assertTokenizesTo(map, "zoo zoo $ zoo",
        new String[] { "zoo", "zoo", "zoo", "zoo", "zoo", "zoo", "$", "zoo", "zoo", "zoo" });
  }

  public void testIncludeOrig() throws IOException {
@ -107,25 +125,48 @@ public class TestSynonymFilter extends BaseTokenTestCase {
    map.add(strings("z x c v"), tokens("zxcv"), orig, merge);
    map.add(strings("x c"), tokens("xc"), orig, merge);

    assertTokenizesTo(map, "$", new String[] { "$" }, new int[] { 1 });
    assertTokenizesTo(map, "a", new String[] { "a", "aa" }, new int[] { 1, 0 });
    assertTokenizesTo(map, "a", new String[] { "a", "aa" }, new int[] { 1, 0 });
    assertTokenizesTo(map, "$ a", new String[] { "$", "a", "aa" }, new int[] { 1, 1, 0 });
    assertTokenizesTo(map, "a $", new String[] { "a", "aa", "$" }, new int[] { 1, 0, 1 });
    assertTokenizesTo(map, "$ a !", new String[] { "$", "a", "aa", "!" }, new int[] { 1, 1, 0, 1 });
    assertTokenizesTo(map, "a a", new String[] { "a", "aa", "a", "aa" }, new int[] { 1, 0, 1, 0 });
    assertTokenizesTo(map, "b", new String[] { "b", "bb" }, new int[] { 1, 0 });
    assertTokenizesTo(map, "z x c v", new String[] { "z", "zxcv", "x", "c", "v" }, new int[] { 1, 0, 1, 1, 1 });
    assertTokenizesTo(map, "z x c $", new String[] { "z", "x", "xc", "c", "$" }, new int[] { 1, 1, 0, 1, 1 });

    // check for lack of recursion
    map.add(strings("zoo zoo"), tokens("zoo"), orig, merge);
    // CHECKME: I think the previous test (with 4 zoo's), was just a typo.
    assertTokenizesTo(map, "zoo zoo $ zoo",
        new String[] { "zoo", "zoo", "zoo", "$", "zoo" },
        new int[] { 1, 0, 1, 1, 1 });
    map.add(strings("zoo"), tokens("zoo zoo"), orig, merge);
    assertTokenizesTo(map, "zoo zoo $ zoo",
        new String[] { "zoo", "zoo", "zoo", "$", "zoo", "zoo", "zoo" },
        new int[] { 1, 0, 1, 1, 1, 0, 1 });
  }

@ -136,25 +177,35 @@ public class TestSynonymFilter extends BaseTokenTestCase {
    boolean merge = true;
    map.add(strings("a"), tokens("a5,5"), orig, merge);
    map.add(strings("a"), tokens("a3,3"), orig, merge);

    assertTokenizesTo(map, "a", new String[] { "a3", "a5" }, new int[] { 1, 2 });

    map.add(strings("b"), tokens("b3,3"), orig, merge);
    map.add(strings("b"), tokens("b5,5"), orig, merge);

    assertTokenizesTo(map, "b", new String[] { "b3", "b5" }, new int[] { 1, 2 });

    map.add(strings("a"), tokens("A3,3"), orig, merge);
    map.add(strings("a"), tokens("A5,5"), orig, merge);

    assertTokenizesTo(map, "a",
        new String[] { "a3", "A3", "a5", "A5" },
        new int[] { 1, 0, 2, 0 });

    map.add(strings("a"), tokens("a1"), orig, merge);
    assertTokenizesTo(map, "a",
        new String[] { "a1", "a3", "A3", "a5", "A5" },
        new int[] { 1, 2, 0, 2, 0 });

    map.add(strings("a"), tokens("a2,2"), orig, merge);
    map.add(strings("a"), tokens("a4,4 a6,2"), orig, merge);
    assertTokenizesTo(map, "a",
        new String[] { "a1", "a2", "a3", "A3", "a4", "a5", "A5", "a6" },
        new int[] { 1, 1, 1, 0, 1, 1, 0, 1 });
  }

@ -167,41 +218,56 @@ public class TestSynonymFilter extends BaseTokenTestCase {
    map.add(strings("qwe"), tokens("xx"), orig, merge);
    map.add(strings("qwe"), tokens("yy"), orig, merge);
    map.add(strings("qwe"), tokens("zz"), orig, merge);
    assertTokenizesTo(map, "$", new String[] { "$" });
    assertTokenizesTo(map, "qwe",
        new String[] { "qq", "ww", "ee", "xx", "yy", "zz" },
        new int[] { 1, 0, 0, 0, 0, 0 });

    // test merging within the map
    map.add(strings("a"), tokens("a5,5 a8,3 a10,2"), orig, merge);
    map.add(strings("a"), tokens("a3,3 a7,4 a9,2 a11,2 a111,100"), orig, merge);
    assertTokenizesTo(map, "a",
        new String[] { "a3", "a5", "a7", "a8", "a9", "a10", "a11", "a111" },
        new int[] { 1, 2, 2, 1, 1, 1, 1, 100 });
  }

  public void testPositionIncrements() throws IOException {
    SynonymMap map = new SynonymMap();
    boolean orig = false;
    boolean merge = true;

    // test that generated tokens start at the same posInc as the original
    map.add(strings("a"), tokens("aa"), orig, merge);
    assertTokenizesTo(map, tokens("a,5"), new String[] { "aa" }, new int[] { 5 });
    assertTokenizesTo(map, tokens("a,0"), new String[] { "aa" }, new int[] { 0 });

    // test that offset of first replacement is ignored (always takes the orig offset)
    map.add(strings("b"), tokens("bb,100"), orig, merge);
    assertTokenizesTo(map, tokens("b,5"), new String[] { "bb" }, new int[] { 5 });
    assertTokenizesTo(map, tokens("b,0"), new String[] { "bb" }, new int[] { 0 });

    // test that subsequent tokens are adjusted accordingly
    map.add(strings("c"), tokens("cc,100 c2,2"), orig, merge);
    assertTokenizesTo(map, tokens("c,5"), new String[] { "cc", "c2" }, new int[] { 5, 2 });
    assertTokenizesTo(map, tokens("c,0"), new String[] { "cc", "c2" }, new int[] { 0, 2 });
  }

  public void testPositionIncrementsWithOrig() throws IOException {
    SynonymMap map = new SynonymMap();
    boolean orig = true;
@ -209,18 +275,30 @@ public class TestSynonymFilter extends BaseTokenTestCase {
    // test that generated tokens start at the same offset as the original
    map.add(strings("a"), tokens("aa"), orig, merge);
    assertTokenizesTo(map, tokens("a,5"), new String[] { "a", "aa" }, new int[] { 5, 0 });
    assertTokenizesTo(map, tokens("a,0"), new String[] { "a", "aa" }, new int[] { 0, 0 });

    // test that offset of first replacement is ignored (always takes the orig offset)
    map.add(strings("b"), tokens("bb,100"), orig, merge);
    assertTokenizesTo(map, tokens("b,5"), new String[] { "b", "bb" }, new int[] { 5, 0 });
    assertTokenizesTo(map, tokens("b,0"), new String[] { "b", "bb" }, new int[] { 0, 0 });

    // test that subsequent tokens are adjusted accordingly
    map.add(strings("c"), tokens("cc,100 c2,2"), orig, merge);
    assertTokenizesTo(map, tokens("c,5"), new String[] { "c", "cc", "c2" }, new int[] { 5, 0, 2 });
    assertTokenizesTo(map, tokens("c,0"), new String[] { "c", "cc", "c2" }, new int[] { 0, 0, 2 });
  }

@ -238,10 +316,101 @@ public class TestSynonymFilter extends BaseTokenTestCase {
    map.add(strings("a a"), tokens("b"), orig, merge);
    map.add(strings("x"), tokens("y"), orig, merge);

    // "a a x" => "b y"
    assertTokenizesTo(map, tokens("a,1,0,1 a,1,2,3 x,1,4,5"),
        new String[] { "b", "y" },
        new int[] { 0, 4 },
        new int[] { 3, 5 },
        new int[] { 1, 1 });
  }
/***
* Return a list of tokens according to a test string format:
* a b c => returns List<Token> [a,b,c]
* a/b => tokens a and b share the same spot (b.positionIncrement=0)
* a,3/b/c => a,b,c all share same position (a.positionIncrement=3, b.positionIncrement=0, c.positionIncrement=0)
* a,1,10,11 => "a" with positionIncrement=1, startOffset=10, endOffset=11
* @deprecated does not support attributes api
*/
private List<Token> tokens(String str) {
String[] arr = str.split(" ");
List<Token> result = new ArrayList<Token>();
for (int i=0; i<arr.length; i++) {
String[] toks = arr[i].split("/");
String[] params = toks[0].split(",");
int posInc;
int start;
int end;
if (params.length > 1) {
posInc = Integer.parseInt(params[1]);
} else {
posInc = 1;
}
if (params.length > 2) {
start = Integer.parseInt(params[2]);
} else {
start = 0;
}
if (params.length > 3) {
end = Integer.parseInt(params[3]);
} else {
end = start + params[0].length();
}
Token t = new Token(params[0],start,end,"TEST");
t.setPositionIncrement(posInc);
result.add(t);
for (int j=1; j<toks.length; j++) {
t = new Token(toks[j],0,0,"TEST");
t.setPositionIncrement(0);
result.add(t);
}
}
return result;
}
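  // Worked example of the format above (derived from the parsing code, for illustration only):
  // tokens("a,1,10,11 b,3/c") yields three Tokens:
  //   "a"  positionIncrement=1, startOffset=10, endOffset=11
  //   "b"  positionIncrement=3, startOffset=0,  endOffset=1  (offsets default to 0 and term length)
  //   "c"  positionIncrement=0, startOffset=0,  endOffset=0  (stacked on "b" by the '/' separator)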
/**
* @deprecated does not support custom attributes
*/
private static class IterTokenStream extends TokenStream {
final Token tokens[];
int index = 0;
TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
FlagsAttribute flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class);
TypeAttribute typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
PayloadAttribute payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
public IterTokenStream(Token... tokens) {
super();
this.tokens = tokens;
}
public IterTokenStream(Collection<Token> tokens) {
this(tokens.toArray(new Token[tokens.size()]));
}
public boolean incrementToken() throws IOException {
if (index >= tokens.length)
return false;
else {
clearAttributes();
Token token = tokens[index++];
termAtt.setTermBuffer(token.term());
offsetAtt.setOffset(token.startOffset(), token.endOffset());
posIncAtt.setPositionIncrement(token.getPositionIncrement());
flagsAtt.setFlags(token.getFlags());
typeAtt.setType(token.type());
payloadAtt.setPayload(token.getPayload());
return true;
}
}
}
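  // Usage sketch (illustrative only): IterTokenStream replays canned Tokens through the
  // attribute API, so the output of tokens(...) can drive any filter under test, e.g.
  //   assertTokenStreamContents(new SynonymFilter(new IterTokenStream(tokens("a,1,0,1")), map),
  //       new String[] { ... });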
}


@ -0,0 +1,42 @@
package org.apache.solr.analysis;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.WhitespaceTokenizer;
/**
* Simple tests to ensure the Thai word filter factory is working.
*/
public class TestThaiWordFilterFactory extends BaseTokenTestCase {
/**
* Ensure the filter actually decomposes text.
*/
public void testWordBreak() throws Exception {
Reader reader = new StringReader("การที่ได้ต้องแสดงว่างานดี");
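    // the input is one unsegmented Thai phrase; the filter is expected to break it into words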
Tokenizer tokenizer = new WhitespaceTokenizer(reader);
ThaiWordFilterFactory factory = new ThaiWordFilterFactory();
TokenStream stream = factory.create(tokenizer);
assertTokenStreamContents(stream, new String[] {"การ", "ที่", "ได้",
"ต้อง", "แสดง", "ว่า", "งาน", "ดี"});
}
}


@ -17,12 +17,19 @@
package org.apache.solr.analysis;

import java.io.IOException;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

/**
 * @version $Id:$
@ -35,46 +42,75 @@ public class TestTrimFilter extends BaseTokenTestCase {
    char[] ccc = "cCc".toCharArray();
    char[] whitespace = " ".toCharArray();
    char[] empty = "".toCharArray();
    TrimFilterFactory factory = new TrimFilterFactory();
    Map<String,String> args = new HashMap<String,String>();
    args.put("updateOffsets", "false");
    factory.init(args);
    TokenStream ts = factory.create(new IterTokenStream(new Token(a, 0, a.length, 1, 5),
        new Token(b, 0, b.length, 6, 10),
        new Token(ccc, 0, ccc.length, 11, 15),
        new Token(whitespace, 0, whitespace.length, 16, 20),
        new Token(empty, 0, empty.length, 21, 21)));

    assertTokenStreamContents(ts, new String[] { "a", "b", "cCc", "", ""});

    a = " a".toCharArray();
    b = "b ".toCharArray();
    ccc = " c ".toCharArray();
    whitespace = " ".toCharArray();
    factory = new TrimFilterFactory();
    args = new HashMap<String,String>();
    args.put("updateOffsets", "true");
    factory.init(args);
    ts = factory.create(new IterTokenStream(
        new Token(a, 0, a.length, 0, 2),
        new Token(b, 0, b.length, 0, 2),
        new Token(ccc, 0, ccc.length, 0, 3),
        new Token(whitespace, 0, whitespace.length, 0, 3)));

    assertTokenStreamContents(ts,
        new String[] { "a", "b", "c", "" },
        new int[] { 1, 0, 1, 3 },
        new int[] { 2, 1, 2, 3 },
        new int[] { 1, 1, 1, 1 });
  }

  /**
   * @deprecated does not support custom attributes
   */
private static class IterTokenStream extends TokenStream {
final Token tokens[];
int index = 0;
TermAttribute termAtt = (TermAttribute) addAttribute(TermAttribute.class);
OffsetAttribute offsetAtt = (OffsetAttribute) addAttribute(OffsetAttribute.class);
PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) addAttribute(PositionIncrementAttribute.class);
FlagsAttribute flagsAtt = (FlagsAttribute) addAttribute(FlagsAttribute.class);
TypeAttribute typeAtt = (TypeAttribute) addAttribute(TypeAttribute.class);
PayloadAttribute payloadAtt = (PayloadAttribute) addAttribute(PayloadAttribute.class);
public IterTokenStream(Token... tokens) {
super();
this.tokens = tokens;
}
public IterTokenStream(Collection<Token> tokens) {
this(tokens.toArray(new Token[tokens.size()]));
}
public boolean incrementToken() throws IOException {
if (index >= tokens.length)
return false;
else {
clearAttributes();
Token token = tokens[index++];
termAtt.setTermBuffer(token.term());
offsetAtt.setOffset(token.startOffset(), token.endOffset());
posIncAtt.setPositionIncrement(token.getPositionIncrement());
flagsAtt.setFlags(token.getFlags());
typeAtt.setType(token.type());
payloadAtt.setPayload(token.getPayload());
return true;
}
}
} }
}


@ -17,14 +17,14 @@
package org.apache.solr.analysis; package org.apache.solr.analysis;
import org.apache.solr.util.AbstractSolrTestCase;
import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.KeywordTokenizer;
import org.apache.lucene.analysis.TokenFilter; import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.WhitespaceTokenizer; import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import org.apache.lucene.analysis.miscellaneous.SingleTokenTokenStream;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.analysis.tokenattributes.TermAttribute;
@ -37,7 +37,7 @@ import java.util.HashSet;
/** /**
* New WordDelimiterFilter tests... most of the tests are in ConvertedLegacyTest * New WordDelimiterFilter tests... most of the tests are in ConvertedLegacyTest
*/ */
public class TestWordDelimiterFilter extends AbstractSolrTestCase { public class TestWordDelimiterFilter extends BaseTokenTestCase {
public String getSchemaFile() { return "solr/conf/schema.xml"; } public String getSchemaFile() { return "solr/conf/schema.xml"; }
public String getSolrConfigFile() { return "solr/conf/solrconfig.xml"; } public String getSolrConfigFile() { return "solr/conf/solrconfig.xml"; }
@ -144,148 +144,74 @@ public class TestWordDelimiterFilter extends AbstractSolrTestCase {
// test that subwords and catenated subwords have // test that subwords and catenated subwords have
// the correct offsets. // the correct offsets.
WordDelimiterFilter wdf = new WordDelimiterFilter( WordDelimiterFilter wdf = new WordDelimiterFilter(
new TokenStream() { new SingleTokenTokenStream(new Token("foo-bar", 5, 12)),
Token t;
public Token next() throws IOException {
if (t!=null) return null;
t = new Token("foo-bar", 5, 12); // actual
return t;
}
},
1,1,0,0,1,1,0); 1,1,0,0,1,1,0);
int i=0; assertTokenStreamContents(wdf,
for(Token t; (t=wdf.next())!=null;) { new String[] { "foo", "bar", "foobar" },
String termText = new String(t.termBuffer(), 0, t.termLength()); new int[] { 5, 9, 5 },
if (termText.equals("foo")) { new int[] { 8, 12, 12 });
assertEquals(5, t.startOffset());
assertEquals(8, t.endOffset());
i++;
}
if (termText.equals("bar")) {
assertEquals(9, t.startOffset());
assertEquals(12, t.endOffset());
i++;
}
if (termText.equals("foobar")) {
assertEquals(5, t.startOffset());
assertEquals(12, t.endOffset());
i++;
}
}
assertEquals(3,i); // make sure all 3 tokens were generated
// test that if splitting or catenating a synonym, that the offsets
// are not altered (they would be incorrect).
wdf = new WordDelimiterFilter( wdf = new WordDelimiterFilter(
new TokenStream() { new SingleTokenTokenStream(new Token("foo-bar", 5, 6)),
Token t;
public Token next() throws IOException {
if (t!=null) return null;
t = new Token("foo-bar", 5, 6); // a synonym
return t;
}
},
1,1,0,0,1,1,0); 1,1,0,0,1,1,0);
for(Token t; (t=wdf.next())!=null;) {
assertEquals(5, t.startOffset()); assertTokenStreamContents(wdf,
assertEquals(6, t.endOffset()); new String[] { "foo", "bar", "foobar" },
} new int[] { 5, 5, 5 },
new int[] { 6, 6, 6 });
} }
public void testOffsetChange() throws Exception public void testOffsetChange() throws Exception
{ {
WordDelimiterFilter wdf = new WordDelimiterFilter( WordDelimiterFilter wdf = new WordDelimiterFilter(
new TokenStream() { new SingleTokenTokenStream(new Token("übelkeit)", 7, 16)),
Token t;
public Token next() {
if (t != null) return null;
t = new Token("übelkeit)", 7, 16);
return t;
}
},
1,1,0,0,1,1,0 1,1,0,0,1,1,0
); );
Token t = wdf.next(); assertTokenStreamContents(wdf,
new String[] { "übelkeit" },
assertNotNull(t); new int[] { 7 },
assertEquals("übelkeit", t.term()); new int[] { 15 });
assertEquals(7, t.startOffset());
assertEquals(15, t.endOffset());
} }
public void testOffsetChange2() throws Exception public void testOffsetChange2() throws Exception
{ {
WordDelimiterFilter wdf = new WordDelimiterFilter( WordDelimiterFilter wdf = new WordDelimiterFilter(
new TokenStream() { new SingleTokenTokenStream(new Token("(übelkeit", 7, 17)),
Token t;
public Token next() {
if (t != null) return null;
t = new Token("(übelkeit", 7, 17);
return t;
}
},
1,1,0,0,1,1,0 1,1,0,0,1,1,0
); );
Token t = wdf.next(); assertTokenStreamContents(wdf,
new String[] { "übelkeit" },
assertNotNull(t); new int[] { 8 },
assertEquals("übelkeit", t.term()); new int[] { 17 });
assertEquals(8, t.startOffset());
assertEquals(17, t.endOffset());
} }
public void testOffsetChange3() throws Exception public void testOffsetChange3() throws Exception
{ {
WordDelimiterFilter wdf = new WordDelimiterFilter( WordDelimiterFilter wdf = new WordDelimiterFilter(
new TokenStream() { new SingleTokenTokenStream(new Token("(übelkeit", 7, 16)),
Token t;
public Token next() {
if (t != null) return null;
t = new Token("(übelkeit", 7, 16);
return t;
}
},
1,1,0,0,1,1,0 1,1,0,0,1,1,0
); );
Token t = wdf.next(); assertTokenStreamContents(wdf,
new String[] { "übelkeit" },
assertNotNull(t); new int[] { 8 },
assertEquals("übelkeit", t.term()); new int[] { 16 });
assertEquals(8, t.startOffset());
assertEquals(16, t.endOffset());
} }
public void testOffsetChange4() throws Exception public void testOffsetChange4() throws Exception
{ {
WordDelimiterFilter wdf = new WordDelimiterFilter( WordDelimiterFilter wdf = new WordDelimiterFilter(
new TokenStream() { new SingleTokenTokenStream(new Token("(foo,bar)", 7, 16)),
private Token t;
public Token next() {
if (t != null) return null;
t = new Token("(foo,bar)", 7, 16);
return t;
}
},
1,1,0,0,1,1,0 1,1,0,0,1,1,0
); );
Token t = wdf.next(); assertTokenStreamContents(wdf,
new String[] { "foo", "bar", "foobar"},
assertNotNull(t); new int[] { 8, 12, 8 },
assertEquals("foo", t.term()); new int[] { 11, 15, 15 });
assertEquals(8, t.startOffset());
assertEquals(11, t.endOffset());
t = wdf.next();
assertNotNull(t);
assertEquals("bar", t.term());
assertEquals(12, t.startOffset());
assertEquals(15, t.endOffset());
} }
public void testAlphaNumericWords(){ public void testAlphaNumericWords(){
@ -338,24 +264,10 @@ public class TestWordDelimiterFilter extends AbstractSolrTestCase {
public void doSplit(final String input, String... output) throws Exception { public void doSplit(final String input, String... output) throws Exception {
WordDelimiterFilter wdf = new WordDelimiterFilter(new TokenStream() { WordDelimiterFilter wdf = new WordDelimiterFilter(new KeywordTokenizer(
boolean done=false; new StringReader(input)), 1, 1, 0, 0, 0);
@Override
public Token next() throws IOException { assertTokenStreamContents(wdf, output);
if (done) return null;
done = true;
return new Token(input,0,input.length());
}
}
,1,1,0,0,0
);
for(String expected : output) {
Token t = wdf.next();
assertEquals(expected, t.term());
}
assertEquals(null, wdf.next());
} }
   public void testSplits() throws Exception {
@@ -365,29 +277,38 @@ public class TestWordDelimiterFilter extends AbstractSolrTestCase {
     // non-space marking symbol shouldn't cause split
     // this is an example in Thai
     doSplit("\u0e1a\u0e49\u0e32\u0e19","\u0e1a\u0e49\u0e32\u0e19");
+    // possessive followed by delimiter
+    doSplit("test's'", "test");
+    // some russian upper and lowercase
+    doSplit("Роберт", "Роберт");
+    // now cause a split (russian camelCase)
+    doSplit("РобЕрт", "Роб", "Ерт");
+    // a composed titlecase character, don't split
+    doSplit("aDžungla", "aDžungla");
+    // a modifier letter, don't split
+    doSplit("ســـــــــــــــــلام", "ســـــــــــــــــلام");
+    // enclosing mark, don't split
+    doSplit("۞test", "۞test");
+    // combining spacing mark (the virama), don't split
+    doSplit("हिन्दी", "हिन्दी");
+    // don't split non-ascii digits
+    doSplit("١٢٣٤", "١٢٣٤");
+    // don't split supplementaries into unpaired surrogates
+    doSplit("𠀀𠀀", "𠀀𠀀");
   }

   public void doSplitPossessive(int stemPossessive, final String input, final String... output) throws Exception {
-    WordDelimiterFilter wdf = new WordDelimiterFilter(new TokenStream() {
-      boolean done=false;
-      @Override
-      public Token next() throws IOException {
-        if (done) return null;
-        done = true;
-        return new Token(input,0,input.length());
-      }
-    }
-    ,1,1,0,0,0,1,0,1,stemPossessive,null
-    );
-    for(String expected : output) {
-      Token t = wdf.next();
-      assertEquals(expected, t.term());
-    }
-    assertEquals(null, wdf.next());
+    WordDelimiterFilter wdf = new WordDelimiterFilter(new KeywordTokenizer(
+        new StringReader(input)), 1,1,0,0,0,1,0,1,stemPossessive, null);
+    assertTokenStreamContents(wdf, output);
   }
 /*
@@ -485,25 +406,4 @@ public class TestWordDelimiterFilter extends AbstractSolrTestCase {
         new int[] { 6, 14, 19 },
         new int[] { 1, 11, 1 });
   }
-
-  private void assertAnalyzesTo(Analyzer a, String input, String[] output,
-      int startOffsets[], int endOffsets[], int posIncs[]) throws Exception {
-    TokenStream ts = a.tokenStream("dummy", new StringReader(input));
-    TermAttribute termAtt = (TermAttribute) ts
-        .getAttribute(TermAttribute.class);
-    OffsetAttribute offsetAtt = (OffsetAttribute) ts
-        .getAttribute(OffsetAttribute.class);
-    PositionIncrementAttribute posIncAtt = (PositionIncrementAttribute) ts
-        .getAttribute(PositionIncrementAttribute.class);
-    for (int i = 0; i < output.length; i++) {
-      assertTrue(ts.incrementToken());
-      assertEquals(output[i], termAtt.term());
-      assertEquals(startOffsets[i], offsetAtt.startOffset());
-      assertEquals(endOffsets[i], offsetAtt.endOffset());
-      assertEquals(posIncs[i], posIncAtt.getPositionIncrement());
-    }
-    assertFalse(ts.incrementToken());
-    ts.close();
-  }
 }

View File

@@ -0,0 +1,19 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# A set of words for testing the DictionaryCompound factory
soft
ball
team
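As a hedged sketch of how a test might feed this list to the DictionaryCompound factory: the file name, the resource-loader lookup, and the expected subword output below are assumptions for illustration (the diff view does not show the actual resource file name), not part of this commit.

  // Assumes the usual imports: java.util.Map, java.util.HashMap, java.io.StringReader,
  // org.apache.lucene.analysis.TokenStream, org.apache.lucene.analysis.WhitespaceTokenizer.
  public void testCompoundDictionarySketch() throws Exception {
    Map<String, String> args = new HashMap<String, String>();
    args.put("dictionary", "compoundDictionary.txt");  // hypothetical file name for the word list above
    DictionaryCompoundWordTokenFilterFactory factory = new DictionaryCompoundWordTokenFilterFactory();
    factory.init(args);
    factory.inform(solrConfig.getResourceLoader());    // assumption: a loader that can see the test resources
    TokenStream ts = factory.create(
        new WhitespaceTokenizer(new StringReader("softball")));
    // The compound filter keeps the original token and adds the matching dictionary subwords.
    assertTokenStreamContents(ts, new String[] { "softball", "soft", "ball" });
  }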

View File

@@ -0,0 +1,24 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# A set of articles for testing the French Elision filter.
# Requiring a text file is a bit weird here...
l
m
t
qu
n
s
j
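Similarly, a hedged sketch of how these articles might drive the French Elision factory in a test; the "articles" parameter value, the file name, and the sample input are illustrative assumptions, not taken from this commit.

  public void testElisionArticlesSketch() throws Exception {
    Map<String, String> args = new HashMap<String, String>();
    args.put("articles", "frenchArticles.txt");        // hypothetical file name for the article list above
    ElisionFilterFactory factory = new ElisionFilterFactory();
    factory.init(args);
    factory.inform(solrConfig.getResourceLoader());    // assumption: a loader that can see the test resources
    TokenStream ts = factory.create(
        new WhitespaceTokenizer(new StringReader("l'avion")));
    // The elision filter strips the leading article and apostrophe: l'avion -> avion.
    assertTokenStreamContents(ts, new String[] { "avion" });
  }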