SOLR-234 TrimFilter can update the Token's startOffset and endOffset if updateOffsets="true". By default the Token offsets are unchanged.

This also refactors many of the Filter tests to share common code.

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@540995 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Ryan McKinley 2007-05-23 17:18:05 +00:00
parent 66b04d6ccc
commit c93dd85a74
9 changed files with 254 additions and 166 deletions

View File

@ -194,6 +194,10 @@ New Features
32. SOLR-199: new n-gram tokenizers available via NGramTokenizerFactory
and EdgeNGramTokenizerFactory. (Adam Hiatt via yonik)
33. SOLR-234: TrimFilter can update the Token's startOffset and endOffset
if updateOffsets="true". By default the Token offsets are unchanged.
(ryan)
Changes in runtime behavior
1. Highlighting using DisMax will only pick up terms from the main
user query, not boost or filter queries (klaas).

View File

@ -30,16 +30,49 @@ import java.io.IOException;
*/
public final class TrimFilter extends TokenFilter {
public TrimFilter(TokenStream in) {
final boolean updateOffsets;
public TrimFilter(TokenStream in, boolean updateOffsets ) {
super(in);
this.updateOffsets = updateOffsets;
}
@Override
public final Token next() throws IOException {
Token t = input.next();
if (null == t || null == t.termText())
return t;
t.setTermText(t.termText().trim());
if( updateOffsets ) {
String txt = t.termText();
int start = 0;
int end = txt.length();
int endOff = 0;
// eat the first characters
while ((start < end) && (txt.charAt(start) <= ' ')) {
start++;
}
// eat the end characters
while ((start < end) && (txt.charAt(end-1) <= ' ')) {
end--;
endOff++;
}
if( start > 0 || end < txt.length() ) {
int incr = t.getPositionIncrement();
t = new Token( t.termText().substring( start, end ),
t.startOffset()+start,
t.endOffset()-endOff,
t.type() );
t.setPositionIncrement( incr ); //+ start ); TODO? what should happen with the offset
}
}
else {
t.setTermText( t.termText().trim() );
}
return t;
}
}

View File

@ -17,14 +17,35 @@
package org.apache.solr.analysis;
import java.util.Map;
import org.apache.lucene.analysis.TokenStream;
import org.apache.solr.core.SolrException;
/**
* @version $Id:$
* @see TrimFilter
*/
public class TrimFilterFactory extends BaseTokenFilterFactory {
protected boolean updateOffsets = false;
@Override
public void init(Map<String,String> args) {
super.init( args );
String v = args.get( "updateOffsets" );
if( v != null ) {
try {
updateOffsets = Boolean.valueOf( v );
}
catch( Exception ex ) {
throw new SolrException( 400, "Error reading updateOffsets value. Must be true or false.", ex );
}
}
}
public TokenStream create(TokenStream input) {
return new TrimFilter(input);
return new TrimFilter(input, updateOffsets);
}
}

View File

@ -0,0 +1,164 @@
package org.apache.solr.analysis;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import junit.framework.TestCase;
/**
* General token testing helper functions
*/
public abstract class BaseTokenTestCase extends TestCase
{
public static String tsToString(TokenStream in) throws IOException {
StringBuffer out = new StringBuffer();
Token t = in.next();
if (null != t)
out.append(t.termText());
for (t = in.next(); null != t; t = in.next()) {
out.append(" ").append(t.termText());
}
in.close();
return out.toString();
}
public List<String> tok2str(Iterable<Token> tokLst) {
ArrayList<String> lst = new ArrayList<String>();
for ( Token t : tokLst ) {
lst.add( t.termText());
}
return lst;
}
public void assertTokEqual(List<Token> a, List<Token> b) {
assertTokEq(a,b,false);
assertTokEq(b,a,false);
}
public void assertTokEqualOff(List<Token> a, List<Token> b) {
assertTokEq(a,b,true);
assertTokEq(b,a,true);
}
private void assertTokEq(List<Token> a, List<Token> b, boolean checkOff) {
int pos=0;
for (Iterator iter = a.iterator(); iter.hasNext();) {
Token tok = (Token)iter.next();
pos += tok.getPositionIncrement();
if (!tokAt(b, tok.termText(), pos
, checkOff ? tok.startOffset() : -1
, checkOff ? tok.endOffset() : -1
))
{
fail(a + "!=" + b);
}
}
}
public boolean tokAt(List<Token> lst, String val, int tokPos, int startOff, int endOff) {
int pos=0;
for (Iterator iter = lst.iterator(); iter.hasNext();) {
Token tok = (Token)iter.next();
pos += tok.getPositionIncrement();
if (pos==tokPos && tok.termText().equals(val)
&& (startOff==-1 || tok.startOffset()==startOff)
&& (endOff ==-1 || tok.endOffset() ==endOff )
)
{
return true;
}
}
return false;
}
/***
* Return a list of tokens according to a test string format:
* a b c => returns List<Token> [a,b,c]
* a/b => tokens a and b share the same spot (b.positionIncrement=0)
* a,3/b/c => a,b,c all share same position (a.positionIncrement=3, b.positionIncrement=0, c.positionIncrement=0)
* a,1,10,11 => "a" with positionIncrement=1, startOffset=10, endOffset=11
*/
public List<Token> tokens(String str) {
String[] arr = str.split(" ");
List<Token> result = new ArrayList<Token>();
for (int i=0; i<arr.length; i++) {
String[] toks = arr[i].split("/");
String[] params = toks[0].split(",");
int posInc;
int start;
int end;
if (params.length > 1) {
posInc = Integer.parseInt(params[1]);
} else {
posInc = 1;
}
if (params.length > 2) {
start = Integer.parseInt(params[2]);
} else {
start = 0;
}
if (params.length > 3) {
end = Integer.parseInt(params[3]);
} else {
end = start + params[0].length();
}
Token t = new Token(params[0],start,end,"TEST");
t.setPositionIncrement(posInc);
result.add(t);
for (int j=1; j<toks.length; j++) {
t = new Token(toks[j],0,0,"TEST");
t.setPositionIncrement(0);
result.add(t);
}
}
return result;
}
//------------------------------------------------------------------------
// These may be useful beyond test cases...
//------------------------------------------------------------------------
// This could probably be put in a utility class
static List<Token> getTokens(TokenStream tstream) throws IOException {
List<Token> tokens = new ArrayList<Token>();
while (true) {
Token t = tstream.next();
if (t==null) break;
tokens.add(t);
}
return tokens;
}
// This could probably be put in a utility class
public static class IterTokenStream extends TokenStream {
Iterator<Token> toks;
public IterTokenStream(Token... toks) {
this.toks = Arrays.asList(toks).iterator();
}
public IterTokenStream(Iterator<Token> toks) {
this.toks = toks;
}
@Override
public Token next() {
if (toks.hasNext()) {
return toks.next();
}
return null;
}
}
}

View File

@ -26,9 +26,9 @@ import java.io.IOException;
import java.io.StringReader;
/**
* Test that BufferedTokenStream behaves as advertized in subclasses.
* Test that BufferedTokenStream behaves as advertised in subclasses.
*/
public class TestBufferedTokenStream extends TestCase {
public class TestBufferedTokenStream extends BaseTokenTestCase {
/** Example of a class implementing the rule "A" "B" => "Q" "B" */
public static class AB_Q_Stream extends BufferedTokenStream {
@ -53,19 +53,6 @@ public class TestBufferedTokenStream extends TestCase {
}
}
public static String tsToString(TokenStream in) throws IOException {
StringBuffer out = new StringBuffer();
Token t = in.next();
if (null != t)
out.append(t.termText());
for (t = in.next(); null != t; t = in.next()) {
out.append(" ").append(t.termText());
}
in.close();
return out.toString();
}
public void testABQ() throws Exception {
final String input = "How now A B brown A cow B like A B thing?";
final String expected = "How now Q B brown A cow B like Q B thing?";

View File

@ -29,7 +29,7 @@ import org.apache.lucene.analysis.WhitespaceTokenizer;
/**
* HyphenatedWordsFilter test
*/
public class TestHyphenatedWordsFilter extends TestCase {
public class TestHyphenatedWordsFilter extends BaseTokenTestCase {
public void testHyphenatedWords() throws Exception {
String input = "ecologi-\r\ncal devel-\r\n\r\nop compre-\u0009hensive-hands-on";
String outputAfterHyphenatedWordsFilter = "ecological develop comprehensive-hands-on";
@ -40,16 +40,4 @@ public class TestHyphenatedWordsFilter extends TestCase {
assertEquals("Testing HyphenatedWordsFilter",
outputAfterHyphenatedWordsFilter, actual);
}
public static String tsToString(TokenStream in) throws IOException {
StringBuffer out = new StringBuffer();
Token t = in.next();
if (null != t)
out.append(t.termText());
for (t = in.next(); null != t; t = in.next()) {
out.append(" ").append(t.termText());
}
return out.toString();
}
}

View File

@ -19,24 +19,20 @@ package org.apache.solr.analysis;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import junit.framework.TestCase;
import org.apache.commons.codec.Encoder;
import org.apache.commons.codec.language.DoubleMetaphone;
import org.apache.commons.codec.language.Metaphone;
import org.apache.commons.codec.language.RefinedSoundex;
import org.apache.commons.codec.language.Soundex;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
/**
* @version $Id:$
* @version $Id$
*/
public class TestPhoneticFilter extends TestCase {
public class TestPhoneticFilter extends BaseTokenTestCase {
public void testFactory()
{
@ -102,17 +98,4 @@ public class TestPhoneticFilter extends TestCase {
runner( new Soundex(), false );
runner( new RefinedSoundex(), false );
}
public static class IterTokenStream extends TokenStream {
Iterator<Token> toks;
public IterTokenStream(Iterator<Token> toks) {
this.toks = toks;
}
public Token next() {
if (toks.hasNext()) {
return toks.next();
}
return null;
}
}
}

View File

@ -17,7 +17,6 @@
package org.apache.solr.analysis;
import junit.framework.TestCase;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
@ -31,67 +30,20 @@ import java.util.List;
* @author yonik
* @version $Id$
*/
public class TestSynonymFilter extends TestCase {
public class TestSynonymFilter extends BaseTokenTestCase {
public List strings(String str) {
String[] arr = str.split(" ");
return Arrays.asList(arr);
}
/***
* Return a list of tokens according to a test string format:
* a b c => returns List<Token> [a,b,c]
* a/b => tokens a and b share the same spot (b.positionIncrement=0)
* a,3/b/c => a,b,c all share same position (a.positionIncrement=3, b.positionIncrement=0, c.positionIncrement=0)
* a,1,10,11 => "a" with positionIncrement=1, startOffset=10, endOffset=11
*/
public List tokens(String str) {
String[] arr = str.split(" ");
List result = new ArrayList();
for (int i=0; i<arr.length; i++) {
String[] toks = arr[i].split("/");
String[] params = toks[0].split(",");
int posInc;
int start;
int end;
if (params.length > 1) {
posInc = Integer.parseInt(params[1]);
} else {
posInc = 1;
}
if (params.length > 2) {
start = Integer.parseInt(params[2]);
} else {
start = 0;
}
if (params.length > 3) {
end = Integer.parseInt(params[3]);
} else {
end = start + params[0].length();
}
Token t = new Token(params[0],start,end,"TEST");
t.setPositionIncrement(posInc);
result.add(t);
for (int j=1; j<toks.length; j++) {
t = new Token(toks[j],0,0,"TEST");
t.setPositionIncrement(0);
result.add(t);
}
}
return result;
}
public List getTokList(SynonymMap dict, String input, boolean includeOrig) throws IOException {
ArrayList lst = new ArrayList();
public List<Token> getTokList(SynonymMap dict, String input, boolean includeOrig) throws IOException {
ArrayList<Token> lst = new ArrayList<Token>();
final List toks = tokens(input);
TokenStream ts = new TokenStream() {
Iterator iter = toks.iterator();
@Override
public Token next() throws IOException {
return iter.hasNext() ? (Token)iter.next() : null;
}
@ -106,56 +58,6 @@ public class TestSynonymFilter extends TestCase {
}
}
public List tok2str(List tokLst) {
ArrayList lst = new ArrayList();
for (Iterator iter = tokLst.iterator(); iter.hasNext();) {
lst.add(((Token)(iter.next())).termText());
}
return lst;
}
public void assertTokEqual(List a, List b) {
assertTokEq(a,b,false);
assertTokEq(b,a,false);
}
public void assertTokEqualOff(List a, List b) {
assertTokEq(a,b,true);
assertTokEq(b,a,true);
}
private void assertTokEq(List a, List b, boolean checkOff) {
int pos=0;
for (Iterator iter = a.iterator(); iter.hasNext();) {
Token tok = (Token)iter.next();
pos += tok.getPositionIncrement();
if (!tokAt(b, tok.termText(), pos
, checkOff ? tok.startOffset() : -1
, checkOff ? tok.endOffset() : -1
))
{
fail(a + "!=" + b);
}
}
}
public boolean tokAt(List lst, String val, int tokPos, int startOff, int endOff) {
int pos=0;
for (Iterator iter = lst.iterator(); iter.hasNext();) {
Token tok = (Token)iter.next();
pos += tok.getPositionIncrement();
if (pos==tokPos && tok.termText().equals(val)
&& (startOff==-1 || tok.startOffset()==startOff)
&& (endOff==-1 || tok.endOffset()==endOff)
)
{
return true;
}
}
return false;
}
public void testMatching() throws IOException {
SynonymMap map = new SynonymMap();

View File

@ -18,8 +18,10 @@
package org.apache.solr.analysis;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Arrays;
import java.util.List;
import junit.framework.TestCase;
@ -30,32 +32,36 @@ import org.apache.lucene.analysis.TokenStream;
/**
* @version $Id:$
*/
public class TestTrimFilter extends TestCase {
public class TestTrimFilter extends BaseTokenTestCase {
public void testTrim() throws Exception {
TokenStream ts = new TrimFilter
(new IterTokenStream(new Token(" a ", 1, 5),
new Token("b ",6,10),
new Token("cCc",11,15),
new Token(" ",16,20)));
new Token(" ",16,20)), false );
assertEquals("a", ts.next().termText());
assertEquals("b", ts.next().termText());
assertEquals("cCc", ts.next().termText());
assertEquals("", ts.next().termText());
assertNull(ts.next());
ts = new TrimFilter( new IterTokenStream(
new Token(" a", 0,2),
new Token("b ", 0,2),
new Token(" c ",0,3),
new Token(" ",0,3)), true );
List<Token> expect = tokens( "a,1,1,2 b,1,0,1 c,1,1,2 ,1,3,3" );
List<Token> real = getTokens(ts);
for( Token t : expect ) {
System.out.println( "TEST:" + t );
}
for( Token t : real ) {
System.out.println( "REAL:" + t );
}
assertTokEqualOff( expect, real );
}
public static class IterTokenStream extends TokenStream {
Iterator<Token> toks;
public IterTokenStream(Token... toks) {
this.toks = Arrays.asList(toks).iterator();
}
public Token next() {
if (toks.hasNext()) {
return toks.next();
}
return null;
}
}
}