mirror of https://github.com/apache/lucene.git
SOLR-234: TrimFilter can update the Token's startOffset and endOffset if updateOffsets="true". By default the Token offsets are unchanged.
This also refactors many of the Filter tests to share common code.

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@540995 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent 66b04d6ccc
commit c93dd85a74
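
In short, the new boolean constructor argument lets TrimFilter tighten a Token's offsets to match the trimmed term text, instead of only rewriting the text. A minimal sketch of the difference, written against the pre-2.9 Lucene Token API used throughout this commit (IterTokenStream is the helper added in BaseTokenTestCase below; the token values are illustrative only):

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.solr.analysis.BaseTokenTestCase.IterTokenStream;
import org.apache.solr.analysis.TrimFilter;

public class TrimFilterDemo {
  public static void main(String[] args) throws Exception {
    // " c " occupies characters 0..3 of the original input
    TokenStream plain = new TrimFilter(new IterTokenStream(new Token(" c ", 0, 3)), false);
    Token t = plain.next();
    // default behavior: term text is trimmed, offsets still span the whitespace
    System.out.println(t.termText() + " [" + t.startOffset() + "," + t.endOffset() + ")"); // c [0,3)

    TokenStream updating = new TrimFilter(new IterTokenStream(new Token(" c ", 0, 3)), true);
    t = updating.next();
    // updateOffsets=true: one space eaten on each side, so offsets tighten
    System.out.println(t.termText() + " [" + t.startOffset() + "," + t.endOffset() + ")"); // c [1,2)
  }
}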

CHANGES.txt
@@ -194,6 +194,10 @@ New Features
 32. SOLR-199: new n-gram tokenizers available via NGramTokenizerFactory
     and EdgeNGramTokenizerFactory. (Adam Hiatt via yonik)
 
+33. SOLR-234: TrimFilter can update the Token's startOffset and endOffset
+    if updateOffsets="true". By default the Token offsets are unchanged.
+    (ryan)
+
 Changes in runtime behavior
 1. Highlighting using DisMax will only pick up terms from the main
    user query, not boost or filter queries (klaas).

TrimFilter.java
@@ -29,17 +29,50 @@ import java.io.IOException;
  * @version $Id:$
  */
 public final class TrimFilter extends TokenFilter {
 
+  final boolean updateOffsets;
+
-  public TrimFilter(TokenStream in) {
+  public TrimFilter(TokenStream in, boolean updateOffsets) {
     super(in);
+    this.updateOffsets = updateOffsets;
   }
 
   @Override
   public final Token next() throws IOException {
     Token t = input.next();
     if (null == t || null == t.termText())
       return t;
 
-    t.setTermText(t.termText().trim());
+    if( updateOffsets ) {
+      String txt = t.termText();
+      int start = 0;
+      int end = txt.length();
+      int endOff = 0;
+
+      // eat the first characters
+      while ((start < end) && (txt.charAt(start) <= ' ')) {
+        start++;
+      }
+
+      // eat the end characters
+      while ((start < end) && (txt.charAt(end-1) <= ' ')) {
+        end--;
+        endOff++;
+      }
+
+      if( start > 0 || end < txt.length() ) {
+        int incr = t.getPositionIncrement();
+        t = new Token( t.termText().substring( start, end ),
+                       t.startOffset()+start,
+                       t.endOffset()-endOff,
+                       t.type() );
+
+        t.setPositionIncrement( incr ); //+ start ); TODO? what should happen with the offset
+      }
+    }
+    else {
+      t.setTermText( t.termText().trim() );
+    }
     return t;
   }
 }
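
The two counters above are independent: start records how many leading characters were eaten (added to startOffset), endOff how many trailing ones (subtracted from endOffset), and the position increment is carried across the Token copy. A hand trace on a hypothetical token, with values that follow directly from the code:

// hypothetical input: termText() == " a  ", startOffset() == 10, endOffset() == 14
// leading loop:  charAt(0)==' ' -> start=1; charAt(1)=='a' stops it
// trailing loop: charAt(3)==' ' -> end=3, endOff=1
//                charAt(2)==' ' -> end=2, endOff=2; charAt(1)=='a' stops it
// start > 0, so a replacement Token is built:
//   termText    = " a  ".substring(1, 2)    == "a"
//   startOffset = 10 + start  (start=1)     == 11
//   endOffset   = 14 - endOff (endOff=2)    == 12
// and the original position increment is copied over unchanged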

TrimFilterFactory.java
@@ -17,14 +17,35 @@
 
 package org.apache.solr.analysis;
 
+import java.util.Map;
+
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.solr.core.SolrException;
 
 /**
  * @version $Id:$
+ * @see TrimFilter
  */
 public class TrimFilterFactory extends BaseTokenFilterFactory {
+
+  protected boolean updateOffsets = false;
+
+  @Override
+  public void init(Map<String,String> args) {
+    super.init( args );
+
+    String v = args.get( "updateOffsets" );
+    if( v != null ) {
+      try {
+        updateOffsets = Boolean.valueOf( v );
+      }
+      catch( Exception ex ) {
+        throw new SolrException( 400, "Error reading updateOffsets value. Must be true or false.", ex );
+      }
+    }
+  }
+
   public TokenStream create(TokenStream input) {
-    return new TrimFilter(input);
+    return new TrimFilter(input, updateOffsets);
   }
 }
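
In a Solr schema, the updateOffsets attribute on the <filter/> element reaches the factory through this init map. A minimal sketch of driving the factory directly (the demo class and token values are mine; IterTokenStream is the test helper added below):

import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.solr.analysis.BaseTokenTestCase.IterTokenStream;
import org.apache.solr.analysis.TrimFilterFactory;

public class TrimFilterFactoryDemo {
  public static void main(String[] args) throws Exception {
    Map<String,String> initArgs = new HashMap<String,String>();
    initArgs.put("updateOffsets", "true");   // same key the factory reads

    TrimFilterFactory factory = new TrimFilterFactory();
    factory.init(initArgs);

    // the created filter now tightens offsets as well as trimming text
    TokenStream ts = factory.create(new IterTokenStream(new Token(" x ", 0, 3)));
    Token t = ts.next();
    System.out.println(t.startOffset() + "," + t.endOffset()); // prints 1,2
  }
}

One quirk worth noting: Boolean.valueOf(String) never actually throws, so the catch block is effectively dead; any value other than "true" (case-insensitively) silently yields false rather than the 400 SolrException.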

BaseTokenTestCase.java
@@ -0,0 +1,164 @@
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+
+import junit.framework.TestCase;
+
+/**
+ * General token testing helper functions
+ */
+public abstract class BaseTokenTestCase extends TestCase
+{
+  public static String tsToString(TokenStream in) throws IOException {
+    StringBuffer out = new StringBuffer();
+    Token t = in.next();
+    if (null != t)
+      out.append(t.termText());
+
+    for (t = in.next(); null != t; t = in.next()) {
+      out.append(" ").append(t.termText());
+    }
+    in.close();
+    return out.toString();
+  }
+
+  public List<String> tok2str(Iterable<Token> tokLst) {
+    ArrayList<String> lst = new ArrayList<String>();
+    for ( Token t : tokLst ) {
+      lst.add( t.termText() );
+    }
+    return lst;
+  }
+
+
+  public void assertTokEqual(List<Token> a, List<Token> b) {
+    assertTokEq(a,b,false);
+    assertTokEq(b,a,false);
+  }
+
+  public void assertTokEqualOff(List<Token> a, List<Token> b) {
+    assertTokEq(a,b,true);
+    assertTokEq(b,a,true);
+  }
+
+  private void assertTokEq(List<Token> a, List<Token> b, boolean checkOff) {
+    int pos=0;
+    for (Iterator iter = a.iterator(); iter.hasNext();) {
+      Token tok = (Token)iter.next();
+      pos += tok.getPositionIncrement();
+      if (!tokAt(b, tok.termText(), pos
+          , checkOff ? tok.startOffset() : -1
+          , checkOff ? tok.endOffset() : -1
+          ))
+      {
+        fail(a + "!=" + b);
+      }
+    }
+  }
+
+  public boolean tokAt(List<Token> lst, String val, int tokPos, int startOff, int endOff) {
+    int pos=0;
+    for (Iterator iter = lst.iterator(); iter.hasNext();) {
+      Token tok = (Token)iter.next();
+      pos += tok.getPositionIncrement();
+      if (pos==tokPos && tok.termText().equals(val)
+          && (startOff==-1 || tok.startOffset()==startOff)
+          && (endOff ==-1 || tok.endOffset() ==endOff )
+          )
+      {
+        return true;
+      }
+    }
+    return false;
+  }
+
+
+  /***
+   * Return a list of tokens according to a test string format:
+   * a b c  =>  returns List<Token> [a,b,c]
+   * a/b    => tokens a and b share the same spot (b.positionIncrement=0)
+   * a,3/b/c => a,b,c all share same position (a.positionIncrement=3, b.positionIncrement=0, c.positionIncrement=0)
+   * a,1,10,11 => "a" with positionIncrement=1, startOffset=10, endOffset=11
+   */
+  public List<Token> tokens(String str) {
+    String[] arr = str.split(" ");
+    List<Token> result = new ArrayList<Token>();
+    for (int i=0; i<arr.length; i++) {
+      String[] toks = arr[i].split("/");
+      String[] params = toks[0].split(",");
+
+      int posInc;
+      int start;
+      int end;
+
+      if (params.length > 1) {
+        posInc = Integer.parseInt(params[1]);
+      } else {
+        posInc = 1;
+      }
+
+      if (params.length > 2) {
+        start = Integer.parseInt(params[2]);
+      } else {
+        start = 0;
+      }
+
+      if (params.length > 3) {
+        end = Integer.parseInt(params[3]);
+      } else {
+        end = start + params[0].length();
+      }
+
+      Token t = new Token(params[0],start,end,"TEST");
+      t.setPositionIncrement(posInc);
+
+      result.add(t);
+      for (int j=1; j<toks.length; j++) {
+        t = new Token(toks[j],0,0,"TEST");
+        t.setPositionIncrement(0);
+        result.add(t);
+      }
+    }
+    return result;
+  }
+
+  //------------------------------------------------------------------------
+  // These may be useful beyond test cases...
+  //------------------------------------------------------------------------
+
+  // This could probably be put in a utility class
+  static List<Token> getTokens(TokenStream tstream) throws IOException {
+    List<Token> tokens = new ArrayList<Token>();
+    while (true) {
+      Token t = tstream.next();
+      if (t==null) break;
+      tokens.add(t);
+    }
+    return tokens;
+  }
+
+  // This could probably be put in a utility class
+  public static class IterTokenStream extends TokenStream {
+    Iterator<Token> toks;
+    public IterTokenStream(Token... toks) {
+      this.toks = Arrays.asList(toks).iterator();
+    }
+    public IterTokenStream(Iterator<Token> toks) {
+      this.toks = toks;
+    }
+    @Override
+    public Token next() {
+      if (toks.hasNext()) {
+        return toks.next();
+      }
+      return null;
+    }
+  }
+}
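
tokens(String) above is the compact fixture format the refactored tests now share. A small usage sketch, inside a hypothetical BaseTokenTestCase subclass, of what one spec string expands to (the expansion follows from the parsing loop above):

// "a,1,10,11 b/c" yields three Tokens of type "TEST":
//   "a"  posInc=1  startOffset=10  endOffset=11   (explicit posInc,start,end)
//   "b"  posInc=1  startOffset=0   endOffset=1    (defaults: start=0, end=start+length)
//   "c"  posInc=0  startOffset=0   endOffset=0    (a '/' alternative stacked on the same position)
List<Token> toks = tokens("a,1,10,11 b/c");
assertTokEqual(toks, tokens("a,1 b/c"));  // passes: term text and positions match, offsets ignored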

TestBufferedTokenStream.java
@@ -26,9 +26,9 @@ import java.io.IOException;
 import java.io.StringReader;
 
 /**
- * Test that BufferedTokenStream behaves as advertized in subclasses.
+ * Test that BufferedTokenStream behaves as advertised in subclasses.
  */
-public class TestBufferedTokenStream extends TestCase {
+public class TestBufferedTokenStream extends BaseTokenTestCase {
 
   /** Example of a class implementing the rule "A" "B" => "Q" "B" */
   public static class AB_Q_Stream extends BufferedTokenStream {
@@ -52,20 +52,7 @@ public class TestBufferedTokenStream extends TestCase {
       return t;
     }
   }
 
-  public static String tsToString(TokenStream in) throws IOException {
-    StringBuffer out = new StringBuffer();
-    Token t = in.next();
-    if (null != t)
-      out.append(t.termText());
-
-    for (t = in.next(); null != t; t = in.next()) {
-      out.append(" ").append(t.termText());
-    }
-    in.close();
-    return out.toString();
-  }
-
   public void testABQ() throws Exception {
     final String input = "How now A B brown A cow B like A B thing?";
     final String expected = "How now Q B brown A cow B like Q B thing?";

TestHyphenatedWordsFilter.java
@@ -29,7 +29,7 @@ import org.apache.lucene.analysis.WhitespaceTokenizer;
 /**
  * HyphenatedWordsFilter test
  */
-public class TestHyphenatedWordsFilter extends TestCase {
+public class TestHyphenatedWordsFilter extends BaseTokenTestCase {
   public void testHyphenatedWords() throws Exception {
     String input = "ecologi-\r\ncal devel-\r\n\r\nop compre-\u0009hensive-hands-on";
     String outputAfterHyphenatedWordsFilter = "ecological develop comprehensive-hands-on";
@@ -40,16 +40,4 @@ public class TestHyphenatedWordsFilter extends TestCase {
     assertEquals("Testing HyphenatedWordsFilter",
         outputAfterHyphenatedWordsFilter, actual);
   }
-
-  public static String tsToString(TokenStream in) throws IOException {
-    StringBuffer out = new StringBuffer();
-    Token t = in.next();
-    if (null != t)
-      out.append(t.termText());
-
-    for (t = in.next(); null != t; t = in.next()) {
-      out.append(" ").append(t.termText());
-    }
-    return out.toString();
-  }
 }

TestPhoneticFilter.java
@@ -19,24 +19,20 @@ package org.apache.solr.analysis;
 
 import java.util.ArrayList;
 import java.util.HashMap;
-import java.util.Iterator;
 import java.util.Map;
 
-import junit.framework.TestCase;
-
 import org.apache.commons.codec.Encoder;
 import org.apache.commons.codec.language.DoubleMetaphone;
 import org.apache.commons.codec.language.Metaphone;
 import org.apache.commons.codec.language.RefinedSoundex;
 import org.apache.commons.codec.language.Soundex;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 
 
 /**
- * @version $Id:$
+ * @version $Id$
 */
-public class TestPhoneticFilter extends TestCase {
+public class TestPhoneticFilter extends BaseTokenTestCase {
 
   public void testFactory()
   {
@@ -102,17 +98,4 @@ public class TestPhoneticFilter extends TestCase {
     runner( new Soundex(), false );
     runner( new RefinedSoundex(), false );
   }
-
-  public static class IterTokenStream extends TokenStream {
-    Iterator<Token> toks;
-    public IterTokenStream(Iterator<Token> toks) {
-      this.toks = toks;
-    }
-    public Token next() {
-      if (toks.hasNext()) {
-        return toks.next();
-      }
-      return null;
-    }
-  }
 }

TestSynonymFilter.java
@@ -17,7 +17,6 @@
 
 package org.apache.solr.analysis;
 
-import junit.framework.TestCase;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 
@@ -31,67 +30,20 @@ import java.util.List;
  * @author yonik
  * @version $Id$
  */
-public class TestSynonymFilter extends TestCase {
+public class TestSynonymFilter extends BaseTokenTestCase {
 
   public List strings(String str) {
     String[] arr = str.split(" ");
     return Arrays.asList(arr);
   }
 
-  /***
-   * Return a list of tokens according to a test string format:
-   * a b c  =>  returns List<Token> [a,b,c]
-   * a/b    => tokens a and b share the same spot (b.positionIncrement=0)
-   * a,3/b/c => a,b,c all share same position (a.positionIncrement=3, b.positionIncrement=0, c.positionIncrement=0)
-   * a,1,10,11 => "a" with positionIncrement=1, startOffset=10, endOffset=11
-   */
-  public List tokens(String str) {
-    String[] arr = str.split(" ");
-    List result = new ArrayList();
-    for (int i=0; i<arr.length; i++) {
-      String[] toks = arr[i].split("/");
-      String[] params = toks[0].split(",");
-
-      int posInc;
-      int start;
-      int end;
-
-      if (params.length > 1) {
-        posInc = Integer.parseInt(params[1]);
-      } else {
-        posInc = 1;
-      }
-
-      if (params.length > 2) {
-        start = Integer.parseInt(params[2]);
-      } else {
-        start = 0;
-      }
-
-      if (params.length > 3) {
-        end = Integer.parseInt(params[3]);
-      } else {
-        end = start + params[0].length();
-      }
-
-      Token t = new Token(params[0],start,end,"TEST");
-      t.setPositionIncrement(posInc);
-
-      result.add(t);
-      for (int j=1; j<toks.length; j++) {
-        t = new Token(toks[j],0,0,"TEST");
-        t.setPositionIncrement(0);
-        result.add(t);
-      }
-    }
-    return result;
-  }
-
-  public List getTokList(SynonymMap dict, String input, boolean includeOrig) throws IOException {
-    ArrayList lst = new ArrayList();
+  public List<Token> getTokList(SynonymMap dict, String input, boolean includeOrig) throws IOException {
+    ArrayList<Token> lst = new ArrayList<Token>();
     final List toks = tokens(input);
     TokenStream ts = new TokenStream() {
       Iterator iter = toks.iterator();
       @Override
       public Token next() throws IOException {
         return iter.hasNext() ? (Token)iter.next() : null;
       }
@@ -106,56 +58,6 @@ public class TestSynonymFilter extends TestCase {
     }
   }
 
-  public List tok2str(List tokLst) {
-    ArrayList lst = new ArrayList();
-    for (Iterator iter = tokLst.iterator(); iter.hasNext();) {
-      lst.add(((Token)(iter.next())).termText());
-    }
-    return lst;
-  }
-
-
-  public void assertTokEqual(List a, List b) {
-    assertTokEq(a,b,false);
-    assertTokEq(b,a,false);
-  }
-
-  public void assertTokEqualOff(List a, List b) {
-    assertTokEq(a,b,true);
-    assertTokEq(b,a,true);
-  }
-
-  private void assertTokEq(List a, List b, boolean checkOff) {
-    int pos=0;
-    for (Iterator iter = a.iterator(); iter.hasNext();) {
-      Token tok = (Token)iter.next();
-      pos += tok.getPositionIncrement();
-      if (!tokAt(b, tok.termText(), pos
-          , checkOff ? tok.startOffset() : -1
-          , checkOff ? tok.endOffset() : -1
-          ))
-      {
-        fail(a + "!=" + b);
-      }
-    }
-  }
-
-  public boolean tokAt(List lst, String val, int tokPos, int startOff, int endOff) {
-    int pos=0;
-    for (Iterator iter = lst.iterator(); iter.hasNext();) {
-      Token tok = (Token)iter.next();
-      pos += tok.getPositionIncrement();
-      if (pos==tokPos && tok.termText().equals(val)
-          && (startOff==-1 || tok.startOffset()==startOff)
-          && (endOff==-1 || tok.endOffset()==endOff)
-          )
-      {
-        return true;
-      }
-    }
-    return false;
-  }
-
-
   public void testMatching() throws IOException {
     SynonymMap map = new SynonymMap();

TestTrimFilter.java
@@ -18,8 +18,10 @@
 package org.apache.solr.analysis;
 
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Iterator;
+import java.util.Arrays;
+import java.util.List;
 
 import junit.framework.TestCase;
 
@@ -30,32 +32,36 @@ import org.apache.lucene.analysis.TokenStream;
 /**
  * @version $Id:$
  */
-public class TestTrimFilter extends TestCase {
+public class TestTrimFilter extends BaseTokenTestCase {
 
   public void testTrim() throws Exception {
     TokenStream ts = new TrimFilter
       (new IterTokenStream(new Token(" a ", 1, 5),
                            new Token("b ",6,10),
                            new Token("cCc",11,15),
-                           new Token(" ",16,20)));
+                           new Token(" ",16,20)), false );
 
     assertEquals("a", ts.next().termText());
     assertEquals("b", ts.next().termText());
     assertEquals("cCc", ts.next().termText());
     assertEquals("", ts.next().termText());
     assertNull(ts.next());
+
+    ts = new TrimFilter( new IterTokenStream(
+           new Token(" a", 0,2),
+           new Token("b ", 0,2),
+           new Token(" c ",0,3),
+           new Token("   ",0,3)), true );
+
+    List<Token> expect = tokens( "a,1,1,2 b,1,0,1 c,1,1,2 ,1,3,3" );
+    List<Token> real = getTokens(ts);
+    for( Token t : expect ) {
+      System.out.println( "TEST:" + t );
+    }
+    for( Token t : real ) {
+      System.out.println( "REAL:" + t );
+    }
+    assertTokEqualOff( expect, real );
   }
 
   public static class IterTokenStream extends TokenStream {
     Iterator<Token> toks;
     public IterTokenStream(Token... toks) {
       this.toks = Arrays.asList(toks).iterator();
     }
     public Token next() {
       if (toks.hasNext()) {
         return toks.next();
       }
       return null;
     }
   }
 }
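
For reference, the expect string in testTrim decodes (via the inherited tokens() helper) to exactly what the updateOffsets arithmetic should produce; a sketch of the correspondence, with my annotations:

// tokens("a,1,1,2 b,1,0,1 c,1,1,2 ,1,3,3") expands to:
//   "a" at offsets [1,2)  <-  " a"  [0,2), one leading space eaten
//   "b" at offsets [0,1)  <-  "b "  [0,2), one trailing space eaten
//   "c" at offsets [1,2)  <-  " c " [0,3), one space eaten on each side
//   ""  at offsets [3,3)  <-  "   " [0,3), all whitespace: start catches up to end
// assertTokEqualOff then checks term text, positions, and both offsets in each direction.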