SOLR-234 TrimFilter can update the Token's startOffset and endOffset if updateOffsets="true". By default the Token offsets are unchanged.

This also refactors many of the Filter tests to share common code. git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@540995 13f79535-47bb-0310-9956-ffa450edef68
2007-05-23 17:18:05 +00:00 · 2007-05-23 17:18:05 +00:00 · c93dd85a74
parent 66b04d6ccc
commit c93dd85a74
9 changed files with 254 additions and 166 deletions
--- a/CHANGES.txt
+++ b/CHANGES.txt
@ -194,6 +194,10 @@ New Features
 32. SOLR-199: new n-gram tokenizers available via NGramTokenizerFactory 
    and EdgeNGramTokenizerFactory. (Adam Hiatt via yonik)
 33. SOLR-234: TrimFilter can update the Token's startOffset and endOffset 
    if updateOffsets="true".  By default the Token offsets are unchanged.
    (ryan)
 Changes in runtime behavior
 1. Highlighting using DisMax will only pick up terms from the main 
    user query, not boost or filter queries (klaas).
--- a/src/java/org/apache/solr/analysis/TrimFilter.java
+++ b/src/java/org/apache/solr/analysis/TrimFilter.java
@ -29,17 +29,50 @@ import java.io.IOException;
 * @version $Id:$
 */
 public final class TrimFilter extends TokenFilter {
  // When true, next() also adjusts the start/end offsets of a trimmed token;
  // when false only the term text is trimmed.
  final boolean updateOffsets;
  /**
   * @param in            the token stream to read from
   * @param updateOffsets whether trimmed tokens should get adjusted offsets
   */
-  public TrimFilter(TokenStream in) {
+  public TrimFilter(TokenStream in, boolean updateOffsets ) {
    super(in);
    this.updateOffsets = updateOffsets;
  }
  /**
   * Returns the next token of the wrapped stream with leading and trailing
   * characters that are &lt;= ' ' removed from its term text.
   * A null token, or a token whose term text is null, is passed through untouched.
   *
   * @return the (possibly replaced) token, or null when the input is exhausted
   * @throws IOException if the wrapped stream fails
   */
  @Override
  public final Token next() throws IOException {
    Token t = input.next();
    if (null == t || null == t.termText())
      return t;
-    t.setTermText(t.termText().trim());
+    if( updateOffsets ) {
      String txt = t.termText();
      int start = 0;            // index of the first kept character == number of leading chars dropped
      int end = txt.length();   // exclusive index of the last kept character
      int endOff = 0;           // number of trailing chars dropped
      // eat the first characters
      while ((start < end) && (txt.charAt(start) <= ' ')) {
        start++;
      }
      // eat the end characters
      // (start < end guards the all-whitespace case: nothing is left to eat, endOff stays 0)
      while ((start < end) && (txt.charAt(end-1) <= ' ')) {
        end--;
        endOff++;
      }
      // only build a replacement token when something was actually trimmed
      if( start > 0 || end < txt.length() ) {
        int incr = t.getPositionIncrement();
        // new token: trimmed text, start offset moved forward by the leading chars removed,
        // end offset moved back by the trailing chars removed, same type
        t = new Token( t.termText().substring( start, end ),
             t.startOffset()+start,
             t.endOffset()-endOff,
             t.type() );
        // the position increment is carried over unchanged
        t.setPositionIncrement( incr ); //+ start ); TODO? what should happen with the offset
      }
    }
    else {
      // offsets untouched: trim the term text in place
      t.setTermText( t.termText().trim() );
    }
    return t;
  }
 }
--- a/src/java/org/apache/solr/analysis/TrimFilterFactory.java
+++ b/src/java/org/apache/solr/analysis/TrimFilterFactory.java
@ -17,14 +17,35 @@
 package org.apache.solr.analysis;
 import java.util.Map;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.solr.core.SolrException;
 /**
 * @version $Id:$
 * @see TrimFilter
 */
 public class TrimFilterFactory extends BaseTokenFilterFactory {
  protected boolean updateOffsets = false;
  /**
   * Reads the optional <code>updateOffsets</code> argument.
   * When the argument is absent the current value (false by default) is kept.
   *
   * @param args the factory arguments; <code>updateOffsets</code>, if present,
   *             must be "true" or "false" (case-insensitive)
   * @throws SolrException (code 400) if <code>updateOffsets</code> has any other value
   */
  @Override
  public void init(Map<String,String> args) {
    super.init( args );
    String v = args.get( "updateOffsets" );
    if( v != null ) {
      // Boolean.valueOf(String) never throws: it maps every value other than
      // "true" to false, so a typo would be accepted silently.  Validate explicitly.
      if( "true".equalsIgnoreCase( v ) ) {
        updateOffsets = true;
      }
      else if( "false".equalsIgnoreCase( v ) ) {
        updateOffsets = false;
      }
      else {
        throw new SolrException( 400, "Error reading updateOffsets value.  Must be true or false.",
            new IllegalArgumentException( "updateOffsets=" + v ) );
      }
    }
  }
  /**
   * Wraps the given stream in a TrimFilter, handing over this factory's
   * updateOffsets setting.
   */
  public TokenStream create(TokenStream input) {
-    return new TrimFilter(input);
+    return new TrimFilter(input, updateOffsets);
  }
 }
--- a/src/test/org/apache/solr/analysis/BaseTokenTestCase.java
+++ b/src/test/org/apache/solr/analysis/BaseTokenTestCase.java
@ -0,0 +1,164 @@
 package org.apache.solr.analysis;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Iterator;
 import java.util.List;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 import junit.framework.TestCase;
 /**
 * General token testing helper functions
 */
 public abstract class BaseTokenTestCase extends TestCase 
 {
  /**
   * Drains the stream and returns the term text of all tokens joined by a
   * single space.  The stream is closed after the last token was read.
   */
  public static String tsToString(TokenStream in) throws IOException {
    StringBuffer joined = new StringBuffer();
    Token current = in.next();
    if (current != null) {
      joined.append(current.termText());
    }
    // every further token is preceded by the separator
    while ((current = in.next()) != null) {
      joined.append(" ");
      joined.append(current.termText());
    }
    in.close();
    return joined.toString();
  }
  /** Returns the term text of each token, in iteration order. */
  public List<String> tok2str(Iterable<Token> tokLst) {
    List<String> terms = new ArrayList<String>();
    Iterator<Token> it = tokLst.iterator();
    while (it.hasNext()) {
      terms.add(it.next().termText());
    }
    return terms;
  }
  /**
   * Fails unless every token of each list has a counterpart with the same
   * term text at the same position in the other list.  Offsets are ignored.
   */
  public void assertTokEqual(List<Token> a, List<Token> b) {
    final boolean checkOffsets = false;
    assertTokEq(a, b, checkOffsets);
    assertTokEq(b, a, checkOffsets);
  }
  /**
   * Like the offset-agnostic comparison, but the start and end offsets of
   * matching tokens must agree as well.  The check runs in both directions.
   */
  public void assertTokEqualOff(List<Token> a, List<Token> b) {
    final boolean checkOffsets = true;
    assertTokEq(a, b, checkOffsets);
    assertTokEq(b, a, checkOffsets);
  }
  /**
   * One-directional check: every token of <code>a</code> must be found in
   * <code>b</code> at the same absolute position (sum of position increments).
   * Offsets are compared only when <code>checkOff</code> is set; -1 tells
   * tokAt to ignore an offset.
   */
  private void assertTokEq(List<Token> a, List<Token> b, boolean checkOff) {
    int position = 0;
    for (Token expected : a) {
      position += expected.getPositionIncrement();
      int wantStart = -1;
      int wantEnd = -1;
      if (checkOff) {
        wantStart = expected.startOffset();
        wantEnd = expected.endOffset();
      }
      boolean found = tokAt(b, expected.termText(), position, wantStart, wantEnd);
      if (!found) {
        fail(a + "!=" + b);
      }
    }
  }
  /**
   * Tells whether the list holds a token with term text <code>val</code> at
   * absolute position <code>tokPos</code>.
   *
   * @param startOff required start offset, or -1 to accept any
   * @param endOff   required end offset, or -1 to accept any
   */
  public boolean tokAt(List<Token> lst, String val, int tokPos, int startOff, int endOff) {
    int position = 0;
    for (Token candidate : lst) {
      position += candidate.getPositionIncrement();
      if (position != tokPos) {
        continue;
      }
      if (!candidate.termText().equals(val)) {
        continue;
      }
      if (startOff != -1 && candidate.startOffset() != startOff) {
        continue;
      }
      if (endOff != -1 && candidate.endOffset() != endOff) {
        continue;
      }
      return true;
    }
    return false;
  }
  /**
   * Builds a token list from a compact, space separated test notation:
   * a b c       =>  three tokens a, b and c
   * a/b         =>  b is stacked on a (b.positionIncrement=0)
   * a,3/b/c     =>  a has positionIncrement=3, b and c are stacked on it (positionIncrement=0)
   * a,1,10,11   =>  "a" with positionIncrement=1, startOffset=10, endOffset=11
   *
   * Defaults for the first token of a group: positionIncrement=1, startOffset=0,
   * endOffset=startOffset+text length.  Stacked tokens always get offsets 0,0.
   * All tokens have type "TEST".
   */
  public List<Token> tokens(String str) {
    List<Token> result = new ArrayList<Token>();
    for (String group : str.split(" ")) {
      String[] stacked = group.split("/");
      String[] fields = stacked[0].split(",");

      int posInc = (fields.length > 1) ? Integer.parseInt(fields[1]) : 1;
      int start = (fields.length > 2) ? Integer.parseInt(fields[2]) : 0;
      int end = (fields.length > 3) ? Integer.parseInt(fields[3]) : start + fields[0].length();

      Token head = new Token(fields[0], start, end, "TEST");
      head.setPositionIncrement(posInc);
      result.add(head);

      // everything after the first "/" shares the position of the head token
      for (int j = 1; j < stacked.length; j++) {
        Token same = new Token(stacked[j], 0, 0, "TEST");
        same.setPositionIncrement(0);
        result.add(same);
      }
    }
    return result;
  }
  //------------------------------------------------------------------------
  // These may be useful beyond test cases...
  //------------------------------------------------------------------------
  // This could probably be put in a utility class
  /** Reads the stream until it returns null and collects every token in order. */
  static List<Token> getTokens(TokenStream tstream) throws IOException {
    List<Token> collected = new ArrayList<Token>();
    for (Token tok = tstream.next(); tok != null; tok = tstream.next()) {
      collected.add(tok);
    }
    return collected;
  }
  // This could probably be put in a utility class
  public static class IterTokenStream extends TokenStream {
    Iterator<Token> toks;
    public IterTokenStream(Token... toks) {
      this.toks = Arrays.asList(toks).iterator();
    }
    public IterTokenStream(Iterator<Token> toks) {
      this.toks = toks;
    }
    @Override
    public Token next() {
      if (toks.hasNext()) {
        return toks.next();
      }
      return null;
    }
  }
 }
--- a/src/test/org/apache/solr/analysis/TestBufferedTokenStream.java
+++ b/src/test/org/apache/solr/analysis/TestBufferedTokenStream.java
@ -26,9 +26,9 @@ import java.io.IOException;
 import java.io.StringReader;
 /**
- * Test that BufferedTokenStream behaves as advertized in subclasses.
+ * Test that BufferedTokenStream behaves as advertised in subclasses.
 */
-public class TestBufferedTokenStream extends TestCase {
+public class TestBufferedTokenStream extends BaseTokenTestCase {
  /** Example of a class implementing the rule "A" "B" => "Q" "B" */
  public static class AB_Q_Stream extends BufferedTokenStream {
@ -52,20 +52,7 @@ public class TestBufferedTokenStream extends TestCase {
      return t;
    }
  }
  /**
   * Concatenates the term text of all tokens of the stream, separated by a
   * single space, and closes the stream afterwards.
   */
  public static String tsToString(TokenStream in) throws IOException {
    StringBuffer text = new StringBuffer();
    Token tok = in.next();
    if (tok != null) {
      text.append(tok.termText());
    }
    tok = in.next();
    while (tok != null) {
      text.append(" ").append(tok.termText());
      tok = in.next();
    }
    in.close();
    return text.toString();
  }
  public void testABQ() throws Exception {
    final String input = "How now A B brown A cow B like A B thing?";
    final String expected = "How now Q B brown A cow B like Q B thing?";
--- a/src/test/org/apache/solr/analysis/TestHyphenatedWordsFilter.java
+++ b/src/test/org/apache/solr/analysis/TestHyphenatedWordsFilter.java
@ -29,7 +29,7 @@ import org.apache.lucene.analysis.WhitespaceTokenizer;
 /**
 * HyphenatedWordsFilter test
 */
-public class TestHyphenatedWordsFilter extends TestCase {
+public class TestHyphenatedWordsFilter extends BaseTokenTestCase {
 	public void testHyphenatedWords() throws Exception {
 		String input = "ecologi-\r\ncal devel-\r\n\r\nop compre-\u0009hensive-hands-on";
 		String outputAfterHyphenatedWordsFilter = "ecological develop comprehensive-hands-on";
@ -40,16 +40,4 @@ public class TestHyphenatedWordsFilter extends TestCase {
 		assertEquals("Testing HyphenatedWordsFilter",
 				outputAfterHyphenatedWordsFilter, actual);
 	}
 	/** Joins the term text of every token of the stream with single spaces. */
 	public static String tsToString(TokenStream in) throws IOException {
 		StringBuffer text = new StringBuffer();
 		Token tok = in.next();
 		if (tok != null) {
 			text.append(tok.termText());
 		}
 		while ((tok = in.next()) != null) {
 			text.append(" ");
 			text.append(tok.termText());
 		}
 		return text.toString();
 	}
 }
--- a/src/test/org/apache/solr/analysis/TestPhoneticFilter.java
+++ b/src/test/org/apache/solr/analysis/TestPhoneticFilter.java
@ -19,24 +19,20 @@ package org.apache.solr.analysis;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.Map;
 import junit.framework.TestCase;
 import org.apache.commons.codec.Encoder;
 import org.apache.commons.codec.language.DoubleMetaphone;
 import org.apache.commons.codec.language.Metaphone;
 import org.apache.commons.codec.language.RefinedSoundex;
 import org.apache.commons.codec.language.Soundex;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 /**
- * @version $Id:$
+ * @version $Id$
 */
-public class TestPhoneticFilter extends TestCase {
+public class TestPhoneticFilter extends BaseTokenTestCase {
  public void testFactory()
  {
@ -102,17 +98,4 @@ public class TestPhoneticFilter extends TestCase {
    runner( new Soundex(), false );
    runner( new RefinedSoundex(), false );
  }
  /** TokenStream backed by an iterator of tokens; returns null once it is exhausted. */
  public static class IterTokenStream extends TokenStream {
    Iterator<Token> toks;

    public IterTokenStream(Iterator<Token> toks) {
      this.toks = toks;
    }

    public Token next() {
      if (!toks.hasNext()) {
        return null;
      }
      return toks.next();
    }
  }
 }
--- a/src/test/org/apache/solr/analysis/TestSynonymFilter.java
+++ b/src/test/org/apache/solr/analysis/TestSynonymFilter.java
@ -17,7 +17,6 @@
 package org.apache.solr.analysis;
 import junit.framework.TestCase;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
@ -31,67 +30,20 @@ import java.util.List;
 * @author yonik
 * @version $Id$
 */
-public class TestSynonymFilter extends TestCase {
+public class TestSynonymFilter extends BaseTokenTestCase {
  /** Splits the input on single spaces and returns the pieces as a fixed-size list. */
  public List strings(String str) {
    return Arrays.asList(str.split(" "));
  }
  /***
   * Return a list of tokens according to a test string format:
   * a b c  =>  returns List<Token> [a,b,c]
   * a/b   => tokens a and b share the same spot (b.positionIncrement=0)
   * a,3/b/c => a,b,c all share same position (a.positionIncrement=3, b.positionIncrement=0, c.positionIncrement=0)
   * a,1,10,11  => "a" with positionIncrement=1, startOffset=10, endOffset=11
   */
  public List tokens(String str) {
    String[] arr = str.split(" ");
    List result = new ArrayList();
    for (int i=0; i<arr.length; i++) {
      String[] toks = arr[i].split("/");
      String[] params = toks[0].split(",");
-      int posInc;
+  public List<Token> getTokList(SynonymMap dict, String input, boolean includeOrig) throws IOException {
-      int start;
+    ArrayList<Token> lst = new ArrayList<Token>();
      int end;
      if (params.length > 1) {
        posInc = Integer.parseInt(params[1]);
      } else {
        posInc = 1;
      }
      if (params.length > 2) {
        start = Integer.parseInt(params[2]);
      } else {
        start = 0;
      }
      if (params.length > 3) {
        end = Integer.parseInt(params[3]);
      } else {
        end = start + params[0].length();
      }
      Token t = new Token(params[0],start,end,"TEST");
      t.setPositionIncrement(posInc);
      result.add(t);
      for (int j=1; j<toks.length; j++) {
        t = new Token(toks[j],0,0,"TEST");
        t.setPositionIncrement(0);
        result.add(t);
      }
    }
    return result;
  }
  public List getTokList(SynonymMap dict, String input, boolean includeOrig) throws IOException {
    ArrayList lst = new ArrayList();
    final List toks = tokens(input);
    TokenStream ts = new TokenStream() {
      Iterator iter = toks.iterator();
      @Override
      public Token next() throws IOException {
        return iter.hasNext() ? (Token)iter.next() : null;
      }
@ -106,56 +58,6 @@ public class TestSynonymFilter extends TestCase {
    }
  }
  /** Returns the term text of each token of the list, in order. */
  public List tok2str(List tokLst) {
    ArrayList terms = new ArrayList();
    for (Object element : tokLst) {
      Token tok = (Token) element;
      terms.add(tok.termText());
    }
    return terms;
  }
  /** Compares term text and position of both lists in both directions; offsets are ignored. */
  public void assertTokEqual(List a, List b) {
    final boolean checkOffsets = false;
    assertTokEq(a, b, checkOffsets);
    assertTokEq(b, a, checkOffsets);
  }
  /** Compares term text, position and start/end offsets of both lists in both directions. */
  public void assertTokEqualOff(List a, List b) {
    final boolean checkOffsets = true;
    assertTokEq(a, b, checkOffsets);
    assertTokEq(b, a, checkOffsets);
  }
  /**
   * Fails unless each token of <code>a</code> is present in <code>b</code> at
   * the same absolute position; offsets are compared only when
   * <code>checkOff</code> is set (-1 means "ignore" for tokAt).
   */
  private void assertTokEq(List a, List b, boolean checkOff) {
    int position = 0;
    for (Object element : a) {
      Token expected = (Token) element;
      position += expected.getPositionIncrement();
      int wantStart = -1;
      int wantEnd = -1;
      if (checkOff) {
        wantStart = expected.startOffset();
        wantEnd = expected.endOffset();
      }
      if (!tokAt(b, expected.termText(), position, wantStart, wantEnd)) {
        fail(a + "!=" + b);
      }
    }
  }
  /**
   * Looks for a token with term text <code>val</code> at absolute position
   * <code>tokPos</code>.  An offset argument of -1 matches any offset.
   */
  public boolean tokAt(List lst, String val, int tokPos, int startOff, int endOff) {
    int position = 0;
    for (Object element : lst) {
      Token candidate = (Token) element;
      position += candidate.getPositionIncrement();
      if (position != tokPos) {
        continue;
      }
      if (!candidate.termText().equals(val)) {
        continue;
      }
      if (startOff != -1 && candidate.startOffset() != startOff) {
        continue;
      }
      if (endOff != -1 && candidate.endOffset() != endOff) {
        continue;
      }
      return true;
    }
    return false;
  }
  public void testMatching() throws IOException {
    SynonymMap map = new SynonymMap();
--- a/src/test/org/apache/solr/analysis/TestTrimFilter.java
+++ b/src/test/org/apache/solr/analysis/TestTrimFilter.java
@ -18,8 +18,10 @@
 package org.apache.solr.analysis;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Iterator;
 import java.util.Arrays;
 import java.util.List;
 import junit.framework.TestCase;
@ -30,32 +32,36 @@ import org.apache.lucene.analysis.TokenStream;
 /**
 * @version $Id:$
 */
-public class TestTrimFilter extends TestCase {
+public class TestTrimFilter extends BaseTokenTestCase {
  /**
   * Exercises TrimFilter twice: first with updateOffsets=false, checking only
   * the trimmed term text, then with updateOffsets=true, checking term text,
   * position and offsets against an expected token list.
   */
  public void testTrim() throws Exception {
    // part 1: offsets are not updated, only the term text is inspected
    TokenStream ts = new TrimFilter
      (new IterTokenStream(new Token(" a ", 1, 5),
                           new Token("b   ",6,10),
                           new Token("cCc",11,15),
-                           new Token("   ",16,20)));
+                           new Token("   ",16,20)), false );
    assertEquals("a", ts.next().termText());
    assertEquals("b", ts.next().termText());
    assertEquals("cCc", ts.next().termText());
    // an all-whitespace token is kept, with empty term text
    assertEquals("", ts.next().termText());
    assertNull(ts.next());
    // part 2: same kind of input, but the filter is asked to update the offsets
    ts = new TrimFilter( new IterTokenStream(
           new Token(" a", 0,2),
           new Token("b ", 0,2),
           new Token(" c ",0,3),
           new Token("   ",0,3)), true );
    // expected tokens written as text,positionIncrement,startOffset,endOffset;
    // the last entry has empty text
    List<Token> expect = tokens( "a,1,1,2 b,1,0,1 c,1,1,2 ,1,3,3" );
    List<Token> real = getTokens(ts);
    // print both lists to stdout
    for( Token t : expect ) {
      System.out.println( "TEST:" + t );
    }
    for( Token t : real ) {
      System.out.println( "REAL:" + t );
    }
    // offset-aware comparison
    assertTokEqualOff( expect, real );
  }
  /** TokenStream that returns the given tokens in order and null afterwards. */
  public static class IterTokenStream extends TokenStream {
    Iterator<Token> toks;

    public IterTokenStream(Token... toks) {
      List<Token> asList = Arrays.asList(toks);
      this.toks = asList.iterator();
    }

    public Token next() {
      return toks.hasNext() ? toks.next() : null;
    }
  }
 }