mirror of https://github.com/apache/lucene.git
SOLR-234: TrimFilter can update the Token's startOffset and endOffset if updateOffsets="true". By default the Token offsets are unchanged.
This also refactors many of the Filter tests to share common code.

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@540995 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent 66b04d6ccc
commit c93dd85a74
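
In short, the new boolean constructor argument lets TrimFilter tighten a Token's offsets to match the trimmed term text, instead of only rewriting the text. A minimal sketch of the difference, written against the pre-2.9 Lucene Token API used throughout this commit (IterTokenStream is the helper added in BaseTokenTestCase below; the token values are illustrative only):

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.solr.analysis.BaseTokenTestCase.IterTokenStream;
import org.apache.solr.analysis.TrimFilter;

public class TrimFilterDemo {
  public static void main(String[] args) throws Exception {
    // " c " occupies characters 0..3 of the original input
    TokenStream plain = new TrimFilter(new IterTokenStream(new Token(" c ", 0, 3)), false);
    Token t = plain.next();
    // default behavior: term text is trimmed, offsets still span the whitespace
    System.out.println(t.termText() + " [" + t.startOffset() + "," + t.endOffset() + ")"); // c [0,3)

    TokenStream updating = new TrimFilter(new IterTokenStream(new Token(" c ", 0, 3)), true);
    t = updating.next();
    // updateOffsets=true: one space eaten on each side, so offsets tighten
    System.out.println(t.termText() + " [" + t.startOffset() + "," + t.endOffset() + ")"); // c [1,2)
  }
}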

CHANGES.txt
@@ -194,6 +194,10 @@ New Features
 32. SOLR-199: new n-gram tokenizers available via NGramTokenizerFactory
     and EdgeNGramTokenizerFactory. (Adam Hiatt via yonik)
 
+33. SOLR-234: TrimFilter can update the Token's startOffset and endOffset
+    if updateOffsets="true". By default the Token offsets are unchanged.
+    (ryan)
+
 Changes in runtime behavior
 1. Highlighting using DisMax will only pick up terms from the main
    user query, not boost or filter queries (klaas).

TrimFilter.java
@@ -29,17 +29,50 @@ import java.io.IOException;
  * @version $Id:$
  */
 public final class TrimFilter extends TokenFilter {
 
+  final boolean updateOffsets;
+
-  public TrimFilter(TokenStream in) {
+  public TrimFilter(TokenStream in, boolean updateOffsets) {
     super(in);
+    this.updateOffsets = updateOffsets;
   }
 
   @Override
   public final Token next() throws IOException {
     Token t = input.next();
     if (null == t || null == t.termText())
       return t;
 
-    t.setTermText(t.termText().trim());
+    if( updateOffsets ) {
+      String txt = t.termText();
+      int start = 0;
+      int end = txt.length();
+      int endOff = 0;
+
+      // eat the first characters
+      while ((start < end) && (txt.charAt(start) <= ' ')) {
+        start++;
+      }
+
+      // eat the end characters
+      while ((start < end) && (txt.charAt(end-1) <= ' ')) {
+        end--;
+        endOff++;
+      }
+
+      if( start > 0 || end < txt.length() ) {
+        int incr = t.getPositionIncrement();
+        t = new Token( t.termText().substring( start, end ),
+                       t.startOffset()+start,
+                       t.endOffset()-endOff,
+                       t.type() );
+
+        t.setPositionIncrement( incr ); //+ start ); TODO? what should happen with the offset
+      }
+    }
+    else {
+      t.setTermText( t.termText().trim() );
+    }
     return t;
   }
 }
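
The two counters above are independent: start records how many leading characters were eaten (added to startOffset), endOff how many trailing ones (subtracted from endOffset), and the position increment is carried across the Token copy. A hand trace on a hypothetical token, with values that follow directly from the code:

// hypothetical input: termText() == " a  ", startOffset() == 10, endOffset() == 14
// leading loop:  charAt(0)==' ' -> start=1; charAt(1)=='a' stops it
// trailing loop: charAt(3)==' ' -> end=3, endOff=1
//                charAt(2)==' ' -> end=2, endOff=2; charAt(1)=='a' stops it
// start > 0, so a replacement Token is built:
//   termText    = " a  ".substring(1, 2)    == "a"
//   startOffset = 10 + start  (start=1)     == 11
//   endOffset   = 14 - endOff (endOff=2)    == 12
// and the original position increment is copied over unchanged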

TrimFilterFactory.java
@@ -17,14 +17,35 @@
 
 package org.apache.solr.analysis;
 
+import java.util.Map;
+
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.solr.core.SolrException;
 
 /**
  * @version $Id:$
+ * @see TrimFilter
  */
 public class TrimFilterFactory extends BaseTokenFilterFactory {
+
+  protected boolean updateOffsets = false;
+
+  @Override
+  public void init(Map<String,String> args) {
+    super.init( args );
+
+    String v = args.get( "updateOffsets" );
+    if( v != null ) {
+      try {
+        updateOffsets = Boolean.valueOf( v );
+      }
+      catch( Exception ex ) {
+        throw new SolrException( 400, "Error reading updateOffsets value. Must be true or false.", ex );
+      }
+    }
+  }
+
   public TokenStream create(TokenStream input) {
-    return new TrimFilter(input);
+    return new TrimFilter(input, updateOffsets);
   }
 }
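
In a Solr schema, the updateOffsets attribute on the <filter/> element reaches the factory through this init map. A minimal sketch of driving the factory directly (the demo class and token values are mine; IterTokenStream is the test helper added below):

import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.solr.analysis.BaseTokenTestCase.IterTokenStream;
import org.apache.solr.analysis.TrimFilterFactory;

public class TrimFilterFactoryDemo {
  public static void main(String[] args) throws Exception {
    Map<String,String> initArgs = new HashMap<String,String>();
    initArgs.put("updateOffsets", "true");   // same key the factory reads

    TrimFilterFactory factory = new TrimFilterFactory();
    factory.init(initArgs);

    // the created filter now tightens offsets as well as trimming text
    TokenStream ts = factory.create(new IterTokenStream(new Token(" x ", 0, 3)));
    Token t = ts.next();
    System.out.println(t.startOffset() + "," + t.endOffset()); // prints 1,2
  }
}

One quirk worth noting: Boolean.valueOf(String) never actually throws, so the catch block is effectively dead; any value other than "true" (case-insensitively) silently yields false rather than the 400 SolrException.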

BaseTokenTestCase.java
@@ -0,0 +1,164 @@
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+
+import junit.framework.TestCase;
+
+/**
+ * General token testing helper functions
+ */
+public abstract class BaseTokenTestCase extends TestCase
+{
+  public static String tsToString(TokenStream in) throws IOException {
+    StringBuffer out = new StringBuffer();
+    Token t = in.next();
+    if (null != t)
+      out.append(t.termText());
+
+    for (t = in.next(); null != t; t = in.next()) {
+      out.append(" ").append(t.termText());
+    }
+    in.close();
+    return out.toString();
+  }
+
+  public List<String> tok2str(Iterable<Token> tokLst) {
+    ArrayList<String> lst = new ArrayList<String>();
+    for ( Token t : tokLst ) {
+      lst.add( t.termText() );
+    }
+    return lst;
+  }
+
+
+  public void assertTokEqual(List<Token> a, List<Token> b) {
+    assertTokEq(a,b,false);
+    assertTokEq(b,a,false);
+  }
+
+  public void assertTokEqualOff(List<Token> a, List<Token> b) {
+    assertTokEq(a,b,true);
+    assertTokEq(b,a,true);
+  }
+
+  private void assertTokEq(List<Token> a, List<Token> b, boolean checkOff) {
+    int pos=0;
+    for (Iterator iter = a.iterator(); iter.hasNext();) {
+      Token tok = (Token)iter.next();
+      pos += tok.getPositionIncrement();
+      if (!tokAt(b, tok.termText(), pos
+          , checkOff ? tok.startOffset() : -1
+          , checkOff ? tok.endOffset() : -1
+          ))
+      {
+        fail(a + "!=" + b);
+      }
+    }
+  }
+
+  public boolean tokAt(List<Token> lst, String val, int tokPos, int startOff, int endOff) {
+    int pos=0;
+    for (Iterator iter = lst.iterator(); iter.hasNext();) {
+      Token tok = (Token)iter.next();
+      pos += tok.getPositionIncrement();
+      if (pos==tokPos && tok.termText().equals(val)
+          && (startOff==-1 || tok.startOffset()==startOff)
+          && (endOff ==-1 || tok.endOffset() ==endOff )
+          )
+      {
+        return true;
+      }
+    }
+    return false;
+  }
+
+
+  /***
+   * Return a list of tokens according to a test string format:
+   * a b c  =>  returns List<Token> [a,b,c]
+   * a/b    => tokens a and b share the same spot (b.positionIncrement=0)
+   * a,3/b/c => a,b,c all share same position (a.positionIncrement=3, b.positionIncrement=0, c.positionIncrement=0)
+   * a,1,10,11 => "a" with positionIncrement=1, startOffset=10, endOffset=11
+   */
+  public List<Token> tokens(String str) {
+    String[] arr = str.split(" ");
+    List<Token> result = new ArrayList<Token>();
+    for (int i=0; i<arr.length; i++) {
+      String[] toks = arr[i].split("/");
+      String[] params = toks[0].split(",");
+
+      int posInc;
+      int start;
+      int end;
+
+      if (params.length > 1) {
+        posInc = Integer.parseInt(params[1]);
+      } else {
+        posInc = 1;
+      }
+
+      if (params.length > 2) {
+        start = Integer.parseInt(params[2]);
+      } else {
+        start = 0;
+      }
+
+      if (params.length > 3) {
+        end = Integer.parseInt(params[3]);
+      } else {
+        end = start + params[0].length();
+      }
+
+      Token t = new Token(params[0],start,end,"TEST");
+      t.setPositionIncrement(posInc);
+
+      result.add(t);
+      for (int j=1; j<toks.length; j++) {
+        t = new Token(toks[j],0,0,"TEST");
+        t.setPositionIncrement(0);
+        result.add(t);
+      }
+    }
+    return result;
+  }
+
+  //------------------------------------------------------------------------
+  // These may be useful beyond test cases...
+  //------------------------------------------------------------------------
+
+  // This could probably be put in a utility class
+  static List<Token> getTokens(TokenStream tstream) throws IOException {
+    List<Token> tokens = new ArrayList<Token>();
+    while (true) {
+      Token t = tstream.next();
+      if (t==null) break;
+      tokens.add(t);
+    }
+    return tokens;
+  }
+
+  // This could probably be put in a utility class
+  public static class IterTokenStream extends TokenStream {
+    Iterator<Token> toks;
+    public IterTokenStream(Token... toks) {
+      this.toks = Arrays.asList(toks).iterator();
+    }
+    public IterTokenStream(Iterator<Token> toks) {
+      this.toks = toks;
+    }
+    @Override
+    public Token next() {
+      if (toks.hasNext()) {
+        return toks.next();
+      }
+      return null;
+    }
+  }
+}
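
tokens(String) above is the compact fixture format the refactored tests now share. A small usage sketch, inside a hypothetical BaseTokenTestCase subclass, of what one spec string expands to (the expansion follows from the parsing loop above):

// "a,1,10,11 b/c" yields three Tokens of type "TEST":
//   "a"  posInc=1  startOffset=10  endOffset=11   (explicit posInc,start,end)
//   "b"  posInc=1  startOffset=0   endOffset=1    (defaults: start=0, end=start+length)
//   "c"  posInc=0  startOffset=0   endOffset=0    (a '/' alternative stacked on the same position)
List<Token> toks = tokens("a,1,10,11 b/c");
assertTokEqual(toks, tokens("a,1 b/c"));  // passes: term text and positions match, offsets ignored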

TestBufferedTokenStream.java
@@ -26,9 +26,9 @@ import java.io.IOException;
 import java.io.StringReader;
 
 /**
- * Test that BufferedTokenStream behaves as advertized in subclasses.
+ * Test that BufferedTokenStream behaves as advertised in subclasses.
  */
-public class TestBufferedTokenStream extends TestCase {
+public class TestBufferedTokenStream extends BaseTokenTestCase {
 
   /** Example of a class implementing the rule "A" "B" => "Q" "B" */
   public static class AB_Q_Stream extends BufferedTokenStream {
@@ -52,20 +52,7 @@ public class TestBufferedTokenStream extends TestCase {
       return t;
     }
   }
 
-  public static String tsToString(TokenStream in) throws IOException {
-    StringBuffer out = new StringBuffer();
-    Token t = in.next();
-    if (null != t)
-      out.append(t.termText());
-
-    for (t = in.next(); null != t; t = in.next()) {
-      out.append(" ").append(t.termText());
-    }
-    in.close();
-    return out.toString();
-  }
-
   public void testABQ() throws Exception {
     final String input = "How now A B brown A cow B like A B thing?";
     final String expected = "How now Q B brown A cow B like Q B thing?";

TestHyphenatedWordsFilter.java
@@ -29,7 +29,7 @@ import org.apache.lucene.analysis.WhitespaceTokenizer;
 /**
  * HyphenatedWordsFilter test
  */
-public class TestHyphenatedWordsFilter extends TestCase {
+public class TestHyphenatedWordsFilter extends BaseTokenTestCase {
   public void testHyphenatedWords() throws Exception {
     String input = "ecologi-\r\ncal devel-\r\n\r\nop compre-\u0009hensive-hands-on";
     String outputAfterHyphenatedWordsFilter = "ecological develop comprehensive-hands-on";
@@ -40,16 +40,4 @@ public class TestHyphenatedWordsFilter extends TestCase {
     assertEquals("Testing HyphenatedWordsFilter",
         outputAfterHyphenatedWordsFilter, actual);
   }
-
-  public static String tsToString(TokenStream in) throws IOException {
-    StringBuffer out = new StringBuffer();
-    Token t = in.next();
-    if (null != t)
-      out.append(t.termText());
-
-    for (t = in.next(); null != t; t = in.next()) {
-      out.append(" ").append(t.termText());
-    }
-    return out.toString();
-  }
 }

TestPhoneticFilter.java
@@ -19,24 +19,20 @@ package org.apache.solr.analysis;
 
 import java.util.ArrayList;
 import java.util.HashMap;
-import java.util.Iterator;
 import java.util.Map;
 
-import junit.framework.TestCase;
-
 import org.apache.commons.codec.Encoder;
 import org.apache.commons.codec.language.DoubleMetaphone;
 import org.apache.commons.codec.language.Metaphone;
 import org.apache.commons.codec.language.RefinedSoundex;
 import org.apache.commons.codec.language.Soundex;
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 
 
 /**
- * @version $Id:$
+ * @version $Id$
 */
-public class TestPhoneticFilter extends TestCase {
+public class TestPhoneticFilter extends BaseTokenTestCase {
 
   public void testFactory()
   {
@@ -102,17 +98,4 @@ public class TestPhoneticFilter extends TestCase {
     runner( new Soundex(), false );
     runner( new RefinedSoundex(), false );
   }
-
-  public static class IterTokenStream extends TokenStream {
-    Iterator<Token> toks;
-    public IterTokenStream(Iterator<Token> toks) {
-      this.toks = toks;
-    }
-    public Token next() {
-      if (toks.hasNext()) {
-        return toks.next();
-      }
-      return null;
-    }
-  }
 }

TestSynonymFilter.java
@@ -17,7 +17,6 @@
 
 package org.apache.solr.analysis;
 
-import junit.framework.TestCase;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 
@@ -31,67 +30,20 @@ import java.util.List;
  * @author yonik
  * @version $Id$
  */
-public class TestSynonymFilter extends TestCase {
+public class TestSynonymFilter extends BaseTokenTestCase {
 
   public List strings(String str) {
     String[] arr = str.split(" ");
     return Arrays.asList(arr);
   }
 
-  /***
-   * Return a list of tokens according to a test string format:
-   * a b c  =>  returns List<Token> [a,b,c]
-   * a/b    => tokens a and b share the same spot (b.positionIncrement=0)
-   * a,3/b/c => a,b,c all share same position (a.positionIncrement=3, b.positionIncrement=0, c.positionIncrement=0)
-   * a,1,10,11 => "a" with positionIncrement=1, startOffset=10, endOffset=11
-   */
-  public List tokens(String str) {
-    String[] arr = str.split(" ");
-    List result = new ArrayList();
-    for (int i=0; i<arr.length; i++) {
-      String[] toks = arr[i].split("/");
-      String[] params = toks[0].split(",");
-
-      int posInc;
-      int start;
-      int end;
-
-      if (params.length > 1) {
-        posInc = Integer.parseInt(params[1]);
-      } else {
-        posInc = 1;
-      }
-
-      if (params.length > 2) {
-        start = Integer.parseInt(params[2]);
-      } else {
-        start = 0;
-      }
-
-      if (params.length > 3) {
-        end = Integer.parseInt(params[3]);
-      } else {
-        end = start + params[0].length();
-      }
-
-      Token t = new Token(params[0],start,end,"TEST");
-      t.setPositionIncrement(posInc);
-
-      result.add(t);
-      for (int j=1; j<toks.length; j++) {
-        t = new Token(toks[j],0,0,"TEST");
-        t.setPositionIncrement(0);
-        result.add(t);
-      }
-    }
-    return result;
-  }
-
-  public List getTokList(SynonymMap dict, String input, boolean includeOrig) throws IOException {
-    ArrayList lst = new ArrayList();
+  public List<Token> getTokList(SynonymMap dict, String input, boolean includeOrig) throws IOException {
+    ArrayList<Token> lst = new ArrayList<Token>();
     final List toks = tokens(input);
     TokenStream ts = new TokenStream() {
       Iterator iter = toks.iterator();
       @Override
       public Token next() throws IOException {
         return iter.hasNext() ? (Token)iter.next() : null;
       }
@@ -106,56 +58,6 @@ public class TestSynonymFilter extends TestCase {
     }
   }
 
-  public List tok2str(List tokLst) {
-    ArrayList lst = new ArrayList();
-    for (Iterator iter = tokLst.iterator(); iter.hasNext();) {
-      lst.add(((Token)(iter.next())).termText());
-    }
-    return lst;
-  }
-
-
-  public void assertTokEqual(List a, List b) {
-    assertTokEq(a,b,false);
-    assertTokEq(b,a,false);
-  }
-
-  public void assertTokEqualOff(List a, List b) {
-    assertTokEq(a,b,true);
-    assertTokEq(b,a,true);
-  }
-
-  private void assertTokEq(List a, List b, boolean checkOff) {
-    int pos=0;
-    for (Iterator iter = a.iterator(); iter.hasNext();) {
-      Token tok = (Token)iter.next();
-      pos += tok.getPositionIncrement();
-      if (!tokAt(b, tok.termText(), pos
-          , checkOff ? tok.startOffset() : -1
-          , checkOff ? tok.endOffset() : -1
-          ))
-      {
-        fail(a + "!=" + b);
-      }
-    }
-  }
-
-  public boolean tokAt(List lst, String val, int tokPos, int startOff, int endOff) {
-    int pos=0;
-    for (Iterator iter = lst.iterator(); iter.hasNext();) {
-      Token tok = (Token)iter.next();
-      pos += tok.getPositionIncrement();
-      if (pos==tokPos && tok.termText().equals(val)
-          && (startOff==-1 || tok.startOffset()==startOff)
-          && (endOff==-1 || tok.endOffset()==endOff)
-          )
-      {
-        return true;
-      }
-    }
-    return false;
-  }
-
-
   public void testMatching() throws IOException {
     SynonymMap map = new SynonymMap();

TestTrimFilter.java
@@ -18,8 +18,10 @@
 package org.apache.solr.analysis;
 
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Iterator;
+import java.util.Arrays;
+import java.util.List;
 
 import junit.framework.TestCase;
 
@@ -30,32 +32,36 @@ import org.apache.lucene.analysis.TokenStream;
 /**
  * @version $Id:$
  */
-public class TestTrimFilter extends TestCase {
+public class TestTrimFilter extends BaseTokenTestCase {
 
   public void testTrim() throws Exception {
     TokenStream ts = new TrimFilter
       (new IterTokenStream(new Token(" a ", 1, 5),
                            new Token("b ",6,10),
                            new Token("cCc",11,15),
-                           new Token(" ",16,20)));
+                           new Token(" ",16,20)), false );
 
     assertEquals("a", ts.next().termText());
     assertEquals("b", ts.next().termText());
     assertEquals("cCc", ts.next().termText());
     assertEquals("", ts.next().termText());
     assertNull(ts.next());
+
+    ts = new TrimFilter( new IterTokenStream(
+           new Token(" a", 0,2),
+           new Token("b ", 0,2),
+           new Token(" c ",0,3),
+           new Token("   ",0,3)), true );
+
+    List<Token> expect = tokens( "a,1,1,2 b,1,0,1 c,1,1,2 ,1,3,3" );
+    List<Token> real = getTokens(ts);
+    for( Token t : expect ) {
+      System.out.println( "TEST:" + t );
+    }
+    for( Token t : real ) {
+      System.out.println( "REAL:" + t );
+    }
+    assertTokEqualOff( expect, real );
   }
 
   public static class IterTokenStream extends TokenStream {
     Iterator<Token> toks;
     public IterTokenStream(Token... toks) {
       this.toks = Arrays.asList(toks).iterator();
     }
     public Token next() {
       if (toks.hasNext()) {
         return toks.next();
       }
       return null;
     }
   }
 }
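
For reference, the expect string in testTrim decodes (via the inherited tokens() helper) to exactly what the updateOffsets arithmetic should produce; a sketch of the correspondence, with my annotations:

// tokens("a,1,1,2 b,1,0,1 c,1,1,2 ,1,3,3") expands to:
//   "a" at offsets [1,2)  <-  " a"  [0,2), one leading space eaten
//   "b" at offsets [0,1)  <-  "b "  [0,2), one trailing space eaten
//   "c" at offsets [1,2)  <-  " c " [0,3), one space eaten on each side
//   ""  at offsets [3,3)  <-  "   " [0,3), all whitespace: start catches up to end
// assertTokEqualOff then checks term text, positions, and both offsets in each direction.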