From c93dd85a7407931d48a6b553eff18b400d9481e1 Mon Sep 17 00:00:00 2001
From: Ryan McKinley <ryan@apache.org>
Date: Wed, 23 May 2007 17:18:05 +0000
Subject: [PATCH] SOLR-234 TrimFilter can update the Token's startOffset and
 endOffset if updateOffsets="true".  By default the Token offsets are
 unchanged.

This also refactors many of the Filter tests to share common code.

git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@540995 13f79535-47bb-0310-9956-ffa450edef68
---
 CHANGES.txt                                   |   4 +
 .../org/apache/solr/analysis/TrimFilter.java  |  37 +++-
 .../solr/analysis/TrimFilterFactory.java      |  23 ++-
 .../solr/analysis/BaseTokenTestCase.java      | 164 ++++++++++++++++++
 .../analysis/TestBufferedTokenStream.java     |  17 +-
 .../analysis/TestHyphenatedWordsFilter.java   |  14 +-
 .../solr/analysis/TestPhoneticFilter.java     |  21 +--
 .../solr/analysis/TestSynonymFilter.java      | 106 +----------
 .../apache/solr/analysis/TestTrimFilter.java  |  34 ++--
 9 files changed, 254 insertions(+), 166 deletions(-)
 create mode 100644 src/test/org/apache/solr/analysis/BaseTokenTestCase.java

diff --git a/CHANGES.txt b/CHANGES.txt
index 27c1dbd40c7..cb1df879ff6 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -194,6 +194,10 @@ New Features
 32. SOLR-199: new n-gram tokenizers available via NGramTokenizerFactory 
     and EdgeNGramTokenizerFactory. (Adam Hiatt via yonik)
 
+33. SOLR-234: TrimFilter can update the Token's startOffset and endOffset 
+    if updateOffsets="true".  By default the Token offsets are unchanged.
+    (ryan)
+    
 Changes in runtime behavior
  1. Highlighting using DisMax will only pick up terms from the main 
     user query, not boost or filter queries (klaas).
diff --git a/src/java/org/apache/solr/analysis/TrimFilter.java b/src/java/org/apache/solr/analysis/TrimFilter.java
index c4a2d3f8304..a4ff190b0f0 100644
--- a/src/java/org/apache/solr/analysis/TrimFilter.java
+++ b/src/java/org/apache/solr/analysis/TrimFilter.java
@@ -29,17 +29,50 @@ import java.io.IOException;
  * @version $Id:$
  */
 public final class TrimFilter extends TokenFilter {
+  
+  final boolean updateOffsets;
 
-  public TrimFilter(TokenStream in) {
+  public TrimFilter(TokenStream in, boolean updateOffsets ) {
     super(in);
+    this.updateOffsets = updateOffsets;
   }
 
+  @Override
   public final Token next() throws IOException {
     Token t = input.next();
     if (null == t || null == t.termText())
       return t;
 
-    t.setTermText(t.termText().trim());
+    if( updateOffsets ) {
+      String txt = t.termText();
+      int start = 0;
+      int end = txt.length();
+      int endOff = 0;
+      
+      // eat the first characters
+      while ((start < end) && (txt.charAt(start) <= ' ')) {
+        start++;
+      }
+      
+      // eat the end characters
+      while ((start < end) && (txt.charAt(end-1) <= ' ')) {
+        end--;
+        endOff++;
+      }
+      
+      if( start > 0 || end < txt.length() ) {
+        int incr = t.getPositionIncrement();
+        t = new Token( t.termText().substring( start, end ),
+             t.startOffset()+start,
+             t.endOffset()-endOff,
+             t.type() );
+        
+        t.setPositionIncrement( incr ); //+ start ); TODO? what should happen with the offset
+      }
+    }
+    else {
+      t.setTermText( t.termText().trim() );
+    }
     return t;
   }
 }
diff --git a/src/java/org/apache/solr/analysis/TrimFilterFactory.java b/src/java/org/apache/solr/analysis/TrimFilterFactory.java
index 3e5e03ed42e..66fb5b1f686 100644
--- a/src/java/org/apache/solr/analysis/TrimFilterFactory.java
+++ b/src/java/org/apache/solr/analysis/TrimFilterFactory.java
@@ -17,14 +17,35 @@
 
 package org.apache.solr.analysis;
 
+import java.util.Map;
+
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.solr.core.SolrException;
 
 /**
  * @version $Id:$
  * @see TrimFilter
  */
 public class TrimFilterFactory extends BaseTokenFilterFactory {
+  
+  protected boolean updateOffsets = false;
+  
+  @Override
+  public void init(Map<String,String> args) {
+    super.init( args );
+    
+    String v = args.get( "updateOffsets" );
+    if( v != null ) {
+      try {
+        updateOffsets = Boolean.valueOf( v );
+      }
+      catch( Exception ex ) {
+        throw new SolrException( 400, "Error reading updateOffsets value.  Must be true or false.", ex );
+      }
+    }
+  }
+  
   public TokenStream create(TokenStream input) {
-    return new TrimFilter(input);
+    return new TrimFilter(input, updateOffsets);
   }
 }
diff --git a/src/test/org/apache/solr/analysis/BaseTokenTestCase.java b/src/test/org/apache/solr/analysis/BaseTokenTestCase.java
new file mode 100644
index 00000000000..84683e70f3b
--- /dev/null
+++ b/src/test/org/apache/solr/analysis/BaseTokenTestCase.java
@@ -0,0 +1,164 @@
+package org.apache.solr.analysis;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+
+import junit.framework.TestCase;
+
+/**
+ * General token testing helper functions
+ */
+public abstract class BaseTokenTestCase extends TestCase 
+{
+  public static String tsToString(TokenStream in) throws IOException {
+    StringBuffer out = new StringBuffer();
+    Token t = in.next();
+    if (null != t)
+      out.append(t.termText());
+    
+    for (t = in.next(); null != t; t = in.next()) {
+      out.append(" ").append(t.termText());
+    }
+    in.close();
+    return out.toString();
+  }
+
+  public List<String> tok2str(Iterable<Token> tokLst) {
+    ArrayList<String> lst = new ArrayList<String>();
+    for ( Token t : tokLst ) {
+      lst.add( t.termText());
+    }
+    return lst;
+  }
+
+
+  public void assertTokEqual(List<Token> a, List<Token> b) {
+    assertTokEq(a,b,false);
+    assertTokEq(b,a,false);
+  }
+
+  public void assertTokEqualOff(List<Token> a, List<Token> b) {
+    assertTokEq(a,b,true);
+    assertTokEq(b,a,true);
+  }
+
+  private void assertTokEq(List<Token> a, List<Token> b, boolean checkOff) {
+    int pos=0;
+    for (Iterator iter = a.iterator(); iter.hasNext();) {
+      Token tok = (Token)iter.next();
+      pos += tok.getPositionIncrement();
+      if (!tokAt(b, tok.termText(), pos
+              , checkOff ? tok.startOffset() : -1
+              , checkOff ? tok.endOffset() : -1
+              )) 
+      {
+        fail(a + "!=" + b);
+      }
+    }
+  }
+
+  public boolean tokAt(List<Token> lst, String val, int tokPos, int startOff, int endOff) {
+    int pos=0;
+    for (Iterator iter = lst.iterator(); iter.hasNext();) {
+      Token tok = (Token)iter.next();
+      pos += tok.getPositionIncrement();
+      if (pos==tokPos && tok.termText().equals(val)
+          && (startOff==-1 || tok.startOffset()==startOff)
+          && (endOff  ==-1 || tok.endOffset()  ==endOff  )
+           )
+      {
+        return true;
+      }
+    }
+    return false;
+  }
+
+
+  /***
+   * Return a list of tokens according to a test string format:
+   * a b c  =>  returns List<Token> [a,b,c]
+   * a/b   => tokens a and b share the same spot (b.positionIncrement=0)
+   * a,3/b/c => a,b,c all share same position (a.positionIncrement=3, b.positionIncrement=0, c.positionIncrement=0)
+   * a,1,10,11  => "a" with positionIncrement=1, startOffset=10, endOffset=11
+   */
+  public List<Token> tokens(String str) {
+    String[] arr = str.split(" ");
+    List<Token> result = new ArrayList<Token>();
+    for (int i=0; i<arr.length; i++) {
+      String[] toks = arr[i].split("/");
+      String[] params = toks[0].split(",");
+
+      int posInc;
+      int start;
+      int end;
+
+      if (params.length > 1) {
+        posInc = Integer.parseInt(params[1]);
+      } else {
+        posInc = 1;
+      }
+
+      if (params.length > 2) {
+        start = Integer.parseInt(params[2]);
+      } else {
+        start = 0;
+      }
+
+      if (params.length > 3) {
+        end = Integer.parseInt(params[3]);
+      } else {
+        end = start + params[0].length();
+      }
+
+      Token t = new Token(params[0],start,end,"TEST");
+      t.setPositionIncrement(posInc);
+      
+      result.add(t);
+      for (int j=1; j<toks.length; j++) {
+        t = new Token(toks[j],0,0,"TEST");
+        t.setPositionIncrement(0);
+        result.add(t);
+      }
+    }
+    return result;
+  }
+
+  //------------------------------------------------------------------------
+  // These may be useful beyond test cases...
+  //------------------------------------------------------------------------
+
+  // This could probably be put in a utility class
+  static List<Token> getTokens(TokenStream tstream) throws IOException {
+    List<Token> tokens = new ArrayList<Token>();
+    while (true) {
+      Token t = tstream.next();
+      if (t==null) break;
+      tokens.add(t);
+    }
+    return tokens;
+  }
+
+  // This could probably be put in a utility class
+  public static class IterTokenStream extends TokenStream {
+    Iterator<Token> toks;
+    public IterTokenStream(Token... toks) {
+      this.toks = Arrays.asList(toks).iterator();
+    }
+    public IterTokenStream(Iterator<Token> toks) {
+      this.toks = toks;
+    }
+    @Override
+    public Token next() {
+      if (toks.hasNext()) {
+        return toks.next();
+      }
+      return null;
+    }
+  }
+}
diff --git a/src/test/org/apache/solr/analysis/TestBufferedTokenStream.java b/src/test/org/apache/solr/analysis/TestBufferedTokenStream.java
index 7779366053c..46c056f5aa6 100644
--- a/src/test/org/apache/solr/analysis/TestBufferedTokenStream.java
+++ b/src/test/org/apache/solr/analysis/TestBufferedTokenStream.java
@@ -26,9 +26,9 @@ import java.io.IOException;
 import java.io.StringReader;
 
 /**
- * Test that BufferedTokenStream behaves as advertized in subclasses.
+ * Test that BufferedTokenStream behaves as advertised in subclasses.
  */
-public class TestBufferedTokenStream extends TestCase {
+public class TestBufferedTokenStream extends BaseTokenTestCase {
 
   /** Example of a class implementing the rule "A" "B" => "Q" "B" */
   public static class AB_Q_Stream extends BufferedTokenStream {
@@ -52,20 +52,7 @@ public class TestBufferedTokenStream extends TestCase {
       return t;
     }
   }
-  
-  public static String tsToString(TokenStream in) throws IOException {
-    StringBuffer out = new StringBuffer();
-    Token t = in.next();
-    if (null != t)
-      out.append(t.termText());
     
-    for (t = in.next(); null != t; t = in.next()) {
-      out.append(" ").append(t.termText());
-    }
-    in.close();
-    return out.toString();
-  }
-  
   public void testABQ() throws Exception {
     final String input = "How now A B brown A cow B like A B thing?";
     final String expected = "How now Q B brown A cow B like Q B thing?";
diff --git a/src/test/org/apache/solr/analysis/TestHyphenatedWordsFilter.java b/src/test/org/apache/solr/analysis/TestHyphenatedWordsFilter.java
index 06f0f3f0dc0..4d42e6cc635 100755
--- a/src/test/org/apache/solr/analysis/TestHyphenatedWordsFilter.java
+++ b/src/test/org/apache/solr/analysis/TestHyphenatedWordsFilter.java
@@ -29,7 +29,7 @@ import org.apache.lucene.analysis.WhitespaceTokenizer;
 /**
  * HyphenatedWordsFilter test
  */
-public class TestHyphenatedWordsFilter extends TestCase {
+public class TestHyphenatedWordsFilter extends BaseTokenTestCase {
 	public void testHyphenatedWords() throws Exception {
 		String input = "ecologi-\r\ncal devel-\r\n\r\nop compre-\u0009hensive-hands-on";
 		String outputAfterHyphenatedWordsFilter = "ecological develop comprehensive-hands-on";
@@ -40,16 +40,4 @@ public class TestHyphenatedWordsFilter extends TestCase {
 		assertEquals("Testing HyphenatedWordsFilter",
 				outputAfterHyphenatedWordsFilter, actual);
 	}
-
-	public static String tsToString(TokenStream in) throws IOException {
-		StringBuffer out = new StringBuffer();
-		Token t = in.next();
-		if (null != t)
-			out.append(t.termText());
-
-		for (t = in.next(); null != t; t = in.next()) {
-			out.append(" ").append(t.termText());
-		}
-		return out.toString();
-	}
 }
diff --git a/src/test/org/apache/solr/analysis/TestPhoneticFilter.java b/src/test/org/apache/solr/analysis/TestPhoneticFilter.java
index 6d65f6a78f6..55aca6670fd 100644
--- a/src/test/org/apache/solr/analysis/TestPhoneticFilter.java
+++ b/src/test/org/apache/solr/analysis/TestPhoneticFilter.java
@@ -19,24 +19,20 @@ package org.apache.solr.analysis;
 
 import java.util.ArrayList;
 import java.util.HashMap;
-import java.util.Iterator;
 import java.util.Map;
 
-import junit.framework.TestCase;
-
 import org.apache.commons.codec.Encoder;
 import org.apache.commons.codec.language.DoubleMetaphone;
 import org.apache.commons.codec.language.Metaphone;
 import org.apache.commons.codec.language.RefinedSoundex;
 import org.apache.commons.codec.language.Soundex;
 import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.TokenStream;
 
 
 /**
- * @version $Id:$
+ * @version $Id$
  */
-public class TestPhoneticFilter extends TestCase {
+public class TestPhoneticFilter extends BaseTokenTestCase {
   
   public void testFactory()
   {
@@ -102,17 +98,4 @@ public class TestPhoneticFilter extends TestCase {
     runner( new Soundex(), false );
     runner( new RefinedSoundex(), false );
   }
-
-  public static class IterTokenStream extends TokenStream {
-    Iterator<Token> toks;
-    public IterTokenStream(Iterator<Token> toks) {
-      this.toks = toks;
-    }
-    public Token next() {
-      if (toks.hasNext()) {
-        return toks.next();
-      }
-      return null;
-    }
-  }
 }
diff --git a/src/test/org/apache/solr/analysis/TestSynonymFilter.java b/src/test/org/apache/solr/analysis/TestSynonymFilter.java
index ee59b7218a7..f9339784d7c 100644
--- a/src/test/org/apache/solr/analysis/TestSynonymFilter.java
+++ b/src/test/org/apache/solr/analysis/TestSynonymFilter.java
@@ -17,7 +17,6 @@
 
 package org.apache.solr.analysis;
 
-import junit.framework.TestCase;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenStream;
 
@@ -31,67 +30,20 @@ import java.util.List;
  * @author yonik
  * @version $Id$
  */
-public class TestSynonymFilter extends TestCase {
+public class TestSynonymFilter extends BaseTokenTestCase {
 
   public List strings(String str) {
     String[] arr = str.split(" ");
     return Arrays.asList(arr);
   }
 
-  /***
-   * Return a list of tokens according to a test string format:
-   * a b c  =>  returns List<Token> [a,b,c]
-   * a/b   => tokens a and b share the same spot (b.positionIncrement=0)
-   * a,3/b/c => a,b,c all share same position (a.positionIncrement=3, b.positionIncrement=0, c.positionIncrement=0)
-   * a,1,10,11  => "a" with positionIncrement=1, startOffset=10, endOffset=11
-   */
-  public List tokens(String str) {
-    String[] arr = str.split(" ");
-    List result = new ArrayList();
-    for (int i=0; i<arr.length; i++) {
-      String[] toks = arr[i].split("/");
-      String[] params = toks[0].split(",");
 
-      int posInc;
-      int start;
-      int end;
-
-      if (params.length > 1) {
-        posInc = Integer.parseInt(params[1]);
-      } else {
-        posInc = 1;
-      }
-
-      if (params.length > 2) {
-        start = Integer.parseInt(params[2]);
-      } else {
-        start = 0;
-      }
-
-      if (params.length > 3) {
-        end = Integer.parseInt(params[3]);
-      } else {
-        end = start + params[0].length();
-      }
-
-      Token t = new Token(params[0],start,end,"TEST");
-      t.setPositionIncrement(posInc);
-      
-      result.add(t);
-      for (int j=1; j<toks.length; j++) {
-        t = new Token(toks[j],0,0,"TEST");
-        t.setPositionIncrement(0);
-        result.add(t);
-      }
-    }
-    return result;
-  }
-
-  public List getTokList(SynonymMap dict, String input, boolean includeOrig) throws IOException {
-    ArrayList lst = new ArrayList();
+  public List<Token> getTokList(SynonymMap dict, String input, boolean includeOrig) throws IOException {
+    ArrayList<Token> lst = new ArrayList<Token>();
     final List toks = tokens(input);
     TokenStream ts = new TokenStream() {
       Iterator iter = toks.iterator();
+      @Override
       public Token next() throws IOException {
         return iter.hasNext() ? (Token)iter.next() : null;
       }
@@ -106,56 +58,6 @@ public class TestSynonymFilter extends TestCase {
     }
   }
 
-  public List tok2str(List tokLst) {
-    ArrayList lst = new ArrayList();
-    for (Iterator iter = tokLst.iterator(); iter.hasNext();) {
-      lst.add(((Token)(iter.next())).termText());
-    }
-    return lst;
-  }
-
-
-  public void assertTokEqual(List a, List b) {
-    assertTokEq(a,b,false);
-    assertTokEq(b,a,false);
-  }
-
-  public void assertTokEqualOff(List a, List b) {
-    assertTokEq(a,b,true);
-    assertTokEq(b,a,true);
-  }
-
-  private void assertTokEq(List a, List b, boolean checkOff) {
-    int pos=0;
-    for (Iterator iter = a.iterator(); iter.hasNext();) {
-      Token tok = (Token)iter.next();
-      pos += tok.getPositionIncrement();
-      if (!tokAt(b, tok.termText(), pos
-              , checkOff ? tok.startOffset() : -1
-              , checkOff ? tok.endOffset() : -1
-              )) 
-      {
-        fail(a + "!=" + b);
-      }
-    }
-  }
-
-  public boolean tokAt(List lst, String val, int tokPos, int startOff, int endOff) {
-    int pos=0;
-    for (Iterator iter = lst.iterator(); iter.hasNext();) {
-      Token tok = (Token)iter.next();
-      pos += tok.getPositionIncrement();
-      if (pos==tokPos && tok.termText().equals(val)
-          && (startOff==-1 || tok.startOffset()==startOff)
-          && (endOff==-1 || tok.endOffset()==endOff)
-           )
-      {
-        return true;
-      }
-    }
-    return false;
-  }
-
 
   public void testMatching() throws IOException {
     SynonymMap map = new SynonymMap();
diff --git a/src/test/org/apache/solr/analysis/TestTrimFilter.java b/src/test/org/apache/solr/analysis/TestTrimFilter.java
index a61c63e4060..7cebd7be069 100644
--- a/src/test/org/apache/solr/analysis/TestTrimFilter.java
+++ b/src/test/org/apache/solr/analysis/TestTrimFilter.java
@@ -18,8 +18,10 @@
 package org.apache.solr.analysis;
 
 import java.io.IOException;
+import java.util.ArrayList;
 import java.util.Iterator;
 import java.util.Arrays;
+import java.util.List;
 
 import junit.framework.TestCase;
 
@@ -30,32 +32,36 @@ import org.apache.lucene.analysis.TokenStream;
 /**
  * @version $Id:$
  */
-public class TestTrimFilter extends TestCase {
+public class TestTrimFilter extends BaseTokenTestCase {
   
   public void testTrim() throws Exception {
     TokenStream ts = new TrimFilter
       (new IterTokenStream(new Token(" a ", 1, 5),
                            new Token("b   ",6,10),
                            new Token("cCc",11,15),
-                           new Token("   ",16,20)));
+                           new Token("   ",16,20)), false );
 
     assertEquals("a", ts.next().termText());
     assertEquals("b", ts.next().termText());
     assertEquals("cCc", ts.next().termText());
     assertEquals("", ts.next().termText());
     assertNull(ts.next());
+    
+    ts = new TrimFilter( new IterTokenStream(
+           new Token(" a", 0,2),
+           new Token("b ", 0,2),
+           new Token(" c ",0,3),
+           new Token("   ",0,3)), true );
+    
+    List<Token> expect = tokens( "a,1,1,2 b,1,0,1 c,1,1,2 ,1,3,3" );
+    List<Token> real = getTokens(ts);
+    for( Token t : expect ) {
+      System.out.println( "TEST:" + t );
+    }
+    for( Token t : real ) {
+      System.out.println( "REAL:" + t );
+    }
+    assertTokEqualOff( expect, real );
   }
 
-  public static class IterTokenStream extends TokenStream {
-    Iterator<Token> toks;
-    public IterTokenStream(Token... toks) {
-      this.toks = Arrays.asList(toks).iterator();
-    }
-    public Token next() {
-      if (toks.hasNext()) {
-        return toks.next();
-      }
-      return null;
-    }
-  }
 }