From c93dd85a7407931d48a6b553eff18b400d9481e1 Mon Sep 17 00:00:00 2001 From: Ryan McKinley Date: Wed, 23 May 2007 17:18:05 +0000 Subject: [PATCH] SOLR-234 TrimFilter can update the Token's startOffset and endOffset if updateOffsets="true". By default the Token offsets are unchanged. This also refactors many of the Filter tests to share common code. git-svn-id: https://svn.apache.org/repos/asf/lucene/solr/trunk@540995 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 4 + .../org/apache/solr/analysis/TrimFilter.java | 37 +++- .../solr/analysis/TrimFilterFactory.java | 23 ++- .../solr/analysis/BaseTokenTestCase.java | 164 ++++++++++++++++++ .../analysis/TestBufferedTokenStream.java | 17 +- .../analysis/TestHyphenatedWordsFilter.java | 14 +- .../solr/analysis/TestPhoneticFilter.java | 21 +-- .../solr/analysis/TestSynonymFilter.java | 106 +---------- .../apache/solr/analysis/TestTrimFilter.java | 34 ++-- 9 files changed, 254 insertions(+), 166 deletions(-) create mode 100644 src/test/org/apache/solr/analysis/BaseTokenTestCase.java diff --git a/CHANGES.txt b/CHANGES.txt index 27c1dbd40c7..cb1df879ff6 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -194,6 +194,10 @@ New Features 32. SOLR-199: new n-gram tokenizers available via NGramTokenizerFactory and EdgeNGramTokenizerFactory. (Adam Hiatt via yonik) +33. SOLR-234: TrimFilter can update the Token's startOffset and endOffset + if updateOffsets="true". By default the Token offsets are unchanged. + (ryan) + Changes in runtime behavior 1. Highlighting using DisMax will only pick up terms from the main user query, not boost or filter queries (klaas). diff --git a/src/java/org/apache/solr/analysis/TrimFilter.java b/src/java/org/apache/solr/analysis/TrimFilter.java index c4a2d3f8304..a4ff190b0f0 100644 --- a/src/java/org/apache/solr/analysis/TrimFilter.java +++ b/src/java/org/apache/solr/analysis/TrimFilter.java @@ -29,17 +29,50 @@ import java.io.IOException; * @version $Id:$ */ public final class TrimFilter extends TokenFilter { + + final boolean updateOffsets; - public TrimFilter(TokenStream in) { + public TrimFilter(TokenStream in, boolean updateOffsets ) { super(in); + this.updateOffsets = updateOffsets; } + @Override public final Token next() throws IOException { Token t = input.next(); if (null == t || null == t.termText()) return t; - t.setTermText(t.termText().trim()); + if( updateOffsets ) { + String txt = t.termText(); + int start = 0; + int end = txt.length(); + int endOff = 0; + + // eat the first characters + while ((start < end) && (txt.charAt(start) <= ' ')) { + start++; + } + + // eat the end characters + while ((start < end) && (txt.charAt(end-1) <= ' ')) { + end--; + endOff++; + } + + if( start > 0 || end < txt.length() ) { + int incr = t.getPositionIncrement(); + t = new Token( t.termText().substring( start, end ), + t.startOffset()+start, + t.endOffset()-endOff, + t.type() ); + + t.setPositionIncrement( incr ); //+ start ); TODO? what should happen with the offset + } + } + else { + t.setTermText( t.termText().trim() ); + } return t; } } diff --git a/src/java/org/apache/solr/analysis/TrimFilterFactory.java b/src/java/org/apache/solr/analysis/TrimFilterFactory.java index 3e5e03ed42e..66fb5b1f686 100644 --- a/src/java/org/apache/solr/analysis/TrimFilterFactory.java +++ b/src/java/org/apache/solr/analysis/TrimFilterFactory.java @@ -17,14 +17,35 @@ package org.apache.solr.analysis; +import java.util.Map; + import org.apache.lucene.analysis.TokenStream; +import org.apache.solr.core.SolrException; /** * @version $Id:$ * @see TrimFilter */ public class TrimFilterFactory extends BaseTokenFilterFactory { + + protected boolean updateOffsets = false; + + @Override + public void init(Map args) { + super.init( args ); + + String v = args.get( "updateOffsets" ); + if( v != null ) { + try { + updateOffsets = Boolean.valueOf( v ); + } + catch( Exception ex ) { + throw new SolrException( 400, "Error reading updateOffsets value. Must be true or false.", ex ); + } + } + } + public TokenStream create(TokenStream input) { - return new TrimFilter(input); + return new TrimFilter(input, updateOffsets); } } diff --git a/src/test/org/apache/solr/analysis/BaseTokenTestCase.java b/src/test/org/apache/solr/analysis/BaseTokenTestCase.java new file mode 100644 index 00000000000..84683e70f3b --- /dev/null +++ b/src/test/org/apache/solr/analysis/BaseTokenTestCase.java @@ -0,0 +1,164 @@ +package org.apache.solr.analysis; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; + +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; + +import junit.framework.TestCase; + +/** + * General token testing helper functions + */ +public abstract class BaseTokenTestCase extends TestCase +{ + public static String tsToString(TokenStream in) throws IOException { + StringBuffer out = new StringBuffer(); + Token t = in.next(); + if (null != t) + out.append(t.termText()); + + for (t = in.next(); null != t; t = in.next()) { + out.append(" ").append(t.termText()); + } + in.close(); + return out.toString(); + } + + public List tok2str(Iterable tokLst) { + ArrayList lst = new ArrayList(); + for ( Token t : tokLst ) { + lst.add( t.termText()); + } + return lst; + } + + + public void assertTokEqual(List a, List b) { + assertTokEq(a,b,false); + assertTokEq(b,a,false); + } + + public void assertTokEqualOff(List a, List b) { + assertTokEq(a,b,true); + assertTokEq(b,a,true); + } + + private void assertTokEq(List a, List b, boolean checkOff) { + int pos=0; + for (Iterator iter = a.iterator(); iter.hasNext();) { + Token tok = (Token)iter.next(); + pos += tok.getPositionIncrement(); + if (!tokAt(b, tok.termText(), pos + , checkOff ? tok.startOffset() : -1 + , checkOff ? tok.endOffset() : -1 + )) + { + fail(a + "!=" + b); + } + } + } + + public boolean tokAt(List lst, String val, int tokPos, int startOff, int endOff) { + int pos=0; + for (Iterator iter = lst.iterator(); iter.hasNext();) { + Token tok = (Token)iter.next(); + pos += tok.getPositionIncrement(); + if (pos==tokPos && tok.termText().equals(val) + && (startOff==-1 || tok.startOffset()==startOff) + && (endOff ==-1 || tok.endOffset() ==endOff ) + ) + { + return true; + } + } + return false; + } + + + /*** + * Return a list of tokens according to a test string format: + * a b c => returns List [a,b,c] + * a/b => tokens a and b share the same spot (b.positionIncrement=0) + * a,3/b/c => a,b,c all share same position (a.positionIncrement=3, b.positionIncrement=0, c.positionIncrement=0) + * a,1,10,11 => "a" with positionIncrement=1, startOffset=10, endOffset=11 + */ + public List tokens(String str) { + String[] arr = str.split(" "); + List result = new ArrayList(); + for (int i=0; i 1) { + posInc = Integer.parseInt(params[1]); + } else { + posInc = 1; + } + + if (params.length > 2) { + start = Integer.parseInt(params[2]); + } else { + start = 0; + } + + if (params.length > 3) { + end = Integer.parseInt(params[3]); + } else { + end = start + params[0].length(); + } + + Token t = new Token(params[0],start,end,"TEST"); + t.setPositionIncrement(posInc); + + result.add(t); + for (int j=1; j getTokens(TokenStream tstream) throws IOException { + List tokens = new ArrayList(); + while (true) { + Token t = tstream.next(); + if (t==null) break; + tokens.add(t); + } + return tokens; + } + + // This could probably be put in a utility class + public static class IterTokenStream extends TokenStream { + Iterator toks; + public IterTokenStream(Token... toks) { + this.toks = Arrays.asList(toks).iterator(); + } + public IterTokenStream(Iterator toks) { + this.toks = toks; + } + @Override + public Token next() { + if (toks.hasNext()) { + return toks.next(); + } + return null; + } + } +} diff --git a/src/test/org/apache/solr/analysis/TestBufferedTokenStream.java b/src/test/org/apache/solr/analysis/TestBufferedTokenStream.java index 7779366053c..46c056f5aa6 100644 --- a/src/test/org/apache/solr/analysis/TestBufferedTokenStream.java +++ b/src/test/org/apache/solr/analysis/TestBufferedTokenStream.java @@ -26,9 +26,9 @@ import java.io.IOException; import java.io.StringReader; /** - * Test that BufferedTokenStream behaves as advertized in subclasses. + * Test that BufferedTokenStream behaves as advertised in subclasses. */ -public class TestBufferedTokenStream extends TestCase { +public class TestBufferedTokenStream extends BaseTokenTestCase { /** Example of a class implementing the rule "A" "B" => "Q" "B" */ public static class AB_Q_Stream extends BufferedTokenStream { @@ -52,20 +52,7 @@ public class TestBufferedTokenStream extends TestCase { return t; } } - - public static String tsToString(TokenStream in) throws IOException { - StringBuffer out = new StringBuffer(); - Token t = in.next(); - if (null != t) - out.append(t.termText()); - for (t = in.next(); null != t; t = in.next()) { - out.append(" ").append(t.termText()); - } - in.close(); - return out.toString(); - } - public void testABQ() throws Exception { final String input = "How now A B brown A cow B like A B thing?"; final String expected = "How now Q B brown A cow B like Q B thing?"; diff --git a/src/test/org/apache/solr/analysis/TestHyphenatedWordsFilter.java b/src/test/org/apache/solr/analysis/TestHyphenatedWordsFilter.java index 06f0f3f0dc0..4d42e6cc635 100755 --- a/src/test/org/apache/solr/analysis/TestHyphenatedWordsFilter.java +++ b/src/test/org/apache/solr/analysis/TestHyphenatedWordsFilter.java @@ -29,7 +29,7 @@ import org.apache.lucene.analysis.WhitespaceTokenizer; /** * HyphenatedWordsFilter test */ -public class TestHyphenatedWordsFilter extends TestCase { +public class TestHyphenatedWordsFilter extends BaseTokenTestCase { public void testHyphenatedWords() throws Exception { String input = "ecologi-\r\ncal devel-\r\n\r\nop compre-\u0009hensive-hands-on"; String outputAfterHyphenatedWordsFilter = "ecological develop comprehensive-hands-on"; @@ -40,16 +40,4 @@ public class TestHyphenatedWordsFilter extends TestCase { assertEquals("Testing HyphenatedWordsFilter", outputAfterHyphenatedWordsFilter, actual); } - - public static String tsToString(TokenStream in) throws IOException { - StringBuffer out = new StringBuffer(); - Token t = in.next(); - if (null != t) - out.append(t.termText()); - - for (t = in.next(); null != t; t = in.next()) { - out.append(" ").append(t.termText()); - } - return out.toString(); - } } diff --git a/src/test/org/apache/solr/analysis/TestPhoneticFilter.java b/src/test/org/apache/solr/analysis/TestPhoneticFilter.java index 6d65f6a78f6..55aca6670fd 100644 --- a/src/test/org/apache/solr/analysis/TestPhoneticFilter.java +++ b/src/test/org/apache/solr/analysis/TestPhoneticFilter.java @@ -19,24 +19,20 @@ package org.apache.solr.analysis; import java.util.ArrayList; import java.util.HashMap; -import java.util.Iterator; import java.util.Map; -import junit.framework.TestCase; - import org.apache.commons.codec.Encoder; import org.apache.commons.codec.language.DoubleMetaphone; import org.apache.commons.codec.language.Metaphone; import org.apache.commons.codec.language.RefinedSoundex; import org.apache.commons.codec.language.Soundex; import org.apache.lucene.analysis.Token; -import org.apache.lucene.analysis.TokenStream; /** - * @version $Id:$ + * @version $Id$ */ -public class TestPhoneticFilter extends TestCase { +public class TestPhoneticFilter extends BaseTokenTestCase { public void testFactory() { @@ -102,17 +98,4 @@ public class TestPhoneticFilter extends TestCase { runner( new Soundex(), false ); runner( new RefinedSoundex(), false ); } - - public static class IterTokenStream extends TokenStream { - Iterator toks; - public IterTokenStream(Iterator toks) { - this.toks = toks; - } - public Token next() { - if (toks.hasNext()) { - return toks.next(); - } - return null; - } - } } diff --git a/src/test/org/apache/solr/analysis/TestSynonymFilter.java b/src/test/org/apache/solr/analysis/TestSynonymFilter.java index ee59b7218a7..f9339784d7c 100644 --- a/src/test/org/apache/solr/analysis/TestSynonymFilter.java +++ b/src/test/org/apache/solr/analysis/TestSynonymFilter.java @@ -17,7 +17,6 @@ package org.apache.solr.analysis; -import junit.framework.TestCase; import org.apache.lucene.analysis.Token; import org.apache.lucene.analysis.TokenStream; @@ -31,67 +30,20 @@ import java.util.List; * @author yonik * @version $Id$ */ -public class TestSynonymFilter extends TestCase { +public class TestSynonymFilter extends BaseTokenTestCase { public List strings(String str) { String[] arr = str.split(" "); return Arrays.asList(arr); } - /*** - * Return a list of tokens according to a test string format: - * a b c => returns List [a,b,c] - * a/b => tokens a and b share the same spot (b.positionIncrement=0) - * a,3/b/c => a,b,c all share same position (a.positionIncrement=3, b.positionIncrement=0, c.positionIncrement=0) - * a,1,10,11 => "a" with positionIncrement=1, startOffset=10, endOffset=11 - */ - public List tokens(String str) { - String[] arr = str.split(" "); - List result = new ArrayList(); - for (int i=0; i 1) { - posInc = Integer.parseInt(params[1]); - } else { - posInc = 1; - } - - if (params.length > 2) { - start = Integer.parseInt(params[2]); - } else { - start = 0; - } - - if (params.length > 3) { - end = Integer.parseInt(params[3]); - } else { - end = start + params[0].length(); - } - - Token t = new Token(params[0],start,end,"TEST"); - t.setPositionIncrement(posInc); - - result.add(t); - for (int j=1; j getTokList(SynonymMap dict, String input, boolean includeOrig) throws IOException { + ArrayList lst = new ArrayList(); final List toks = tokens(input); TokenStream ts = new TokenStream() { Iterator iter = toks.iterator(); + @Override public Token next() throws IOException { return iter.hasNext() ? (Token)iter.next() : null; } @@ -106,56 +58,6 @@ public class TestSynonymFilter extends TestCase { } } - public List tok2str(List tokLst) { - ArrayList lst = new ArrayList(); - for (Iterator iter = tokLst.iterator(); iter.hasNext();) { - lst.add(((Token)(iter.next())).termText()); - } - return lst; - } - - - public void assertTokEqual(List a, List b) { - assertTokEq(a,b,false); - assertTokEq(b,a,false); - } - - public void assertTokEqualOff(List a, List b) { - assertTokEq(a,b,true); - assertTokEq(b,a,true); - } - - private void assertTokEq(List a, List b, boolean checkOff) { - int pos=0; - for (Iterator iter = a.iterator(); iter.hasNext();) { - Token tok = (Token)iter.next(); - pos += tok.getPositionIncrement(); - if (!tokAt(b, tok.termText(), pos - , checkOff ? tok.startOffset() : -1 - , checkOff ? tok.endOffset() : -1 - )) - { - fail(a + "!=" + b); - } - } - } - - public boolean tokAt(List lst, String val, int tokPos, int startOff, int endOff) { - int pos=0; - for (Iterator iter = lst.iterator(); iter.hasNext();) { - Token tok = (Token)iter.next(); - pos += tok.getPositionIncrement(); - if (pos==tokPos && tok.termText().equals(val) - && (startOff==-1 || tok.startOffset()==startOff) - && (endOff==-1 || tok.endOffset()==endOff) - ) - { - return true; - } - } - return false; - } - public void testMatching() throws IOException { SynonymMap map = new SynonymMap(); diff --git a/src/test/org/apache/solr/analysis/TestTrimFilter.java b/src/test/org/apache/solr/analysis/TestTrimFilter.java index a61c63e4060..7cebd7be069 100644 --- a/src/test/org/apache/solr/analysis/TestTrimFilter.java +++ b/src/test/org/apache/solr/analysis/TestTrimFilter.java @@ -18,8 +18,10 @@ package org.apache.solr.analysis; import java.io.IOException; +import java.util.ArrayList; import java.util.Iterator; import java.util.Arrays; +import java.util.List; import junit.framework.TestCase; @@ -30,32 +32,36 @@ import org.apache.lucene.analysis.TokenStream; /** * @version $Id:$ */ -public class TestTrimFilter extends TestCase { +public class TestTrimFilter extends BaseTokenTestCase { public void testTrim() throws Exception { TokenStream ts = new TrimFilter (new IterTokenStream(new Token(" a ", 1, 5), new Token("b ",6,10), new Token("cCc",11,15), - new Token(" ",16,20))); + new Token(" ",16,20)), false ); assertEquals("a", ts.next().termText()); assertEquals("b", ts.next().termText()); assertEquals("cCc", ts.next().termText()); assertEquals("", ts.next().termText()); assertNull(ts.next()); + + ts = new TrimFilter( new IterTokenStream( + new Token(" a", 0,2), + new Token("b ", 0,2), + new Token(" c ",0,3), + new Token(" ",0,3)), true ); + + List expect = tokens( "a,1,1,2 b,1,0,1 c,1,1,2 ,1,3,3" ); + List real = getTokens(ts); + for( Token t : expect ) { + System.out.println( "TEST:" + t ); + } + for( Token t : real ) { + System.out.println( "REAL:" + t ); + } + assertTokEqualOff( expect, real ); } - public static class IterTokenStream extends TokenStream { - Iterator toks; - public IterTokenStream(Token... toks) { - this.toks = Arrays.asList(toks).iterator(); - } - public Token next() { - if (toks.hasNext()) { - return toks.next(); - } - return null; - } - } }